o
    }oi&                     @   s  d dl mZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' dZ(ej)j*e(ddej+ej, fddZ-										d2de.de.deej/ dee. de.de0de.d e.d!e.d"ee1ej+e
   dej+ej2 fd#d$Z3ej)j*ee(d%dd&ddefd'ee4 d(e4de.d e.d)edej5fd*d+Z6ej)j*ee(d%		&			,		d3d'ee4 d(e4de.d e.d-ee4 d.ee. d/ee0 dej5fd0d1Z7dS )4    )CallableOptionalN)Callback)DistributedDataParallelConfig)	lightning)finetunepretrain)MockDataModule)PackedSequenceSpecs)Llama32Config1B
LlamaModel)PEFT_STR2CLS)default_finetune_recipe)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixed)TimingCallback
llama32_1bnamereturnc                  C   s   t t} d| _t jt| dS )ac  
    Factory function to create a Llama3.2 1B model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Llama3.2 1B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=llama32_1b ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
        )config)runConfigr   
seq_lengthr   )conf r   [/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/llama32_1b.pymodel'   s   
r!      F   { tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	callbacksc
                 C   sb   t jtj| |||||dddt jtddddddd}
t jtjdd|	|ddd||t |
d	d
d}|S )a  
    Configure the NeMo Lightning Trainer for Llama3.2 1B model.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=llama32_1b ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1)
            >>> print(trainer_config)

    Note:
        This configuration uses extensive parallelism to handle the large model size efficiently.
    T)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gatheraverage_in_collective)
tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpgpur"   2       
   Fi  )acceleratoraccumulate_grad_batchesr.   deviceslimit_test_batcheslimit_val_batcheslog_every_n_stepsr-   r+   pluginsstrategyuse_distributed_samplerval_check_interval)r   r   nlMegatronStrategyr   Trainerr   )r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   rI   trainerr   r   r    rO   <   sJ   )rO   )targetr   defaultdirr   fnc                 C   sV   t j|t t||t tgdt jtddddt| |t|ddt	dd	t
 d
}|S )a  
    Create a pre-training recipe for Llama3.2 1B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory llama32_1b
            $ nemo llm pretrain --factory "llama32_1b(num_nodes=1, name='my_1b_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="llama32_1b_pretrain", num_nodes=1)
            >>> print(recipe)

    Note:
        This recipe is optimized for the large 8B model and requires significant computational resources.
    )r+   r,   r.   r   i   r"   )r   global_batch_sizemicro_batch_sizer   )rR   r   r   ga2U0*3?)max_lr)r!   rO   datalogoptimresume)r   Partialr!   rO   r   r   r	   r   r   r   r   )rR   r   r+   r,   rS   reciper   r   r    pretrain_recipe   s   $
r]   lorapeft_schemer   packed_sequencec                 C   s   |du r
|rdnd}t t d| ||||}|du s| dkr*d|jj_d|jj_n/| dv rRt	
t|  |_d	|j_d
|j_d|jj_d|jj_d|jj_ntd| ||jj_||j_|rsddi|j_t	j
t|d|j_|S )a&  
    Create a fine-tuning recipe for Llama3.2 1B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.
        seq_length (int): Maximum number of tokens per microbatch.
        packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given
            maximum seq_length for better efficiency.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory llama32_1b

        Python API usage:
            >>> recipe = finetune_recipe(name="llama32_1b_finetune", num_nodes=1)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
    Ni   i   zmeta-llama/Llama-3.2-1Bnoner"   gh㈵>)r^   dorar#      Fg-C6?zUnrecognized peft scheme: pad_to_max_lengthT)packed_sequence_size)r   r!   lowerrO   rI   r4   rY   r   lrr   r   r   peftdimalphause_distributed_optimizercross_entropy_loss_fusion
ValueErrorr   rW   dataset_kwargsr
   packed_sequence_specs)rR   r   r+   r,   r_   r   r`   r\   r   r   r    finetune_recipe   s,   ,



rp   )
r"   r"   NNr"   Fr"   r#   r$   N)NrQ   r"   r#   r^   NN)8typingr   r   lightning.pytorchpytorchplnemo_runr   torch$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rL   nemo.collections.llm.apir   r   "nemo.collections.llm.gpt.data.mockr	   -nemo.collections.llm.gpt.data.packed_sequencer
   $nemo.collections.llm.gpt.model.llamar   r   nemo.collections.llm.peftr   -nemo.collections.llm.recipes.finetune_defaultr   (nemo.collections.llm.recipes.log.defaultr   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   nemo.utils.exp_managerr   NAMEclifactoryr   LightningModuler!   intdtypeboollistrN   rO   strr[   r]   rp   r   r   r   r    <module>   s   	


R4