o
    }oiQ'                     @   s  d dl mZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZ d d
lmZmZmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ dZ,ej-j.e,ddej/ej0 fddZ1										d3de2de2deej3 dee2 de2d e4d!e2d"e2d#e2d$ee5ej/e	   dej/ej6 fd%d&Z7ej-j.ee,d'dd(ddefd)ee8 d*e8d!e2d"e2dej9f
d+d,Z:ej-j.ee,d'	-		(			.d4d/e8d)ee8 d*e8d!e2d"e2d0ee8 dej9fd1d2Z;dS )5    )OptionalN)Callback)DistributedDataParallelConfig)OptimizerConfig)	lightning)finetunepretrain)PEFT_STR2CLS)default_finetune_trainernemo_resume)default_logdefault_resumetensorboard_logger)
bf16_mixed)MockDataModule)SquadDataModule)T5Config220MT5Model)WarmupAnnealingScheduler)MegatronOptimizerModule)TimingCallbackt5_220mnamereturnc                   C   s   t jtt tdS )aX  
    Factory function to create a T5 220M model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the T5 220M model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=t5_220m ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    )config)runConfigr   r    r   r   X/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/t5_220m.pymodel)   s   r       F   @B tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	callbacksc
                 C   s`   t jtj| |||||dddt jtdddddd}
t jtjdd|	|ddd||t |
d	d
d}|S )a[  
    Configure the NeMo Lightning Trainer for T5 model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=t5_220m ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        For more information on distributed training strategies, refer to the
        NeMo documentation on multi-GPU and multi-node training.
    T)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gather)
tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpgpur!   2       
   F  )acceleratoraccumulate_grad_batchesr-   deviceslimit_test_batcheslimit_val_batcheslog_every_n_stepsr,   r*   pluginsstrategyuse_distributed_samplerval_check_interval)r   r   nlMegatronStrategyr   Trainerr   )r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   rH   trainerr   r   r   rN   <   sH   ,rN   )targetr   defaultdirr   c                 C   s   t jtdddddd}t jtddddd	}t j|t t||t tgd
t jtdddddt	| |t
|ddt jt||dt dS )a  
    Create a pre-training recipe for T5 220m model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory t5_220m
            $ nemo llm pretrain --factory "t5_220m(num_nodes=2, name='my_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="t5_220m_pretrain", num_nodes=2)
            >>> print(recipe)
    adam-C6?T{Gz?	optimizerlruse_distributed_optimizerbf16weight_decayNr#   h㈵>)warmup_stepswarmup_ratior,   min_lr)r*   r+   r-         r!   
seq_lengthseq_length_decglobal_batch_sizemicro_batch_sizer   rQ   r   r   r   lr_schedulerr    rN   datalogoptimresume)r   r   r   r   Partialr    rN   r   r   r   r   r   r   )rQ   r   r*   r+   fn
opt_configrh   r   r   r   pretrain_recipe   s8   	
rq    loracheckpoint_pathpeft_schemec           	      C   s   t jtdddddd}t jtdddd	}t jtt t||d
t jtdddddt	||t
|ddt jt||dt| d}|du sG| dkrSd|jj_d|jj_|S | dv rjt t|  |_d|jj_|S td| )ak  
    Create a fine-tuning recipe for T5 220M model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        checkpoint_path (str): Path to pretrained checkpoint
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory t5_220m

        Python API usage:
            >>> recipe = finetune_recipe(name="t5_220m_finetune", num_nodes=1)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
    rR   rS   TrT   rU   r=   r@   r[   )r\   r,   r^   )r*   r+   r_   r`   r!   ra   r   rf   rg   ri   Nnonegh㈵>)rs   dorazUnrecognized peft scheme: )r   r   r   r   rn   r   r    r
   r   r   r   r   r   lowerrN   rH   r2   rl   r   rW   r	   peft
ValueError)	rt   rQ   r   r*   r+   ru   rp   rh   reciper   r   r   finetune_recipe   sJ   '	



r|   )
r!   r!   NNr!   Fr!   r"   r#   N)rr   NrP   r!   r"   rs   )<typingr   lightning.pytorchpytorchplnemo_runr   torch$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   megatron.core.optimizerr   nemor   rK   nemo.collections.llm.apir   r   nemo.collections.llm.peftr	   -nemo.collections.llm.recipes.finetune_defaultr
   r   (nemo.collections.llm.recipes.log.defaultr   r   r   6nemo.collections.llm.recipes.precision.mixed_precisionr   !nemo.collections.llm.t5.data.mockr   "nemo.collections.llm.t5.data.squadr    nemo.collections.llm.t5.model.t5r   r   )nemo.lightning.pytorch.optim.lr_schedulerr   %nemo.lightning.pytorch.optim.megatronr   nemo.utils.exp_managerr   NAMEclifactoryr   LightningModuler    intdtypeboollistrM   rN   strrn   rq   r|   r   r   r   r   <module>   s   	


T
=