o
    wi0                  &   @   s  d dl mZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZmZ d d
lmZ d dlmZ d dlmZ d dl m!Z! dZ"ej#j$e"dd>de%dej&ej' fddZ(ej#j$e"dd>de%dej&ej' fddZ)ej#j$ee"d														d?de*de*d eej+ d!ee* d"e*d#e,d$e*d%e*d&e*d'e*d(e*d)e*d*e*d+ee-ej&e	   dej&ej. fd,d-Z/ej#j$ee"ddd.ddddddddddd/ddefd0ee% d1e%de%d$e*d%e*de*de*d&e*d'e*d(e*d)e*d*e*d2e*d3e*d4e*dej0f d5d6Z1ej#j$ee"d		.							/								7d@d0ee% d1e%d8e%de%d$e*d%e*d9e*d:e*d2e*d&e*d'e*d(e*d)e*d*e*d3e*d4e*d;ee% dej0f$d<d=Z2dS )A    )OptionalN)Callback)DistributedDataParallelConfig)	lightning)llm)finetunepretrain)MockDataModule)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixed)get_nmt_tokenizer)TimingCallbackmamba2_780mnametokenizer_modelreturnc                 C   s   t jtdd| ddS )z?
    Factory function to create a tokenizer configuration.
    huggingfacezEleutherAI/gpt-neox-20bT)library
model_namer   use_fast)runConfigr   r    r   e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/recipes/mamba2_780m.py	tokenizer%   s   r   c                 C   s    t jtjt tjt| ddS )ad  
    Factory function to create a Mamba2 780M model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Mamba2 780M model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=mamba2_780m ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    r   )configr   )r   r   r   
MambaModelBaseMambaConfig780Mr   r   r   r   r   model3   s
   
r#   )targetr      F   d   2       
   tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_stepsval_check_intervallimit_test_batcheslimit_val_batcheslog_every_n_steps	callbacksc                 C   s`   t jtj| |||||dddt jtdddddd}t jtjdd||||t |d|	|
||d}|S )ah  
    Configure the NeMo Lightning Trainer for Mamba2 780M model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=mamba2_780m ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1)
            >>> print(trainer_config)

    Note:
        For more information on distributed training strategies, refer to the
        NeMo documentation on multi-GPU and multi-node training.
    TF)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gather)
tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpgpur%   )acceleratoraccumulate_grad_batchesr8   devicesr3   r1   pluginsstrategyuse_distributed_samplerr4   r5   r6   r7   )r   r   nlMegatronStrategyr   Trainerr   )r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   rL   trainerr   r   r   rQ   J   sH   1rQ   default   dirr   
seq_lengthgbsmbsc                 C   sd   t j|t t|||||||	|
|t tgd
t jt|||t dt| |t	|ddt
ddt dS )a  
    Create a pre-training recipe for Mamba2 780M model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory mamba2_780M
            $ nemo llm pretrain --factory "mamba2_780M(num_nodes=1, name='my_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="mamba2_780M_pretrain", num_nodes=1)
            >>> print(recipe)
    )
r3   r1   r+   r,   r2   r4   r5   r6   r7   r8   rU   global_batch_sizemicro_batch_sizer   r   rT   r   r   ga2U0*3?)max_lrr#   rQ   datalogoptimresume)r   Partialr#   rQ   r   r   r	   r   r
   r   r   r   )rT   r   r   r1   r2   r+   r,   r3   r4   r5   r6   r7   rU   rV   rW   fnr   r   r   pretrain_recipe   s4   ,
rd   noneresume_pathr=   r>   peft_schemec                 C   s  t jtjt jtj|dd}t jtj||ddddd}t jtjd| d}t jtjdd	||	|
||||t jtjd
t	j
d|g|dd}t jtjt|d|t jtj|||t|ddtj| |t|ddtdddd|d}|du su| dkrd	|jj_d|jj_|S td| )a  
    Create a fine-tuning recipe for Mamba2 780M model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        resume_path (str): Path to the NeMo checkpoint (refer to notes below
                            on how to convert a pytorch checkpoint to NeMo)
        tokenizer_model (str): Path to tokenizer model (defaults to None)
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory mamba2_780m

        Python API usage:
            >>> recipe = finetune_recipe(name="mamba2_780m_finetune", num_nodes=1)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
        For converting an SSM pytorch checkpoint, use the following line of python code:

        llm.MambaModel(llm.BaseMambaConfig780M(), tokenizer=tokenizer()).import_ckpt(
            path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file",
            model_config=llm.BaseMambaConfig780M())
        This line will cache the nemo checkpoint to following directory:
            /root/.cache/nemo/models/your_pytorch_state_dict_file

    )path)restore_configTF)r=   r>   rC   ckpt_load_optimizerckpt_save_optimizerrD   r*   )every_n_train_stepsdirpathrG   r%   z
bf16-mixed)	precisionparams_dtype)rH   rI   rJ   r3   r4   r5   r6   r7   r1   rK   r8   rL   rM   r   rX   r   r[   g-C6?r   r(   )r\   min_lrwarmup_stepsr]   Nre   gh㈵>zUnrecognized peft scheme: )r   r   rN   
AutoResumeRestoreConfigrO   ModelCheckpointrP   MegatronMixedPrecisiontorchbfloat16rb   r   r   r#   SquadDataModuler   r
   r   r   lowerrQ   rL   r=   r`   r    lr
ValueError)rT   r   rf   r   r1   r2   r=   r>   rU   r3   r4   r5   r6   r7   rV   rW   rg   nemo_resumerL   checkpoint_callbackrQ   reciper   r   r   finetune_recipe   st   8	

r   )N)r%   r%   NNr%   Fr%   r&   r'   r'   r(   r)   r*   N)NrR   NNr%   r&   r%   r%   rS   r'   r'   r(   r)   r*   r&   r%   re   )3typingr   lightning.pytorchpytorchplnemo_runr   rv   $lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rN   nemo.collectionsr   nemo.collections.llm.apir   r   "nemo.collections.llm.gpt.data.mockr	   (nemo.collections.llm.recipes.log.defaultr
   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.utils.exp_managerr   NAMEclifactorystrr   LightningModuler   r#   intdtypeboollistrP   rQ   rb   rd   r   r   r   r   r   <module>   sV  	

X	
G	
