o
    }oi0                  "   @   s  d dl mZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZmZ d d
lmZ d dlmZ d dlmZ d dl m!Z! dZ"ej#j$e"dd=de%dej&ej' fddZ(ej#j$e"dd=de%dej&ej' fddZ)ej#j$e"d														d>de*de*deej+ d ee* d!e*d"e,d#e*d$e*d%e*d&e*d'e*d(e*d)e*d*ee-ej&e	   dej&ej. fd+d,Z/ej#j$ee"d-dd.ddddddddddd/ddefd0ee% d1e%de%d#e*d$e*de*de*d%e*d&e*d'e*d(e*d)e*d2e*d3e*d4e*dej0f d5d6Z1ej#j$ee"d-		.					/								7d?d0ee% d1e%d#e*d$e*d8e*d9e*d2e*d%e*d&e*d'e*d(e*d)e*d3e*d4e*d:ee% dej0f d;d<Z2dS )@    )OptionalN)Callback)DistributedDataParallelConfig)	lightning)llm)finetunepretrain)MockDataModule)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixed)get_nmt_tokenizer)TimingCallback	mamba2_8bnametokenizer_modelreturnc                 C   s   t jtdd| ddS )z?
    Factory function to create a tokenizer configuration.
    megatronGPTSentencePieceTokenizerT)library
model_namer   use_fast)runConfigr   r    r   Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/mamba2_8b.py	tokenizer%   s   r    c                 C   s    t jtjt tjt| ddS )ae  
    Factory function to create a Mamba2 8B model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Mamba2 Hybrid 8B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=mamba2_8b ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    r   )configr    )r   r   r   
MambaModelNVIDIAMambaConfig8Br    r   r   r   r   model3   s
   
r$         Fd   2       
   tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_stepsval_check_intervallimit_test_batcheslimit_val_batcheslog_every_n_steps	callbacksc                 C   s`   t jtj| |||||dddt jtdddddd}t jtjdd||||t |d|	|
||d}|S )ak  
    Configure the NeMo Lightning Trainer for Mamba2 Hybrid 8B model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=mamba2_8b ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1)
            >>> print(trainer_config)

    Note:
        For more information on distributed training strategies, refer to the
        NeMo documentation on multi-GPU and multi-node training.
    TF)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gather)
tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpgpur&   )acceleratoraccumulate_grad_batchesr8   devicesr3   r1   pluginsstrategyuse_distributed_samplerr4   r5   r6   r7   )r   r   nlMegatronStrategyr   Trainerr   )r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   rL   trainerr   r   r   rQ   J   sH   1rQ   )targetr   default   dirr   
seq_lengthgbsmbsc                 C   sl   t j|t|dt|||||||	|
|t tgd
t jt|||t|ddt| |t	|ddt
ddt dS )	a  
    Create a pre-training recipe for Mamba2 Hybrid 8B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory mamba2_8b
            $ nemo llm pretrain --factory "mamba2_8b(num_nodes=1, name='my_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="mamba2_8b_pretrain", num_nodes=1)
            >>> print(recipe)
    r   )
r3   r1   r+   r,   r2   r4   r5   r6   r7   r8   rV   global_batch_sizemicro_batch_sizer    r   rU   r   r   ga2U0*3?)max_lrr$   rQ   datalogoptimresume)r   Partialr$   rQ   r   r   r	   r    r
   r   r   r   )rU   r   r   r1   r2   r+   r,   r3   r4   r5   r6   r7   rV   rW   rX   fnr   r   r   pretrain_recipe   s4   ,
re   noner=   r>   peft_schemec                 C   s  t jtjt jtj| dd}t jtj||ddddd}t jtjd|d}t jtjdd	||	|
||||t jtjd
t	j
d|g|dd}t jtjt|d|t jtj|||t|ddtj||t|ddtdddd|d}|du su| dkrd|jj_d|jj_|S td| )a  
    Create a fine-tuning recipe for Mamba2 8B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        resume_path (str): Path to the NeMo checkpoint (refer to notes below
                            on how to convert a pytorch checkpoint to NeMo)
        tokenizer_model (str): Path to tokenizer model (defaults to None)
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory mamba2_8b

        Python API usage:
            >>> recipe = finetune_recipe(name="mamba2_8b_finetune", num_nodes=1)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
        For converting an SSM pytorch checkpoint, use the following line of python code:

        llm.MambaModel(llm.NVIDIAMambaConfig8B(), tokenizer=tokenizer(tokenizer_model=tokenizer_model)).import_ckpt(
            path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file",
            model_config=llm.NVIDIAMambaConfig8B())
        This line will cache the nemo checkpoint to following directory:
            /root/.cache/nemo/models/your_pytorch_state_dict_file

    )path)restore_configTF)r=   r>   rC   ckpt_load_optimizerckpt_save_optimizerrD   r*   )every_n_train_stepsdirpathrG   r&   z
bf16-mixed)	precisionparams_dtype)rH   rI   rJ   r3   r4   r5   r6   r7   r1   rK   r8   rL   rM   r   rY   r   r\   g-C6?r   r(   )r]   min_lrwarmup_stepsr^   Nrf   r%   gh㈵>zUnrecognized peft scheme: )r   r   rN   
AutoResumeRestoreConfigrO   ModelCheckpointrP   MegatronMixedPrecisiontorchbfloat16rc   r   r   r$   SquadDataModuler    r
   r   r   lowerrQ   rL   r=   ra   r!   lr
ValueError)resume_pathr   rU   r   r1   r2   r=   r>   rV   r3   r4   r5   r6   r7   rW   rX   rg   nemo_resumerL   checkpoint_callbackrQ   reciper   r   r   finetune_recipe   st   8	

r   )N)r%   r&   NNr&   Fr&   r%   r'   r'   r(   r)   r*   N)NrS   r&   r%   r%   r&   rT   r'   r'   r(   r)   r*   r%   r&   rf   )3typingr   lightning.pytorchpytorchplnemo_runr   rv   $lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rN   nemo.collectionsr   nemo.collections.llm.apir   r   "nemo.collections.llm.gpt.data.mockr	   (nemo.collections.llm.recipes.log.defaultr
   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.utils.exp_managerr   NAMEclifactorystrr   LightningModuler    r$   intdtypeboollistrP   rQ   rc   re   r   r   r   r   r   <module>   sJ  	

X	
G	
