o
    }oi4                  ,   @   s  d dl mZ d dlmZ d dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& dej'j(_)dZ*ej+j,e*ddDde-dej.ej/ fddZ0ej+j,e*ddDde-dej.ej/ fddZ1ej+j,ee*ddddej2ddddddddddd d!dfd"e-d#e3d$e3d%ej4d&ee3 d'e3d(e5d)e3d*e3d+e3d,e3d-e3d.e3d/e3d0e3d1e5d2ee6ej.e
   dej.ej7 f$d3d4Z8ej+j,ee*ddd5dddddddddddd d!d6d7defd"ee- d8e-de-d)e3d*e3d#e3d(e5d$e3d+e3d,e3d-e3d.e3d/e3d0e3d1e5d9e3d:e3d;e3dej9f&d<d=Z:ej+j,ee*d	>			5						6						 	!	7		?dEd@e-de-d"ee- d8e-d)e3d*e3d#e3d(e5d$e3d9e3d+e3d,e3d-e3d.e3d/e3d0e3d1e5d:e3d;e3dAee- dej9f*dBdCZ;dS )F    )OptionalN)Callback)DistributedDataParallelConfig)	lightning)llm)finetunepretrain)MockDataModule)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixed)get_nmt_tokenizer)ModelCheckpoint)MegatronCommOverlapCallback)TimingCallbackTnemotronh_8bname
vocab_filereturnc                 C   s*   | rt jtdd| ddS t jtddddS )zZ
    Factory function to create a tokenizer configuration for NemotronH Hybrid model.
    tiktokenTiktokenTokenizerT)library
model_namer   use_fasthuggingfaceznvidia/Nemotron-H-8B-Base-8K)r   r   r   )runConfigr   r    r!   ]/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/nemotronh_8b.py	tokenizer*   s   r#   c                 C   s    t jtjt tjt| ddS )ar  
    Factory function to create a NemotronH Hybrid 8B model configuration.
    Returns:
        run.Config[pl.LightningModule]: Configuration for the NemotronH Hybrid 8B model.
    Examples:
        CLI usage:
            $ nemo llm pretrain model=nemotronh_8b ...
        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    r    )configr#   )r   r   r   
MambaModelNemotronHConfig8Br#   r    r!   r!   r"   model@   s
   
r'   )targetr            
   2          Fdirtensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_stepsval_check_intervallimit_test_batcheslimit_val_batcheslog_every_n_steps
save_top_kckpt_async_save	callbacksc                 C   s   t jtj|||||dd|ddt jtdddddd}t tt jtdddt jt|
| |dddd	g}t jtj|||	d
|g |||ddt	 g|
dd}|S )ak  
    Configure the NeMo Lightning Trainer for NemotronH Hybrid 8B model.
    This function sets up the distributed training strategy and other training parameters.
    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.
    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.
    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=nemotronh_8b ...
        Python API usage:
            >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1)
            >>> print(trainer_config)
    Note:
        For more information on distributed training strategies, refer to the
        NeMo documentation on multi-GPU and multi-node training.
    T
torch_distlog_allF)check_for_nan_in_gradoverlap_grad_reduceoverlap_param_gathergrad_reduce_in_fp32)tensor_model_parallel_sizepipeline_model_parallel_sizecontext_parallel_sizepipeline_dtypesequence_parallelckpt_load_optimizerckpt_save_optimizerr?   save_ckpt_formatckpt_load_strictnessddpnccl)tp_comm_bootstrap_backendtp_comm_overlap)every_n_train_stepsdirpathr>   always_save_contextsave_optim_on_train_endsave_context_on_train_endgpur   )devicesr7   r9   acceleratorstrategyloggerr@   r=   r<   num_sanity_val_stepsuse_distributed_samplerpluginsr:   enable_checkpointing)
r   r   nlMegatronStrategyr   r   r   r   Trainerr   )r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   r\   trainerr!   r!   r"   re   T   sl   .re   default       r   
seq_lengthgbsmbsc                 C   sn   t j|t|dt| |||||||	|
||||dt jt|||t|ddt| |t|ddt	ddt
 d}|S )	a  
    Create a pre-training recipe for NemotronH Hybrid 8B model.
    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.
    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.
    Returns:
        run.Partial: Partial configuration for pre-training.
    Examples:
        CLI usage:
            $ nemo llm pretrain --factory nemotronh_8b
            $ nemo llm pretrain --factory "nemotronh_8b(num_nodes=32, name='my_pretrain')"
        Python API usage:
            >>> recipe = pretrain_recipe(name="nemotronh_8b_pretrain", num_nodes=32)
            >>> print(recipe)
    r    )r0   r9   r7   r1   r2   r6   r8   r:   r;   r<   r=   r>   r?   ri   global_batch_sizemicro_batch_sizer#   r   r0   r   r   ga2U0*3?)max_lrr'   re   datalogoptimresume)r   Partialr'   re   r   r	   r#   r
   r   r   r   )r0   r   r   r7   r8   r1   r6   r2   r9   r:   r;   r<   r=   r>   r?   ri   rj   rk   fnreciper!   r!   r"   pretrain_recipe   s<   +ry   nemotronh-8b-pretrainnoneresume_pathpeft_schemec                 C   s   t jtjt jtj| dd}t jtjt|dt	|
|||||||||||dt jt
|	||t|ddtj||t|ddtdd	d
d|d}|du sO| dkr[d|j	j_d|jj_|S td| )a  
    Create a fine-tuning recipe for NemotronH Hybrid 8B model.
    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        resume_path (str): Path to the NeMo checkpoint (refer to notes below
                            on how to convert a pytorch checkpoint to NeMo)
        vocab_file (str): Path to vocab file (defaults to None)
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
    Returns:
        run.Partial: Partial configuration for fine-tuning.
    Examples:
        CLI usage:
            $ nemo llm finetune --factory nemotronh_8b
        Python API usage:
            >>> recipe = finetune_recipe(name="nemotronh_8b_finetune", num_nodes=32)
            >>> print(recipe)
    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
        For converting an SSM pytorch checkpoint, use the following line of python code:
        llm.MambaModel(llm.NemotronHConfig8B(), tokenizer=tokenizer(vocab_file=vocab_file)).import_ckpt(
            path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file",
            model_config=llm.NemotronHConfig8B())
        This line will cache the nemo checkpoint to following directory:
            /root/.cache/nemo/models/your_pytorch_state_dict_file
    )path)restore_configr    )r9   r7   r1   r2   r6   r8   r:   r;   r<   r=   r>   r?   rl   r   ro   g-C6?r   r-   )rp   min_lrwarmup_stepsrq   Nr{   r+   gh㈵>zUnrecognized peft scheme: )r   r   rb   
AutoResumeRestoreConfigrv   r   r   r'   re   r	   r#   r
   r   r   lowerr\   rG   rt   r$   lr
ValueError)r|   r   r0   r   r7   r8   r1   r6   r2   ri   r9   r:   r;   r<   r=   r>   r?   rj   rk   r}   nemo_resumerx   r!   r!   r"   finetune_recipe  sJ   4

r   )N)rz   NNrf   r+   r+   r)   Tr*   rg   r,   r,   r-   r.   r,   r/   Frh   r*   r{   )<typingr   lightning.pytorchpytorchplnemo_runr   torchtorch._dynamo$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rb   nemo.collectionsr   nemo.collections.llm.apir   r   "nemo.collections.llm.gpt.data.mockr	   (nemo.collections.llm.recipes.log.defaultr
   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   3nemo.collections.nlp.modules.common.tokenizer_utilsr    nemo.lightning.pytorch.callbacksr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   nemo.utils.exp_managerr   _dynamor$   suppress_errorsNAMEclifactorystrr   LightningModuler#   r'   bfloat16intdtypeboollistrd   re   rv   ry   r   r!   r!   r!   r"   <module>   s  
	

g	
J	
