o
    wiC8                  .   @   s:  d dl mZ d dlmZ d dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& dej'j(_)dZ*ej+j,e*ddGde-dej.ej/ fddZ0ej+j,e*ddGde-dej.ej/ fddZ1ej+j,ee*ddddej2dddddddddddd dfd!e-d"e3d#e3d$ej4d%ee3 d&e3d'e5d(e3d)e3d*e3d+e3d,e3d-e3d.e3d/e3d0e5d1ee6ej.e
   dej.ej7 f$d2d3Z8ej+j,ee*ddd4ddddddddddddd d5d6dd efd!ee- d7e-de-d(e3d)e3d"e3d'e5d#e3d*e3d+e3d,e3d-e3d.e3d/e3d0e5d8e3d9e3d:e3d;e5dej9f(d<d=Z:ej+j,ee*d	>			4						5							 	6		 	?dHd@e-de-d!ee- d7e-d(e3d)e3d"e3d'e5d#e3d8e3d*e3d+e3d,e3d-e3d.e3d/e3d0e5d9e3d:e3d;e5dAee- dej9f,dBdCZ;dDej9dej9fdEdFZ<dS )I    )OptionalN)Callback)DistributedDataParallelConfig)	lightning)llm)finetunepretrain)MockDataModule)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing).nemotron_h_bf16_with_fp8_current_scaling_mixed)get_nmt_tokenizer)ModelCheckpoint)MegatronCommOverlapCallback)TimingCallbackTnemotronh_56bname
vocab_filereturnc                 C   s*   | rt jtdd| ddS t jtddddS )zZ
    Factory function to create a tokenizer configuration for NemotronH Hybrid model.
    tiktokenTiktokenTokenizerT)library
model_namer   use_fasthuggingfaceznvidia/Nemotron-H-8B-Base-8K)r   r   r   )runConfigr   r    r!   g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/recipes/nemotronh_56b.py	tokenizer*   s   r#   c                 C   s    t jtjt tjt| ddS )au  
    Factory function to create a NemotronH Hybrid 56B model configuration.
    Returns:
        run.Config[pl.LightningModule]: Configuration for the NemotronH Hybrid 56B model.
    Examples:
        CLI usage:
            $ nemo llm pretrain model=nemotronh_56b ...
        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    r    )configr#   )r   r   r   
MambaModelNemotronHConfig56Br#   r    r!   r!   r"   model@   s
   
r'   )targetr             
   2      Fdirtensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_stepsval_check_intervallimit_test_batcheslimit_val_batcheslog_every_n_steps
save_top_kckpt_async_save	callbacksc                 C   s   t jtj|||||dd|ddt jtdddddd}t tt jt|
| |ddddg}t jtj|||	d|g |||d	dt g|
dd
}|S )am  
    Configure the NeMo Lightning Trainer for NemotronH Hybrid 56B model.
    This function sets up the distributed training strategy and other training parameters.
    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.
    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.
    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=nemotronh_56b ...
        Python API usage:
            >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=1)
            >>> print(trainer_config)
    Note:
        For more information on distributed training strategies, refer to the
        NeMo documentation on multi-GPU and multi-node training.
    T
torch_distlog_allF)check_for_nan_in_gradoverlap_grad_reduceoverlap_param_gathergrad_reduce_in_fp32)tensor_model_parallel_sizepipeline_model_parallel_sizecontext_parallel_sizepipeline_dtypesequence_parallelckpt_load_optimizerckpt_save_optimizerr>   save_ckpt_formatckpt_load_strictnessddp)every_n_train_stepsdirpathr=   always_save_contextsave_optim_on_train_endsave_context_on_train_endgpur   )devicesr6   r8   acceleratorstrategyloggerr?   r<   r;   num_sanity_val_stepsuse_distributed_samplerpluginsr9   enable_checkpointing)	r   r   nlMegatronStrategyr   r   r   Trainerr   )r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   rX   trainerr!   r!   r"   ra   T   sb   .ra   default       r   
seq_lengthgbsmbsperformance_modec                 C   sz   t j|t|dt| |||||||	|
||||dt jt|||t|ddt| |t|ddt	ddt
 d}|r;t|}|S )	a  
    Create a pre-training recipe for NemotronH Hybrid 56B model.
    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.
    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.
    Returns:
        run.Partial: Partial configuration for pre-training.
    Examples:
        CLI usage:
            $ nemo llm pretrain --factory nemotronh_56b
            $ nemo llm pretrain --factory "nemotronh_56b(num_nodes=32, name='my_pretrain')"
        Python API usage:
            >>> recipe = pretrain_recipe(name="nemotronh_56b_pretrain", num_nodes=32)
            >>> print(recipe)
    r    )r/   r8   r6   r0   r1   r5   r7   r9   r:   r;   r<   r=   r>   re   global_batch_sizemicro_batch_sizer#   r   r/   r   r   ga2U0*3?)max_lrr'   ra   datalogoptimresume)r   Partialr'   ra   r   r	   r#   r
   r   r   r   performance_optimizations)r/   r   r   r6   r7   r0   r5   r1   r8   r9   r:   r;   r<   r=   r>   re   rf   rg   rh   fnreciper!   r!   r"   pretrain_recipe   s@   ,rw   nemotronh-56b-pretrainnoneresume_pathpeft_schemec                 C   s   t jtjt jtj| dd}t jtjt|dt	|
|||||||||||dt jt
|	||t|ddtj||t|ddtdd	d
d|d}|du sO| dkrZd|j	j_d|jj_ntd| |rgt|}|S )a  
    Create a fine-tuning recipe for NemotronH Hybrid 56B model.
    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        resume_path (str): Path to the NeMo checkpoint (refer to notes below
                            on how to convert a pytorch checkpoint to NeMo)
        vocab_file (str): Path to vocab file (defaults to None)
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
    Returns:
        run.Partial: Partial configuration for fine-tuning.
    Examples:
        CLI usage:
            $ nemo llm finetune --factory nemotronh_56b
        Python API usage:
            >>> recipe = finetune_recipe(name="nemotronh_56b_finetune", num_nodes=32)
            >>> print(recipe)
    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
        For converting an SSM pytorch checkpoint, use the following line of python code:
        llm.MambaModel(llm.NemotronHConfig56B(), tokenizer=tokenizer(vocab_file=vocab_file)).import_ckpt(
            path="pytorch://ABSOLUTE_PATH_TO_CKPT/your_pytorch_state_dict_file",
            model_config=llm.NemotronHConfig56B())
        This line will cache the nemo checkpoint to following directory:
            /root/.cache/nemo/models/your_pytorch_state_dict_file
    )path)restore_configr    )r8   r6   r0   r1   r5   r7   r9   r:   r;   r<   r=   r>   ri   r   rl   g-C6?r   r-   )rm   min_lrwarmup_stepsrn   Nry   r)   gh㈵>zUnrecognized peft scheme: )r   r   r^   
AutoResumeRestoreConfigrs   r   r   r'   ra   r	   r#   r
   r   r   lowerrX   rF   rq   r$   lr
ValueErrorrt   )rz   r   r/   r   r6   r7   r0   r5   r1   re   r8   r9   r:   r;   r<   r=   r>   rf   rg   rh   r{   nemo_resumerv   r!   r!   r"   finetune_recipe  sN   5
r   rv   c                 C   s   | j jtjtdd | S )a  
    Create a performance-optimized pre-training recipe for NemotronH Hybrid 56B model.
    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.
    Args:
        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added
    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.
    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    T)tp_comm_overlap)ra   r?   appendr   r   r   )rv   r!   r!   r"   rt   d  s   rt   )N)rx   NNrb   r+   r)   r)   Tr*   rc   r,   r,   r-   r+   r*   r.   Frd   r*   Fry   )=typingr   lightning.pytorchpytorchplnemo_runr   torchtorch._dynamo$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   r^   nemo.collectionsr   nemo.collections.llm.apir   r   "nemo.collections.llm.gpt.data.mockr	   (nemo.collections.llm.recipes.log.defaultr
   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   3nemo.collections.nlp.modules.common.tokenizer_utilsr    nemo.lightning.pytorch.callbacksr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   nemo.utils.exp_managerr   _dynamor$   suppress_errorsNAMEclifactorystrr   LightningModuler#   r'   bfloat16intdtypeboollistr`   ra   rs   rw   r   rt   r!   r!   r!   r"   <module>   s  
	

b	
M	
^