o
    }oiv@                     @   sx  d dl mZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- dZ.ej/j0e.ddej1ej2 fddZ3ddej4dddddddf
de5de5d eej6 d!ee5 d"e5d#e7d$e5d%e5d&e5d'ee8ej1e
   dej1ej9 fd(d)Z:ej/j0ee.d*dd+d,dd-efd.ee; d/e;d$e5d%e5d0e7d1edej<fd2d3Z=d4ej<dej<fd5d6Z>ej/j0ee.d*		+			7	8			-dAd.ee; d/e;d$e5d%e5d9e;d:ee; d;ee5 d<ee7 d0e7dej<fd=d>Z?d4ej<d:e;dej<fd?d@Z@dS )B    )CallableOptionalN)Callback)DistributedDataParallelConfig)	lightning)Llama31Nemotron70BConfigLlamaNemotronModel)finetunepretrain)MockDataModule)PackedSequenceSpecs)PEFT_STR2CLS)default_finetune_recipe)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixed)4userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192)GarbageCollectionCallback)MegatronCommOverlapCallback)TimingCallbackllama31_nemotron_70bnamereturnc                  C   s   t t} d| _t jt| dS )a  
    Factory function to create a Llama-3.1-Nemotron-70B model configuration.
    This can be used for both Llama-3.1-Nemotron-70B-Instruct and Llama-3.1-Nemotron-70B-Reward.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Llama-3.1-Nemotron-70B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=llama31_nemotron_70b ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
        )config)runConfigr   
seq_lengthr   )conf r"   e/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/llama31_nemotron_70b.pymodel,   s   
r$            T   i{ tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	callbacksc
                 C   sb   t jtj| |||||dddt jtddddddd}
t jtjdd|	|ddd||t |
d	d
d}|S )aS  
    Configure the NeMo Lightning Trainer for Llama-3.1-Nemotron-70B model.

    This function sets up the distributed training strategy optimized for the large 70B model.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=llama31_nemotron_70b ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        This configuration uses extensive parallelism to handle the large model size efficiently.
    Tcheck_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gatheraverage_in_collective)
tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpgpu   2       
   Fi  )acceleratoraccumulate_grad_batchesr2   deviceslimit_test_batcheslimit_val_batcheslog_every_n_stepsr1   r/   pluginsstrategyuse_distributed_samplerval_check_interval)r   r   nlMegatronStrategyr   Trainerr   )r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   rO   trainerr"   r"   r#   rU   B   sJ   +rU   )targetr   defaultrD   Fdirr   performance_modefnc                 C   sb   t j|t t||t tgdt jtddddt| |t|ddt	dd	t
 d
}|r/t|}|S )a  
    Create a pre-training recipe for Llama-3.1-Nemotron-70B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory llama31_nemotron_70b
            $ nemo llm pretrain --factory "llama31_nemotron_70b(num_nodes=4, name='my_70b_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="llama31_nemotron_70b_pretrain", num_nodes=4)
            >>> print(recipe)

    Note:
        This recipe is optimized for the large 70B model and requires significant computational resources.
    )r/   r0   r2   r   i   rD   )r    global_batch_sizemicro_batch_sizer   )rX   r   r   ga2U0*3?)max_lr)r$   rU   datalogoptimresume)r   Partialr$   rU   r   r   r   r   r   r   r   "pretrain_performance_optimizations)rX   r   r/   r0   rY   rZ   reciper"   r"   r#   pretrain_recipe   s    &
re   rd   c                 C   s&   | j jtjtdtddddd | S )a  
    Create a performance-optimized pre-training recipe for Llama-3.1-Nemotron-70B model.

    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.

    Args:
        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    TrE   F)tp_comm_overlaptp_comm_overlap_cfgdefer_embedding_wgrad_computewgrad_deferral_limit(overlap_param_gather_with_optimizer_stepalign_param_gather)rU   r2   appendr   r   r   r   )rd   r"   r"   r#   rc      s   rc   instructlora
model_typepeft_schemer    packed_sequencec	                 C   s~  |du r|}|du r|rdnd}|du r)|du s|  dkr!d}n|  dv r)d}|  dkr2d	}	n|  d
kr;d}	ntd| dtt |	| ||||}
|du sX|  dkrhd|
jj_d|
jj_d|
jj	_
n4|  dv rtt|   |
_d|
j_d|
j_d|
jj	_d|
jj	_d|
jj_d|
jj	_
ntd| ||
jj	_||
j_|rddi|
j_tjt|d|
j_|rt|
|}
|
S )a  
    Create a fine-tuning recipe for Llama-3.1-Nemotron-70B model.
    This can be used for both Llama-3.1-Nemotron-70B-Instruct and Llama-3.1-Nemotron-70B-Reward.
    By default it fine-tuned on Llama-3.1-Nemotron-70B-Instruct model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        model_type (str): Type of model to fine-tune. Either 'instruct' or 'reward'.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.
        seq_length (int): Maximum number of tokens per microbatch.
        packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given
            maximum seq_length for better efficiency. By default, this value equals performance_mode.
        performance_mode (bool): If true, enables optimizations for maximum performance.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory llama31_nemotron_70b
            $ nemo llm finetune --factory "llama31_nemotron_70b(num_nodes=4, name='my_70b_finetune')"

        Python API usage:
            >>> recipe = finetune_recipe(name="llama31_nemotron_70b_finetune", num_nodes=4)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning. Be aware that fine-tuning a 70B model
        requires substantial computational resources.
    Ni   i   noner%   )rn   dorarD   rm   z)nvidia/Llama-3.1-Nemotron-70B-Instruct-HFrewardz'nvidia/Llama-3.1-Nemotron-70B-Reward-HFzUnknown model_type z). Expected one of "instruct" or "reward".r(   gh㈵>   rF   Fg-C6?zUnrecognized peft scheme: pad_to_max_lengthT)packed_sequence_size)lower
ValueErrorr   r$   rU   rO   r9   r:   r`   r   lrr   r   r   peftdimalphause_distributed_optimizercross_entropy_loss_fusionr    r^   dataset_kwargsr   packed_sequence_specs"finetune_performance_optimizations)rX   r   r/   r0   ro   rp   r    rq   rY   resume_pathrd   r"   r"   r#   finetune_recipe   sH   4






r   c                 C   s   t | jds
g | j_|du s| dkrEd| jj_d| jj_d| jj_d| jj_	t
jtdddddd| jj_| jjt
jtddd	d
 nd| jj_d| jj_d| jj_dg| j_d| jj_| jjt
t | jjt
tdd | S )a  
    Modify the given recipe to optimize settings for performance.

    This method enables performance optimizations that may not be suitable for all use cases.
    Intended to build upon the standard fine-tuning recipe.

    Args:
        recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.

    Returns:
        run.Partial: Partial configuration for performance-optimized fine-tuning.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    r2   Nrr   r%   r&   FTr3      )rf   rh   ri   r'   
linear_qkvd   )hasattrrU   r2   rx   rO   r9   r:   r<   rN   r5   r   r   r   rB   rl   r   r{   target_modulesr>   r   r   )rd   rp   r"   r"   r#   r   _  sJ   




	



r   )	NrW   Nr(   rm   rn   NNF)Atypingr   r   lightning.pytorchpytorchplnemo_runr   torch$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rR   nemo.collections.llmr   r   nemo.collections.llm.apir	   r
   "nemo.collections.llm.gpt.data.mockr   -nemo.collections.llm.gpt.data.packed_sequencer   nemo.collections.llm.peftr   -nemo.collections.llm.recipes.finetune_defaultr   (nemo.collections.llm.recipes.log.defaultr   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   ;nemo.collections.llm.recipes.tp_overlap_configs.userbuffersr    nemo.lightning.pytorch.callbacksr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   nemo.utils.exp_managerr   NAMEclifactoryr   LightningModuler$   bfloat16intdtypeboollistrT   rU   strrb   re   rc   r   r   r"   r"   r"   r#   <module>   s   	


T9'	
g