o
    }o™iû<  ã                   @   s‚  d dl mZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- dZ.ej/j0e.ddej1ej2 fdd„ƒZ3												dCde4d e4d!eej5 d"ee4 d#e4d$e6d%e4d&e4d'e4d(e4d)e4d*ee7ej1e
   dej1ej8 fd+d,„Z9ej/j0ee.d-dd.ddd/efd0ee: d1e:d'e4d(e4d2e6d3edej;fd4d5„ƒZ<d6ej;dej;fd7d8„Z=ej/j0ee.d-		9	.			:			/dDd0ee: d;e:d1e:d'e4d(e4d<ee: d=ee4 d>ee6 d2e6dej;fd?d@„ƒZ>d6ej;d<e:dej;fdAdB„Z?dS )Eé    )ÚCallableÚOptionalN)ÚCallback)ÚDistributedDataParallelConfig)Ú	lightning)ÚfinetuneÚpretrain)ÚMockDataModule)ÚPackedSequenceSpecs)ÚLlama4Experts128ConfigÚ
LlamaModel)ÚPEFT_STR2CLS)Údefault_finetune_recipe)Údefault_logÚdefault_resumeÚtensorboard_logger)Ú,distributed_fused_adam_with_cosine_annealing)Ú
bf16_mixed)ÚGarbageCollectionCallback)ÚMegatronCommOverlapCallback)ÚMegatronTokenDropCallback)ÚTimingCallbackÚllama4_e128©ÚnameÚreturnc                   C   s   t jtt  t¡dS )aˆ  
    Factory function to create a Llama4 128-Experts (Maverick) model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Llama4 128-Experts (Maverick) model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=llama4_e128 ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    )Úconfig)ÚrunÚConfigr   r   © r   r   ú\/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/llama4_e128.pyÚmodel*   s   r!   é   é   Té€   é@   é   é{Ó Útensor_parallelismÚpipeline_parallelismÚpipeline_parallelism_typeÚvirtual_pipeline_parallelismÚcontext_parallelismÚsequence_parallelismÚexpert_tensor_parallelismÚexpert_model_parallelismÚ	num_nodesÚnum_gpus_per_nodeÚ	max_stepsÚ	callbacksc                 C   sj   t jtj| |||||||dddt jtddddddddd}t jtjdd||	dd	d
|
|tƒ |ddd}|S )az  
    Configure the NeMo Lightning Trainer for Llama4 128-Experts (Maverick) model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=llama4_e128 ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        For more information on distributed training strategies, refer to the
        NeMo documentation on multi-GPU and multi-node training.
    TÚoptim_grads_params)Úcheck_for_nan_in_gradÚgrad_reduce_in_fp32Úoverlap_grad_reduceÚoverlap_param_gatherÚaverage_in_collectiveÚdata_parallel_sharding_strategyN)Útensor_model_parallel_sizeÚpipeline_model_parallel_sizeÚpipeline_dtypeÚ$virtual_pipeline_model_parallel_sizeÚcontext_parallel_sizeÚsequence_parallelÚexpert_tensor_parallel_sizeÚexpert_model_parallel_sizeÚgradient_as_bucket_viewÚckpt_async_saveÚckpt_parallel_loadÚddpÚfsdpÚgpur#   é2   é    é
   FiÐ  )ÚacceleratorÚaccumulate_grad_batchesr3   ÚdevicesÚlimit_test_batchesÚlimit_val_batchesÚlog_every_n_stepsr2   r0   ÚpluginsÚstrategyÚuse_distributed_samplerÚval_check_interval)r   r   ÚnlÚMegatronStrategyr   ÚTrainerr   )r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   rS   Útrainerr   r   r    rY   =   sR   .ù	êòrY   )Útargetr   ÚdefaultFÚdirr   Úperformance_modeÚfnc                 C   sb   t j|tƒ t||t  t¡gdt jtddddt| |t|ddt	dd	t
ƒ d
}|r/t|ƒ}|S )a‘  
    Create a pre-training recipe for Llama4 128-Experts (Maverick) model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory llama4_e128 ...
            $ nemo llm pretrain --factory "llama4_e128(num_nodes=2, name='my_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="llama4_e128_pretrain", num_nodes=2)
            >>> print(recipe)

    Note:
        For more details on pre-training LLMs with NeMo, see the pre-training
        guide in the `examples/llm/pretrain/` directory.
    )r0   r1   r3   i    i   r#   )Ú
seq_lengthÚglobal_batch_sizeÚmicro_batch_sizer   )r\   r   r   ga2U0*©3?)Úmax_lr)r!   rY   ÚdataÚlogÚoptimÚresume)r   ÚPartialr!   rY   r   r   r	   r   r   r   r   Ú"pretrain_performance_optimizations)r\   r   r0   r1   r]   r^   Úreciper   r   r    Úpretrain_recipe˜   s    '
ýõrj   ri   c                 C   sZ   | j jsg | j _tjtddd}tjtdd}t t¡}| j j |||g¡ d| j j_	| S )a   
    Create a performance-optimized pre-training recipe for Llama4 128-Experts (Maverick) model.

    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.

    Args:
        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    éd   )Úgc_interval_trainÚgc_interval_valT©Útp_comm_overlapF)
rY   r3   r   r   r   r   r   ÚextendrR   r6   )ri   Úgarbage_collection_callbackÚmcomm_overlap_callbackÚtoken_drop_callbackr   r   r    rh   Ó   s,   ýþÿýÿ
rh   ú-meta-llama/Llama-4-Maverick-17B-128E-InstructÚloraÚresume_pathÚpeft_schemer_   Úpacked_sequencec	           
      C   s@  |du r|}|du r|rdnd}t tƒ || ||||ƒ}	|du s%| ¡ dkr:d|	jj_d|	jj_d|	jj_d|	jj	_
nC| ¡ dv rvd	|	jj_d
|	jj_d
|	jj_d|	jj_t t| ¡  ¡|	_d
|	j_d|	j_d|	jj	_d|	jj	_d|	jj	_
ntd|› ƒ‚||	jj	_||	j_|r—dd	i|	j_tjt|d|	j_|ržt|	|ƒ}	|	S )aL  
    Create a fine-tuning recipe for Llama4 128-Experts (Maverick) model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.
        seq_length (int): Maximum number of tokens per microbatch.
        packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given
            maximum seq_length for better efficiency. By default, this value equals performance_mode.
        performance_mode (bool): If true, enables optimizations for maximum performance.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory llama4_e128

        Python API usage:
            >>> recipe = finetune_recipe(name="llama4_e128_finetune", num_nodes=2)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning. For more information
        on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
        `examples/llm/finetune/` directory.
    Ni   i   Únoner"   rJ   gñhãˆµøÔ>)ru   ÚdoraTr&   é   Fg-Cëâ6?zUnrecognized peft scheme: Úpad_to_max_length)Úpacked_sequence_size)r   r!   ÚlowerrY   rS   r;   Ú!expert_tensor_model_parallel_sizerB   re   r   Úlrr@   r<   r   r   r   ÚpeftÚdimÚalphaÚuse_distributed_optimizerÚcross_entropy_loss_fusionÚ
ValueErrorr_   rc   Údataset_kwargsr
   Úpacked_sequence_specsÚ"finetune_performance_optimizations)
r\   rv   r   r0   r1   rw   r_   rx   r]   ri   r   r   r    Úfinetune_recipe   sL   1ù	






rŠ   c                 C   s¸   d| j j_t| j dƒr| j jdu rg | j _|du s| ¡ dkr.tjtdddddd| j j_	ndg| j
_d| j j_| j j tjtdd	¡ | j j t t¡¡ | j j t td
d
¡¡ | S )aé  
    Modify the given recipe to optimize settings for performance.

    This method enables performance optimizations that may not be suitable for all use cases.
    Intended to build upon the standard fine-tuning recipe.

    Args:
        recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.

    Returns:
        run.Partial: Partial configuration for performance-optimized fine-tuning.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    r#   r3   Nry   TF)r5   r6   r7   r8   r9   Ú
linear_qkvrn   rk   )rY   rS   r;   Úhasattrr3   r~   r   r   r   rF   r   Útarget_modulesrR   r6   Úappendr   r   r   )ri   rw   r   r   r    r‰   d  s:   
ú
	
þÿýÿr‰   )r"   r#   NNr#   Tr"   r$   r%   r&   r'   N)	Nrt   r[   r#   r&   ru   NNF)@Útypingr   r   Úlightning.pytorchÚpytorchÚplÚnemo_runr   ÚtorchÚ$lightning.pytorch.callbacks.callbackr   Úmegatron.core.distributedr   Únemor   rV   Únemo.collections.llm.apir   r   Ú"nemo.collections.llm.gpt.data.mockr	   Ú-nemo.collections.llm.gpt.data.packed_sequencer
   Ú$nemo.collections.llm.gpt.model.llamar   r   Únemo.collections.llm.peftr   Ú-nemo.collections.llm.recipes.finetune_defaultr   Ú(nemo.collections.llm.recipes.log.defaultr   r   r   Ú'nemo.collections.llm.recipes.optim.adamr   Ú6nemo.collections.llm.recipes.precision.mixed_precisionr   Ú3nemo.lightning.pytorch.callbacks.garbage_collectionr   Ú6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   Ú/nemo.lightning.pytorch.callbacks.moe_token_dropr   Únemo.utils.exp_managerr   ÚNAMEÚcliÚfactoryr   ÚLightningModuler!   ÚintÚdtypeÚboolÚlistrX   rY   Ústrrg   rj   rh   rŠ   r‰   r   r   r   r    Ú<module>   sö   ôÿþýüûúùø	÷
öõô

ó[úÿþýüûúù:-÷ÿþýüûúùø	÷
öcÿþý