o
    }oi18                     @   s:  d dl mZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlm Z  d dl!m"Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ dZ,ej-j.e,ddej/ej0 fddZ1												d<de2de2d eej3 d!ee2 d"e2d#e4d$e2d%e2d&e2d'e2d(e2d)ee5ej/e
   dej/ej6 fd*d+Z7ej-j.ee,d,dd-ddd.efd/ee8 d0e8d&e2d'e2d1e4d2edej9fd3d4Z:d5ej9dej9fd6d7Z;ej-j.ej<e,d,		-			8d=d/ee8 d0e8d&e2d'e2d9ee8 dej9fd:d;Z=dS )>    )CallableOptionalN)Callback)DistributedDataParallelConfig)	lightning)llmvlm)AutoTokenizer)pretrain)nemo_resume)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixed)MockDataModule)GarbageCollectionCallback)MegatronCommOverlapCallback)MegatronTokenDropCallback)TimingCallbackllama4_omni_e16namereturnc                   C   sF   t jtjt jtjt tjt tjt jtjdddddddddS )a  
    Factory function to create a Llama4 16-Experts (Scout) VLM model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Llama4 16-Experts
        (Scout) VLM model model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=llama4_omni_e16 ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    mcore_affinei   i   F)projector_type
input_sizehidden_sizeffn_hidden_sizebiasbias_activation_fusion)language_transformer_configvision_transformer_configvision_projection_config)config)	runConfigr   Llama4OmniModelLlama4ScoutExperts16Configr   Llama4Experts16ConfigLlama4VisionConfigMultimodalProjectorConfig r,   r,   `/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/recipes/llama4_omni_e16.pymodel)   s"   

r.         T          { tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelismexpert_tensor_parallelismexpert_model_parallelism	num_nodesnum_gpus_per_node	max_steps	callbacksc                 C   sj   t jtj| |||||||dddt jtddddddddd}t jtjdd||	dd	d
|
|t |ddd}|S )az  
    Configure the NeMo Lightning Trainer for Llama4 16-Experts (Scout) VLM model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=llama4_e128 ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        For more information on distributed training strategies, refer to the
        NeMo documentation on multi-GPU and multi-node training.
    Toptim_grads_params)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gatheraverage_in_collectivedata_parallel_sharding_strategyN)tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelexpert_tensor_parallel_sizeexpert_model_parallel_sizegradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpfsdpgpur0   2   r2   
   Fi  )acceleratoraccumulate_grad_batchesr@   deviceslimit_test_batcheslimit_val_batcheslog_every_n_stepsr?   r=   pluginsstrategyuse_distributed_samplerval_check_interval)r%   r&   nlMegatronStrategyr   Trainerr   )r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   r_   trainerr,   r,   r-   re   M   sR   .	re   )targetr   defaultFdirr   performance_modefnc                 C   sb   t j|t t||t tgdt jtddddt| |t|ddt	dd	t
 d
}|r/t|}|S )a  
    Create a pre-training recipe for Llama4 128-Experts (Maverick) model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory llama4_e128 ...
            $ nemo llm pretrain --factory "llama4_e128(num_nodes=2, name='my_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="llama4_e128_pretrain", num_nodes=2)
            >>> print(recipe)

    Note:
        For more details on pre-training LLMs with NeMo, see the pre-training
        guide in the `examples/llm/pretrain/` directory.
    )r=   r>   r@       i   r0   )
seq_lengthglobal_batch_sizemicro_batch_sizer   rh   r   r   ga2U0*3?)max_lrr.   re   datalogoptimresume)r%   Partialr.   re   r&   r   Llama4MockDataModuler   r   r   r   "pretrain_performance_optimizations)rh   r   r=   r>   ri   rj   reciper,   r,   r-   pretrain_recipe   s    '
rz   ry   c                 C   sZ   | j jsg | j _tjtddd}tjtdd}tt}| j j|||g d| j j_	| S )a  
    Create a performance-optimized pre-training recipe for Llama4 128-Experts (Maverick) model.

    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.

    Args:
        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    d   )gc_interval_traingc_interval_valTtp_comm_overlapF)
re   r@   r%   r&   r   r   r   extendr^   rC   )ry   garbage_collection_callbackmcomm_overlap_callbacktoken_drop_callbackr,   r,   r-   rx      s,   
rx   nonepeft_schemec                 C   sn  t jtjddddddtjt jtddddddd	}t jtjdd|d	dd
|t |dt t	t jt
ddgd}t jtjt |t jtdddt jtdddddtj| |t|ddtddddtdd}|du sm| dkrd|jj_d|jj_d|jj_d|jj_d|jj_|S | dkrd|jj_d|jj_d|jj_t jtj dg dd |_!d!|jj_|S t"d"| )#a?  
    Create a fine-tuning recipe for Llava1.5 7B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory llama4_omni_e16

        Python API usage:
            >>> recipe = finetune_recipe(name="llama4_omni_e16_finetune", num_nodes=1)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning. For more information
        on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
        `examples/llm/finetune/` directory.
    r/   r1   r0   r   T)rB   rC   rD   rE   rF   )rH   rN   rO   rI   $encoder_pipeline_model_parallel_sizerM   rJ   rS   rU   rW   iF  i  r~   )rX   rY   rZ   r\   r]   r?   r=   r^   r_   ra   r@   rk      z)meta-llama/Llama-4-Scout-17B-16E-Instruct)pretrained_model_nameN)rl   rm   rn   	tokenizerimage_processornum_workersr   ro   gh㈵>gH׊>   )rp   min_lrwarmup_stepsrq   r   r3   loraF)z*.language_model.*.linear_qkvz*.language_model.*.linear_qz*.language_model.*.linear_kvz*.language_model.*.linear_projz*.language_model.*.linear_fc1z*.language_model.*.linear_fc2)freeze_vision_modeltarget_modulesg-C6?zUnrecognized peft scheme: )#r%   r&   rb   rc   torchbfloat16r   rd   r   r   r   rv   r   finetuner.   rw   r	   r   r   r   r   lowerre   r_   rM   rH   !expert_tensor_model_parallel_sizerI   rt   r$   lrr   LoRApeft
ValueError)rh   r   r=   r>   r   r_   re   ry   r,   r,   r-   finetune_recipe  s   &	








r   )r/   r0   NNr0   Tr/   r1   r2   r3   r4   N)Nrg   r2   r3   r   )>typingr   r   lightning.pytorchpytorchplnemo_runr%   r   $lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rb   nemo.collectionsr   r   "nemo.collections.common.tokenizersr	   nemo.collections.llm.apir
   -nemo.collections.llm.recipes.finetune_defaultr   (nemo.collections.llm.recipes.log.defaultr   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   %nemo.collections.vlm.llama4.data.mockr   rw   3nemo.lightning.pytorch.callbacks.garbage_collectionr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   /nemo.lightning.pytorch.callbacks.moe_token_dropr   nemo.utils.exp_managerr   NAMEclifactoryr&   LightningModuler.   intdtypeboollistrd   re   strrv   rz   rx   r   r   r,   r,   r,   r-   <module>   s   $	


[:-