o
    }oiO8                     @   s:  d dl mZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlm Z  d dl!m"Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ dZ,ej-j.e,ddej/ej0 fddZ1												d<de2de2d eej3 d!ee2 d"e2d#e2d$e2d%e4d&e2d'e2d(e2d)ee5ej/e
   dej/ej6 fd*d+Z7ej-j.ee,d,dd-ddd.efd/ee8 d0e8d&e2d'e2d1e4d2edej9fd3d4Z:d5ej9dej9fd6d7Z;ej-j.ej<e,d,		-			8d=d/ee8 d0e8d&e2d'e2d9ee8 dej9fd:d;Z=dS )>    )CallableOptionalN)Callback)DistributedDataParallelConfig)	lightning)llmvlm)AutoTokenizer)pretrain)nemo_resume)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixed)MockDataModule)GarbageCollectionCallback)MegatronCommOverlapCallback)MegatronTokenDropCallback)TimingCallbackllama4_omni_e128namereturnc                   C   sF   t jtjt jtjt tjt tjt jtjdddddddddS )a  
    Factory function to create a Llama4 128-Experts (Maverick) VLM model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Llama4 128-Experts
        (Maverick) VLM model model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=llama4_omni_e128 ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    mcore_affinei   i   F)projector_type
input_sizehidden_sizeffn_hidden_sizebiasbias_activation_fusion)language_transformer_configvision_transformer_configvision_projection_config)config)	runConfigr   Llama4OmniModelLlama4MaverickExperts128Configr   Llama4Experts128ConfigLlama4VisionConfigMultimodalProjectorConfig r,   r,   a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/recipes/llama4_omni_e128.pymodel)   s"   

r.            T@      { tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismexpert_tensor_parallelismexpert_model_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	callbacksc                 C   sj   t jtj| |||||||dddt jtddddddddd}t jtjdd||	dd	d
|
|t |ddd}|S )a~  
    Configure the NeMo Lightning Trainer for Llama4 128-Experts (Maverick) VLM model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=llama4_e128 ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        For more information on distributed training strategies, refer to the
        NeMo documentation on multi-GPU and multi-node training.
    Toptim_grads_params)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gatheraverage_in_collectivedata_parallel_sharding_strategyN)tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelexpert_tensor_parallel_sizeexpert_model_parallel_sizegradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpfsdpgpur0   2       
   Fi  )acceleratoraccumulate_grad_batchesr@   deviceslimit_test_batcheslimit_val_batcheslog_every_n_stepsr?   r=   pluginsstrategyuse_distributed_samplerval_check_interval)r%   r&   nlMegatronStrategyr   Trainerr   )r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   r`   trainerr,   r,   r-   rf   M   sR   .	rf   )targetr   defaultFdirr   performance_modefnc                 C   sb   t j|t t||t tgdt jtddddt| |t|ddt	dd	t
 d
}|r/t|}|S )a  
    Create a pre-training recipe for Llama4 128-Experts (Maverick) model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory llama4_e128 ...
            $ nemo llm pretrain --factory "llama4_e128(num_nodes=2, name='my_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="llama4_e128_pretrain", num_nodes=2)
            >>> print(recipe)

    Note:
        For more details on pre-training LLMs with NeMo, see the pre-training
        guide in the `examples/llm/pretrain/` directory.
    )r=   r>   r@       i   r0   )
seq_lengthglobal_batch_sizemicro_batch_sizer   ri   r   r   ga2U0*3?)max_lrr.   rf   datalogoptimresume)r%   Partialr.   rf   r&   r   Llama4MockDataModuler   r   r   r   "pretrain_performance_optimizations)ri   r   r=   r>   rj   rk   reciper,   r,   r-   pretrain_recipe   s    '
r{   rz   c                 C   sZ   | j jsg | j _tjtddd}tjtdd}tt}| j j|||g d| j j_	| S )a  
    Create a performance-optimized pre-training recipe for Llama4 128-Experts (Maverick) model.

    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.

    Args:
        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    d   )gc_interval_traingc_interval_valTtp_comm_overlapF)
rf   r@   r%   r&   r   r   r   extendr_   rC   )rz   garbage_collection_callbackmcomm_overlap_callbacktoken_drop_callbackr,   r,   r-   ry      s,   
ry   nonepeft_schemec                 C   sn  t jtjddddddtjt jtddddddd	}t jtjdd|d	dd
|t |dt t	t jt
ddgd}t jtjt |t jtdddt jtdddddtj| |t|ddtddddtdd}|du sm| dkrd|jj_d|jj_d|jj_d|jj_|S | dkrd|jj_d|jj_d|jj_d|jj_t jt j!dg dd |_"d!|jj_|S t#d"| )#aA  
    Create a fine-tuning recipe for Llava1.5 7B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory llama4_omni_e128

        Python API usage:
            >>> recipe = finetune_recipe(name="llama4_omni_e128_finetune", num_nodes=1)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning. For more information
        on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
        `examples/llm/finetune/` directory.
    r/   r1   r0   r   T)rB   rC   rD   rE   rF   )rH   rN   rO   rI   $encoder_pipeline_model_parallel_sizerM   rJ   rS   rU   rX   iF  i  r   )rY   rZ   r[   r]   r^   r?   r=   r_   r`   rb   r@   rl   z)meta-llama/Llama-4-Scout-17B-16E-Instruct)pretrained_model_nameN)rm   rn   ro   	tokenizerimage_processornum_workersr   rp   gh㈵>gH׊>   )rq   min_lrwarmup_stepsrr   r   rW   lorar3   F)z*.language_model.*.linear_qkvz*.language_model.*.linear_qz*.language_model.*.linear_kvz*.language_model.*.linear_projz*.language_model.*.linear_fc1z*.language_model.*.linear_fc2)freeze_vision_modeltarget_modulesg-C6?zUnrecognized peft scheme: )$r%   r&   rc   rd   torchbfloat16r   re   r   r   r   rw   r   finetuner.   rx   r	   r   r   r   r   lowerrf   r`   rH   !expert_tensor_model_parallel_sizerO   ru   r$   lrrM   rI   r   LoRApeft
ValueError)ri   r   r=   r>   r   r`   rf   rz   r,   r,   r-   finetune_recipe  s   &	








r   )r/   r0   NNr0   r/   r1   Tr2   r3   r4   N)Nrh   r2   r3   r   )>typingr   r   lightning.pytorchpytorchplnemo_runr%   r   $lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rc   nemo.collectionsr   r   "nemo.collections.common.tokenizersr	   nemo.collections.llm.apir
   -nemo.collections.llm.recipes.finetune_defaultr   (nemo.collections.llm.recipes.log.defaultr   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   %nemo.collections.vlm.llama4.data.mockr   rx   3nemo.lightning.pytorch.callbacks.garbage_collectionr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   /nemo.lightning.pytorch.callbacks.moe_token_dropr   nemo.utils.exp_managerr   NAMEclifactoryr&   LightningModuler.   intdtypeboollistre   rf   strrw   r{   ry   r   r   r,   r,   r,   r-   <module>   s   $	


[:-