from typing import Callable, Optional

import lightning.pytorch as pl
import nemo_run as run
import torch
from lightning.pytorch.callbacks.callback import Callback
from megatron.core.distributed import DistributedDataParallelConfig

from nemo import lightning as nl
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x22B, MixtralModel
from nemo.collections.llm.peft import PEFT_STR2CLS
from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback
from nemo.utils.exp_manager import TimingCallback

NAME = "mixtral_8x22b"


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
    """
    Factory function to create a Mixtral 8x22B model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Mixtral 8x22B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=mixtral_8x22b ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    """
    return run.Config(MixtralModel, config=run.Config(MixtralConfig8x22B))


def trainer(
    # Default parallelism/scale values are representative settings for Mixtral 8x22B
    # pre-training; tune them to the cluster you are actually running on.
    tensor_parallelism: int = 8,
    pipeline_parallelism: int = 8,
    pipeline_parallelism_type: Optional[torch.dtype] = torch.bfloat16,
    virtual_pipeline_parallelism: Optional[int] = 7,
    context_parallelism: int = 1,
    sequence_parallelism: bool = True,
    expert_parallelism: int = 1,
    num_nodes: int = 16,
    num_gpus_per_node: int = 8,
    max_steps: int = 1168251,
    callbacks: Optional[list[run.Config[Callback]]] = None,
) -> run.Config[nl.Trainer]:
    """
    Configure the NeMo Lightning Trainer for Mixtral 8x22B model.

    This function sets up the distributed training strategy optimized for the large Mixtral 8x22B model.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        expert_parallelism (int): Degree of expert parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=mixtral_8x22b ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=16, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        This configuration uses extensive parallelism to handle the large model size efficiently.
    """
    strategy = run.Config(
        nl.MegatronStrategy,
        tensor_model_parallel_size=tensor_parallelism,
        pipeline_model_parallel_size=pipeline_parallelism,
        pipeline_dtype=pipeline_parallelism_type,
        virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism,
        context_parallel_size=context_parallelism,
        sequence_parallel=sequence_parallelism,
        expert_model_parallel_size=expert_parallelism,
        gradient_as_bucket_view=True,
        ckpt_async_save=True,
        ckpt_parallel_load=True,
        ddp=run.Config(
            DistributedDataParallelConfig,
            check_for_nan_in_grad=True,
            grad_reduce_in_fp32=True,
            overlap_grad_reduce=True,
            overlap_param_gather=True,
            average_in_collective=True,
        ),
    )

    trainer = run.Config(
        nl.Trainer,
        accelerator="gpu",
        accumulate_grad_batches=1,
        callbacks=callbacks,
        devices=num_gpus_per_node,
        # Evaluation/logging cadence below follows the values commonly used across
        # the NeMo pretraining recipes.
        limit_test_batches=50,
        limit_val_batches=32,
        log_every_n_steps=10,
        max_steps=max_steps,
        num_nodes=num_nodes,
        plugins=run.Config(nl.MegatronMixedPrecision, precision="bf16-mixed"),
        strategy=strategy,
        use_distributed_sampler=False,
        val_check_interval=2000,
    )

    return trainer


@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
    dir: Optional[str] = None,
    name: str = "default",
    num_nodes: int = 16,
    num_gpus_per_node: int = 8,
    performance_mode: bool = False,
    fn: Callable = pretrain,
) -> run.Partial:
    """
    Create a pre-training recipe for Mixtral 8x22B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory mixtral_8x22b
            $ nemo llm pretrain --factory "mixtral_8x22b(num_nodes=16, name='my_mixtral_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="mixtral_pretrain", num_nodes=16)
            >>> print(recipe)
    """
    recipe = run.Partial(
        fn,
        model=model(),
        trainer=trainer(
            num_nodes=num_nodes,
            num_gpus_per_node=num_gpus_per_node,
            callbacks=[run.Config(TimingCallback)],
        ),
        # Mock data sizes are representative; swap in a real datamodule for actual runs.
        data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1),
        log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
        optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
        resume=default_resume(),
    )

    if performance_mode:
        recipe = pretrain_performance_optimizations(recipe)

    return recipe


def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
    """
    Create a performance-optimized pre-training recipe for Mixtral 8x22B model.

    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.

    Args:
        recipe (run.Partial): Base pre-training recipe to which performance optimizations will be added.

    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    """
    if not recipe.trainer.callbacks:
        recipe.trainer.callbacks = []

    garbage_collection_callback = run.Config(
        GarbageCollectionCallback,
        gc_interval_train=100,
        gc_interval_val=100,
    )
    mcomm_overlap_callback = run.Config(
        MegatronCommOverlapCallback,
        overlap_param_gather_with_optimizer_step=False,
    )
    recipe.trainer.callbacks.extend(
        [
            run.Config(MegatronTokenDropCallback),
            garbage_collection_callback,
            mcomm_overlap_callback,
        ]
    )

    # Keeping gradient reduction in bf16 trades a little numerical headroom for speed.
    recipe.trainer.plugins.grad_reduce_in_fp32 = False

    return recipe


@run.cli.factory(target=finetune, name=NAME)
def finetune_recipe(
    dir: Optional[str] = None,
    name: str = "default",
    num_nodes: int = 2,
    num_gpus_per_node: int = 8,
    peft_scheme: Optional[str] = 'lora',
    packed_sequence: bool = False,
) -> run.Partial:
    """
    Create a fine-tuning recipe for Mixtral 8x22B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.
        packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given
            maximum seq_length for better efficiency.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory mixtral_8x22b
            $ nemo llm finetune --factory "mixtral_8x22b(num_nodes=2, name='my_mixtral_finetune')"

        Python API usage:
            >>> recipe = finetune_recipe(name="mixtral_finetune", num_nodes=2)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
    """
    recipe = default_finetune_recipe(
        model(), "mistralai/Mixtral-8x22B-v0.1", dir, name, num_nodes, num_gpus_per_node, packed_sequence
    )
    # Parallelism and learning-rate values below are representative for the 8-expert
    # Mixtral architecture; adjust them to the hardware you are fine-tuning on.
    recipe.trainer.strategy.tensor_model_parallel_size = 8
    recipe.trainer.strategy.expert_model_parallel_size = 8
    if peft_scheme is None or peft_scheme.lower() == 'none':
        # Full-parameter fine-tuning: shard further across pipeline stages and use a small LR.
        recipe.trainer.strategy.pipeline_model_parallel_size = 4
        recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 14
        recipe.optim.config.lr = 5e-6
    elif peft_scheme.lower() in ['lora', 'dora']:
        recipe.peft = run.Config(
            PEFT_STR2CLS[peft_scheme.lower()],
            target_modules=['linear_qkv', 'linear_proj'],
            dim=32,
        )
        recipe.optim.config.lr = 1e-4
    else:
        raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")

    return recipe