o
    wiJ2                     @   s  d dl mZmZ d dlmZ d dlZd dlZd dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& dZ'ej(j)e'dd2dej*ej+ fddZ,ej(j)ee'dddddeddfdee- de-de.de.ded e/d!e/dej0fd"d#Z1d$ej0dej0fd%d&Z2ej(j)e
e'd		'		(		)			d3dee- d*e-de-de.de.d+ee- d,ee. d-ee/ d!e/dej0fd.d/Z3d$ej0dej0fd0d1Z4dS )4    )CallableOptionalN)finetunepretrain)MockDataModule)PackedSequenceSpecs)DeepSeekModelDeepSeekV3Config)PEFT_STR2CLS)trainer)default_finetune_recipe)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)DeepEPCallback)GarbageCollectionCallback)MegatronCommOverlapCallback)TimingCallbackdeepseek_v3nameFreturnc                 C   s.   | rt jtddd}nt t}t jt|dS )ak  
    Factory function to create a DeepSeek-V3 (671B) model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the DeepSeek V3 model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=deepseek_v3 ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
       g?)mtp_num_layersmtp_loss_scaling_factor)config)runConfigr	   r   )use_mtpconf r!   e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/recipes/deepseek_v3.pymodel%   s   
r#   )targetr   default      Tdirr   	num_nodesnum_gpus_per_nodefnr   performance_modec                 C   sv  t j|t|tddd||t tgdt jtddddt| |t|ddt	d	d
t
 d}d|jj_d|jj_d|jj_d|jj_d|jj_d|jjj_d|jj_ddlm} t j|dd|jj_d|jj_ddg|jj_t t}	d|jj_tj |jj_!tj"|jj_#tj"|jj_$tj"|jj_%t jt&ddd}
t jt'dd}|jj()|	 |jj()|
 |jj()| |rt*|}|S )a  
    Create a pre-training recipe for DeepSeek-V3 (671B) model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory deepseek_v3
            $ nemo llm pretrain --factory "deepseek_v3(num_nodes=128, name='my_deepseek_v3')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="deepseek_v3_pretrain", num_nodes=128)
            >>> print(recipe)

    r      @   )tensor_parallelismpipeline_parallelismexpert_parallelismr)   r*   	callbacksi   )
seq_lengthglobal_batch_sizemicro_batch_sizer   )r(   r   r   ga2U0*3?)max_lr)r#   r   datalogoptimresume      NFr   )	timedelta<   )minutes	selectivemla_up_proj	layernormT   gc_interval_traingc_interval_valtp_comm_overlap)+r   Partialr#   r   r   r   r   r   r   r   r   strategy"num_layers_in_first_pipeline_stage!num_layers_in_last_pipeline_stage$virtual_pipeline_model_parallel_sizeexpert_tensor_parallel_sizetensor_model_parallel_sizeddpgrad_reduce_in_fp32r8   ckpt
save_top_kdatetimer=   train_time_intervalr   recompute_granularityrecompute_modulesr   r9   use_precision_aware_optimizertorchfloat32main_params_dtypebfloat16main_grads_dtypeexp_avg_dtypeexp_avg_sq_dtyper   r   r2   append"pretrain_performance_optimizations)r(   r   r)   r*   r+   r   r,   reciper=   deepep_callbackgarbage_collection_callbackcomm_overlap_callbackr!   r!   r"   pretrain_recipe<   s`   %









rf   rb   c                 C   ^   t | jdr| jjdu rg | j_tjtddd}tjtdd}| jj||g d| jj_	| S )a  
    Create a performance-optimized pre-training recipe for DeepSeek-V3 (671B) model.

    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.

    Args:
        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    r2   Nr>   rD   FrG   
hasattrr   r2   r   r   r   r   extendpluginsrQ   rb   rd   re   r!   r!   r"   ra      s$   
ra   deepseek-ai/DeepSeek-V3-BaserC   loraresume_pathpeft_schemer3   packed_sequencec	           
      C   s  |du rd}|du r|du s|  dkrd}n|  dv rd}tt || ||||}	|du s4|  dkrSd|	jj_d|	jj_d|	jj_d	|	jj_d
|	jj_	d|	j
j_nK|  dv rtt|   |	_g d|	j_d|	j
j_d|	jj_d|	jj_d|	jj_d|	jj_d|	jj_d|	jj_d|	jj_	d|	j
j_ntd| ||	jj_||	j_|rddi|	j_tjt|d|	j_|rt|	}	|	S )a`  
    Create a fine-tuning recipe for DeepSeek-V3 (671B) model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        resume_path (str): Path to the NeMo checkpoint
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.
        seq_length (int): Maximum number of tokens per microbatch.
        packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given
            maximum seq_length for better efficiency. By default, this value equals performance_mode.
    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory deepseek_v3
            $ nemo llm finetune --factory "deepseek_v3(num_nodes=5, name='my_deepseek_v3_finetune')"

        Python API usage:
            >>> recipe = finetune_recipe(name="deepseek_v3_finetune", num_nodes=6)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning. Be aware that fine-tuning the DeepSeek-V3 model
        requires substantial computational resources.
    Ni   noner.   )rn   dorarC   r   r'         gh㈵>)linear_q_down_projlinear_q_up_projlinear_kv_down_projlinear_kv_up_projlinear_projFT      g-C6?zUnrecognized peft scheme: pad_to_max_length)packed_sequence_size)lowerr   r#   r   rJ   expert_model_parallel_sizerO   pipeline_model_parallel_sizerK   rL   r9   r   lrr   r   r
   pefttarget_modulesuse_distributed_optimizercross_entropy_loss_fusionsequence_parallel
ValueErrorr3   r7   dataset_kwargsr   packed_sequence_specs"finetune_performance_optimizations)
r(   ro   r   r)   r*   rp   r3   rq   r,   rb   r!   r!   r"   finetune_recipe   sH   /













r   c                 C   rg   )a]  
    Modify the given recipe to optimize settings for performance.

    This method enables performance optimizations that may not be suitable for all use cases.
    Intended to build upon the standard fine-tuning recipe.

    Args:
        recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added

    Returns:
        run.Partial: Partial configuration for performance-optimized fine-tuning.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    r2   Nr>   rD   FrG   rh   rl   r!   r!   r"   r   *  s$   
r   )F)	Nrm   r%   rC   r'   rn   NNF)5typingr   r   lightning.pytorchpytorchplnemo_runr   rY   nemo.collections.llm.apir   r   "nemo.collections.llm.gpt.data.mockr   -nemo.collections.llm.gpt.data.packed_sequencer   'nemo.collections.llm.gpt.model.deepseekr   r	   nemo.collections.llm.peftr
   %nemo.collections.llm.recipes.deepseekr   -nemo.collections.llm.recipes.finetune_defaultr   (nemo.collections.llm.recipes.log.defaultr   r   r   'nemo.collections.llm.recipes.optim.adamr   'nemo.lightning.pytorch.callbacks.deepepr   3nemo.lightning.pytorch.callbacks.garbage_collectionr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   nemo.utils.exp_managerr   NAMEclifactoryr   LightningModuler#   strintboolrI   rf   ra   r   r   r!   r!   r!   r"   <module>   s   b)	
a