o
    wi                     @   s   d dl mZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZ d dlmZ ddejddddddddfd	ed
edeej dee dededededededeeejej   dejej fddZdS )    )OptionalN)DistributedDataParallelConfig)
bf16_mixed   T   i{ tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismexpert_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	callbacksreturnc                 C   sf   t jtj| ||||||dddt jtddddddd}t jtjdd|
|ddd|	|t |d	d
dd}|S )a&  
    Configure the NeMo Lightning Trainer for DeepSeek V2/V3 models.

    This function sets up the distributed training strategy optimized for DeepSeek.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        expert_parallelism (int): Degree of expert parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    T)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gatheraverage_in_collective)tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizeexpert_model_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpgpur   2       
   Fi  r   )acceleratoraccumulate_grad_batchesr   deviceslimit_test_batcheslimit_val_batcheslog_every_n_stepsr   r   pluginsstrategyuse_distributed_samplerval_check_intervalnum_sanity_val_steps)runConfignlMegatronStrategyr   Trainerr   )r   r   r	   r
   r   r   r   r   r   r   r   r.   trainer r8   b/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/recipes/deepseek.pyr7      sN   #r7   )typingr   lightning.pytorchpytorchplnemo_runr2   torchmegatron.core.distributedr   nemo.lightning	lightningr4   6nemo.collections.llm.recipes.precision.mixed_precisionr   bfloat16intdtypeboollistr3   Callbackr6   r7   r8   r8   r8   r9   <module>   sV   	

