o
    wi                  $   @   s*  d dl mZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZmZmZmZmZmZ d dlmZmZ ded	ejej fd
dZ																d'dededeej dee dededededededed ed!ed"ed#ed$ee eje	   d	ejej! f"d%d&Z"dS )(    )OptionalN)Callback)DistributedDataParallelConfig)	lightning)Nemotron3Config4BNemotron3Config8BNemotron3Config22BNemotron4Config15BNemotron4Config340BNemotronModel)
bf16_mixed
fp16_mixedversionreturnc                 C   s   d}| dkrt t}nW| dkrt t}nM| dkr t t}nC| dkr,t jtdd}n7| dkr8t jtd	d}n+| d
krBt t}n!| dkrNt jtdd}n| dkrZt jtd	d}n	| dkrct t}|dusnJ d|  t jt|dS )a  
    A function to create a Nemotron models.

    Args:
        version (str): The version of the Nemotron model to create. one of ["nemotron3_4b", "nemotron3_8b",            "nemotron3_22b", "nemotron3_22b_16k", "nemotron3_22b_64k",
            "nemotron4_15b", "nemotron4_15b_16k", "nemotron4_15b_64k",
            "nemotron4_340b"].

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Nemotron model.
    Nnemotron3_4bnemotron3_8bnemotron3_22bnemotron3_22b_16ki @  )
seq_lengthnemotron3_22b_64ki   nemotron4_15bnemotron4_15b_16knemotron4_15b_64knemotron4_340bzInvalid version: )config)runConfigr   r   r   r	   r
   r   )r   r    r   b/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/recipes/nemotron.pynemotron_model#   s*   
r         F   { 
bf16-mixed    
     tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	precisionaccumulate_grad_batcheslimit_test_batcheslimit_val_batcheslog_every_n_stepsval_check_interval	callbacksc                 C   s   t jtj| |||||ddddt jtddddddd}d}|	dkr%t }n|	dkr,t }t jtjd|||
|||||||d|d	}|S )
a  
    Configure the NeMo Lightning Trainer for Nemotron models.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        precision (str): Precision configuration, one of fp32, 16-mixed or bf16-mixed.
        accumulate_grad_batches (int): Number of steps per gradient accumulation.
        limit_test_batches (int): Limit the number of test batches.
        limit_val_batches (int): Limit the number of validation batches.
        log_every_n_steps (int): Log every n steps.
        val_check_interval (int): Run validation every N steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.
    T)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gatheraverage_in_collective)tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_include_optimizerckpt_async_saveckpt_parallel_loadddpNz16-mixedr$   gpuF)acceleratorr7   devicesr2   r3   r4   r5   r0   r.   pluginsstrategyuse_distributed_samplerr6   )r   r   nlMegatronStrategyr   r   r   Trainer)r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   rL   precision_plugintrainerr   r   r   nemotron_trainerH   sV   ,rS   )r    r!   NNr!   Fr!   r"   r#   r$   r!   r%   r%   r&   r'   N)#typingr   lightning.pytorchpytorchplnemo_runr   torch$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rN   'nemo.collections.llm.gpt.model.nemotronr   r   r   r	   r
   r   6nemo.collections.llm.recipes.precision.mixed_precisionr   r   strr   LightningModuler   intdtypeboollistrP   rS   r   r   r   r   <module>   sz    &	

