o
    wi                  *   @   sH  d dl mZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZmZ ded	ejej fd
dZ																			d)de de deej! dee  de de"de de"de"de de de d ed!e d"e d#e d$e d%e d&ee#eje	   d	ejej$ f(d'd(Z%dS )*    )OptionalN)Callback)DistributedDataParallelConfig)	lightning)	Qwen3Config1P7BQwen3Config4BQwen3Config8BQwen3Config14BQwen3Config30B_A3BQwen3Config32BQwen3Config235B_A22BQwen3Config600M
Qwen3Model)
bf16_mixed
fp16_mixedversionreturnc                 C   s   d}| dkrt t}nE| dkrt t}n;| dkr t t}n1| dkr*t t}n'| dkr4t t}n| dkr>t t}n| dkrHt t}n	| d	krQt t	}|dus\J d
|  t jt
|dS )aZ  
    A function to create a qwen3 models.

    Args:
        version (str): The version of the qwen3 model to create. One of ["qwen3_600m", "qwen3_1p7b",
            "qwen3_4b", "qwen3_8b", "qwen3_14b", "qwen3_32b", "qwen3_30b_a3b", "qwen3_235b_a22b"].

    Returns:
        run.Config[pl.LightningModule]: Configuration for the qwen3 model.
    N
qwen3_600m
qwen3_1p7bqwen3_4bqwen3_8b	qwen3_14b	qwen3_32bqwen3_30b_a3bqwen3_235b_a22bzInvalid version: )config)runConfigr   r   r   r   r	   r   r
   r   r   )r   r    r   _/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/recipes/qwen3.pyqwen3_model&   s&   
r       F   { 
bf16-mixed    
     tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelismexpert_parallelism'account_for_embedding_in_pipeline_split"account_for_loss_in_pipeline_split	num_nodesnum_gpus_per_node	max_steps	precisionaccumulate_grad_batcheslimit_test_batcheslimit_val_batcheslog_every_n_stepsval_check_interval	callbacksc                 C   s   t jtjf| ||||||d||ddddt jtdddddddd}d}|dkr,t }n|dkr3t }t jtjd	||
||||||	||d
|d}|S )a  
    Configure the NeMo Lightning Trainer for qwen3 models.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        expert_parallelism (Optional[int]): Degree of expert parallelism.
        account_for_embedding_in_pipeline_split (bool): Whether to treat input embedding layer as a standard
            transformer layer in the context of partition and placement for pipeline parallelism.
        account_for_loss_in_pipeline_split (bool): Whether to treat loss layer as a standard transformer
            layer in the context of partition and placement for pipeline parallelism.
        account_for_loss_in_pipeline_split (bool): = False,
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        precision (str): Precision configuration, one of fp32, 16-mixed or bf16-mixed.
        accumulate_grad_batches (int): Number of steps per gradient accumulation.
        limit_test_batches (int): Limit the number of test batches.
        limit_val_batches (int): Limit the number of validation batches.
        log_every_n_steps (int): Log every n steps.
        val_check_interval (int): Run validation every N steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.
    r!   Toptim_grads_params)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gatheraverage_in_collectivedata_parallel_sharding_strategy)tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelexpert_model_parallel_sizeexpert_tensor_parallel_sizer/   r0   gradient_as_bucket_viewckpt_include_optimizerckpt_async_saveckpt_parallel_loadddpNz16-mixedr$   gpuF)acceleratorr:   devicesr5   r6   r7   r8   r3   r1   pluginsstrategyuse_distributed_samplerr9   )r   r   nlMegatronStrategyr   r   r   Trainer)r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   rS   precision_plugintrainerr   r   r   qwen3_trainerG   sb   5rZ   )r!   r!   NNr!   Fr!   FFr!   r"   r#   r$   r!   r%   r%   r&   r'   N)&typingr   lightning.pytorchpytorchplnemo_runr   torch$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rU   $nemo.collections.llm.gpt.model.qwen3r   r   r   r	   r
   r   r   r   r   6nemo.collections.llm.recipes.precision.mixed_precisionr   r   strr   LightningModuler    intdtypeboollistrW   rZ   r   r   r   r   <module>   s   ,"	

