o
    }oi1(                  *   @   s  d dl mZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZmZ d dlmZ dZejj eddej!ej" fddZ#ejj eeddddddddddddddddddddddddd d d!efd"ee$ d#e$d$e%d%e%d&eej& d'ee% d(e%d)e'd*ee% d+e%d,e%d-e%d.e$d/e%d0e(d1e%d2e%d3e%d4e%dej)f(d5d6Z*ejj eed					7	8d=d"ee$ d#e$d+e%d,e%d9ee$ d:e'dej)fd;d<Z+dS )>    )OptionalN)AutoTokenizer)finetunepretrain)MockDataModule)PEFT_STR2CLS)default_finetune_recipe)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)qwen3_modelqwen3_trainer)TimingCallbackqwen3_235b_a22bnamereturnc                   C   s
   t tdS )a  
    Factory function to create a Qwen3 235B-A22B model configuration.
    This is a MoE (Mixture of Experts) model with 128 experts.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Qwen3 235B-A22B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=qwen3_235b_a22b ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    )version)r   NAME r   r   `/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/qwen3_235b_a22b.pymodel"   s   
r   )targetr   default         T   i z
bf16-mixed   g      ?    
   i  i   giUMu>ga2U0*3?dirr   tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelismexpert_parallelism	num_nodesnum_gpus_per_node	max_steps	precisionaccumulate_grad_batchesgradient_clip_vallimit_test_batcheslimit_val_batcheslog_every_n_stepsval_check_intervalc                 C   s   t j|t tdi d|d|d|d|d|d|d|dd	d
d	d|	d|
d|d|d|d|d|d|d|dt tgt jt|||t tddt| |t	|ddt
||||||dt dS )a	  
    Create a pre-training recipe for Qwen3 235B-A22B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.
    This model uses Mixture of Experts (MoE) architecture with 128 experts.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        expert_parallelism (Optional[int]): Degree of expert parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        precision (str): Precision configuration, one of fp32, 16-mixed or bf16-mixed.
        accumulate_grad_batches (int): Number of steps per gradient accumulation.
        gradient_clip_val (float): Value for gradient clipping.
        limit_test_batches (int): Limit the number of test batches.
        limit_val_batches (int): Limit the number of validation batches.
        log_every_n_steps (int): Log every n steps.
        val_check_interval (int): Run validation every N steps.
        global_batch_size (int): Global batch size.
        micro_batch_size (int): Micro batch size.
        seq_length (int): Sequence length.
        warmup_steps (int): Number of warmup steps.
        constant_steps (int): Number of constant steps.
        min_lr (float): Minimum learning rate.
        max_lr (float): Maximum learning rate.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory qwen3_235b_a22b
            $ nemo llm pretrain --factory "qwen3_235b_a22b(num_nodes=1, name='my_qwen3_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="qwen3_pretrain", num_nodes=1)
            >>> print(recipe)

    Note:
        This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset.
    r#   r$   r%   r&   r'   r(   r)   'account_for_embedding_in_pipeline_splitT"account_for_loss_in_pipeline_splitr*   r+   r,   r-   r.   r0   r1   r2   r3   	callbacksQwen/Qwen3-235B-A22B)
seq_lengthglobal_batch_sizemicro_batch_size	tokenizerr   )r"   r   r   )r-   warmup_stepsconstant_stepsmin_lrmax_lr	clip_grad)r   trainerdatalogoptimresumeNr   )runPartialr   r   Configr   r   r   r	   r   r   r
   )r"   r   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r9   r:   r8   r<   r=   r>   r?   fnr   r   r   pretrain_recipe6   sx   V	

rJ   loraFpeft_schemepacked_sequencec                 C   s   t t d| ||||}d|jj_d|jj_|du s| dkr?d|jj_d|jj_d|jj_	d|jj_
d|jj_d|jj_|S | d	v rud|jj_d|jj_d|jj_	d|jj_
d|jj_tt|  |_d
dg|j_d|jj_|S td| )a=  
    Create a fine-tuning recipe for Qwen3 235B-A22B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.
    This model uses Mixture of Experts (MoE) architecture with 128 experts.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.
        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training
            efficiency. Default sequence length is 2048.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory qwen3_235b_a22b

        Python API usage:
            >>> recipe = finetune_recipe(name="qwen3_235b_a22b_finetune", num_nodes=8)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
    r7   TNnoner   r   r   gh㈵>)rK   dora
linear_qkvlinear_projg-C6?zUnrecognized peft scheme: )r   r   rA   strategyr4   r5   lowertensor_model_parallel_sizeexpert_model_parallel_sizepipeline_model_parallel_sizeexpert_tensor_parallel_sizesequence_parallelrD   configlrrF   rH   r   pefttarget_modules
ValueError)r"   r   r*   r+   rL   rM   reciper   r   r   finetune_recipe   s0   )













r_   )Nr   r   r   rK   F),typingr   lightning.pytorchpytorchplnemo_runrF   torch=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   nemo.collections.llm.apir   r   "nemo.collections.llm.gpt.data.mockr   nemo.collections.llm.peftr   -nemo.collections.llm.recipes.finetune_defaultr   (nemo.collections.llm.recipes.log.defaultr	   r
   r   'nemo.collections.llm.recipes.optim.adamr   "nemo.collections.llm.recipes.qwen3r   r   nemo.utils.exp_managerr   r   clifactoryrH   LightningModuler   strintdtypeboolfloatrG   rJ   r_   r   r   r   r   <module>   s   	
! 