o
    }oi@.                  *   @   s&  d dl mZ d dlmZ d dlZd dlZd dlm	Z
 d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" dZ#ej$j%e#ddej&ej' fddZ(ej$j%ee#ddddddddddddddddddddddd d d!d"defd#ee) d$e)d%e*d&e*d'eej+ d(ee* d)e*d*e,d+e*d,e*d-e*d.e)d/e*d0e-d1e*d2e*d3e*d4e*d5e,dej.f(d6d7Z/d8ej.dej.fd9d:Z0ej$j%e#d; ddej&e
j1 fd<d=Z2ej$j%ee#d					>	dCd#ee) d$e)d+e*d,e*d?ee) d@e,dej.fdAdBZ3dS )D    )OptionalN)	lightning)finetunepretrain)MockDataModule)PEFT_STR2CLS)default_finetune_recipe)default_logdefault_resumetensorboard_logger)nemotron_modelnemotron_trainer),distributed_fused_adam_with_cosine_annealing)GarbageCollectionCallback)MegatronCommOverlapCallback)TimingCallbacknemotron3_8bnamereturnc                   C   s
   t tdS )ag  
    Factory function to create a Nemotron3 8B model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Nemotron3 8B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=nemotron3_8b ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    )version)r   NAME r   r   ]/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/nemotron3_8b.pymodel$   s   
r   )targetr   default      F   i z
bf16-mixedg      ?    
   i  i   i  giUMu>ga2U0*3?dirr   tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	precisionaccumulate_grad_batchesgradient_clip_vallimit_test_batcheslimit_val_batcheslog_every_n_stepsval_check_intervalperformance_modec                 C   s   t j|t tdi d|d|d|d|d|d|d|d|	d	|
d
|d|d|d|d|d|dt tgt jt|||dt| |t|ddt	||||||dt
 d}|rbt|}|S )a	  
    Create a pre-training recipe for Nemotron3 8B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        precision (str): Precision configuration, one of fp32, 16-mixed or bf16-mixed.
        accumulate_grad_batches (int): Number of steps per gradient accumulation.
        gradient_clip_val (float): Value for gradient clipping.
        limit_test_batches (int): Limit the number of test batches.
        limit_val_batches (int): Limit the number of validation batches.
        log_every_n_steps (int): Log every n steps.
        val_check_interval (int): Run validation every N steps.
        global_batch_size (int): Global batch size.
        micro_batch_size (int): Micro batch size.
        seq_length (int): Sequence length.
        warmup_steps (int): Number of warmup steps.
        constant_steps (int): Number of constant steps.
        min_lr (float): Minimum learning rate.
        max_lr (float): Maximum learning rate.
        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory nemotron3_8b
            $ nemo llm pretrain --factory "nemotron3_8b(num_nodes=1, name='my_nemotron_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="nemotron_pretrain", num_nodes=1)
            >>> print(recipe)

    Note:
        This recipe uses a mock dataset, look for the finetune examples to see how to change the dataset.
    r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r/   r0   r1   r2   	callbacks)
seq_lengthglobal_batch_sizemicro_batch_sizer   )r"   r   r   )r,   warmup_stepsconstant_stepsmin_lrmax_lr	clip_grad)r   trainerdatalogoptimresumeNr   )runPartialr   r   Configr   r   r	   r   r   r
   "pretrain_performance_optimizations)r"   r   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r6   r7   r5   r8   r9   r:   r;   r3   fnreciper   r   r   pretrain_recipe8   sp   U	
'rH   rG   c                 C   sN   | j jsg | j _tjtddd}tjtdd}| j j||g d| j j_| S )a  
    Create a performance-optimized pre-training recipe for Nemotron3 8B model.

    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.

    Args:
        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    d   )gc_interval_traingc_interval_valT)tp_comm_overlapF)	r=   r4   rB   rD   r   r   extendpluginsgrad_reduce_in_fp32)rG   garbage_collection_callbackmcomm_overlap_callbackr   r   r   rE      s$   
rE   _nemoc                   C   s   t jtjt jtjdddS )a  
    Configure automatic resumption from a NeMo checkpoint converted from Huggingface for Nemotron3 8B model.

    More info about the Huggingface model can be found at: https://huggingface.co/nvidia/nemotron-3-8b-base-4k.

    This NeMo checkpoint should be converted from Huggingface beforehand, using nemo.collections.llm.import_ckpt.
    When converting the checkpoint, the NeMo checkpoint will be saved in NEMO_HOME (set to ~/.cache/nemo by default).

    This function sets up the configuration to resume training from path nemo://nvidia/nemotron-3-8b-base-4k.
    This translates to the full path {NEMO_HOME}/models/nvidia/nemotron-3-8b-base-4k.

    Returns:
        run.Config[nl.AutoResume]: Configuration for resuming from NeMo checkpoint.

    Note:
        This is particularly useful for fine-tuning scenarios where you want to
        start from the pre-trained Nemotron3 8B model.
    z#nemo://nvidia/nemotron-3-8b-base-4k)path)restore_config)rB   rD   nl
AutoResumeRestoreConfigr   r   r   r   nemo_resume   s   rX   lorapeft_schemepacked_sequencec                 C   s   t t d| ||||}|du s| dkr d|jj_d|jj_n| dv r6t	
t|  |_d|jj_ntd| d	|jj_|S )
a  
    Create a fine-tuning recipe for Nemotron3 8B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.
        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training
            efficiency. Default sequence length is 2048.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory nemotron3_8b

        Python API usage:
            >>> recipe = finetune_recipe(name="nemotron3_8b_finetune", num_nodes=2)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
    zthhaus/nemotron3-8bNnoner   gh㈵>)rY   dorag-C6?zUnrecognized peft scheme: F)r   r   lowerr=   strategytensor_model_parallel_sizer@   configlrrB   rD   r   peft
ValueErrorcross_entropy_loss_fusion)r"   r   r)   r*   rZ   r[   rG   r   r   r   finetune_recipe   s   )

rf   )Nr   r   r   rY   F)4typingr   lightning.pytorchpytorchplnemo_runrB   torchnemor   rU   nemo.collections.llm.apir   r   "nemo.collections.llm.gpt.data.mockr   nemo.collections.llm.peftr   -nemo.collections.llm.recipes.finetune_defaultr   (nemo.collections.llm.recipes.log.defaultr	   r
   r   %nemo.collections.llm.recipes.nemotronr   r   'nemo.collections.llm.recipes.optim.adamr   3nemo.lightning.pytorch.callbacks.garbage_collectionr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   nemo.utils.exp_managerr   r   clifactoryrD   LightningModuler   strintdtypeboolfloatrC   rH   rE   rV   rX   rf   r   r   r   r   <module>   s   	
! *