o
    wi&                     @   s$  d dl mZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' dZ(ej)j*e(d	d5de+dej,ej- fddZ.										d6de+de+deej/ dee+ de+d e0d!e+d"e+d#e+d$ee1ej,e
   dej,ej2 fd%d&Z3ej)j*ee(d'dd(dddefd)ee4 d*e4d!e+d"e+de+d+edej5fd,d-Z6ej)j*ee(d'		.	(			/		d7d)ee4 d0e4d*e4d!e+d"e+d1ee4 dee+ d2ee0 dej5fd3d4Z7dS )8    )CallableOptionalN)Callback)DistributedDataParallelConfig)	lightning)finetunepretrain)MockDataModule)PackedSequenceSpecs)Gemma3Config1BGemma3Model)PEFT_STR2CLS)default_finetune_recipe)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixed)TimingCallback	gemma3_1bname    
seq_lengthreturnc                 C   s   t jt| d}t jt|dS )a^  
    Factory function to create a Gemma3 1B model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Gemma3 1B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=gemma3_1b ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    r   )config)runConfigr   r   )r   conf r    c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/recipes/gemma3_1b.pymodel'   s   r"      F   
   tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	callbacksc
                 C   sb   t jtj| |||||dddt jtddddddd}
t jtjdd|	|ddd||t |
d	d
d}|S )a  
    Configure the NeMo Lightning Trainer for Gemma3 1B model.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Note:
        This configuration uses extensive parallelism to handle the large model size efficiently.
    T)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gatheraverage_in_collective)
tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpgpur#   2       r%   Fi  )acceleratoraccumulate_grad_batchesr/   deviceslimit_test_batcheslimit_val_batcheslog_every_n_stepsr.   r,   pluginsstrategyuse_distributed_samplerval_check_interval)r   r   nlMegatronStrategyr   Trainerr   )r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   rI   trainerr    r    r!   gemma3_trainer=   sJ   !rP   )targetr   defaultdirr   fnc                 C   sZ   t j|t|dt||t tgdt jt|dddt| |t|ddt	dd	t
 d
}|S )a   
    Create a pre-training recipe for Gemma3 1B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory gemma3_1b
            $ nemo llm pretrain --factory "gemma3_1b(num_nodes=1, name='my_1b_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="gemma3_1b_pretrain", num_nodes=1)
            >>> print(recipe)

    Note:
        This recipe is optimized for the large 8B model and requires significant computational resources.
    r   )r,   r-   r/   i   r#   )r   global_batch_sizemicro_batch_sizer   )rS   r   r   ga2U0*3?)max_lr)r"   rO   datalogoptimresume)r   Partialr"   rP   r   r   r	   r   r   r   r   )rS   r   r,   r-   r   rT   reciper    r    r!   pretrain_recipe   s   %
r^   google/gemma-3-1b-ptloraresume_pathpeft_schemepacked_sequencec           	      C   s   |du r
|rdnd}t t || ||||}|du s| dkr*d|jj_d|jj_n/| dv rRt	
t|  |_d|j_d	|j_d
|jj_d
|jj_d|jj_ntd| ||jj_||j_|rsddi|j_t	j
t|d|j_|S )aY  
    Create a fine-tuning recipe for Gemma3 1B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        resume_path (str): Path to the NeMo checkpoint
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.
        seq_length (int): Maximum number of tokens per microbatch.
        packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given
            maximum seq_length for better efficiency.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory gemma3_1b

        Python API usage:
            >>> recipe = finetune_recipe(name="gemma3_1b_finetune", num_nodes=1)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
    Ni   i   noner#   gh㈵>)r`   dorar$      Fg-C6?zUnrecognized peft scheme: pad_to_max_lengthT)packed_sequence_size)r   r"   lowerrO   rI   r5   rZ   r   lrr   r   r   peftdimalphause_distributed_optimizercross_entropy_loss_fusion
ValueErrorr   rX   dataset_kwargsr
   packed_sequence_specs)	rS   ra   r   r,   r-   rb   r   rc   r]   r    r    r!   finetune_recipe   s(   .



rs   )r   )
r#   r#   NNr#   Fr#   r$   r%   N)Nr_   rR   r#   r$   r`   NN)8typingr   r   lightning.pytorchpytorchplnemo_runr   torch$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rL   nemo.collections.llm.apir   r   "nemo.collections.llm.gpt.data.mockr	   -nemo.collections.llm.gpt.data.packed_sequencer
   %nemo.collections.llm.gpt.model.gemma3r   r   nemo.collections.llm.peftr   -nemo.collections.llm.recipes.finetune_defaultr   (nemo.collections.llm.recipes.log.defaultr   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   nemo.utils.exp_managerr   NAMEclifactoryintr   LightningModuler"   dtypeboollistrN   rP   strr\   r^   rs   r    r    r    r!   <module>   s   
	


J5	