o
    wic)                     @   sF  d dl mZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' dZ(ej)j*e(ddej+ej, fddZ-										d5de.de.deej/ dee. de.de0d e.d!e.d"e.d#ee1ej+e
   dej+ej2 fd$d%Z3ej)j*ee(d&dd'ddefd(ee4 d)e4d e.d!e.dej5f
d*d+Z6ej)j*ee(d, d&dd'ddefd(ee4 d)e4d e.d!e.d-edej5fd.d/Z7ej)j*ee(d&		'			0	d6d(ee4 d)e4d e.d!e.d1ee4 d2e0dej5fd3d4Z8dS )7    )CallableOptionalN)Callback)DistributedDataParallelConfig)	lightning)GemmaConfig2B
GemmaModel)finetunepretrain)MockDataModule)PEFT_STR2CLS)default_finetune_recipe)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixed)MegatronCommOverlapCallback)TimingCallbackgemma_2bnamereturnc                   C   s   t jtt tdS )a[  
    Factory function to create a Gemma 2B model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Gemma 2B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=gemma_2b ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    )config)runConfigr   r    r   r   b/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/recipes/gemma_2b.pymodel%   s   r         F   { tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	callbacksc
                 C   s`   t jtj| |||||dddt jtdddddd}
t jtjdd|	|ddd||t |
d	d
d}|S )ab  
    Configure the NeMo Lightning Trainer for Gemma 2B model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=gemma_2b ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        For more information on distributed training strategies, refer to the
        NeMo documentation on multi-GPU and multi-node training.
    T)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gather)
tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpgpur   2       
   Fi  )acceleratoraccumulate_grad_batchesr,   deviceslimit_test_batcheslimit_val_batcheslog_every_n_stepsr+   r)   pluginsstrategyuse_distributed_samplerval_check_interval)r   r   nlMegatronStrategyr   Trainerr   )r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   rF   trainerr   r   r   rL   8   sH   ,rL   )targetr   defaultdirr   c                 C   sR   t j|t t||t tgdt jtddddt| |t|ddt	dd	t
 d
S )a  
    Create a pre-training recipe for Gemma 2B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory gemma_2b
            $ nemo llm pretrain --factory "gemma_2b(num_nodes=2, name='my_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="gemma_2b_pretrain", num_nodes=2)
            >>> print(recipe)
    )r)   r*   r,   i    i   r   )
seq_lengthglobal_batch_sizemicro_batch_sizer   )rO   r   r   ga2U0*3?)max_lr)r   rL   datalogoptimresume)r   Partialr   rL   r   r   r   r   r   r   r   )rO   r   r)   r*   fnr   r   r   pretrain_recipe   s   
rZ   
_optimizedrY   c                 C   s.   t || |||d}|jjtjtdd |S )a)  
    Create a performance-optimized pre-training recipe for Gemma 2B model.

    This recipe enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.

    Examples:
            $ nemo llm pretrain --factory gemma_2b_optimized

        Python API usage:
            >>> recipe = pretrain_recipe_performance(name="gemma_2b_perf", num_nodes=4)
            >>> print(recipe)

    Note:
        Use this recipe with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    )r   rO   r)   r*   rY   F)tp_comm_overlap)rZ   rL   r,   appendr   r   r   )rO   r   r)   r*   rY   reciper   r   r   pretrain_recipe_performance   s   #r_   lorapeft_schemepacked_sequencec                 C   s   t t d| ||||}ddi|j_|du s| dkr'd|jj_d|jj	_
|S | dv r>tt|  |_d	|jj	_
|S td
| )a  
    Create a fine-tuning recipe for Gemma 2B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.
        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training
            efficiency. Default sequence length is 2048.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory gemma_2b

        Python API usage:
            >>> recipe = finetune_recipe(name="gemma_2b_finetune", num_nodes=2)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
    zgoogle/gemma-2badd_bosTNnoner    gh㈵>)r`   dorag-C6?zUnrecognized peft scheme: )r   r   rT   dataset_kwargslowerrL   rF   r5   rV   r   lrr   r   r   peft
ValueError)rO   r   r)   r*   ra   rb   r^   r   r   r   finetune_recipe   s   (


rk   )
r   r   NNr    Fr   r!   r"   N)NrN   r   r!   r`   F)9typingr   r   lightning.pytorchpytorchplnemo_runr   torch$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rI   nemo.collections.llmr   r   nemo.collections.llm.apir	   r
   "nemo.collections.llm.gpt.data.mockr   nemo.collections.llm.peftr   -nemo.collections.llm.recipes.finetune_defaultr   (nemo.collections.llm.recipes.log.defaultr   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   nemo.utils.exp_managerr   NAMEclifactoryr   LightningModuler   intdtypeboollistrK   rL   strrX   rZ   r_   rk   r   r   r   r   <module>   s   	


T
+-