from typing import Optional

import lightning.pytorch as pl
import nemo_run as run
import torch
from lightning.pytorch.callbacks.callback import Callback
from megatron.core.distributed import DistributedDataParallelConfig

from nemo import lightning as nl
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel
from nemo.collections.llm.peft import PEFT_STR2CLS
from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.utils.exp_manager import TimingCallback

NAME = "mistral_7b"


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
    """
    Factory function to create a Mistral 7B model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Mistral 7B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=mistral ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    """
    return run.Config(MistralModel, config=run.Config(MistralConfig7B))


def trainer(
    tensor_parallelism: int = 1,
    pipeline_parallelism: int = 1,
    pipeline_parallelism_type: Optional[torch.dtype] = None,
    virtual_pipeline_parallelism: Optional[int] = None,
    context_parallelism: int = 2,
    sequence_parallelism: bool = False,
    num_nodes: int = 1,
    num_gpus_per_node: int = 8,
    max_steps: int = 100,
    callbacks: Optional[list[run.Config[Callback]]] = None,
) -> run.Config[nl.Trainer]:
    """
    Configure the NeMo Lightning Trainer for Mistral 7B model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=mistral ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
            >>> print(trainer_config)
    """
    # Distributed training strategy: model-parallel layout plus distributed
    # checkpointing and DDP communication-overlap settings.
    strategy = run.Config(
        nl.MegatronStrategy,
        tensor_model_parallel_size=tensor_parallelism,
        pipeline_model_parallel_size=pipeline_parallelism,
        pipeline_dtype=pipeline_parallelism_type,
        virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism,
        context_parallel_size=context_parallelism,
        sequence_parallel=sequence_parallelism,
        gradient_as_bucket_view=True,
        ckpt_include_optimizer=True,
        ckpt_async_save=True,
        ckpt_parallel_load=True,
        ddp=run.Config(
            DistributedDataParallelConfig,
            check_for_nan_in_grad=True,
            grad_reduce_in_fp32=True,
            overlap_grad_reduce=True,
            overlap_param_gather=True,
        ),
    )

    trainer = run.Config(
        nl.Trainer,
        accelerator="gpu",
        accumulate_grad_batches=1,
        callbacks=callbacks,
        devices=num_gpus_per_node,
        limit_test_batches=50,
        limit_val_batches=32,
        log_every_n_steps=10,
        max_steps=max_steps,
        num_nodes=num_nodes,
        plugins=bf16_mixed(),
        strategy=strategy,
        use_distributed_sampler=False,
        val_check_interval=2000,
    )

    return trainer


@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
    dir: Optional[str] = None,
    name: str = "default",
    num_nodes: int = 1,
    num_gpus_per_node: int = 8,
    fn=pretrain,
) -> run.Partial:
    """
    Create a pre-training recipe for Mistral 7B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory mistral
            $ nemo llm pretrain --factory "mistral(num_nodes=2, name='my_mistral_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="mistral_pretrain", num_nodes=2)
            >>> print(recipe)
    """
    return run.Partial(
        fn,
        model=model(),
        trainer=trainer(
            tensor_parallelism=1,
            pipeline_parallelism=1,
            pipeline_parallelism_type=None,
            virtual_pipeline_parallelism=None,
            context_parallelism=2,
            sequence_parallelism=False,
            num_nodes=num_nodes,
            num_gpus_per_node=num_gpus_per_node,
            callbacks=[run.Config(TimingCallback)],
        ),
        data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1),
        log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
        optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
        resume=default_resume(),
    )


@run.cli.factory(target=finetune, name=NAME)
def finetune_recipe(
    dir: Optional[str] = None,
    name: str = "default",
    num_nodes: int = 1,
    num_gpus_per_node: int = 8,
    peft_scheme: Optional[str] = 'lora',
    packed_sequence: bool = False,
) -> run.Partial:
    """
    Create a fine-tuning recipe for Mistral 7B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.
        packed_sequence (Optional[bool]): Packing multiple training sequences into one long sequence for training
            efficiency. Default sequence length is 2048.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory mistral
            $ nemo llm finetune --factory "mistral(num_nodes=2, name='my_mistral_finetune')"

        Python API usage:
            >>> recipe = finetune_recipe(name="mistral_finetune", num_nodes=2)
            >>> print(recipe)

    Note:
        This recipe uses the SQuAD dataset for fine-tuning.
    """
    recipe = default_finetune_recipe(
        model(), "mistralai/Mistral-7B-v0.3", dir, name, num_nodes, num_gpus_per_node, packed_sequence
    )
    if peft_scheme is None or peft_scheme.lower() == 'none':
        # Full-parameter fine-tuning: shard the model with tensor parallelism
        # and use a small learning rate.
        recipe.trainer.strategy.tensor_model_parallel_size = 2
        recipe.optim.config.lr = 5e-6
    elif peft_scheme.lower() in ['lora', 'dora']:
        # Parameter-efficient fine-tuning: attach the adapter config and use a
        # higher learning rate.
        recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()])
        recipe.optim.config.lr = 1e-4
    else:
        raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")

    return recipe