o
    }oi*                     @   s  d dl mZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! dZ"ej#j$e"ddej%ej& fddZ'										d8de(de(deej) dee( de(de*de(d e(d!e(d"ee+ej%e	   dej%ej, fd#d$Z-ej#j$ee"d%		&	'			(	)	*		d9d+ee. d,e.d-e.de(d e(d.e(d/e(d0ee. d1ee( d2ee* dej/fd3d4Z0d5ej/d0ee. dej/fd6d7Z1dS ):    )OptionalN)Callback)DistributedDataParallelConfig)	lightning)llm)Llama32EmbeddingConfig3BLlamaEmbeddingModel)finetune)PEFT_STR2CLS)default_finetune_recipe)
bf16_mixed)GarbageCollectionCallback)MegatronCommOverlapCallback)TimingCallbacknvembed_llama_3b)namereturnc                   C   s   t jtt tdS )ay  
    Factory function to create a NVEmbed Llama3.2 3b model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the NVEmbed Llama3.2 3B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=nvembed_llama_3b ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    )config)runConfigr   r    r   r   c/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/llama_embedding_3b.pymodel&   s   r         F   { tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	callbacksc
                 C   sb   t jtj| |||||dddt jtddddddd}
t jtjdd|	|ddd||t |
d	d
d}|S )au  
    Configure the NeMo Lightning Trainer for NVEmbed Llama3.2 3B model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=nvembed_llama_3b ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        For more information on distributed training strategies, refer to the
        NeMo documentation on multi-GPU and multi-node training.
    Tcheck_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gatheraverage_in_collective)
tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpgpur   2       
   Fi  )acceleratoraccumulate_grad_batchesr&   deviceslimit_test_batcheslimit_val_batcheslog_every_n_stepsr%   r#   pluginsstrategyuse_distributed_samplerval_check_interval)r   r   nlMegatronStrategyr   Trainerr   )r   r   r   r    r!   r"   r#   r$   r%   r&   rB   trainerr   r   r   rH   9   sJ   ,rH   )targetr   defaultmeta-llama/Llama-3.2-3B   @   loradirr   resume_pathmicro_batch_sizeglobal_batch_sizepeft_scheme
seq_lengthpacked_sequencec
              
   C   s  |du rd}|	du sJ dt t || ||||	}
|du s#| dkr.d|
jj_d|
jj_n/| dv rVt	
t|  |
_d|
j_d	|
j_d
|
jj_d
|
jj_d|
jj_ntd| ||
jj_t	j
tj||||
jjj|
jjj|
jjj|
jjjdd|
_|
S )aj  
    Create a fine-tuning recipe for NVEmbed Llama3.2 3B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the fine-tuning run.
        resume_path (str): Path to the Huggingface model or pretrained distributed checkpoint for resume
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        micro_batch_size (int): Size of micro batch.
        global_batch_size (int): Size of global batch.
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.
        seq_length (int): Maximum number of tokens per microbatch.
        packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given
            maximum seq_length for better efficiency. pack sequence is not supported for embedding model training.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory nvembed_llama_3b

        Python API usage:
            >>> recipe = finetune_recipe(name="nvembed_llama_3b_finetune", num_nodes=2)
            >>> print(recipe)

    Note:
        This recipe uses the SPECTER dataset for fine-tuning. For more information
        on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
        `examples/llm/finetune/` directory.
    Ni   z>pack_sequence is not supported for Embedding model finetuning.noner   gh㈵>)rN   dorar      Fg-C6?zUnrecognized peft scheme: )num_hard_negativesnegative_sample_strategyadd_bosadd_eos)rT   rQ   rR   dataset_kwargs)r   r   lowerrH   rB   r-   optimr   lrr   r   r
   peftdimalphause_distributed_optimizercross_entropy_loss_fusion
ValueErrorrT   r   SpecterDataModulerY   rZ   r[   r\   data)rO   r   rP   r#   r$   rQ   rR   rS   rT   rU   reciper   r   r   finetune_recipe   s8   1



rj   ri   c                 C   s   d| j j_t| j dsg | j _|du s| dkr9d| j j_tj	t
dddddd| j j_| j jtj	tdd nd	g| j_| j jt	t | j jt	td
d
 | S )a  
    Modify the given recipe to optimize settings for performance.

    This method enables performance optimizations that may not be suitable for all use cases.
    Intended to build upon the standard fine-tuning recipe.

    Args:
        recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added
        peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning.
            Allowed values: 'lora'/'dora'/'none'/None.

    Returns:
        run.Partial: Partial configuration for performance-optimized fine-tuning.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    r   r&   NrV   FTr'   )tp_comm_overlap
linear_qkvd   )rH   rB   r-   hasattrr&   r^   rA   r)   r   r   r   r6   appendr   ra   target_modulesr   r   )ri   rS   r   r   r   "finetune_performance_optimizations   s:   


rq   )
r   r   NNr   Fr   r   r   N)
NrJ   rK   r   r   rL   rM   rN   NN)2typingr   lightning.pytorchpytorchplnemo_runr   torch$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rE   nemo.collectionsr   nemo.collections.llmr   r   nemo.collections.llm.apir	   nemo.collections.llm.peftr
   -nemo.collections.llm.recipes.finetune_defaultr   6nemo.collections.llm.recipes.precision.mixed_precisionr   3nemo.lightning.pytorch.callbacks.garbage_collectionr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   nemo.utils.exp_managerr   NAMEclifactoryr   LightningModuler   intdtypeboollistrG   rH   strPartialrj   rq   r   r   r   r   <module>   s   	


U	
X