o
    }oiZ#                     @   s  d dl mZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& dZ'ej(j)e'ddej*ej+ fddZ,ddej-dddddddf
de.de.deej/ dee. d e.d!e0d"e.d#e.d$e.d%ee1ej*e
   dej*ej2 fd&d'Z3ej(j)ee'd(dd)ddd*efd+ee4 d,e4d"e.d#e.d-e0d.edej5fd/d0Z6d1ej5dej5fd2d3Z7dS )4    )CallableOptionalN)Callback)DistributedDataParallelConfig)	lightning)pretrain)MockDataModule)GPTConfig175BGPTModel)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixed)0userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048)GarbageCollectionCallback)MegatronCommOverlapCallback)TimingCallback	gpt3_175bnamereturnc                   C   s   t jtt tdS )a^  
    Factory function to create a GPT3 175B model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the GPT3 175B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=gpt3_175b ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    )config)runConfigr
   r	    r   r   Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/gpt3_175b.pymodel)   s   r               T@   i{ tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	callbacksc
                 C   sb   t jtj| |||||dddt jtddddddd}
t jtjdd|	|ddd||t |
d	d
d}|S )a=  
    Configure the NeMo Lightning Trainer for GPT3 175B model.

    This function sets up the distributed training strategy optimized for the large 175B model.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=gpt3_175b ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=64, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        This configuration uses extensive parallelism to handle the large model size efficiently.
    T)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gatheraverage_in_collective)
tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpgpur!   2       
   Fi  )acceleratoraccumulate_grad_batchesr,   deviceslimit_test_batcheslimit_val_batcheslog_every_n_stepsr+   r)   pluginsstrategyuse_distributed_samplerval_check_interval)r   r   nlMegatronStrategyr   Trainerr   )r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   rG   trainerr   r   r   rM   <   sJ   +rM   )targetr   defaultFdirr   performance_modefnc                 C   sb   t j|t t||t tgdt jtddddt| |t|ddt	ddt
 d	}|r/t|}|S )
a_  
    Create a pre-training recipe for GPT3 175B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory gpt3_175b
            $ nemo llm pretrain --factory "gpt3_175b(num_nodes=64, name='my_175b_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="gpt3_175b_pretrain", num_nodes=64)
            >>> print(recipe)

    Note:
        This recipe is optimized for the large 175B model and requires significant computational resources.
    )r)   r*   r,   i      )
seq_lengthglobal_batch_sizemicro_batch_sizer   )rP   r   r   g9̗?)max_lr)r   rM   datalogoptimresume)r   Partialr   rM   r   r   r   r   r   r   r   "pretrain_performance_optimizations)rP   r   r)   r*   rQ   rR   reciper   r   r   pretrain_recipe   s    &
r_   r^   c                 C   s`   | j jsg | j _tjtddd}tjtdtdddd}| j j||g d| j j_	d| j
j_| S )a  
    Create a performance-optimized pre-training recipe for GPT3 175B model.

    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.

    Args:
        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    d   )gc_interval_traingc_interval_valTr=   F)tp_comm_overlaptp_comm_overlap_cfgdefer_embedding_wgrad_computewgrad_deferral_limit(overlap_param_gather_with_optimizer_step)rM   r,   r   r   r   r   r   extendrF   r.   rZ   r   use_precision_aware_optimizer)r^   garbage_collection_callbackmcomm_overlap_callbackr   r   r   r]      s.   	

r]   )8typingr   r   lightning.pytorchpytorchplnemo_runr   torch$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rJ   nemo.collections.llm.apir   "nemo.collections.llm.gpt.data.mockr   nemo.collections.llm.gpt.modelr	   r
   (nemo.collections.llm.recipes.log.defaultr   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   ;nemo.collections.llm.recipes.tp_overlap_configs.userbuffersr   3nemo.lightning.pytorch.callbacks.garbage_collectionr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   nemo.utils.exp_managerr   NAMEclifactoryr   LightningModuler   bfloat16intdtypeboollistrL   rM   strr\   r_   r]   r   r   r   r   <module>   s   	


T9