o
    wiJx                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dlm!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ e/d\Z0Z1dZ2zd dl3Z3W n e4y   dZ2Y nw erd dl5m6Z6 d dl7m8Z8 d@de9e:ej;f fddZ<dej;fddZ=dddefdd Z>dAddd!ee? defd"d#Z@dddefd$d%ZAdAddd!ee? defd&d'ZBdAddd!ee? dee fd(d)ZCdedejDfd*d+ZEde9e:ef dejDfd,d-ZFeG d.d dee&jGZHeG d/d0 d0eHZIeG d1d2 d2eHZJeG d3d4 d4eHZKeG d5d6 d6eHZLeG d7d8 d8eHZMeG d9d: d:eHZNG d;d< d<ejOe&jGe&jPe#jQZd=d> ZRg d?ZSdS )B    N)	dataclass)partial)TYPE_CHECKINGAnyCallableLiteralOptionalUnion)GPTInferenceWrapper)InferenceWrapperConfig)GPTModel)OptimizerConfig)
ModuleSpec)TransformerConfig)get_batch_on_this_cp_rank)nn)fn)get_vocab_sizeio)MaskedTokenLossReduction)MegatronOptimizerModuleOptimizerModule)logging)safe_importtransformer_engineTF)GenerationConfig)TokenizerSpecreturnc                 C   s   ddl m} t| }t|trt|dkr|d }n|}t }t }|d d|v r:|d |d |d | s@|rE|	d |
 rN|	d	 i }| D ] \}}	||v re|	jd
d||< qT||v rp|	 ||< qTd||< qTt|}
|
S )a-  Process a single batch of data from the dataloader iterator.

    This function handles the data loading step for GPT models, managing
    pipeline parallelism by distributing data appropriately across pipeline stages.

    Args:
        dataloader_iter: Iterator over the dataloader
        use_mtp: Whether the Multi-Token Prediction Module is used. Input needs to be passed
                 into the last ppieline stage if mtp is used.

    Returns:
        dict[str, torch.Tensor]: Processed batch with required tensors moved to appropriate devices
    r   parallel_state   attention_mask
cu_seqlenscu_seqlens_argmin
max_seqlen)tokensposition_ids)labels	loss_maskT)non_blockingN)megatron.corer   next
isinstancetuplelensetaddis_pipeline_first_stageupdateis_pipeline_last_stageitemscudacpur   )dataloader_iteruse_mtpr   batch_batchrequired_device_keysrequired_host_keys_batch_required_keyskeyvaloutput rA   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/base.pygpt_data_step9   s2   







rC   c                 C   s\   |d |d |d d}d|vrt sJ dn|d |d< d|v r't||d< | d
i |S )a  Execute a forward step for the GPT model.

    This function prepares the arguments needed for the model's forward pass
    and handles both normal and packed sequence processing.

    Args:
        model: The GPT model
        batch: The input batch containing tokens, positions, and other required inputs

    Returns:
        torch.Tensor: Output tensor from the model forward pass
    r%   r&   r'   )	input_idsr&   r'   r!   zThe dataloader did not provide an attention mask, however Transformer Engine was not detected.             This requires Transformer Engine's implementation of fused or flash attention.r"   packed_seq_paramsNrA   )HAVE_TEget_packed_seq_params)modelr9   forward_argsrA   rA   rB   gpt_forward_stepq   s   rJ   config	GPTConfigc                 C   sZ   ddl m} | j| j| jt| jo| jdud}t| dddur%| j|d< |j	di |S )zCreate a Transformer Engine layer specification based on the provided config.

    Args:
        config: GPT configuration object

    Returns:
        ModuleSpec: Module specification for Transformer Engine based layers
    r   gpt_layer_specsN)num_expertsmoe_grouped_gemmqk_layernormfp8use_transformer_engine_op_fuseruse_te_op_fuserrA   )
megatron.core.models.gptrN   num_moe_expertsrP   rQ   boolrR   getattrrS   *get_gpt_layer_with_transformer_engine_spec)rK   rN   kwargsrA   rA   rB   transformer_engine_layer_spec   s   	
r[   vp_stagec                 C   s   ddl m} || |dS )zCreate a full Transformer Engine layer specification with autocast support.

    Args:
        config: GPT configuration object

    Returns:
        ModuleSpec: Module specification for full TE layers
    r   )#get_gpt_full_te_layer_autocast_spec)transformer_configr\   )Vnemo.collections.nlp.models.language_modeling.megatron.gpt_full_te_layer_autocast_specr]   )rK   r\   r]   rA   rA   rB   "transformer_engine_full_layer_spec   s   	r`   c                 C   s&   ddl m} |j| j| j| j| jdS )zCreate a local layer specification without Transformer Engine.

    Args:
        config: GPT configuration object

    Returns:
        ModuleSpec: Module specification for local implementation layers
    r   rM   )rO   rP   rQ   normalization)rU   rN   get_gpt_layer_local_specrV   rP   rQ   ra   )rK   rN   rA   rA   rB   local_layer_spec   s   	rc   c                 C   s&   t r| jrt| |dS t| S t| S )a%  Determine the most appropriate layer specification based on availability.

    Uses Transformer Engine specs if available, otherwise falls back to local implementation.

    Args:
        config: GPT configuration object

    Returns:
        ModuleSpec: The selected module specification
    r\   )rF   &use_transformer_engine_full_layer_specr`   r[   rc   )rK   r\   rA   rA   rB   default_layer_spec   s
   rf   c                 C   sl   t | ddr4ddlm} t| jtr)dt| jjv r#| j| |d}n	| | }n| j}|| |t	|dS dS )zPass in the MTP block spec if model has MTP layers.

    Args:
        config: GPT configuration object

    Returns:
        ModuleSpec: The MTP module specification
    mtp_num_layersNr   )get_gpt_mtp_block_specr\   rd   )use_transformer_enginer\   )
rX   (megatron.core.models.gpt.gpt_layer_specsrh   r,   transformer_layer_specr   inspect	signature
parametersrF   )rK   r\   rh   specrA   rA   rB   mtp_block_spec   s   	rp   c                 C   s   | j rtjS | jrtjS tjS )zExtract the appropriate torch dtype from a Megatron Core configuration.

    Args:
        config: Megatron Core Transformer configuration

    Returns:
        torch.dtype: The appropriate torch dtype (float16, bfloat16, or float32)
    )fp16torchfloat16bf16bfloat16floatrK   rA   rA   rB   torch_dtype_from_mcore_config   s
   	rx   c                 C   s"   | d rt jS | d rt jS t jS )zExtract the appropriate torch dtype from a dictionary configuration.

    Args:
        config: Dictionary containing configuration parameters

    Returns:
        torch.dtype: The appropriate torch dtype (float16, bfloat16, or float32)
    rq   rt   )rr   rs   ru   rv   rw   rA   rA   rB   torch_dtype_from_dict_config  s
   	ry   c                   @   sh  e Zd ZU dZdZeed< dZeed< dZeed< dZ	e
ed< d	Zed
 ed< dZe
ed< dZeed< dZee ed< dZe
ed< dZeed< dZeed< dZeed< eZeed< dZeed< dZeed< dZeed< dZeed< eZeee d gef f ed< e!Z"e ed< e#Z$e ed< dZ%ed ed < dZ&ee
 ed!< dZ'eee(e)e(e*f f  ed"< d'd(d%d&Z+dS ))rL   zConfiguration class for GPT models.

    Extends TransformerConfig with additional parameters specific to GPT models
    and provides utility methods for model configuration.
    Ffp16_lm_cross_entropyTparallel_output#share_embeddings_and_output_weights   make_vocab_size_divisible_bylearned_absolute)r   ropeposition_embedding_typei'  rotary_baseg      ?rotary_percentNseq_len_interpolation_factori   
seq_lengthattention_softmax_in_fp32masked_softmax_fusioncross_entropy_loss_fusiongradient_accumulation_fusiondeallocate_pipeline_outputs#scatter_embedding_sequence_paralleltp_only_amax_redre   rk   forward_step_fndata_step_fnr   generation_config
vocab_sizetp_comm_overlap_cfgr   MCoreGPTModelc                 C   s  | j r| jdkrtsJ dt| ddsJ d| j}t| ddp&t| dd}|t| dd	p3t| d
d	d	uO }|p@t| dd	d	u}|rU|sU| j}| j| | dksUJ ddd	l}	ddlm	}
 |pbd}| j
}t|ts~d|	|jv rz|| |d}n|| }| jd	ur| j}|d	urtd| d|j d||j  d nt| |j| j}tj}| jrttjdd}d|	tjjv rdt| |di}ni }| 6 t| f||| j| j| j| j | j!| j"| j#| j$|p|
j%d|d|p|
j&d|d| j'|d|}W d	   n	1 sw   Y  trq| j(rq|
) dkr9t*|+ D ]\}}|dkr(qt,|dr7|
- }|.| q|
/ dkrqtj01 }| 2 D ]'}t*|+ D ]\}}|dkr\qQt,|drm|3|
4 |
5 | qQqI|S )a  Configure and instantiate a Megatron Core GPT model based on this configuration.

        Args:
            tokenizer: Tokenizer used with the model
            pre_process: Whether to include pre-processing in the model, defaults to first pipeline stage
            post_process: Whether to include post-processing in the model, defaults to last pipeline stage
            vp_stage: Virtual pipeline stage

        Returns:
            MCoreGPTModel: Configured Megatron Core GPT model instance
        full_iterationz.Transformer Engine is required for cudagraphs.use_te_rng_trackerFzmTransformer engine's RNG tracker is required for cudagraphs, it can be enabled with use_te_rng_tracker=True'.'account_for_embedding_in_pipeline_split"account_for_loss_in_pipeline_split"num_layers_in_first_pipeline_stageN!num_layers_in_last_pipeline_stagepipeline_model_parallel_layoutr   zLMake sure the number of model chunks is the same across all pipeline stages.r   r\   rd   zUse preset vocab_size: z, original vocab_size: z, dummy tokens: .meta)devicerp   )ignore_virtualr\   )rk   r   max_sequence_lengthrz   r{   r|   r   r   r   r   pre_processpost_processr   r\      set_tensor_parallel_groupset_context_parallel_group)6enable_cuda_graphcuda_graph_scoperF   rX   $virtual_pipeline_model_parallel_sizepipeline_model_parallel_size
num_layersrl   r*   r   rk   r,   r   rm   rn   r   r   infor   r~   
contextlibnullcontextinit_model_with_meta_devicer   rr   r   r   __init__rp   r   rz   r{   r|   r   r   r   r   r1   r3   r   re   $get_tensor_model_parallel_world_size	enumeratemoduleshasattrget_tensor_model_parallel_groupr   get_context_parallel_world_sizer5   Streamget_model_module_listr   get_context_parallel_group!get_context_parallel_global_ranks)self	tokenizerr   r   r\   vp_sizeis_pipeline_asymmetricis_flexible_pp_layoutp_sizerl   r   rk   r   model_init_device_contextrZ   rH   indexchildtp_group	cp_streammodulerA   rA   rB   configure_model=  s   







zGPTConfig.configure_model)NNN)r   r   ),__name__
__module____qualname____doc__rz   rW   __annotations__r{   r|   r~   intr   r   r   r   rv   r   r   r   r   r   r   _grad_accum_fusion_availabler   r   r   r   re   rf   rk   r	   r   r   rJ   r   rC   r   r   r   r   strdictr   r   rA   rA   rA   rB   rL     s4   
  c                   @   r   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< dZ
eed
< dZeed< dZeed< dZeed< dS )GPTConfig126MzConfiguration for a 126M parameter GPT model.

    Predefined configuration for a small GPT model with 12 layers,
    768 hidden size, and 12 attention heads.
       r      r   i   hidden_sizei   ffn_hidden_sizenum_attention_headsTbias_activation_fusionbias_dropout_add_fusionre   Nr   r   r   r   r   r   r   r   r   r   r   r   rW   r   re   rA   rA   rA   rB   r        
 r   c                   @   r   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dZeed< dS )GPTConfig5BzConfiguration for a 5B parameter GPT model.

    Predefined configuration for a medium-sized GPT model with 24 layers,
    4096 hidden size, and 32 attention heads.
    r   r      r      r   i @  r       r   Tr   r   re   Nr   rA   rA   rA   rB   r     r   r   c                   @   r   )GPTConfig7BzConfiguration for a 7B parameter GPT model.

    Predefined configuration for a medium-sized GPT model with 32 layers,
    4096 hidden size, and 32 attention heads.
    r   r   r   r   r   r   i*  r   r   Tr   r   re   Nr   rA   rA   rA   rB   r     r   r   c                   @   r   )GPTConfig20BzConfiguration for a 20B parameter GPT model.

    Predefined configuration for a large GPT model with 44 layers,
    6144 hidden size, and 48 attention heads.
    r   r   ,   r   i   r   i `  r   0   r   Tr   r   re   Nr   rA   rA   rA   rB   r     r   r   c                   @   r   )GPTConfig40BzConfiguration for a 40B parameter GPT model.

    Predefined configuration for a large GPT model with 48 layers,
    8192 hidden size, and 64 attention heads.
    r   r   r   r   i    r   i   r   @   r   Tr   r   re   Nr   rA   rA   rA   rB   r     r   r   c                   @   s   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< dZ
eed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dS )GPTConfig175BzConfiguration for a 175B parameter GPT model.

    Predefined configuration for a massive GPT model with 96 layers,
    12288 hidden size, and 96 attention heads.
    r   r   `   r   i 0  r   i   r   r   g        hidden_dropoutattention_dropoutTr   r   re   layernorm_zero_centered_gammaN)r   r   r   r   r   r   r   r   r   r   r   r   rv   r   r   rW   r   re   r   rA   rA   rA   rB   r     s   
 r   c                       sL  e Zd ZdZdddg fdedee ded deeej	gej	f  dee
 f
 fd	d
Zd(dee ddfddZ					d)dejdejdeej deej deej dejfddZdeeejf fddZdejfddZd(dejfddZd(dejfddZ	d*dejd ed!edefd"d#Zedefd$d%Zedefd&d'Z  ZS )+r   zGPT model implementation using Megatron Core and PyTorch Lightning.

    This class provides a high-level interface for training and using GPT models
    with proper integration with NeMo's infrastructure.
    NrK   optimr   r   model_transformmodel_context_managersc                    sV   t    || _|| _|pttdddd| _| j|  || _|| _	d| _
d| _dS )a  Initialize the GPT model.

        Args:
            config: Configuration for the GPT model
            optim: Optional optimizer module
            tokenizer: Optional tokenizer specification
            model_transform: Optional function to transform the model after initialization
            model_context_managers: Optional list of context managers to apply when configuring and instantiating
                the model.
        g-C6?T)lruse_distributed_optimizerrw   N)superr   rK   r   r   r   r   connectr   r   _training_loss_reduction_validation_loss_reduction)r   rK   r   r   r   r   	__class__rA   rB   r   /  s   

zGPTModel.__init__r\   r   c                 C   s   ddl m} t| ds>t }| jD ]}|| q| jj| j	|d| _
W d   n1 s/w   Y  || j
| jd dS dS )zConfigure the underlying model if not already configured.

        This method ensures the model is instantiated from the configuration.
        r   )restore_modelopt_stater   rd   N)trainer))nemo.collections.llm.modelopt.model_utilsr   r   r   	ExitStackr   enter_contextrK   r   r   r   _trainer)r   r\   r   stackcmrA   rA   rB   r   L  s   


zGPTModel.configure_modelrD   r&   r!   r'   decoder_inputc           
      C   s6   |durd|ini }| j |||f|||d|}	|	S )a  Forward pass through the GPT model.

        Args:
            input_ids: Input token IDs
            position_ids: Position IDs for the input
            attention_mask: Optional attention mask
            labels: Optional labels for computing loss
            decoder_input: Optional decoder input
            inference_context: Optional parameters for inference
            packed_seq_params: Optional parameters for packed sequence processing

        Returns:
            torch.Tensor: Output tensor from the model
        NrE   )r   r'   inference_context)r   )
r   rD   r&   r!   r'   r   r   rE   extra_kwargsoutput_tensorrA   rA   rB   forwarda  s   
zGPTModel.forwardc                 C   s   | j |S )zProcess a batch of data from the dataloader.

        Args:
            dataloader_iter: Iterator over the dataloader

        Returns:
            dict[str, torch.Tensor]: Processed batch
        )rK   r   )r   r7   rA   rA   rB   	data_step  s   	zGPTModel.data_stepc                 C   s   | j | |S )zExecute a forward step using the provided batch.

        Args:
            batch: Input batch

        Returns:
            torch.Tensor: Output from the forward pass
        )rK   r   )r   r9   rA   rA   rB   forward_step  s   	zGPTModel.forward_stepc                 C   
   |  |S )zExecute a training step.

        Args:
            batch: Input batch
            batch_idx: Optional batch index

        Returns:
            torch.Tensor: Loss value
        r  r   r9   	batch_idxrA   rA   rB   training_step  s   
zGPTModel.training_stepc                 C   r  )zExecute a validation step.

        Args:
            batch: Input batch
            batch_idx: Optional batch index

        Returns:
            torch.Tensor: Loss value
        r  r  rA   rA   rB   validation_step  s   
zGPTModel.validation_step 
  params_dtype&inference_batch_times_seqlen_thresholdinference_max_seq_lengthc                 C   s   | j }|rt|tu rnt|dd}|s|du st|tur"tdd}t| jdr/| jj}n| jdur9| jj}ntdt	|jj
||||d}t||}|S )a  Get an inference wrapper for the model.

        Creates and configures a GPTInferenceWrapper around the model for efficient inference.

        Args:
            params_dtype: Data type for parameters
            inference_batch_times_seqlen_threshold: Threshold for optimizing inference
            inference_max_seq_length: Maximum sequence length for inference (prefill and decode)

        Returns:
            GPTInferenceWrapper: Wrapped model for inference
        r   Nz>Exact McoreGPTModel instance not found in the model structure.r   zlUnable to find vocab size. Either pass in a tokenizer with vocab size, or set vocab size in the model config)r   r
  r  padded_vocab_sizer  )r   typer   rX   
ValueErrorr   rK   r   r   r   r   r
   )r   r
  r  r  mcore_modelr   inference_wrapper_configmodel_inference_wrapperrA   rA   rB   get_inference_wrapper  s2   



zGPTModel.get_inference_wrapperc                 C   s   | j st | _ | j S )zGet the loss reduction module for training.

        Returns:
            MaskedTokenLossReduction: Loss reduction module for training
        )r   r   r   rA   rA   rB   training_loss_reduction  s   z GPTModel.training_loss_reductionc                 C   s   | j s	tdd| _ | j S )zGet the loss reduction module for validation.

        Returns:
            MaskedTokenLossReduction: Loss reduction module for validation
        T)r  )r   r   r  rA   rA   rB   validation_loss_reduction  s   z"GPTModel.validation_loss_reductionN)NNNNN)r	  )r   r   r   r   rL   r   r   r   r   Modulelistr   r   r   rr   Tensorr   r   r   r  r  r  r  dtyper
   r  propertyr   r  r  __classcell__rA   rA   r   rB   r   (  sl    
	
%
1r   c                 C   sz   ddl m} | d  }| dd }dur|d|  }n	|dt| }d| v r2| d  nd}|||||ddS )	aG  Extract packed sequence parameters from the batch.

    Creates and returns a PackedSeqParams object with appropriate parameters
    for packed sequence processing.

    Args:
        batch: Input batch containing packed sequence information

    Returns:
        PackedSeqParams: Parameters for packed sequence processing
    r   )PackedSeqParamsr"   r#   Nr$   thd)cu_seqlens_qcu_seqlens_kvmax_seqlen_qmax_seqlen_kv
qkv_format)megatron.core.packed_seq_paramsr  squeezegetitemrr   argmin)r9   r  r"   r#   r$   rA   rA   rB   rG     s   rG   )r   rL   rC   rJ   r[   rc   )Fr  )Tr   rl   dataclassesr   	functoolsr   typingr   r   r   r   r   r	   lightning.pytorchpytorchLrr   torch.distributedJmegatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapperr
   Imegatron.core.inference.model_inference_wrappers.inference_wrapper_configr   "megatron.core.models.gpt.gpt_modelr   r   megatron.core.optimizerr   $megatron.core.transformer.spec_utilsr   ,megatron.core.transformer.transformer_configr   megatron.core.utilsr   r   nemo.collections.llmr   nemo.lightningr   r    nemo.lightning.megatron_parallelr   nemo.lightning.pytorch.optimr   r   
nemo.utilsr   nemo.utils.import_utilsr   _rF   r   fused_weight_gradient_mlp_cudaImportErrortransformersr   1nemo.collections.common.tokenizers.tokenizer_specr   r   r   r  rC   rJ   r[   r   r`   rc   rf   rp   r  rx   ry   IOMixinrL   r   r   r   r   r   r   LightningModuleConnectorMixinFNMixinrG   __all__rA   rA   rA   rB   <module>   sx    8!    Z#