o
    }oit                  	   @   s  d dl mZ d dlmZmZmZmZmZmZ d dl	m
Z d dlZd dlZd dlmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z) d dl*m+Z+m,Z, d dlm-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7m8Z8 d dl9m:Z:m;Z; dZ<zd dl=Z=d dl>m?Z? d dl@mAZB W n eCeDfy   dZ<e%ZBY nw erd dlEmFZF deeGej-f fddZHdejIdeeGej-f dej-fd d!ZJd"d#defd$d%ZKeG d&d# d#e#e8jLZMG d'd( d(eBZNeG d)d* d*e&ZOG d+d, d,e%ZPG d-d. d.e!ZQG d/d0 d0ejIe8jLe8jRe0jSZAdeeGej-f defd1d2ZTdS )3    )	dataclass)TYPE_CHECKINGCallableDictLiteralOptionalUnionN)InferenceParamsparallel_statetensor_parallel)FusedLayerNorm)
BertLMHead)Pooler)OptimizerConfig)PackedSeqParams)
ModuleSpecbuild_module)TransformerBlock)TransformerConfig)TransformerLayerTransformerLayerSubmodules)get_linear_layer)get_batch_on_this_cp_rankmake_viewless_tensor)Tensornn)fn)BERTLossReduction) get_bert_layer_local_spec_postln2get_bert_layer_with_transformer_engine_spec_postln)get_vocab_sizeio)MegatronOptimizerModuleOptimizerModuleT)bert_layer_specs)	BertModelF)TokenizerSpecreturnc                    s   t | }t|trt|dkr|d }n|}t   d t r' d t r0 	d  fdd|
 D }t|}|S )zSetup BERT dataloader batch.   r   padding_masktext)labels	loss_masktypes	is_randomc                    s*   i | ]\}}|| v r|j d dndqS )T)non_blockingN)cuda).0keyvalrequired_keys X/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/bert/model/base.py
<dictcomp>J   s   * z"bert_data_step.<locals>.<dictcomp>)next
isinstancetuplelensetaddr
   is_pipeline_first_stageis_pipeline_last_stageupdateitemsr   )dataloder_iterbatch_batchoutputr6   r4   r7   bert_data_step9   s   



rG   modelrD   c                 C   sX   |d |d |d |d d}| j jdkr|d |d< d	|v r%t||d
< | di |S )a  
    This subsets the batch keys to the ones actually used by forward pass of the model,
    and then calls the model's forward pass. if "cu_seqsens" are defined in the batch,
    then the packed sequence parameters are also passed to the model for forward pass efficiency.
    r*   r)   r+   r,   )	input_idsattention_mask	lm_labelsr,   r   r-   tokentype_ids
cu_seqlenspacked_seq_paramsNr6   )confignum_tokentypesget_packed_seq_params)rH   rD   forward_argsr6   r6   r7   bert_forward_stepQ   s   rS   rO   
BertConfigc                 C   sR   | j }|dks|dksJ d| dtr|dkrtjS t S |dkr&tjS t S )z
    Return MCore layer spec based on the bert type.
    For bert_type == 'megatron', use mcore's default layer spec;
    For bert_type == 'huggingface', use Post-LayerNorm layer spec.
    megatronhuggingfacezUnknown bert type z9, supported type for bert model is: megatron, huggingface)	bert_typeHAVE_TEr$   'bert_layer_with_transformer_engine_specr   bert_layer_local_specr   )rO   rW   r6   r6   r7   default_layer_specg   s   
r[   c                   @   s<  e Zd ZU dZdZeed< dZeed< dZeed< dZ	e
d ed	< d
Zeed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZdZeed< eZeeed gef f ed< eZeed< eZeed< dZe
d ed< dZ eed< dZ!eed< dZ"eed< dZ#eed< dZ$eed < d&d!ee d"d#fd$d%Z%dS )'rT   zZ "Model config for BERT model. Adpated from megatron.core.models.bert.bert_model.BertModelFfp16_lm_cross_entropyTparallel_output#share_embeddings_and_output_weightslearned_absolute)r_   ropeposition_embedding_typei'  rotary_baseg      ?rotary_percentNseq_len_interpolation_factori   
seq_lengthattention_softmax_in_fp32masked_softmax_fusion   make_vocab_size_divisible_bytransformer_layer_specforward_step_fndata_step_fnrU   )rU   rV   rW   
add_poolerbert_binary_headadd_lm_headrP   mask_vocab_padding_tokensvp_stager'   &MCoreBertModelWrapperWithPostLNSupportc                 C   s  | j }|r| j}| j| | dksJ dddlm} | j}t|ts'|| }| jdu r4| j	r1dnd| _|p7d}t
di d| jd| jd| d	| jd
|dt| |j| jd| jr\|n:dd| jd|jd|dd|jd|dd| jd| jd| jd| jd| jd| jd| j	ddd|S d| jd|jd|dd|jd|dd| jd| jd| jd| jd| jd| jd| j	ddd|S )zConfigure the BERT Model.
        For bert_type == 'megatron', num_tokentypes in embedding is controlled by whether model has binary head.
        For bert_type == 'huggingface', tokentypes embedding is always added with num_tokentypes = 2.
        r   zLMake sure the number of model chunks is the same across all pipeline stages.)r
   N   rW   rm   rO   rP   rj   
vocab_size	tokenizermax_sequence_lengthpre_processF)ignore_virtualrq   post_processr\   r]   r^   ra   rc   rd   add_binary_headreturn_embeddingsrq   r6   )$virtual_pipeline_model_parallel_sizepipeline_model_parallel_size
num_layersmegatron.corer
   rj   r:   r   rP   rn   rr   rW   rm   r    rt   ri   rp   re   r?   r@   r\   r]   r^   ra   rc   rd   )selfru   rq   vp_sizep_sizer
   rj   r6   r6   r7   configure_model   s   

	
	
zBertConfig.configure_modelN)&__name__
__module____qualname____doc__r\   bool__annotations__r]   r^   ra   r   rb   intrc   floatrd   r   re   rf   rg   deallocate_pipeline_outputsri   r[   rj   r   r   r   rS   rk   rG   rl   rW   rm   rn   ro   rP   rp   r   r6   r6   r6   r7   rT   }   s0   
 c                       s\   e Zd ZdZ	dded f fddZ						dd
ededededef
 fddZ  ZS )rr   a  
    This class is used for working with HF Bert Checkpoints. These checkpoints
    by default have post layer norm, while the vanilla mcore bert model does not support it.
    when bert_type is set to 'huggingface', it will initialize post layer norm BERT model.
    rU   TNru   r&   c                    sh  t t| j|i | || _|| _|| _| jdks'| jdks'J d| j dt| j| j| j	| j
| jdkr7dnd| j|dd d| _| j	rN| j| jj | jret| jj| jj| j| jj| jj| _| j
rd | _| jjrxt| jj| j| _tj| jj| j| j| jjdd| j | j	o| jd	| _d | _| jrt | jjd
| jj| jj!| _| j	s| j
r| "  d S d S )NrU   rV   z<bert_type should either be megatron or huggingface, but got .TFrq   )rO   specrw   ry   post_layer_normrW   rq   )rO   init_methodbiasskip_bias_addgather_outputskip_weight_param_allocationrs   )#superrr   __init__rm   rW   ru   !TransformerBlockWithPostLNSupportrO   rj   rw   ry   getencoder	embeddingtoparams_dtyper   hidden_sizer   sequence_parallelpoolerlm_headro   MCoreBertLMHeadr   ColumnParallelLinearrt   r]   r^   output_layerbinary_headrz   mcore_get_linear_layerperform_initialization!setup_embeddings_and_output_layer)r   rW   rm   ru   argskwargs	__class__r6   r7   r      sb   


z/MCoreBertModelWrapperWithPostLNSupport.__init__FrI   rJ   rL   rK   r,   c                    s  | j }d| _ t |||||}	|| _ | j r|r|	S | jrbt|	dd}
tj|dd}tj|
jd |
jd ftj	tj
 d}tt|
|D ]\}\}}tj|d|d  dd||ddf< qF|S d}| jrk|  }| j|	d}| j||d	\}}d}| jdur| jr| |	d}| |}|du r|dd ||d
S | jrddlm} | jj}|j}| d }t }t }||||\}}t || d}t!d|dddd|df< | "||}|||dS )aM  Forward function of BERT model

        Forward function of the BERT Model This function passes the input tensors
        through the embedding layer, and then the encoder and finally into the post
        processing layer (optional).

        It either returns the Loss values if labels are given  or the final hidden units
        Fr      )dimrs   )sizedtypedeviceN)hidden_states)weight)logitsbinary_logitsr,   )VocabUtilityz-inf)lm_lossr   r,   )#ry   r   forwardr{   torch	transposesumzerosshapefloat32r0   current_device	enumeratezipmeanr^   !shared_embedding_or_output_weightr   r   r   rm   r   
contiguousru   #megatron.core.tensor_parallel.utilsr   rt   )vocab_range_from_per_partition_vocab_sizer   r
   get_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizemaxr   compute_language_model_loss)r   rI   rJ   rL   rK   r,   inference_paramshidden_states_onlyoriginal_post_processr   
embeddingsmasksrF   ir   maskoutput_weighthidden_states_after_lm_headr   _r   pooled_outputr   unpadded_vocab_sizeget_vocab_rangepadded_vocab_sizerank
world_sizevocab_start_index
mask_startlossr   r6   r7   r     s\   
(
z.MCoreBertModelWrapperWithPostLNSupport.forward)rU   TN)NNNNF)	r   r   r   r   r   r   r   r   __classcell__r6   r6   r   r7   rr      s,    Err   c                       s    e Zd ZdZ fddZ  ZS )+TransformerLayerSubmodulesWithPostLNSupportzXWrapper for TransformerLayerSubmodules with additional post-attention LN and post MLP LNc                    s&   t t| jdi | || _|| _d S )Nr6   )r   r   r   post_att_layernormpost_mlp_layernorm)r   r   r   r   r   r6   r7   r   k  s   
z4TransformerLayerSubmodulesWithPostLNSupport.__init__)r   r   r   r   r   r   r6   r6   r   r7   r   g  s    r   c                       s4   e Zd ZdZ fddZ					dddZ  ZS )!TransformerLayerWithPostLNSupportzn
    Adapted from mcore's TransformerLayer with additional post-attention LN and
    post MLP LN support.
    c                    sZ   t t| j|i | t| jj| j| jj| jjd| _t| jj	| j| jj| jjd| _	d S )NrO   r   eps)
r   r   r   r   submodules_configr   rO   r   layernorm_epsilonr   )r   r   r   r   r6   r7   r   w  s   z*TransformerLayerWithPostLNSupport.__init__Nc                 K   sl  |}	|  |}
| j|
||||d}|   | | j| jj||	| j}W d   n1 s.w   Y  |}	| |}| 	|}| j
||||d}t|trUd|v rU|d }|   | | j| jj||	| j}W d   n1 srw   Y  |}	| |}| |}|   | | j| jj||	| j}W d   n1 sw   Y  | |}t||jdd}||fS )a  
        Perform a forward pass through the transformer layer.
        Perform post-attention LN and post MLP LN if module exists.

        This method implements the core computation of a transformer layer, including
        self-attention, cross-attention (if applicable), and feed-forward operations.

        Args:
            hidden_states (Tensor): Input tensor of shape [s, b, h] where s is sequence length,
                b is batch size, and h is hidden size.
            attention_mask (Tensor): Mask tensor for self-attention.
            context (Tensor, optional): Context tensor for cross-attention.
            context_mask (Tensor, optional): Mask tensor for cross-attention.
            rotary_pos_emb (Tensor, optional): Rotary positional embeddings.
            inference_params (object, optional): Parameters for inference-time optimizations.
            packed_seq_params (object, optional): Parameters for packed sequence processing.

        Returns:
            Tuple[Tensor, Tensor]: A tuple containing:
                output (Tensor): Transformed hidden states of shape [s, b, h].
                context (Tensor): Updated context tensor if cross-attention is used,
                otherwise None.
        )rJ   r   rotary_pos_embrN   N)rJ   key_value_statesr   contextT)inprequires_grad
keep_graph)input_layernormself_attentionbias_dropout_add_exec_handlerself_attn_bdatrainingrO   bias_dropout_fusionhidden_dropoutr   pre_cross_attn_layernormcross_attentionr:   dictcross_attn_bdapre_mlp_layernormmlpmlp_bdar   r   r   )r   r   rJ   r   context_maskr   r   rN   r   residualinput_layernorm_outputattention_output_with_biaspre_cross_attn_layernorm_outputpre_mlp_layernorm_outputmlp_output_with_biasrF   r6   r6   r7   r     sR   %









z)TransformerLayerWithPostLNSupport.forwardNNNNN)r   r   r   r   r   r   r   r6   r6   r   r7   r   q  s    r   c                       sX   e Zd ZdZd fdd	Z					ddededed	ed
ededef fddZ  Z	S )r   z`Adapted from mcore's TransformerBlock with additional post-attention LN and post MLP LN support.rU   c                    sR   t t| j|i | || _| jdkr't| j| jj| jjd| jj	| _
d S d S )NrV   r   )r   r   r   transformer_block_typer   rO   r   r   r   r   initial_layernorm)r   rW   r   r   r   r6   r7   r     s   

z*TransformerBlockWithPostLNSupport.__init__Nr   rJ   r   r   r   r   rN   c           	   	      s<   | j s| j}| jdkr| |}tt| |||||||S )a  
        Perform the forward pass through the transformer block.
        Perform additional post-attention LN and post MLP LN support if needed.

        This method handles the core computation of the transformer, including
        self-attention, optional cross-attention, and feed-forward operations.

        Args:
            hidden_states (Tensor): Input tensor of shape [s, b, h] where s is the
                sequence length, b is the batch size, and h is the hidden size.
            attention_mask (Tensor): Boolean tensor of shape [1, 1, s, s] for masking
                self-attention.
            context (Tensor, optional): Context tensor for cross-attention.
            context_mask (Tensor, optional): Mask for cross-attention context
            rotary_pos_emb (Tensor, optional): Rotary positional embeddings.
            inference_params (InferenceParams, optional): Parameters for inference-time
                optimizations.
            packed_seq_params (PackedSeqParams, optional): Parameters for packed sequence
                processing.

        Returns:
            Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape
            [s, b, h], and optionally the updated context tensor if cross-attention is used.
        rV   )rw   input_tensorr  r  r   r   r   )	r   r   rJ   r   r   r   r   rN   r   r   r6   r7   r   
  s   %


z)TransformerBlockWithPostLNSupport.forward)rU   r  )
r   r   r   r   r   r   r	   r   r   r   r6   r6   r   r7   r     s.    r   c                       s   e Zd ZdZ			ddedee ded deeej	gej	f  f fdd	Z
dd
ee ddfddZdejfddZdeeejf fddZdejfddZddejfddZddejfddZedefddZedefddZ  ZS )r%   zBert Lightning ModuleNrO   optimru   r&   model_transformc                    s\   t sJ dt   || _|| _|pttdddd| _| j|  || _	d | _
d | _d S )Nz6NeMo BERT requires Transformer Engine to be installed.g-C6?T)lruse_distributed_optimizer)rO   )rX   r   r   rO   ru   r"   r   r  connectr  _training_loss_reduction_validation_loss_reduction)r   rO   r  ru   r  r   r6   r7   r   <  s   

zBertModel.__init__rq   r'   c                 C   s$   t | ds| j| j|| _dS dS )z0Setup the BERT Model based on config definition.moduleN)hasattrrO   r   ru   r  )r   rq   r6   r6   r7   r   V  s   
zBertModel.configure_modelc                 O   s   | j |i |}|S )zPCall the forward method of the underlying model, and return whatever it outputs.)r  )r   r   r   output_tensorr6   r6   r7   r   [  s   zBertModel.forwardc                 C   s   | j |S r   )rO   rl   )r   dataloader_iterr6   r6   r7   	data_stepd  s   zBertModel.data_stepc                 C   s   | j | |S r   )rO   rk   )r   rD   r6   r6   r7   forward_stepg  s   zBertModel.forward_stepc                 C   
   |  |S r   r  r   rD   	batch_idxr6   r6   r7   training_stepj     
zBertModel.training_stepc                 C   r  r   r  r  r6   r6   r7   validation_stepn  r  zBertModel.validation_stepc                 C   s   | j st| jjd| _ | j S )N)add_sop_loss)r  r   rO   rn   r   r6   r6   r7   training_loss_reductionr  s   z!BertModel.training_loss_reductionc                 C   s   | j std| jjd| _ | j S )NT)r  r  )r  r   rO   rn   r  r6   r6   r7   validation_loss_reductiony  s
   z#BertModel.validation_loss_reduction)NNNr   )r   r   r   r   rT   r   r#   r   r   Moduler   r   r   r   r   r   r   strr  r  r  r  propertyr   r  r  r   r6   r6   r   r7   r%   9  s4    
	r%   c                 C   sn   | d   }| dddu }r|d|  }n	|dt| }d| v r,| d   nd}t||||ddS )a  
    Get the packed sequence parameters for the given batch.
    This function should only be called if `cu_seqlens` is defined in the batch.

    Args:
        batch (dict): The input batch containing the following keys:
            - cu_seqlens (torch.Tensor): The sequence lengths of the input batch.
            - cu_seqlens_argmin (torch.Tensor, optional): The minimum sequence length index.
            - max_seqlen (torch.Tensor, optional): The maximum sequence length.

    Returns:
        PackedSeqParams: The packed sequence parameters containing the following attributes:
            - cu_seqlens_q (torch.Tensor): The sequence lengths for query.
            - cu_seqlens_kv (torch.Tensor): The sequence lengths for key and value.
            - max_seqlen_q (torch.Tensor, optional): The maximum sequence length for query.
            - max_seqlen_kv (torch.Tensor, optional): The maximum sequence length for key and value.
            - qkv_format (str): The format of query, key, and value tensors.

    rM   cu_seqlens_argminN
max_seqlenthd)cu_seqlens_qcu_seqlens_kvmax_seqlen_qmax_seqlen_kv
qkv_format)squeezer   itemr   argminr   )rD   rM   r"  r#  r6   r6   r7   rQ     s   rQ   )Udataclassesr   typingr   r   r   r   r   r   lightning.pytorchpytorchLr   torch.distributedr   r	   r
   r   &megatron.core.fusions.fused_layer_normr   &megatron.core.models.bert.bert_lm_headr   r    megatron.core.models.bert.poolerr   megatron.core.optimizerr   megatron.core.packed_seq_paramsr   $megatron.core.transformer.spec_utilsr   r   +megatron.core.transformer.transformer_blockr   ,megatron.core.transformer.transformer_configr   +megatron.core.transformer.transformer_layerr   r   megatron.core.transformer.utilsr   r   megatron.core.utilsr   r   r   r   nemo.collections.llmr   nemo.collections.llm.bert.lossr   )nemo.collections.llm.bert.model.bert_specr   r   nemo.lightningr    r!   nemo.lightning.pytorch.optimr"   r#   rX   transformer_enginemegatron.core.models.bertr$   $megatron.core.models.bert.bert_modelr%   	MCoreBertImportErrorModuleNotFoundError1nemo.collections.common.tokenizers.tokenizer_specr&   r   rG   LightningModulerS   r[   IOMixinrT   rr   r   r   r   ConnectorMixinFNMixinrQ   r6   r6   r6   r7   <module>   s`    $J  	 > J