from dataclasses import dataclass

from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults

try:
    from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
    from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
    from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
    from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
    from megatron.core.transformer.dot_product_attention import DotProductAttention
    from megatron.core.transformer.enums import AttnMaskType
    from megatron.core.transformer.identity_op import IdentityOp
    from megatron.core.transformer.mlp import MLP, MLPSubmodules
    from megatron.core.transformer.spec_utils import ModuleSpec, build_module
    from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
    from megatron.core.utils import make_viewless_tensor

    HAVE_MEGATRON_CORE = True
except (ImportError, ModuleNotFoundError):
    TransformerConfig = ApexGuardDefaults
    HAVE_MEGATRON_CORE = False

try:
    from megatron.core.transformer.custom_layers.transformer_engine import (
        TEColumnParallelLinear,
        TEDotProductAttention,
        TENorm,
        TERowParallelLinear,
    )

    HAVE_TE = True
except (ImportError, ModuleNotFoundError):
    HAVE_TE = False

@dataclass
class TransformerLayerSubmodulesWithPostLNSupport(TransformerLayerSubmodules):
    """TransformerLayerSubmodules with post layer norm"""

    def __init__(self, post_att_layernorm, post_mlp_layernorm, **kwargs):
        super(TransformerLayerSubmodulesWithPostLNSupport, self).__init__(**kwargs)
        self.post_att_layernorm = post_att_layernorm
        self.post_mlp_layernorm = post_mlp_layernorm

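# The submodule container above only records two extra norm specs on top of megatron-core's
# TransformerLayerSubmodules; the layer class below instantiates them with build_module and
# applies them after the attention and MLP residual branches.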
class TransformerLayerWithPostLNSupport(TransformerLayer):
    """TransformerLayer with post layer norm."""

    def __init__(self, *args, **kwargs):
        super(TransformerLayerWithPostLNSupport, self).__init__(*args, **kwargs)
        # Post-attention and post-MLP layer norms built from the extended submodule spec.
        self.post_att_layernorm = build_module(
            self.submodules_config.post_att_layernorm,
            config=self.config,
            hidden_size=self.config.hidden_size,
            eps=self.config.layernorm_epsilon,
        )
        self.post_mlp_layernorm = build_module(
            self.submodules_config.post_mlp_layernorm,
            config=self.config,
            hidden_size=self.config.hidden_size,
            eps=self.config.layernorm_epsilon,
        )
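    # forward() follows megatron-core's TransformerLayer.forward, with two additions:
    # post_att_layernorm is applied right after the self-attention bias-dropout-add,
    # and post_mlp_layernorm right after the MLP bias-dropout-add.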
d}|   | | j| jj||| j}W d   n1 s.w   Y  |}| |}| 	|}| j
||||	d}t|trUd|v rU|d }|   | | j| jj||| j}W d   n1 srw   Y  |}| |}| |}|   | | j| jj||| j}W d   n1 sw   Y  | |}t||jdd}||fS )zCopy from megatron/core/transformer/transformer_layer.py with modification of applying
        extra post layer norm if needed.)attention_maskinference_paramsrotary_pos_embpacked_seq_paramsN)r/   key_value_statesr0   contextT)inprequires_grad
keep_graph)input_layernormself_attentionbias_dropout_add_exec_handlerself_attn_bdatrainingr)   bias_dropout_fusionhidden_dropoutr   pre_cross_attn_layernormcross_attention
isinstancedictcross_attn_bdapre_mlp_layernormmlpmlp_bdar   r   r6   )r   hidden_statesr/   r4   context_maskr1   rotary_pos_cosrotary_pos_sinattention_biasr0   r2   r   residualinput_layernorm_outputattention_output_with_biaspre_cross_attn_layernorm_outputpre_mlp_layernorm_outputmlp_output_with_biasoutputr   r   r"   forwardP   sR   









def get_bert_layer_with_transformer_engine_spec_postln():
    """Retrieve the Layer Spec when using Transformer Engine"""
    return ModuleSpec(
        module=TransformerLayerWithPostLNSupport,
        submodules=TransformerLayerSubmodulesWithPostLNSupport(
            self_attention=ModuleSpec(
                module=SelfAttention,
                params={"attn_mask_type": AttnMaskType.padding},
                submodules=SelfAttentionSubmodules(
                    linear_qkv=TEColumnParallelLinear,
                    core_attention=TEDotProductAttention,
                    linear_proj=TERowParallelLinear,
                    q_layernorm=IdentityOp,
                    k_layernorm=IdentityOp,
                ),
            ),
            self_attn_bda=get_bias_dropout_add,
            post_att_layernorm=TENorm,
            mlp=ModuleSpec(
                module=MLP,
                submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear),
            ),
            mlp_bda=get_bias_dropout_add,
            post_mlp_layernorm=TENorm,
        ),
    )

def get_bert_layer_local_spec_postln():
    """Retrieve the Layer Spec when using MCore Engine"""
    return ModuleSpec(
        module=TransformerLayerWithPostLNSupport,
        submodules=TransformerLayerSubmodulesWithPostLNSupport(
            self_attention=ModuleSpec(
                module=SelfAttention,
                params={"attn_mask_type": AttnMaskType.padding},
                submodules=SelfAttentionSubmodules(
                    linear_qkv=ColumnParallelLinear,
                    core_attention=DotProductAttention,
                    linear_proj=RowParallelLinear,
                    q_layernorm=IdentityOp,
                    k_layernorm=IdentityOp,
                ),
            ),
            self_attn_bda=get_bias_dropout_add,
            post_att_layernorm=FusedLayerNorm,
            mlp=ModuleSpec(
                module=MLP,
                submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear),
            ),
            mlp_bda=get_bias_dropout_add,
            post_mlp_layernorm=FusedLayerNorm,
        ),
    )
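

# Illustrative usage sketch (assumes the caller's model-building code, which is not part of
# this module): a BERT configuration would typically pick one of the two specs depending on
# whether Transformer Engine is available, e.g.
#
#   transformer_layer_spec = (
#       get_bert_layer_with_transformer_engine_spec_postln()
#       if HAVE_TE
#       else get_bert_layer_local_spec_postln()
#   )
#
# and hand it to the megatron-core transformer stack, which calls build_module(spec, ...)
# for every layer it constructs.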