o
    Ti                     @   sX   d dl T ddlmZ ddlZddlmZ ddlmZ G dd	 d	eZ	G d
d deZ
dS )   )*    )DeepSpeedBERTInferenceN)	Parameter   )TransformerPolicyc                       s&   e Zd Z fddZdddZ  ZS )DS_BERTContainerc                    s4   t  jdi | d| _d| _|d jotj| _d S )NTFconfig )super__init__return_tupletriangular_masking
use_triton	deepspeed
HAS_TRITON)selfkwargs	__class__r
   [/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/containers/bert.pyr      s   zDS_BERTContainer.__init__Nc                 C   s4   |d ur|n| j }t|| jd| _| j| jj_| jS )N)mp_group)ds_model_configr   r   modulescale_attentionr	   )r   r	   _configr
   r
   r   create_module   s   zDS_BERTContainer.create_moduleN)__name__
__module____qualname__r   r   __classcell__r
   r
   r   r   r      s    r   c                       sB   e Zd Zd fdd	Zdd ZdddZddd	Zd
d Z  ZS )HFBertLayerPolicyFc                    sf   t  j|dd || _d| _tjd u r1zdd l}|jjj	j
|jjjjgt_W d S    d t_Y d S d S )NF)pre_attn_normTr   )r   r   client_modulecuda_graph_supportedr"   _orig_layer_classtransformersmodelsbertmodeling_bert	BertLayerrobertamodeling_robertaRobertaLayer)r   r$   	inferencer'   r   r
   r   r       s   


zHFBertLayerPolicy.__init__c                 C   sB   | j r| jj}n| jjjj}| jjjjjj	d | jjjj
|jtfS )Nr   )r#   r$   PostAttentionLayerNorm	attentionoutput	LayerNormr   queryweightshapenum_attention_headsepsDEFAULT_INTERMEDIATE_SIZE)r   attention_layernormr
   r
   r   get_hidden_heads/   s   

z"HFBertLayerPolicy.get_hidden_headsc           
      C   s   | j jjjj}| j jjjj}| j jjjj}| j jjjj}| j jjjj}| j jjjj}tt	j
|||fdd|d}tt	j
|||fdd|d}	||	| j jjjj| j jjjjfS )Nr   )dim)requires_grad)r$   r1   r   r4   r5   biaskeyvaluer   torchcatr2   dense)
r   enable_trainingqwqbkwkbvwvbqkvwqkvbr
   r
   r   r1   9   s   zHFBertLayerPolicy.attentionc                 C   s<   | j r	| jjj}n| jjj}|j|j| jjjj| jjjjfS r   )r#   r$   intermediate	dense_actrC   r5   r>   r2   )r   rD   intermediate_ffr
   r
   r   mlpI   s   


zHFBertLayerPolicy.mlpc                 C   sB   | j r| jj}| jj}n| jjjj}| jjj}|j|j|j|jfS r   )	r#   r$   r0   PreAttentionLayerNormr1   r2   r3   r5   r>   )r   r:   transformer_layernormr
   r
   r   	layernormS   s   

zHFBertLayerPolicy.layernorm)F)	r   r   r    r   r;   r1   rP   rS   r!   r
   r
   r   r   r"      s    



r"   )base4deepspeed.model_implementations.transformers.ds_bertr   rA   torch.nn.parameterr   policyr   BaseTransformerContainerr   r"   r
   r
   r
   r   <module>   s   