o
    Tit                     @   sX   d dl T ddlmZ ddlZddlmZ ddlmZ G dd	 d	eZ	G d
d deZ
dS )   )*    )DeepSpeedBERTInferenceN)	Parameter   )TransformerPolicyc                       s&   e Zd Z fddZdddZ  ZS )DS_DistilBERTContainerc                    s4   t  jdi | d| _d| _|d jotj| _d S )NFTconfig )super__init__triangular_maskingreturn_single_tuple
use_triton	deepspeed
HAS_TRITON)selfkwargs	__class__r
   b/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/containers/distil_bert.pyr      s   zDS_DistilBERTContainer.__init__Nc                 C   s4   |d ur|n| j }t|| jd| _| j| jj_| jS )N)mp_group)ds_model_configr   r   modulescale_attentionr	   )r   r	   _configr
   r
   r   create_module   s   z$DS_DistilBERTContainer.create_moduleN)__name__
__module____qualname__r   r   __classcell__r
   r
   r   r   r      s    r   c                       sF   e Zd ZdZd fdd	Zdd ZdddZdd	d
Zdd Z  Z	S )HFDistilBertLayerPolicyNFc                    s^   t  | || _|| _d| _tjd u r-zdd l}|jj	j
jgt_W d S    d t_Y d S d S )NTr   )r   r   client_moduleprelncuda_graph_supportedr"   _orig_layer_classtransformersmodels
distilbertmodeling_distilbertTransformerBlock)r   r#   	inferencer$   r'   r   r
   r   r   !   s   

z HFDistilBertLayerPolicy.__init__c                 C   s&   | j jjjjd | j jj| j jjtfS )Nr   )	r#   	attentionq_linweightshapen_headssa_layer_normepsDEFAULT_INTERMEDIATE_SIZE)r   r
   r
   r   get_hidden_heads/   s
   z(HFDistilBertLayerPolicy.get_hidden_headsc           
      C   s   | j jjj}| j jjj}| j jjj}| j jjj}| j jjj}| j jjj}ttj	|||fdd|d}ttj	|||fdd|d}	||	| j jj
j| j jj
jfS )Nr   )dim)requires_grad)r#   r-   r.   r/   biask_linv_linr   torchcatout_lin)
r   enable_trainingqwqbkwkbvwvbqkvwqkvbr
   r
   r   r-   5   s   

z!HFDistilBertLayerPolicy.attentionc                 C   s*   | j jj}|j|j| j jjj| j jjjfS r   )r#   ffnlin1r/   r8   lin2)r   r>   intermediate_ffr
   r
   r   mlpE   s
   


zHFDistilBertLayerPolicy.mlpc                 C   s$   | j j}| j j}|j|j|j|jfS r   )r#   r2   output_layer_normr/   r8   )r   attention_layernormtransformer_layernormr
   r
   r   	layernormL   s   z!HFDistilBertLayerPolicy.layernorm)FF)F)
r   r   r    r&   r   r5   r-   rK   rO   r!   r
   r
   r   r   r"      s    

r"   )base4deepspeed.model_implementations.transformers.ds_bertr   r;   torch.nn.parameterr   policyr   BaseTransformerContainerr   r"   r
   r
   r
   r   <module>   s   