o
    i                     @   s   d dl Z d dlmZmZ d dl mZ ddlmZ ddlmZm	Z	 ddl
mZmZ d	d
lmZ G dd deZG dd deZG dd deZdS )    N)LinearModule)Tensor   )GeluNew)AttentionMaskScaledDotProductAttention   )BertAttentionConfigBertLayerConfig   )Errorsc                       s\   e Zd Zdef fddZdedefddZdedefdd	Zded
edefddZ	  Z
S )BertSelfAttentionconfigc                    s   t    |j| _|j| _| j| j dkr!ttjj	| j| jd| j| j | _
t|jd| _t| j| jd | _t| j| j| _d S )Nr   )hidden_width	num_heads)dropout_probr   )super__init__r   	model_dimnum_attention_headsr   
ValueErrorr   E003formatdims_per_headr   r   	attentionr   inputoutputselfr   	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/curated_transformers/models/bert/layer.pyr      s   
zBertSelfAttention.__init__xreturnc                 C   s*   |  \}}}|||| j| jddS )z}
        Shapes:
            x - (batch, seq_len, width)
            output - (batch, head, seq_len, width_per_head)
        r	   r   )sizeviewr   r   	transpose)r   r$   
batch_sizeseq_lenr   r"   r"   r#   _split_heads   s   zBertSelfAttention._split_headsc                 C   s.   |  \}}}}|dd |||| S )z}
        Shapes:
            x - (batch, head, seq_len, width_per_head)
            output - (batch, seq_len, width)
        r	   r   )r&   r(   
contiguousr'   )r   r$   r)   headr*   r   r"   r"   r#   _combine_heads)   s   z BertSelfAttention._combine_heads	attn_maskc           	      C   s`   |  |}|jddd\}}}| |}| |}| |}| | ||||}| |}|S )j
        Shapes:
            x - (batch, seq_len, width)
            attn_mask - (batch, seq_len)
        r   )dim)r   chunkr+   r.   r   r   )	r   r$   r/   projqkvattnoutr"   r"   r#   forward4   s   




zBertSelfAttention.forward)__name__
__module____qualname__r
   r   r   r+   r.   r   r:   __classcell__r"   r"   r    r#   r      s
    r   c                       s4   e Zd Zdef fddZdedefddZ  ZS )BertFeedForwardr   c                    s   t    t|j|j| _t|j|j| _|jdkr"tj	
 | _d S |jdkr/tj	 | _d S |jdkr:t | _d S ttjjdd)Nrelugelugelu_new)r@   rA   rB   )activation_funcs)r   r   r   r   intermediate_widthintermediater   
hidden_acttorchnnReLU
activationGELUr   r   r   E004r   r   r    r"   r#   r   K   s   



zBertFeedForward.__init__r$   r%   c                 C   s"   |  |}| |}| |}|S )zA
        Shapes:
            x - (batch, seq_len, width)
        )rE   rJ   r   )r   r$   r9   r"   r"   r#   r:   _   s   


zBertFeedForward.forward)r;   r<   r=   r   r   r   r:   r>   r"   r"   r    r#   r?   J   s    r?   c                       s<   e Zd Zdedef fddZdededefdd	Z  Z	S )
BertEncoderLayerlayer_configattention_configc                    sr   t    t|| _tjj|j|jd| _	tjj
|jd| _t|| _tjj|j|jd| _tjj
|jd| _d S )N)eps)p)r   r   r   mharG   rH   	LayerNormr   layer_norm_epsattn_output_layernormDropoutr   attn_output_dropoutr?   ffnffn_output_layernormffn_output_dropout)r   rN   rO   r    r"   r#   r   k   s   


zBertEncoderLayer.__init__r$   r/   r%   c                 C   sJ   |  ||}| |}| || }| |}| |}| || }|S )r0   )rR   rW   rU   rX   rZ   rY   )r   r$   r/   attn_outffn_outr"   r"   r#   r:   {   s   


zBertEncoderLayer.forward)
r;   r<   r=   r   r
   r   r   r   r:   r>   r"   r"   r    r#   rM   j   s    rM   )rG   torch.nnr   r   r    r   r   r   r   r   r
   r   errorsr   r   r?   rM   r"   r"   r"   r#   <module>   s    > 