o
    ̳id/                     @   s   d dl Z d dlmZmZmZ d dlZd dlm  mZ	 d dlmZm
Z
 d dlmZ d dlmZ G dd dejZG dd	 d	ejZG d
d dejZG dd dejZdededede
fddZdS )    N)ListOptionalUnion)nnTensor)MultiHeadAttention)_get_clonesc                       sz   e Zd ZdZdddejdeejeej ej	f dejde
de
d	e
d
e
dee
 ddf fddZdedefddZ  ZS )	T5Encodera  
    The T5 encoder module.

    T5 paper: https://arxiv.org/abs/1910.10683

    Args:
        token_embedding (nn.Embedding): PyTorch embedding layer to place tokens in an embedding space.
        layers (Union[nn.Module, List[nn.Module], nn.ModuleList]): A single encoder layer.
        final_norm (nn.Module): Module that applies normalization to the output of the encoder
        num_heads (int): The number of attention heads.
        rel_pos_num_buckets (int): Number of discrete buckets to divide the relative positions into.
            See: :class:`~torchtune.models.t5._encoder.T5EncoderRelativePositionBias`
        rel_pos_max_dist (int): Maximum distance for relative positions.
            Distances beyond this are grouped into the last bucket.
            See: :class:`~torchtune.models.t5._encoder.T5EncoderRelativePositionBias`
        max_seq_len (int): The maximum sequence length (context length) of the model.
        num_layers (Optional[int]): Number of encoder layers, only define when layers is not a list.

    Raises:
        AssertionError:
            If ``num_layers`` is set and layer is a list, **or**
            ``num_layers`` is not set and layer is an ``nn.Module``.

    N)
num_layerstoken_embeddinglayers
final_norm	num_headsrel_pos_num_bucketsrel_pos_max_distmax_seq_lenr
   returnc          	         s   t    || _|| _|| _t||||d| _d | _t|t	j
r%|| _d S t|tr2t	
|| _d S t|t	js<td|d u rDtdt||| _d S )N)num_bucketsmax_distr   r   z.num_layers is defined, layers must be a modulez0num_layers is not defined, layers must be a list)super__init__r   r   r   T5EncoderRelativePositionBiasrelative_position_biasr   
isinstancer   
ModuleListlistModuleAssertionErrorr   )	selfr   r   r   r   r   r   r   r
   	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/t5/_encoder.pyr   +   s(   


zT5Encoder.__init__tokensc                 C   s^   |j \}}|| jkrtd| d| j d| |}|  }| jD ]}|||}q"| |S )a  
        Args:
            tokens (Tensor): input tensor with shape ``[bsz, max_seq_len]``

        Returns:
            Tensor: output tensor with shape [bsz, max_seq_len, embed_dim]

        Raises:
            ValueError: if seq_len of tokens is bigger than max_seq_len
        z	seq_len (z6) of input tensor should be smaller than max_seq_len ())shaper   
ValueErrorr   r   r   r   )r   r#   bszseq_lenxrel_pos_biaslayerr!   r!   r"   forwardN   s   




zT5Encoder.forward)__name__
__module____qualname____doc__r   	Embeddingr   r   r   r   intr   r   r   r,   __classcell__r!   r!   r   r"   r	      s.    #	
#r	   c                
       sR   e Zd ZdZdedejdejdejddf
 fdd	Zd
ededefddZ	  Z
S )T5EncoderLayeray  
    Single layer of the T5 encoder (standard transformer layer with relative position bias).

    Args:
        attn (MultiHeadAttention): Attention module.
        mlp (nn.Module): Feed-forward module.
        sa_norm (nn.Module): Normalization to be applied before self-attention.
        mlp_norm (nn.Module): Normalization to be applied before the feed-forward layer.
    attnmlpsa_normmlp_normr   Nc                    s&   t    || _|| _|| _|| _d S )N)r   r   r5   r6   r7   r8   )r   r5   r6   r7   r8   r   r!   r"   r   y   s
   

zT5EncoderLayer.__init__r)   r*   c                 C   s.   ||  | || }|| | | }|S )  
        Args:
            x (Tensor): input tensor with shape [bsz, seq_len, embed_dim]
            rel_pos_bias (Tensor): relative position bias with shape [1, num_heads, max_seq_len, max_seq_len]
                See: :class:`~torchtune.models.t5._encoder.T5EncoderRelativePositionBias`

        Returns:
            Tensor: output tensor with shape [bsz, seq_len, embed_dim]
        )r5   r7   r6   r8   )r   r)   r*   r!   r!   r"   r,      s   
zT5EncoderLayer.forward)r-   r.   r/   r0   r   r   r   r   r   r,   r3   r!   r!   r   r"   r4   n   s    
r4   c                       s\   e Zd ZdZdedededejdejdejdejf fd	d
ZdededefddZ	  Z
S )T5EncoderSelfAttentionaR  
    Self-attention for the T5 encoder.

    Standard self-attention with two differences:
        - No scaling factor
        - Add "relative position bias" to the attention scores.
            (See: :class:`~torchtune.models.t5._encoder.T5EncoderRelativePositionBias`)

    Args:
        embed_dim (int): The model dimension.
        num_heads (int): Number of attention heads.
        head_dim (int): Dimension of the attention heads (should equal `embed_dim // num_heads`)
        q_proj (nn.Module): Projection layer for query.
        k_proj (nn.Module): Projection layer for key.
        v_proj (nn.Module): Projection layer for value.
        output_proj (nn.Module): Projection layer for output.

    Raises:
        ValueError:
            If ``embed_dim % num_heads != 0``, **or**
            if ``embed_dim // num_heads != head_dim``
    	embed_dimr   head_dimq_projk_projv_projoutput_projc                    sp   t    || dkrtd| d| d|| |kr$td| d|| _|| _|| _|| _|| _|| _d S )Nr   zembed_dim (z") must be divisible by num_heads (r$   z
head_dim (z)) must be equal to embed_dim // num_heads)	r   r   r&   r   r<   r=   r>   r?   r@   )r   r;   r   r<   r=   r>   r?   r@   r   r!   r"   r      s"   



zT5EncoderSelfAttention.__init__r)   r*   r   c                 C   s   |j \}}}| |}| |}| |}|||| j| jdd}|||| j| jdd}|||| j| jdd}t	||dd}	|	|7 }	t
j|	 dd|	j}
t	|
|}|dd|||}| |S )r9         )dim)r%   r=   r>   r?   viewr   r<   	transposetorchmatmulFsoftmaxfloattodtypereshaper@   )r   r)   r*   r'   r(   r;   qkv
attn_scoreattn_weightattn_outr!   r!   r"   r,      s   




zT5EncoderSelfAttention.forward)r-   r.   r/   r0   r2   r   r   r   r   r,   r3   r!   r!   r   r"   r:      s$    r:   c                       s@   e Zd ZdZdedededef fddZdefd	d
Z  ZS )r   a-  
    Computes binned birectional relative position bias for the T5 encoder.

    It places relative positions into buckets and for each bucket, learns bias values for each attention head.

    Args:
        num_buckets (int): Number of discrete buckets to divide the relative positions into.
        max_dist (int): Maximum distance for relative positions (distances beyond this are grouped into the last bucket)
        num_heads (int): Number of attention heads in the transformer.
        max_seq_len (int): Maximum sequence length (context length).
    r   r   r   r   c                    s:   t    || _t||| _| jdt|||dd d S )Nrelative_position_to_bucketF)
persistent)r   r   r   r   r1   	embeddingregister_buffer#_calc_birectional_rel_pos_to_bucket)r   r   r   r   r   r   r!   r"   r      s   


z&T5EncoderRelativePositionBias.__init__r   c                 C   s    |  | j}|g ddS )z
        Returns:
            torch.Tensor: relative position bias tensor with shape [1, num_heads, max_seq_len, max_seq_len]
        )rB   r   rA   r   )rX   rV   permute	unsqueeze)r   r)   r!   r!   r"   r,   	  s   z%T5EncoderRelativePositionBias.forward)	r-   r.   r/   r0   r2   r   r   r,   r3   r!   r!   r   r"   r      s    r   r   r   r   r   c                 C   s   t j|t jddddf }t j|t jddddf }|| }t |}| d }|d }||k }	|t | | t||  ||  t j }
t |
t 	|
|d }
|dkt j| t 
|	||
 }|S )a  
    Calculate the mapping from relative positions to bucket indices.

    NOTE: This is for the T5 encoder (birectional), not the decoder (unidirectional).

    Args:
        num_buckets (int): Number of discrete buckets to divide the relative positions into.
        max_dist (int): Maximum distance for relative positions (distances beyond this are grouped into the last bucket)
        max_seq_len (int): Maximum sequence length (context length).

    Returns:
        Tensor: shape=[max_seq_len, max_seq_len], range=[0, num_buckets]
    )rN   NrB   rA   r   )rH   arangelongabslogrL   mathrM   min	full_likewhere)r   r   r   query_positionskey_positionsrelative_positionsabs_relative_positionshalf_num_buckets	max_exactis_exactrelative_position_if_not_exactrV   r!   r!   r"   rZ     s:   
rZ   )ra   typingr   r   r   rH   torch.nn.functionalr   
functionalrJ   r   torchtune.modulesr   torchtune.modules.transformerr   r   r	   r4   r:   r   r2   rZ   r!   r!   r!   r"   <module>   s(   ]'W)