o
    ̳i                     @   s`   d dl Z d dlmZ d dlZd dlmZmZ d dlmZ G dd dejZ	G dd dejZ
dS )	    N)Optional)nnTensor)	_MaskTypec                       sp   e Zd ZdZdddddddejd	ejd
ededededef fddZdddede	e
 defddZ  ZS )CLIPTextEncodera  
    Text encoder for CLIP.

    CLIP is a model that encodes text and images into a shared vector space.
    Blog post: https://openai.com/index/clip/
    Paper: https://arxiv.org/abs/2103.00020

    Args:
        layer (nn.Module): A single encoder layer.
        final_norm (nn.Module): Callable that applies normalization to the output of the encoder
        vocab_size (int): size of the vocabulary, default 49408
        max_seq_len (int): context size, default 77
        embed_dim (int): embedding/model dimension size, default 768
        num_layers (int): number of transformer layers, default 12
        eot_token (int): the id of the end-of-text token (for selecting the final output)
    i   M   i      i  )
vocab_sizemax_seq_len	embed_dim
num_layers	eot_tokenlayer
final_normr	   r
   r   r   r   c                   s`   t    t fddt|D | _|| _|| _|| _t	||| _
tt||| _d S )Nc                    s   g | ]}t  qS  )copydeepcopy).0ir   r   W/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/clip/_text_encoder.py
<listcomp>.   s    z,CLIPTextEncoder.__init__.<locals>.<listcomp>)super__init__r   
ModuleListrangelayersr   r
   r   	Embeddingtoken_embedding	Parametertorchemptyposition_embedding)selfr   r   r	   r
   r   r   r   	__class__r   r   r   "   s   
zCLIPTextEncoder.__init__Nmasktokensr'   returnc                C   s   |j \}}|| jkrtd| d| j d| || j }| jD ]}|||d}q!| |}|| jk j	dd}|j
|dddddjdd}|S )a!  
        Args:
            tokens (Tensor): input tensor with shape ``[b x s]``
            mask (Optional[_MaskType]): Used to mask the scores after the query-key multiplication
                and before the softmax.
                Default is None.

        Returns:
            Tensor: output tensor with shape [b x d]

        Raises:
            ValueError: if seq_len of tokens is bigger than max_seq_len

        Shape notation:
            - b: batch size
            - s: token sequence length
            - d: token embed dim
        z	seq_len (z6) of input tensor should be smaller than max_seq_len ()r&   )dim   )shaper
   
ValueErrorr   r"   r   r   r   intargmaxtake_along_dimviewsqueeze)r#   r(   r'   bszseq_lenxr   eos_token_positionsr   r   r   forward6   s"   



 zCLIPTextEncoder.forward)__name__
__module____qualname____doc__r   Moduler0   r   r   r   r   r9   __classcell__r   r   r$   r   r      s<    	r   c                   @   s&   e Zd ZdZdejdejfddZdS )	QuickGELUz%
    Fast approximation of GELU.
    r7   r)   c                 C   s   |t d|  S )NgZd;?)r    sigmoid)r#   r7   r   r   r   r9   n   s   zQuickGELU.forwardN)r:   r;   r<   r=   r    r   r9   r   r   r   r   r@   i   s    r@   )r   typingr   r    r   r   !torchtune.modules.attention_utilsr   r>   r   r@   r   r   r   r   <module>   s   Y