o
    ei                     @   s\  d Z ddlZddlmZ ddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z& e"'e(Z)ee ddG dd deZ*ee ddG dd deZ+ee G dd deZ,dej-dej-fddZ.dej-dej-fd d!Z/d"e&d#e0fd$d%Z1dbd'e0e2B d(e3fd)d*Z4G d+d, d,e	j5Z6G d-d. d.e	j7Z8G d/d0 d0e	j5Z9G d1d2 d2e	j5Z:G d3d4 d4e	j5Z;G d5d6 d6e	j5Z<G d7d8 d8e	j5Z=G d9d: d:e	j5Z>G d;d< d<e	j5Z?	=dcd>e	j5d?ej-d@ej-dAej-dBej-dB dCe@dDe@fdEdFZAG dGdH dHe	j5ZBG dIdJ dJe	j5ZCG dKdL dLe	j5ZDG dMdN dNe	j5ZEG dOdP dPe	j5ZFG dQdR dReZGG dSdT dTe	j5ZHG dUdV dVe	j5ZIe G dWdX dXeZJe dYdG dZd[ d[eJZKe d\dG d]d^ d^eJZLe G d_d` d`eJZMg daZNdS )dzPyTorch ALIGN model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithNoAttentionBaseModelOutputWithPooling(BaseModelOutputWithPoolingAndNoAttention)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging   )AlignConfigAlignTextConfigAlignVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                   @   sL   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ej dB ed< dS )AlignVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_statehidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   tuple r(   r(   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/align/modeling_align.pyr   +   s
   
 r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                   @   sb   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ej dB ed< dZe
ej dB ed< dS )AlignTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr   r   
attentions)r    r!   r"   r#   r+   r$   r%   r&   r   r   r'   r,   r(   r(   r(   r)   r*   <   s   
 r*   c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZejdB ed< dZeed< dZeed	< d
ee fddZdS )AlignOutputar  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The output of [`AlignVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AlignTextModel`].
    vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
        The output of the [`AlignVisionModel`].
    Nlosslogits_per_imagelogits_per_textr+   r   text_model_outputvision_model_outputreturnc                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r1   r2   N)getattrto_tuple).0kselfr(   r)   	<genexpr>m   s
    
z'AlignOutput.to_tuple.<locals>.<genexpr>)r'   keysr8   r(   r8   r)   r5   l   s   zAlignOutput.to_tuple)r    r!   r"   r#   r.   r$   r%   r&   r/   r0   r+   r   r1   r   r2   r   r'   r   r5   r(   r(   r(   r)   r-   N   s   
 r-   logitsr3   c                 C   s"   t jj| tjt| | jdddS )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr$   arangelenr>   )r<   r(   r(   r)   contrastive_lossu   s   "rD   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)rD   t)rE   caption_loss
image_lossr(   r(   r)   
align_lossy   s   rI   confignum_channelsc                 C   sJ   | j }|| j9 }t|t||d  | | }|d| k r!||7 }t|S )z<
    Round number of filters based on depth multiplier.
       g?)depth_divisorwidth_coefficientmaxint)rJ   rK   divisornew_dimr(   r(   r)   round_filters   s   
rS   Tkernel_sizeadjustc                 C   sn   t | tr	| | f} | d d | d d f}|r)|d d |d |d d |d fS |d |d |d |d fS )aJ  
    Utility function to get the tuple padding value for the depthwise convolution.

    Args:
        kernel_size (`int` or `tuple`):
            Kernel size of the convolution layers.
        adjust (`bool`, *optional*, defaults to `True`):
            Adjusts padding value to apply to right and bottom sides of the input.
    r   rL   r   )
isinstancerP   )rT   rU   correctr(   r(   r)   correct_pad   s   

$rX   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	AlignVisionEmbeddingszL
    A module that corresponds to the stem module of the original work.
    rJ   c                    sh   t    t|d| _tjdd| _tj|j| jddddd| _	tj
| j|j|jd	| _t|j | _d S )
N    )r   r   r   r   paddingr   rL   validFrT   strider\   bias)epsmomentum)super__init__rS   out_dimr   	ZeroPad2dr\   Conv2drK   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr   
hidden_act
activationr9   rJ   	__class__r(   r)   rd      s   
zAlignVisionEmbeddings.__init__pixel_valuesr3   c                 C   s,   |  |}| |}| |}| |}|S N)r\   rh   rl   rn   )r9   rr   featuresr(   r(   r)   forward   s
   



zAlignVisionEmbeddings.forward)
r    r!   r"   r#   r   rd   r$   Tensorru   __classcell__r(   r(   rp   r)   rY      s    rY   c                       s,   e Zd Z							d fdd	Z  ZS )	AlignVisionDepthwiseConv2dr   r   r   Tzerosc	           
         s*   || }	t  j||	|||||||d	 d S )N)	in_channelsout_channelsrT   r_   r\   dilationgroupsr`   padding_mode)rc   rd   )
r9   rz   depth_multiplierrT   r_   r\   r|   r`   r~   r{   rp   r(   r)   rd      s   
z#AlignVisionDepthwiseConv2d.__init__)r   r   r   r   r   Try   )r    r!   r"   rd   rw   r(   r(   rp   r)   rx      s    rx   c                       sH   e Zd ZdZdedededef fddZdejd	ej	fd
dZ
  ZS )AlignVisionExpansionLayerz_
    This corresponds to the expansion phase of each block in the original implementation.
    rJ   in_dimre   r_   c                    sB   t    tj||dddd| _tj||jd| _t|j	 | _
d S )Nr   sameFrz   r{   rT   r\   r`   )num_featuresra   )rc   rd   r   rg   expand_convri   rj   	expand_bnr   rm   
expand_act)r9   rJ   r   re   r_   rp   r(   r)   rd      s   
z"AlignVisionExpansionLayer.__init__r   r3   c                 C   s"   |  |}| |}| |}|S rs   )r   r   r   r9   r   r(   r(   r)   ru      s   


z!AlignVisionExpansionLayer.forward)r    r!   r"   r#   r   rP   rd   r$   r%   rv   ru   rw   r(   r(   rp   r)   r      s    r   c                
       sL   e Zd ZdZdededededef
 fddZd	ej	d
ej
fddZ  ZS )AlignVisionDepthwiseLayerzk
    This corresponds to the depthwise convolution phase of each block in the original implementation.
    rJ   r   r_   rT   adjust_paddingc                    sv   t    || _| jdkrdnd}t||d}tj|d| _t||||dd| _tj	||j
|jd| _t|j | _d S )	NrL   r]   r   )rU   r[   Fr^   r   ra   rb   )rc   rd   r_   rX   r   rf   depthwise_conv_padrx   depthwise_convri   rj   rk   depthwise_normr   rm   depthwise_act)r9   rJ   r   r_   rT   r   conv_padr\   rp   r(   r)   rd      s   


z"AlignVisionDepthwiseLayer.__init__r   r3   c                 C   s6   | j dkr
| |}| |}| |}| |}|S )NrL   )r_   r   r   r   r   r   r(   r(   r)   ru     s   




z!AlignVisionDepthwiseLayer.forwardr    r!   r"   r#   r   rP   boolrd   r$   r%   rv   ru   rw   r(   r(   rp   r)   r      s    r   c                	       sJ   e Zd ZdZddedededef fddZd	ej	d
ej
fddZ  ZS )AlignVisionSqueezeExciteLayerzl
    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
    FrJ   r   
expand_dimexpandc                    s   t    |r	|n|| _tdt||j | _tjdd| _	tj
| j| jddd| _tj
| j| jddd| _t|j | _t | _d S )Nr   )output_sizer   )rz   r{   rT   r\   )rc   rd   dimrO   rP   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezerg   reducer   r   rm   
act_reduceSigmoid
act_expand)r9   rJ   r   r   r   rp   r(   r)   rd   "  s$   
z&AlignVisionSqueezeExciteLayer.__init__r   r3   c                 C   sF   |}|  |}| |}| |}| |}| |}t||}|S rs   )r   r   r   r   r   r$   mul)r9   r   inputsr(   r(   r)   ru   7  s   




z%AlignVisionSqueezeExciteLayer.forward)Fr   r(   r(   rp   r)   r     s     r   c                       sV   e Zd ZdZdedededededef fdd	Zd
e	j
de	j
de	jfddZ  ZS )AlignVisionFinalBlockLayerz[
    This corresponds to the final phase of each block in the original implementation.
    rJ   r   re   r_   	drop_rateid_skipc                    sX   t    |dko| | _tj||dddd| _tj||j|jd| _	tj
|d| _d S )Nr   r   Fr   r   )p)rc   rd   apply_dropoutr   rg   project_convri   rj   rk   
project_bnDropoutdropout)r9   rJ   r   re   r_   r   r   rp   r(   r)   rd   I  s   

z#AlignVisionFinalBlockLayer.__init__
embeddingsr   r3   c                 C   s0   |  |}| |}| jr| |}|| }|S rs   )r   r   r   r   )r9   r   r   r(   r(   r)   ru   Z  s   


z"AlignVisionFinalBlockLayer.forwardr    r!   r"   r#   r   rP   floatr   rd   r$   r%   rv   ru   rw   r(   r(   rp   r)   r   D  s     $r   c                       s\   e Zd ZdZdededededededed	ed
ef fddZde	j
de	jfddZ  ZS )AlignVisionBlocka  
    This corresponds to the block module of original the EfficientNet vision encoder implementation.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
        in_dim (`int`):
            Number of input channels.
        out_dim (`int`):
            Number of output channels.
        stride (`int`):
            Stride size to be used in convolution layers.
        expand_ratio (`int`):
            Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
        kernel_size (`int`):
            Kernel size for the depthwise convolution layer.
        drop_rate (`float`):
            Dropout rate to be used in the final phase of each block.
        id_skip (`bool`):
            Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
            of each block. Set to `True` for the first block of each stage.
        adjust_padding (`bool`):
            Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
            operation, set to `True` for inputs with odd input sizes.
    rJ   r   re   r_   expand_ratiorT   r   r   r   c
                    s   t    || _| jdk| _|| }
| jrt|||
|d| _t|| jr%|
n||||	d| _t|||
| jd| _	t
|| jr>|
n|||||d| _d S )Nr   )rJ   r   re   r_   )rJ   r   r_   rT   r   )rJ   r   r   r   )rJ   r   re   r_   r   r   )rc   rd   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)r9   rJ   r   re   r_   r   rT   r   r   r   expand_in_dimrp   r(   r)   rd     s4   

zAlignVisionBlock.__init__r   r3   c                 C   s<   |}| j dkr| |}| |}| |}| ||}|S Nr   )r   r   r   r   r   )r9   r   r   r(   r(   r)   ru     s   



zAlignVisionBlock.forwardr   r(   r(   rp   r)   r   e  s,    	
)r   c                	       sP   e Zd ZdZdef fddZ		ddejded	B d
ed	B de	fddZ
  ZS )AlignVisionEncoderz
    Forward propagates the embeddings through each vision encoder (EfficientNet) block.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
    rJ   c                    s(  t    |j_fdd t|j}t fdd|jD }d}g }t|D ]c}t||j| }t||j	| }|j
| }	|j| }
|j| }t |j| D ]8}|dk}|dkr^dn|	}	|dkrf|n|}||jv}|j| | }t||||	|
||||d	}|| |d7 }qRq(t|_d S )Nc                    s   t t j|  S rs   )rP   mathceildepth_coefficient)repeatsr8   r(   r)   round_repeats  s   z2AlignVisionEncoder.__init__.<locals>.round_repeatsc                 3   s    | ]} |V  qd S rs   r(   )r6   n)r   r(   r)   r:     s    z.AlignVisionEncoder.__init__.<locals>.<genexpr>r   r   )	rJ   r   re   r_   rT   r   r   r   r   )rc   rd   r   rC   rz   sumnum_block_repeatsrangerS   r{   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)r9   rJ   num_base_blocks
num_blockscurr_block_numr   ir   re   r_   rT   r   jr   r   r   blockrp   )r   r9   r)   rd     sD   







zAlignVisionEncoder.__init__FTr   output_hidden_statesNreturn_dictr3   c                 C   sV   |r|fnd }| j D ]}||}|r||f7 }q
|s%tdd ||fD S t||dS )Nc                 s   s    | ]	}|d ur|V  qd S rs   r(   )r6   vr(   r(   r)   r:     s    z-AlignVisionEncoder.forward.<locals>.<genexpr>)r   r   )r   r'   r   )r9   r   r   r   all_hidden_statesr   r(   r(   r)   ru     s   

zAlignVisionEncoder.forward)FT)r    r!   r"   r#   r   rd   r$   r%   r   r   ru   rw   r(   r(   rp   r)   r     s    .r   c                       sb   e Zd ZdZ fddZ				ddejdB dejdB dejdB dejdB d	ejf
d
dZ	  Z
S )AlignTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd | jdtj| j tjddd d S )	N)padding_idxra   position_idsr   F)
persistenttoken_type_ids)dtype)rc   rd   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   register_bufferr$   rB   r   ry   r   sizelongro   rp   r(   r)   rd     s   

zAlignTextEmbeddings.__init__N	input_idsr   r   inputs_embedsr3   c                 C   s   |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u rNt| drC| jd d d |f }||d |}|}ntj|tj| jjd}|d u rW| 	|}| 
|}	||	 }
| |}|
|7 }
| |
}
| |
}
|
S )Nr   r   r   r   r   r>   )r   r   hasattrr   r   r$   ry   r   r>   r   r   r   r   r   )r9   r   r   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   r   r   r(   r(   r)   ru     s*   






zAlignTextEmbeddings.forward)NNNN)r    r!   r"   r#   rd   r$   
LongTensorr%   rv   ru   rw   r(   r(   rp   r)   r      s$    r           modulequerykeyvalueattention_maskscalingr   c           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )NrL   r   r   )r   r   )r   trainingr   )r$   matmul	transposer   r@   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr(   r(   r)   eager_attention_forward<  s   
r  c                       V   e Zd Z fddZ		ddejdejdB dedB dee	 d	e
ej f
d
dZ  ZS )AlignTextSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	t
|j| j	| _t
|j| j	| _t
|j| j	| _t
|j| _|j| _| jd | _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )rc   rd   r   num_attention_headsr   
ValueErrorrJ   rP   attention_head_sizeall_head_sizer   Linearr   r   r   r   attention_probs_dropout_probr   attention_dropoutr   ro   rp   r(   r)   rd   S  s"   

zAlignTextSelfAttention.__init__NFr   r   output_attentionsr   r3   c                 K   s   |j d d }g |d| jR }| ||dd}| ||dd}| ||dd}	t| j	j
t}
|
| |||	|f| jsIdn| j| jd|\}}|jg |dR   }|ri||f}|S |f}|S )Nr   r   rL   r   )r   r   )shaper
  r   viewr   r   r   r   get_interfacerJ   _attn_implementationr  r   r  r   reshaper   )r9   r   r   r  r   r   hidden_shapequery_states
key_statesvalue_statesattention_interfacer  r  outputsr(   r(   r)   ru   h  s2   
zAlignTextSelfAttention.forwardNFr    r!   r"   rd   r$   rv   r%   r   r   r   r'   ru   rw   r(   r(   rp   r)   r  R  s    r  c                       8   e Zd Z fddZdejdejdejfddZ  ZS )AlignTextSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )rc   rd   r   r  r   denser   r   r   r   r   ro   rp   r(   r)   rd        
zAlignTextSelfOutput.__init__r   input_tensorr3   c                 C   &   |  |}| |}| || }|S rs   r   r   r   r9   r   r"  r(   r(   r)   ru        

zAlignTextSelfOutput.forwardr    r!   r"   rd   r$   rv   ru   rw   r(   r(   rp   r)   r        $r  c                       r  )AlignTextAttentionc                    s"   t    t|| _t|| _d S rs   )rc   rd   r  r9   r  outputro   rp   r(   r)   rd     s   

zAlignTextAttention.__init__NFr   r   r  r   r3   c                 K   s>   | j |f||d|}| |d |}|f|dd   }|S N)r   r  r   r   )r9   r*  )r9   r   r   r  r   self_outputsattention_outputr  r(   r(   r)   ru     s   zAlignTextAttention.forwardr  r  r(   r(   rp   r)   r)    s    r)  c                       2   e Zd Z fddZdejdejfddZ  ZS )AlignTextIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S rs   )rc   rd   r   r  r   intermediate_sizer   rV   rm   strr   intermediate_act_fnro   rp   r(   r)   rd     s
   
zAlignTextIntermediate.__init__r   r3   c                 C   s   |  |}| |}|S rs   )r   r2  r   r(   r(   r)   ru     s   

zAlignTextIntermediate.forwardr'  r(   r(   rp   r)   r/    s    r/  c                       r  )AlignTextOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r  )rc   rd   r   r  r0  r   r   r   r   r   r   r   ro   rp   r(   r)   rd     r!  zAlignTextOutput.__init__r   r"  r3   c                 C   r#  rs   r$  r%  r(   r(   r)   ru     r&  zAlignTextOutput.forwardr'  r(   r(   rp   r)   r3    r(  r3  c                       s^   e Zd Z fddZ		ddejdejdB dedB dee	 d	e
ej f
d
dZdd Z  ZS )AlignTextLayerc                    s:   t    |j| _d| _t|| _t|| _t|| _	d S r   )
rc   rd   chunk_size_feed_forwardseq_len_dimr)  	attentionr/  intermediater3  r*  ro   rp   r(   r)   rd     s   


zAlignTextLayer.__init__NFr   r   r  r   r3   c           	      K   sN   | j |f||d|}|d }|dd  }t| j| j| j|}|f| }|S r+  )r7  r   feed_forward_chunkr5  r6  )	r9   r   r   r  r   self_attention_outputsr-  r  layer_outputr(   r(   r)   ru     s   
zAlignTextLayer.forwardc                 C   s   |  |}| ||}|S rs   )r8  r*  )r9   r-  intermediate_outputr;  r(   r(   r)   r9    s   
z!AlignTextLayer.feed_forward_chunkr  )r    r!   r"   rd   r$   rv   r%   r   r   r   r'   ru   r9  rw   r(   r(   rp   r)   r4    s     
r4  c                       sr   e Zd Z fddZe				ddejdejdB dedB d	edB d
edB de	e
 deej eB fddZ  ZS )AlignTextEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r(   )r4  )r6   r   rJ   r(   r)   
<listcomp>  s    z-AlignTextEncoder.__init__.<locals>.<listcomp>F)	rc   rd   rJ   r   r   r   num_hidden_layerslayergradient_checkpointingro   rp   r>  r)   rd     s   
 
zAlignTextEncoder.__init__NFTr   r   r  r   r   r   r3   c                 K   s   |rdnd }|r
dnd }t | jD ]"\}	}
|r||f }|
|||fi |}|d }|r3||d f }q|r;||f }t|||dS )Nr(   r   r   )r   r   r,   )	enumeraterA  r
   )r9   r   r   r  r   r   r   r   all_self_attentionsr   layer_modulelayer_outputsr(   r(   r)   ru     s.   


zAlignTextEncoder.forward)NFFT)r    r!   r"   rd   r   r$   rv   r%   r   r   r   r'   r
   ru   rw   r(   r(   rp   r)   r=    s,    r=  c                       r.  )AlignTextPoolerc                    s*   t    t|j|j| _t | _d S rs   )rc   rd   r   r  r   r   Tanhrn   ro   rp   r(   r)   rd   %  s   
zAlignTextPooler.__init__r   r3   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   rn   )r9   r   first_token_tensorpooled_outputr(   r(   r)   ru   *  s   

zAlignTextPooler.forwardr'  r(   r(   rp   r)   rG  $  s    rG  c                   @   s:   e Zd ZU eed< dZdZdZe	 de
jfddZdS )	AlignPreTrainedModelrJ   align)imagetextTr   c                 C   sl  | j j}t|tjtjfr"tj|jd|d |j	dur!t
|j	 nAt|tr?t|jj t
|jj	 t|j| j j n$t|tjrctj|jd|d |jdurct|jddsct
|j|j  t|tjtjfrt
|j	 t|j t|dddurt
|j t|j t
|j dS dS t|trt|jt|jjd  d t
|j! dS dS )	zInitialize the weightsr   )meanstdN_is_hf_initializedFrunning_meanr   r   )"rJ   initializer_rangerV   r   r  rg   initnormal_weightr`   zeros_
AlignModelxavier_uniform_text_projection	constant_temperaturetemperature_init_valuer   r   r4   r   ri   ones_rR  running_varnum_batches_trackedr   copy_r   r$   rB   r  r   r   )r9   r   rP  r(   r(   r)   _init_weights:  s4   


"z"AlignPreTrainedModel._init_weightsN)r    r!   r"   r   r&   base_model_prefixinput_modalitiessupports_gradient_checkpointingr$   no_gradr   Modulerb  r(   r(   r(   r)   rK  3  s   
 rK  zJ
    The text model from ALIGN without any head or projection on top.
    c                       s   e Zd ZU eed< dZdgZddedef fddZdd	 Z	d
d Z
ee								ddejdB dejdB dejdB dejdB dejdB dedB dedB dedB dee deeB fddZ  ZS )AlignTextModelrJ   )rN  r   Tadd_pooling_layerc                    sD   t  | || _t|| _t|| _|rt|nd| _| 	  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rc   rd   rJ   r   r   r=  encoderrG  pooler	post_init)r9   rJ   ri  rp   r(   r)   rd   a  s   

zAlignTextModel.__init__c                 C   s   | j jS rs   r   r   r8   r(   r(   r)   get_input_embeddingsq  s   z#AlignTextModel.get_input_embeddingsc                 C   s   || j _d S rs   rm  )r9   r   r(   r(   r)   set_input_embeddingst  s   z#AlignTextModel.set_input_embeddingsNr   r   r   r   r   r  r   r   r   r3   c	                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur*|dur*td|dur9| || | }
n|durF| dd }
ntd|
\}}|durU|jn|j}|du retj	||f|d}|du rt
| jdr| jjddd|f }|||}|}n	tj|
tj|d}| ||
}| j||||d}| j|f|||d	d
|	}|d }| jdur| |nd}t|||j|jdS )a-  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignTextModel

        >>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr=   r   r   )r   r   r   r   T)r   r  r   r   r   )r   pooler_outputr   r,   )rJ   r  r   use_return_dictr	  %warn_if_padding_and_no_attention_maskr   r>   r$   onesr   r   r   r   ry   r   get_extended_attention_maskrj  rk  r   r   r,   )r9   r   r   r   r   r   r  r   r   r   r   
batch_sizer   r>   r   r   extended_attention_maskembedding_outputencoder_outputssequence_outputrJ  r(   r(   r)   ru   w  s^   
zAlignTextModel.forwardT)NNNNNNNN)r    r!   r"   r   r&   rd  _no_split_modulesr   rd   rn  ro  r   r   r$   rv   r   r   r'   r   ru   rw   r(   r(   rp   r)   rh  W  sL   
 	
rh  zL
    The vision model from ALIGN without any head or projection on top.
    c                       s   e Zd ZU eed< dZdZdZdZdgZ	def fddZ
ee						ddejd	B d
ed	B ded	B dee deeB f
ddZ  ZS )AlignVisionModelrJ   rr   )rM  Frh   r   c                    s~   t  | || _t|| _t|| _|jdkr"tj	|j
dd| _n|jdkr1tj|j
dd| _ntd|j |   d S )NrO  T)	ceil_moderO   z2config.pooling must be one of ['mean', 'max'] got )rc   rd   rJ   rY   r   r   rj  pooling_typer   	AvgPool2d
hidden_dimrk  	MaxPool2dr	  poolingrl  ro   rp   r(   r)   rd     s   



zAlignVisionModel.__init__Nr   r   r   r3   c           	      K   s   |dur|n| j j}|dur|n| j j}|du rtd| |}| j||dd}|d }| |}||jdd }t	|||j
dS )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AlignVisionModel

        >>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```Nz You have to specify pixel_valuesT)r   r   r   rL   )r   rp  r   )rJ   r   rq  r	  r   rj  rk  r  r  r   r   )	r9   rr   r   r   r   rw  rx  r   rJ  r(   r(   r)   ru     s&    

zAlignVisionModel.forward)NNN)r    r!   r"   r   r&   main_input_namerd  re  _input_embed_layerr{  rd   r   r   r$   r%   r   r   r   r'   r   ru   rw   r(   r(   rp   r)   r|    s0   
 r|  c                       sH  e Zd ZU eed< def fddZee					ddej	dB dej	dB dej	dB dej	dB d	ej	dB d
e
e deeB fddZeedejd
e
e deeB fddZee										ddejdB dejdB dej	dB dej	dB dej	dB d	ej	dB dedB dedB dedB dedB d
e
e deeB fddZ  ZS )rX  rJ   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	| _	|j
| _t|| _t|| _t| j| j	| _tt| jj| _|   d S )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )rc   rd   rV   text_configr   	TypeErrortypevision_configr   projection_dimr   text_embed_dimrh  
text_modelr|  vision_modelr   r  rZ  	Parameterr$   tensorrJ   r]  r\  rl  )r9   rJ   r  r  rp   r(   r)   rd   +  s,   

zAlignModel.__init__Nr   r   r   r   r   r   r3   c           	   	   K   sH   | j d|||||dd|}|d dddddf }| ||_|S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r   r   r   r   r   r   r   Nr(   )r  rZ  rp  )	r9   r   r   r   r   r   r   text_outputsr   r(   r(   r)   get_text_featuresI  s   	zAlignModel.get_text_featuresrr   c                 K   s   | j dd|i|S )a}  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AlignModel
        >>> from transformers.image_utils import load_image

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```rr   Nr(   )r  )r9   rr   r   r(   r(   r)   get_image_featuresp  s   zAlignModel.get_image_featuresreturn_lossr  r   r   c              
   K   s   |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
| j||	dd}| j|||||||	dd}|d }|d dddddf }| |}||jdddd	 }||jdddd	 }t	||
 | j }|
 }d}|rtt|}t|||||||d
S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AlignModel
        >>> from transformers.image_utils import load_image

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NT)rr   r   r   )r   r   r   r   r   r  r   r   r   r   rL   r   )r   r   keepdim)r.   r/   r0   r+   r   r1   r2   )rJ   r  r   rq  r  r  rZ  normr$   r   rF   r\  rI   r-   )r9   r   rr   r   r   r   r   r  r  r   r   r   vision_outputsr  r   r+   r0   r/   r.   r(   r(   r)   ru     sL   +
zAlignModel.forward)NNNNN)
NNNNNNNNNN)r    r!   r"   r   r&   rd   r   r   r$   rv   r   r   r'   r   r  r%   r  r   r   r-   ru   rw   r(   r(   rp   r)   rX  '  s   
 %	
rX  )rK  rh  r|  rX  rz  )r   )Or#   r   collections.abcr   dataclassesr   typingr   r$   r    r   rT  activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   configuration_alignr   r   r   
get_loggerr    loggerr   r*   r-   rv   rD   rI   rP   rS   r'   r   rX   rg  rY   rg   rx   r   r   r   r   r   r   r   r   r  r  r  r)  r/  r3  r4  r=  rG  rK  rh  r|  rX  __all__r(   r(   r(   r)   <module>   s   
%('!QJC
9&.#sS ?