o
    ۷i	                     @   s\  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZmZ ddlmZmZmZ ddlmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$ e %e&Z'eeddG dd deZ(eeddG dd deZ)eeG dd deZ*de	j+de	j+fddZ,de	j+de	j+fddZ-de$d e.fd!d"Z/d`d$ee.e0f d%e1fd&d'Z2G d(d) d)e
j3Z4G d*d+ d+e
j5Z6G d,d- d-e
j3Z7G d.d/ d/e
j3Z8G d0d1 d1e
j3Z9G d2d3 d3e
j3Z:G d4d5 d5e
j3Z;G d6d7 d7e
j3Z<G d8d9 d9e
j3Z=	:	dad;e
j3d<e	j+d=e	j+d>e	j+d?ee	j+ d@e>dAe>dBee	j+ fdCdDZ?G dEdF dFe
j3Z@G dGdH dHe
j3ZAG dIdJ dJe
j3ZBG dKdL dLe
j3ZCG dMdN dNe
j3ZDG dOdP dPeZEG dQdR dRe
j3ZFG dSdT dTe
j3ZGeG dUdV dVeZHedWdG dXdY dYeHZIedZdG d[d\ d\eHZJeG d]d^ d^eHZKg d_ZLdS )bzPyTorch ALIGN model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithNoAttentionBaseModelOutputWithPooling(BaseModelOutputWithPoolingAndNoAttention)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringcan_return_tuplefilter_out_non_signature_kwargslogging   )AlignConfigAlignTextConfigAlignVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                   @   sL   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dS )AlignVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_statehidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r    r!   tuple r*   r*   ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/align/modeling_align.pyr   )   s
   
 r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                   @   sb   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )AlignTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr    r!   
attentions)r"   r#   r$   r%   r-   r   r&   r'   r(   r    r!   r)   r.   r*   r*   r*   r+   r,   :   s   
 r,   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeed< dZeed	< d
ee fddZdS )AlignOutputar  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The output of [`AlignVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AlignTextModel`].
    vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
        The output of the [`AlignVisionModel`].
    Nlosslogits_per_imagelogits_per_textr-   r   text_model_outputvision_model_outputreturnc                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r3   r4   N)getattrto_tuple).0kselfr*   r+   	<genexpr>k   s
    
z'AlignOutput.to_tuple.<locals>.<genexpr>)r)   keysr:   r*   r:   r+   r7   j   s   zAlignOutput.to_tuple)r"   r#   r$   r%   r0   r   r&   r'   r(   r1   r2   r-   r   r3   r   r4   r   r)   r   r7   r*   r*   r*   r+   r/   L   s   
 r/   logitsr5   c                 C   s"   t jj| tjt| | jdddS )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr&   arangelenr@   )r>   r*   r*   r+   contrastive_losss   s   "rF   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)rF   t)rG   caption_loss
image_lossr*   r*   r+   
align_lossw   s   rK   confignum_channelsc                 C   sJ   | j }|| j9 }t|t||d  | | }|d| k r!||7 }t|S )z<
    Round number of filters based on depth multiplier.
       g?)depth_divisorwidth_coefficientmaxint)rL   rM   divisornew_dimr*   r*   r+   round_filters~   s   
rU   Tkernel_sizeadjustc                 C   sn   t | tr	| | f} | d d | d d f}|r)|d d |d |d d |d fS |d |d |d |d fS )aJ  
    Utility function to get the tuple padding value for the depthwise convolution.

    Args:
        kernel_size (`int` or `tuple`):
            Kernel size of the convolution layers.
        adjust (`bool`, *optional*, defaults to `True`):
            Adjusts padding value to apply to right and bottom sides of the input.
    r   rN   r   )
isinstancerR   )rV   rW   correctr*   r*   r+   correct_pad   s   

$rZ   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	AlignVisionEmbeddingszL
    A module that corresponds to the stem module of the original work.
    rL   c                    sh   t    t|d| _tjdd| _tj|j| jddddd| _	tj
| j|j|jd	| _t|j | _d S )
N    )r   r   r   r   paddingr   rN   validFrV   strider^   bias)epsmomentum)super__init__rU   out_dimr   	ZeroPad2dr^   Conv2drM   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr	   
hidden_act
activationr;   rL   	__class__r*   r+   rf      s   
zAlignVisionEmbeddings.__init__pixel_valuesr5   c                 C   s,   |  |}| |}| |}| |}|S N)r^   rj   rn   rp   )r;   rt   featuresr*   r*   r+   forward   s
   



zAlignVisionEmbeddings.forward)
r"   r#   r$   r%   r   rf   r&   Tensorrw   __classcell__r*   r*   rr   r+   r[      s    r[   c                       s,   e Zd Z							d fdd	Z  ZS )	AlignVisionDepthwiseConv2dr   r   r   Tzerosc	           
         s*   || }	t  j||	|||||||d	 d S )N)	in_channelsout_channelsrV   ra   r^   dilationgroupsrb   padding_mode)re   rf   )
r;   r|   depth_multiplierrV   ra   r^   r~   rb   r   r}   rr   r*   r+   rf      s   
z#AlignVisionDepthwiseConv2d.__init__)r   r   r   r   r   Tr{   )r"   r#   r$   rf   ry   r*   r*   rr   r+   rz      s    rz   c                       sH   e Zd ZdZdedededef fddZdejd	ej	fd
dZ
  ZS )AlignVisionExpansionLayerz_
    This corresponds to the expansion phase of each block in the original implementation.
    rL   in_dimrg   ra   c                    sB   t    tj||dddd| _tj||jd| _t|j	 | _
d S )Nr   sameFr|   r}   rV   r^   rb   )num_featuresrc   )re   rf   r   ri   expand_convrk   rl   	expand_bnr	   ro   
expand_act)r;   rL   r   rg   ra   rr   r*   r+   rf      s   
z"AlignVisionExpansionLayer.__init__r!   r5   c                 C   s"   |  |}| |}| |}|S ru   )r   r   r   r;   r!   r*   r*   r+   rw      s   


z!AlignVisionExpansionLayer.forward)r"   r#   r$   r%   r   rR   rf   r&   r'   rx   rw   ry   r*   r*   rr   r+   r      s    r   c                
       sL   e Zd ZdZdededededef
 fddZd	ej	d
ej
fddZ  ZS )AlignVisionDepthwiseLayerzk
    This corresponds to the depthwise convolution phase of each block in the original implementation.
    rL   r   ra   rV   adjust_paddingc                    sv   t    || _| jdkrdnd}t||d}tj|d| _t||||dd| _tj	||j
|jd| _t|j | _d S )	NrN   r_   r   )rW   r]   Fr`   r   rc   rd   )re   rf   ra   rZ   r   rh   depthwise_conv_padrz   depthwise_convrk   rl   rm   depthwise_normr	   ro   depthwise_act)r;   rL   r   ra   rV   r   conv_padr^   rr   r*   r+   rf      s   


z"AlignVisionDepthwiseLayer.__init__r!   r5   c                 C   s6   | j dkr
| |}| |}| |}| |}|S )NrN   )ra   r   r   r   r   r   r*   r*   r+   rw     s   




z!AlignVisionDepthwiseLayer.forwardr"   r#   r$   r%   r   rR   boolrf   r&   r'   rx   rw   ry   r*   r*   rr   r+   r      s    r   c                	       sJ   e Zd ZdZddedededef fddZd	ej	d
ej
fddZ  ZS )AlignVisionSqueezeExciteLayerzl
    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
    FrL   r   
expand_dimexpandc                    s   t    |r	|n|| _tdt||j | _tjdd| _	tj
| j| jddd| _tj
| j| jddd| _t|j | _t | _d S )Nr   )output_sizer   )r|   r}   rV   r^   )re   rf   dimrQ   rR   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezeri   reducer   r	   ro   
act_reduceSigmoid
act_expand)r;   rL   r   r   r   rr   r*   r+   rf      s$   
z&AlignVisionSqueezeExciteLayer.__init__r!   r5   c                 C   sF   |}|  |}| |}| |}| |}| |}t||}|S ru   )r   r   r   r   r   r&   mul)r;   r!   inputsr*   r*   r+   rw   5  s   




z%AlignVisionSqueezeExciteLayer.forward)Fr   r*   r*   rr   r+   r     s     r   c                       sV   e Zd ZdZdedededededef fdd	Zd
e	j
de	j
de	jfddZ  ZS )AlignVisionFinalBlockLayerz[
    This corresponds to the final phase of each block in the original implementation.
    rL   r   rg   ra   	drop_rateid_skipc                    sX   t    |dko| | _tj||dddd| _tj||j|jd| _	tj
|d| _d S )Nr   r   Fr   r   )p)re   rf   apply_dropoutr   ri   project_convrk   rl   rm   
project_bnDropoutdropout)r;   rL   r   rg   ra   r   r   rr   r*   r+   rf   G  s   

z#AlignVisionFinalBlockLayer.__init__
embeddingsr!   r5   c                 C   s0   |  |}| |}| jr| |}|| }|S ru   )r   r   r   r   )r;   r   r!   r*   r*   r+   rw   X  s   


z"AlignVisionFinalBlockLayer.forwardr"   r#   r$   r%   r   rR   floatr   rf   r&   r'   rx   rw   ry   r*   r*   rr   r+   r   B  s     $r   c                       s\   e Zd ZdZdededededededed	ed
ef fddZde	j
de	jfddZ  ZS )AlignVisionBlocka  
    This corresponds to the block module of original the EfficientNet vision encoder implementation.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
        in_dim (`int`):
            Number of input channels.
        out_dim (`int`):
            Number of output channels.
        stride (`int`):
            Stride size to be used in convolution layers.
        expand_ratio (`int`):
            Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
        kernel_size (`int`):
            Kernel size for the depthwise convolution layer.
        drop_rate (`float`):
            Dropout rate to be used in the final phase of each block.
        id_skip (`bool`):
            Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
            of each block. Set to `True` for the first block of each stage.
        adjust_padding (`bool`):
            Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
            operation, set to `True` for inputs with odd input sizes.
    rL   r   rg   ra   expand_ratiorV   r   r   r   c
                    s   t    || _| jdk| _|| }
| jrt|||
|d| _t|| jr%|
n||||	d| _t|||
| jd| _	t
|| jr>|
n|||||d| _d S )Nr   )rL   r   rg   ra   )rL   r   ra   rV   r   )rL   r   r   r   )rL   r   rg   ra   r   r   )re   rf   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)r;   rL   r   rg   ra   r   rV   r   r   r   expand_in_dimrr   r*   r+   rf   ~  s4   

zAlignVisionBlock.__init__r!   r5   c                 C   s<   |}| j dkr| |}| |}| |}| ||}|S Nr   )r   r   r   r   r   )r;   r!   r   r*   r*   r+   rw     s   



zAlignVisionBlock.forwardr   r*   r*   rr   r+   r   c  s,    	
)r   c                	       sP   e Zd ZdZdef fddZ		ddejdee	 d	ee	 d
e
fddZ  ZS )AlignVisionEncoderz
    Forward propagates the embeddings through each vision encoder (EfficientNet) block.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
    rL   c                    s(  t    |j_fdd t|j}t fdd|jD }d}g }t|D ]c}t||j| }t||j	| }|j
| }	|j| }
|j| }t |j| D ]8}|dk}|dkr^dn|	}	|dkrf|n|}||jv}|j| | }t||||	|
||||d	}|| |d7 }qRq(t|_d S )Nc                    s   t t j|  S ru   )rR   mathceildepth_coefficient)repeatsr:   r*   r+   round_repeats  s   z2AlignVisionEncoder.__init__.<locals>.round_repeatsc                 3   s    | ]} |V  qd S ru   r*   )r8   n)r   r*   r+   r<     s    z.AlignVisionEncoder.__init__.<locals>.<genexpr>r   r   )	rL   r   rg   ra   rV   r   r   r   r   )re   rf   r   rE   r|   sumnum_block_repeatsrangerU   r}   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)r;   rL   num_base_blocks
num_blockscurr_block_numr   ir   rg   ra   rV   r   jr   r   r   blockrr   )r   r;   r+   rf     sD   







zAlignVisionEncoder.__init__FTr!   output_hidden_statesreturn_dictr5   c                 C   sV   |r|fnd }| j D ]}||}|r||f7 }q
|s%tdd ||fD S t||dS )Nc                 s   s    | ]	}|d ur|V  qd S ru   r*   )r8   vr*   r*   r+   r<     s    z-AlignVisionEncoder.forward.<locals>.<genexpr>)r    r!   )r   r)   r   )r;   r!   r   r   all_hidden_statesr   r*   r*   r+   rw     s   

zAlignVisionEncoder.forward)FT)r"   r#   r$   r%   r   rf   r&   r'   r   r   r   rw   ry   r*   r*   rr   r+   r     s    .r   c                       sb   e Zd ZdZ fddZ				ddeej deej deej deej d	ej	f
d
dZ
  ZS )AlignTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _t|dd| _| jdt|jddd | jd	tj| j tjd
dd d S )N)padding_idxrc   position_embedding_typeabsoluteposition_ids)r   F)
persistenttoken_type_ids)dtype)re   rf   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   r6   r   register_bufferr&   rD   r   r{   r   sizelongrq   rr   r*   r+   rf     s   

zAlignTextEmbeddings.__init__N	input_idsr   r   inputs_embedsr5   c                 C   s   |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u rNt| drC| jd d d |f }||d |}|}ntj|tj| jjd}|d u rW| 	|}| 
|}	||	 }
| jdkrn| |}|
|7 }
| |
}
| |
}
|
S )Nr   r   r   r   r   r@   r   )r   r   hasattrr   r   r&   r{   r   r@   r   r   r   r   r   r   )r;   r   r   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   r   r   r*   r*   r+   rw     s,   







zAlignTextEmbeddings.forward)NNNN)r"   r#   r$   r%   rf   r   r&   
LongTensorr'   rx   rw   ry   r*   r*   rr   r+   r     s$    r           modulequerykeyvalueattention_maskscalingr   	head_maskc                 K   s   t ||dd| }	|d ur'|d d d d d d d |jd f }
|	|
 }	tjj|	dt jd|j	}	tjj
|	|| jd}	|d urM|	|dddd }	t |	|}|dd }||	fS )NrN   r   r   )r   r   )r   trainingr   )r&   matmul	transposeshaper   rB   softmaxfloat32tor   r   r   view
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightscausal_maskattn_outputr*   r*   r+   eager_attention_forward=  s   &r  c                       sZ   e Zd Z fddZ			ddejdeej deej dee d	e	ej f
d
dZ
  ZS )AlignTextSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	t
|j| j	| _t
|j| j	| _t
|j| j	| _t
|j| _|j| _| jd | _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )re   rf   r   num_attention_headsr   
ValueErrorrL   rR   attention_head_sizeall_head_sizer   Linearr   r   r   r   attention_probs_dropout_probr   attention_dropoutr   rq   rr   r*   r+   rf   Y  s"   

zAlignTextSelfAttention.__init__NFr!   r   r   output_attentionsr5   c                 K   s   |j d d }g |d| jR }| ||dd}| ||dd}	| ||dd}
t}| jj	dkrCt
| jj	 }|| ||	|
|f| jsOdn| j| j|d|\}}|jg |dR   }|rp||f}|S |f}|S )Nr   r   rN   eagerr   )r   r   r   )r  r  r   r  r   r   r   r  rL   _attn_implementationr   r   r  r   reshaper  )r;   r!   r   r   r  r  r   hidden_shapequery_states
key_statesvalue_statesattention_interfacer
  r  outputsr*   r*   r+   rw   n  s4   	
zAlignTextSelfAttention.forwardNNF)r"   r#   r$   rf   r&   rx   r   r'   r   r)   rw   ry   r*   r*   rr   r+   r  X  s     r  c                       8   e Zd Z fddZdejdejdejfddZ  ZS )AlignTextSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )re   rf   r   r  r   denser   r   r   r   r   rq   rr   r*   r+   rf        
zAlignTextSelfOutput.__init__r!   input_tensorr5   c                 C   &   |  |}| |}| || }|S ru   r$  r   r   r;   r!   r&  r*   r*   r+   rw        

zAlignTextSelfOutput.forwardr"   r#   r$   rf   r&   rx   rw   ry   r*   r*   rr   r+   r"        $r"  c                       sb   e Zd Z fddZdd Z			ddejdeej d	eej d
ee	 de
ej f
ddZ  ZS )AlignTextAttentionc                    s*   t    t|| _t|| _t | _d S ru   )re   rf   r  r;   r"  outputsetpruned_headsrq   rr   r*   r+   rf     s   


zAlignTextAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   )r   )rE   r   r;   r  r  r0  r   r   r   r   r.  r$  r  union)r;   headsindexr*   r*   r+   prune_heads  s   zAlignTextAttention.prune_headsNFr!   r   r   r  r5   c           	      K   s@   | j |f|||d|}| |d |}|f|dd   }|S N)r   r   r  r   r   )r;   r.  )	r;   r!   r   r   r  r  self_outputsattention_outputr  r*   r*   r+   rw     s   zAlignTextAttention.forwardr   )r"   r#   r$   rf   r4  r&   rx   r   r'   r   r)   rw   ry   r*   r*   rr   r+   r-    s"    r-  c                       2   e Zd Z fddZdejdejfddZ  ZS )AlignTextIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S ru   )re   rf   r   r  r   intermediate_sizer$  rX   ro   strr	   intermediate_act_fnrq   rr   r*   r+   rf     s
   
zAlignTextIntermediate.__init__r!   r5   c                 C   s   |  |}| |}|S ru   )r$  r<  r   r*   r*   r+   rw     s   

zAlignTextIntermediate.forwardr+  r*   r*   rr   r+   r9    s    r9  c                       r!  )AlignTextOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r#  )re   rf   r   r  r:  r   r$  r   r   r   r   r   rq   rr   r*   r+   rf     r%  zAlignTextOutput.__init__r!   r&  r5   c                 C   r'  ru   r(  r)  r*   r*   r+   rw     r*  zAlignTextOutput.forwardr+  r*   r*   rr   r+   r=    r,  r=  c                       sb   e Zd Z fddZ			ddejdeej deej dee d	e	ej f
d
dZ
dd Z  ZS )AlignTextLayerc                    s:   t    |j| _d| _t|| _t|| _t|| _	d S r   )
re   rf   chunk_size_feed_forwardseq_len_dimr-  	attentionr9  intermediater=  r.  rq   rr   r*   r+   rf     s   


zAlignTextLayer.__init__NFr!   r   r   r  r5   c           
      K   sP   | j |f|||d|}|d }|dd  }t| j| j| j|}	|	f| }|S r5  )rA  r   feed_forward_chunkr?  r@  )
r;   r!   r   r   r  r  self_attention_outputsr7  r  layer_outputr*   r*   r+   rw     s    
zAlignTextLayer.forwardc                 C   s   |  |}| ||}|S ru   )rB  r.  )r;   r7  intermediate_outputrE  r*   r*   r+   rC    s   
z!AlignTextLayer.feed_forward_chunkr   )r"   r#   r$   rf   r&   rx   r   r'   r   r)   rw   rC  ry   r*   r*   rr   r+   r>    s"    
r>  c                       sz   e Zd Z fddZe					ddejdeej deej d	ee	 d
ee	 dee	 de
eej ef fddZ  ZS )AlignTextEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r*   )r>  )r8   r   rL   r*   r+   
<listcomp>  s    z-AlignTextEncoder.__init__.<locals>.<listcomp>F)	re   rf   rL   r   r   r   num_hidden_layerslayergradient_checkpointingrq   rr   rH  r+   rf     s   
 
zAlignTextEncoder.__init__NFTr!   r   r   r  r   r   r5   c                 K   s   |rdnd }|r
dnd }	t | jD ].\}
}|r||f }|d ur$||
 nd }|d||||d|}|d }|r?|	|d f }	q|rG||f }t|||	dS )Nr*   )r!   r   r   r  r   r   )r    r!   r.   )	enumeraterK  r   )r;   r!   r   r   r  r   r   r  r   all_self_attentionsr   layer_modulelayer_head_masklayer_outputsr*   r*   r+   rw     s2   

zAlignTextEncoder.forward)NNFFT)r"   r#   r$   rf   r   r&   rx   r   r'   r   r   r)   r   rw   ry   r*   r*   rr   r+   rG    s.    	rG  c                       r8  )AlignTextPoolerc                    s*   t    t|j|j| _t | _d S ru   )re   rf   r   r  r   r$  Tanhrp   rq   rr   r*   r+   rf   H  s   
zAlignTextPooler.__init__r!   r5   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r$  rp   )r;   r!   first_token_tensorpooled_outputr*   r*   r+   rw   M  s   

zAlignTextPooler.forwardr+  r*   r*   rr   r+   rR  G  s    rR  c                   @   s.   e Zd ZU eed< dZdZdejfddZ	dS )AlignPreTrainedModelrL   alignTr   c                 C   s   | j j}t|tjtjfr"|jjjd|d |j	dur!|j	j
  n;t|tr@tj|jj |jj	j
  |jj| j j nt|tjr]|jjjd|d |jdur]|jj|j 
  t|tjtjfru|j	j
  |jjd dS dS )zInitialize the weightsr   )meanstdNg      ?)rL   initializer_rangerX   r   r  ri   weightdatanormal_rb   zero_
AlignModelinitxavier_uniform_text_projectiontemperaturefill_temperature_init_valuer   r   r   rk   )r;   r   rY  r*   r*   r+   _init_weights\  s$   


z"AlignPreTrainedModel._init_weightsN)
r"   r#   r$   r   r(   base_model_prefixsupports_gradient_checkpointingr   Modulerf  r*   r*   r*   r+   rV  V  s
   
 rV  zJ
    The text model from ALIGN without any head or projection on top.
    c                       s   e Zd ZU eed< dgZddedef fddZdd Zd	d
 Z	e
e									ddeej deej deej deej deej deej dee dee dee deeef fddZ  ZS )AlignTextModelrL   r   Tadd_pooling_layerc                    sD   t  | || _t|| _t|| _|rt|nd| _| 	  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
re   rf   rL   r   r   rG  encoderrR  pooler	post_init)r;   rL   rk  rr   r*   r+   rf   y  s   

zAlignTextModel.__init__c                 C   s   | j jS ru   r   r   r:   r*   r*   r+   get_input_embeddings  s   z#AlignTextModel.get_input_embeddingsc                 C   s   || j _d S ru   ro  )r;   r   r*   r*   r+   set_input_embeddings  s   z#AlignTextModel.set_input_embeddingsNr   r   r   r   r   r   r  r   r   r5   c
                 K   s  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	|dur*|dur*td|dur9| || | }n|durF| dd }ntd|\}}|durU|jn|j}|du retj	||f|d}|du rt
| jdr| jjddd|f }|||}|}n	tj|tj|d}| ||}| || j j}| j||||d}| j|f||||d	d
|
}|d }| jdur| |nd}t|||j|jdS )a-  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignTextModel

        >>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr?   r   r   )r   r   r   r   T)r   r   r  r   r   r   )r    pooler_outputr!   r.   )rL   r  r   use_return_dictr  %warn_if_padding_and_no_attention_maskr   r@   r&   onesr   r   r   r   r{   r   get_extended_attention_maskget_head_maskrJ  rl  rm  r   r!   r.   )r;   r   r   r   r   r   r   r  r   r   r  r   
batch_sizer   r@   r   r   extended_attention_maskembedding_outputencoder_outputssequence_outputrU  r*   r*   r+   rw     sb   
	zAlignTextModel.forwardT)	NNNNNNNNN)r"   r#   r$   r   r(   _no_split_modulesr   rf   rp  rq  r   r   r   r&   rx   r'   r   r)   r   rw   ry   r*   r*   rr   r+   rj  p  sL   
 	

rj  zL
    The vision model from ALIGN without any head or projection on top.
    c                       s   e Zd ZU eed< dZdZdef fddZdej	fddZ
ee						ddeej d
ee dee deeef fddZ  ZS )AlignVisionModelrL   rt   Fc                    s~   t  | || _t|| _t|| _|jdkr"tj	|j
dd| _n|jdkr1tj|j
dd| _ntd|j |   d S )NrX  T)	ceil_moderQ   z2config.pooling must be one of ['mean', 'max'] got )re   rf   rL   r[   r   r   rl  pooling_typer   	AvgPool2d
hidden_dimrm  	MaxPool2dr  poolingrn  rq   rr   r*   r+   rf     s   



zAlignVisionModel.__init__r5   c                 C   s
   | j jjS ru   )vision_modelr   rj   r:   r*   r*   r+   rp    s   
z%AlignVisionModel.get_input_embeddingsNr   r   c                 C   s   |dur|n| j j}|dur|n| j j}|du rtd| |}| j||dd}|d }| |}||jdd }t	|||j
dS )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignVisionModel

        >>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```Nz You have to specify pixel_valuesT)r   r   r   rN   )r    rr  r!   )rL   r   rs  r  r   rl  rm  r  r  r   r!   )r;   rt   r   r   rz  r{  r    rU  r*   r*   r+   rw     s&   

zAlignVisionModel.forward)NNN)r"   r#   r$   r   r(   main_input_namerh  rf   r   ri  rp  r   r   r   r&   r'   r   r   r)   r   rw   ry   r*   r*   rr   r+   r    s(   
 
r  c                       sL  e Zd ZU eed< def fddZe e						ddee	j
 dee	j
 dee	j
 dee	j
 d	ee	j
 d
ee	j
 de	jfddZe ede	jde	jfddZee											ddee	j dee	j dee	j
 dee	j
 dee	j
 d	ee	j
 d
ee	j
 dee dee dee dee deeef fddZ  ZS )r_  rL   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	| _	|j
| _t|| _t|| _t| j| j	| _tt| jj| _|   d S )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )re   rf   rX   text_configr   	TypeErrortypevision_configr   projection_dimr   text_embed_dimrj  
text_modelr  r  r   r  rb  	Parameterr&   tensorrL   re  rc  rn  )r;   rL   r  r  rr   r*   r+   rf   I  s,   

zAlignModel.__init__Nr   r   r   r   r   r   r5   c           
      C   s>   | j ||||||d}|d dddddf }| |}	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AlignTextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)r   r   r   r   r   r   r   N)r  rb  )
r;   r   r   r   r   r   r   text_outputsr    text_featuresr*   r*   r+   get_text_featuresg  s   
zAlignModel.get_text_featuresrt   c                 C   s   | j |d}|j}|S )a]  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AlignVisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AlignModel
        >>> from transformers.image_utils import load_image

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```)rt   )r  rr  )r;   rt   vision_outputsimage_featuresr*   r*   r+   get_image_features  s   zAlignModel.get_image_featuresreturn_lossr  r   r   c                 C   s   |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}| j||
dd}| j|||||||	|
dd	}|d }|d dddddf }| |}||jdddd	 }||jdddd	 }t	||
 | j }|
 }d}|rut|}t|||||||d
S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AlignModel
        >>> from transformers.image_utils import load_image

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NT)rt   r   r   )	r   r   r   r   r   r   r  r   r   r   r   rN   r   )r   r   keepdim)r0   r1   r2   r-   r   r3   r4   )rL   r  r   rs  r  r  rb  normr&   r   rH   rc  rK   r/   )r;   r   rt   r   r   r   r   r   r  r  r   r   r  r  r   r-   r2   r1   r0   r*   r*   r+   rw     sN   +
zAlignModel.forward)NNNNNN)NNNNNNNNNNN)r"   r#   r$   r   r(   rf   r   r   r   r&   rx   r'   r  r  r   r   r   r   r)   r/   rw   ry   r*   r*   rr   r+   r_  E  s   
 (	

r_  )rV  rj  r  r_  r}  )r   N)Mr%   r   dataclassesr   typingr   r   r   r   r&   r   activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   r   configuration_alignr   r   r   
get_loggerr"   loggerr   r,   r/   rx   rF   rK   rR   rU   r)   r   rZ   ri  r[   ri   rz   r   r   r   r   r   r   r   r   r  r  r"  r-  r9  r=  r>  rG  rR  rV  rj  r  r_  __all__r*   r*   r*   r+   <module>   s   
%('!QJF
;.(2{P G