o
    wi                     @   s  d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZmZ ddlmZmZmZ ddlmZm Z m!Z! e"e#Z$eeddG dd deZ%eeddG dd deZ&eeG dd deZ'dej(dej(fddZ)dej(dej(fddZ*de!d e+fd!d"Z,dVd$ee+e-f d%e.fd&d'Z/G d(d) d)e
j0Z1G d*d+ d+e
j2Z3G d,d- d-e
j0Z4G d.d/ d/e
j0Z5G d0d1 d1e
j0Z6G d2d3 d3e
j0Z7G d4d5 d5e
j0Z8G d6d7 d7e
j0Z9G d8d9 d9e
j0Z:G d:d; d;e
j0Z;G d<d= d=e
j0Z<d>e;iZ=G d?d@ d@e
j0Z>G dAdB dBe
j0Z?G dCdD dDe
j0Z@G dEdF dFeZAG dGdH dHe
j0ZBG dIdJ dJe
j0ZCeG dKdL dLeZDedMdG dNdO dOeDZEedPdG dQdR dReDZFeG dSdT dTeDZGg dUZHdS )WzPyTorch ALIGN model.    N)	dataclass)AnyOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention)BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions(BaseModelOutputWithPoolingAndNoAttention)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging   )AlignConfigAlignTextConfigAlignVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                   @   sL   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dS )AlignVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_statehidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tuple r&   r&   e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/align/modeling_align.pyr   *   s
   
 r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                   @   sb   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )AlignTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr   r   
attentions)r   r   r    r!   r)   r   r"   r#   r$   r   r   r%   r*   r&   r&   r&   r'   r(   ;   s   
 r(   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeed< dZeed	< d
ee fddZdS )AlignOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The output of [`AlignVisionModel`].
    text_model_output (`BaseModelOutputWithPoolingAndCrossAttentions`):
        The output of the [`AlignTextModel`].
    vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
        The output of the [`AlignVisionModel`].
    Nlosslogits_per_imagelogits_per_textr)   r   text_model_outputvision_model_outputreturnc                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r/   r0   N)getattrto_tuple).0kselfr&   r'   	<genexpr>l   s
    
z'AlignOutput.to_tuple.<locals>.<genexpr>)r%   keysr6   r&   r6   r'   r3   k   s   zAlignOutput.to_tuple)r   r   r    r!   r,   r   r"   r#   r$   r-   r.   r)   r   r/   r   r0   r   r%   r   r3   r&   r&   r&   r'   r+   M   s   
 r+   logitsr1   c                 C   s"   t jj| tjt| | jdddS )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr"   arangelenr<   )r:   r&   r&   r'   contrastive_losst   s   "rB   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)rB   t)rC   caption_loss
image_lossr&   r&   r'   
align_lossx   s   rG   confignum_channelsc                 C   sJ   | j }|| j9 }t|t||d  | | }|d| k r!||7 }t|S )z<
    Round number of filters based on depth multiplier.
       g?)depth_divisorwidth_coefficientmaxint)rH   rI   divisornew_dimr&   r&   r'   round_filters   s   
rQ   Tkernel_sizeadjustc                 C   sn   t | tr	| | f} | d d | d d f}|r)|d d |d |d d |d fS |d |d |d |d fS )aJ  
    Utility function to get the tuple padding value for the depthwise convolution.

    Args:
        kernel_size (`int` or `tuple`):
            Kernel size of the convolution layers.
        adjust (`bool`, *optional*, defaults to `True`):
            Adjusts padding value to apply to right and bottom sides of the input.
    r   rJ   r   )
isinstancerN   )rR   rS   correctr&   r&   r'   correct_pad   s   

$rV   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	AlignVisionEmbeddingszL
    A module that corresponds to the stem module of the original work.
    rH   c                    sh   t    t|d| _tjdd| _tj|j| jddddd| _	tj
| j|j|jd	| _t|j | _d S )
N    )r   r   r   r   paddingr   rJ   validFrR   striderZ   bias)epsmomentum)super__init__rQ   out_dimr   	ZeroPad2drZ   Conv2drI   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr   
hidden_act
activationr7   rH   	__class__r&   r'   rb      s   
zAlignVisionEmbeddings.__init__pixel_valuesr1   c                 C   s,   |  |}| |}| |}| |}|S N)rZ   rf   rj   rl   )r7   rp   featuresr&   r&   r'   forward   s
   



zAlignVisionEmbeddings.forward)
r   r   r    r!   r   rb   r"   Tensorrs   __classcell__r&   r&   rn   r'   rW      s    rW   c                       s,   e Zd Z							d fdd	Z  ZS )	AlignVisionDepthwiseConv2dr   r   r   Tzerosc	           
         s*   || }	t  j||	|||||||d	 d S )N)	in_channelsout_channelsrR   r]   rZ   dilationgroupsr^   padding_mode)ra   rb   )
r7   rx   depth_multiplierrR   r]   rZ   rz   r^   r|   ry   rn   r&   r'   rb      s   
z#AlignVisionDepthwiseConv2d.__init__)r   r   r   r   r   Trw   )r   r   r    rb   ru   r&   r&   rn   r'   rv      s    rv   c                       sH   e Zd ZdZdedededef fddZdejd	ej	fd
dZ
  ZS )AlignVisionExpansionLayerz_
    This corresponds to the expansion phase of each block in the original implementation.
    rH   in_dimrc   r]   c                    sB   t    tj||dddd| _tj||jd| _t|j	 | _
d S )Nr   sameFrx   ry   rR   rZ   r^   )num_featuresr_   )ra   rb   r   re   expand_convrg   rh   	expand_bnr   rk   
expand_act)r7   rH   r   rc   r]   rn   r&   r'   rb      s   
z"AlignVisionExpansionLayer.__init__r   r1   c                 C   s"   |  |}| |}| |}|S rq   )r   r   r   r7   r   r&   r&   r'   rs      s   


z!AlignVisionExpansionLayer.forward)r   r   r    r!   r   rN   rb   r"   r#   rt   rs   ru   r&   r&   rn   r'   r~      s    r~   c                
       sL   e Zd ZdZdededededef
 fddZd	ej	d
ej
fddZ  ZS )AlignVisionDepthwiseLayerzk
    This corresponds to the depthwise convolution phase of each block in the original implementation.
    rH   r   r]   rR   adjust_paddingc                    sv   t    || _| jdkrdnd}t||d}tj|d| _t||||dd| _tj	||j
|jd| _t|j | _d S )	NrJ   r[   r   )rS   rY   Fr\   r   r_   r`   )ra   rb   r]   rV   r   rd   depthwise_conv_padrv   depthwise_convrg   rh   ri   depthwise_normr   rk   depthwise_act)r7   rH   r   r]   rR   r   conv_padrZ   rn   r&   r'   rb      s   


z"AlignVisionDepthwiseLayer.__init__r   r1   c                 C   s6   | j dkr
| |}| |}| |}| |}|S )NrJ   )r]   r   r   r   r   r   r&   r&   r'   rs     s   




z!AlignVisionDepthwiseLayer.forwardr   r   r    r!   r   rN   boolrb   r"   r#   rt   rs   ru   r&   r&   rn   r'   r      s    r   c                	       sJ   e Zd ZdZddedededef fddZd	ej	d
ej
fddZ  ZS )AlignVisionSqueezeExciteLayerzl
    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
    FrH   r   
expand_dimexpandc                    s   t    |r	|n|| _tdt||j | _tjdd| _	tj
| j| jddd| _tj
| j| jddd| _t|j | _t | _d S )Nr   )output_sizer   )rx   ry   rR   rZ   )ra   rb   dimrM   rN   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezere   reducer   r   rk   
act_reduceSigmoid
act_expand)r7   rH   r   r   r   rn   r&   r'   rb   !  s$   
z&AlignVisionSqueezeExciteLayer.__init__r   r1   c                 C   sF   |}|  |}| |}| |}| |}| |}t||}|S rq   )r   r   r   r   r   r"   mul)r7   r   inputsr&   r&   r'   rs   6  s   




z%AlignVisionSqueezeExciteLayer.forward)Fr   r&   r&   rn   r'   r     s     r   c                       sV   e Zd ZdZdedededededef fdd	Zd
e	j
de	j
de	jfddZ  ZS )AlignVisionFinalBlockLayerz[
    This corresponds to the final phase of each block in the original implementation.
    rH   r   rc   r]   	drop_rateid_skipc                    sX   t    |dko| | _tj||dddd| _tj||j|jd| _	tj
|d| _d S )Nr   r   Fr   r   )p)ra   rb   apply_dropoutr   re   project_convrg   rh   ri   
project_bnDropoutdropout)r7   rH   r   rc   r]   r   r   rn   r&   r'   rb   H  s   

z#AlignVisionFinalBlockLayer.__init__
embeddingsr   r1   c                 C   s0   |  |}| |}| jr| |}|| }|S rq   )r   r   r   r   )r7   r   r   r&   r&   r'   rs   Y  s   


z"AlignVisionFinalBlockLayer.forwardr   r   r    r!   r   rN   floatr   rb   r"   r#   rt   rs   ru   r&   r&   rn   r'   r   C  s     $r   c                       s\   e Zd ZdZdededededededed	ed
ef fddZde	j
de	jfddZ  ZS )AlignVisionBlocka  
    This corresponds to the block module of original the EfficientNet vision encoder implementation.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
        in_dim (`int`):
            Number of input channels.
        out_dim (`int`):
            Number of output channels.
        stride (`int`):
            Stride size to be used in convolution layers.
        expand_ratio (`int`):
            Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
        kernel_size (`int`):
            Kernel size for the depthwise convolution layer.
        drop_rate (`float`):
            Dropout rate to be used in the final phase of each block.
        id_skip (`bool`):
            Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
            of each block. Set to `True` for the first block of each stage.
        adjust_padding (`bool`):
            Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
            operation, set to `True` for inputs with odd input sizes.
    rH   r   rc   r]   expand_ratiorR   r   r   r   c
                    s   t    || _| jdkrdnd| _|| }
| jr"t|||
|d| _t|| jr)|
n||||	d| _t|||
| jd| _	t
|| jrB|
n|||||d| _d S )Nr   TF)rH   r   rc   r]   )rH   r   r]   rR   r   )rH   r   r   r   )rH   r   rc   r]   r   r   )ra   rb   r   r   r~   	expansionr   r   r   squeeze_exciter   
projection)r7   rH   r   rc   r]   r   rR   r   r   r   expand_in_dimrn   r&   r'   rb     s4   

zAlignVisionBlock.__init__r   r1   c                 C   s<   |}| j dkr| |}| |}| |}| ||}|S )Nr   )r   r   r   r   r   )r7   r   r   r&   r&   r'   rs     s   



zAlignVisionBlock.forwardr   r&   r&   rn   r'   r   d  s,    	
)r   c                	       sP   e Zd ZdZdef fddZ		ddejdee	 d	ee	 d
e
fddZ  ZS )AlignVisionEncoderz
    Forward propagates the embeddings through each vision encoder (EfficientNet) block.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
    rH   c                    s8  t    |j_fdd t|j}t fdd|jD }d}g }t|D ]k}t||j| }t||j	| }|j
| }	|j| }
|j| }t |j| D ]@}|dkrZdnd}|dkrbdn|	}	|dkrj|n|}||jv rsdnd}|j| | }t||||	|
||||d		}|| |d7 }qRq(t|_d S )
Nc                    s   t t j|  S rq   )rN   mathceildepth_coefficient)repeatsr6   r&   r'   round_repeats  s   z2AlignVisionEncoder.__init__.<locals>.round_repeatsc                 3   s    | ]} |V  qd S rq   r&   )r4   n)r   r&   r'   r8     s    z.AlignVisionEncoder.__init__.<locals>.<genexpr>r   TFr   )	rH   r   rc   r]   rR   r   r   r   r   )ra   rb   r   rA   rx   sumnum_block_repeatsrangerQ   ry   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)r7   rH   num_base_blocks
num_blockscurr_block_numr   ir   rc   r]   rR   r   jr   r   r   blockrn   )r   r7   r'   rb     sD   






zAlignVisionEncoder.__init__FTr   output_hidden_statesreturn_dictr1   c                 C   sV   |r|fnd }| j D ]}||}|r||f7 }q
|s%tdd ||fD S t||dS )Nc                 s       | ]	}|d ur|V  qd S rq   r&   r4   vr&   r&   r'   r8     s    z-AlignVisionEncoder.forward.<locals>.<genexpr>)r   r   )r   r%   r
   )r7   r   r   r   all_hidden_statesr   r&   r&   r'   rs     s   

zAlignVisionEncoder.forward)FT)r   r   r    r!   r   rb   r"   r#   r   r   r   rs   ru   r&   r&   rn   r'   r     s    .r   c                       sh   e Zd ZdZ fddZ					ddeej deej deej d	eej d
e	dej
fddZ  ZS )AlignTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _t|dd| _| jdt|jddd | jd	tj| j tjd
dd d S )N)padding_idxr_   position_embedding_typeabsoluteposition_ids)r   F)
persistenttoken_type_idsdtype)ra   rb   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   r2   r   register_bufferr"   r@   r   rw   r   sizelongrm   rn   r&   r'   rb     s   

zAlignTextEmbeddings.__init__Nr   	input_idsr   r   inputs_embedspast_key_values_lengthr1   c                 C   s   |d ur	|  }n|  d d }|d }|d u r&| jd d ||| f }|d u rPt| drE| jd d d |f }||d |}	|	}ntj|tj| jjd}|d u rY| 	|}| 
|}
||
 }| jdkrp| |}||7 }| |}| |}|S )Nr   r   r   r   r   r<   r   )r   r   hasattrr   r   r"   rw   r   r<   r   r   r   r   r   r   )r7   r   r   r   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   r   r   r&   r&   r'   rs     s,   







zAlignTextEmbeddings.forward)NNNNr   )r   r   r    r!   rb   r   r"   
LongTensorr#   rN   rt   rs   ru   r&   r&   rn   r'   r      s*    r   c                       s   e Zd Zd fdd	ZdejdejfddZ						dd	ejd
eej deej deej deej dee	e	ej   dee
 de	ej fddZ  ZS )AlignTextSelfAttentionNc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _|p\t|dd| _| jdksh| jd	kry|j| _t	d
|j d | j| _|j| _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   relative_keyrelative_key_queryrJ   r   )ra   rb   r   num_attention_headsr   
ValueErrorrN   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probr   r2   r   r   r   distance_embedding
is_decoderr7   rH   r   rn   r&   r'   rb   B  s*   

zAlignTextSelfAttention.__init__xr1   c                 C   s6   |  d d | j| jf }||}|ddddS )Nr   r   rJ   r   r   )r   r   r   viewpermute)r7   r  new_x_shaper&   r&   r'   transpose_for_scores\  s   
z+AlignTextSelfAttention.transpose_for_scoresFr   attention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 C   s  |  |}|d u}	|	r|d ur|d }
|d }|}nP|	r/| | |}
| | |}|}n;|d urZ| | |}
| | |}tj|d |
gdd}
tj|d |gdd}n| | |}
| | |}| |}|d u}| jrz|
|f}t||
dd}| j	dks| j	dkr	|j
d |
j
d }}|rtj|d tj|jd	dd}ntj|tj|jd	dd}tj|tj|jd	dd}|| }| || j d }|j|jd
}| j	dkrtd||}|| }n| j	dkr	td||}td|
|}|| | }|t| j }|d ur|| }tjj|dd}| |}|d ur0|| }t||}|dddd }| d d | jf }||}|rX||fn|f}| jrd||f }|S )Nr   r   rJ   r   r   r   r   r   r   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   ) r   r	  r   r   r"   catr  matmul	transposer   shapetensorr   r<   r  r@   r  r   tor   einsumr   sqrtr   r   r>   softmaxr   r  
contiguousr   r   )r7   r   r
  r  r  r  r  r  mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputsr&   r&   r'   rs   a  sn   









zAlignTextSelfAttention.forwardrq   NNNNNF)r   r   r    rb   r"   rt   r	  r   r#   r%   r   rs   ru   r&   r&   rn   r'   r   A  s4    	r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )AlignTextSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )ra   rb   r   r   r   denser   r   r   r   r   rm   rn   r&   r'   rb        
zAlignTextSelfOutput.__init__r   input_tensorr1   c                 C   &   |  |}| |}| || }|S rq   r4  r   r   r7   r   r6  r&   r&   r'   rs        

zAlignTextSelfOutput.forwardr   r   r    rb   r"   rt   rs   ru   r&   r&   rn   r'   r2        $r2  eagerc                       s   e Zd Zd fdd	Zdd Z						ddejdeej d	eej d
eej deej dee	e	ej   dee
 de	ej fddZ  ZS )AlignTextAttentionNc                    s4   t    t|j ||d| _t|| _t | _d S )Nr   )	ra   rb   !ALIGN_TEXT_SELF_ATTENTION_CLASSES_attn_implementationr7   r2  outputsetpruned_headsr  rn   r&   r'   rb     s   

zAlignTextAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r  )rA   r   r7   r   r   rD  r   r   r   r   rB  r4  r   union)r7   headsindexr&   r&   r'   prune_heads  s   zAlignTextAttention.prune_headsFr   r
  r  r  r  r  r  r1   c              	   C   s<   |  |||||||}| |d |}	|	f|dd   }
|
S )Nr   r   )r7   rB  )r7   r   r
  r  r  r  r  r  self_outputsattention_outputr/  r&   r&   r'   rs     s   
	zAlignTextAttention.forwardrq   r0  )r   r   r    rb   rH  r"   rt   r   r#   r%   r   rs   ru   r&   r&   rn   r'   r>    s4    	r>  c                       2   e Zd Z fddZdejdejfddZ  ZS )AlignTextIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S rq   )ra   rb   r   r   r   intermediate_sizer4  rT   rk   strr   intermediate_act_fnrm   rn   r&   r'   rb     s
   
zAlignTextIntermediate.__init__r   r1   c                 C   s   |  |}| |}|S rq   )r4  rO  r   r&   r&   r'   rs     s   

zAlignTextIntermediate.forwardr;  r&   r&   rn   r'   rL    s    rL  c                       r1  )AlignTextOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r3  )ra   rb   r   r   rM  r   r4  r   r   r   r   r   rm   rn   r&   r'   rb   !  r5  zAlignTextOutput.__init__r   r6  r1   c                 C   r7  rq   r8  r9  r&   r&   r'   rs   '  r:  zAlignTextOutput.forwardr;  r&   r&   rn   r'   rP     r<  rP  c                       s   e Zd Z fddZ						ddejdeej deej deej d	eej d
eeeej   dee	 deej fddZ
dd Z  ZS )AlignTextLayerc                    sr   t    |j| _d| _t|| _|j| _|j| _| jr-| js&t|  dt|dd| _	t
|| _t|| _d S )Nr   z> should be used as a decoder model if cross attention is addedr   r?  )ra   rb   chunk_size_feed_forwardseq_len_dimr>  	attentionr  add_cross_attentionr   crossattentionrL  intermediaterP  rB  rm   rn   r&   r'   rb   0  s   


zAlignTextLayer.__init__NFr   r
  r  r  r  r  r  r1   c              	   C   s  |d ur
|d d nd }| j |||||d}	|	d }
| jr(|	dd }|	d }n|	dd  }d }| jro|d urot| dsDtd|  d|d urN|d	d  nd }| |
||||||}|d }
||dd  }|d }|| }t| j| j| j|
}|f| }| jr||f }|S )
NrJ   )r  r  r   r   r   rV  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r  )	rT  r  r   r   rV  r   feed_forward_chunkrR  rS  )r7   r   r
  r  r  r  r  r  self_attn_past_key_valueself_attention_outputsrJ  r/  present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputr&   r&   r'   rs   >  sP   


	

zAlignTextLayer.forwardc                 C   s   |  |}| ||}|S rq   )rW  rB  )r7   rJ  intermediate_outputr_  r&   r&   r'   rX    s   
z!AlignTextLayer.feed_forward_chunkr0  )r   r   r    rb   r"   rt   r   r#   r%   r   rs   rX  ru   r&   r&   rn   r'   rQ  /  s4    	
ArQ  c                       s   e Zd Z fddZ									ddejdeej deej d	eej d
eej deeeej   dee	 dee	 dee	 dee	 de
eej ef fddZ  ZS )AlignTextEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r&   )rQ  )r4   _rH   r&   r'   
<listcomp>  s    z-AlignTextEncoder.__init__.<locals>.<listcomp>F)	ra   rb   rH   r   r   r   num_hidden_layerslayergradient_checkpointingrm   rn   rc  r'   rb     s   
 
zAlignTextEncoder.__init__NFTr   r
  r  r  r  past_key_valuesr!  r  r   r   r1   c              
   C   s8  |	rdnd }|r
dnd }|r| j jrdnd }| jr%| jr%|r%td d}|r)dnd }t| jD ]K\}}|	r;||f }|d urC|| nd }|d urM|| nd }||||||||d}|d }|rg||d f7 }|r{||d f }| j jr{||d f }q0|	r||f }|
std	d
 |||||fD S t	|||||dS )Nr&   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r  r  r  r   r   r   rJ   c                 s   r   rq   r&   r   r&   r&   r'   r8     s    z+AlignTextEncoder.forward.<locals>.<genexpr>)r   rh  r   r*   cross_attentions)
rH   rU  rg  trainingloggerwarning_once	enumeraterf  r%   r   )r7   r   r
  r  r  r  rh  r!  r  r   r   r   all_self_attentionsall_cross_attentionsnext_decoder_cacher   layer_modulelayer_head_maskr  layer_outputsr&   r&   r'   rs     sd   


zAlignTextEncoder.forward)	NNNNNNFFT)r   r   r    rb   r"   rt   r   r#   r%   r   r   r   rs   ru   r&   r&   rn   r'   ra    sD    		
ra  c                       rK  )AlignTextPoolerc                    s*   t    t|j|j| _t | _d S rq   )ra   rb   r   r   r   r4  Tanhrl   rm   rn   r&   r'   rb     s   
zAlignTextPooler.__init__r   r1   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r4  rl   )r7   r   first_token_tensorpooled_outputr&   r&   r'   rs     s   

zAlignTextPooler.forwardr;  r&   r&   rn   r'   rt    s    rt  c                   @   s    e Zd ZeZdZdZdd ZdS )AlignPreTrainedModelalignTc                 C   s   t |tjtjfr |jjjd| jjd |j	dur|j	j
  n8t |tr9tj|jj |jj	j
  d|j_nt |tjrX|jjjd| jjd |jdurX|jj|j 
  t |tjrm|j	j
  |jjd dS dS )zInitialize the weightsg        )meanstdNTg      ?)rT   r   r   re   weightdatanormal_rH   initializer_ranger^   zero_
AlignModelinitxavier_uniform_text_projection_is_hf_initializedr   r   r   fill_)r7   moduler&   r&   r'   _init_weights  s"   



z"AlignPreTrainedModel._init_weightsN)r   r   r    r   config_classbase_model_prefixsupports_gradient_checkpointingr  r&   r&   r&   r'   rx    s
    rx  zJ
    The text model from ALIGN without any head or projection on top.
    c                       s   e Zd ZeZdgZddedef fddZdd Zd	d
 Z	e
									ddeej deej deej deej deej deej dee dee dee deeef fddZ  ZS )AlignTextModelr   TrH   add_pooling_layerc                    sD   t  | || _t|| _t|| _|rt|nd| _| 	  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
ra   rb   rH   r   r   ra  encoderrt  pooler	post_init)r7   rH   r  rn   r&   r'   rb   	  s   

zAlignTextModel.__init__c                 C   s   | j jS rq   r   r   r6   r&   r&   r'   get_input_embeddings  s   z#AlignTextModel.get_input_embeddingsc                 C   s   || j _d S rq   r  )r7   r   r&   r&   r'   set_input_embeddings  s   z#AlignTextModel.set_input_embeddingsNr   r
  r   r   r  r   r  r   r   r1   c
                 C   s  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	|dur*|dur*td|dur9| || | }
n|durF| dd }
ntd|
\}}|durU|jn|j}|du retj	||f|d}|du rt
| jdr| jjddd|f }|||}|}n	tj|
tj|d}| ||
}| || j j}| j||||d}| j||||||	d	}|d
 }| jdur| |nd}|	s||f|dd  S t|||j|j|jdS )a-  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignTextModel

        >>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr;   r   r   )r   r   r   r   )r
  r  r  r   r   r   r   )r   pooler_outputr   r*   ri  )rH   r  r   use_return_dictr   %warn_if_padding_and_no_attention_maskr   r<   r"   onesr   r   r   r   rw   r   get_extended_attention_maskget_head_maskre  r  r  r   r   r*   ri  )r7   r   r
  r   r   r  r   r  r   r   r   
batch_sizer   r<   r   r   extended_attention_maskembedding_outputencoder_outputssequence_outputrw  r&   r&   r'   rs     sb   
zAlignTextModel.forwardT	NNNNNNNNN)r   r   r    r   r  _no_split_modulesr   rb   r  r  r   r   r"   rt   r   r%   r   rs   ru   r&   r&   rn   r'   r     sJ    	

r  zL
    The vision model from ALIGN without any head or projection on top.
    c                       sz   e Zd ZeZdZdZdef fddZdej	fddZ
e						ddeej d
ee dee deeef fddZ  ZS )AlignVisionModelrp   FrH   c                    s~   t  | || _t|| _t|| _|jdkr"tj	|j
dd| _n|jdkr1tj|j
dd| _ntd|j |   d S )Nrz  T)	ceil_moderM   z2config.pooling must be one of ['mean', 'max'] got )ra   rb   rH   rW   r   r   r  pooling_typer   	AvgPool2d
hidden_dimr  	MaxPool2dr   poolingr  rm   rn   r&   r'   rb     s   



zAlignVisionModel.__init__r1   c                 C   s
   | j jjS rq   )vision_modelr   rf   r6   r&   r&   r'   r    s   
z%AlignVisionModel.get_input_embeddingsNr   r   c                 C   s   |dur|n| j j}|dur|n| j j}|du rtd| |}| j|||d}|d }| |}||jdd }|sH||f|dd  S t	|||j
dS )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignVisionModel

        >>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```Nz You have to specify pixel_values)r   r   r   rJ   r   )r   r  r   )rH   r   r  r   r   r  r  reshaper  r   r   )r7   rp   r   r   r  r  r   rw  r&   r&   r'   rs     s*   

zAlignVisionModel.forwardNNN)r   r   r    r   r  main_input_namer  rb   r   Moduler  r   r   r"   r#   r   r   r%   r   rs   ru   r&   r&   rn   r'   r    s&    
r  c                       sp  e Zd ZeZdef fddZe									ddeej	 deej	 deej	 deej	 d	eej	 d
eej	 dee
 dee
 dee
 dejfddZe			ddeej dee
 dee
 dejfddZe											ddeej deej deej	 deej	 deej	 d	eej	 d
eej	 dee
 dee
 dee
 dee
 deeef fddZ  ZS )r  rH   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	| _	|j
| _t|| _t|| _t| j| j	| _tt| jj| _|   d S )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )ra   rb   rT   text_configr   	TypeErrortypevision_configr   projection_dimr   text_embed_dimr  
text_modelr  r  r   r   r  	Parameterr"   r  rH   temperature_init_valuetemperaturer  )r7   rH   r  r  rn   r&   r'   rb     s,   

zAlignModel.__init__Nr   r
  r   r   r  r   r  r   r   r1   c
                 C   s   |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	| j|||||||||	d	}
|
d dddddf }| |}|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AlignTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```N	r   r
  r   r   r  r   r  r   r   r   )rH   r  r   r  r  r  )r7   r   r
  r   r   r  r   r  r   r   text_outputsr   text_featuresr&   r&   r'   get_text_features  s$   
zAlignModel.get_text_featuresrp   c                 C   sD   |dur|n| j j}|dur|n| j j}| j|||d}|d }|S )a9  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AlignVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```Nrp   r   r   r   )rH   r   r  r  )r7   rp   r   r   vision_outputsimage_featuresr&   r&   r'   get_image_features/  s   zAlignModel.get_image_featuresreturn_lossc                 C   s*  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}| j||
|d}| j|||||||	|
|d	}|d }|d dddddf }| |}||jdddd	 }||jdddd	 }t	||
 | j }|
 }d}|rut|}|s||||||f}|dur|f| S |S t|||||||d
S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr  r  r   r   rJ   r   T)r   r   keepdim)r,   r-   r.   r)   r   r/   r0   )rH   r  r   r  r  r  r  normr"   r  rD   r  rG   r+   )r7   r   rp   r
  r   r   r  r   r  r  r   r   r  r  r   r)   r.   r-   r,   rB  r&   r&   r'   rs   \  sT   )
zAlignModel.forwardr  r  )NNNNNNNNNNN)r   r   r    r   r  rb   r   r   r"   rt   r   r#   r  r  r   r   r%   r+   rs   ru   r&   r&   rn   r'   r    s    	
4,	

r  )rx  r  r  r  r  )Ir!   r   dataclassesr   typingr   r   r   r"   torch.utils.checkpointr   activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_alignr   r   r   
get_loggerr   rk  r   r(   r+   rt   rB   rG   rN   rQ   r%   r   rV   r  rW   re   rv   r~   r   r   r   r   r   r   r   r2  r@  r>  rL  rP  rQ  ra  rt  rx  r  r  r  __all__r&   r&   r&   r'   <module>   s   
%('!QKA 4WR|R d