o
    ei                     @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlZddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z( e#)e*Z+dej,dej,fddZ-dej,dej,fddZ.ee!G dd deZ/G dd de
j0Z1G dd de
j0Z2G dd de
j0Z3d e2iZ4G d!d" d"e
j0Z5G d#d$ d$e
j0Z6G d%d& d&e
j0Z7G d'd( d(eZ8G d)d* d*e
j0Z9G d+d, d,e
j0Z:	-dPd.e
j0d/ej,d0ej,d1ej,d2ej,dB d3e;d4e;fd5d6Z<G d7d8 d8e
j0Z=G d9d: d:e
j0Z>G d;d< d<eZ?G d=d> d>e
j0Z@G d?d@ d@e
j0ZAe!G dAdB dBeZBG dCdD dDe
j0ZCG dEdF dFeBZDe!dGdHG dIdJ dJeBZEG dKdL dLeBZFG dMdN dNeBZGg dOZHdS )QzPyTorch AltCLIP model.    N)Callable)	dataclass)Any   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions'BaseModelOutputWithPoolingAndProjection)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int   )AltCLIPConfigAltCLIPTextConfigAltCLIPVisionConfiglogitsreturnc                 C   s   t j| tjt| | jdS )Ndevice)nn
functionalcross_entropytorcharangelenr   )r    r%   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/altclip/modeling_altclip.pycontrastive_loss-   s   r'   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)r'   t)r(   caption_loss
image_lossr%   r%   r&   	clip_loss1   s   r,   c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZejdB ed< dZeed< dZeed	< d
ee fddZdS )AltCLIPOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r3   r4   N)getattrto_tuple).0kselfr%   r&   	<genexpr>W   s
    
z)AltCLIPOutput.to_tuple.<locals>.<genexpr>)tuplekeysr9   r%   r9   r&   r6   V   s   zAltCLIPOutput.to_tuple)__name__
__module____qualname____doc__r.   r"   FloatTensor__annotations__r/   r0   r1   r2   r3   r
   r4   r<   r   r6   r%   r%   r%   r&   r-   7   s   
 r-   c                       s   e Zd ZdZ fddZ					ddejdB dejdB dejdB d	ejdB d
edej	fddZ
edd ZedddZ  ZS )AltRobertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	tj
|j|jd| _
t|j| _| jdt|jddd | jdtj| j tjddd |j| _tj|j|j| jd| _d S )	N)padding_idxepsposition_idsr   F
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr"   r#   max_position_embeddingsexpandzerosrH   sizelongrE   position_embeddingsr:   config	__class__r%   r&   rQ   a   s   
zAltRobertaEmbeddings.__init__Nr   	input_idsrM   rH   inputs_embedspast_key_values_lengthr   c                 C   s  |d u r|d ur|  || j|}n| || j}|d ur!| }n| d d }|\}}|d u rZt| drO| j|jd d}	tj	|	d|d}	|	||}ntj
|tj| jjd}|d u rc| |}| |}
||
 }| |}|| }| |}| |}|S )NrJ   rM   r   r   )dimindexrO   r   )"create_position_ids_from_input_idsrE   &create_position_ids_from_inputs_embedsrb   hasattrrM   r`   shaper"   gatherra   rc   rH   r   rV   rX   rd   rY   r]   )r:   ri   rM   rH   rj   rk   input_shape
batch_size
seq_lengthbuffered_token_type_idsrX   
embeddingsrd   r%   r%   r&   forwardu   s2   






zAltRobertaEmbeddings.forwardc                 C   sJ   |   dd }|d }tj|d || d tj| jd}|d|S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        NrJ   r   rn   r   )rb   r"   r#   rc   r   	unsqueezer`   )rj   rE   rt   sequence_lengthrH   r%   r%   r&   rp      s   
z;AltRobertaEmbeddings.create_position_ids_from_inputs_embedsc                 C   s6   |  | }tj|dd|| | }| | S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r   rl   )neintr"   cumsumtype_asrc   )ri   rE   rk   maskincremental_indicesr%   r%   r&   ro      s   z7AltRobertaEmbeddings.create_position_ids_from_input_ids)NNNNr   )r   )r>   r?   r@   rA   rQ   r"   
LongTensorrB   r~   Tensorry   staticmethodrp   ro   __classcell__r%   r%   rg   r&   rD   ^   s2    
0
rD   c                
       N   e Zd Z fddZ		ddejdejdB dedB deej fd	d
Z	  Z
S )AltRobertaSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())rP   rQ   rT   num_attention_headsrq   
ValueErrorr~   attention_head_sizeall_head_sizer   Linearquerykeyvaluer[   attention_probs_dropout_probr]   re   rg   r%   r&   rQ      s   

z AltRobertaSelfAttention.__init__NFhidden_statesattention_maskoutput_attentionsr   c                 C   s  |j d d }g |d| jR }| ||dd}| ||dd}| ||dd}t||dd}	|	t	
| j }	|d urO|	| }	tjj|	dd}
| |
}
t|
|}|dddd }| d d | jf }||}|r||
f}|S |f}|S )NrJ   r      r|   r   r   )rr   r   r   view	transposer   r   r"   matmulmathsqrtr   r    softmaxr]   permute
contiguousrb   r   )r:   r   r   r   rt   hidden_shapequery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr%   r%   r&   ry      s&   

zAltRobertaSelfAttention.forwardNFr>   r?   r@   rQ   r"   r   rB   boolr<   ry   r   r%   r%   rg   r&   r      s    r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )AltRobertaSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S NrF   )rP   rQ   r   r   rT   denserY   rZ   r[   r\   r]   re   rg   r%   r&   rQ        
zAltRobertaSelfOutput.__init__r   input_tensorr   c                 C   &   |  |}| |}| || }|S Nr   r]   rY   r:   r   r   r%   r%   r&   ry   
     

zAltRobertaSelfOutput.forwardr>   r?   r@   rQ   r"   r   ry   r   r%   r%   rg   r&   r         $r   eagerc                
       r   )AltRobertaAttentionc                    s(   t    t|j || _t|| _d S r   )rP   rQ   "ALT_ROBERTA_SELF_ATTENTION_CLASSES_attn_implementationr:   r   outputre   rg   r%   r&   rQ     s   
zAltRobertaAttention.__init__NFr   r   r   r   c                 C   s6   | j |||d}| |d |}|f|dd   }|S N)r   r   r   r   )r:   r   )r:   r   r   r   self_outputsattention_outputr   r%   r%   r&   ry     s   zAltRobertaAttention.forwardr   r   r%   r%   rg   r&   r     s    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )AltRobertaIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )rP   rQ   r   r   rT   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnre   rg   r%   r&   rQ   .  s
   
zAltRobertaIntermediate.__init__r   r   c                 C   s   |  |}| |}|S r   )r   r   r:   r   r%   r%   r&   ry   6  s   

zAltRobertaIntermediate.forwardr   r%   r%   rg   r&   r   -  s    r   c                       r   )AltRobertaOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )rP   rQ   r   r   r   rT   r   rY   rZ   r[   r\   r]   re   rg   r%   r&   rQ   >  r   zAltRobertaOutput.__init__r   r   r   c                 C   r   r   r   r   r%   r%   r&   ry   D  r   zAltRobertaOutput.forwardr   r%   r%   rg   r&   r   =  r   r   c                       s^   e Zd Z fddZ		ddejdejdB dedB dee	 d	e
ej f
d
dZdd Z  ZS )AltRobertaLayerc                    s:   t    |j| _d| _t|| _t|| _t|| _	d S )Nr   )
rP   rQ   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   re   rg   r%   r&   rQ   M  s   


zAltRobertaLayer.__init__NFr   r   r   kwargsr   c           	      K   sN   | j |f||d|}|d }|dd  }t| j| j| j|}|f| }|S r   )r   r   feed_forward_chunkr   r   )	r:   r   r   r   r   self_attention_outputsr   r   layer_outputr%   r%   r&   ry   U  s   
zAltRobertaLayer.forwardc                 C   s   |  |}| ||}|S r   )r   r   )r:   r   intermediate_outputr   r%   r%   r&   r   l  s   
z"AltRobertaLayer.feed_forward_chunkr   )r>   r?   r@   rQ   r"   r   rB   r   r   r   r<   ry   r   r   r%   r%   rg   r&   r   L  s     
r   c                       sr   e Zd Z fddZe				ddejdejdB dedB d	edB d
edB de	e
 deej eB fddZ  ZS )AltRobertaEncoderc                    :   t     | _t fddt jD | _d| _d S )Nc                       g | ]}t  qS r%   )r   )r7   irf   r%   r&   
<listcomp>w      z.AltRobertaEncoder.__init__.<locals>.<listcomp>F)	rP   rQ   rf   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingre   rg   r   r&   rQ   t     
 
zAltRobertaEncoder.__init__NFTr   r   r   output_hidden_statesreturn_dictr   r   c                 K   s   |rdnd }|r
dnd }t | jD ]"\}	}
|r||f }|
|||fi |}|d }|r3||d f }q|r;||f }t|||dS )Nr%   r   r   last_hidden_stater   
attentions)	enumerater   r	   )r:   r   r   r   r   r   r   all_hidden_statesall_self_attentionsr   layer_modulelayer_outputsr%   r%   r&   ry   z  s.   


zAltRobertaEncoder.forward)NFFT)r>   r?   r@   rQ   r   r"   r   rB   r   r   r   r<   r	   ry   r   r%   r%   rg   r&   r   s  s,    r   c                       r   )AltRobertaPoolerc                    s*   t    t|j|j| _t | _d S r   )rP   rQ   r   r   rT   r   Tanh
activationre   rg   r%   r&   rQ     s   
zAltRobertaPooler.__init__r   r   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r   )r:   r   first_token_tensorpooled_outputr%   r%   r&   ry     s   

zAltRobertaPooler.forwardr   r%   r%   rg   r&   r     s    r           moduler   r   r   r   scalingr]   c           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )NrJ   r   )rl   rO   )ptrainingr   r   )r"   r   r   r   r    r   float32torO   r]   r   r   )
r   r   r   r   r   r   r]   r   attn_weightsattn_outputr%   r%   r&   eager_attention_forward  s   
r   c                       sd   e Zd ZdZ fddZ		ddejdejdB dedB d	ee	 d
e
ejejdB f f
ddZ  ZS )AltCLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).      F)rP   rQ   rf   rT   	embed_dimr   	num_headshead_dimr   scaleattention_dropoutr]   	is_causalr   r   k_projv_projq_projout_projre   rg   r%   r&   rQ     s$   

zAltCLIPAttention.__init__NFr   r   r   r   r   c                 K   s   |j \}}}| |}| |}	| |}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
t	| j
jt}|| ||	|
|f| j| jsUdn| jd|\}}|||| }| |}|sqd}||fS )z#Input shape: Batch x Time x Channelr   r   r   )r   r]   N)rr   r  r  r  r   r   r   r   r   get_interfacerf   r   r   r  r   r]   reshaper   r  )r:   r   r   r   r   ru   rv   r   queriesr=   valuesattention_interfacer   r   r%   r%   r&   ry     s6   	




zAltCLIPAttention.forwardr   )r>   r?   r@   rA   rQ   r"   r   r   r   r   r<   ry   r   r%   r%   rg   r&   r     s     r   c                       r   )
AltCLIPMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S r   )rP   rQ   rf   r   r   activation_fnr   r   rT   r   fc1fc2re   rg   r%   r&   rQ   
  s
   
zAltCLIPMLP.__init__r   r   c                 C   s"   |  |}| |}| |}|S r   )r  r  r  r   r%   r%   r&   ry     s   


zAltCLIPMLP.forwardr   r%   r%   rg   r&   r  	  s    r  c                       sV   e Zd Zdef fddZ	ddejdejdedB d	ee	 d
e
ej f
ddZ  ZS )AltCLIPEncoderLayerrf   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S r   )rP   rQ   rT   r   r   	self_attnr   rY   rZ   layer_norm1r  mlplayer_norm2re   rg   r%   r&   rQ     s   


zAltCLIPEncoderLayer.__init__Fr   r   r   Nr   r   c                 K   sj   |}|  |}| jd|||d|\}}|| }|}| |}| |}|| }|f}|r3||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   r   Nr%   )r  r  r  r  )r:   r   r   r   r   residualr   r   r%   r%   r&   ry   !  s$   




zAltCLIPEncoderLayer.forwardF)r>   r?   r@   r   rQ   r"   r   r   r   r   r<   rB   ry   r   r%   r%   rg   r&   r    s    r  c                       sp   e Zd ZdZdef fddZe				ddejdB de	dB de	dB d	e	dB d
e
e deeB fddZ  ZS )AltCLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].

    Args:
        config: AltCLIPConfig
    rf   c                    r   )Nc                    r   r%   )r  )r7   _r   r%   r&   r   V  r   z+AltCLIPEncoder.__init__.<locals>.<listcomp>F)	rP   rQ   rf   r   r   r   r   layersr   re   rg   r   r&   rQ   S  r   zAltCLIPEncoder.__init__Nr   r   r   r   r   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ]#\}
}|r<||	f }||	|fd|i|}|d }	|rT||d f }q1|r\||	f }t|	||dS )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr%   r   r   r   r   )rf   r   r   use_return_dictr   r  r	   )r:   rj   r   r   r   r   r   encoder_statesall_attentionsr   idxencoder_layerr   r%   r%   r&   ry   Y  s6    

zAltCLIPEncoder.forward)NNNN)r>   r?   r@   rA   r   rQ   r   r"   r   r   r   r   r<   r	   ry   r   r%   r%   rg   r&   r  J  s*    r  c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )AltCLIPVisionEmbeddingsrf   c                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebiasr   r   rH   rI   rK   )rP   rQ   rf   rT   r   
image_size
patch_sizer   	Parameterr"   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsrR   position_embeddingr^   r#   r`   re   rg   r%   r&   rQ     s"   
"z AltCLIPVisionEmbeddings.__init__rx   heightwidthr   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   NrJ   g      ?r   r   bicubicF)rb   modealign_cornersr|   )rr   r0  weightrz   r"   jit
is_tracingrH   r'  r   r	  r   r   r    interpolater   cat)r:   rx   r1  r2  r.  r0  r/  class_pos_embedpatch_pos_embedrl   
new_height	new_widthsqrt_num_positionsr%   r%   r&   interpolate_pos_encoding  s*   



z0AltCLIPVisionEmbeddings.interpolate_pos_encodingFpixel_valuesc              
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| jjj}| |j|d}|ddd}| j	
|dd}	tj|	|gdd	}
|r[|
| |
|| }
|
S |
| | j }
|
S )
NzInput image size (*z) doesn't match model (r   rN   r   r   rJ   r|   )rr   r&  r   r-  r6  rO   r   flattenr   r*  r`   r"   r:  r@  r0  rH   )r:   rA  r@  ru   r  r1  r2  target_dtypepatch_embedsclass_embedsrx   r%   r%   r&   ry     s    
zAltCLIPVisionEmbeddings.forwardr  )r>   r?   r@   r   rQ   r"   r   r~   r@  rB   ry   r   r%   r%   rg   r&   r     s     )r   c                   @   s6   e Zd ZU eed< dZdZdZg Ze	
 dd ZdS )AltCLIPPreTrainedModelrf   altclip)imagetextTc                 C   s  | j j}t|trE| j j}tj|jd|jd | d tj|jj	|j j
| d tj|jj	|j j
| d t|jt|jd dS t|tr| j j}|jd d|j j d  | }|jd | }tj|jj	|d tj|jj	|d tj|jj	|d tj|jj	|d dS t|tr| j j}|j jd d|j j d  | }d|j j d | }tj|jj	|d tj|jj	|d dS t|trtj|jj	|jd | j j d tj|jj	|j d | j j d dS t|t!j"rt#|j$ t%|j	 dS t|t!j&rtj|j	d| j jd |j$durt#|j$ dS dS t|t!j'rMtj|j	d| j jd |j(durIt)|j	dd	sKt#|j	|j(  dS dS dS t|t*rlt|jt|jj+d
 d t#|j, dS dS )zInitialize the weightsr   r   )meanstd)rL  rI   r   N_is_hf_initializedFrJ   )-rf   initializer_factorr   r   initnormal_r*  r   r-  r6  initializer_ranger0  copy_rH   r"   r#   r/  r`   r   r   r  r  r  r  r  rT   r  r  AltCLIPModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   rY   zeros_r%  ones_r   rR   rE   r5   rD   rr   rM   )r:   r   factorin_proj_stdout_proj_stdfc_stdr%   r%   r&   _init_weights  s^   
 

 

"z$AltCLIPPreTrainedModel._init_weightsN)r>   r?   r@   r   rC   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_moduler"   no_gradr^  r%   r%   r%   r&   rG    s   
 rG  c                       sr   e Zd Zdef fddZee					ddejdB de	dB de	dB d	e	dB d
e	dB de
eB fddZ  ZS )AltCLIPVisionTransformerrf   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )rP   rQ   rf   rT   r   rx   r   rY   rZ   pre_layrnormr  encoderpost_layernorm)r:   rf   r   rg   r%   r&   rQ   )  s   


z!AltCLIPVisionTransformer.__init__NFrA  r   r   r   r@  r   c           
      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| j||d}| |}| j|||dd}|d }|d d dd d f }	| |	}	t	||	|j
|jdS )Nz You have to specify pixel_values)r@  T)rj   r   r   r   r   r   pooler_outputr   r   )rf   r   r   r  r   rx   re  rf  rg  r
   r   r   )
r:   rA  r   r   r   r@  r   encoder_outputsr   r   r%   r%   r&   ry   3  s.   


z AltCLIPVisionTransformer.forward)NNNNF)r>   r?   r@   r   rQ   r   r   r"   rB   r   r<   r
   ry   r   r%   r%   rg   r&   rd  (  s,    
rd  c                       s   e Zd ZU eed< dZdZdef fddZdej	fddZ
e							
		ddejd	B ded	B ded	B deded	B dee deeB fddZ  ZS )AltCLIPVisionModelrf   rA  )rI  c                    s"   t  | t|| _|   d S r   )rP   rQ   rd  vision_model	post_initre   rg   r%   r&   rQ   a  s   
zAltCLIPVisionModel.__init__r   c                 C   
   | j jjS r   )rl  rx   r-  r9   r%   r%   r&   get_input_embeddingsg     
z'AltCLIPVisionModel.get_input_embeddingsNFr   r   r@  r   r   c                 K   s(   |dur|n| j j}| j|||||dS )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AltCLIPVisionModel

        >>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```NrA  r   r   r@  r   )rf   r  rl  )r:   rA  r   r   r@  r   r   r%   r%   r&   ry   j  s    zAltCLIPVisionModel.forward)NNNFN)r>   r?   r@   r   rC   main_input_namer`  rQ   r   Modulero  r   r"   rB   r   r   r   r<   r
   ry   r   r%   r%   rg   r&   rk  \  s6   
 rk  aE  
    The model behaves as an encoder following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    )custom_introc                       s   e Zd ZU eed< d fdd	Zdd Zdd Ze																dd
e	j
d	B de	j
d	B de	j
d	B de	j
d	B de	j
d	B ded	B ded	B ded	B dee	j
 eB fddZ  ZS )AltRobertaModelrf   Tc                    sD   t  | || _t|| _t|| _|rt|nd| _| 	  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rP   rQ   rf   rD   rx   r   rf  r   poolerrm  )r:   rf   add_pooling_layerrg   r%   r&   rQ     s   

zAltRobertaModel.__init__c                 C   s   | j jS r   rx   rV   r9   r%   r%   r&   ro    s   z$AltRobertaModel.get_input_embeddingsc                 C   s   || j _d S r   rx  r:   r   r%   r%   r&   set_input_embeddings     z$AltRobertaModel.set_input_embeddingsNri   r   rM   rH   rj   r   r   r   r   c	                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|
\}}|d urU|jn|j}|d u retj	||f|d}|d u rt
| jdr| jjd d d |f }|||}|}n	tj|
tj|d}| ||
}| j||||d}| j||||dd	}|d
 }| jd ur| |nd }t|||j|jdS )NzDYou cannot specify both input_ids and inputs_embeds at the same timerJ   z5You have to specify either input_ids or inputs_embedsr   rM   rn   )ri   rH   rM   rj   T)r   r   r   r   r   rh  )rf   r   r   r  r   %warn_if_padding_and_no_attention_maskrb   r   r"   onesrq   rx   rM   r`   ra   rc   get_extended_attention_maskrf  rv  r
   r   r   )r:   ri   r   rM   rH   rj   r   r   r   r   rt   ru   rv   r   rw    buffered_token_type_ids_expandedextended_attention_maskembedding_outputrj  sequence_outputr   r%   r%   r&   ry     sX   
zAltRobertaModel.forward)TNNNNNNNN)r>   r?   r@   r   rC   rQ   ro  rz  r   r"   r   r   r<   r   ry   r   r%   r%   rg   r&   ru    sB   
 
	ru  c                       s   e Zd ZU eed< dZ fddZdejfddZ	dej
dd	fd
dZdded	B dej
f fddZee																ddejd	B dejd	B dejd	B dejd	B dejd	B ded	B ded	B ded	B dee deeB fddZ  ZS )AltCLIPTextModelrf   )rJ  c                    sL   t  | t|dd| _t|j|j| _tj	|j|j
d| _|   d S )NF)rw  rF   )rP   rQ   ru  robertar   r   rT   project_dimtransformationrY   rZ   pre_LNrm  re   rg   r%   r&   rQ     s
   zAltCLIPTextModel.__init__r   c                 C   rn  r   r  rx   rV   r9   r%   r%   r&   ro    rp  z%AltCLIPTextModel.get_input_embeddingsr   Nc                 C   s   || j j_d S r   r  ry  r%   r%   r&   rz    s   z%AltCLIPTextModel.set_input_embeddingsnew_num_tokensc                    s   t  |S r   )rP   resize_token_embeddings)r:   r  rg   r%   r&   r    r{  z(AltCLIPTextModel.resize_token_embeddingsri   r   rM   rH   rj   r   r   r   r   c	              
   K   sn   |dur|n| j j}| j|||||||dd}
|
d }| |}| |}|dddf }t|||
j|
jdS )a+  
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPTextModel

        >>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> texts = ["it's a cat", "it's a dog"]

        >>> inputs = processor(text=texts, padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```NT)ri   r   rM   rH   rj   r   r   r   r   rh  )rf   r  r  r  r  r   r   r   )r:   ri   r   rM   rH   rj   r   r   r   r   r   r  projection_stateri  r%   r%   r&   ry     s*    

zAltCLIPTextModel.forwardr   r  )r>   r?   r@   r   rC   r`  rQ   r   rs  ro  rR   rz  r~   r  r   r   r"   r   r   r   r   r<   r   ry   r   r%   r%   rg   r&   r     sL   
 	
r  c                       s4  e Zd ZU eed< def fddZee			ddej	dej	dB dej	dB dej	dB d	e
e d
eeB fddZee	ddejded	e
e d
eeB fddZe										ddejdB dejdB dej	dB dejdB dej	dB dedB dedB dedB dededB d	e
e d
eeB fddZ  ZS )rS  rf   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	|_	|j
| _
|j| _|j| _t|| _t|| _tj| j| j
dd| _tj| j| j
dd| _tt| jj| _|   d S )NzRconfig.vision_config is expected to be of type AltCLIPVisionConfig but is of type .zNconfig.text_config is expected to be of type AltCLIPTextConfig but is of type F)r%  )rP   rQ   r   vision_configr   	TypeErrortypetext_configr   r   projection_dimr  rU  rT   rW  r  
text_modelrd  rl  r   r   rV  rT  r(  r"   tensorrf   logit_scale_init_valuelogit_scalerm  )r:   rf   r  r  rg   r%   r&   rQ   V  s2   

zAltCLIPModel.__init__Nri   r   rH   rM   r   r   c                 K   s2   | j d||||dd|}|j}| ||_|S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)ri   r   rH   rM   r   Nr%   )r  ri  rT  )r:   ri   r   rH   rM   r   text_outputsr   r%   r%   r&   get_text_featuresw  s   zAltCLIPModel.get_text_featuresFrA  r@  c                 K   s.   | j d||dd|}|j}| ||_|S )ao  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```T)rA  r@  r   Nr%   )rl  ri  rV  )r:   rA  r@  r   vision_outputsr   r%   r%   r&   get_image_features  s   zAltCLIPModel.get_image_featuresreturn_lossr   r   r   c              	   K   s(  |dur|n| j j}|dur|n| j j}|
dur|
n| j j}
| j|||||||
d}| j||||	|
d}|d }| |}|d }| |}||jdddd }||jdddd }| j	
 }t|| | }|j}d}|rtt|}|
s||||||f}|dur|f| S |S t|||||||d	S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```N)ri   r   rM   rH   r   r   r   rq  r   r   rJ   T)r   rl   keepdim)r.   r/   r0   r1   r2   r3   r4   )rf   r   r   r  r  rl  rV  rT  normr  expr"   r   r)   Tr,   r-   )r:   ri   rA  r   rH   rM   r  r   r   r@  r   r   r  r  r2   r1   r  r0   r/   r.   r   r%   r%   r&   ry     sX   (



zAltCLIPModel.forward)NNNr  )
NNNNNNNNFN)r>   r?   r@   r   rC   rQ   r   r   r"   r   r   r   r<   r
   r  rB   r   r  r   r-   ry   r   r%   r%   rg   r&   rS  S  s   
 !##	
rS  )rG  rk  r  rS  )r   )IrA   r   collections.abcr   dataclassesr   typingr   r"   torch.nnr    r   rO  activationsr   modeling_layersr   modeling_outputsr	   r
   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   configuration_altclipr   r   r   
get_loggerr>   loggerr   r'   r,   r-   rs  rD   r   r   r   r   r   r   r   r   r   floatr   r   r  r  r  r   rG  rd  rk  ru  r  rS  __all__r%   r%   r%   r&   <module>   s    
$j;'.
A2QS949	bS Q