o
    	۷i                     @   s  d Z ddlmZ ddlmZmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZmZmZmZmZ ddlm Z m!Z!m"Z" e#e$Z%dej&dej&fddZ'dej&dej&fddZ(dej&dej&fddZ)eeddG dd deZ*eeddG dd deZ+eeG d d! d!eZ,G d"d# d#e	j-Z.G d$d% d%e	j-Z/	&	'dPd(e	j-d)ej&d*ej&d+ej&d,eej& d-e0d.e0d/e1fd0d1Z2G d2d3 d3e	j-Z3G d4d5 d5e	j-Z4G d6d7 d7eZ5eG d8d9 d9eZ6G d:d; d;e	j-Z7G d<d= d=e	j-Z8ed>dG d?d@ d@e6Z9G dAdB dBe	j-Z:edCdG dDdE dEe6Z;eG dFdG dGe6Z<eG dHdI dIe6Z=eG dJdK dKe6Z>edLdG dMdN dNe6Z?g dOZ@dS )QzPyTorch CLIP model.    )	dataclass)AnyCallableOptionalUnionN)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ModelOutputauto_docstringcan_return_tuplefilter_out_non_signature_kwargslogging	torch_int   )
CLIPConfigCLIPTextConfigCLIPVisionConfiglogitsreturnc                 C   s   t j| tjt| | jdS )Ndevice)r   
functionalcross_entropytorcharangelenr   )r    r%   \/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.pycontrastive_loss%   s   r'   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)r'   t)r(   caption_loss
image_lossr%   r%   r&   	clip_loss)   s   r,   tensorc                 C   s,   t | d}t j|ddd}t |d}|S )z
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
       T)dimkeepdim      ?)r"   powsum)r-   square_tensor
sum_tensornormed_tensorr%   r%   r&   _get_vector_norm/   s   r8   z}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                   @   j   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dS )CLIPVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r<   r   r"   FloatTensor__annotations__r=   r>   tupler?   r%   r%   r%   r&   r;   :      
 r;   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                   @   r:   )CLIPTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr=   .r>   r?   )r@   rA   rB   rC   rI   r   r"   rD   rE   r=   r>   rF   r?   r%   r%   r%   r&   rH   L   rG   rH   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeed< dZeed	< d
ee fddZdS )
CLIPOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPVisionModel`].
    Nlosslogits_per_imagelogits_per_textrI   r<   text_model_outputvision_model_outputr   c                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))rN   rO   N)getattrto_tuple).0kselfr%   r&   	<genexpr>}   s
    
z&CLIPOutput.to_tuple.<locals>.<genexpr>)rF   keysrT   r%   rT   r&   rQ   |   s   zCLIPOutput.to_tuple)r@   rA   rB   rC   rK   r   r"   rD   rE   rL   rM   rI   r<   rN   r   rO   rF   r   rQ   r%   r%   r%   r&   rJ   ^   s   
 rJ   c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )CLIPVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebiasr.   r   position_idsr   r/   
persistent)super__init__rY   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr"   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr#   expandrU   rY   	__class__r%   r&   rd      s"   
"zCLIPVisionEmbeddings.__init__
embeddingsheightwidthr   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr/   r2   r   r.   bicubicF)sizemodealign_cornersr0   )shaperr   weight	unsqueezer"   jit
is_tracingr_   rh   r   reshapepermuter   r    interpolateviewcat)rU   rx   ry   rz   ro   rr   rp   class_pos_embedpatch_pos_embedr0   
new_height	new_widthsqrt_num_positionsr%   r%   r&   interpolate_pos_encoding   s*   



z-CLIPVisionEmbeddings.interpolate_pos_encodingFpixel_valuesc              
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| jjj}| |j|d}|ddd}| j	
|dd}	tj|	|gdd	}
|r[|
| |
|| }
|
S |
| | j }
|
S )
NzInput image size (*z) doesn't match model ().)dtyper.   r   r/   r   )r   rg   
ValueErrorrn   r   r   toflatten	transposerk   rt   r"   r   r   rr   r_   )rU   r   r   
batch_size_ry   rz   target_dtypepatch_embedsclass_embedsrx   r%   r%   r&   forward   s    
zCLIPVisionEmbeddings.forwardF)r@   rA   rB   r   rd   r"   Tensorintr   rD   r   __classcell__r%   r%   rv   r&   rX      s     )rX   c                	       sX   e Zd Zdef fddZ			ddeej deej deej dej	fd	d
Z
  ZS )CLIPTextEmbeddingsrY   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )Nr_   r`   Fra   )rc   rd   re   r   rq   
vocab_sizetoken_embeddingmax_position_embeddingsrr   rs   r"   r#   rt   rU   rY   rf   rv   r%   r&   rd      s   

zCLIPTextEmbeddings.__init__N	input_idsr_   inputs_embedsr   c                 C   s   |d ur	|j d n|j d }| jjj d }||kr#td| d| |d u r2| jd d d |f }|d u r;| |}| |}|| }|S )Nr/   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   rr   r   r   r_   r   )rU   r   r_   r   
seq_lengthmax_position_embeddingposition_embeddingsrx   r%   r%   r&   r      s"   

zCLIPTextEmbeddings.forward)NNN)r@   rA   rB   r   rd   r   r"   
LongTensorrD   r   r   r   r%   r%   rv   r&   r      s    r           Tmodulequerykeyvalueattention_maskscalingdropoutoutput_attentionsc                 K   s   t ||dd| }	|d ur|	| }	tjj|	dt jd|j}	tjj	|	|| j
d}	t |	|}
|
dd }
|s>d }	|
|	fS )Nr/   r   )r0   r   )ptrainingr   r.   )r"   matmulr   r   r    softmaxfloat32r   r   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr%   r%   r&   eager_attention_forward   s   r   c                       sv   e Zd ZdZdeeef f fddZ			ddej	de
ej	 d	e
ej	 d
e
e deej	e
ej	 f f
ddZ  ZS )CLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrY   c                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)rc   rd   rY   re   rf   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projru   rv   r%   r&   rd     s$   

zCLIPAttention.__init__NFr>   r   causal_attention_maskr   r   c                 C   s,  |j \}}}| |}| |}	| |}
|||d| jdd}|	||d| jdd}	|
||d| jdd}
| jjdkrH|du| _	n|durU|durU|| }n|dur[|}t
}| jjdkrit| jj }|| ||	|
|| j	| j| jsxdn| j|d	\}}|||| }| |}|sd}||fS )	z#Input shape: Batch x Time x Channelr/   r   r.   flash_attention_2Neagerr   )r   r   r   r   )r   r   r   r   r   r   r   rY   _attn_implementationr   r   r   r   r   r   r   r   r   )rU   r>   r   r   r   r   r   rf   queriesrW   valuesattention_interfacer   r   r%   r%   r&   r   -  s@   	





zCLIPAttention.forward)NNF)r@   rA   rB   rC   r   r   r   rd   r"   r   r   boolrF   r   r   r%   r%   rv   r&   r     s"    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )CLIPMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)rc   rd   rY   r	   
hidden_actactivation_fnr   r   re   intermediate_sizefc1fc2ru   rv   r%   r&   rd   b  s
   
zCLIPMLP.__init__r>   r   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )rU   r>   r%   r%   r&   r   i  s   


zCLIPMLP.forward)r@   rA   rB   rd   r"   r   r   r   r%   r%   rv   r&   r   a  s    r   c                       s\   e Zd Zdeeef f fddZ	ddejdejdejde	e
 d	eej f
d
dZ  ZS )CLIPEncoderLayerrY   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S N)eps)rc   rd   re   rf   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2ru   rv   r%   r&   rd   q  s   


zCLIPEncoderLayer.__init__Fr>   r   r   r   r   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r0||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r>   r   r   r   )r   r   r   r   )rU   r>   r   r   r   residualr   outputsr%   r%   r&   r   y  s"   




zCLIPEncoderLayer.forwardr   )r@   rA   rB   r   r   r   rd   r"   r   r   r   rF   rD   r   r   r%   r%   rv   r&   r   p  s    r   c                   @   s6   e Zd ZU eed< dZdZdZdZdZ	dZ
dd ZdS )CLIPPreTrainedModelrY   clipTc                 C   s  | j j}t|tr#|jjjjd|d d |jjjjd|d d n)t|t	rX| j j}t
jj|jd|jd | d t
jj|jj|j j| d t
jj|jj|j j| d nt|tr| j j}|jd d|j j d  | }|jd | }t
jj|jj|d t
jj|jj|d t
jj|jj|d t
jj|jj|d nt|tr| j j}|j jd d|j j d  | }d|j j d | }t
jj|jj|d t
jj|jj|d ntt|trt
jj|jj|jd | j j d t
jj|jj|jd | j j d nKt|trt
jj|jj| j jd | j j d n2t|t r3t
jj|jj| j jd | j j d nt|t!rLt
jj|j"j| j j#jd | j j d t|t
j$r`|j%j&  |jj'd t|t
j(ru|j%durw|j%j&  dS dS dS )	zInitialize the weightsr   g{Gz?)meanstdr   )r   r.   g      ?N))rY   initializer_factor
isinstancer   r   r   datanormal_rr   rX   r   initrk   rf   rn   initializer_ranger   num_hidden_layersr   r   r   r   r   re   r   r   	CLIPModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimCLIPVisionModelWithProjectionCLIPTextModelWithProjectionCLIPForImageClassification
classifiervision_configr   r^   zero_fill_r   )rU   r   factorin_proj_stdout_proj_stdfc_stdr%   r%   r&   _init_weights  sj   



 z!CLIPPreTrainedModel._init_weightsN)r@   rA   rB   r   rE   base_model_prefixsupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr  r%   r%   r%   r&   r     s   
 r   c                       sb   e Zd ZdZdef fddZ				ddeej deej dee	 d	ee	 d
e
f
ddZ  ZS )CLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    rY   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r%   )r   )rR   r   rY   r%   r&   
<listcomp>  s    z(CLIPEncoder.__init__.<locals>.<listcomp>F)	rc   rd   rY   r   
ModuleListranger   layersgradient_checkpointingru   rv   r  r&   rd     s   
 
zCLIPEncoder.__init__Nr   r   r   output_hidden_statesr   c                 C   s   |dur|n| j j}|dur|n| j j}|rdnd}|rdnd}|}t| jD ] \}	}
|r2||f }|
||||d}|d }|rG||d f }q'|rO||f }t|||dS )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr%   )r   r   r   )r=   r>   r?   )rY   r   r  	enumerater  r   )rU   r   r   r   r   r  encoder_statesall_attentionsr>   idxencoder_layerlayer_outputsr%   r%   r&   r     s4   %

zCLIPEncoder.forwardNNNN)r@   rA   rB   rC   r   rd   r   r"   r   r   r   r   r   r%   r%   rv   r&   r
    s$    	r
  c                       sn   e Zd Zdef fddZe					ddeej deej deej dee	 d	ee	 d
e
fddZ  ZS )CLIPTextTransformerrY   c                    sH   t    || _|j}t|| _t|| _tj	||j
d| _|j| _d S r   )rc   rd   rY   re   r   rx   r
  encoderr   r   r   final_layer_normeos_token_idr   rv   r%   r&   rd   <  s   


zCLIPTextTransformer.__init__Nr   r   r_   r   r  r   c                 C   sF  |d ur|n| j j}|d ur|n| j j}|d u rtd| }|d|d }| j||d}t||j|j	d}|d urH| j j
dkrHt||j}| j|||||d}	|	j}
| |
}
| jdkrz|
tj|
jd |
j	d|jtj|
j	d	jdd
f }n|
tj|
jd |
j	d|jtj|
j	d	| jk jdd
f }t|
||	j|	jdS )NzYou have to specify input_idsr/   )r   r_   r   r   )r   r   r   r   r  r.   r   )r   r   r   r=   pooler_outputr>   r?   )rY   r   r  r   r|   r   rx   r
   r   r   r   r   r  r=   r  r  r"   r#   r   r   r   argmaxr   r>   r?   )rU   r   r   r_   r   r  input_shaper>   r   encoder_outputsr=   pooled_outputr%   r%   r&   r   G  sT   	


	zCLIPTextTransformer.forwardNNNNN)r@   rA   rB   r   rd   r   r   r"   r   r   r   r   r   r%   r%   rv   r&   r  ;  s*    r  zI
    The text model from CLIP without any head or projection on top.
    c                       s   e Zd ZU eed< ddgZdZdef fddZdej	fdd	Z
d
d Zee					ddeej deej deej dee dee defddZ  ZS )CLIPTextModelrY   r   r   Fc                    "   t  | t|| _|   d S r   )rc   rd   r  
text_model	post_initru   rv   r%   r&   rd        
zCLIPTextModel.__init__r   c                 C   
   | j jjS r   r&  rx   r   rT   r%   r%   r&   get_input_embeddings     
z"CLIPTextModel.get_input_embeddingsc                 C      || j j_d S r   r*  rU   r   r%   r%   r&   set_input_embeddings     z"CLIPTextModel.set_input_embeddingsNr   r   r_   r   r  c                 C   s   | j |||||dS )a9  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   r_   r   r  )r&  )rU   r   r   r_   r   r  r%   r%   r&   r     s   zCLIPTextModel.forwardr#  )r@   rA   rB   r   rE   _no_split_modulesr  rd   r   Moduler+  r/  r   r   r   r"   r   r   r   r   r   r%   r%   rv   r&   r$    s6   
 r$  c                       s`   e Zd Zdef fddZe				ddeej dee	 dee	 d	ee	 d
e
f
ddZ  ZS )CLIPVisionTransformerrY   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )rc   rd   rY   re   rX   rx   r   r   r   pre_layrnormr
  r  post_layernormr   rv   r%   r&   rd     s   


zCLIPVisionTransformer.__init__NFr   r   r  r   r   c           	      C   s   |d ur|n| j j}|d ur|n| j j}|d u rtd| j||d}| |}| j|||d}|j}|d d dd d f }| |}t	|||j
|jdS )Nz You have to specify pixel_values)r   )r   r   r  r   r  )rY   r   r  r   rx   r5  r  r=   r6  r   r>   r?   )	rU   r   r   r  r   r>   r!  r=   r"  r%   r%   r&   r     s*   

zCLIPVisionTransformer.forwardNNNF)r@   rA   rB   r   rd   r   r   r"   rD   r   r   r   r   r%   r%   rv   r&   r4    s$    
r4  zK
    The vision model from CLIP without any head or projection on top.
    c                       s   e Zd ZU eed< dZdgZdef fddZdej	fddZ
ee							
ddeej dee dee dedef
ddZ  ZS )CLIPVisionModelrY   r   r   c                    r%  r   )rc   rd   r4  vision_modelr'  ru   rv   r%   r&   rd     r(  zCLIPVisionModel.__init__r   c                 C   r)  r   r9  rx   rn   rT   r%   r%   r&   r+    r,  z$CLIPVisionModel.get_input_embeddingsNFr   r  r   c                 C   s   | j ||||dS )a  
        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r  r   )r9  )rU   r   r   r  r   r%   r%   r&   r     s   zCLIPVisionModel.forwardr7  )r@   rA   rB   r   rE   main_input_namer2  rd   r   r3  r+  r   r   r   r"   rD   r   r   r   r   r%   r%   rv   r&   r8    s.   
 r8  c                       s  e Zd ZU eed< g dZdZdef fddZe e			dde
jdee
j d	ee
j d
e
jfddZe e		dde
jded
e
jfddZee									ddee
j dee
j dee
j d	ee
j dee dee dee ded
efddZ  ZS )r   rY   )r   r   rX   Fc                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	| _	|j
| _|j
| _t|}|j| _t|}|j| _tj| j| j	dd| _tj| j| j	dd| _tt| jj| _|   d S )NzKconfig.text_config is expected to be of type CLIPTextConfig but is of type .zOconfig.vision_config is expected to be of type CLIPVisionConfig but is of type Fr^   )rc   rd   r   text_configr   	TypeErrortyper   r   projection_dimre   r   r   r$  _from_configr&  r8  r9  r   r   r   r   ri   r"   r-   rY   logit_scale_init_valuelogit_scaler'  )rU   rY   r?  r   r&  r9  rv   r%   r&   rd   :  s4   

zCLIPModel.__init__Nr   r   r_   r   c                 C   s$   | j |||d}|j}| |}|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPTextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)r   r   r_   )r&  r  r   )rU   r   r   r_   text_outputsr"  text_featuresr%   r%   r&   get_text_features]  s   
zCLIPModel.get_text_featuresr   r   c                 C   s"   | j ||d}|j}| |}|S )ai  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPVisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```)r   r   )r9  r  r   )rU   r   r   vision_outputsr"  image_featuresr%   r%   r&   get_image_features  s   
zCLIPModel.get_image_featuresreturn_lossr   r  c	              	   C   s   |dur|n| j j}|dur|n| j j}| j||||d}	| j|||||d}
|	j}| |}|
j}| |}|t| }|t| }t	
|| |j}|| j |j }| }d}|rft|}t||||||
|	dS )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr;  r1  )rK   rL   rM   rI   r<   rN   rO   )rY   r   r  r9  r&  r  r   r   r8   r"   r   r)   r   r   rE  expr,   rJ   )rU   r   r   r   r_   rL  r   r  r   rI  rF  r<   rI   rM   rL   rK   r%   r%   r&   r     sJ   (

zCLIPModel.forward)NNr   )NNNNNNNF)r@   rA   rB   r   rE   r2  r  rd   r   r   r"   r   r   rD   rH  r   rK  r   r   rJ   r   r   r%   r%   rv   r&   r   4  sr   
 ##%	
r   c                       s   e Zd ZU eed< dZddgZdef fddZdej	fdd	Z
d
d Zee					ddeej deej deej dee dee defddZ  ZS )r   rY   Fr   r   c                    @   t  | t|}|j| _tj|j|jdd| _	| 
  d S NFr>  )rc   rd   r$  rC  r&  r   r   re   rB  r   r'  )rU   rY   r&  rv   r%   r&   rd     
   
z$CLIPTextModelWithProjection.__init__r   c                 C   r)  r   r*  rT   r%   r%   r&   r+    r,  z0CLIPTextModelWithProjection.get_input_embeddingsc                 C   r-  r   r*  r.  r%   r%   r&   r/    r0  z0CLIPTextModelWithProjection.set_input_embeddingsNr   r   r_   r   r  c           	      C   s:   | j |||||d}|j}| |}t||j|j|jdS )a@  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

        >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```r1  )rI   r=   r>   r?   )r&  r  r   rH   r=   r>   r?   )	rU   r   r   r_   r   r  rF  r"  rI   r%   r%   r&   r     s   
z#CLIPTextModelWithProjection.forwardr#  )r@   rA   rB   r   rE   r  r2  rd   r   r3  r+  r/  r   r   r   r"   r   r   rH   r   r   r%   r%   rv   r&   r     s6   
 r   c                       s~   e Zd ZU eed< dZdef fddZdejfddZ	e
e					ddeej d
ee dee dedef
ddZ  ZS )r   rY   r   c                    rN  rO  )rc   rd   r8  rC  r9  r   r   re   rB  r   r'  rU   rY   r9  rv   r%   r&   rd   N  rP  z&CLIPVisionModelWithProjection.__init__r   c                 C   r)  r   r:  rT   r%   r%   r&   r+  Y  r,  z2CLIPVisionModelWithProjection.get_input_embeddingsNFr   r  r   c                 C   s8   | j ||||d}|j}| |}t||j|j|jdS )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection
        >>> from transformers.image_utils import load_image

        >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> image_embeds = outputs.image_embeds
        ```r;  )r<   r=   r>   r?   )r9  r  r   r;   r=   r>   r?   )rU   r   r   r  r   rI  r"  r<   r%   r%   r&   r   \  s   
z%CLIPVisionModelWithProjection.forwardr7  )r@   rA   rB   r   rE   r<  rd   r   r3  r+  r   r   r   r"   rD   r   r;   r   r   r%   r%   rv   r&   r   I  s,   
 r   z
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                       sn   e Zd ZdZdeddf fddZee				ddee	j
 dee	j
 dee d	ee def
d
dZ  ZS )r   r   rY   r   Nc                    sZ   t  | |j| _t|j}|j| _|jdkr"t|jj	|jnt
 | _|   d S )Nr   )rc   rd   
num_labelsr8  rC  r   r9  r   r   re   Identityr   r'  rQ  rv   r%   r&   rd     s   "z#CLIPForImageClassification.__init__labelsr   r  c           	      C   s   |dur|n| j j}|dur|n| j j}| j|||d}|j}tj|ddddddf dd}| |}d}|durD| ||| j }t	|||j
|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r  r   r   )rK   r   r>   r?   )rY   r   r  r9  r=   r"   r   r   loss_functionr   r>   r?   )	rU   r   rT  r   r  r   sequence_outputr   rK   r%   r%   r&   r     s(   $
z"CLIPForImageClassification.forwardr  )r@   rA   rB   r<  r   rd   r   r   r   r"   r   r   r   r   r   r%   r%   rv   r&   r     s(    r   )r   r   r$  r   r8  r   r   )r   T)ArC   dataclassesr   typingr   r   r   r   r"   r   activationsr	   modeling_attn_mask_utilsr
   r   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   utilsr   r   r   r   r   r   configuration_clipr   r   r   
get_loggerr@   loggerr   r'   r,   r8   r;   rH   rJ   r3  rX   r   floatr   r   r   r   r   r   r
  r  r$  r4  r8  r   r   r   r   __all__r%   r%   r%   r&   <module>   s    
#S/
K2BVV504 PDA?