"""PyTorch CLIP model."""

from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

import torch
from torch import nn

from ... import initialization as init
from ...activations import ACT2FN
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging, torch_int
from ...utils.generic import can_return_tuple, merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig


logger = logging.get_logger(__name__)


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
    """
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and is used to make
    the model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    """
    square_tensor = torch.pow(tensor, 2)
    sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
    normed_tensor = torch.pow(sum_tensor, 0.5)
    return normed_tensor
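
# A minimal sanity check of the helpers above (illustrative only, not part of the
# module's API): a similarity matrix with a dominant diagonal pairs each caption
# with its own image and should score a much lower loss than an uninformative
# all-zeros matrix, and `_get_vector_norm` matches `Tensor.norm`:
#
#     sim = torch.tensor([[10.0, 0.0], [0.0, 10.0]])
#     assert clip_loss(sim) < clip_loss(torch.zeros(2, 2))
#     x = torch.randn(4, 8)
#     assert torch.allclose(_get_vector_norm(x), x.norm(p=2, dim=-1, keepdim=True))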


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class CLIPVisionModelOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class CLIPTextModelOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None


@dataclass
@auto_docstring
class CLIPOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPVisionModel`].
    """

    loss: torch.FloatTensor | None = None
    logits_per_image: torch.FloatTensor | None = None
    logits_per_text: torch.FloatTensor | None = None
    text_embeds: torch.FloatTensor | None = None
    image_embeds: torch.FloatTensor | None = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
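
# Shape conventions (illustrative): with N images and M texts in a batch,
# `logits_per_image` is (N, M), `logits_per_text` is its transpose (M, N), and
# both hold cosine similarities scaled by `logit_scale.exp()`. Zero-shot
# classification over M candidate captions therefore reads off
# `outputs.logits_per_image.softmax(dim=-1)` per image.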


class CLIPVisionEmbeddings(nn.Module):
    def __init__(self, config: CLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class CLIPTextEmbeddings(nn.Module):
    def __init__(self, config: CLIPTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding})"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class CLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: CLIPTextConfig | CLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)

        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
            self.config._attn_implementation, eager_attention_forward
        )

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
            **kwargs,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class CLIPMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class CLIPEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: CLIPTextConfig | CLIPVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask, **kwargs)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class CLIPPreTrainedModel(PreTrainedModel):
    config: CLIPConfig
    base_model_prefix = "clip"
    input_modalities = ["image", "text"]
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    _can_record_outputs = {
        "hidden_states": CLIPEncoderLayer,
        "attentions": CLIPAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, CLIPTextEmbeddings):
            init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02)
            init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02)
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, CLIPVisionEmbeddings):
            factor = self.config.initializer_factor
            init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
            init.copy_(module.position_ids, torch.arange(module.num_positions).expand((1, -1)))
        elif isinstance(module, CLIPAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            init.normal_(module.q_proj.weight, std=in_proj_std)
            init.normal_(module.k_proj.weight, std=in_proj_std)
            init.normal_(module.v_proj.weight, std=in_proj_std)
            init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, CLIPMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            init.normal_(module.fc1.weight, std=fc_std)
            init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, CLIPModel):
            init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, CLIPVisionModelWithProjection):
            init.normal_(
                module.visual_projection.weight,
                std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, CLIPTextModelWithProjection):
            init.normal_(
                module.text_projection.weight,
                std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, CLIPForImageClassification):
            init.normal_(
                module.classifier.weight,
                std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
            )

        if isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
        if isinstance(module, nn.Linear) and module.bias is not None:
            init.zeros_(module.bias)


class CLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    """

    def __init__(self, config: CLIPConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        """
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states, attention_mask, **kwargs)

        return BaseModelOutput(last_hidden_state=hidden_states)


class CLIPTextTransformer(CLIPPreTrainedModel):
    config: CLIPTextConfig
    input_modalities = "text"
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = CLIPTextEmbeddings(config)
        self.encoder = CLIPEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # For `pooled_output` computation
        self.eos_token_id = config.eos_token_id

        self.post_init()

    @merge_with_config_defaults
    @capture_outputs(tie_last_hidden_states=False)
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        attention_mask = create_causal_mask(
            config=self.config,
            input_embeds=hidden_states,
            attention_mask=attention_mask,
            cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device),
            past_key_values=None,
        )

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        if self.eos_token_id == 2:
            # Older CLIP checkpoints use eos_token_id == 2 and locate the EOS position via
            # argmax over token ids (the EOS token has the highest id in those vocabularies).
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
            ]
        else:
            # Newer configs carry an explicit eos_token_id, so extra new tokens are possible:
            # pool the hidden state at the first occurrence of the EOS token.
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
                .int()
                .argmax(dim=-1),
            ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


@auto_docstring(
    custom_intro="""
    The text model from CLIP without any head or projection on top.
    """
)
class CLIPTextModel(CLIPPreTrainedModel):
    config: CLIPTextConfig
    input_modalities = "text"
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)
        self.text_model = CLIPTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )


class CLIPVisionTransformer(CLIPPreTrainedModel):
    config: CLIPVisionConfig
    main_input_name = "pixel_values"
    input_modalities = "image"
    _no_split_modules = ["CLIPEncoderLayer"]

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = CLIPVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = CLIPEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @merge_with_config_defaults
    @capture_outputs(tie_last_hidden_states=False)
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs: BaseModelOutput = self.encoder(inputs_embeds=hidden_states, **kwargs)

        last_hidden_state = encoder_outputs.last_hidden_state
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


@auto_docstring(
    custom_intro="""
    The vision model from CLIP without any head or projection on top.
    """
)
class CLIPVisionModel(CLIPPreTrainedModel):
    config: CLIPVisionConfig
    main_input_name = "pixel_values"
    input_modalities = "image"
    _no_split_modules = ["CLIPEncoderLayer"]

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)
        self.vision_model = CLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )


@auto_docstring
class CLIPModel(CLIPPreTrainedModel):
    config: CLIPConfig
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]

    def __init__(self, config: CLIPConfig):
        super().__init__(config)

        if not isinstance(config.text_config, CLIPTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type CLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, CLIPVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        text_model = CLIPTextModel._from_config(text_config)
        self.text_model = text_model.text_model

        vision_model = CLIPVisionModel._from_config(vision_config)
        self.vision_model = vision_model.vision_model

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def get_text_features(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            return_dict=True,
            **kwargs,
        )
        pooled_output = text_outputs.pooler_output
        text_features = self.text_projection(pooled_output)
        return text_features
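
    # Note (illustrative, not part of the public contract): `get_text_features` and
    # `get_image_features` return the *unnormalized* projected embeddings. `forward`
    # below L2-normalizes both with `_get_vector_norm` before computing the similarity
    # logits, so code comparing features directly typically normalizes first, e.g.:
    #
    #     text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)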

    @can_return_tuple
    @auto_docstring
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
            **kwargs,
        )
        pooled_output = vision_outputs.pooler_output
        image_features = self.visual_projection(pooled_output)
        return image_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        return_loss: bool | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CLIPOutput:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))
        logits_per_text = logits_per_text * self.logit_scale.exp().to(text_embeds.device)

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)

        return CLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


@auto_docstring
class CLIPTextModelWithProjection(CLIPPreTrainedModel):
    config: CLIPTextConfig
    input_modalities = "text"
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)

        text_model = CLIPTextModel._from_config(config)
        self.text_model = text_model.text_model

        self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CLIPTextModelOutput:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

        >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```"""
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )

        pooled_output = text_outputs.pooler_output
        text_embeds = self.text_projection(pooled_output)

        return CLIPTextModelOutput(
            text_embeds=text_embeds,
            last_hidden_state=text_outputs.last_hidden_state,
            hidden_states=text_outputs.hidden_states,
            attentions=text_outputs.attentions,
        )


@auto_docstring
class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
    config: CLIPVisionConfig
    main_input_name = "pixel_values"
    input_modalities = "image"

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)

        vision_model = CLIPVisionModel._from_config(config)
        self.vision_model = vision_model.vision_model

        self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CLIPVisionModelOutput:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection
        >>> from transformers.image_utils import load_image

        >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> image_embeds = outputs.image_embeds
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        pooled_output = vision_outputs.pooler_output
        image_embeds = self.visual_projection(pooled_output)

        return CLIPVisionModelOutput(
            image_embeds=image_embeds,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden
    states of the patch tokens), e.g. for ImageNet.
    """
)
class CLIPForImageClassification(CLIPPreTrainedModel):
    main_input_name = "pixel_values"
    input_modalities = "image"

    def __init__(self, config: CLIPConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        vision_model = CLIPVisionModel._from_config(config.vision_config)
        self.vision_model = vision_model.vision_model

        # Classifier head
        self.classifier = (
            nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs: BaseModelOutputWithPooling = self.vision_model(pixel_values, **kwargs)
        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens (the class token at index 0 is excluded)
        sequence_output = torch.mean(sequence_output[:, 1:, :], dim=1)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "CLIPModel",
    "CLIPPreTrainedModel",
    "CLIPTextModel",
    "CLIPTextModelWithProjection",
    "CLIPVisionModel",
    "CLIPVisionModelWithProjection",
    "CLIPForImageClassification",
]