o
    wi                     @   s  d Z ddlmZ ddlmZmZmZmZ ddlZddlm	Z	 ddl
mZmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$m%Z% e &e'Z(dej)dej)fddZ*dej)dej)fddZ+dej)dej)fddZ,eeddG dd deZ-eeddG dd  d eZ.eeG d!d" d"eZ/G d#d$ d$e	j0Z1G d%d& d&e	j0Z2	'	(dQd)e	j0d*ej)d+ej)d,ej)d-eej) d.e3d/e3d0e4fd1d2Z5G d3d4 d4e	j0Z6G d5d6 d6e	j0Z7G d7d8 d8eZ8eG d9d: d:eZ9G d;d< d<e	j0Z:G d=d> d>e	j0Z;ed?dG d@dA dAe9Z<G dBdC dCe	j0Z=edDdG dEdF dFe9Z>eG dGdH dHe9Z?eG dIdJ dJe9Z@eG dKdL dLe9ZAedMdG dNdO dOe9ZBg dPZCdS )RzPyTorch CLIP model.    )	dataclass)AnyCallableOptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ModelOutputauto_docstringcan_return_tuplelogging	torch_int   )
CLIPConfigCLIPTextConfigCLIPVisionConfiglogitsreturnc                 C   s   t j| tjt| | jdS )Ndevice)r   
functionalcross_entropytorcharangelenr!   )r    r'   c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/clip/modeling_clip.pycontrastive_loss&   s   r)   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)r)   t)r*   caption_loss
image_lossr'   r'   r(   	clip_loss*   s   r.   tensorc                 C   s,   t | d}t j|ddd}t |d}|S )z
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
       T)dimkeepdim      ?)r$   powsum)r/   square_tensor
sum_tensornormed_tensorr'   r'   r(   _get_vector_norm0   s   r:   z}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                   @   j   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dS )CLIPVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r>   r   r$   FloatTensor__annotations__r?   r@   tuplerA   r'   r'   r'   r(   r=   ;      
 r=   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                   @   r<   )CLIPTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr?   .r@   rA   )rB   rC   rD   rE   rK   r   r$   rF   rG   r?   r@   rH   rA   r'   r'   r'   r(   rJ   M   rI   rJ   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeed< dZeed	< d
ee fddZdS )
CLIPOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPVisionModel`].
    Nlosslogits_per_imagelogits_per_textrK   r>   text_model_outputvision_model_outputr   c                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))rP   rQ   N)getattrto_tuple).0kselfr'   r(   	<genexpr>~   s
    
z&CLIPOutput.to_tuple.<locals>.<genexpr>)rH   keysrV   r'   rV   r(   rS   }   s   zCLIPOutput.to_tuple)rB   rC   rD   rE   rM   r   r$   rF   rG   rN   rO   rK   r>   rP   r   rQ   rH   r   rS   r'   r'   r'   r(   rL   _   s   
 rL   c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )CLIPVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebiasr0   r   position_idsr   r1   
persistent)super__init__r[   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr$   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr%   expandrW   r[   	__class__r'   r(   rf      s"   
"zCLIPVisionEmbeddings.__init__
embeddingsheightwidthr   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr1   r4   r   r0   bicubicF)sizemodealign_cornersr2   )shapert   weight	unsqueezer$   jit
is_tracingra   rj   r   reshapepermuter   r"   interpolateviewcat)rW   rz   r{   r|   rq   rt   rr   class_pos_embedpatch_pos_embedr2   
new_height	new_widthsqrt_num_positionsr'   r'   r(   interpolate_pos_encoding   s*   



z-CLIPVisionEmbeddings.interpolate_pos_encodingFpixel_valuesc              
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| jjj}| |j|d}|ddd}| j	
|dd}	tj|	|gdd	}
|r[|
| |
|| }
|
S |
| | j }
|
S )
NzInput image size (*z) doesn't match model ().)dtyper0   r   r1   r   )r   ri   
ValueErrorrp   r   r   toflatten	transposerm   rv   r$   r   r   rt   ra   )rW   r   r   
batch_size_r{   r|   target_dtypepatch_embedsclass_embedsrz   r'   r'   r(   forward   s    
zCLIPVisionEmbeddings.forwardF)rB   rC   rD   r   rf   r$   Tensorintr   rF   r   __classcell__r'   r'   rx   r(   rZ      s     )rZ   c                	       sX   e Zd Zdef fddZ			ddeej deej deej dej	fd	d
Z
  ZS )CLIPTextEmbeddingsr[   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )Nra   rb   Frc   )re   rf   rg   r   rs   
vocab_sizetoken_embeddingmax_position_embeddingsrt   ru   r$   r%   rv   rW   r[   rh   rx   r'   r(   rf      s   

zCLIPTextEmbeddings.__init__N	input_idsra   inputs_embedsr   c                 C   s   |d ur	|j d n|j d }| jjj d }||kr#td| d| |d u r2| jd d d |f }|d u r;| |}| |}|| }|S )Nr1   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   rt   r   r   ra   r   )rW   r   ra   r   
seq_lengthmax_position_embeddingposition_embeddingsrz   r'   r'   r(   r      s"   

zCLIPTextEmbeddings.forward)NNN)rB   rC   rD   r   rf   r   r$   
LongTensorrF   r   r   r   r'   r'   rx   r(   r      s    r           Tmodulequerykeyvalueattention_maskscalingdropoutoutput_attentionsc                 K   s   t ||dd| }	|d ur|	| }	tjj|	dt jd|j}	tjj	|	|| j
d}	t |	|}
|
dd }
|s>d }	|
|	fS )Nr1   r   )r2   r   )ptrainingr   r0   )r$   matmulr   r   r"   softmaxfloat32r   r   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr'   r'   r(   eager_attention_forward   s   r   c                       sv   e Zd ZdZdeeef f fddZ			ddej	de
ej	 d	e
ej	 d
e
e deej	e
ej	 f f
ddZ  ZS )CLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr[   c                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)re   rf   r[   rg   rh   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projrw   rx   r'   r(   rf     s$   

zCLIPAttention.__init__NFr@   r   causal_attention_maskr   r   c                 C   sH  |j \}}}| |}| |}	| |}
|||d| jdd}|	||d| jdd}	|
||d| jdd}
| jjdkrH|du| _	n|durU|durU|| }n|dur[|}t
}| jjdkrw| jjdkrq|rqtd nt| jj }|| ||	|
|| j	| j| jsd	n| j|d
	\}}|||| }| |}|sd}||fS )z#Input shape: Batch x Time x Channelr1   r   r0   flash_attention_2Neagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   )r   r   r   r   )r   r   r   r   r   r   r   r[   _attn_implementationr   r   loggerwarning_oncer   r   r   r   r   r   r   )rW   r@   r   r   r   r   r   rh   queriesrY   valuesattention_interfacer   r   r'   r'   r(   r   .  sH   	





zCLIPAttention.forward)NNF)rB   rC   rD   rE   r   r   r   rf   r$   r   r   boolrH   r   r   r'   r'   rx   r(   r     s"    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )CLIPMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)re   rf   r[   r   
hidden_actactivation_fnr   r   rg   intermediate_sizefc1fc2rw   rx   r'   r(   rf   i  s
   
zCLIPMLP.__init__r@   r   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )rW   r@   r'   r'   r(   r   p  s   


zCLIPMLP.forward)rB   rC   rD   rf   r$   r   r   r   r'   r'   rx   r(   r   h  s    r   c                       s\   e Zd Zdeeef f fddZ	ddejdejdejde	e
 d	eej f
d
dZ  ZS )CLIPEncoderLayerr[   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S Neps)re   rf   rg   rh   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rw   rx   r'   r(   rf   x  s   


zCLIPEncoderLayer.__init__Fr@   r   r   r   r   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r0||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r@   r   r   r   )r   r   r   r   )rW   r@   r   r   r   residualr   outputsr'   r'   r(   r     s"   




zCLIPEncoderLayer.forwardr   )rB   rC   rD   r   r   r   rf   r$   r   r   r   rH   rF   r   r   r'   r'   rx   r(   r   w  s    r   c                   @   s0   e Zd ZeZdZdZdZdZdZ	dZ
dd ZdS )CLIPPreTrainedModelclipTc                 C   s  | j j}t|tr#|jjjjd|d d |jjjjd|d d n)t|t	rX| j j}t
jj|jd|jd | d t
jj|jj|j j| d t
jj|jj|j j| d nt|tr| j j}|jd d|j j d  | }|jd | }t
jj|jj|d t
jj|jj|d t
jj|jj|d t
jj|jj|d nt|tr| j j}|j jd d|j j d  | }d|j j d | }t
jj|jj|d t
jj|jj|d ntt|trt
jj|jj|jd | j j d t
jj|jj|jd | j j d nKt|trt
jj|jj| j jd | j j d n2t|t r3t
jj|jj| j jd | j j d nt|t!rLt
jj|j"j| j j#jd | j j d t|t
j$r`|j%j&  |jj'd t|t
j(ru|j%durw|j%j&  dS dS dS )	zInitialize the weightsr   g{Gz?)meanstdr   )r   r0   g      ?N))r[   initializer_factor
isinstancer   r   r   datanormal_rt   rZ   r   initrm   rh   rp   initializer_ranger   num_hidden_layersr   r   r   r   r   rg   r   r   	CLIPModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimCLIPVisionModelWithProjectionCLIPTextModelWithProjectionCLIPForImageClassification
classifiervision_configr   r`   zero_fill_r   )rW   r   factorin_proj_stdout_proj_stdfc_stdr'   r'   r(   _init_weights  sj   



 z!CLIPPreTrainedModel._init_weightsN)rB   rC   rD   r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_2_supports_flex_attn_supports_attention_backendr	  r'   r'   r'   r(   r     s    r   c                       sf   e Zd ZdZdef fddZe				ddeej	 deej	 dee
 d	ee
 d
ef
ddZ  ZS )CLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    r[   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r'   )r   )rT   r   r[   r'   r(   
<listcomp>  s    z(CLIPEncoder.__init__.<locals>.<listcomp>F)	re   rf   r[   r   
ModuleListranger   layersgradient_checkpointingrw   rx   r  r(   rf     s   
 
zCLIPEncoder.__init__Nr   r   r   output_hidden_statesr   c                 C   s   |dur|n| j j}|dur|n| j j}|rdnd}|rdnd}|}t| jD ] \}	}
|r2||f }|
||||d}|d }|rG||d f }q'|rO||f }t|||dS )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr'   )r   r   r   )r?   r@   rA   )r[   r   r  	enumerater  r   )rW   r   r   r   r   r  encoder_statesall_attentionsr@   idxencoder_layerlayer_outputsr'   r'   r(   r     s4   &

zCLIPEncoder.forwardNNNN)rB   rC   rD   rE   r   rf   r   r   r$   r   r   r   r   r   r'   r'   rx   r(   r    s&    r  c                       sr   e Zd Zdef fddZee					ddeej	 deej	 deej	 dee
 d	ee
 d
efddZ  ZS )CLIPTextTransformerr[   c                    sT   t    || _|j}t|| _t|| _tj	||j
d| _|j| _|jdk| _d S )Nr   r   )re   rf   r[   rg   r   rz   r  encoderr   r   r   final_layer_normeos_token_idr   _use_flash_attention_2r   rx   r'   r(   rf   D  s   


zCLIPTextTransformer.__init__Nr   r   ra   r   r  r   c                 C   s@  |d ur|n| j j}|d ur|n| j j}|d u rtd| }|d|d }| j||d}t||j|j	d}|d urE| j
sEt||j}| j|||||d}	|	j}
| |
}
| jdkrw|
tj|
jd |
j	d|jtj|
j	djdd	f }n|
tj|
jd |
j	d|jtj|
j	d| jk jdd	f }t|
||	j|	jd
S )NzYou have to specify input_idsr1   )r   ra   r    )r   r   r   r   r  r0   r   )r   r!   r   r?   pooler_outputr@   rA   )r[   r   r  r   r~   r   rz   r   r   r!   r$  r   r!  r?   r"  r#  r$   r%   r   r   r   argmaxr   r@   rA   )rW   r   r   ra   r   r  input_shaper@   r   encoder_outputsr?   pooled_outputr'   r'   r(   r   R  sT   



	zCLIPTextTransformer.forwardNNNNN)rB   rC   rD   r   rf   r   r   r   r$   r   r   r   r   r   r'   r'   rx   r(   r   C  s,    r   zI
    The text model from CLIP without any head or projection on top.
    c                          e Zd ZeZddgZdef fddZdejfddZ	d	d
 Z
ee					ddeej deej deej dee dee defddZ  ZS )CLIPTextModelr   r   r[   c                    "   t  | t|| _|   d S r   )re   rf   r   
text_model	post_initrw   rx   r'   r(   rf        
zCLIPTextModel.__init__r   c                 C   
   | j jjS r   r/  rz   r   rV   r'   r'   r(   get_input_embeddings     
z"CLIPTextModel.get_input_embeddingsc                 C      || j j_d S r   r3  rW   r   r'   r'   r(   set_input_embeddings     z"CLIPTextModel.set_input_embeddingsNr   r   ra   r   r  c                 C   s   | j |||||dS )a9  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   ra   r   r  )r/  )rW   r   r   ra   r   r  r'   r'   r(   r     s   zCLIPTextModel.forwardr+  )rB   rC   rD   r   r
  _no_split_modulesrf   r   Moduler4  r8  r   r   r   r$   r   r   r   r   r   r'   r'   rx   r(   r-    s4    r-  c                       sd   e Zd Zdef fddZee				ddeej	 dee
 dee
 d	ee
 d
ef
ddZ  ZS )CLIPVisionTransformerr[   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )re   rf   r[   rg   rZ   rz   r   r   r   pre_layrnormr  r!  post_layernormr   rx   r'   r(   rf     s   


zCLIPVisionTransformer.__init__NFr   r   r  r   r   c           	      C   s   |d ur|n| j j}|d ur|n| j j}|d u rtd| j||d}| |}| j|||d}|j}|d d dd d f }| |}t	|||j
|jdS )Nz You have to specify pixel_values)r   )r   r   r  r   r%  )r[   r   r  r   rz   r>  r!  r?   r?  r   r@   rA   )	rW   r   r   r  r   r@   r)  r?   r*  r'   r'   r(   r     s*   	

zCLIPVisionTransformer.forwardNNNF)rB   rC   rD   r   rf   r   r   r   r$   rF   r   r   r   r   r'   r'   rx   r(   r=    s&    
r=  zK
    The vision model from CLIP without any head or projection on top.
    c                       s~   e Zd ZeZdZdgZdef fddZdej	fddZ
ee							
ddeej dee dee dedef
ddZ  ZS )CLIPVisionModelr   r   r[   c                    r.  r   )re   rf   r=  vision_modelr0  rw   rx   r'   r(   rf     r1  zCLIPVisionModel.__init__r   c                 C   r2  r   rB  rz   rp   rV   r'   r'   r(   r4    r5  z$CLIPVisionModel.get_input_embeddingsNFr   r  r   c                 C   s   | j ||||dS )a  
        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r  r   )rB  )rW   r   r   r  r   r'   r'   r(   r     s   zCLIPVisionModel.forwardr@  )rB   rC   rD   r   r
  main_input_namer;  rf   r   r<  r4  r   r   r   r$   rF   r   r   r   r   r'   r'   rx   r(   rA    s.    rA  c                       s$  e Zd ZeZg dZdef fddZe					ddee	j
 dee	j
 dee	j
 d	ee d
ee de	jfddZe				ddee	j d	ee d
ee dede	jf
ddZee								ddee	j dee	j dee	j
 dee	j dee d	ee d
ee dedefddZ  ZS )r   )r   r   rZ   r[   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	| _	|j
| _|j
| _t|}|j| _t|}|j| _tj| j| j	dd| _tj| j| j	dd| _tt| jj| _|   d S )NzKconfig.text_config is expected to be of type CLIPTextConfig but is of type .zOconfig.vision_config is expected to be of type CLIPVisionConfig but is of type Fr`   )re   rf   r   text_configr   	TypeErrortyper  r   projection_dimrg   r   r   r-  _from_configr/  rA  rB  r   r   r   r   rk   r$   r/   r[   logit_scale_init_valuelogit_scaler0  )rW   r[   rH  r  r/  rB  rx   r'   r(   rf   E  s4   

zCLIPModel.__init__Nr   r   ra   r   r  r   c           	      C   sP   |dur|n| j j}|dur|n| j j}| j|||||d}|j}| |}|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```Nr:  )r[   r   r  r/  r&  r   )	rW   r   r   ra   r   r  text_outputsr*  text_featuresr'   r'   r(   get_text_featuresh  s   
zCLIPModel.get_text_featuresFr   r   c                 C   sN   |dur|n| j j}|dur|n| j j}| j||||d}|j}| |}|S )aD  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```NrD  )r[   r   r  rB  r&  r   )rW   r   r   r  r   vision_outputsr*  image_featuresr'   r'   r(   get_image_features  s   
zCLIPModel.get_image_featuresreturn_lossc	              	   C   s   |dur|n| j j}|dur|n| j j}| j||||d}	| j|||||d}
|	j}| |}|
j}| |}|t| }|t| }t	
|| |j}|| j |j }| }d}|rft|}t||||||
|	dS )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NrD  r:  )rM   rN   rO   rK   r>   rP   rQ   )r[   r   r  rB  r/  r&  r   r   r:   r$   r   r+   r   r!   rN  expr.   rL   )rW   r   r   r   ra   rU  r   r  r   rR  rO  r>   rK   rO   rN   rM   r'   r'   r(   r     sJ   '

zCLIPModel.forwardr+  r@  )NNNNNNNF)rB   rC   rD   r   r
  r;  rf   r   r   r$   r   r   rF   rQ  rT  r   r   rL   r   r   r'   r'   rx   r(   r   @  s    #+/	
r   c                       r,  )r   r   r   r[   c                    @   t  | t|}|j| _tj|j|jdd| _	| 
  d S NFrG  )re   rf   r-  rL  r/  r   r   rg   rK  r   r0  )rW   r[   r/  rx   r'   r(   rf   $  
   
z$CLIPTextModelWithProjection.__init__r   c                 C   r2  r   r3  rV   r'   r'   r(   r4  /  r5  z0CLIPTextModelWithProjection.get_input_embeddingsc                 C   r6  r   r3  r7  r'   r'   r(   r8  2  r9  z0CLIPTextModelWithProjection.set_input_embeddingsNr   r   ra   r   r  c           	      C   s:   | j |||||d}|j}| |}t||j|j|jdS )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

        >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```r:  )rK   r?   r@   rA   )r/  r&  r   rJ   r?   r@   rA   )	rW   r   r   ra   r   r  rO  r*  rK   r'   r'   r(   r   5  s   
z#CLIPTextModelWithProjection.forwardr+  )rB   rC   rD   r   r
  r;  rf   r   r<  r4  r8  r   r   r   r$   r   r   rJ   r   r   r'   r'   rx   r(   r     s4    r   c                       sx   e Zd ZeZdZdef fddZdejfddZ	e
e					ddeej d
ee dee dedef
ddZ  ZS )r   r   r[   c                    rW  rX  )re   rf   rA  rL  rB  r   r   rg   rK  r   r0  rW   r[   rB  rx   r'   r(   rf   e  rY  z&CLIPVisionModelWithProjection.__init__r   c                 C   r2  r   rC  rV   r'   r'   r(   r4  p  r5  z2CLIPVisionModelWithProjection.get_input_embeddingsNFr   r  r   c                 C   s8   | j ||||d}|j}| |}t||j|j|jdS )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection

        >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> image_embeds = outputs.image_embeds
        ```rD  )r>   r?   r@   rA   )rB  r&  r   r=   r?   r@   rA   )rW   r   r   r  r   rR  r*  r>   r'   r'   r(   r   s  s   
z%CLIPVisionModelWithProjection.forwardr@  )rB   rC   rD   r   r
  rE  rf   r   r<  r4  r   r   r   r$   rF   r   r=   r   r   r'   r'   rx   r(   r   `  s,    r   z
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                       sn   e Zd ZdZdeddf fddZee				ddee	j
 dee	j
 dee d	ee def
d
dZ  ZS )r   r   r[   r   Nc                    sZ   t  | |j| _t|j}|j| _|jdkr"t|jj	|jnt
 | _|   d S )Nr   )re   rf   
num_labelsrA  rL  r  rB  r   r   rg   Identityr  r0  rZ  rx   r'   r(   rf     s   "z#CLIPForImageClassification.__init__labelsr   r  c           
      C   sr  |dur|n| j j}|dur|n| j j}| j|||d}|j}tj|ddddddf dd}| |}d}|dur||j	}| j j
du rl| jdkrRd| j _
n| jdkrh|jtjksc|jtjkrhd| j _
nd| j _
| j j
dkrt }	| jdkr|	| | }n+|	||}n%| j j
dkrt }	|	|d| j|d}n| j j
dkrt }	|	||}t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r  r   r   
regressionsingle_label_classificationmulti_label_classificationr1   )rM   r   r@   rA   )r[   r   r  rB  r?   r$   r   r  r   r!   problem_typer[  r   longr   r
   squeezer	   r   r   r   r@   rA   )
rW   r   r]  r   r  r   sequence_outputr   rM   loss_fctr'   r'   r(   r     sJ   $


"


z"CLIPForImageClassification.forwardr  )rB   rC   rD   rE  r   rf   r   r   r   r$   r   r   r   r   r   r'   r'   rx   r(   r     s(    r   )r   r   r-  r   rA  r   r   )r   T)DrE   dataclassesr   typingr   r   r   r   r$   r   torch.nnr   r	   r
   activationsr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   utilsr   r   r   r   r   configuration_clipr   r   r   
get_loggerrB   r   r   r)   r.   r:   r=   rJ   rL   r<  rZ   r   floatr   r   r   r   r   r   r  r   r-  r=  rA  r   r   r   r   __all__r'   r'   r'   r(   <module>   s   
#S/
Q2BWZ414 ^A@T