o
    	۷i_                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
ZddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+ dd Z,	dZdej-de.de.de.de.dej-fddZ/d[d"d#Z0d$d% Z1d&d' Z2ee"d(d)G d*d+ d+e Z3ee"d,d)G d-d. d.e Z4ee"G d/d0 d0e Z5G d1d2 d2ej6Z7G d3d4 d4ej6Z8	d\d5ej6d6ej-d7ej-d8ej-d9eej- d:e.d;e.fd<d=Z9G d>d? d?ej6Z:G d@dA dAej6Z;G dBdC dCeZ<e"G dDdE dEeZ=G dFdG dGej6Z>G dHdI dIej6Z?e"dJd)G dKdL dLe=Z@G dMdN dNej6ZAG dOdP dPej6ZBe"dQd)G dRdS dSe=ZCe"G dTdU dUe=ZDe"dVd)G dWdX dXe=ZEg dYZFdS )]zPyTorch Siglip model.    N)	dataclass)AnyCallableOptionalUnion)nn)_calculate_fan_in_and_fan_out   )ACT2FN)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplefilter_out_non_signature_kwargs	torch_int)check_model_inputs   )SiglipConfigSiglipTextConfigSiglipVisionConfigc                 C   s   dd }||d|  k s||d|  krt jddd ||| | }||| | }| d| d d| d  |   | |td  | | | j||d d S )	Nc                 S   s   dt | t d  d S )N      ?       @)matherfsqrt)x r$   `/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/siglip/modeling_siglip.pynorm_cdf0   s   z _trunc_normal_.<locals>.norm_cdf   zjmean is more than 2 std from [a, b] in nn.init.trunc_normal_. The distribution of values may be incorrect.)
stacklevelr   r   )minmax)	warningswarnuniform_erfinv_mul_r    r"   add_clamp_)tensormeanstdabr&   lur$   r$   r%   _trunc_normal_-   s    	
r9           r          r   r2   r3   r4   r5   r6   returnc                 C   sN   t   t| dd|| | || W d   dS 1 s w   Y  dS )an  Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(	ext{mean}, 	ext{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq 	ext{mean} \leq b`.

    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsequently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    r   r   N)torchno_gradr9   r/   r0   )r2   r3   r4   r5   r6   r$   r$   r%   trunc_normal_tf_Q   s   
"r?   fan_innormalc           	      C   s  t | \}}|dkr|}n|dkr|}n
|dkr|| d }|| }|dkr3t| t|d d d S |dkrWt  | jt|d W d    d S 1 sPw   Y  d S |d	krtd
| }t  | | | W d    d S 1 syw   Y  d S td| )Nr@   fan_outfan_avgr'   truncated_normalg۶%?r4   rA   uniformr	   zinvalid distribution )	r   r?   r    r"   r=   r>   normal_r-   
ValueError)	r2   scalemodedistributionr@   rB   denomvarianceboundr$   r$   r%   variance_scaling_k   s(   
"
"rO   c                 C      t | ddd d S )Nr@   rD   rJ   rK   rO   r2   r$   r$   r%   lecun_normal_      rT   c                 C   rP   )Nr@   rA   rQ   rR   rS   r$   r$   r%   default_flax_embed_init   rU   rV   z}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                   @   j   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dS )SiglipVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__rZ   r   r=   FloatTensor__annotations__r[   r\   tupler]   r$   r$   r$   r%   rY         
 rY   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                   @   rX   )SiglipTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr[   .r\   r]   )r^   r_   r`   ra   rg   r   r=   rb   rc   r[   r\   rd   r]   r$   r$   r$   r%   rf      re   rf   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeed< dZeed	< d
ee fddZdS )SiglipOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipVisionModel`].
    Nlosslogits_per_imagelogits_per_textrg   rZ   text_model_outputvision_model_outputr<   c                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))rl   rm   N)getattrto_tuple).0kselfr$   r%   	<genexpr>   s
    
z(SiglipOutput.to_tuple.<locals>.<genexpr>)rd   keysrr   r$   rr   r%   ro      s   zSiglipOutput.to_tuple)r^   r_   r`   ra   ri   r   r=   rb   rc   rj   rk   rg   rZ   rl   r   rm   rd   r   ro   r$   r$   r$   r%   rh      s   
 rh   c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )SiglipVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tj|j	| j| j| jdd| _
| j| j d | _| j| _t| j| j| _| jdt| jddd d S )Nvalid)in_channelsout_channelskernel_sizestridepaddingr'   position_idsr   F
persistent)super__init__rw   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr=   arangeexpandrs   rw   	__class__r$   r%   r      s    
"zSiglipVisionEmbeddings.__init__
embeddingsheightwidthr<   c                 C   s   |j d }| jjj d }tj s||kr||kr| | jS | jjd}|j d }|| j }|| j }	t	|d }
|
d|
|
|}|dddd}tjj|||	fddd	}|dddddd|}|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   r   g      ?r	   r'   bicubicF)sizerJ   align_corners)shaper   weightr=   jit
is_tracingr~   	unsqueezer   r   reshapepermuter   
functionalinterpolateview)rs   r   r   r   r   r   patch_pos_embeddim
new_height	new_widthsqrt_num_positionsr$   r$   r%   interpolate_pos_encoding   s&   




z/SiglipVisionEmbeddings.interpolate_pos_encodingFpixel_valuesc           	      C   sj   |j \}}}}| jjj}| |j|d}|ddd}|r+|| ||| }|S || | j	 }|S )N)dtyper'   r   )
r   r   r   r   toflatten	transposer   r   r~   )	rs   r   r   _r   r   target_dtypepatch_embedsr   r$   r$   r%   forward  s   
zSiglipVisionEmbeddings.forwardF)r^   r_   r`   r   r   r=   Tensorintr   rb   r   __classcell__r$   r$   r   r%   rv      s     &rv   c                	       sX   e Zd Zdef fddZ			ddeej deej deej dej	fd	d
Z
  ZS )SiglipTextEmbeddingsrw   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )Nr~   r   Fr   )r   r   r   r   r   
vocab_sizetoken_embeddingmax_position_embeddingsr   r   r=   r   r   rs   rw   r   r   r$   r%   r   "  s   

zSiglipTextEmbeddings.__init__N	input_idsr~   inputs_embedsr<   c                 C   s   |d ur	|j d n|j d }| jjj d }||kr#td| d| |d u r2| jd d d |f }|d u r;| |}| |}|| }|S )Nr   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   r   r   rH   r~   r   )rs   r   r~   r   
seq_lengthmax_position_embeddingposition_embeddingsr   r$   r$   r%   r   .  s"   

zSiglipTextEmbeddings.forwardNNN)r^   r_   r`   r   r   r   r=   
LongTensorrb   r   r   r   r$   r$   r   r%   r   !  s    r   modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr   r   )r   r   )ptrainingr   r'   )r=   matmulr   r   r   softmaxfloat32r   r   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr$   r$   r%   eager_attention_forwardI  s   
r   c                
       sR   e Zd ZdZ fddZ	d
dejdeej deejeej f fdd	Z	  Z
S )SiglipAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      F)r   r   rw   r   r   num_attention_heads	num_headshead_dimrH   rI   attention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   r   r$   r%   r   c  s$   

zSiglipAttention.__init__Nr\   r   r<   c              
   K   s   |j \}}}| |}| |}| |}	|||| j| jdd}|||| j| jdd}|	||| j| jdd}	t}
| j	j
dkrMt| j	j
 }
|
| |||	|| j| j| js\dn| jd\}}|||| }| |}||fS )z#Input shape: Batch x Time x Channelr   r'   eagerr:   )r   r   r   )r   r   r   r   r   r   r   r   r   rw   _attn_implementationr   r   rI   r   r   r   r   r   )rs   r\   r   r   
batch_sizer   r   queriesru   valuesattention_interfacer   r   r$   r$   r%   r   w  s.   




zSiglipAttention.forwardN)r^   r_   r`   ra   r   r=   r   r   rd   r   r   r$   r$   r   r%   r   `  s    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )	SiglipMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S r   )r   r   rw   r
   
hidden_actactivation_fnr   r   r   intermediate_sizefc1fc2r   r   r$   r%   r     s
   
zSiglipMLP.__init__r\   r<   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )rs   r\   r$   r$   r%   r     s   


zSiglipMLP.forward)r^   r_   r`   r   r=   r   r   r   r$   r$   r   r%   r     s    r   c                	       sR   e Zd Zdeeef f fddZedej	dej	de
e dejfdd	Z  ZS )
SiglipEncoderLayerrw   c                    sR   t    |j| _tj| j|jd| _t|| _	tj| j|jd| _
t|| _d S Neps)r   r   r   r   r   	LayerNormlayer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr   r   r$   r%   r     s   

zSiglipEncoderLayer.__init__r\   r   r   r<   c                 K   sT   |}|  |}| jd||d|\}}|| }|}| |}| |}|| }|S )N)r\   r   r$   )r   r   r   r   )rs   r\   r   r   residualr   r$   r$   r%   r     s   



zSiglipEncoderLayer.forward)r^   r_   r`   r   r   r   r   r   r=   r   r   r   rb   r   r   r$   r$   r   r%   r     s    r   c                   @   sH   e Zd ZU eed< dZdZg dZdZdZ	dZ
dZeedZdd ZdS )	SiglipPreTrainedModelrw   siglipT)r   rv   r   #SiglipMultiheadAttentionPoolingHead)r\   r]   c                 C   sf  t |tr%t | jtr| jjjn| jj}tjj|j	j
dt| d dS t |tjr2t|j
 dS t |trytj|jj
 tj|jj
 tj|jj
 tj|jj
 tj|jj tj|jj tj|jj tj|jj dS t |trtj|jj
 tj|jj
 tjj|jjdd tjj|jjdd dS t |trtj|jj tj|jjj tj|jjj dS t |t rt!"t!#d}|j$j%| |j&j'  dS t |t(rtjj|j)j
| jjjd | jj* d dS t |tj+tj,frt-|j
 |jdurtj|j dS dS t |tj.r1|jj'  |j
j%d dS dS )zInitialize the weightsr   rE   gư>r   r   N)/
isinstancerv   rw   r   vision_configr   r   initrG   r   r   npr"   r   rV   r   xavier_uniform_r   r   r   r   zeros_biasr   r   r   r   probedata	attentionin_proj_weightin_proj_biasSiglipModelr=   logr2   logit_scalefill_
logit_biaszero_SiglipForImageClassification
classifierinitializer_factorr   r   rT   r   )rs   r   r   logit_scale_initr$   r$   r%   _init_weights  sX   

"






z#SiglipPreTrainedModel._init_weightsN)r^   r_   r`   r   rc   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr  r$   r$   r$   r%   r     s   
 r   c                       sN   e Zd ZdZdef fddZe	ddeej	 de
e defd	d
Z  ZS )SiglipEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    Args:
        config: SiglipConfig
    rw   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r$   )r   )rp   r   rw   r$   r%   
<listcomp>"  s    z*SiglipEncoder.__init__.<locals>.<listcomp>F)	r   r   rw   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr   r   r  r%   r     s   
 
zSiglipEncoder.__init__Nr   r   r<   c                 K   s,   |}| j D ]}|||fi |}qt|dS )N)r[   )r#  r   )rs   r   r   r   r\   encoder_layerr$   r$   r%   r   &  s   

zSiglipEncoder.forwardr   )r^   r_   r`   ra   r   r   r   r   r=   r   r   r   r   r   r   r$   r$   r   r%   r    s    r  c                       sf   e Zd Zdef fddZee			ddeej	 deej	 deej	 de
e d	ef
d
dZ  ZS )SiglipTextTransformerrw   c                    sP   t    || _|j}t|| _t|| _tj	||j
d| _t||j| _d S r   )r   r   rw   r   r   r   r  encoderr   r   r   final_layer_normr   projection_sizeheadr   r   r$   r%   r   9  s   


zSiglipTextTransformer.__init__Nr   r   r~   r   r<   c                 K   s   |d u rt d| }|d|d }| j||d}d| jjv }|r&d }n|d ur2|s2t||j}| jd||d|}|j	}	| 
|	}	|	d d dd d f }
| |
}
t|	|
dS )NzYou have to specify input_idsr   )r   r~   flash)r   r   r[   pooler_outputr$   )rH   r   r   r   rw   r   r   r   r'  r[   r(  r*  r   )rs   r   r   r~   r   input_shaper\   uses_flash_attentionencoder_outputsr[   pooled_outputr$   r$   r%   r   C  s0   	

zSiglipTextTransformer.forwardr   )r^   r_   r`   r   r   r   r   r   r=   r   r   r   r   r   r   r$   r$   r   r%   r&  8  s$    
r&  zK
    The text model from SigLIP without any head or projection on top.
    c                       s   e Zd ZU eed< def fddZdejfddZdd Z	e
d	d
e			ddeej deej deej dee def
ddZ  ZS )SiglipTextModelrw   c                    "   t  | t|| _|   d S r   )r   r   r&  
text_model	post_initr   r   r$   r%   r   x  s   
zSiglipTextModel.__init__r<   c                 C   
   | j jjS r   r4  r   r   rr   r$   r$   r%   get_input_embeddings~     
z$SiglipTextModel.get_input_embeddingsc                 C   s   || j j_d S r   r7  )rs   r   r$   r$   r%   set_input_embeddings  s   z$SiglipTextModel.set_input_embeddingsFtie_last_hidden_statesNr   r   r~   r   c                 K   s   | j d|||d|S )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, SiglipTextModel

        >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   r~   Nr$   )r4  )rs   r   r   r~   r   r$   r$   r%   r     s   zSiglipTextModel.forwardr   )r^   r_   r`   r   rc   r   r   Moduler8  r:  r   r   r   r=   r   r   r   r   r   r   r$   r$   r   r%   r2  p  s*   
 r2  c                       sH   e Zd Zdef fddZe	d
dee dee	 de
fdd	Z  ZS )SiglipVisionTransformerrw   c                    sj   t    || _|j}t|| _t|| _tj	||j
d| _t|ds%dn|j| _| jr3t|| _d S d S )Nr   vision_use_headT)r   r   rw   r   rv   r   r  r'  r   r   r   post_layernormhasattrr@  use_headr   r*  r   r   r$   r%   r     s   


z SiglipVisionTransformer.__init__Fr   r   r<   c                 K   sR   | j ||d}| jdd|i|}|j}| |}| jr!| |nd }t||dS )N)r   r   r,  r$   )r   r'  r[   rA  rC  r*  r   )rs   r   r   r   r\   r0  r[   r-  r$   r$   r%   r     s   
zSiglipVisionTransformer.forwardr   )r^   r_   r`   r   r   r   r   boolr   r   r   r   r   r$   r$   r   r%   r?    s    r?  c                       s.   e Zd ZdZdef fddZdd Z  ZS )r   zMultihead Attention Pooling.rw   c                    s\   t    ttdd|j| _tjj|j|j	dd| _
tj|j|jd| _t|| _d S )Nr   T)batch_firstr   )r   r   r   	Parameterr=   randnr   r  MultiheadAttentionr   r  r   r   	layernormr   r   r   r   r$   r%   r     s
   
z,SiglipMultiheadAttentionPoolingHead.__init__c                 C   sX   |j d }| j|dd}| |||d }|}| |}|| | }|d d df S )Nr   r   )r   r  repeatr  rI  r   )rs   hidden_stater   r  r   r$   r$   r%   r     s   

z+SiglipMultiheadAttentionPoolingHead.forward)r^   r_   r`   ra   r   r   r   r   r$   r$   r   r%   r     s    r   zM
    The vision model from SigLIP without any head or projection on top.
    c                	       sl   e Zd ZU eed< dZdef fddZdejfddZ	e
dd	e	dd
edee defddZ  ZS )SiglipVisionModelrw   r   c                    r3  r   )r   r   r?  vision_modelr5  r   r   r$   r%   r     s   
zSiglipVisionModel.__init__r<   c                 C   r6  r   )rM  r   r   rr   r$   r$   r%   r8    r9  z&SiglipVisionModel.get_input_embeddingsFr;  r   r   c                 K   s   | j d||d|S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, SiglipVisionModel

        >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```r   r   Nr$   )rM  )rs   r   r   r   r$   r$   r%   r     s   zSiglipVisionModel.forwardr   )r^   r_   r`   r   rc   main_input_namer   r   r>  r8  r   r   rD  r   r   r   r   r   r$   r$   r   r%   rL    s   
 rL  c                       s   e Zd ZU eed< def fddZe e		ddej	de
ej	 de
ej	 dejfd	d
Ze e	ddejdedee dejfddZee						dde
ej de
ej de
ej	 de
ej de
e dedee defddZ  ZS )r
  rw   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}t	
|}t
|}|j| _|j| _ttd| _ttd| _|   d S )NzMconfig.text_config is expected to be of type SiglipTextConfig but is of type .zQconfig.vision_config is expected to be of type SiglipVisionConfig but is of type r   )r   r   r   text_configr   	TypeErrortyper   r   r2  _from_configrL  r4  rM  r   rF  r=   rG  r  r  r5  )rs   rw   rQ  r   r4  rM  r   r$   r%   r      s,   

zSiglipModel.__init__Nr   r   r~   r<   c                 C   s   | j |||d}|j}|S )aJ  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`SiglipTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```r=  )r4  r-  )rs   r   r   r~   text_outputsr1  r$   r$   r%   get_text_features@  s   zSiglipModel.get_text_featuresFr   r   r   c                 K   s    | j d||d|}|j}|S )ah  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`SiglipVisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModel
        >>> from transformers.image_utils import load_image

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```rN  Nr$   )rM  r-  )rs   r   r   r   vision_outputsr1  r$   r$   r%   get_image_featuresd  s   zSiglipModel.get_image_featuresreturn_lossc              	   K   s"  | j d||d|}| jd|||d|}	|j}
|	j}|
|
jdddd }
||jdddd }t||
 |j}| j	|j| j
|j}}||  | }| }d}|rtj|d|jd	}t| d|  }tjj|| }tj|dd
 }| }t|||||
|	|dS )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```rN  r=  r'   r   T)r   r   keepdimNr   )devicer   )ri   rj   rk   rg   rZ   rl   rm   r$   )rM  r4  r-  normr=   r   tr   r[  r  r  expeyer   	ones_liker   r   
logsigmoidsumr3   rh   )rs   r   r   r   r~   rY  r   r   rW  rU  rZ   rg   rk   r  r  rj   ri   r`  m1_diag1logliknllr$   r$   r%   r     sJ   *zSiglipModel.forward)NNr   )NNNNNF)r^   r_   r`   r   rc   r   r   r   r=   r   r   rb   rV  rD  r   r   rX  r   r   rh   r   r   r$   r$   r   r%   r
    sj   
  "'	r
  z
    SigLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                       sj   e Zd ZdZdeddf fddZe e			ddee	j
 dee	j
 d	ed
ee def
ddZ  ZS )r  r   rw   r<   Nc                    sZ   t  | |j| _t|j}|j| _|jdkr"t|jj	|jnt
 | _|   d S )Nr   )r   r   
num_labelsrL  rT  r   rM  r   r   r   Identityr  r5  )rs   rw   rM  r   r$   r%   r     s   "z%SiglipForImageClassification.__init__Flabelsr   r   c           	      K   s\   | j |fd|i|}|j}tj|dd}| |}d}|dur(| ||| j}t||dS )a$  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, SiglipForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a `SiglipModel` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
        >>> model = SiglipForImageClassification.from_pretrained("google/siglip-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```r   r   r\  N)ri   logits)rM  r[   r=   r3   r  loss_functionrw   r   )	rs   r   ri  r   r   outputssequence_outputrj  ri   r$   r$   r%   r     s"   (
z$SiglipForImageClassification.forward)NNF)r^   r_   r`   rO  r   r   r   r   r   r=   r   rD  r   r   r   r   r   r$   r$   r   r%   r    s&    r  )r
  r   r2  rL  r  )r:   r   r;   r   )r   r@   rA   )r:   )Gra   r    r+   dataclassesr   typingr   r   r   r   numpyr  r=   r   torch.nn.initr   activationsr
   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   configuration_siglipr   r   r   r9   r   floatr?   rO   rT   rV   rY   rf   rh   r>  rv   r   r   r   r   r   r   r  r&  r2  r?  r   rL  r
  r  __all__r$   r$   r$   r%   <module>   s    %

#I/
?"E"81&3 KT