o
    ei}                     @   s  d dl mZ d dlmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlm
  mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ee#ddG dd de!Z.ee#ddG dd de!Z/ee#G dd de!Z0G dd de
j1Z2G dd de
j1Z3	dEd e
j1d!ej4d"ej4d#ej4d$ej4dB d%e5d&e5fd'd(Z6G d)d* d*e
j1Z7G d+d, d,e
j1Z8G d-d. d.eZ9e#G d/d0 d0eZ:G d1d2 d2e
j1Z;G d3d4 d4e:Z<G d5d6 d6e:Z=e#d7dG d8d9 d9e:Z>G d:d; d;e
j1Z?e#d<dG d=d> d>e:Z@e#G d?d@ d@e:ZAe#dAdG dBdC dCe:ZBg dDZCdS )F    )Callable)	dataclass)AnyN   )initialization)ACT2FN)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)merge_with_config_defaults)capture_outputs   )Siglip2ConfigSiglip2TextConfigSiglip2VisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                   @   j   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ejdf dB ed< dZe
ejdf dB ed< dS )Siglip2VisionOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r    tupler!    r*   r*   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/siglip2/modeling_siglip2.pyr   +      
 r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                   @   r   )Siglip2TextOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr   .r    r!   )r"   r#   r$   r%   r.   r&   r'   r(   r   r    r)   r!   r*   r*   r*   r+   r-   =   r,   r-   c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZejdB ed< dZeed< dZeed	< d
ee fddZdS )Siglip2Outputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Siglip2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`Siglip2VisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Siglip2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Siglip2VisionModel`].
    Nlosslogits_per_imagelogits_per_textr.   r   text_model_outputvision_model_outputreturnc                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r3   r4   N)getattrto_tuple).0kselfr*   r+   	<genexpr>n   s
    
z)Siglip2Output.to_tuple.<locals>.<genexpr>)r)   keysr:   r*   r:   r+   r7   m   s   zSiglip2Output.to_tuple)r"   r#   r$   r%   r0   r&   r'   r(   r1   r2   r.   r   r3   r   r4   r)   r   r7   r*   r*   r*   r+   r/   O   s   
 r/   c                	       sb   e Zd Zdef fddZedejdejde	dejfdd	Z
d
ejdejdejfddZ  ZS )Siglip2VisionEmbeddingsconfigc                    sn   t    || _|j| _|j| _tj|j| j | j | jd| _	|j
| _
t| j
d | _t| j
| j| _d S )N)in_featuresout_featuresg      ?)super__init__r?   hidden_size	embed_dim
patch_sizennLinearnum_channelspatch_embeddingnum_patchesintposition_embedding_size	Embeddingposition_embeddingr;   r?   	__class__r*   r+   rC   u   s   
z Siglip2VisionEmbeddings.__init__positional_embeddingsspatial_shapes
max_lengthr5   c                 C   s  |j d }| j d }| j}tj|||f| j|d}| dddd} | jjdkr/| tj	} t
|D ]T}||  \}}	t|	dkd t|dkd t||	 |kd	 tj| ||	fd
ddd}
|
|||	 dd}
|
|}
|
||d||	 f< |
d ||||	 df< q3|S )ac  
        Resize positional embeddings to image-specific size and pad to a fixed size.

        Args:
            positional_embeddings (`torch.Tensor`):
                Position embeddings of shape (height, width, embed_dim)
            spatial_shapes (`torch.LongTensor`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
            max_length (`int`):
                Maximum length of the positional embeddings to pad resized positional embeddings to

        Returns:
            `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
        r   devicedtype   r   cpuz8Width of resized positional embeddings must be positive.z9Height of resized positional embeddings must be positive.z0Resized positional embeddings exceed max_length.bilinearFT)sizemodealign_corners	antialiasN)shaperY   r&   emptyrX   permute	unsqueezetypetofloat32rangetolistr   Finterpolatereshape	transpose)rS   rT   rU   
batch_sizerE   source_dtyperesulted_positional_embeddingsiheightwidthresized_embeddingsr*   r*   r+   resize_positional_embeddings   s8   

	
z4Siglip2VisionEmbeddings.resize_positional_embeddingspixel_valuesc                 C   sT   | j jj}|  |j|d}| jj| j| jd}| j|||jd d}|| }|S )aH  
        Args:
            pixel_values (`torch.FloatTensor`):
                Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
            spatial_shapes (`list[tuple[int, int]]`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
        )rY   rV   r   )rU   )	rJ   weightrY   rf   rO   rl   rM   ru   ra   )r;   rv   rT   target_dtypepatch_embedsrS   resized_positional_embeddings
embeddingsr*   r*   r+   forward   s   


zSiglip2VisionEmbeddings.forward)r"   r#   r$   r   rC   staticmethodr&   Tensor
LongTensorrL   ru   r'   r|   __classcell__r*   r*   rQ   r+   r>   t   s    $=r>   c                	       sX   e Zd Zdef fddZ			ddejdB dejdB dejdB dejfd	d
Z	  Z
S )Siglip2TextEmbeddingsr?   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )Nposition_idsr   rV   F)
persistent)rB   rC   rD   rG   rN   
vocab_sizetoken_embeddingmax_position_embeddingsrO   register_bufferr&   arangeexpandr;   r?   rE   rQ   r*   r+   rC      s   

zSiglip2TextEmbeddings.__init__N	input_idsr   inputs_embedsr5   c                 C   s   |d ur	|j d n|j d }| jjj d }||kr#td| d| |d u r2| jd d d |f }|d u r;| |}| |}|| }|S )NrV   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )ra   rO   rw   
ValueErrorr   r   )r;   r   r   r   
seq_lengthmax_position_embeddingposition_embeddingsr{   r*   r*   r+   r|      s"   

zSiglip2TextEmbeddings.forwardNNN)r"   r#   r$   r   rC   r&   r   r'   r~   r|   r   r*   r*   rQ   r+   r      s    r           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )NrV   r   )dimrY   )ptrainingr   rZ   )r&   matmulrm   rG   
functionalsoftmaxrg   rf   rY   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr*   r*   r+   eager_attention_forward  s   
r   c                
       sR   e Zd ZdZ fddZ	d
dejdejdB deejejdB f fdd	Z  Z	S )Siglip2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      F)rB   rC   r?   rD   rE   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalrG   rH   k_projv_projq_projout_projrP   rQ   r*   r+   rC     s$   

zSiglip2Attention.__init__Nr    r   r5   c              
   K   s   |j \}}}| |}| |}| |}	|||| j| jdd}|||| j| jdd}|	||| j| jdd}	t	| j
jt}
|
| |||	|| j| j| jsVdn| jd\}}|||| }| |}||fS )z#Input shape: Batch x Time x Channelr   rZ   r   )r   r   r   )ra   r   r   r   viewr   r   rm   r   get_interfacer?   _attn_implementationr   r   r   r   r   rl   r   r   )r;   r    r   r   rn   r   rE   queriesr=   valuesattention_interfacer   r   r*   r*   r+   r|   2  s.   




zSiglip2Attention.forwardN)
r"   r#   r$   r%   rC   r&   r~   r)   r|   r   r*   r*   rQ   r+   r     s    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )
Siglip2MLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S r   )rB   rC   r?   r   
hidden_actactivation_fnrG   rH   rD   intermediate_sizefc1fc2rP   rQ   r*   r+   rC   Z  s
   
zSiglip2MLP.__init__r    r5   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )r;   r    r*   r*   r+   r|   a  s   


zSiglip2MLP.forward)r"   r#   r$   rC   r&   r~   r|   r   r*   r*   rQ   r+   r   Y  s    r   c                	       sN   e Zd ZdeeB f fddZedejdejde	e
 dejfdd	Z  ZS )
Siglip2EncoderLayerr?   c                    sR   t    |j| _tj| j|jd| _t|| _	tj| j|jd| _
t|| _d S Neps)rB   rC   rD   rE   rG   	LayerNormlayer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlprP   rQ   r*   r+   rC   i  s   

zSiglip2EncoderLayer.__init__r    r   r   r5   c                 K   sT   |}|  |}| jd||d|\}}|| }|}| |}| |}|| }|S )N)r    r   r*   )r   r   r   r   )r;   r    r   r   residual_r*   r*   r+   r|   q  s   



zSiglip2EncoderLayer.forward)r"   r#   r$   r   r   rC   r   r&   r~   r   r   r'   r|   r   r*   r*   rQ   r+   r   h  s    r   c                   @   sT   e Zd ZU eed< dZdZdZg dZdZ	dZ
dZdZeedZe dd	 Zd
S )Siglip2PreTrainedModelr?   siglip2)imagetextT)r   r>   r   $Siglip2MultiheadAttentionPoolingHeadF)r    r!   c                 C   s  t |tr<t | jtr| jjjn| jj}tj|jj	dt
| d t|dr:t|jt|jjd d dS dS t |tjrJt|j	 dS t |trt|jj	 t|jj	 t|jj	 t|jj	 t|jj t|jj t|jj t|jj dS t |trt|jj	 t|j j	 tj|jjdd tj|j jdd dS t |t!rt|j" t|j#j$ t|j#j% dS t |t&rt|j' t|j( dS t |t)rtj|j*j	| jjjd | jj+ d dS t |tj,tj-frt.|j	 |jdurt|j dS dS t |tj/r,t|j t0|j	 dS t |t1rEt|jt|jjd d dS dS )	zInitialize the weightsr   )stdr   rV   r   gư>r   N)2
isinstancer>   r?   r   vision_configrD   initnormal_rO   rw   npsqrthasattrcopy_r   r&   r   ra   r   rG   rN   default_flax_embed_init_r   xavier_uniform_r   r   r   r   zeros_biasr   r   r   r   probe	attentionin_proj_weightin_proj_biasSiglip2Modellogit_scale
logit_biasSiglip2ForImageClassification
classifierinitializer_factorrH   Conv2dlecun_normal_r   ones_r   )r;   r   rs   r*   r*   r+   _init_weights  s`   


&





&z$Siglip2PreTrainedModel._init_weightsN)r"   r#   r$   r   r(   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr&   no_gradr   r*   r*   r*   r+   r     s   
 r   c                       sN   e Zd ZdZdef fddZe	ddejdB de	e
 defd	d
Z  ZS )Siglip2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Siglip2EncoderLayer`].

    Args:
        config: Siglip2Config
    r?   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r*   )r   )r8   r   r?   r*   r+   
<listcomp>  s    z+Siglip2Encoder.__init__.<locals>.<listcomp>F)	rB   rC   r?   rG   
ModuleListrh   num_hidden_layerslayersgradient_checkpointingrP   rQ   r   r+   rC     s   
 
zSiglip2Encoder.__init__Nr   r   r5   c                 K   s,   |}| j D ]}|||fi |}qt|dS )N)r   )r   r
   )r;   r   r   r   r    encoder_layerr*   r*   r+   r|     s   

zSiglip2Encoder.forwardr   )r"   r#   r$   r%   r   rC   r   r&   r~   r   r   r
   r|   r   r*   r*   rQ   r+   r     s    r   c                       s`   e Zd ZdZdef fddZe		ddejdej	dej
d	edB d
edB defddZ  ZS )Siglip2VisionTransformerrJ   r?   c                    sp   t  | || _|j}t|| _t|| _tj	||j
d| _t|ds&dn|j| _| jr2t|| _|   d S )Nr   vision_use_headT)rB   rC   r?   rD   r>   r{   r   encoderrG   r   r   post_layernormr   r   use_headr   head	post_initr   rQ   r*   r+   rC     s   


z!Siglip2VisionTransformer.__init__Nrv   r   rT   output_attentionsoutput_hidden_statesr5   c                 K   s   |dur|n| j j}|dur|n| j j}| ||}t| j ||d}| j||||d}	|	j}
| |
}
| jr<| 	|
|nd}t
|
||	j|	jdS )z
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        Nr?   r   r   )r   r   r  r  )r   pooler_outputr    r!   )r?   r  r  r{   r   r   r   r  r  r  r   r    r!   )r;   rv   r   rT   r  r  r   r    encoder_attention_maskencoder_outputsr   r  r*   r*   r+   r|     s0   
z Siglip2VisionTransformer.forwardNN)r"   r#   r$   _input_embed_layerr   rC   r   r&   r'   r~   r   boolr   r|   r   r*   r*   rQ   r+   r     s&    r   c                       sj   e Zd ZdZdef fddZee			ddej	dB dej	dB dej	dB d	e
e d
ef
ddZ  ZS )Siglip2TextTransformerr   r?   c                    sZ   t  | || _|j}t|| _t|| _tj	||j
d| _t||j| _|   d S r   )rB   rC   r?   rD   r   r{   r   r   rG   r   r   final_layer_normrH   projection_sizer  r  r   rQ   r*   r+   rC   :  s   

zSiglip2TextTransformer.__init__Nr   r   r   r   r5   c           
      K   s   |d u rt d| }|d|d }| j||d}t| j||d}| jd||d|}|j}| |}|d d dd d f }	| 	|	}	t
||	dS )NzYou have to specify input_idsrV   )r   r   r  )r   r   )r   r  r*   )r   r]   r   r{   r   r?   r   r   r  r  r   )
r;   r   r   r   r   input_shaper    r
  r   pooled_outputr*   r*   r+   r|   E  s0   	

zSiglip2TextTransformer.forwardr   )r"   r#   r$   r  r   rC   r   r   r&   r~   r   r   r   r|   r   r*   r*   rQ   r+   r  7  s&    r  zL
    The text model from Siglip2 without any head or projection on top.
    c                       s   e Zd ZU eed< dZdef fddZdejfddZ	dd	 Z
eed
de			ddejdB dejdB dejdB dee def
ddZ  ZS )Siglip2TextModelr?   )r   c                    "   t  | t|| _|   d S r   )rB   rC   r  
text_modelr  rP   rQ   r*   r+   rC   y  s   
zSiglip2TextModel.__init__r5   c                 C   
   | j jjS r   r  r{   r   r:   r*   r*   r+   get_input_embeddings     
z%Siglip2TextModel.get_input_embeddingsc                 C      || j j_d S r   r  r;   r   r*   r*   r+   set_input_embeddings     z%Siglip2TextModel.set_input_embeddingsFtie_last_hidden_statesNr   r   r   r   c                 K      | j d|||d|S )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, Siglip2TextModel

        >>> model = Siglip2TextModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   r   Nr*   r  r;   r   r   r   r   r*   r*   r+   r|     s   zSiglip2TextModel.forwardr   )r"   r#   r$   r   r(   r   rC   rG   Moduler  r  r   r   r   r&   r~   r   r   r   r|   r   r*   r*   rQ   r+   r  p  s.   
 r  c                       sH   e Zd ZdZdef fddZddejdejdB dejfd	d
Z  Z	S )r   zMultihead Attention Pooling.r?   c                    sj   t    ttdd|j| _tjj|j|j	dd| _
tj|j|jd| _t|| _|| _|j	| _d S )Nr   T)batch_firstr   )rB   rC   rG   	Parameterr&   randnrD   r   MultiheadAttentionr   r   r   r   	layernormr   r   r?   r   rP   rQ   r*   r+   rC     s   

z-Siglip2MultiheadAttentionPoolingHead.__init__Nhidden_stater   r5   c                 C   s   |j d }| j|dd}|d urS|j d |j d }}t| j|||d}|d urS|d| j|d}|d||}|jtj	krSt
|tjd|j|jdt|jj}| j||||dd }|}| |}|| | }|d d df S )Nr   r   )r?   r   r   encoder_hidden_statesrV   r   rW   )	attn_mask)ra   r   repeatr   r?   r   rl   rY   r&   r  wheretensorrX   finfominr   r)  r   )r;   r*  r   rn   r   
target_len
source_lenr   r*   r*   r+   r|     s0   

z,Siglip2MultiheadAttentionPoolingHead.forwardr   )
r"   r#   r$   r%   r   rC   r&   r~   r|   r   r*   r*   rQ   r+   r     s    *r   zN
    The vision model from Siglip2 without any head or projection on top.
    c                       s~   e Zd ZU eed< dZdZdef fddZdej	fddZ
eed	d
edejdejdejdee def
ddZ  ZS )Siglip2VisionModelr?   rv   r   c                    r  r   )rB   rC   r   vision_modelr  rP   rQ   r*   r+   rC     s   
zSiglip2VisionModel.__init__r5   c                 C   r  r   r6  r{   rJ   r:   r*   r*   r+   r    r  z'Siglip2VisionModel.get_input_embeddingsFr  pixel_attention_maskrT   r   c                 K   r   )a  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Siglip2VisionModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```rv   r   rT   Nr*   r6  r;   rv   r8  rT   r   r*   r*   r+   r|     s   %zSiglip2VisionModel.forward)r"   r#   r$   r   r(   main_input_namer   rC   rG   r$  r  r   r   r   r&   r'   r~   r   r   r   r   r|   r   r*   r*   rQ   r+   r4    s(   
 r4  c                       sT  e Zd ZU eed< def fddZdejfddZdejfdd	Z	e
e	
	
ddejdejd
B dejd
B dee deeB f
ddZe
e	
	
	
ddejd
B dejd
B dejd
B dee deeB f
ddZe
e	
	
	
	
	
	
	
	
	
ddejd
B dejd
B dejd
B dejd
B dejd
B dejd
B ded
B ded
B ded
B defddZ  ZS )r   r?   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}t	
|}t
|}|j| _|j| _ttd| _ttd| _|   d S )NzNconfig.text_config is expected to be of type Siglip2TextConfig but is of type .zRconfig.vision_config is expected to be of type Siglip2VisionConfig but is of type r   )rB   rC   r   text_configr   	TypeErrorre   r   r   r  _from_configr4  r  r6  rG   r&  r&   r'  r   r   r  )r;   r?   r>  r   r  r6  rQ   r*   r+   rC     s,   

zSiglip2Model.__init__r5   c                 C   r  r   r  r:   r*   r*   r+   r  =  r  z!Siglip2Model.get_input_embeddingsr   c                 C   r  r   r  r  r*   r*   r+   r  @  r  z!Siglip2Model.set_input_embeddingsNr   r   r   r   c                 K   r   )ao  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```r!  Nr*   r"  r#  r*   r*   r+   get_text_featuresC  s   zSiglip2Model.get_text_featuresrv   r8  rT   c                 K   r   )a  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModel
        >>> from transformers.image_utils import load_image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```
        r9  Nr*   r:  r;  r*   r*   r+   get_image_featuresb  s   "zSiglip2Model.get_image_featuresreturn_lossr  r  c
              	   K   sD  |dur|n| j j}|	dur|	n| j j}	| j|||||	d}| j|||||	d}|j}|j}||jdddd }||jdddd }t||	 
|j}| j
|j| j
|j}}||  | }|	 }d}|rtj|d|jd	}t| d|  }tjj|| }tj|dd
 }| }t|||||||dS )ae  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```
        N)rv   r   rT   r  r  )r   r   r   r  r  rZ   rV   T)r   r   keepdimr   )rX   r   )r0   r1   r2   r.   r   r3   r4   )r?   r  r  r6  r  r  normr&   r   trf   rX   r   r   expeyer]   	ones_likerG   r   
logsigmoidsummeanr/   )r;   r   rv   r8  rT   r   r   rC  r  r  r   vision_outputstext_outputsr   r.   r2   r   r   r1   r0   rI  m1_diag1logliknllr*   r*   r+   r|     sR   5zSiglip2Model.forwardr  r   )	NNNNNNNNN)r"   r#   r$   r   r(   rC   rG   r$  r  r  r   r   r&   r~   r   r   r)   r   rA  r'   r   rB  r  r/   r|   r   r*   r*   rQ   r+   r     s   
  (	
r   z
    Siglip2 vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                       s   e Zd ZdZdZdeddf fddZdejfdd	Z	d
ejfddZ
eee						ddejdB dejdB dejdB dejdB dedB dedB defddZ  ZS )r   rv   r5  r?   r5   Nc                    sZ   t  | |j| _t|j}|j| _|jdkr"t|jj	|jnt
 | _|   d S )Nr   )rB   rC   
num_labelsr4  r@  r   r6  rG   rH   rD   Identityr   r  )r;   r?   r6  rQ   r*   r+   rC     s   "z&Siglip2ForImageClassification.__init__c                 C   r  r   r7  r:   r*   r*   r+   r    r  z2Siglip2ForImageClassification.get_input_embeddingsr   c                 C   r  r   r7  r  r*   r*   r+   r    r  z2Siglip2ForImageClassification.set_input_embeddingsr8  rT   labelsr  r  c                 K   s   |dur|n| j j}|dur|n| j j}| j|||||d}|j}	|dur>|d |	j}
tj|	|
 ddtj|
dd }	ntj	|	dd}	| 
|	}d}|durX| ||| j }t|||j|jdS )a  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, Siglip2ForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> # note: we are loading a `Siglip2Model` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip2-base-patch16-224")
        >>> model = Siglip2ForImageClassification.from_pretrained("google/siglip2-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```
        N)r   rT   r  r  ).Nr   rE  )r0   logitsr    r!   )r?   r  r  r6  r   rf   rX   r&   rL  rM  r   loss_functionr   r    r!   )r;   rv   r8  rT   rU  r  r  r   outputssequence_output	pool_maskrV  r0   r*   r*   r+   r|     s2   3"
z%Siglip2ForImageClassification.forward)NNNNNN)r"   r#   r$   r<  r   r   rC   rG   r$  r  r  r   r   r   r&   r~   r   r  r   r|   r   r*   r*   rQ   r+   r     s<    	r   )r   r   r  r4  r   )r   )Dcollections.abcr   dataclassesr   typingr   numpyr   r&   torch.nnrG   torch.nn.functionalr   rj    r   r   activationsr   masking_utilsr   modeling_layersr	   modeling_outputsr
   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   configuration_siglip2r   r   r   r   r-   r/   r$  r>   r   r~   floatr   r   r   r   r   r   r   r  r  r   r4  r   r   __all__r*   r*   r*   r+   <module>   s   #h/
>"J"@93/= `t