"""PyTorch Siglip model."""

from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

import numpy as np
import torch
from torch import nn

from ... import initialization as init
from ...activations import ACT2FN
from ...masking_utils import create_bidirectional_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, torch_int
from ...utils.generic import merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class SiglipVisionModelOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class SiglipTextModelOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None


@dataclass
@auto_docstring
class SiglipOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipVisionModel`].
    """

    loss: torch.FloatTensor | None = None
    logits_per_image: torch.FloatTensor | None = None
    logits_per_text: torch.FloatTensor | None = None
    text_embeds: torch.FloatTensor | None = None
    image_embeds: torch.FloatTensor | None = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class SiglipVisionEmbeddings(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1]
        num_positions = self.position_embedding.weight.shape[0]

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        patch_pos_embed = self.position_embedding.weight.unsqueeze(0)

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return patch_pos_embed

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class SiglipTextEmbeddings(nn.Module):
    def __init__(self, config: SiglipTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
            **kwargs,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class SiglipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class SiglipEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: SiglipVisionConfig | SiglipTextConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.self_attn = SiglipAttention(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask, **kwargs)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


@auto_docstring
class SiglipPreTrainedModel(PreTrainedModel):
    config: SiglipConfig
    base_model_prefix = "siglip"
    input_modalities = ("image", "text")
    supports_gradient_checkpointing = True

    _no_split_modules = [
        "SiglipTextEmbeddings",
        "SiglipEncoderLayer",
        "SiglipVisionEmbeddings",
        "SiglipMultiheadAttentionPoolingHead",
    ]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    _can_record_outputs = {
        "hidden_states": SiglipEncoderLayer,
        "attentions": SiglipAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, SiglipVisionEmbeddings):
            width = (
                self.config.vision_config.hidden_size
                if isinstance(self.config, SiglipConfig)
                else self.config.hidden_size
            )
            init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
            if hasattr(module, "position_ids"):
                module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, nn.Embedding):
            init.default_flax_embed_init_(module.weight)
        elif isinstance(module, SiglipAttention):
            init.xavier_uniform_(module.q_proj.weight)
            init.xavier_uniform_(module.k_proj.weight)
            init.xavier_uniform_(module.v_proj.weight)
            init.xavier_uniform_(module.out_proj.weight)
            init.zeros_(module.q_proj.bias)
            init.zeros_(module.k_proj.bias)
            init.zeros_(module.v_proj.bias)
            init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            init.xavier_uniform_(module.fc1.weight)
            init.xavier_uniform_(module.fc2.weight)
            init.normal_(module.fc1.bias, std=1e-6)
            init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
            init.xavier_uniform_(module.probe)
            init.xavier_uniform_(module.attention.in_proj_weight)
            init.zeros_(module.attention.in_proj_bias)
        elif isinstance(module, SiglipModel):
            init.zeros_(module.logit_scale)
            init.zeros_(module.logit_bias)
        elif isinstance(module, SiglipForImageClassification):
            init.normal_(
                module.classifier.weight,
                std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            init.lecun_normal_(module.weight)
            if module.bias is not None:
                init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
        elif isinstance(module, SiglipTextEmbeddings):
            module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    Args:
        config: SiglipConfig
    """

    def __init__(self, config: SiglipConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states, attention_mask, **kwargs)
        return BaseModelOutput(last_hidden_state=hidden_states)


class SiglipTextTransformer(SiglipPreTrainedModel):
    _input_embed_layer = "token_embedding"

    def __init__(self, config: SiglipTextConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = SiglipTextEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.head = nn.Linear(embed_dim, config.projection_size)
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        attention_mask = create_bidirectional_mask(
            config=self.config,
            input_embeds=hidden_states,
            attention_mask=attention_mask,
        )

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # Assuming "sticky" EOS tokenization, the last token is always EOS.
        pooled_output = last_hidden_state[:, -1, :]
        pooled_output = self.head(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


@auto_docstring(
    custom_intro="""
    The text model from SigLIP without any head or projection on top.
    """
)
class SiglipTextModel(SiglipPreTrainedModel):
    config: SiglipTextConfig
    input_modalities = "text"

    def __init__(self, config: SiglipTextConfig):
        super().__init__(config)
        self.text_model = SiglipTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @capture_outputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, SiglipTextModel

        >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )


class SiglipVisionTransformer(SiglipPreTrainedModel):
    _input_embed_layer = "patch_embedding"

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
        if self.use_head:
            self.head = SiglipMultiheadAttentionPoolingHead(config)

        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs: BaseModelOutput = self.encoder(inputs_embeds=hidden_states, **kwargs)

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
        )


class SiglipMultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling."""

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()

        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(self, hidden_state):
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        hidden_state = self.attention(probe, hidden_state, hidden_state)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]


@auto_docstring(
    custom_intro="""
    The vision model from SigLIP without any head or projection on top.
    """
)
class SiglipVisionModel(SiglipPreTrainedModel):
    config: SiglipVisionConfig
    main_input_name = "pixel_values"
    input_modalities = "image"

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.vision_model = SiglipVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @capture_outputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, SiglipVisionModel

        >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )


@auto_docstring
class SiglipModel(SiglipPreTrainedModel):
    config: SiglipConfig

    def __init__(self, config: SiglipConfig):
        super().__init__(config)

        if not isinstance(config.text_config, SiglipTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type SiglipTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, SiglipVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type SiglipVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        # First, initialize the text and vision models with the proper attention implementation
        text_model = SiglipTextModel._from_config(text_config)
        vision_model = SiglipVisionModel._from_config(vision_config)

        self.text_model = text_model.text_model
        self.vision_model = vision_model.vision_model

        self.logit_scale = nn.Parameter(torch.randn(1))
        self.logit_bias = nn.Parameter(torch.randn(1))

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def get_text_features(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )
        pooled_output = text_outputs.pooler_output
        return pooled_output

    @can_return_tuple
    @auto_docstring
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModel
        >>> from transformers.image_utils import load_image

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        pooled_output = vision_outputs.pooler_output
        return pooled_output

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        return_loss: bool | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> SiglipOutput:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )

        image_embeds = vision_outputs.pooler_output
        text_embeds = text_outputs.pooler_output

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))

        logit_scale, logit_bias = self.logit_scale.to(text_embeds.device), self.logit_bias.to(text_embeds.device)
        logits_per_text = logits_per_text * logit_scale.exp() + logit_bias

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            # Paired sigmoid loss: +1 targets on the diagonal, -1 elsewhere.
            eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
            m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
            loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
            nll = -torch.sum(loglik, dim=-1)
            loss = nll.mean()

        return SiglipOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


@auto_docstring(
    custom_intro="""
    SigLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    """
)
class SiglipForImageClassification(SiglipPreTrainedModel):
    main_input_name = "pixel_values"
    input_modalities = "image"

    def __init__(self, config: SiglipConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels

        # Create the vision model with the proper attention implementation
        vision_model = SiglipVisionModel._from_config(config.vision_config)
        self.vision_model = vision_model.vision_model

        # Classifier head
        self.classifier = (
            nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    def set_input_embeddings(self, value: nn.Module):
        self.vision_model.embeddings.patch_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, SiglipForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> # note: we are loading a `SiglipModel` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
        >>> model = SiglipForImageClassification.from_pretrained("google/siglip-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```"""
        outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, **kwargs
        )

        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens
        sequence_output = torch.mean(sequence_output, dim=1)
        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config)

        return ImageClassifierOutput(loss=loss, logits=logits)


__all__ = [
    "SiglipModel",
    "SiglipPreTrainedModel",
    "SiglipTextModel",
    "SiglipVisionModel",
    "SiglipForImageClassification",
]