import math
from dataclasses import dataclass
from typing import Any, Callable, Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...integrations import use_kernel_forward_from_hub
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    ModelOutput,
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    filter_out_non_signature_kwargs,
)
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig


@dataclass
@auto_docstring
class Aimv2Output(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2VisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputreturnc                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r"   r#   N)getattrto_tuple).0kself \/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/aimv2/modeling_aimv2.py	<genexpr>K   s
    
z'Aimv2Output.to_tuple.<locals>.<genexpr>)tuplekeysr)   r+   r)   r,   r&   J   s   zAimv2Output.to_tuple)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r    r!   r"   r   r#   r.   r   r&   r+   r+   r+   r,   r   ,   s   
 r   RMSNormc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	Aimv2RMSNormư>c                    s&   t    tt|| _|| _dS )z;
        Aimv2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parameterr4   onesweightvariance_epsilon)r*   hidden_sizeeps	__class__r+   r,   r;   S   s   

zAimv2RMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )N   T)keepdim)	dtypetor4   float32powmeanrsqrtr?   r>   )r*   hidden_statesinput_dtypevariancer+   r+   r,   forward[   s
   zAimv2RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r.   r>   shaper?   r)   r+   r+   r,   
extra_reprb   s   zAimv2RMSNorm.extra_repr)r9   )r0   r1   r2   r;   rP   rR   __classcell__r+   r+   rB   r,   r8   Q   s    r8   c                       s$   e Zd Z fddZdd Z  ZS )Aimv2MLPc                    sx   t    || _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _	tj| j| j|jd| _
t|j | _d S )Nbias)r:   r;   configr@   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnr*   rW   rB   r+   r,   r;   g   s   
zAimv2MLP.__init__c                 C   s$   |  | | || | }|S N)r]   r_   r[   r\   )r*   xr]   r+   r+   r,   rP   q   s    zAimv2MLP.forward)r0   r1   r2   r;   rP   rS   r+   r+   rB   r,   rT   f   s    
rT   c                       sX   e Zd Zdef fddZedddejfdejfdd	Z	d
ejdejfddZ
  ZS )Aimv2VisionEmbeddingsrW   c                    s   t    || _|j| _tj|j|j|j|jd| _t	|j|j
| _|j|j d }| jjs6t||j| _| jdt|ddd d S )N)kernel_sizestriderD   position_idsr   rE   F
persistent)r:   r;   rW   
patch_sizer   Conv2dnum_channelsr@   patch_embedr8   rms_norm_epsrms_norm
image_size	is_native	Embeddingposition_embeddingregister_bufferr4   arangeexpand)r*   rW   num_patchesrB   r+   r,   r;   w   s   
 zAimv2VisionEmbeddings.__init__   g     @cpur$   c                 C   s   t jt|||d}t jt| ||d}t j||dd\}}|d }t j|||d| }	d||	  }	| d |	d d d f  }
| d |	d d d f  }t j|
 |
 | | gddd d d d d f S )	NrG   devicexy)indexing   g      ?).Nr   dim)r4   ru   intmeshgridflattenconcatsincos)heightwidth	embed_dimtemperaturer{   rG   grid_wgrid_hpos_dimomegaout_hout_wr+   r+   r,   "build_2d_sincos_position_embedding   s   8z8Aimv2VisionEmbeddings.build_2d_sincos_position_embeddingpixel_valuesc                 C   s|   |  \}}}}| |ddd}| |}| jjr2| j|| j || j | jj	|j
|jd}n| | j}|| }|S )NrD   r   )r   r{   rG   )sizerm   r   	transposero   rW   rq   r   rj   r@   r{   rG   rs   rf   )r*   r   _r   r   rM   	pos_embedr+   r+   r,   rP      s   
zAimv2VisionEmbeddings.forward)r0   r1   r2   r   r;   staticmethodr4   rI   Tensorr   rP   rS   r+   r+   rB   r,   rc   v   s    
class Aimv2TextEmbeddings(nn.Module):
    def __init__(self, config: Aimv2TextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
class Aimv2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class Aimv2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config):
        super().__init__()
        self.attention = Aimv2Attention(config)
        self.ffn = Aimv2MLP(config)
        self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        norm_hidden_states = self.rms_norm1(hidden_states)
        attn_output, _ = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask, **kwargs)

        hidden_states = hidden_states + attn_output
        norm_hidden_states = self.rms_norm2(hidden_states)
        mlp_output = self.ffn(norm_hidden_states)

        hidden_states = hidden_states + mlp_output
        return hidden_states


class Aimv2Encoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Aimv2EncoderLayer`].

    Args:
        config: Aimv2Config
    """

    def __init__(self, config: Aimv2Config):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Aimv2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states, attention_mask, **kwargs)
        return BaseModelOutput(last_hidden_state=hidden_states)


class Aimv2AttentionPoolingHead(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
        self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, hidden_dim = hidden_states.shape
        cls_token = self.cls_token.expand(batch_size, -1, -1)

        key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads)

        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        attn_output = F.scaled_dot_product_attention(query, key, value)
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim)
        attn_output = attn_output.mean(dim=1)

        output = self.output_proj(attn_output)
        return output


@auto_docstring
class Aimv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    """

    config: Aimv2Config
    base_model_prefix = "aimv2"
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Aimv2EncoderLayer",
        "Aimv2AttentionPoolingHead",
        "Aimv2VisionEmbeddings",
        "Aimv2TextEmbeddings",
    ]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
                module.logit_scale.data.fill_(math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
            module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range)


@auto_docstring(
    custom_intro="""
    The Vision model from AIMv2 without any head or projection on top.
    """
)
class Aimv2VisionModel(Aimv2PreTrainedModel):
    config: Aimv2VisionConfig
    main_input_name = "pixel_values"
    _can_record_outputs = {
        "hidden_states": Aimv2EncoderLayer,
        "attentions": Aimv2Attention,
    }

    def __init__(self, config: Aimv2VisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2VisionEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.use_head = config.use_head
        if self.use_head:
            self.head = Aimv2AttentionPoolingHead(config)

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embed

    @deprecate_kwarg("tie_last_hidden_states", version="v4.58.0")
    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(inputs_embeds=hidden_states, **kwargs)

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.rms_norm(last_hidden_state)
        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
        )


@auto_docstring(
    custom_intro="""
    The text model from AIMv2 without any head or projection on top.
    """
)
class Aimv2TextModel(Aimv2PreTrainedModel):
    main_input_name = "input_ids"
    _can_record_outputs = {
        "hidden_states": Aimv2EncoderLayer,
        "attentions": Aimv2Attention,
    }

    def __init__(self, config: Aimv2TextConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2TextEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.eos_token_id = config.eos_token_id

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.embeddings.token_embedding = value

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        hidden_states = self.embeddings(input_ids)
        batch_size, seq_len, _ = hidden_states.shape

        cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device)
        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)

        if attention_mask is not None:
            attention_mask = create_causal_mask(
                config=self.config,
                input_embeds=hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=None,
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.rms_norm(last_hidden_state)

        # Pool by taking the hidden state at the position of each sequence's EOS token.
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id).int().argmax(dim=-1),
        ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
    """
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    """
    square_tensor = torch.pow(tensor, 2)
    sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
    normed_tensor = torch.pow(sum_tensor, 0.5)
    return normed_tensor


@auto_docstring
class Aimv2Model(Aimv2PreTrainedModel):
    config: Aimv2Config
    _no_split_modules = ["Aimv2TextEmbeddings", "Aimv2EncoderLayer", "Aimv2VisionEmbeddings"]
    _supports_flash_attn = True

    def __init__(self, config: Aimv2Config):
        super().__init__(config)

        self.projection_dim = config.projection_dim
        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size

        self.vision_model = Aimv2VisionModel._from_config(config.vision_config)
        self.text_model = Aimv2TextModel._from_config(config.text_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
        self.max_log_logit_scale = math.log(config.max_logit_scale)

        self.post_init()

    @filter_out_non_signature_kwargs()
    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        r"""
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`Aimv2TextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/aimv2-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)r   r   rf   )r(  r
        ```"""
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        pooled_output = text_outputs.pooler_output
        text_features = self.text_projection(pooled_output)

        return text_features

    @filter_out_non_signature_kwargs()
    @auto_docstring
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
    ) -> torch.FloatTensor:
        r"""
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`Aimv2VisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, Aimv2Model
        >>> from transformers.image_utils import load_image

        >>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/aimv2-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```)r   r1  )r'  r
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        pooled_output = vision_outputs.pooler_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Aimv2Output:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(pixel_values=pixel_values, **kwargs)
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # Normalize the features before computing the scaled cosine-similarity logits.
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp().to(text_embeds.device)
        logits_per_text = (logit_scale * text_embeds) @ image_embeds.t()
        logits_per_image = logits_per_text.t()

        return Aimv2Output(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = ["Aimv2VisionModel", "Aimv2Model", "Aimv2PreTrainedModel", "Aimv2TextModel"]