import math
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

import torch
import torch.nn.functional as F
from torch import nn

from ... import initialization as init
from ...activations import ACT2FN
from ...integrations import use_kernel_forward_from_hub
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.generic import merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig


@dataclass
@auto_docstring
class Aimv2Output(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2VisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputreturnc                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r!   r"   N)getattrto_tuple).0kself f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/aimv2/modeling_aimv2.py	<genexpr>L   s
    
z'Aimv2Output.to_tuple.<locals>.<genexpr>)tuplekeysr(   r*   r(   r+   r%   K   s   zAimv2Output.to_tuple)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   r    r!   r   r"   r-   r   r%   r*   r*   r*   r+   r   -   s   
 r   RMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )Aimv2RMSNormư>epsr#   Nc                    s&   t    tt|| _|| _dS )z;
        Aimv2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parameterr3   onesweightvariance_epsilon)r)   hidden_sizer9   	__class__r*   r+   r;   T   s   

zAimv2RMSNorm.__init__hidden_statesc                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )N   T)keepdim)	dtypetor3   float32powmeanrsqrtr?   r>   )r)   rC   input_dtypevariancer*   r*   r+   forward\   s
   zAimv2RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r-   r>   shaper?   r(   r*   r*   r+   
extra_reprc   s   zAimv2RMSNorm.extra_repr)r8   )
r/   r0   r1   floatr;   r3   TensorrO   rQ   __classcell__r*   r*   rA   r+   r7   R   s    r7   c                       s$   e Zd Z fddZdd Z  ZS )Aimv2MLPc                    sx   t    || _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _	tj| j| j|jd| _
t|j | _d S )Nbias)r:   r;   configr@   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnr)   rX   rA   r*   r+   r;   h   s   
zAimv2MLP.__init__c                 C   s$   |  | | || | }|S N)r^   r`   r\   r]   )r)   xr^   r*   r*   r+   rO   r   s    zAimv2MLP.forward)r/   r0   r1   r;   rO   rT   r*   r*   rA   r+   rU   g   s    


class Aimv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.config = config
        self.patch_size = config.patch_size
        self.patch_embed = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
        )
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        num_patches = (config.image_size // config.patch_size) ** 2
        if not self.config.is_native:
            self.position_embedding = nn.Embedding(num_patches, config.hidden_size)
        self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)

    @staticmethod
    def build_2d_sincos_position_embedding(
        height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
    ) -> torch.Tensor:
        grid_w = torch.arange(int(width), dtype=dtype, device=device)
        grid_h = torch.arange(int(height), dtype=dtype, device=device)
        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="xy")

        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
        omega = 1.0 / temperature**omega

        out_w = grid_w.flatten()[..., None] * omega[None, :]
        out_h = grid_h.flatten()[..., None] * omega[None, :]

        return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        _, _, height, width = pixel_values.size()
        hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2)
        hidden_states = self.rms_norm(hidden_states)

        if self.config.is_native:
            pos_embed = self.build_2d_sincos_position_embedding(
                height // self.patch_size,
                width // self.patch_size,
                embed_dim=self.config.hidden_size,
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
        else:
            pos_embed = self.position_embedding(self.position_ids)

        hidden_states = hidden_states + pos_embed
        return hidden_states


class Aimv2TextEmbeddings(nn.Module):
    def __init__(self, config: Aimv2TextConfig):
        super().__init__()
        embed_dim = config.hidden_size
        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding})"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Aimv2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.attention_dropout,
            **kwargs,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class Aimv2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.attention = Aimv2Attention(config)
        self.ffn = Aimv2MLP(config)
        self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        # Pre-norm residual block: attention first, then the gated MLP.
        norm_hidden_states = self.rms_norm1(hidden_states)
        attn_output, _ = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask, **kwargs)

        hidden_states = hidden_states + attn_output
        norm_hidden_states = self.rms_norm2(hidden_states)
        mlp_output = self.ffn(norm_hidden_states)

        hidden_states = hidden_states + mlp_output
        return hidden_states


class Aimv2Encoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Aimv2EncoderLayer`].

    Args:
        config: Aimv2Config
    """

    def __init__(self, config: Aimv2Config):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Aimv2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states, attention_mask, **kwargs)

        return BaseModelOutput(last_hidden_state=hidden_states)


class Aimv2AttentionPoolingHead(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
        self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, hidden_dim = hidden_states.shape

        cls_token = self.cls_token.expand(batch_size, -1, -1)

        key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads)

        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        attn_output = F.scaled_dot_product_attention(query, key, value)
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim)
        attn_output = attn_output.mean(dim=1)

        output = self.output_proj(attn_output)
        return output


@auto_docstring
class Aimv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    """

    config: Aimv2Config
    base_model_prefix = "aimv2"
    input_modalities = ["image", "text"]
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Aimv2EncoderLayer",
        "Aimv2AttentionPoolingHead",
        "Aimv2VisionEmbeddings",
        "Aimv2TextEmbeddings",
    ]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True

    @torch.no_grad()
    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
                init.constant_(module.logit_scale, math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
            init.normal_(module.cls_token, mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, Aimv2VisionEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, Aimv2TextEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


@auto_docstring(
    custom_intro="""
    The Vision model from AIMv2 without any head or projection on top.
    """
)
class Aimv2VisionModel(Aimv2PreTrainedModel):
    config: Aimv2VisionConfig
    main_input_name = "pixel_values"
    _can_record_outputs = {
        "hidden_states": Aimv2EncoderLayer,
        "attentions": Aimv2Attention,
    }

    def __init__(self, config: Aimv2VisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2VisionEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.use_head = config.use_head
        if self.use_head:
            self.head = Aimv2AttentionPoolingHead(config)

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embed

    @merge_with_config_defaults
    @capture_outputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Aimv2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(inputs_embeds=hidden_states, **kwargs)

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.rms_norm(last_hidden_state)
        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
        )


@auto_docstring(
    custom_intro="""
    The text model from AIMv2 without any head or projection on top.
    """
)
class Aimv2TextModel(Aimv2PreTrainedModel):
    main_input_name = "input_ids"
    _can_record_outputs = {
        "hidden_states": Aimv2EncoderLayer,
        "attentions": Aimv2Attention,
    }

    def __init__(self, config: Aimv2TextConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2TextEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.eos_token_id = config.eos_token_id

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.embeddings.token_embedding = value

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        hidden_states = self.embeddings(input_ids)
        batch_size, seq_len, _ = hidden_states.shape

        cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device)
        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)

        if attention_mask is not None:
            attention_mask = create_causal_mask(
                config=self.config,
                input_embeds=hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=None,
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.rms_norm(last_hidden_state)

        # Pool on the eos token: take the hidden state at the first eos position of each sequence.
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
            .int()
            .argmax(dim=-1),
        ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
    """
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    """
    square_tensor = torch.pow(tensor, 2)
    sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
    normed_tensor = torch.pow(sum_tensor, 0.5)
    return normed_tensor


@auto_docstring
class Aimv2Model(Aimv2PreTrainedModel):
    config: Aimv2Config
    _no_split_modules = ["Aimv2TextEmbeddings", "Aimv2EncoderLayer", "Aimv2VisionEmbeddings"]

    def __init__(self, config: Aimv2Config):
        super().__init__(config)

        self.projection_dim = config.projection_dim
        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size

        self.vision_model = Aimv2VisionModel._from_config(config.vision_config)
        self.text_model = Aimv2TextModel._from_config(config.text_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
        self.max_log_logit_scale = math.log(config.max_logit_scale)

        self.post_init()

    @can_return_tuple
    @auto_docstring
    def get_text_features(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> tokenizer = AutoTokenizer.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            return_dict=True,
            **kwargs,
        )
        pooled_output = text_outputs.pooler_output
        text_outputs.pooler_output = self.text_projection(pooled_output)

        return text_outputs

    @can_return_tuple
    @auto_docstring
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, Aimv2Model
        >>> from transformers.image_utils import load_image

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
            **kwargs,
        )
        pooled_output = vision_outputs.pooler_output
        vision_outputs.pooler_output = self.visual_projection(pooled_output)

        return vision_outputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Aimv2Output:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```r   )r   r   r   )r   r   r   r    r!   r"   Nr*   )r)  r*  r  r+  r,  r!  r   clampr/  exprH   r|   tr   )r)   r   r   r   r   r4  r1  r    r   r   r   r   r*   r*   r+   rO     s8   !

zAimv2Model.forward)NN)Fr   )r/   r0   r1   r   r5   r   r   r;   r   r   r3   rS   r   r   r-   r   r2  r4   boolr5  r   r   rO   rT   r*   r*   rA   r+   r"  K  s`   
 "$r"  )r   r"  r   r  )r   )?r   collections.abcr   dataclassesr   typingr   r3   torch.nn.functionalr   r   r    r   r   activationsr   integrationsr	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   configuration_aimv2r   r   r   r   r  r7   rU   rd   r   rS   rR   r   r   r   r   r   r   r   r  r!  r"  __all__r*   r*   r*   r+   <module>   s|   #4/
="""IF &