o
    -i[                     @   s  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ G dd dej%Z&G dd dej%Z'dej(dej(dej(dej(de)dede*ej(ej(f fddZ+G d d! d!ej%Z,G d"d# d#ej%Z-G d$d% d%ej%Z.G d&d' d'ej%Z/G d(d) d)ej%Z0G d*d+ d+ejj%Z1dS ),z\Implementation of SiglipVisionModel intended to be only used
within a vision language model.    )IterableN)nn)
functional)Siglip2VisionConfig)PretrainedConfig)divide$get_tensor_model_parallel_world_size)
get_act_fn)MMEncoderAttention)Conv2dLayer)ColumnParallelLinear
LinearBaseQKVParallelLinearReplicatedLinearRowParallelLinear)QuantizationConfig)ApplyRotaryEmb)default_weight_loader)current_platform   )is_vit_use_data_parallelc                       s@   e Zd Zddededdf fddZdedejfd	d
Z  Z	S )VisionRotaryEmbedding     @dimthetareturnNc                    s>   t    d|tjd|dtjd|   }| jd|dd d S )Ng      ?r      dtypeinv_freqF)
persistent)super__init__torcharangefloatregister_buffer)selfr   r   r   	__class__ d/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/siglip2navit.pyr"   $   s   
 zVisionRotaryEmbedding.__init__seqlenc                 C   s*   t j|| jj| jjd}t || j}|S )Ndevicer   )r#   r$   r   r.   r   outer)r'   r,   seqfreqsr*   r*   r+   forward)   s
   zVisionRotaryEmbedding.forward)r   )
__name__
__module____qualname__intr%   r"   r#   Tensorr2   __classcell__r*   r*   r(   r+   r   #   s    r   c                       sF   e Zd Zdef fddZ	d
dejdejdB dejfdd	Z	  Z
S )Siglip2VisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _|j| _|j| _|j	| _	| jdkrMt
|j| j | j | jdd| _| jrKt| jd | _t| j| j| _d S d S t|j| j| j| jdd| _| jry| j| j d | _| j| j | _t| j| j| _d S d S )Nr   F)
input_sizeoutput_sizereturn_biasg      ?valid)in_channelsout_channelskernel_sizestridepaddingr   )r!   r"   r:   hidden_size	embed_dim
patch_size
image_sizenum_patchespreserve_original_pehidden_strider   num_channelspatch_embeddingr6   position_embedding_sizer   	Embeddingposition_embeddingr   )r'   r:   r(   r*   r+   r"   2   s<   

z Siglip2VisionEmbeddings.__init__Npixel_values	grid_thwsr   c              	   C   s  | j jj}t| j tr|  |j|d}n&t| j tr;|d| jj	| jj
 | j| j}|  |j|d}|d| j}| jr|dusDJ t|}| jj| j| jdddddd}d}|D ]Y\}}	}
||	 |
 }tj||	|
fdd	d
}|ddddd|	|
 d}|d |d}|||	| j | j|
| j | jd}|dddddd|d}||||| < ||7 }qa|| }|S )aL  
        Args:
            pixel_values (`torch.FloatTensor`):
                Pixel values of shape (
                    num_patches,
                    num_channels * temporal_patch_size * patch_size * patch_size
                )
            grid_thws: (`torch.LongTensor`):
                grid shape (num_patches, 3)
        r   Nr      r   r   bicubicF)sizemodealign_corners      )rL   weightr   
isinstancer   tor   viewr:   rK   temporal_patch_sizerF   reshaperE   rI   r#   
zeros_likerO   rM   	unsqueezepermuteFinterpolaterepeatrJ   )r'   rP   rQ   target_dtypepatch_embedspos_embed_newpositional_embeddingscntthwvolumeper*   r*   r+   r2   T   sZ   



zSiglip2VisionEmbeddings.forwardN)r3   r4   r5   r   r"   r#   FloatTensor
LongTensorr7   r2   r8   r*   r*   r(   r+   r9   1   s    %r9   qkcossinis_flash_attn_backendapply_rotary_embr   c           	      C   sz   |j dddd  }|j dddd  }|r t r |j}n|r*t r*|j}n|j}|| ||}||||}||fS )Nr   rR   r   r   )chunk
contiguousr   is_cudaforward_cudais_rocmforward_hipforward_native)	rs   rt   ru   rv   rw   rx   apply_rotary_emb_funcq_embedk_embedr*   r*   r+   apply_rotary_pos_emb   s   r   c                       sz   e Zd ZdZ		ddededB def fddZ	dd	ej	d
ej	de
ej	ej	f dB de
ej	ej	dB f fddZ  ZS )Siglip2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperN r:   quant_configprefixc                    s  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _t }t| j| j| j|| d|d| _t| j| j|| d|d| _|r^d	nt | _t| j| j| _|j| _t| j| j| j	| d
d| _tddd| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      z	.qkv_proj)rD   	head_sizetotal_num_headsr   r   
disable_tpz	.out_proj)r;   r<   r   r   r   r   z.attn)	num_headsr   scaler   T)enforce_enableenable_fp32_compute)r!   r"   r:   rD   rE   num_attention_headsr   head_dim
ValueErrorr   attention_dropoutdropoutr   r   qkv_projr   out_projr   tp_sizer   num_heads_per_partitionuse_roper
   attnr   rx   r'   r:   r   r   use_data_parallelr(   r*   r+   r"      sZ   
	zSiglip2Attention.__init__hidden_states
cu_seqlensposition_embeddingsr   c                 C   s  |j \}}| |\}}|jddd\}}	}
||| j| j}|	|| j| j}	|
|| j| j}
| jrV|\}}t|d|	d||| j	j
| j\}}	|d}|	d}	|dd |dd   }| j	|d|	d|
d||d}||| j| j }| |\}}|S )z#Input shape: Batch x Time x ChannelrS   rR   ry   r   r   N)querykeyvaluer   
max_seqlen)shaper   rz   r]   r   r   r   r   ra   r   rw   rx   squeezemaxr_   r   )r'   r   r   r   
seq_lengthrE   
qkv_states_querieskeysvaluesru   rv   r   attn_outputr*   r*   r+   r2      s>   


zSiglip2Attention.forwardNr   rp   )r3   r4   r5   __doc__r   r   strr"   r#   r7   tupler2   r8   r*   r*   r(   r+   r      s*    ;r   c                       sJ   e Zd Z		ddededB def fddZdejd	ejfd
dZ	  Z
S )
Siglip2MLPNr   r:   r   r   c                    sb   t    || _t }t|j| _t|j|j	|| d|d| _
t|j	|j|| d|d| _d S )Nz.fc1)r   r   r   z.fc2)r!   r"   r:   r   r	   
hidden_actactivation_fnr   rD   intermediate_sizefc1r   fc2r   r(   r*   r+   r"     s$   
zSiglip2MLP.__init__r   r   c                 C   s*   |  |\}}| |}| |\}}|S rp   )r   r   r   )r'   r   r   r*   r*   r+   r2   1  s   
zSiglip2MLP.forwardr   )r3   r4   r5   r   r   r   r"   r#   r7   r2   r8   r*   r*   r(   r+   r     s    r   c                	       sZ   e Zd Z		ddededB def fddZdejd	ejd
ejde	ej
 fddZ  ZS )Siglip2EncoderLayerNr   r:   r   r   c                    sj   t    |j| _tj| j|jd| _t||| dd| _	tj| j|jd| _
t||| dd| _d S )Nepsz
.self_attnr   r   z.mlp)r!   r"   rD   rE   r   	LayerNormlayer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr'   r:   r   r   r(   r*   r+   r"   9  s   
zSiglip2EncoderLayer.__init__r   r   r   r   c                 C   sJ   |}|  |}| j|||d}|| }|}| |}| |}|| }|S )z
        Args:
            hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
            cu_seqlens: Cumulative sequence lengths tensor.
            position_embeddings: Position embeddings tensor.
        )r   r   r   )r   r   r   r   )r'   r   r   r   residualr*   r*   r+   r2   N  s   


zSiglip2EncoderLayer.forwardr   )r3   r4   r5   r   r   r   r"   r#   r7   r   rq   r2   r8   r*   r*   r(   r+   r   8  s&    r   c                       sd   e Zd ZdZ		ddededB def fddZd	d
 Zdd Z	de
jde
jde
jfddZ  ZS )Siglip2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers`
    self attention layers. Each layer is a [`Siglip2EncoderLayer`].

    Args:
        config: PretrainedConfig
    Nr   r:   r   r   c                    s   t     | _t fddt jD | _t j	 j
 d | _ j| _ j| _ j| _ j j | _ jd u rBd | _d S dd  jdD | _d S )Nc                    s$   g | ]}t   d | dqS )z.layers.r   )r   ).0idxr:   r   r   r*   r+   
<listcomp>}  s    z+Siglip2Encoder.__init__.<locals>.<listcomp>r   c                 S   s   g | ]}t |qS r*   )r6   )r   ir*   r*   r+   r     s    |)r!   r"   r:   r   
ModuleListrangenum_hidden_layerslayersr   rD   r   rotary_pos_embrF   rJ   window_sizespatial_merge_unitfullatt_block_indexessplitr   r(   r   r+   r"   t  s&   



zSiglip2Encoder.__init__c                 C   s  g }|D ]e\}}}t |dd|}||| j | j|| j | j}|dddd}| }t |d|d}||| j | j|| j | j}|dddd}| }|t j	||gdd
|d qt j|dd}|d d dd f  }| |}	|	| d}
|
S )Nr   rR   r   r   rS   ry   )r#   r$   ra   expandr_   rJ   rb   flattenappendstackre   catr   r   )r'   grid_thwpos_idsrk   rl   rm   hpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr   r*   r*   r+   rot_pos_emb  s4   "
zSiglip2Encoder.rot_pos_embc                 C   sj  g }dg}d}| j | j | j }|D ]\}}}|| j || j }	}
t||	 |
 ||	|
}||	|  }||
|  }|	| | }|
| | }t|d|d|fdd}||||||}|ddddd||| ||}|dk	ddgd}|d}||dk }|
||  |d| j |d  }||  |||	 |
  7 }qtj|dd	}||fS )
Nr   constantir   rS   r   rX   rR   ry   )r   rJ   rF   r#   r$   r_   rc   padrb   sumr   cumsumr   extendtolistitemr   )r'   r   window_indexcu_window_seqlenswindow_index_idvit_merger_window_sizegrid_tgrid_hgrid_w
llm_grid_h
llm_grid_windexpad_hpad_wnum_windows_hnum_windows_windex_paddedseqlens	index_newcu_seqlens_tmpr*   r*   r+   get_window_index  sP   
zSiglip2Encoder.get_window_indexinputs_embedsrQ   r   c                 C   s  |  |}| |\}}tj||jtj r|jntjd}t	|}|
 \}}||| j | jd}||ddddf }||d}||| j | jd}||ddddf }||d}tj||fdd}| | f}	t|dddf |dddf  |dddf jdtj r|jntjd}
t|
d|
g}
t|}|}t| jD ]\}}| jr|| jv r|
}n|}||||	}q||| j | jd}||ddf |d}|S )	a  
        Args:
            inputs_embeds: Input tensor of shape
                (batch_size, sequence_length, hidden_size).
                Embedded representation of the input tokens.
            grid_thws: Grid tensor of shape (num_patches, 3)
                containing grid dimensions.
                Whether or not to return a [`~utils.ModelOutput`] instead of
                a plain tuple.
        r-   rR   Nry   r   r   r   )r   r   )r   r   r#   tensorr.   jit
is_tracingr   int32unique_consecutiverU   r_   r   r   ru   rv   repeat_interleaver   	new_zerosargsort	enumerater   r   )r'   r   rQ   r   r   r   seq_lenr   embr   r   reverse_indicesr   r   blockr   r*   r*   r+   r2     sR   

,
zSiglip2Encoder.forwardr   )r3   r4   r5   r   r   r   r   r"   r   r   r#   r7   r2   r8   r*   r*   r(   r+   r   k  s(    "0r   c                       sP   e Zd Z		ddededB def fddZdejd	ej	d
ej
fddZ  ZS )Siglip2VisionTransformerNr   r:   r   r   c                    sL   t    || _|j}t|| _t||| dd| _tj	||j
d| _d S )Nz.encoderr   r   )r!   r"   r:   rD   r9   
embeddingsr   encoderr   r   r   post_layernorm)r'   r:   r   r   rE   r(   r*   r+   r"   *  s   

z!Siglip2VisionTransformer.__init__rP   rQ   r   c                 C   s&   |  ||}| ||}| |}|S )z
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width)
            of the input images.
        )r	  r
  r  )r'   rP   rQ   r   last_hidden_stater*   r*   r+   r2   <  s   

z Siglip2VisionTransformer.forwardr   )r3   r4   r5   r   r   r   r"   r#   rq   rr   r7   r2   r8   r*   r*   r(   r+   r  )  s"    r  c                       st   e Zd Z		ddededB def fddZdejd	ej	d
ej
fddZdeeeej
f  d
ee fddZ  ZS )Siglip2NavitModelNr   r:   r   r   c                    s$   t    t||| dd| _d S )Nz.vision_modelr   )r!   r"   r  vision_modelr   r(   r*   r+   r"   O  s   
zSiglip2NavitModel.__init__rP   rQ   r   c                 C   s   | j ||dS )N)rP   rQ   )r  )r'   rP   rQ   r*   r*   r+   r2   ]  s   zSiglip2NavitModel.forwardweightsc                 C   s   g d}t |  }t }|D ]9\}}|D ]\}}}	||vrq|||}|| }
|
j}||
||	  n|| }
t|
dt}||
| || q|S )N))r   q_projrs   )r   k_projrt   )r   v_projvweight_loader)dictnamed_parameterssetreplacer  getattrr   add)r'   r  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr  r*   r*   r+   load_weightsg  s"   
zSiglip2NavitModel.load_weightsr   )r3   r4   r5   r   r   r   r"   r#   rq   rr   r7   r2   r   r   r  r$  r8   r*   r*   r(   r+   r  N  s$    
,
r  )2r   collections.abcr   r#   r   torch.nnr   rc   transformersr    transformers.configuration_utilsr   vllm.distributedr   r   %vllm.model_executor.layers.activationr	   9vllm.model_executor.layers.attention.mm_encoder_attentionr
   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   r   r   'vllm.model_executor.layers.quantizationr   2vllm.model_executor.layers.rotary_embedding.commonr   -vllm.model_executor.model_loader.weight_utilsr   vllm.platformsr   visionr   Moduler   r9   r7   boolr   r   r   r   r   r   r  r  r*   r*   r*   r+   <module>   sR   f
h!3 ?%