o
    
۾iJ                     @   sF  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZmZ G dd dejZG dd dejZ G dd dejZ!eddgddedG dd dejZ"G dd dejZ#G dd dejZ$G dd dejjZ%dS ) z]Implementation of Siglip2VisionModel intended to be only used
within a vision language model.    )IterableN)nn)
functional)Siglip2VisionConfig)support_torch_compile)$get_tensor_model_parallel_world_size)
get_act_fn)MMEncoderAttention)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader   )is_vit_use_data_parallelresolve_visual_encoder_outputsshould_torch_compile_mm_vitc                	       sf   e Zd Zdef fddZdejdejdejfddZ	e
d	ejdejd
ee dejfddZ  ZS )Siglip2VisionEmbeddingsconfigc                    sn   t    || _|j| _|j| _tj|j| j | j | jd| _	|j
| _
t| j
d | _t| j
| j| _d S )N)in_featuresout_featuresg      ?)super__init__r   hidden_size	embed_dim
patch_sizer   Linearnum_channelspatch_embeddingnum_patchesintposition_embedding_size	Embeddingposition_embedding)selfr   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/lfm2_siglip2.pyr   !   s   
z Siglip2VisionEmbeddings.__init__pixel_values_packedspatial_shapesreturnc                 C   s   |j jdks
J d| dkr|jd dksJ |d }n|}|dddf |dddf  jtjd}| }tt	|}||jd krTt
d|jd  d	| d
| jjj}| |j|d}| jj| j| jd}	| j|	||d}
||
 }|dS )al  Embed patchified pixel values in packed (unpadded) form.

        Args:
            pixel_values_packed: (1, total_tokens, patch_dim) or
                (total_tokens, patch_dim), packed in tile order.
            spatial_shapes: (num_tiles, 2) on CPU (height, width) per tile.

        Returns:
            (1, total_tokens, embed_dim) packed embeddings.
        cpuzYExpected `spatial_shapes` on CPU to avoid device-to-host sync in variable-length packing.   r   r   N)dtypez?Packed pixel_values token count does not match spatial_shapes: z vs .)lengths_list)devicetypedimshapetotorchint64tolistr    sum
ValueErrorr   weightr.   r#   reshaper!   #resize_positional_embeddings_packed	unsqueeze)r$   r)   r*   pixel_values_flatlengthsr1   total_tokenstarget_dtypepatch_embedspositional_embeddingspacked_pos_embeds
embeddingsr'   r'   r(   forward.   s<   
*


zSiglip2VisionEmbeddings.forwardrE   r1   c                 C   s   |j jdksJ | jd }| j}tt|}tj||f| j |d}| ddd	d}|j jdkr7|
tj}d}t|D ]:\}	}
|
dkrFq=||	  \}}tj|||fddd	d
}|||| dd}|
|}|||||
 < ||
7 }q=|S )a  Resize positional embeddings per image and return a packed tensor.

        Args:
            positional_embeddings: (height, width, embed_dim) base grid.
            spatial_shapes: (batch_size, 2) on CPU, (height, width) per image.
            lengths_list: flattened token length per image (height * width).

        Returns:
            (total_tokens, embed_dim) packed positional embeddings, concatenated
            in the same order as `lengths_list`.
        r,   r0   )r2   r.      r   r   bilinearFT)sizemodealign_corners	antialias)r2   r3   r5   r.   r    r:   r7   emptypermuter?   r6   float32	enumerater9   Finterpolater=   	transpose)rE   r*   r1   r   source_dtyperB   rF   pos_4doffsetilengthheightwidthresizedr'   r'   r(   r>   `   s:   


z;Siglip2VisionEmbeddings.resize_positional_embeddings_packed)__name__
__module____qualname__r   r   r7   FloatTensor
LongTensorTensorrH   staticmethodlistr    r>   __classcell__r'   r'   r%   r(   r       s&    
2r   c                       s^   e Zd ZdZ		ddededB def fddZd	ej	d
ej	de
ej	B dej	fddZ  ZS )Siglip2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperN r   quant_configprefixc                    s  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _t }|r>dnt }| j| dksJJ | j| | _t| j| j| j|| d|d| _t| j| j|| d	|d
| _t| j| j| j	| dd| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r   r   z	.qkv_proj)r   	head_sizetotal_num_headsri   rj   
disable_tpz	.out_proj)
input_sizeoutput_sizeri   rj   rm   z.attn)	num_headsrk   scalerj   )r   r   r   r   r   num_attention_headsrp   head_dimr;   rq   attention_dropoutdropoutr   r   num_heads_per_partitionr   qkv_projr   out_projr	   attn)r$   r   ri   rj   use_data_paralleltp_sizer%   r'   r(   r      sP   
zSiglip2Attention.__init__hidden_states
cu_seqlens
max_seqlenr+   c                 C   s   |  |\}}|j\}}}|jddd\}}	}
|||| j| j}|	||| j| j}	|
||| j| j}
| j||	|
||d}|||d}| |\}}|S )Nr-   r0   )r4   )querykeyvaluer}   r~   )	rw   r5   chunkviewrv   rs   ry   r=   rx   )r$   r|   r}   r~   qkv_bszq_lenquery_states
key_statesvalue_statesoutattn_outputr'   r'   r(   rH      s0   zSiglip2Attention.forwardNrh   )r^   r_   r`   __doc__r   r   strr   r7   rc   r    rH   rf   r'   r'   r%   r(   rg      s(    /rg   c                       sJ   e Zd Z		ddededB def fddZdejd	ejfd
dZ	  Z
S )
Siglip2MLPNrh   r   ri   rj   c                    sb   t    || _t|j| _t }t|j|j	|| d|d| _
t|j	|j|| d|d| _d S )Nz.fc1)ri   rj   rm   z.fc2)r   r   r   r   
hidden_actactivation_fnr   r
   r   intermediate_sizefc1r   fc2)r$   r   ri   rj   rz   r%   r'   r(   r      s$   
zSiglip2MLP.__init__r|   r+   c                 C   s*   |  |\}}| |}| |\}}|S N)r   r   r   )r$   r|   r   r'   r'   r(   rH     s   
zSiglip2MLP.forwardr   )r^   r_   r`   r   r   r   r   r7   rc   rH   rf   r'   r'   r%   r(   r      s    r   )r|   r}   )dynamic_arg_dims	enable_ifc                       sZ   e Zd Z		ddededB def fddZdejd	ejd
e	ejB dejfddZ
  ZS )Siglip2EncoderLayerNrh   r   ri   rj   c                    sj   t    |j| _tj| j|jd| _t||| dd| _	tj| j|jd| _
t||| dd| _d S )Nepsz
.self_attn)ri   rj   z.mlp)r   r   r   r   r   	LayerNormlayer_norm_epslayer_norm1rg   	self_attnlayer_norm2r   mlp)r$   r   ri   rj   r%   r'   r(   r     s   
zSiglip2EncoderLayer.__init__r|   r}   r~   r+   c                 C   sJ   |}|  |}| j|||d}|| }|}| |}| |}|| }|S )z
        Args:
            hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
            cu_seqlens: Cumulative sequence lengths tensor.
            max_seqlen: Maximum sequence length.
        )r|   r}   r~   )r   r   r   r   )r$   r|   r}   r~   residualr'   r'   r(   rH   (  s   


zSiglip2EncoderLayer.forwardr   )r^   r_   r`   r   r   r   r   r7   rc   r    rH   rf   r'   r'   r%   r(   r     s&    r   c                       sz   e Zd ZdZ			ddededB dedB def fdd	Z	
dde	j
de	j
dee	j
B dede	j
ee	j
 B f
ddZ  ZS )Siglip2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers`
    self attention layers. Each layer is a [`Siglip2EncoderLayer`].

    Args:
        config: PretrainedConfig
    Nrh   r   ri   num_hidden_layers_overriderj   c                    sJ   t     | _|d u r j}n|}t fddt|D | _d S )Nc                    s$   g | ]}t   d | dqS )z.layers.)r   ri   rj   )r   ).0idxr   rj   ri   r'   r(   
<listcomp>^  s    z+Siglip2Encoder.__init__.<locals>.<listcomp>)r   r   r   num_hidden_layersr   
ModuleListrangelayers)r$   r   ri   r   rj   r   r%   r   r(   r   N  s   

zSiglip2Encoder.__init__Finputs_embedsr}   r~   return_all_hidden_statesr+   c                 C   s>   |g}|}| j D ]}||||d}|r|| q|r|S |S )N)r}   r~   )r   append)r$   r   r}   r~   r   hidden_states_poolr|   encoder_layerr'   r'   r(   rH   h  s   

zSiglip2Encoder.forward)NNrh   )F)r^   r_   r`   r   r   r   r    r   r   r7   rc   boolre   rH   rf   r'   r'   r%   r(   r   E  s4    r   c                       s   e Zd Z				ddededB dedB dedB def
 fdd	Zd
d Z		dde
jde
jde
jde
jdee dB de
jfddZ  ZS )Siglip2VisionTransformerNrh   r   ri   r   require_post_normrj   c           	         s   t    |j}|| _t|| _ddlm} |ddd t|||| dd| _	W d    n1 s3w   Y  |j
}t| j	j|j
krStd| d	t| j	j d
|d u r_t| j	j|k}|rltj||jd| _d S d | _d S )Nr   )set_model_tagr   T)
is_encoderz.encoder)ri   r   rj   zThe original encoder only has z layers, but you requested z layers.r   )r   r   r   r   r   rG   vllm.compilation.backendsr   r   encoderr   lenr   r;   r   r   r   post_layernorm)	r$   r   ri   r   r   rj   r   r   r   r%   r'   r(   r     s2   




z!Siglip2VisionTransformer.__init__c                 C   s   | j S r   )rG   )r$   r'   r'   r(   get_input_embeddings  s   z-Siglip2VisionTransformer.get_input_embeddingsr)   r*   r}   r~   select_layersr+   c                 C   s<   |  ||}| j||||dud}t|| j|| jjd}|S )a  
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width)
        of the input images.
        select_layers (`list[int]` or `None`, defaults to `None`):
            Layer indices to select hidden states from. Supports negative
            indices (e.g., -1 for last layer, -2 for second-to-last).
            If None, returns the last layer output.
        N)r   r}   r~   r   )r   max_possible_layers)rG   r   r   r   r   r   )r$   r)   r*   r}   r~   r   r|   encoder_outputsr'   r'   r(   rH     s   z Siglip2VisionTransformer.forwardNNNrh   r   )r^   r_   r`   r   r   r    r   r   r   r   r7   ra   rb   rc   re   rH   rf   r'   r'   r%   r(   r     s>    %	
r   c                       s   e Zd Z				ddededB dedB dedB def
 fdd	Z	dd
e	j
de	jde	jde	jdee dB de	jfddZdeeee	jf  dee fddZ  ZS )Siglip2ModelNrh   r   ri   r   r   rj   c                    s(   t    t||||| dd| _d S )Nz.vision_model)ri   r   r   rj   )r   r   r   vision_model)r$   r   ri   r   r   rj   r%   r'   r(   r     s   
zSiglip2Model.__init__r)   r*   r}   r~   r   r+   c                 C   s   | j |||||dS )af  Forward pass through the vision model.

        Args:
            select_layers: Layer indices to select hidden states from.
                Supports negative indices (e.g., [-2] for second-to-last).
                If None, returns the last layer output with post_layernorm.
                Multiple layers can be selected and will be concatenated.
        )r)   r*   r}   r~   r   )r   )r$   r)   r*   r}   r~   r   r'   r'   r(   rH     s   zSiglip2Model.forwardweightsc                 C   s   g d}t |  }t }t| jjj}|D ]X\}}|dr&| jjd u r&q|dr9t	|
dd }||kr9q|D ]\}	}
}|
|vrEq;||
|	}|| }|j}||||  n|| }t|dt}||| || q|S )N))rw   q_projq)rw   k_projk)rw   v_projvzvision_model.post_layernormzvision_model.encoder.layersr/   r-   weight_loader)dictnamed_parameterssetr   r   r   r   
startswithr   r    splitreplacer   getattrr   add)r$   r   stacked_params_mappingparams_dictloaded_paramslayer_countnameloaded_weight	layer_idx
param_nameweight_nameshard_idparamr   r'   r'   r(   load_weights  s4   

zSiglip2Model.load_weightsr   r   )r^   r_   r`   r   r   r    r   r   r   r7   ra   rb   rc   re   rH   r   tupler   r   rf   r'   r'   r%   r(   r     s>    

,r   )&r   collections.abcr   r7   r   torch.nnr   rS   transformersr   vllm.compilation.decoratorsr   vllm.distributedr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr	   !vllm.model_executor.layers.linearr
   r   r   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   visionr   r   r   Moduler   rg   r   r   r   r   r   r'   r'   r'   r(   <module>   s2   yT!3:M