o
    -i{                     @   sr  U d dl mZmZmZ d dlmZmZ d dlmZm	Z	 d dl
Z
d dl
mZ d dlmZmZmZmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4m5Z5 d dl6m7Z7m8Z8m9Z9 d dl:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ d dlAmBZB d dlCmDZDmEZE ddlFmGZGmHZHmIZI ddlJmKZK ddlLmMZMmNZN ddlOmPZPmQZQmRZRmSZSmTZTmUZU G dd deDZVd d d!d"ZWeXeYeRf eZd#< d$eYd%eRfd&d'Z[G d(d) d)e=Z\G d*d+ d+e;e\ Z]G d,d- d-e<e\ Z^G d.d/ d/ePe Z_G d0d1 d1ej`ZaG d2d3 d3ej`ZbG d4d5 d5ej`ZcG d6d7 d7ej`ZdG d8d9 d9ej`ZeG d:d; d;ej`ZfG d<d= d=ej`ZgG d>d? d?ej`ZhG d@dA dAej`ZidBeYdCe
jjdDe
jjdEeXeYe
jjf dFe(d%e
jjfdGdHZkG dIdJ dJej`ZleKdKdLe/jme^e\e]dMG dNdO dOej`eHeIZndS )P    )CallableIterableMapping)cached_propertypartial)	AnnotatedLiteralN)nn)BatchFeatureSiglipConfigSiglipProcessorSiglipTextConfigSiglipVisionConfig)
VllmConfig)BaseDummyOptions)divide$get_tensor_model_parallel_world_size)
get_act_fn)EncoderOnlyAttention)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)DispatchPooler)QuantizationConfig)VocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalInputsMultiModalKwargsItemsMultiModalUUIDDict)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptIndexTargetsPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModalSupportsQuant)default_pooling_type)AutoWeightsLoadermaybe_prefix)VisionEncoderInfoVisionFeatureSelectStrategyVisionFeatureSelectStrategyStrget_num_selected_vision_tokensis_vit_use_data_parallelresolve_visual_encoder_outputsc                   @   s:   e Zd ZU dZed ed< eeje	ddddf ed< d	S )
SiglipImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypebn   hwdataN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr0    rM   rM   ^/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/siglip.pyr>   H   s   
  r>   fullclass)MEANALLCLS_POOLING_TYPE_TO_STRATEGYpooling_typereturnc                 C   s*   zt |  W S  ty   td| d w )Nz;No feature selection strategy is defined for pooling_type: )rT   KeyError
ValueError)rU   rM   rM   rN   #_get_vision_feature_select_strategy\   s   
rY   c                   @   sv   e Zd Zdd Zdd ZdefddZdeee	d	B f fd
dZ
de	de	de	fddZdefddZde	fddZd	S )SiglipProcessingInfoc                 C   s   | j tS N)ctxget_hf_configr   selfrM   rM   rN   r]   i      z"SiglipProcessingInfo.get_hf_configc                 C   s   t |  S r[   )SiglipEncoderInfor]   r^   rM   rM   rN   get_vision_encoder_infol   r`   z,SiglipProcessingInfo.get_vision_encoder_infokwargsc                 K   s   | j jtfi |S r[   )r\   get_hf_processorr   )r_   rc   rM   rM   rN   rd   o   s   z%SiglipProcessingInfo.get_hf_processorrV   Nc                 C   s   ddiS )Nimager1   rM   r^   rM   rM   rN   get_supported_mm_limitsr      z,SiglipProcessingInfo.get_supported_mm_limitsimage_widthimage_heightc                C   s8   |   }| jjj}|d usJ t|j||dt|jS Nrh   ri   )rb   r\   model_configpooler_configr;   get_num_image_tokensrY   seq_pooling_type)r_   rh   ri   vision_encoder_inform   rM   rM   rN   rn   u   s   
z)SiglipProcessingInfo.get_num_image_tokensc                 C   s    |   }|  }}t||dS )N)widthheight)rb   get_image_sizer&   )r_   rp   rq   rr   rM   rM   rN   !get_image_size_with_most_features   s   z6SiglipProcessingInfo.get_image_size_with_most_featuresc                 C   s   |   \}}| j||dS rj   )rt   rn   )r_   target_widthtarget_heightrM   rM   rN   get_max_image_tokens   s   z)SiglipProcessingInfo.get_max_image_tokens)rF   rG   rH   r]   rb   objectrd   r   strintrf   rn   r&   rt   rw   rM   rM   rM   rN   rZ   h   s    
rZ   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )SiglipDummyInputsBuilder	mm_countsrV   c                 C      dS N rM   )r_   r|   rM   rM   rN   get_dummy_text   s   z'SiglipDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )Nre   r   )rq   rr   
num_images	overrides)getinfort   _get_dummy_images)r_   r   r|   r   r   ru   rv   image_overridesrM   rM   rN   get_dummy_mm_data   s   z*SiglipDummyInputsBuilder.get_dummy_mm_datar[   )
rF   rG   rH   r   ry   rz   r   r   r    r   rM   rM   rM   rN   r{      s    
r{   c                       s   e Zd ZedefddZ	ddddeee B dede	ee
f d	e	ee
f dB d
edB def fddZdedede	ee
f d	e	ee
f def
ddZdede	ee
f de	eef fddZdede	ee
f dedee fddZ  ZS )SiglipMultiModalProcessorrV   c                    s*   | j   t fddt jD }|S )Nc                 3   s    | ]
}| j vr|V  qd S r[   )all_special_ids).0token_id	tokenizerrM   rN   	<genexpr>   s    
z;SiglipMultiModalProcessor.image_token_id.<locals>.<genexpr>)r   get_tokenizernextrange
vocab_size)r_   dummy_token_idrM   r   rN   image_token_id   s
   
z(SiglipMultiModalProcessor.image_token_idN)mm_uuidspromptmm_datahf_processor_mm_kwargstokenization_kwargsr   c                   s>   |r|rt d|ri |pi ddi}t j|||||dS )Nz|Siglip accepts text-only or image-only inputs, not both! Image-only inputs means passing an image with an empty text prompt.add_special_tokensF)r   r   r   r   r   )rX   superapply)r_   r   r   r   r   r   	__class__rM   rN   r      s"   	zSiglipMultiModalProcessor.applyprompt_textmm_itemsc                 C   r}   )NFrM   )r_   r   r   r   r   rM   rM   rN   _hf_processor_applies_updates   s   z7SiglipMultiModalProcessor._hf_processor_applies_updates	hf_inputsc                 C   s   t tddS )Nre   )r?   )dictr!   batched)r_   r   r   rM   rM   rN   _get_mm_fields_config   s   z/SiglipMultiModalProcessor._get_mm_fields_configout_mm_kwargsc                    s0   j  dtf fdd}tdt |dgS )Nitem_idxc                    s4    dt}|| }jj|j|jd} g| S )Nre   rk   )	get_itemsr%   rs   r   rn   rq   rr   )r   images
image_sizenum_image_tokensr   r   r_   rM   rN   get_replacement   s   

zFSiglipMultiModalProcessor._get_prompt_updates.<locals>.get_replacementre   )modalitytargetreplacement)r   rz   r,   r+   start)r_   r   r   r   r   rM   r   rN   _get_prompt_updates   s   
z-SiglipMultiModalProcessor._get_prompt_updatesr[   )rF   rG   rH   r   rz   r   ry   listr    r   rx   r$   r"   r   r'   boolr   r
   r!   r   r#   r-   r   __classcell__rM   rM   r   rN   r      s\    

 


	



r   c                   @   sL   e Zd ZdededefddZdefddZdefdd	Zdefd
dZdS )ra   rh   ri   rV   c                C   s   |   d S )N   )get_patch_grid_length)r_   rh   ri   rM   rM   rN   rn     s   z&SiglipEncoderInfo.get_num_image_tokensc                 C      | j jS r[   )vision_configr   r^   rM   rM   rN   rs     rg   z SiglipEncoderInfo.get_image_sizec                 C   r   r[   )r   
patch_sizer^   rM   rM   rN   get_patch_size  rg   z SiglipEncoderInfo.get_patch_sizec                 C   s   |   |  }}|| S r[   )rs   r   )r_   r   r   rM   rM   rN   r     s   z'SiglipEncoderInfo.get_patch_grid_lengthN)rF   rG   rH   rz   rn   rs   r   r   rM   rM   rM   rN   ra     s    
ra   c                       s^   e Zd Zdef fddZdejdededejfdd	Z	
ddejde	dejfddZ
  ZS )SiglipVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _t|j| j| j| jdd| _	| j| j d | _
| j
| _t| j| j| _| jdtj| jtjdddd d S )	Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_idsdtyper1   F
persistent)r   __init__r   hidden_size	embed_dimr   r   r   num_channelspatch_embeddingnum_patchesnum_positionsr	   	Embeddingposition_embeddingregister_bufferrK   arangeint64expandr_   r   r   rM   rN   r     s(   

zSiglipVisionEmbeddings.__init__
embeddingsrr   rq   rV   c                 C   s   |j d }| jjj d }||kr||kr| | jS | jjd}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
Nr1   r   r   g      ?rB   r   bicubicF)sizemodealign_corners)shaper   weightr   	unsqueezer   rz   reshapepermuter	   
functionalinterpolateview)r_   r   rr   rq   r   r   patch_pos_embeddim
new_height	new_widthsqrt_num_positionsrM   rM   rN   interpolate_pos_encoding3  s*   



z/SiglipVisionEmbeddings.interpolate_pos_encodingFr?   r   c           	      C   sj   |j \}}}}| jjj}| |j|d}|ddd}|r+|| |||7 }|S || | j	7 }|S )Nr   r   r1   )
r   r   r   r   toflatten	transposer   r   r   )	r_   r?   r   _rr   rq   target_dtypepatch_embedsr   rM   rM   rN   forwardR  s   

zSiglipVisionEmbeddings.forward)F)rF   rG   rH   r   r   rK   rL   rz   r   r   r   r   rM   rM   r   rN   r     s&    
 r   c                       n   e Zd Z	ddddeeB dedB dedee ee	 B ddf
 fd	d
Z
dejdeejdf fddZ  ZS )SiglipAttentionNr   prefixr   quant_configr   attn_clsrV   c                   s  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	t
 }t| j| j| j|| d|d| _t| j| j|| d|d| _|rZd	nt | _t| j| j| _|tkr{|| j| j| j	| d
d| _d S || j| j| j	| d
d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      z	.qkv_proj)r   	head_sizetotal_num_headsr   r   
disable_tpz	.out_proj)
input_sizeoutput_sizer   r   r   r1   z.attnr   )r   r   r   r   r   num_attention_heads	num_headshead_dimrX   scaler<   r   qkv_projr   out_projr   tp_sizer   num_heads_per_partitionr   attn)r_   r   r   r   r   use_data_parallelr   rM   rN   r   d  s\   
		zSiglipAttention.__init__hidden_statesc           	      C   sF   |  |\}}|jddd\}}}| |||}| |\}}|dfS )z#Input shape: Batch x Time x ChannelrB   r   r   N)r  chunkr  r  )	r_   r
  
qkv_statesr   query_states
key_statesvalue_statesoutattn_outputrM   rM   rN   r     s
   zSiglipAttention.forwardr[   rF   rG   rH   r   r   r   ry   r@   r   r   r   rK   rL   tupler   r   rM   rM   r   rN   r   c  s(    =r   c                	       sR   e Zd Z		ddeeB dedB deddf fddZd	ej	dej	fd
dZ
  ZS )	SiglipMLPNr   r   r   r   rV   c                    s   t    || _t }t|j| _|r| dv rd}n|jd dko)|j	d dk}t
|j|j	|r3|nd | d|d| _t|j	|j|rF|nd | d|d| _d S )N)bitsandbytestorchaoT@   r   z.fc1)r   r   r   z.fc2)r   r   r   r<   r   
hidden_actactivation_fnget_namer   intermediate_sizer   fc1r   fc2)r_   r   r   r   r	  quantizabler   rM   rN   r     s,   


zSiglipMLP.__init__r
  c                 C   s*   |  |\}}| |}| |\}}|S r[   )r  r  r  )r_   r
  r   rM   rM   rN   r     s   
zSiglipMLP.forwardr~   )rF   rG   rH   r   r   r   ry   r   rK   rL   r   r   rM   rM   r   rN   r    s    %r  c                       r   )SiglipEncoderLayerNr   r   r   r   r   r   rV   c                   sl   t    |j| _t||| d|d| _tj| j|jd| _	t
||| dd| _tj| j|jd| _d S )Nz
.self_attnr   r   r   eps.mlpr   r   )r   r   r   r   r   	self_attnr	   	LayerNormlayer_norm_epslayer_norm1r  mlplayer_norm2)r_   r   r   r   r   r   rM   rN   r     s   
zSiglipEncoderLayer.__init__r
  c                 C   sN   |}|  |}| j|d\}}||7 }|}| |}| |}||7 }|d fS )N)r
  )r)  r&  r+  r*  )r_   r
  residualr   rM   rM   rN   r     s   


zSiglipEncoderLayer.forwardr[   r  rM   rM   r   rN   r     s(    r   c                       s~   e Zd Z		ddddeeB dedB dedB dedee	 ee
 B d	df fd
dZdejded	ejeej B fddZ  ZS )SiglipEncoderNr   r   r   r   num_hidden_layers_overrider   r   rV   c                   sL   t    | _|d u rj}n|}t fddt|D | _d S )Nc                    s&   g | ]}t  d |  dqS )z.layers.r!  )r   )r   	layer_idxr   r   r   r   rM   rN   
<listcomp>  s    z*SiglipEncoder.__init__.<locals>.<listcomp>)r   r   r   num_hidden_layersr	   
ModuleListr   layers)r_   r   r   r.  r   r   r2  r   r0  rN   r   	  s   
	
zSiglipEncoder.__init__inputs_embedsreturn_all_hidden_statesc                 C   s<   |g}|}| j D ]}||\}}|r|| q|r|S |S r[   )r4  append)r_   r5  r6  hidden_states_poolr
  encoder_layerr   rM   rM   rN   r   '  s   

zSiglipEncoder.forwardNN)rF   rG   rH   r   r   r   rz   ry   r@   r   r   r   rK   rL   r   r   r   r   rM   rM   r   rN   r-    s2    r-  c                
       s   e Zd Z	ddddededB deddf fdd	Zd
ejdejfddZ		dd
ejdB dejdejdB dejfddZ
deeeejf  dee fddZ  ZS )SiglipTextTransformerNr   r   r   r   r   rV   c                   s^   t    || _|j}t|| _t||| dtd| _t	j
||jd| _t	||j| _d S )N.encoder)r   r   r   r   r"  )r   r   r   r   SiglipTextEmbeddingsr   r-  r   encoderr	   r'  r(  final_layer_normLinearprojection_sizehead)r_   r   r   r   r   r   rM   rN   r   ;  s   

zSiglipTextTransformer.__init__	input_idsc                 C   s   | j |S r[   )r   token_embedding)r_   rC  rM   rM   rN   embed_input_idsS  r`   z%SiglipTextTransformer.embed_input_idsr   r5  c                 C   s*   |  |||}| j|dd}| |}|S )NFr5  r6  )r   r>  r?  )r_   rC  r   r5  r
  last_hidden_staterM   rM   rN   r   V  s   
zSiglipTextTransformer.forwardweightsc                 C   s   g d}t |  }t }|D ]9\}}|D ]\}}}	||vrq|||}|| }
|
j}||
||	  n|| }
t|
dt}||
| || q|S )N)r  q_projq)r  k_projk)r  v_projvweight_loader)r   named_parameterssetreplacerP  getattrr   add)r_   rH  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamrP  rM   rM   rN   load_weightsf  s"   
z"SiglipTextTransformer.load_weightsr[   )rF   rG   rH   r   r   ry   r   rK   rL   rE  r   r   r  rR  r_  r   rM   rM   r   rN   r;  :  s2    
,r;  c                	       sR   e Zd ZdZ		ddededB deddf fdd	Zd
ej	dej	fddZ
  ZS )#SiglipMultiheadAttentionPoolingHeadzMultihead Attention Pooling.Nr   r   r   r   rV   c                    sh   t    ttdd|j| _tjj|j|j	dd| _
tj|j|jd| _t||| dd| _d S )Nr1   T)batch_firstr"  r$  r   r   r   )r   r   r	   	ParameterrK   randnr   probeMultiheadAttentionr   	attentionr'  r(  	layernormr  r*  )r_   r   r   r   r   rM   rN   r     s   

z,SiglipMultiheadAttentionPoolingHead.__init__hidden_statec                 C   sP   | d}| j|dd}| |||d }|}| |}| |}||7 }|S )Nr   r   )r   re  r   rg  rh  r*  )r_   ri  
batch_sizere  r,  rM   rM   rN   r     s   


z+SiglipMultiheadAttentionPoolingHead.forwardr~   )rF   rG   rH   rI   r   r   ry   r   rK   rL   r   r   rM   rM   r   rN   r`    s    r`  c                       s   e Zd Z	dddddddededB dedB dedB d	ed
edB ddf fddZe	dd Z
e	dd Zdddddejdedee dB dedB dejf
ddZdejdejfddZdeeeejf  dee fddZ  ZS ) SiglipVisionTransformerNr   Fr.  require_post_normr   use_headr   r   r.  rm  r   rn  rV   c          	         s   t    || _|j}t|| _t|||| dtd| _|j	}t
| jj|j	kr8td| dt
| jj d|d u rDt
| jj|k}|rPtj||jd| _nd | _t|tr\|| _nt|dscdn|j| _| jrtt||| d	d
nd | _t| j| _d S )Nr<  )r   r.  r   r   zThe original encoder only has z layers, but you requested z layers.r"  vision_use_headTz.headrb  )r   r   r   r   r   r   r-  r   r>  r2  lenr4  rX   r	   r'  r(  post_layernorm
isinstancer   rn  hasattrro  r`  rB  r   maybe_layer_norm_and_apply_headlast_hs_proc)	r_   r   r   r.  rm  r   rn  r   r2  r   rM   rN   r     sL   




	z SiglipVisionTransformer.__init__c                 C      t |  jS r[   )r   
parametersr   r^   rM   rM   rN   r        zSiglipVisionTransformer.dtypec                 C   rv  r[   )r   rw  devicer^   rM   rM   rN   ry    rx  zSiglipVisionTransformer.device)r   select_layersfeature_select_strategyr?   r   rz  r{  c                C   s>   | j ||d}| j||d ud}t|d || jj| j|d}|S )N)r   rF  )rz  max_possible_layersru  r{  )r   r>  r=   r   r2  ru  )r_   r?   r   rz  r{  r
  encoder_outputsrM   rM   rN   r     s"   
	zSiglipVisionTransformer.forwardr}  c                 C   s,   | j dur
|  |}| jdur| |}|S )zApply the post layer norm and head if they are enabled,
        given the last hidden states tensor.

        args:
            encoder_outputs: The last hidden states from the visual encoder.
        N)rq  rB  )r_   r}  rM   rM   rN   rt    s
   
	


z7SiglipVisionTransformer.maybe_layer_norm_and_apply_headrH  c                 C   s   g d}t |  }t }t| jj}|D ]b\}}|dr$| jd u r$q| jd u r/|dr/q|drBt	|
dd }||krBq|D ]\}	}
}|
|vrNqD||
|	}|| }|j}||||  n|| }t|dt}||| || q|S )NrI  rq  rB  zencoder.layers.r   rP  )r   rQ  rR  rp  r>  r4  
startswithrq  rB  rz   splitrS  rP  rT  r   rU  )r_   rH  rV  rW  rX  layer_countrY  rZ  r/  r[  r\  r]  r^  rP  rM   rM   rN   r_  %  s4   

z$SiglipVisionTransformer.load_weightsr[   )rF   rG   rH   r   r   rz   r   ry   r   propertyr   ry  rK   rL   r   r9   r   rt  r   r  rR  r_  r   rM   rM   r   rN   rk    s\    	A



#
,rk  c                       s   e Zd Z	dddddddededB dedB dedB d	ed
edB ddf fddZde	j
fddZedd Zedd Z			ddejdedee dB dedB dejf
ddZdeeeejf  dee fddZ  ZS )SiglipVisionModelNr   Frl  r   r   r.  rm  r   rn  rV   c                   s0   t    || _t||||| d|d| _d S )Nz.vision_model)r   r.  rm  r   rn  )r   r   r   rk  vision_model)r_   r   r   r.  rm  r   rn  r   rM   rN   r   Q  s   

zSiglipVisionModel.__init__c                 C   s
   | j jjS r[   )r  r   r   r^   rM   rM   rN   get_input_embeddingsg  s   
z&SiglipVisionModel.get_input_embeddingsc                 C   r   r[   )r  r   r^   rM   rM   rN   r   j     zSiglipVisionModel.dtypec                 C   r   r[   )r  ry  r^   rM   rM   rN   ry  n  r  zSiglipVisionModel.devicer?   r   rz  r{  c                 C   s   | j ||||dS )N)r?   r   rz  r{  )r  )r_   r?   r   rz  r{  rM   rM   rN   r   r  s   zSiglipVisionModel.forwardrH  c                 C   sZ  g d}t |  }t }t| jjj}|D ]\}}|dr&| jjd u r&q| jj	d u r2|dr2q|drEt
|dd }||krEq|drlt||}	|	d url|	|v rl||	 }
t|
dt}||
| ||	 q|D ]\}}}||vrxqn|||}|| }
|
j}||
||  n|| }
t||
||| j}
t|
dt}||
| || q|S )	NrI  zvision_model.post_layernormzvision_model.headzvision_model.encoder.layersr~  rB   )z.k_scalez.v_scalez.q_scalez.prob_scalerP  )r   rQ  rR  rp  r  r>  r4  r  rq  rB  rz   r  endswithr   rT  r   rU  rS  rP  maybe_swap_ffn_paramr   )r_   rH  rV  rW  rX  r  rY  rZ  r/  remapped_namer^  rP  r[  r\  r]  rM   rM   rN   r_    sR   





zSiglipVisionModel.load_weightsr[   )FNN)rF   rG   rH   r   r   rz   r   ry   r   r	   Moduler  r  r   ry  rK   rL   r   r9   r   r   r  rR  r_  r   rM   rM   r   rN   r  P  sT    	



,r  rY  r^  rZ  rW  r   c           
      C   s   |r|  dkrd| vr|S t }t|dd}||| }||}d| v r7||kr7| dd}	||	 }|S d| v rI||krI| dd}	||	 }|S )Nggufz.fc
output_dimr   z.fc1.z.fc2.)r  r   rT  r   rS  )
rY  r^  rZ  rW  r   r  r  r   weight_out_sizenew_namerM   rM   rN   r    s   
r  c                	       sP   e Zd Zdef fddZ	ddejdB dejdejdB dejfd	d
Z  ZS )r=  r   c                    sR   t    || _t|j|j| _t|j|j| _| j	dt
|jddd d S )Nr   r   Fr   )r   r   r   r   r   r   rD  max_position_embeddingsr   r   rK   r   r   r   r   rM   rN   r     s   

zSiglipTextEmbeddings.__init__NrC  r   r5  rV   c                 C   s(   |d u r	|  |}| |}|| }|S r[   )rD  r   )r_   rC  r   r5  position_embeddingsr   rM   rM   rN   r     s
   

zSiglipTextEmbeddings.forwardr[   )	rF   rG   rH   r   r   rK   rL   r   r   rM   rM   r   rN   r=    s    r=  rS   )ro   )r   dummy_inputsc                       s  e Zd ZdZdg diZededededB fdd	Zd
dde	def fddZ
	d5dejdB dejdejdB dejfddZdejdejdejfddZ	d5dejdedB dejfddZdededB fddZdedejfd d!Zdejd"eejgejf d#ejdB d$edejf
 fd%d&Z	d5dd'd(dejd)edB d#ejdB d$edejf
 fd*d+Zdedefd,d-Z		d6dejdB d.ejd/edB dejdB dedejfd0d1Zd2eeeejf  fd3d4Z  Z S )7SiglipEmbeddingModelTr  )rJ  rL  rN  r   irV   Nc                 C   s   | drd S td)Nre   z Only image modality is supported)r  rX   )clsr   r  rM   rM   rN   get_placeholder_str  s   
z(SiglipEmbeddingModel.get_placeholder_strr   r   vllm_configr   c                   s  t    |jj}|j}|| _t|drd|_|j}|j	}|j
| _|j
| _|j| _| | t||t|dd| _W d    n1 sDw   Y  | |d t||t|dd d| _W d    n1 sfw   Y  |jj}|d usuJ || _t|| _d| _d S )	N
num_labelsr   
text_modelr%  re   r  )r   r   rn  T)r   r   rl   	hf_configr   r   rs  r  text_configr   r   text_embed_dimvision_embed_dimrA  text_projection_size_mark_language_modelr;  r7   r  _mark_tower_modelrk  r  rm   r   for_embeddingpooler_is_text_input)r_   r  r   r   r   r  r   rm   r   rM   rN   r     s>   




zSiglipEmbeddingModel.__init__rC  r   r5  c                 C   s,   | j |||d}| j |}| ||}|S )N)rC  r   r5  )r  rB  _flip_sequences_by_position_ids)r_   rC  r   r5  rG  text_featuresrM   rM   rN   get_text_features6  s   z&SiglipEmbeddingModel.get_text_featuresfeaturesc                 C   s   t |dkr|S |dd |dd  }|dk}ttjdg|jdt|d d tjt |g|jdg}|dd |dd  }|dd }|dd }tjt ||jd|}	tjt ||jd}
||	 ||	  d |
 }|| S )zFlip sequences so EOS token moves to first position for CLS pooling.

        SigLIP position_ids are reversed within each sequence. This method detects
        sequence boundaries and flips each sequence individually.
        r1   Nr   r   )ry  )rp  rK   cattensorry  wherer   repeat_interleave)r_   r  r   position_diffsboundary_maskboundary_indiceslengthsstartsendssequence_idscurrent_positionsflip_indicesrM   rM   rN   r  K  s*   
	
z4SiglipEmbeddingModel._flip_sequences_by_position_idsr?   r{  c                 C   s(   |d u r
t | jj}| j|d |d}|S )N)r?   rz  r{  )rY   rm   ro   r  )r_   r?   r{  pooled_outputrM   rM   rN   get_image_featurest  s   z'SiglipEmbeddingModel.get_image_featuresrc   c                 K   s:   | dd }|d u rd S | jjj }}td|||ddS )Nr?   )rC   rD   )r@   rE   resolve_bindings)popr   r   r   r>   )r_   rc   r?   
expected_h
expected_wrM   rM   rN   _parse_and_validate_image_input  s   z4SiglipEmbeddingModel._parse_and_validate_image_inputinputsc                 C   s   |d }|  |S )NrE   )r  )r_   r  r?   rM   rM   rN   _process_image_inputs  s   
z*SiglipEmbeddingModel._process_image_inputsrE  is_multimodalhandle_oov_mm_tokenc                   sl   t  j||||d}| j}|jd |k r+tj|||jd ||jd  gdd}|S |jd |kr4t|S )Nr  r  r1   r   r  )r   _embed_text_input_idsr  r   rK   r  	new_emptyNotImplementedError)r_   rC  rE  r  r  r5  inputs_embeds_sizer   rM   rN   r    s*   	z*SiglipEmbeddingModel._embed_text_input_idsFr  multimodal_embeddingsc                   sF   |d u p	t |dk| _|d u s|d u rt |S t j||||dS )Nr   )r  r  r  )rp  r  r   rE  )r_   rC  r  r  r  r   rM   rN   rE    s   	z$SiglipEmbeddingModel.embed_input_idsc                 K   s*   | j di |}|d u rg S | |}|S )NrM   )r  r  )r_   rc   image_inputvision_embeddingsrM   rM   rN   embed_multimodal  s
   
z%SiglipEmbeddingModel.embed_multimodal	positionsintermediate_tensorsc                 K   sd   |d urt d| js|S | j}|jd |kr"|d d d |f }n	|jd |k r+t| |||S )Nz"PP is not supported for this modelr1   )RuntimeErrorr  r  r   r  r  )r_   rC  r  r  r5  rc   r   rM   rM   rN   r     s   zSiglipEmbeddingModel.forwardrH  c                 C   s   t | dgddgd}||S )Nz.position_idszlogit_scale.zlogit_bias.)skip_substrsignore_unexpected_prefixes)r6   r_  )r_   rH  loaderrM   rM   rN   r_    s   
z!SiglipEmbeddingModel.load_weightsr[   r:  )!rF   rG   rH   is_pooling_modelpacked_modules_mappingclassmethodry   rz   r  r   r   rK   rL   r  r  r9   r  rx   r>   r  r  r   r   r  r2   rE  r  r.   r   r   r  r_  r   rM   rM   r   rN   r    s    ,

,

&
$r  )ocollections.abcr   r   r   	functoolsr   r   typingr   r   rK   r	   transformersr
   r   r   r   r   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   %vllm.model_executor.layers.activationr   ;vllm.model_executor.layers.attention.encoder_only_attentionr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   !vllm.model_executor.layers.poolerr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.multimodalr   vllm.multimodal.inputsr    r!   r"   r#   r$   vllm.multimodal.parser%   r&   r'   vllm.multimodal.processingr(   r)   r*   r+   r,   r-   vllm.sequencer.   vllm.utils.tensor_schemar/   r0   
interfacesr2   r3   r4   interfaces_baser5   utilsr6   r7   visionr8   r9   r:   r;   r<   r=   r>   rT   r   ry   rJ   rY   rZ   r{   r   ra   r  r   r   r  r   r-  r;  r`  rk  r  rL   r  r=  register_processorr  rM   rM   rM   rN   <module>   s     

-VIK--2G( (q
#