o
    i)                     @   sj  U d dl mZmZmZ d dlmZmZ d dlmZm	Z	 d dl
Z
d dl
mZ d dlmZmZmZmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZmZ d dlmZ d dl m!Z!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8 d dl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? d dl@mAZA d dlBmCZCmDZD ddlEmFZFmGZGmHZH ddlImJZJ ddlKmLZLmMZM ddlNmOZOmPZPmQZQmRZRmSZSmTZT G dd deCZUddd d!ZVeWeXeQf eYd"< d#eXd$eQfd%d&ZZG d'd( d(e<Z[G d)d* d*e:e[ Z\G d+d, d,e;e[ Z]G d-d. d.eOe Z^G d/d0 d0ej_Z`G d1d2 d2ej_ZaG d3d4 d4ej_ZbG d5d6 d6ej_ZcG d7d8 d8ej_ZdG d9d: d:ej_ZeG d;d< d<ej_ZfG d=d> d>ej_ZgG d?d@ d@ej_ZhdAeXdBe
jidCe
jidDeWeXe
jif dEe'd$e
jifdFdGZjG dHdI dIej_ZkeJdJdKe.jle]e[e\dLG dMdN dNej_eGeHZmdS )O    )CallableIterableMapping)cached_propertypartial)	AnnotatedLiteralN)nn)BatchFeatureSiglipConfigSiglipProcessorSiglipTextConfigSiglipVisionConfig)
VllmConfig)BaseDummyOptions)divide$get_tensor_model_parallel_world_size)
get_act_fn)EncoderOnlyAttentionMMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)DispatchPooler)QuantizationConfig)VocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalInputsMultiModalKwargsItemsMultiModalUUIDDict)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptIndexTargetsPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModalSupportsQuant)default_pooling_type)AutoWeightsLoadermaybe_prefix)VisionEncoderInfoVisionFeatureSelectStrategyVisionFeatureSelectStrategyStrget_num_selected_vision_tokensis_vit_use_data_parallelresolve_visual_encoder_outputsc                   @   s:   e Zd ZU dZed ed< eeje	ddddf ed< d	S )
SiglipImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypebn   hwdataN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr0    rM   rM   W/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/siglip.pyr>   H   s   
  r>   fullclass)MEANALLCLS_POOLING_TYPE_TO_STRATEGYpooling_typereturnc                 C   s*   zt |  W S  ty   td| d w )Nz;No feature selection strategy is defined for pooling_type: )rT   KeyError
ValueError)rU   rM   rM   rN   #_get_vision_feature_select_strategy\   s   
rY   c                   @   sv   e Zd Zdd Zdd ZdefddZdeee	d	B f fd
dZ
de	de	de	fddZdefddZde	fddZd	S )SiglipProcessingInfoc                 C   s   | j tS N)ctxget_hf_configr   selfrM   rM   rN   r]   i      z"SiglipProcessingInfo.get_hf_configc                 C   s   t |  S r[   )SiglipEncoderInfor]   r^   rM   rM   rN   get_vision_encoder_infol   r`   z,SiglipProcessingInfo.get_vision_encoder_infokwargsc                 K   s   | j jtfi |S r[   )r\   get_hf_processorr   )r_   rc   rM   rM   rN   rd   o   s   z%SiglipProcessingInfo.get_hf_processorrV   Nc                 C   s   ddiS )Nimager1   rM   r^   rM   rM   rN   get_supported_mm_limitsr      z,SiglipProcessingInfo.get_supported_mm_limitsimage_widthimage_heightc                C   s8   |   }| jjj}|d usJ t|j||dt|jS Nrh   ri   )rb   r\   model_configpooler_configr;   get_num_image_tokensrY   seq_pooling_type)r_   rh   ri   vision_encoder_inform   rM   rM   rN   rn   u   s   
z)SiglipProcessingInfo.get_num_image_tokensc                 C   s    |   }|  }}t||dS )N)widthheight)rb   get_image_sizer&   )r_   rp   rq   rr   rM   rM   rN   !get_image_size_with_most_features   s   z6SiglipProcessingInfo.get_image_size_with_most_featuresc                 C   s   |   \}}| j||dS rj   )rt   rn   )r_   target_widthtarget_heightrM   rM   rN   get_max_image_tokens   s   z)SiglipProcessingInfo.get_max_image_tokens)rF   rG   rH   r]   rb   objectrd   r   strintrf   rn   r&   rt   rw   rM   rM   rM   rN   rZ   h   s    
rZ   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )SiglipDummyInputsBuilder	mm_countsrV   c                 C      dS N rM   )r_   r|   rM   rM   rN   get_dummy_text   s   z'SiglipDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )Nre   r   )rq   rr   
num_images	overrides)getinfort   _get_dummy_images)r_   r   r|   r   r   ru   rv   image_overridesrM   rM   rN   get_dummy_mm_data   s   z*SiglipDummyInputsBuilder.get_dummy_mm_datar[   )
rF   rG   rH   r   ry   rz   r   r   r    r   rM   rM   rM   rN   r{      s    
r{   c                       s   e Zd ZedefddZ	ddddeee B dede	ee
f d	e	ee
f dB d
edB def fddZdedede	ee
f d	e	ee
f def
ddZdede	ee
f de	eef fddZdede	ee
f dedee fddZ  ZS )SiglipMultiModalProcessorrV   c                    s*   | j   t fddt jD }|S )Nc                 3   s    | ]
}| j vr|V  qd S r[   )all_special_ids).0token_id	tokenizerrM   rN   	<genexpr>   s    
z;SiglipMultiModalProcessor.image_token_id.<locals>.<genexpr>)r   get_tokenizernextrange
vocab_size)r_   dummy_token_idrM   r   rN   image_token_id   s
   
z(SiglipMultiModalProcessor.image_token_idN)mm_uuidspromptmm_itemshf_processor_mm_kwargstokenization_kwargsr   c                   s>   |r|rt d|ri |pi ddi}t j|||||dS )Nz|Siglip accepts text-only or image-only inputs, not both! Image-only inputs means passing an image with an empty text prompt.add_special_tokensF)r   r   r   r   r   )rX   superapply)r_   r   r   r   r   r   	__class__rM   rN   r      s"   	zSiglipMultiModalProcessor.applyprompt_textc                 C   r}   )NFrM   )r_   r   r   r   r   rM   rM   rN   _hf_processor_applies_updates   s   z7SiglipMultiModalProcessor._hf_processor_applies_updates	hf_inputsc                 C   s   t tddS )Nre   )r?   )dictr!   batched)r_   r   r   rM   rM   rN   _get_mm_fields_config   s   z/SiglipMultiModalProcessor._get_mm_fields_configout_mm_kwargsc                    s0   j  dtf fdd}tdt |dgS )Nitem_idxc                    s4    dt}|| }jj|j|jd} g| S )Nre   rk   )	get_itemsr%   rs   r   rn   rq   rr   )r   images
image_sizenum_image_tokensr   r   r_   rM   rN   get_replacement   s   

zFSiglipMultiModalProcessor._get_prompt_updates.<locals>.get_replacementre   )modalitytargetreplacement)r   rz   r,   r+   start)r_   r   r   r   r   rM   r   rN   _get_prompt_updates   s   
z-SiglipMultiModalProcessor._get_prompt_updatesr[   )rF   rG   rH   r   rz   r   ry   listr'   r   rx   r$   r"   r   boolr   r
   r!   r   r#   r-   r   __classcell__rM   rM   r   rN   r      s\    

 


	



r   c                   @   sL   e Zd ZdededefddZdefddZdefdd	Zdefd
dZdS )ra   rh   ri   rV   c                C   s   |   d S )N   )get_patch_grid_length)r_   rh   ri   rM   rM   rN   rn     s   z&SiglipEncoderInfo.get_num_image_tokensc                 C      | j jS r[   )vision_configr   r^   rM   rM   rN   rs     rg   z SiglipEncoderInfo.get_image_sizec                 C   r   r[   )r   
patch_sizer^   rM   rM   rN   get_patch_size  rg   z SiglipEncoderInfo.get_patch_sizec                 C   s   |   |  }}|| S r[   )rs   r   )r_   r   r   rM   rM   rN   r     s   z'SiglipEncoderInfo.get_patch_grid_lengthN)rF   rG   rH   rz   rn   rs   r   r   rM   rM   rM   rN   ra     s    
ra   c                       s^   e Zd Zdef fddZdejdededejfdd	Z	
ddejde	dejfddZ
  ZS )SiglipVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _t|j| j| j| jdd| _	| j| j d | _
| j
| _t| j| j| _| jdtj| jtjdddd d S )	Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_idsdtyper1   F
persistent)r   __init__r   hidden_size	embed_dimr   r   r   num_channelspatch_embeddingnum_patchesnum_positionsr	   	Embeddingposition_embeddingregister_bufferrK   arangeint64expandr_   r   r   rM   rN   r     s(   

zSiglipVisionEmbeddings.__init__
embeddingsrr   rq   rV   c                 C   s   |j d }| jjj d }||kr||kr| | jS | jjd}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
Nr1   r   r   g      ?rB   r   bicubicF)sizemodealign_corners)shaper   weightr   	unsqueezer   rz   reshapepermuter	   
functionalinterpolateview)r_   r   rr   rq   r   r   patch_pos_embeddim
new_height	new_widthsqrt_num_positionsrM   rM   rN   interpolate_pos_encoding3  s*   



z/SiglipVisionEmbeddings.interpolate_pos_encodingFr?   r   c           	      C   sj   |j \}}}}| jjj}| |j|d}|ddd}|r+|| |||7 }|S || | j	7 }|S )Nr   r   r1   )
r   r   r   r   toflatten	transposer   r   r   )	r_   r?   r   _rr   rq   target_dtypepatch_embedsr   rM   rM   rN   forwardR  s   

zSiglipVisionEmbeddings.forward)F)rF   rG   rH   r   r   rK   rL   rz   r   r   r   r   rM   rM   r   rN   r     s&    
 r   c                       n   e Zd Z	ddddeeB dedB dedee ee	 B ddf
 fd	d
Z
dejdeejdf fddZ  ZS )SiglipAttentionNr   prefixr   quant_configr   attn_clsrV   c                   s  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	t
 }t| j| j| j|| d|d| _t| j| j|| d|d| _|rZd	nt | _t| j| j| _|tkr{|| j| j| j	| d
d| _d S || j| j| j	| d
d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      z	.qkv_proj)r   	head_sizetotal_num_headsr   r   
disable_tpz	.out_proj)
input_sizeoutput_sizer   r   r   r1   z.attnr   )r   r   r   r   r   num_attention_heads	num_headshead_dimrX   scaler<   r   qkv_projr   out_projr   tp_sizer   num_heads_per_partitionr   attn)r_   r   r   r   r   use_data_parallelr   rM   rN   r   d  s\   
		zSiglipAttention.__init__hidden_statesc           	      C   sF   |  |\}}|jddd\}}}| |||}| |\}}|dfS )z#Input shape: Batch x Time x ChannelrB   r   r   N)r  chunkr  r  )	r_   r	  
qkv_statesr   query_states
key_statesvalue_statesoutattn_outputrM   rM   rN   r     s
   zSiglipAttention.forwardr[   rF   rG   rH   r   r   r   ry   r@   r   r   r   rK   rL   tupler   r   rM   rM   r   rN   r   c  s(    =r   c                	       sR   e Zd Z		ddeeB dedB deddf fddZd	ej	dej	fd
dZ
  ZS )	SiglipMLPNr   r   r   r   rV   c                    s   t    || _t }t|j| _|r| dv rd}n|jd dko)|j	d dk}t
|j|j	|r3|nd | d|d| _t|j	|j|rF|nd | d|d| _d S )N)bitsandbytestorchaoT@   r   z.fc1)r   r   r   z.fc2)r   r   r   r<   r   
hidden_actactivation_fnget_namer   intermediate_sizer   fc1r   fc2)r_   r   r   r   r  quantizabler   rM   rN   r     s,   


zSiglipMLP.__init__r	  c                 C   s*   |  |\}}| |}| |\}}|S r[   )r  r  r  )r_   r	  r   rM   rM   rN   r     s   
zSiglipMLP.forwardr~   )rF   rG   rH   r   r   r   ry   r   rK   rL   r   r   rM   rM   r   rN   r    s    %r  c                       r   )SiglipEncoderLayerNr   r   r   r   r   r   rV   c                   sl   t    |j| _t||| d|d| _tj| j|jd| _	t
||| dd| _tj| j|jd| _d S )Nz
.self_attnr   r   r   eps.mlpr   r   )r   r   r   r   r   	self_attnr	   	LayerNormlayer_norm_epslayer_norm1r  mlplayer_norm2)r_   r   r   r   r   r   rM   rN   r     s   
zSiglipEncoderLayer.__init__r	  c                 C   sN   |}|  |}| j|d\}}||7 }|}| |}| |}||7 }|d fS )N)r	  )r(  r%  r*  r)  )r_   r	  residualr   rM   rM   rN   r     s   


zSiglipEncoderLayer.forwardr[   r  rM   rM   r   rN   r    s(    r  c                       s~   e Zd Z		ddddeeB dedB dedB dedee	 ee
 B d	df fd
dZdejded	ejeej B fddZ  ZS )SiglipEncoderNr   r   r   r   num_hidden_layers_overrider   r   rV   c                   sL   t    | _|d u rj}n|}t fddt|D | _d S )Nc                    s&   g | ]}t  d |  dqS )z.layers.r   )r  )r   	layer_idxr   r   r   r   rM   rN   
<listcomp>  s    z*SiglipEncoder.__init__.<locals>.<listcomp>)r   r   r   num_hidden_layersr	   
ModuleListr   layers)r_   r   r   r-  r   r   r1  r   r/  rN   r   	  s   
	
zSiglipEncoder.__init__inputs_embedsreturn_all_hidden_statesc                 C   s<   |g}|}| j D ]}||\}}|r|| q|r|S |S r[   )r3  append)r_   r4  r5  hidden_states_poolr	  encoder_layerr   rM   rM   rN   r   '  s   

zSiglipEncoder.forwardNN)rF   rG   rH   r   r   r   rz   ry   r@   r   r   r   rK   rL   r   r   r   r   rM   rM   r   rN   r,    s2    r,  c                
       s   e Zd Z	ddddededB deddf fdd	Zd
ejdejfddZ		dd
ejdB dejdejdB dejfddZ
deeeejf  dee fddZ  ZS )SiglipTextTransformerNr   r   r   r   r   rV   c                   s^   t    || _|j}t|| _t||| dtd| _t	j
||jd| _t	||j| _d S )N.encoder)r   r   r   r   r!  )r   r   r   r   SiglipTextEmbeddingsr   r,  r   encoderr	   r&  r'  final_layer_normLinearprojection_sizehead)r_   r   r   r   r   r   rM   rN   r   ;  s   

zSiglipTextTransformer.__init__	input_idsc                 C   s   | j |S r[   )r   token_embedding)r_   rB  rM   rM   rN   embed_input_idsS  r`   z%SiglipTextTransformer.embed_input_idsr   r4  c                 C   s*   |  |||}| j|dd}| |}|S )NFr4  r5  )r   r=  r>  )r_   rB  r   r4  r	  last_hidden_staterM   rM   rN   r   V  s   
zSiglipTextTransformer.forwardweightsc                 C   s   g d}t |  }t }|D ]9\}}|D ]\}}}	||vrq|||}|| }
|
j}||
||	  n|| }
t|
dt}||
| || q|S )N)r  q_projq)r  k_projk)r  v_projvweight_loader)r   named_parameterssetreplacerO  getattrr   add)r_   rG  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamrO  rM   rM   rN   load_weightsf  s"   
z"SiglipTextTransformer.load_weightsr[   )rF   rG   rH   r   r   ry   r   rK   rL   rD  r   r   r  rQ  r^  r   rM   rM   r   rN   r:  :  s2    
,r:  c                	       sR   e Zd ZdZ		ddededB deddf fdd	Zd
ej	dej	fddZ
  ZS )#SiglipMultiheadAttentionPoolingHeadzMultihead Attention Pooling.Nr   r   r   r   rV   c                    sh   t    ttdd|j| _tjj|j|j	dd| _
tj|j|jd| _t||| dd| _d S )Nr1   T)batch_firstr!  r#  r   r   r   )r   r   r	   	ParameterrK   randnr   probeMultiheadAttentionr   	attentionr&  r'  	layernormr  r)  )r_   r   r   r   r   rM   rN   r     s   

z,SiglipMultiheadAttentionPoolingHead.__init__hidden_statec                 C   sP   | d}| j|dd}| |||d }|}| |}| |}||7 }|S )Nr   r   )r   rd  r   rf  rg  r)  )r_   rh  
batch_sizerd  r+  rM   rM   rN   r     s   


z+SiglipMultiheadAttentionPoolingHead.forwardr~   )rF   rG   rH   rI   r   r   ry   r   rK   rL   r   r   rM   rM   r   rN   r_    s    r_  c                       s   e Zd Z	dddddddededB dedB dedB d	ed
edB ddf fddZe	dd Z
e	dd Zdddddejdedee dB dedB dejf
ddZdejdejfddZdeeeejf  dee fddZ  ZS ) SiglipVisionTransformerNr   Fr-  require_post_normr   use_headr   r   r-  rl  r   rm  rV   c          	         s   t    || _|j}t|| _t|||| dtd| _|j	}t
| jj|j	kr8td| dt
| jj d|d u rDt
| jj|k}|rPtj||jd| _nd | _t|tr\|| _nt|dscdn|j| _| jrtt||| d	d
nd | _t| j| _d S )Nr;  )r   r-  r   r   zThe original encoder only has z layers, but you requested z layers.r!  vision_use_headTz.headra  )r   r   r   r   r   r   r,  r   r=  r1  lenr3  rX   r	   r&  r'  post_layernorm
isinstancer   rm  hasattrrn  r_  rA  r   maybe_layer_norm_and_apply_headlast_hs_proc)	r_   r   r   r-  rl  r   rm  r   r1  r   rM   rN   r     sL   




	z SiglipVisionTransformer.__init__c                 C      t |  jS r[   )r   
parametersr   r^   rM   rM   rN   r        zSiglipVisionTransformer.dtypec                 C   ru  r[   )r   rv  devicer^   rM   rM   rN   rx    rw  zSiglipVisionTransformer.device)r   select_layersfeature_select_strategyr?   r   ry  rz  c                C   s>   | j ||d}| j||d ud}t|d || jj| j|d}|S )N)r   rE  )ry  max_possible_layersrt  rz  )r   r=  r=   r   r1  rt  )r_   r?   r   ry  rz  r	  encoder_outputsrM   rM   rN   r     s"   
	zSiglipVisionTransformer.forwardr|  c                 C   s,   | j dur
|  |}| jdur| |}|S )zApply the post layer norm and head if they are enabled,
        given the last hidden states tensor.

        args:
            encoder_outputs: The last hidden states from the visual encoder.
        N)rp  rA  )r_   r|  rM   rM   rN   rs    s
   
	


z7SiglipVisionTransformer.maybe_layer_norm_and_apply_headrG  c                 C   s   g d}t |  }t }t| jj}|D ]b\}}|dr$| jd u r$q| jd u r/|dr/q|drBt	|
dd }||krBq|D ]\}	}
}|
|vrNqD||
|	}|| }|j}||||  n|| }t|dt}||| || q|S )NrH  rp  rA  zencoder.layers.r   rO  )r   rP  rQ  ro  r=  r3  
startswithrp  rA  rz   splitrR  rO  rS  r   rT  )r_   rG  rU  rV  rW  layer_countrX  rY  r.  rZ  r[  r\  r]  rO  rM   rM   rN   r^  %  s4   

z$SiglipVisionTransformer.load_weightsr[   )rF   rG   rH   r   r   rz   r   ry   r   propertyr   rx  rK   rL   r   r9   r   rs  r   r  rQ  r^  r   rM   rM   r   rN   rj    s\    	A



#
,rj  c                       s   e Zd Z	dddddddededB dedB dedB d	ed
edB ddf fddZde	j
fddZedd Zedd Z			ddejdedee dB dedB dejf
ddZdeeeejf  dee fddZ  ZS )SiglipVisionModelNr   Frk  r   r   r-  rl  r   rm  rV   c                   s0   t    || _t||||| d|d| _d S )Nz.vision_model)r   r-  rl  r   rm  )r   r   r   rj  vision_model)r_   r   r   r-  rl  r   rm  r   rM   rN   r   Q  s   

zSiglipVisionModel.__init__c                 C   s
   | j jjS r[   )r  r   r   r^   rM   rM   rN   get_input_embeddingsg  s   
z&SiglipVisionModel.get_input_embeddingsc                 C   r   r[   )r  r   r^   rM   rM   rN   r   j     zSiglipVisionModel.dtypec                 C   r   r[   )r  rx  r^   rM   rM   rN   rx  n  r  zSiglipVisionModel.devicer?   r   ry  rz  c                 C   s   | j ||||dS )N)r?   r   ry  rz  )r  )r_   r?   r   ry  rz  rM   rM   rN   r   r  s   zSiglipVisionModel.forwardrG  c                 C   sZ  g d}t |  }t }t| jjj}|D ]\}}|dr&| jjd u r&q| jj	d u r2|dr2q|drEt
|dd }||krEq|drlt||}	|	d url|	|v rl||	 }
t|
dt}||
| ||	 q|D ]\}}}||vrxqn|||}|| }
|
j}||
||  n|| }
t||
||| j}
t|
dt}||
| || q|S )	NrH  zvision_model.post_layernormzvision_model.headzvision_model.encoder.layersr}  rB   )z.k_scalez.v_scalez.q_scalez.prob_scalerO  )r   rP  rQ  ro  r  r=  r3  r~  rp  rA  rz   r  endswithr   rS  r   rT  rR  rO  maybe_swap_ffn_paramr   )r_   rG  rU  rV  rW  r  rX  rY  r.  remapped_namer]  rO  rZ  r[  r\  rM   rM   rN   r^    sR   





zSiglipVisionModel.load_weightsr[   )FNN)rF   rG   rH   r   r   rz   r   ry   r   r	   Moduler  r  r   rx  rK   rL   r   r9   r   r   r  rQ  r^  r   rM   rM   r   rN   r  P  sT    	



,r  rX  r]  rY  rV  r   c           
      C   s   |r|  dkrd| vr|S t }t|dd}||| }||}d| v r7||kr7| dd}	||	 }|S d| v rI||krI| dd}	||	 }|S )Nggufz.fc
output_dimr   z.fc1.z.fc2.)r  r   rS  r   rR  )
rX  r]  rY  rV  r   r  r  r   weight_out_sizenew_namerM   rM   rN   r    s   
r  c                	       sP   e Zd Zdef fddZ	ddejdB dejdejdB dejfd	d
Z  ZS )r<  r   c                    sR   t    || _t|j|j| _t|j|j| _| j	dt
|jddd d S )Nr   r   Fr   )r   r   r   r   r   r   rC  max_position_embeddingsr   r   rK   r   r   r   r   rM   rN   r     s   

zSiglipTextEmbeddings.__init__NrB  r   r4  rV   c                 C   s(   |d u r	|  |}| |}|| }|S r[   )rC  r   )r_   rB  r   r4  position_embeddingsr   rM   rM   rN   r     s
   

zSiglipTextEmbeddings.forwardr[   )	rF   rG   rH   r   r   rK   rL   r   r   rM   rM   r   rN   r<    s    r<  rS   )ro   )r   dummy_inputsc                       s  e Zd ZdZdg diZededededB fdd	Zd
dde	def fddZ
	d5dejdB dejdejdB dejfddZdejdejdejfddZ	d5dejdedB dejfddZdededB fddZdedejfd d!Zdejd"eejgejf d#ejdB d$edejf
 fd%d&Z	d5dd'd(dejd)edB d#ejdB d$edejf
 fd*d+Zdedefd,d-Z		d6dejdB d.ejd/edB dejdB dedejfd0d1Zd2eeeejf  fd3d4Z  Z S )7SiglipEmbeddingModelTr  )rI  rK  rM  r   irV   Nc                 C   s   | drd S td)Nre   z Only image modality is supported)r~  rX   )clsr   r  rM   rM   rN   get_placeholder_str  s   
z(SiglipEmbeddingModel.get_placeholder_strr   r   vllm_configr   c                   s  t    |jj}|j}|| _t|drd|_|j}|j	}|j
| _|j
| _|j| _| | t||t|dd| _W d    n1 sDw   Y  | |d t||t|dd d| _W d    n1 sfw   Y  |jj}|d usuJ || _t|| _d| _d S )	N
num_labelsr   
text_modelr$  re   r  )r   r   rm  T)r   r   rl   	hf_configr   r   rr  r  text_configr   r   text_embed_dimvision_embed_dimr@  text_projection_size_mark_language_modelr:  r7   r  _mark_tower_modelrj  r  rm   r   for_embeddingpooler_is_text_input)r_   r  r   r   r   r  r   rm   r   rM   rN   r     s>   




zSiglipEmbeddingModel.__init__rB  r   r4  c                 C   s,   | j |||d}| j |}| ||}|S )N)rB  r   r4  )r  rA  _flip_sequences_by_position_ids)r_   rB  r   r4  rF  text_featuresrM   rM   rN   get_text_features6  s   z&SiglipEmbeddingModel.get_text_featuresfeaturesc                 C   s   t |dkr|S |dd |dd  }|dk}ttjdg|jdt|d d tjt |g|jdg}|dd |dd  }|dd }|dd }tjt ||jd|}	tjt ||jd}
||	 ||	  d |
 }|| S )zFlip sequences so EOS token moves to first position for CLS pooling.

        SigLIP position_ids are reversed within each sequence. This method detects
        sequence boundaries and flips each sequence individually.
        r1   Nr   r   )rx  )ro  rK   cattensorrx  wherer   repeat_interleave)r_   r  r   position_diffsboundary_maskboundary_indiceslengthsstartsendssequence_idscurrent_positionsflip_indicesrM   rM   rN   r  K  s*   
	
z4SiglipEmbeddingModel._flip_sequences_by_position_idsr?   rz  c                 C   s(   |d u r
t | jj}| j|d |d}|S )N)r?   ry  rz  )rY   rm   ro   r  )r_   r?   rz  pooled_outputrM   rM   rN   get_image_featurest  s   z'SiglipEmbeddingModel.get_image_featuresrc   c                 K   s:   | dd }|d u rd S | jjj }}td|||ddS )Nr?   )rC   rD   )r@   rE   resolve_bindings)popr   r   r   r>   )r_   rc   r?   
expected_h
expected_wrM   rM   rN   _parse_and_validate_image_input  s   z4SiglipEmbeddingModel._parse_and_validate_image_inputinputsc                 C   s   |d }|  |S )NrE   )r  )r_   r  r?   rM   rM   rN   _process_image_inputs  s   
z*SiglipEmbeddingModel._process_image_inputsrD  is_multimodalhandle_oov_mm_tokenc                   sl   t  j||||d}| j}|jd |k r+tj|||jd ||jd  gdd}|S |jd |kr4t|S )Nr  r  r1   r   r
  )r   _embed_text_input_idsr  r   rK   r  	new_emptyNotImplementedError)r_   rB  rD  r  r  r4  inputs_embeds_sizer   rM   rN   r    s*   	z*SiglipEmbeddingModel._embed_text_input_idsFr  multimodal_embeddingsc                   sF   |d u p	t |dk| _|d u s|d u rt |S t j||||dS )Nr   )r  r  r  )ro  r  r   rD  )r_   rB  r  r  r  r   rM   rN   rD    s   	z$SiglipEmbeddingModel.embed_input_idsc                 K   s*   | j di |}|d u rg S | |}|S )NrM   )r  r  )r_   rc   image_inputvision_embeddingsrM   rM   rN   embed_multimodal  s
   
z%SiglipEmbeddingModel.embed_multimodal	positionsintermediate_tensorsc                 K   sd   |d urt d| js|S | j}|jd |kr"|d d d |f }n	|jd |k r+t| |||S )Nz"PP is not supported for this modelr1   )RuntimeErrorr  r  r   r  r  )r_   rB  r  r  r4  rc   r   rM   rM   rN   r     s   zSiglipEmbeddingModel.forwardrG  c                 C   s   t | dgddgd}||S )Nz.position_idszlogit_scale.zlogit_bias.)skip_substrsignore_unexpected_prefixes)r6   r^  )r_   rG  loaderrM   rM   rN   r^    s   
z!SiglipEmbeddingModel.load_weightsr[   r9  )!rF   rG   rH   is_pooling_modelpacked_modules_mappingclassmethodry   rz   r  r   r   rK   rL   r  r  r9   r  rx   r>   r  r  r   r   r  r2   rD  r  r.   r   r   r  r^  r   rM   rM   r   rN   r    s    ,

,

&
$r  )ncollections.abcr   r   r   	functoolsr   r   typingr   r   rK   r	   transformersr
   r   r   r   r   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   r   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   !vllm.model_executor.layers.poolerr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.multimodalr   vllm.multimodal.inputsr    r!   r"   r#   r$   vllm.multimodal.parser%   r&   r'   vllm.multimodal.processingr(   r)   r*   r+   r,   r-   vllm.sequencer.   vllm.utils.tensor_schemar/   r0   
interfacesr2   r3   r4   interfaces_baser5   utilsr6   r7   visionr8   r9   r:   r;   r<   r=   r>   rT   r   ry   rJ   rY   rZ   r{   r   ra   r  r   r   r  r  r,  r:  r_  rj  r  rL   r  r<  register_processorr  rM   rM   rM   rN   <module>   s     

-VIK--2G( (q
#