o
    i˄                     @   s(  U d dl mZmZmZmZ d dlmZ d dlmZm	Z	 d dl
Z
d dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZ d dlm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8m9Z9m:Z: d dl;m<Z<m=Z=m>Z>m?Z?m@Z@mAZA d dlBmCZC d dlDmEZEmFZF ddlGmHZHmIZI ddlJmKZK ddlLmMZMmNZN ddlOmPZPmQZQmRZRmSZSmTZTmUZU G dd deEZVG dd  d ePe ZWd!d!d"d"d#ZXeYeZeRf e[d$< d%eZfd&d'Z\G d(d) d)e>Z]G d*d+ d+e<e] Z^G d,d- d-e=e] Z_G d.d/ d/ej`ZaG d0d1 d1ej`ZbG d2d3 d3ej`ZcG d4d5 d5ej`ZdG d6d7 d7ej`ZeG d8d9 d9ej`ZfG d:d; d;ej`ZgG d<d= d=ej`ZhG d>d? d?ej`ZieKd@dAe0jje_e]e^dBG dCdD dDej`eIe.ZkdS )E    )CallableIterableMappingSequence)cached_property)	AnnotatedLiteralN)BatchFeature
CLIPConfigCLIPProcessorCLIPTextConfigCLIPVisionConfig)
VllmConfig)BaseDummyOptions)divide$get_tensor_model_parallel_world_size)
get_act_fn)	AttentionMMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)DispatchPooler)QuantizationConfig)VocabParallelEmbedding)default_weight_loader)SupportsQuant)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalInputsMultiModalKwargsItemsMultiModalUUIDDict)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptIndexTargetsPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal)default_pooling_type)AutoWeightsLoadermaybe_prefix)VisionEncoderInfoVisionFeatureSelectStrategyVisionFeatureSelectStrategyStrget_num_selected_vision_tokensis_vit_use_data_parallelresolve_visual_encoder_outputsc                   @   s:   e Zd ZU dZed ed< eeje	ddddf ed< d	S )
CLIPImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypebn   hwdataN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr/    rK   rK   U/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/clip.pyr<   B   s   
  r<   c                   @   sL   e Zd ZdededefddZdefddZdefdd	Zdefd
dZdS )CLIPEncoderInfoimage_widthimage_heightreturnc                C   s   |   d d S )N   r0   )get_patch_grid_length)selfrN   rO   rK   rK   rL   get_num_image_tokensP   s   z$CLIPEncoderInfo.get_num_image_tokensc                 C      | j jS N)vision_config
image_sizerS   rK   rK   rL   get_image_sizeX      zCLIPEncoderInfo.get_image_sizec                 C   rU   rV   )rW   
patch_sizerY   rK   rK   rL   get_patch_size[   r[   zCLIPEncoderInfo.get_patch_sizec                 C   s*   |   |  }}|| dksJ || S Nr   )rZ   r]   )rS   rX   r\   rK   rK   rL   rR   ^   s   z%CLIPEncoderInfo.get_patch_grid_lengthN)rD   rE   rF   intrT   rZ   r]   rR   rK   rK   rK   rL   rM   O   s    
rM   fullclass)MEANALLCLSLAST_POOLING_TYPE_TO_STRATEGYpooling_typec                 C   s*   zt |  W S  ty   td| d w )Nz;No feature selection strategy is defined for pooling_type: )rf   KeyError
ValueError)rg   rK   rK   rL   #_get_vision_feature_select_strategym   s   
rj   c                   @   sv   e Zd Zdd Zdd ZdefddZdeee	d	B f fd
dZ
de	de	de	fddZdefddZde	fddZd	S )CLIPProcessingInfoc                 C   s   | j tS rV   )ctxget_hf_configr
   rY   rK   rK   rL   rm   x      z CLIPProcessingInfo.get_hf_configc                 C   s   t |  S rV   )rM   rm   rY   rK   rK   rL   get_vision_encoder_info{   rn   z*CLIPProcessingInfo.get_vision_encoder_infokwargsc                 K   s   | j jtfi |S rV   )rl   get_hf_processorr   )rS   rp   rK   rK   rL   rq   ~   s   z#CLIPProcessingInfo.get_hf_processorrP   Nc                 C   s   ddiS )Nimager0   rK   rY   rK   rK   rL   get_supported_mm_limits   r[   z*CLIPProcessingInfo.get_supported_mm_limitsrN   rO   c                C   s8   |   }| jjj}|d usJ t|j||dt|jS NrN   rO   )ro   rl   model_configpooler_configr9   rT   rj   seq_pooling_type)rS   rN   rO   vision_encoder_inforw   rK   rK   rL   rT      s   
z'CLIPProcessingInfo.get_num_image_tokensc                 C   s    |   }|  }}t||dS )N)widthheight)ro   rZ   r%   )rS   ry   rz   r{   rK   rK   rL   !get_image_size_with_most_features   s   z4CLIPProcessingInfo.get_image_size_with_most_featuresc                 C   s   |   \}}| j||dS rt   )r|   rT   )rS   target_widthtarget_heightrK   rK   rL   get_max_image_tokens   s
   z'CLIPProcessingInfo.get_max_image_tokens)rD   rE   rF   rm   ro   objectrq   r   strr_   rs   rT   r%   r|   r   rK   rK   rK   rL   rk   w   s    
rk   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )CLIPDummyInputsBuilder	mm_countsrP   c                 C      dS N rK   )rS   r   rK   rK   rL   get_dummy_text   s   z%CLIPDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )Nrr   r   )rz   r{   
num_images	overrides)getinfor|   _get_dummy_images)rS   r   r   r   r   r}   r~   image_overridesrK   rK   rL   get_dummy_mm_data   s   z(CLIPDummyInputsBuilder.get_dummy_mm_datarV   )
rD   rE   rF   r   r   r_   r   r   r   r   rK   rK   rK   rL   r      s    
r   c                       s   e Zd ZedefddZ	ddddeee B dede	ee
f d	e	ee
f dB d
edB def fddZdedede	ee
f d	e	ee
f def
ddZdede	ee
f de	eef fddZdede	ee
f dedee fddZ  ZS )CLIPMultiModalProcessorrP   c                 C   s    | j  }d}||jvsJ |S r^   )r   get_tokenizerall_special_ids)rS   	tokenizerdummy_token_idrK   rK   rL   image_token_id   s   
z&CLIPMultiModalProcessor.image_token_idN)mm_uuidspromptmm_itemshf_processor_mm_kwargstokenization_kwargsr   c                   s>   |r|rt d|ri |pi ddi}t j|||||dS )NzzCLIP accepts text-only or image-only inputs, not both! Image-only inputs means passing an image with an empty text prompt.add_special_tokensF)r   r   r   r   r   )ri   superapply)rS   r   r   r   r   r   	__class__rK   rL   r      s"   	zCLIPMultiModalProcessor.applyprompt_textc                 C   r   )NFrK   )rS   r   r   r   r   rK   rK   rL   _hf_processor_applies_updates   s   z5CLIPMultiModalProcessor._hf_processor_applies_updates	hf_inputsc                 C   s   t tddS )Nrr   )r=   )dictr    batched)rS   r   r   rK   rK   rL   _get_mm_fields_config   s   z-CLIPMultiModalProcessor._get_mm_fields_configout_mm_kwargsc                    s0   j  dtf fdd}tdt |dgS )Nitem_idxc                    s4    dt}|| }jj|j|jd} g| S )Nrr   ru   )	get_itemsr$   rZ   r   rT   rz   r{   )r   imagesrX   num_image_tokensr   r   rS   rK   rL   get_replacement  s   

zDCLIPMultiModalProcessor._get_prompt_updates.<locals>.get_replacementrr   )modalitytargetreplacement)r   r_   r+   r*   start)rS   r   r   r   r   rK   r   rL   _get_prompt_updates   s   z+CLIPMultiModalProcessor._get_prompt_updatesrV   )rD   rE   rF   r   r_   r   r   listr&   r   r   r#   r!   r   boolr   r	   r    r   r"   r   r,   r   __classcell__rK   rK   r   rL   r      s\    

 


	



r   c                	       sP   e Zd Zdef fddZ	ddejdB dejdejdB dejfd	d
Z  ZS )CLIPTextEmbeddingsconfigc                    s0   t    |j}t|j|| _t|j|| _d S rV   )r   __init__hidden_sizer   
vocab_sizetoken_embeddingmax_position_embeddingsposition_embedding)rS   r   	embed_dimr   rK   rL   r     s   

zCLIPTextEmbeddings.__init__N	input_idsposition_idsinputs_embedsrP   c                 C   s8   |d u r|d u rt d| |}| |}|| }|S )Nz5Either `input_ids` or `input_embeds` must be provided)ri   r   r   )rS   r   r   r   position_embeddings
embeddingsrK   rK   rL   forward   s   

zCLIPTextEmbeddings.forwardrV   )	rD   rE   rF   r   r   rI   rJ   r   r   rK   rK   r   rL   r     s    r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )CLIPVisionEmbeddingsr   c                    s   t    || _|j| _|j| _|j| _| j| j dksJ tt	
| j| _t|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )	Nr   F)in_channelsout_channelskernel_sizestridebiasrQ   r0   r   )r0   )
persistent)r   r   r   r   r   rX   r\   nn	ParameterrI   randnclass_embeddingr   num_channelspatch_embeddingnum_patchesnum_positions	Embeddingr   register_bufferarangeexpand)rS   r   r   rK   rL   r   5  s,   

zCLIPVisionEmbeddings.__init__r=   rP   c                 C   sn   |j d }| jjj}| |j|d}|ddd}| j|dd}t	j
||gdd}|| | j }|S )Nr   )dtyperQ   r0   r   dim)shaper   weightr   toflatten	transposer   r   rI   catr   r   )rS   r=   
batch_sizetarget_dtypepatch_embedsclass_embedsr   rK   rK   rL   r   P  s   


zCLIPVisionEmbeddings.forward)	rD   rE   rF   r   r   rI   rJ   r   r   rK   rK   r   rL   r   4  s    r   c                       s`   e Zd Z	ddddeeB dedB dedee ee	 B ddf
 fd	d
Z
dejfddZ  ZS )CLIPAttentionNr   prefixr   quant_configr   attn_clsrP   c                   s  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	t
 }t| j| j| j|| d|d| _t| j| j|| d|d| _|rZd	nt | _t| j| j| _|tkr{|| j| j| j	| d
d| _d S || j| j| j	| d
d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      z	.qkv_proj)r   	head_sizetotal_num_headsr   r   
disable_tpz	.out_proj)
input_sizeoutput_sizer   r   r   r0   z.attnr   )r   r   r   r   r   num_attention_heads	num_headshead_dimri   scaler:   r   qkv_projr   out_projr   tp_sizer   num_heads_per_partitionr   attn)rS   r   r   r   r   use_data_parallelr   rK   rL   r   `  s\   
		zCLIPAttention.__init__hidden_statesc           	      C   sF   |  |\}}|jddd\}}}| |||}| |\}}|dfS )z#Input shape: Batch x Time x Channelr@   r   r   N)r   chunkr   r   )	rS   r   
qkv_states_query_states
key_statesvalue_statesoutattn_outputrK   rK   rL   r     s
   zCLIPAttention.forwardrV   rD   rE   rF   r   r   r   r   r>   r   r   r   rI   rJ   r   r   rK   rK   r   rL   r   _  s$    <r   c                	       sR   e Zd Z		ddeeB dedB deddf fddZd	ej	dej	fd
dZ
  ZS )CLIPMLPNr   r   r   r   rP   c                    sf   t    || _t }t|j| _t|j|j	d|| d|d| _
t|j	|jd|| d|d| _d S )NTz.fc1)r   r   r   r   z.fc2)r   r   r   r:   r   
hidden_actactivation_fnr   r   intermediate_sizefc1r   fc2)rS   r   r   r   r   r   rK   rL   r     s(   
zCLIPMLP.__init__r   c                 C   s*   |  |\}}| |}| |\}}|S rV   )r  r  r	  )rS   r   r   rK   rK   rL   r     s   
zCLIPMLP.forwardr   )rD   rE   rF   r   r   r   r   r   rI   rJ   r   r   rK   rK   r   rL   r    s    r  c                       sf   e Zd Z	ddddeeB dedB dedee ee	 B ddf
 fd	d
Z
dejdejfddZ  ZS )CLIPEncoderLayerNr   r   r   r   r   r   rP   c                   sd   t    t||| d|d| _tj|j|jd| _t	||| dd| _
tj|j|jd| _d S )Nz
.self_attn)r   r   r   epsz.mlpr   r   )r   r   r   	self_attnr   	LayerNormr   layer_norm_epslayer_norm1r  mlplayer_norm2)rS   r   r   r   r   r   rK   rL   r     s   
zCLIPEncoderLayer.__init__r   c                 C   sJ   |}|  |}| j|d\}}|| }|}| |}| |}|| }|S )N)r   )r  r  r  r  )rS   r   residualr   rK   rK   rL   r     s   


zCLIPEncoderLayer.forwardrV   r  rK   rK   r   rL   r
    s     r
  c                       s   e Zd ZdZ		ddddeeB dedB dedB ded	e	e
 e	e B d
df fddZdejded
ejeej B fddZ  ZS )CLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self
    attention layers. Each layer is a [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    Nr   r   r   r   num_hidden_layers_overrider   r   rP   c                   sL   t    | _|d u rj}n|}t fddt|D | _d S )Nc                    s&   g | ]}t  d |  dqS )z.layers.r   r   r   r   )r
  ).0	layer_idxr   r   r   r   rK   rL   
<listcomp>  s    z(CLIPEncoder.__init__.<locals>.<listcomp>)r   r   r   num_hidden_layersr   
ModuleListrangelayers)rS   r   r   r  r   r   r  r   r  rL   r     s   
	
zCLIPEncoder.__init__r   return_all_hidden_statesc                 C   s8   |g}|}| j D ]}||}|r|| q|r|S |S rV   )r  append)rS   r   r   hidden_states_poolr   encoder_layerrK   rK   rL   r     s   

zCLIPEncoder.forwardNN)rD   rE   rF   rG   r   r   r   r_   r   r>   r   r   r   rI   rJ   r   r   r   r   rK   rK   r   rL   r    s4    r  c                
       s   e Zd Z	ddddededB deddf fdd	Zd
ejdejfddZ		dd
ejdB dejdejdB dejfddZ
deeeejf  dee fddZ  ZS )CLIPTextTransformerNr   r   r   r   r   rP   c                   sN   t    || _|j}t|| _t||| dtd| _t	j
||jd| _d S )N.encoderr  r  )r   r   r   r   r   r   r  r   encoderr   r  r  final_layer_norm)rS   r   r   r   r   r   rK   rL   r   3  s   

zCLIPTextTransformer.__init__r   c                 C   s   | j |S rV   )r   r   )rS   r   rK   rK   rL   embed_input_idsM  rn   z#CLIPTextTransformer.embed_input_idsr   r   c                 C   s,   | j |||d}| j|dd}| |}|S )Nr   r   r   Fr   r   )r   r'  r(  )rS   r   r   r   r   last_hidden_staterK   rK   rL   r   P  s   
zCLIPTextTransformer.forwardweightsc                 C   s   g d}t |  }t }|D ]9\}}|D ]\}}}	||vrq|||}|| }
|
j}||
||	  n|| }
t|
dt}||
| || q|S )N)r   q_projq)r   k_projk)r   v_projvweight_loader)r   named_parameterssetreplacer5  getattrr   add)rS   r-  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr5  rK   rK   rL   load_weightsd  s"   
z CLIPTextTransformer.load_weightsrV   )rD   rE   rF   r   r   r   r   rI   rJ   r)  r   r   tupler7  rD  r   rK   rK   r   rL   r%  2  s2    
,r%  c                       s   e Zd Z	ddddddededB dedB dedB ded	df fd
dZe	dd Z
e	dd Zddddejdee dB dedB d	ejfddZdeeeejf  d	ee fddZ  ZS )CLIPVisionTransformerNr   r  require_post_normr   r   r   r  rH  r   rP   c                   s   t    || _|j}t|| _tj||jd| _	t
|||| dtd| _|j}t| jj|jkrAtd| dt| jj d|d u rMt| jj|k}|rZtj||jd| _d S d | _d S )Nr  r&  )r   r   r  r   r   zThe original encoder only has z layers, but you requested z layers.)r   r   r   r   r   r   r   r  r  pre_layrnormr  r   r'  r  lenr  ri   post_layernorm)rS   r   r   r  rH  r   r   r  r   rK   rL   r     s0   
	


zCLIPVisionTransformer.__init__c                 C      t |  jS rV   )next
parametersr   rY   rK   rK   rL   r        zCLIPVisionTransformer.dtypec                 C   rL  rV   )rM  rN  devicerY   rK   rK   rL   rP    rO  zCLIPVisionTransformer.deviceselect_layersfeature_select_strategyr=   rR  rS  c                C   sB   |  |}| |}| j||d ud}t|| j|| jj|d}|S )Nr+  )rR  max_possible_layersrS  )r   rI  r'  r;   rK  r   r  )rS   r=   rR  rS  r   encoder_outputsrK   rK   rL   r     s   

zCLIPVisionTransformer.forwardr-  c                 C   s   g d}t |  }t }t| jj}|D ]W\}}|dr$| jd u r$q|dr7t|	dd }||kr7q|D ]\}	}
}|
|vrCq9|
|
|	}|| }|j}||||  n|| }t|dt}||| || q|S )Nr.  rK  zencoder.layers.rQ   r5  )r   r6  r7  rJ  r'  r  
startswithrK  r_   splitr8  r5  r9  r   r:  )rS   r-  r;  r<  r=  layer_countr>  r?  r  r@  rA  rB  rC  r5  rK   rK   rL   rD    s0   

z"CLIPVisionTransformer.load_weightsrV   )rD   rE   rF   r   r   r_   r   r   r   propertyr   rP  rI   rJ   r   r7   r   r   rE  r7  rD  r   rK   rK   r   rL   rF    sF    ,



,rF  c                       s   e Zd Z	ddddddededB dedB dedB ded	df fd
dZ		dde	j
dee dB dedB d	e	j
fddZedd Zedd Z  ZS )CLIPVisionModelNr   rG  r   r   r  rH  r   rP   c                   s(   t    t||||| dd| _d S )Nz.vision_model)r   r   r  rH  r   )r   r   rF  vision_model)rS   r   r   r  rH  r   r   rK   rL   r     s   
	zCLIPVisionModel.__init__r=   rR  rS  c                 C   s   | j |||dS )NrQ  )r\  )rS   r=   rR  rS  rK   rK   rL   r     s
   zCLIPVisionModel.forwardc                 C   rU   rV   )r\  r   rY   rK   rK   rL   r        zCLIPVisionModel.dtypec                 C   rU   rV   )r\  rP  rY   rK   rK   rL   rP    r]  zCLIPVisionModel.devicerV   r$  )rD   rE   rF   r   r   r_   r   r   r   rI   rJ   r   r7   r   rZ  r   rP  r   rK   rK   r   rL   r[    sD    


r[  re   )rx   )r   dummy_inputsc                       s  e Zd ZdZdg diZededededB fdd	Zd
dde	def fddZ
	d2dejdB dejdejdB dejfddZ	d2dejdedB dejfddZdededB fddZdedejfddZdejdeejgejf d ejdB d!edejf
 fd"d#Z	d2dd$d%dejd&edB d ejdB d!edejf
 fd'd(Zdedefd)d*Z		d3dejdB d+ejd,edB dejdB dedejfd-d.Zd/eeeejf  fd0d1Z  ZS )4CLIPEmbeddingModelTr   )r/  r1  r3  r   irP   Nc                 C   s   | drd S td)Nrr   z Only image modality is supported)rW  ri   )clsr   r`  rK   rK   rL   get_placeholder_str-  s   
z&CLIPEmbeddingModel.get_placeholder_strr   r   vllm_configr   c          	         s.  t    |jj}|j}|jj}|| _|| _|j}|j}|j	| _	|j
| _|j
| _| | t||t|dd| _tj| j| j	dd| _W d    n1 sNw   Y  | |d t||t|dd| _tj| j| j	dd| _W d    n1 szw   Y  |jj}|d usJ || _t|| _d| _d S )N
text_modelr  F)r   rr   r\  T)r   r   rv   	hf_configr   multimodal_configr   text_configrW   projection_dimr   text_embed_dimvision_embed_dim_mark_language_modelr%  r5   rd  r   Lineartext_projection_mark_tower_modelrF  r\  visual_projectionrw   r   for_embeddingpooler_is_text_input)	rS   rc  r   r   r   rf  rg  rW   rw   r   rK   rL   r   4  sP   



zCLIPEmbeddingModel.__init__r   r   r   c                 C   s   | j |||d}| |}|S )Nr*  )rd  rm  )rS   r   r   r   pooled_outputtext_featuresrK   rK   rL   get_text_featurese  s   
z$CLIPEmbeddingModel.get_text_featuresr=   rS  c                 C   s2   |d u r
t | jj}| j|d |d}| |}|S )N)r=   rR  rS  )rj   rw   rx   r\  ro  )rS   r=   rS  rs  image_featuresrK   rK   rL   get_image_featuresu  s   
z%CLIPEmbeddingModel.get_image_featuresrp   c                 K   s:   | dd }|d u rd S | jjj }}td|||ddS )Nr=   )rA   rB   )r>   rC   resolve_bindings)popr   rW   rX   r<   )rS   rp   r=   
expected_h
expected_wrK   rK   rL   _parse_and_validate_image_input  s   z2CLIPEmbeddingModel._parse_and_validate_image_inputinputsc                 C   s   |d }|  |S )NrC   )rw  )rS   r}  r=   rK   rK   rL   _process_image_inputs  s   
z(CLIPEmbeddingModel._process_image_inputsr)  is_multimodalhandle_oov_mm_tokenc                   sl   t  j||||d}| j}|jd |k r+tj|||jd ||jd  gdd}|S |jd |kr4t|S )Nr  r  r0   r   r   )r   _embed_text_input_idsrh  r   rI   r   	new_emptyNotImplementedError)rS   r   r)  r  r  r   inputs_embeds_sizer   rK   rL   r    s*   	z(CLIPEmbeddingModel._embed_text_input_idsFr  multimodal_embeddingsc                   sF   |d u p	t |dk| _|d u s|d u rt |S t j||||dS )Nr   )r  r  r  )rJ  rr  r   r)  )rS   r   r  r  r  r   rK   rL   r)    s   	z"CLIPEmbeddingModel.embed_input_idsc                 K   s*   | j di |}|d u rg S | |}|S )NrK   )r|  r~  )rS   rp   image_inputvision_embeddingsrK   rK   rL   embed_multimodal  s
   
z#CLIPEmbeddingModel.embed_multimodal	positionsintermediate_tensorsc                 K   sd   |d urt d| js|S | j}|jd |kr"|d d d |f }n	|jd |k r+t| |||S )Nz"PP is not supported for this modelr0   )RuntimeErrorrr  ri  r   r  ru  )rS   r   r  r  r   rp   r   rK   rK   rL   r     s   zCLIPEmbeddingModel.forwardr-  c                 C   s   t | dgdgd}||S )Nz.position_idszlogit_scale.)skip_substrsignore_unexpected_prefixes)r4   rD  )rS   r-  loaderrK   rK   rL   rD    s   
zCLIPEmbeddingModel.load_weightsrV   r$  ) rD   rE   rF   is_pooling_modelpacked_modules_mappingclassmethodr   r_   rb  r   r   rI   rJ   ru  r7   rw  r   r<   r|  r~  r   r   r  r1   r)  r  r-   r   r   rE  rD  r   rK   rK   r   rL   r_  "  s    5


&
$r_  )lcollections.abcr   r   r   r   	functoolsr   typingr   r   rI   torch.nnr   transformersr	   r
   r   r   r   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   r   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   !vllm.model_executor.layers.poolerr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   %vllm.model_executor.models.interfacesr   vllm.multimodalr   vllm.multimodal.inputsr   r    r!   r"   r#   vllm.multimodal.parser$   r%   r&   vllm.multimodal.processingr'   r(   r)   r*   r+   r,   vllm.sequencer-   vllm.utils.tensor_schemar.   r/   
interfacesr1   r2   interfaces_baser3   utilsr4   r5   visionr6   r7   r8   r9   r:   r;   r<   rM   rf   r   r   rH   rj   rk   r   r   Moduler   r   r   r  r
  r  r%  rF  r[  register_processorr_  rK   rK   rK   rL   <module>   sn     
	
.V+K&(:Nx*