o
    
۾i                     @   s*  U d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
Z
mZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC d dlDmEZEmFZFmGZGmHZHmIZI d dlJmKZKmLZLmMZMmNZNmOZO d dlPmQZQ d dlRmSZSmTZT ddlUmVZVmWZWmXZXmYZYmZZZ ddl[m\Z\ dd l]m^Z^m_Z_m`Z`maZambZb dd!lcmdZd e(eeZfd"egd#egd$egd%egd&egf
d'd(ZhG d)d* d*eSZiG d+d, d,eSZjeiejB Zkeeld-< G d.d/ d/eSZmG d0d1 d1eSZnemenB Zoeeld2< G d3d4 d4ejpZqd5ejrd6ejrd7ejrd8ejrd9e4d:esejrejrf fd;d<ZtG d=d> d>ejpZuG d?d@ d@ejpZvG dAdB dBejpZwG dCdD dDejpZxG dEdF dFejpZyG dGdH dHejpZzG dIdJ dJejpZ{dKee|ejrf fdLdMZ}G dNdO dOeIZ~G dPdQ dQeMZedRedSZG dTdU dUeKe ZG dVdW dWee ZG dXdY dYeLe ZG dZd[ d[ejpeYZe;jeeed\G d]d^ d^eeYeWeZeXZdS )_    N)abstractmethod)IterableMappingSequence)partial)	AnnotatedAnyLiteral	TypeAliasTypeVar)	rearrange)PretrainedConfig)GELUActivation)BatchFeature)BaseModelOutputBaseModelOutputWithPooling)	torch_int)
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)init_logger)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)ApplyRotaryEmb)default_weight_loadermaybe_remap_kv_scale_name)MultiModelKeys)MULTIMODAL_REGISTRY)	ImageItemModalityDataMultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems	VideoItem)DictEmbeddingItems	ImageSizeModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPP)	SiglipMLP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelis_pp_missing_parametermaybe_prefix)is_vit_use_data_parallelheightwidthfactor
min_pixels
max_pixelsc                 C   s8  | |k rt d| | t|| |  }|} ||k r*t d|| t| | | } |}t| |t| | dkr:tdt| | | }t|| | }|| |krst| | | }t| | | | }t|| | | }||fS || |k rt|| |  }t	| | | | }t	|| | | }||fS )Nz8smart_resize: height=%s < factor=%s, reset height=factorz6smart_resize: width=%s < factor=%s, reset width=factor   z]absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)})
loggerwarningroundmaxmin
ValueErrormathsqrtfloorceil)rC   rD   rE   rF   rG   h_barw_barbeta rV   S/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/keye.pysmart_resizeX   s@   rX   c                	   @   X   e Zd ZU dZed ed< eeje	dddddhdf ed< eeje	ddf ed	< d
S )KeyeImagePixelInputs
    Dimensions:
        - bnp: Batch size * Number of patches
        - c: Number of channels
        - ps: Patch size
        - ni: Number of images
        - g: Grid dimensions (3 for t, h, w)
    pixel_valuestypebnp   psdynamic_dimsniimage_grid_thwN
__name__
__module____qualname____doc__r	   __annotations__r   torchTensorr5   rV   rV   rV   rW   rZ         
 	rZ   c                   @   N   e Zd ZU dZed ed< eeje	ddf ed< eeje	ddf ed< d	S )
KeyeImageEmbeddingInputsz
    Dimensions:
        - nf: Number of image features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
        - ni: Number of images
        - g: Grid dimensions (3 for t, h, w)
    image_embedsr]   nfhsrc   r_   rd   Nre   rV   rV   rV   rW   ro      
   
 	ro   KeyeImageInputsc                	   @   rY   )KeyeVideoPixelInputsr[   pixel_values_videosr]   r^   r_   r`   ra   nvvideo_grid_thwNre   rV   rV   rV   rW   ru      rm   ru   c                   @   rn   )
KeyeVideoEmbeddingInputsz
    Dimensions:
        - nf: Number of video features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
        - nv: Number of videos
        - g: Grid dimensions (3 for t, h, w)
    video_embedsr]   rq   rr   rw   r_   rx   Nre   rV   rV   rV   rW   ry      rs   ry   KeyeVideoInputsc                       s   e Zd Zdef fddZ	ddejdededed	ejf
d
dZ	ddefddZ
			ddejdejdB deeeeef eeeeef  B  dB d	ejfddZ  ZS )KeyeVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _t|j| j| j| jdd| _	| j| j d | _
| j
| _t | _t | _t| j| j| _td| j| _| jdt| jddd d S )	Nvalid)in_channelsout_channelskernel_sizestridepadding   i   position_ids)r6   F
persistent)super__init__r}   hidden_size	embed_dim
image_size
patch_sizer   num_channelspatch_embeddingnum_patchesnum_positionsdictcache_position_embeddingcache_position_countnn	Embeddingposition_embeddingpacking_position_embeddingregister_bufferrk   arangeexpand)selfr}   	__class__rV   rW   r      s.   

zKeyeVisionEmbeddings.__init__F
embeddingsrC   rD   is_after_patchifyreturnc                 C   s   | j jjd }| j jd}|jd }|r|}|}	n
|| j }|| j }	t|d }
|d|
|
|}|dddd}tj	j
|||	fddd	}|dddddd|}|S )
Nr   r   g      ?r6   r_   r   bilinearF)sizemodealign_corners)r   weightshape	unsqueezer   r   reshapepermuter   
functionalinterpolateview)r   r   rC   rD   r   r   patch_pos_embeddim
new_height	new_widthsqrt_num_positionsrV   rV   rW   interpolate_pos_encoding   s*   


z-KeyeVisionEmbeddings.interpolate_pos_encoding   	max_cachec                 C   s   ||f}|| j v r| j|  d7  < | j | S t| j |kr3t| j| jjd}| j| | j | | |||d}d| j|< || j |< |S )Nr6   )keyT)r   r   lenrM   getpopr   )r   r   hwr   gridmin_hit_gridr   rV   rV   rW   "fetch_position_embedding_lfu_cache
  s   



z7KeyeVisionEmbeddings.fetch_position_embedding_lfu_cacheNr\   r   rd   c                 C   s8  |  dkr|d}|  dkr|d u rtd|j\}}}}}	| jjj}
t|d}| |j|
d}|	d
d}|r|d urd}t }|D ]3}|\}}}||| |  }|||d d f }| |||d	
d|d
}|| }|| |}qItj|ddd}|S || | }|S td|   d)N   r      z9position_ids cannot be None when pixel_values.dim() is 5.zb l c h w -> (b l) c h wdtyper   Tr6   r   z$Unsupported pixel_values dimension: z. Expected 4 or 5.)r   r   rN   r   r   r   r   r   toflattensqueezelistr   repeatappendrk   concatr   )r   r\   r   rd   r   
batch_sizesquence_lenchannelrC   rD   target_dtypepatch_embedsr   starttmp_embeddings
image_gridtr   r   endimage_embeddingsr   rV   rV   rW   forward  sR   
	



zKeyeVisionEmbeddings.forward)F)r   )NNF)rf   rg   rh   r   r   rk   rl   intboolr   r   FloatTensorr   tupler   __classcell__rV   rV   r   rW   r|      s:    !
$"r|   qkcossinapply_rotary_embr   c                 C   sL   |j dddd  }|j dddd  }|| ||}||||}||fS )Nr   r   r   r   )chunk
contiguous)r   r   r   r   r   q_embedk_embedrV   rV   rW   apply_rotary_pos_emb_flashattR  s
   r   c                       s   e Zd ZdZ		ddededB def fddZ					dd
ej	dej	dB de
dB deej	 dB deej	ej	f dB dej	fddZ  ZS )KeyeSiglipAttentionzBMulti-headed attention from 'Attention Is All You
    Need' paper.N r}   quant_configprefixc              	      sH  t    || _|j}|j| _t }|rdnt }|j| _| j| dks&J | j| | _|j| _	| j	|kr?| j	| dks>J n	|| j	 dksHJ t
d| j	| | _|j| j | _| j| j | _| j| j | _| jd | _t|| j| j| j	d|| dd| _t|||| dd| _t| j| j| j| j| d	d
| _tddd| _d S )Nr6   r   g      Tz	.qkv_projbiasr   r   z	.out_proj)
input_sizeoutput_sizer   r   z.attn)	num_heads	head_sizescalenum_kv_headsr   )enforce_enableenable_fp32_compute)r   r   r}   r   rB   r   num_attention_headstotal_num_headsr   total_num_kv_headsrL   r   head_dimq_sizekv_sizer   r   qkv_projr   out_projr   attnr   r   )r   r}   r   r   r   use_data_paralleltp_sizer   rV   rW   r   f  sX   

	zKeyeSiglipAttention.__init__Fhidden_statesattention_maskoutput_attentions
cu_seqlensrope_embr   c                 C   s  |  |\}}|j| j| j| jgdd\}}	}
|dd  |d d   }|d u rc|jg |jd d | j| jR  }|	jg |	jd d | j	| jR  }	|
jg |
jd d | j	| jR  }
nP|d u rkt
d|\}}|jg |jd d | j| jR  }|	jg |	jd d | j	| jR  }	t||	||| j\}}	|
jg |
jd d | j	| jR  }
| j||	|
||d}t|d}| |\}}|S )Nr   r   r6   z4cu_seqlens cannot be None when rope_emb is not None.)queryr   valuer  
max_seqlenzb s h d -> b s (h d))r   splitr   r   rL   r   r   r   r   r   rN   r   r   r   r   r   )r   r  r  r  r  r  qkv_r   r   vr  r   r   context_layeroutputrV   rV   rW   r     sf   &
&
zKeyeSiglipAttention.forwardNr   )NFNN)rf   rg   rh   ri   r   r   strr   rk   rl   r   r   r   r   r   rV   rV   r   rW   r   b  s8    ;r   c                       sH   e Zd Zddededdf fddZdd	 Zd
edejfddZ	  Z
S )SigLIPRotaryEmbedding     @r   thetar   Nc                    s"   t    || _|| _|   d S N)r   r   r   r  	rope_init)r   r   r  r   rV   rW   r     s   
zSigLIPRotaryEmbedding.__init__c                 C   s:   d| j tjd| jdtjd| j   }| jd|dd d S )Ng      ?r   r   r   inv_freqFr   )r  rk   r   r   floatr   )r   r  rV   rV   rW   r    s    zSigLIPRotaryEmbedding.rope_initseqlenc                 C   s*   t j|| jj| jjd}t || j}|S )N)devicer   )rk   r   r  r  r   outer)r   r  seqfreqsrV   rV   rW   r     s   zSigLIPRotaryEmbedding.forward)r  )rf   rg   rh   r   r  r   r  rk   rl   r   r   rV   rV   r   rW   r    s    r  c                       s   e Zd Z		ddededB def fddZ			dd	ejd
ejde	dB de
ej dB deejejf dB deej fddZ  ZS )KeyeSiglipEncoderLayerNr   r}   r   r   c                    sj   t    |j| _tj| j|jd| _t||| dd| _	tj| j|jd| _
t||| dd| _d S )Nepsz
.self_attnr   r   z.mlp)r   r   r   r   r   	LayerNormlayer_norm_epslayer_norm1r   	self_attnlayer_norm2r<   mlpr   r}   r   r   r   rV   rW   r     s   
zKeyeSiglipEncoderLayer.__init__Fr  r  r  r  r  r   c                 C   sN   |}|  |}| j|||||d}|| }|}| |}| |}|| }|S )N)r  r  r  r  r  )r#  r$  r%  r&  )r   r  r  r  r  r  residualrV   rV   rW   r     s   


zKeyeSiglipEncoderLayer.forwardr  )FNN)rf   rg   rh   r   r   r  r   rk   rl   r   r   r   r   r   r   rV   rV   r   rW   r    s4    r  c                       s   e Zd Z		ddededB def fddZedd	 Z								
		dde	j
dB dedB dedB dee	j
 dB deeeeef eeeeef  B  dB de	j
dB de	j
dB dedB dedB dedefddZ  ZS )KeyeSiglipEncoderNr   r}   r   r   c                    sZ   t     | _ j} j}|| }t fddt jD | _	t
|d | _d S )Nc                    s$   g | ]}t   d | dqS )z.layers.r   )r  ).0	layer_idxr}   r   r   rV   rW   
<listcomp>0  s    z.KeyeSiglipEncoder.__init__.<locals>.<listcomp>r   )r   r   r}   r   r   r   
ModuleListrangenum_hidden_layerslayersr  rotary_pos_emb)r   r}   r   r   r   r   r   r   r,  rW   r   $  s   

zKeyeSiglipEncoder.__init__c                 C   s4   t  }| D ]}t|t r|| q|| q|S r  )r   
isinstanceextendr   )rd   tmp_image_grid_thwr   rV   rV   rW   flatten_list;  s   
zKeyeSiglipEncoder.flatten_listFr   visionr  r  output_hidden_statesr  rd   height_position_idswidth_position_idsuse_ropewindow_sizevision_or_textr   c                 C   s4  |j }|}|	du r}| |}|d u s|d u rSt }t }|D ]&\}}}tj|| | |d||  }|| }|| }|| || qtj|dd}tj|dd}tj||gdd}| d }| 	|}|| 
d}|dd}| | f}nd }|}|}|d u sJ | jD ]}||||||d}q|S )	NT)r  r   r   r   r6   r   )r  r  r  )r  r6  r   rk   r   r   r   stackrL   r2  r   r   r   r   r1  )r   inputs_embedsr  r  r8  r  rd   r9  r:  r;  r<  r=  r  r  flatten_image_grid_thw
split_hids
split_widsr   r   r   
image_pidssample_hidssample_widspidsmax_grid_sizerope_emb_max_gridr  attn_cu_seqlensencoder_layerrV   rV   rW   r   E  sJ   



zKeyeSiglipEncoder.forwardr  )
NNNNNNNFr   r7  )rf   rg   rh   r   r   r  r   staticmethodr6  rk   rl   r   r   r   r   r   r   r   rV   rV   r   rW   r)  #  s^    
"	
r)  c                #       s  e Zd Z		ddededB def fddZ																	
ddedB dedB dedB dej	dB dej	dB dej	dB dej	dB dej	dB dej	dB de
ej	 dB dej	dB dedB de
eeeef e
eeeef  B  dB dedB dedB dedB def"ddZ  ZS ) KeyeSiglipVisionTransformerNr   r}   r   r   c                    sL   t    || _|j}t|| _t||| dd| _tj	||j
d| _d S )Nz.encoderr   r  )r   r   r}   r   r|   r   r)  encoderr   r!  r"  post_layernorm)r   r}   r   r   r   r   rV   rW   r     s   

z$KeyeSiglipVisionTransformer.__init__FTr   r  r8  r   r  sample_indicesimage_indicesr   r9  r:  r  padding_maskvision_return_embed_listrd   return_pooler_outputr;  r<  r   c                 C   s   | j ||||d}| j||||||||	|
|dd}| |}t }|d u r)tdt|jd d D ]!}|| }||d  }|d d ||d d f d}|| q2|S )N)r   r   rd   r7  )r?  r  r8  r  r  rd   r;  r9  r:  r<  r=  zHcu_seqlens cannot be None for SiglipVisionTransformer output processing.r   r6   )	r   rM  rN  r   rN   r/  r   r   r   )r   r\   r  r8  r   r  rO  rP  r   r9  r:  r  rQ  rR  rd   rS  r;  r<  r  last_hidden_statesample_hidden_stateir   r   tensorrV   rV   rW   r     s>   
 z#KeyeSiglipVisionTransformer.forwardr  )NNFNNNNNNNNFNTFr   )rf   rg   rh   r   r   r  r   r   rk   rl   r   r   r   r   r   r   rV   rV   r   rW   rL    s~    	
"rL  c                       s<  e Zd ZeZdZ		d#dededB def fddZe	d	e
jfd
dZe	d	e
jfddZd	ejfddZ											d$de
jdB dedB dedB dede
jdB dedB deeeeef eeeeef  B  dB dee
j dB dedB dedB dedB d	efddZd eeee
jf  d	ee fd!d"Z  ZS )%KeyeSiglipVisionModelr\   Nr   r}   r   r   c                    s*   t    t||| dd| _|| _d S )Nz.vision_modelr   )r   r   rL  vision_modelr   r'  r   rV   rW   r     s   

zKeyeSiglipVisionModel.__init__r   c                 C      | j jjjjS r  )rY  r   r   r   r   r   rV   rV   rW   r        zKeyeSiglipVisionModel.dtypec                 C   rZ  r  )rY  r   r   r   r  r[  rV   rV   rW   r    r\  zKeyeSiglipVisionModel.devicec                 C   s
   | j jjS r  )rY  r   r   r[  rV   rV   rW   get_input_embeddings  s   
z*KeyeSiglipVisionModel.get_input_embeddingsFTr   rO  r  r8  r   r   rR  rd   r  rS  r;  r<  c                 C   s"   | j |||||||||	|
||dS )N)r\   r  r8  r   r   rR  rd   rO  r  rS  r;  r<  )rY  )r   r\   rO  r  r8  r   r   rR  rd   r  rS  r;  r<  rV   rV   rW   r     s   zKeyeSiglipVisionModel.forwardweightsc                 C   sx  g d}t | jdd}t }|D ]\}}d|v rqd|v s"d|v r#qd|v s+d|v r,q| jd urZ| j| }rZ|| }t|d	t}	| d
krK|n|d
 }|	|| || q|D ].\}
}}||vrfq\|	||
}|
drv||vrvq\t|| r|q\|| }|j}	|	|||  n)|
dr||vrqt||}|d u rqt|| rq|| }t|d	t}	|	|| || q|S )N))r   q_projr   )r   k_projr   )r   v_projr  F)remove_duplicatezrotary_emb.inv_freqzhead.attentionzhead.layernormzhead.mlpz
head.probeweight_loaderr   z.bias)r   named_parameterssetr   get_cache_scalegetattrr   r   addreplaceendswithr@   rc  r   )r   r^  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamrc  
param_nameweight_nameshard_idrV   rV   rW   load_weights  sp   






z"KeyeSiglipVisionModel.load_weightsr  )NNNFNFNNTFr   )rf   rg   rh   r   config_classmain_input_namer   r  r   propertyrk   r   r  r   Moduler]  rl   r   r   r   r   r   r   r   re  ru  r   rV   rV   r   rW   rX    sp    "	
,rX  c                	       st   e Zd Z		ddedededB def fddZd	eje	ej B d
e	e
eeef  deje	ej B fddZ  ZS )	ProjectorNr   text_configvision_configr   r   c                    s   t    || _|| _d| _| jj| jd  | jd  | _tjj| jjdd| _	t
 | _t| j| jd|| dd| _t| j| jjd|| d	d| _d S )
N)r   r   r   r6   gh㈵>r  Tz	.linear_1r   z	.linear_2)r   r   r{  r|  merge_kernel_sizer   rk   r   r!  pre_normr   actr   linear_1r   linear_2r   r{  r|  r   r   r   rV   rW   r   M  s4   
zProjector.__init__image_featuresrd   r   c              
   C   s  | j \}}t|ttfrLt }t||D ]5\}}| |}|\}}	}
t|d||	| ||
| |d}| |\}}| |}| 	|\}}|
| q|S |jd d }|jd }|t||}| |d| j}| |}| |}| 	|}|jg |dR  S )Nz$(t h p1 w p2) d -> (t h w) (p1 p2 d))r   r   p1r   p2r   )r}  r3  r   r   zipr~  r   r  r  r  r   r   r   npprodr   )r   r  rd   m1m2processed_featuresimage_featurer   r   r   r   r  r  dimsr   rV   rV   rW   r   q  s8   


	




zProjector.forwardr  )rf   rg   rh   r   r   r  r   rk   rl   r   r   r   r   r   rV   rV   r   rW   rz  L  s&    $rz  	hf_inputsc              	   C   sx   |  dtd}|d}|  dtd}|d}ttd|td|tdtd|td|tddS )Nrd   )r   r_   r   rx   imagevideo)r\   rp   rd   rv   rz   rx   )r   rk   emptyr  r   r&   flat_from_sizesbatched)r  rd   image_grid_sizesrx   video_grid_sizesrV   rV   rW   _keye_field_config  s   




r  c                       sx   e Zd Zdeeejf ee B de	e
e
f dB f fddZdeeejf ee B de	e
e
f dB f fddZ  ZS )KeyeMultiModalDataParserdatar   Nc                    *   t |trt|dddhtdS t |S )Nr  rp   rd   modalityrequired_fieldsfields_factory)r3  r   r)   r  r   _parse_image_datar   r  r   rV   rW   r       

z*KeyeMultiModalDataParser._parse_image_datac                    r  )Nr  rz   rx   r  )r3  r   r)   r  r   _parse_video_datar  r   rV   rW   r    r  z*KeyeMultiModalDataParser._parse_video_data)rf   rg   rh   r   r  rk   rl   r#   r"   r+   r   r  r(   r  r   rV   rV   r   rW   r    s    r  c                   @   s.  e Zd ZdefddZdefddZdefddZd	d
 Zde	e
edB f fddZdede	e
ef de	e
ef fddZddddededededeeef f
ddZdededefddZdedededefddZdefdd Zdefd!d"Zd#edefd$d%Zdedefd&d'Zdedefd(d)ZdS )*KeyeProcessingInfor   c                 C      dS )Ni rV   r[  rV   rV   rW   get_max_image_size     z%KeyeProcessingInfo.get_max_image_sizec                 C   r  )N   rV   r[  rV   rV   rW   get_max_frame_per_video  r  z*KeyeProcessingInfo.get_max_frame_per_videokwargsc                 K   s   | j di |jS )NrV   )get_hf_processorimage_processor)r   r  rV   rV   rW   get_image_processor  s   z&KeyeProcessingInfo.get_image_processorc                 C   s   t |  dS )N)expected_hidden_size)r  _get_expected_hidden_sizer[  rV   rV   rW   get_data_parser  s   z"KeyeProcessingInfo.get_data_parserNc                 C   s
   d d dS Nr  r  rV   r[  rV   rV   rW   get_supported_mm_limits  s   
z*KeyeProcessingInfo.get_supported_mm_limitsseq_len	mm_countsc                 C   s   |   | |dS r  )get_max_image_tokensget_max_video_tokens)r   r  r  rV   rV   rW   get_mm_max_tokens_per_item  s   z-KeyeProcessingInfo.get_mm_max_tokens_per_itemr6   T)
num_frames	do_resizeimage_widthimage_heightr  r  c                C   s   |d u r|   }|  }|j}|j}|j}	d}
|r/t||||	 |j|jd\}}t||d}nt||d}|||
  }t	||
 d}|j
| }|j| }|| | }||	d  }||fS )Nr6   )rC   rD   rE   rF   rG   )rD   rC   r   )r  get_hf_configr|  r   spatial_merge_sizerX   rF   rG   r*   rL   rC   rD   )r   r  r  r  r  r  	hf_configr|  r   
merge_sizetemporal_patch_sizeresized_heightresized_widthpreprocessed_sizepadded_num_framesgrid_tgrid_hgrid_wr   num_vision_tokensrV   rV   rW   _get_vision_info  s0   	


z#KeyeProcessingInfo._get_vision_infoc                C   s   | j |||d\}}|S N)r  r  r  r  )r   r  r  r  r  num_image_tokensrV   rV   rW   get_num_image_tokens  s   
z'KeyeProcessingInfo.get_num_image_tokensc                C   s   | j ||||d\}}|S Nr  r  r  r  r  )r   r  r  r  r  r  num_video_tokensrV   rV   rW   get_num_video_tokens&  s   
z'KeyeProcessingInfo.get_num_video_tokensc                 C   s    | j |  |  d d\}}|S r  )r  r  )r   max_image_sizer  rV   rV   rW   !get_image_size_with_most_features6  s   
z4KeyeProcessingInfo.get_image_size_with_most_featuresc                 C   s   |   \}}| j||d dS r  )r  r  )r   target_widthtarget_heightrV   rV   rW   r  @  s   z'KeyeProcessingInfo.get_max_image_tokens
max_tokensc                 C   s@   |   \}}d}	 |d }| j|||d d}||kr	 |S |}q	)Nr   Tr6   r  )r  r  )r   r  r  r  r  next_num_framesnext_max_tokensrV   rV   rW   _get_max_video_framesI  s   z(KeyeProcessingInfo._get_max_video_framesc                 C   sZ   | j  }|d}|d}|  | }| || }t|t|d |  }t|dS )Nr  r  r6   )ctxget_mm_configget_limit_per_promptr  r  rM   rL   r  )r   r  	mm_config
max_images
max_videosmax_image_tokensmax_total_framesmax_frames_per_videorV   rV   rW   !get_num_frames_with_most_features^  s   



z4KeyeProcessingInfo.get_num_frames_with_most_featuresc                 C   s$   |   \}}| j||| |d dS r  )r  r  r  )r   r  r  r  rV   rV   rW   r  l  s   z'KeyeProcessingInfo.get_max_video_tokens)rf   rg   rh   r   r  r  objectr  r  r   r  r  r  r   r   r*   r  r  r  r  r  r  r  r  rV   rV   rV   rW   r    sf    





)



	r  _I)boundc                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )KeyeBaseDummyInputsBuilderr  r   c                 C   s>   | dd}| dd}| j }|j}|j}|| ||  S )Nr  r   r  )r   infor  image_tokenvideo_token)r   r  
num_images
num_videoshf_processorr  r  rV   rV   rW   get_dummy_text{  s   
z)KeyeBaseDummyInputsBuilder.get_dummy_textNr  
mm_optionsc                 C   s   | dd}| dd}| j \}}| j|}|r | dnd }	|r)| dnd }
| j||||	d| j|||||
dd}|S )Nr  r   r  )rD   rC   r  	overrides)rD   rC   r  r  r  r  )r   r  r  r  _get_dummy_images_get_dummy_videos)r   r  r  r  r  r  r  r  target_num_framesimage_overridesvideo_overridesmm_datarV   rV   rW   get_dummy_mm_data  s*   z,KeyeBaseDummyInputsBuilder.get_dummy_mm_datar  )
rf   rg   rh   r   r  r   r  r   r$   r  rV   rV   rV   rW   r  z  s    
r  c                   @   s   e Zd ZdS )KeyeDummyInputsBuilderN)rf   rg   rh   rV   rV   rV   rW   r    s    r  c                	   @   sX   e Zd Zdedeeef dedee	 fddZ
dedeeef deeef fdd	Zd
S )KeyeMultiModalProcessormm_itemshf_processor_mm_kwargsout_mm_kwargsr   c                    s   | j jd	i |}| j jd	i |}| j  }| }||j ||j d|jd dtdt	ffdd  fdddD S )
Nr  r   item_idxr  c                    sJ   | |  }|| d j }t|tjsJ t|   }| g| S )N	_grid_thw)r  r3  rk   rl   r   r  )r  r  out_itemgrid_thw
num_tokens)merge_lengthr  placeholderrV   rW   get_replacement_keye  s
   zIKeyeMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_keyec              	      s(   g | ]}t || gt |d dqS ))r  )r  targetreplacement)r1   r   )r*  r  )r  r  rV   rW   r-    s    
z?KeyeMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>rV   )
r  r  r  get_tokenizer	get_vocabr  r  r  r   r  )r   r  r  r  r  r  	tokenizervocabrV   )r  r  r  r  rW   _get_prompt_updates  s   

z+KeyeMultiModalProcessor._get_prompt_updatesr  c                 C   s   t |S r  )r  )r   r  r  rV   rV   rW   _get_mm_fields_config  s   z-KeyeMultiModalProcessor._get_mm_fields_configN)rf   rg   rh   r,   r   r  r   r'   r   r2   r
  r   r  r&   r  rV   rV   rV   rW   r    s"    

#

r  c                       s  e Zd Zg dddgdZeddddZed	ed
ededB fddZ	ddde
def fddZe		d7dedededB dedejf
ddZdedeejdf fddZ	d8ded d eej d!ejdB dejeej B fd"d#Zd$edefd%d&Zd$ededB fd'd(Z		d9d)ejdB d*ejd+edB d,ejdB d$edejeB fd-d.Z d/ejdejdB fd0d1Z!d2e"eeejf  de#e fd3d4Z$de%fd5d6Z&  Z'S ):BaseKeyeModule)r_  r`  ra  	gate_projup_proj)r   gate_up_projzlanguage_model.lm_head.zlanguage_model.model.)zlm_head.zmodel.)orig_to_new_prefixr  rV  r   Nc                 C   s$   | drdS | drdS td)Nr  z+<|vision_start|><|image_pad|><|vision_end|>r  z+<|vision_start|><|video_pad|><|vision_end|>z)Only image or video modality is supported)
startswithrN   )clsr  rV  rV   rV   rW   get_placeholder_str  s
   

z"BaseKeyeModule.get_placeholder_strr   )r   vllm_configr   c                   s   t    |jj}|j}|| _| |ddh" t|j|t	|dd| _
| j||j|t	|dd| _W d    n1 s<w   Y  | | t|t	|ddgd| _W d    n1 s]w   Y  | jj| _d S )	Nr  r  visualr   mlp_ARlanguage_modelQwen3ForCausalLM)r  r   architectures)r   r   model_configr  r   r}   _mark_tower_modelrX  r|  rA   r  _build_projectorr  _mark_language_modelr?   r  make_empty_intermediate_tensors)r   r  r   r}   r   r   rV   rW   r     s4   


zBaseKeyeModule.__init__r{  r|  r   c                 C   s   t d)NzNeed projector)NotImplementedErrorr  rV   rV   rW   r    s   zBaseKeyeModule._build_projectorimage_input.c                 C   sX  t  }t  }t  }dg}|d }|jdksJ t|D ]C\}}t|    }	t	|	}
|
|	 t|
t	|	dd   }|
| |
tj|
f|tjd |
|d |
  q|d dkritd	|d
 | jj}tj|dd|j}tj|tjd|j}tj|dd|j}| j|||dd||ddd	}t| ||}|S )Nr   rd   r   r6   r   r   r]   rp   z<Image embeddings are not supported for this processing path.r\   r   FT	r\   rd   r   rR  r   rO  r  r;  r<  r   ndim	enumerater   detachcpunumpytolistr  r  r   rk   r   fullint64rN   r]   r  r   r   r   r  rW  int32r  )r   r   siglip_position_idsimage_grid_hwsrO  r  rd   idxthaw	thw_tuplenumelimage_position_idsr\   rp   rV   rV   rW   _process_image_input  sN   


z#BaseKeyeModule._process_image_input
video_type)rz   rv   rx   rv   c                 C   sD  t  }t  }t  }dg}|jdksJ t|D ]C\}}	t|	    }
t	|
}|
|
 t|t	|
dd   }|
| |
tj|f|tjd |
|d |  q|dkrctd|| jj}tj|dd|j}tj|tjd|j}tj|dd|j}| j|||d	d	||d	dd
	}| ||}|S )Nr   r   r6   r   r   rz   z<Video embeddings are not supported for this processing path.r   Tr!  r"  )r   r4  rx   rv   r,  video_grid_hwsrO  r  r.  sub_thwr0  r1  video_position_idsrz   rV   rV   rW   _process_video_embedsK  sP   


z$BaseKeyeModule._process_video_embedsr  c                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)r\   rp   images)rv   rz   videosrV   )_parse_and_validate_image_input_parse_and_validate_video_input)r   r  
modalities	input_keyrV   rV   rW   %_parse_and_validate_multimodal_inputs  s   z4BaseKeyeModule._parse_and_validate_multimodal_inputsc           	      K   sv   | j di |}|sd S d}|D ](}|dkr%|d }| |}|t|7 }|dkr8|d }| |}|t|7 }q|S )NrV   r9  r:  )r?  r3  r   _process_video_input)	r   r  r=  multimodal_embeddingsr  r   r   video_inputvideo_embeddingsrV   rV   rW   embed_multimodal  s   

zBaseKeyeModule.embed_multimodal	input_ids	positionsintermediate_tensorsr?  c                 K   s$   |durd}| j j||||d}|S )aU  Run forward pass for Keye-VL.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Flattened (concatenated) position ids corresponding to a
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
                otherwise it will be `(seq_len,)`.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.
        N)rE  rF  rG  r?  )r  model)r   rE  rF  rG  r?  r  r  rV   rV   rW   r     s   zBaseKeyeModule.forwardr  c                 C   s   | j |S r  )r  compute_logits)r   r  rV   rV   rW   rI    s   zBaseKeyeModule.compute_logitsr^  c                 C   s   t | }|j|| jdS )N)mapper)r=   ru  hf_to_vllm_mapper)r   r^  loaderrV   rV   rW   ru    s   zBaseKeyeModule.load_weightsc                 C   s   t jddddS )z+Get the module prefix in multimodal models.r  zmlp_AR.zvisual.)r  	connectortower_model)r    from_string_fieldr[  rV   rV   rW   get_mm_mapping  s
   zBaseKeyeModule.get_mm_mappingr  r  )NN)(rf   rg   rh   packed_modules_mappingr>   rK  classmethodr  r   r  r   r   r   r   r   r   ry  r  r   r   rk   rl   r3  r	   r   r8  r  r   r?  r7   rD  r3   r   rI  r   re  ru  r    rP  r   rV   rV   r   rW   r    s~    	2
4
!
$r  )r  dummy_inputsc                   @   s   e Zd Z		ddedededB dedejf
dd	Zd
e	de
dB fddZd
e	dedB fddZdedeejdf fddZdee dee deejef fddZdS )KeyeForConditionalGenerationNr   r{  r|  r   r   r   c                 C   s   t ||||S r  )rz  r  rV   rV   rW   r    s   z-KeyeForConditionalGeneration._build_projectorr  c                 K   h   | dd }| dd }| dd }|d u r|d u rd S |d ur'td||dS |d ur2td||dS d S )Nr\   rp   rd   )r]   r\   rd   )r]   rp   rd   )r   rZ   ro   )r   r  r\   rp   rd   rV   rV   rW   r;    $   z<KeyeForConditionalGeneration._parse_and_validate_image_inputc                 K   rU  )Nrv   rz   rx   )r]   rv   rx   )r]   rz   rx   )r   ru   ry   )r   r  rv   rz   rx   rV   rV   rW   r<    rV  z<KeyeForConditionalGeneration._parse_and_validate_video_inputrB  .c                 C   s.   |d }|d }| dd }t| |||S )Nr]   rx   rv   )r   r   r8  )r   rB  r4  rx   rv   rV   rV   rW   r@    s   z1KeyeForConditionalGeneration._process_video_inputinput_tokensmm_featuresc           $   	   C   sD  t |ddh}dd |dg D }dd |dg D }t|tr-t|dkr-|d }dtjtt B dttt  fd	d
}||}| j	}|j
}|j}	|jj}
t|}t|}g }d}||}}d\}}t|| D ]}|dkrz|||}W n ty   t|d }Y nw t|d }|dkrz||	|}W n ty   t|d }Y nw t|d }||k r|| \}}}|d7 }|d8 }|}n|| \}}}|d7 }|d8 }|}|||
 ||
 }}}|| }t|dkr|d  d nd}|t|dddd|  t|ddd||   }t|ddd|d| } t|ddd||d }!|t|| |!g| |  ||| |  }qk|t|k rt|dkrm|d  d nd}t|| }|t|dddd|  tj|dddd}"|" d t|  }#|"|#fS )Nrd   rx   c                 S      g | ]}|  qS rV   r(  r*  itemrV   rV   rW   r-  +      zJKeyeForConditionalGeneration.get_mrope_input_positions.<locals>.<listcomp>c                 S   rY  rV   rZ  r[  rV   rV   rW   r-  ,  r]  r   r  r   c                 S   s   t | trtj| tjd} |  dkrg S | dddf | ddddf }}t|ddddf }tj||gddj|dd}|	 S )a  
            Split grid_thw along the t dimension.

            Args:
                grid_thw: shape [N, 3] tensor or nested list of [t, h, w].

            Returns:
                List of [1, h, w] rows, repeated t times for each original row.
            r   r   Nr6   r   )
r3  r   rk   rW  longr1  	ones_likecatrepeat_interleaver(  )r  r   hwonesoutrV   rV   rW   	split_thw1  s   
&zIKeyeForConditionalGeneration.get_mrope_input_positions.<locals>.split_thw)r   r   r6   r   r_   r   )r%   gather_kwargsr   r3  r   r   rk   rl   r   r}   image_token_idvideo_token_idr|  r  r/  indexrN   rL   r   r   r   r   r^  r   r>  r`  r   r\  )$r   rW  rX  r  rd   rx   re  r  rg  rh  r  
image_nums
frame_numsllm_pos_ids_liststremain_imagesremain_framesimage_indexvideo_indexr  ed_imageed_videor   r   r   ed
llm_grid_t
llm_grid_h
llm_grid_wtext_lenst_idxt_indexh_indexw_indexllm_positionsmrope_position_deltarV   rV   rW   get_mrope_input_positions"  s   $

 



"z6KeyeForConditionalGeneration.get_mrope_input_positionsr  )rf   rg   rh   r   r   r  r   ry  r  r  rt   r;  r{   r<  r   rk   rl   r@  r   r   r%   r  rV   rV   rV   rW   rT    sH    
	


rT  )rO   abcr   collections.abcr   r   r   	functoolsr   typingr   r   r	   r
   r   r'  r  rk   torch.nnr   einopsr   transformersr   transformers.activationsr   %transformers.feature_extraction_utilsr   transformers.modeling_outputsr   r   transformers.utilsr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   vllm.loggerr   $vllm.model_executor.layers.attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   2vllm.model_executor.layers.rotary_embedding.commonr   -vllm.model_executor.model_loader.weight_utilsr   r   )vllm.model_executor.models.module_mappingr    vllm.multimodalr!   vllm.multimodal.inputsr"   r#   r$   r%   r&   r'   r(   vllm.multimodal.parser)   r*   r+   r,   r-   vllm.multimodal.processingr.   r/   r0   r1   r2   vllm.sequencer3   vllm.utils.tensor_schemar4   r5   
interfacesr7   r8   r9   r:   r;   siglipr<   utilsr=   r>   r?   r@   rA   r7  rB   rf   rI   r   rX   rZ   ro   rt   rj   ru   ry   r{   ry  r|   rl   r   r   r   r  r  r)  rL  rX  rz  r  r  r  r  r  r  r  r  r  register_processorrT  rV   rV   rV   rW   <module>   s   
$	
+ 

v4]N~L
$ '-,  


