o
    
۾i<                    @   s  d Z ddlmZmZmZmZmZ ddlmZm	Z	 ddl
mZ ddlmZ ddlZddlZddlmZ ddlm  mZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddl m!Z!m"Z" dd	l#mZ$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZHmIZImJZJmKZK ddlLmMZMmNZNmOZOmPZPmQZQmRZRmSZS ddlTmUZUmVZV ddlWmXZXmYZYmZZZm[Z[m\Z\ dd l]m^Z^ dd!l_m`Z` dd"lambZb dd#lcmdZd d$d%lemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZm d$d&lnmoZompZpmqZqmrZrmsZsmtZtmuZu d$d'lvmwZwmxZxmyZy d$d(lzm{Z{m|Z| d$d)l}m~Z~mZmZmZmZ d$d*lmZmZmZ e1eZd+ZG d,d- d-ejZG d.d/ d/ejZG d0d1 d1ejZG d2d3 d3ejZG d4d5 d5ejZG d6d7 d7exZG d8d9 d9eXe ZG d:d; d;eYe Ze(dd<dddd=d>G d?d@ d@e|ZG dAdB dBe{ZeFjeeedCG dDdE dEejejeheleiegek	ZdS )FzAInference-only Qwen3VL model compatible with HuggingFace weights.    )CallableIterableIteratorMappingSequence)	lru_cachepartial)islice)AnyN)BatchFeature)Qwen2VLImageProcessorFast)smart_resize)Qwen3VLProcessorQwen3VLVideoProcessor)Qwen3VLConfigQwen3VLVisionConfig)VideoMetadata)support_torch_compile)
VllmConfig)BaseDummyOptionsVideoDummyOptions)get_pp_group)init_logger)_ACTIVATION_REGISTRY)Conv3dLayer)ColumnParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHead)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)compute_mrope_for_mediacompute_retained_tokens_countcompute_retention_maskrecompute_mrope_positions)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItemMultiModalKwargsItemsPlaceholderRange	VideoItem)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)
is_list_of)round_up)AttentionBackendEnum   )MultiModalEmbeddingsSupportsEagle3SupportsLoRASupportsMRoPESupportsMultiModalSupportsMultiModalPruning
SupportsPP_require_is_multimodal)Qwen2_5_VisionAttentionQwen2_5_VLImageEmbeddingInputsQwen2_5_VLImageInputsQwen2_5_VLImagePixelInputsQwen2_5_VLVideoEmbeddingInputsQwen2_5_VLVideoInputsQwen2_5_VLVideoPixelInputs)Qwen2VLMultiModalDataParserQwen2VLProcessingInfo_create_qwen2vl_field_factory)Qwen3ForCausalLM
Qwen3Model)AutoWeightsLoaderPPMissingLayerWeightsMapper_merge_multimodal_embeddingsmaybe_prefix)get_vit_attn_backendis_vit_use_data_parallel!run_dp_sharded_mrope_vision_modeli   c                       sR   e Zd Z				ddedededed	d
f
 fddZdejd	ejfddZ  ZS )Qwen3_VisionPatchEmbed           
patch_sizetemporal_patch_sizein_channelshidden_sizereturnNc                    s>   t    || _|| _|| _|||f}t||||dd| _d S )NT)kernel_sizestridebias)super__init__r\   r]   r_   r   proj)selfr\   r]   r^   r_   ra   	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_vl.pyre      s   

zQwen3_VisionPatchEmbed.__init__xc                 C   s:   |j \}}||d| j| j| j}| ||| j}|S N)shapeviewr]   r\   rf   r_   )rg   rl   LCrj   rj   rk   forward   s   
zQwen3_VisionPatchEmbed.forward)rX   rY   rZ   r[   )	__name__
__module____qualname__intre   torchTensorrs   __classcell__rj   rj   rh   rk   rW      s"    rW   c                       sd   e Zd Zdejddfdedededeej	gej	f de
dB d	ef fd
dZdej	fddZ  ZS )Qwen3_VisionMLPFN in_featureshidden_featuresrc   act_fnquant_configprefixc              	      sV   t    t }t||||d| d|d| _t||||d| d|d| _|| _d S )NF.linear_fc1)rc   r   return_biasr   
disable_tp.linear_fc2)rd   re   rU   r   
linear_fc1r   
linear_fc2r   )rg   r}   r~   rc   r   r   r   use_data_parallelrh   rj   rk   re      s*   
		
	zQwen3_VisionMLP.__init__rl   c                 C   s   |  | | |}|S N)r   r   r   )rg   rl   
mlp_outputrj   rj   rk   rs      s   zQwen3_VisionMLP.forward)rt   ru   rv   Fsilurw   boolr   rx   ry   r   strre   rs   rz   rj   rj   rh   rk   r{      s&    r{   c                       s   e Zd Zejdddfdedededeejgejf deege	j
f dB dedB d	ed
df fddZdejdejdejdejdejd
ejfddZ  ZS )Qwen3_VisionBlockNr|   dim	num_headsmlp_hidden_dimr   
norm_layerr   r   r`   c                    sn   t    |d u rttjdd}||| _||| _t||||| dd| _t	|||d|| dd| _
d S )Nư>epsz.attn)	embed_dimr   projection_sizer   r   Tz.mlp)r   rc   r   r   )rd   re   r   nn	LayerNormnorm1norm2rC   attnr{   mlp)rg   r   r   r   r   r   r   r   rh   rj   rk   re      s(   



zQwen3_VisionBlock.__init__rl   
cu_seqlensrotary_pos_emb_cosrotary_pos_emb_sin
max_seqlenc                 C   s6   || j | |||||d }|| | | }|S )Nr   r   r   r   )r   r   r   r   )rg   rl   r   r   r   r   rj   rj   rk   rs      s   zQwen3_VisionBlock.forward)rt   ru   rv   r   r   rw   r   rx   ry   r   Moduler   r   re   rs   rz   rj   rj   rh   rk   r      sF    	r   c                       st   e Zd Z					ddededeegejf dB ded	ed
edB de	ddf fddZ
dejdejfddZ  ZS )Qwen3_VisionPatchMergerNrY   Fr|   d_modelcontext_dimr   spatial_merge_sizeuse_postshuffle_normr   r   r`   c           	         s   t    t }||d  | _|| _| jr| j}|d u r#ttjdd}||| _t	| j| jd|| d|d| _
t | _t| j|d|| d|d| _d S )NrY   r   r   Tr   )rc   r   r   r   r   )rd   re   rU   r_   r   r   r   r   normr   r   GELUr   r   r   )	rg   r   r   r   r   r   r   r   r   rh   rj   rk   re     s4   



z Qwen3_VisionPatchMerger.__init__rl   c                 C   sZ   | j r| |d| j}n
| |d| j}| |\}}| |}| |\}}|S rm   )r   r   rp   r_   r   r   r   )rg   rl   
x_parallel_outrj   rj   rk   rs   ,  s   
zQwen3_VisionPatchMerger.forward)NrY   FNr|   )rt   ru   rv   rw   r   r   r   r   r   r   re   rx   ry   rs   rz   rj   rj   rh   rk   r     s0    	'r   c                       s$  e Zd Z			d$dedededB deddf
 fd	d
Zede	j
fddZ
ede	jfddZeedddededede	jfddZdeee  fddZdeee  de	jfddZde	jde	jfddZde	jde	jeee  B de	jfdd Zd!eeee	jf  dee fd"d#Z  ZS )%Qwen3_VisionTransformerr   Nr|   vision_confignorm_epsr   r   r`   c                    s  t    j_j_j_j_j_jd _j_t	dr+j
ng _
tjd _jdtj
  _tjjjjd_tjj_ttj|d jj }t|ddd	did
_tjj j dd_t fddttj
D _t|t  d_!j!t"j#t"j$t"j%hvrt&dj! dt fddtj'D _(d S )NrY   deepstack_visual_indexesg      ?r:   )r\   r]   r^   r_   r   i    Tpartial_rotary_factor)	head_sizemax_positionis_neox_stylerope_parametersz.merger)r   r   r   r   r   r   c                    s2   g | ]}t jjjd   d| dqS )Tz.deepstack_merger_list.)r   r   r   r   r   r   r   )r   out_hidden_sizer_   r   .0	layer_idxr   r   r   rg   r   rj   rk   
<listcomp>q  s    
z4Qwen3_VisionTransformer.__init__.<locals>.<listcomp>)r   dtypezQwen3-VL does not support z backend now.c                    s8   g | ]}t jjjtj   d | dqS )z.blocks.)r   r   r   r   r   r   r   )r   r_   r   intermediate_sizer   
hidden_actr   r   rj   rk   r     s    
))rd   re   r_   r   num_position_embeddingsr\   r   spatial_merge_unitr]   hasattrr   rw   num_grid_per_sider   lenrW   r^   patch_embedr   	Embedding	pos_embedr   r   r   rotary_pos_embr   merger
ModuleListrangedeepstack_merger_listrT   rx   get_default_dtypeattn_backendr9   
FLASH_ATTN
TORCH_SDPAROCM_AITER_FARuntimeErrordepthblocks)rg   r   r   r   r   head_dimrh   r   rk   re   9  sz   
	


z Qwen3_VisionTransformer.__init__c                 C      | j jjjS r   )r   rf   weightr   rg   rj   rj   rk   r        zQwen3_VisionTransformer.dtypec                 C   r   r   )r   rf   r   devicer   rj   rj   rk   r     r   zQwen3_VisionTransformer.devicei   )maxsizehwr   c                 C   s   t t | | d| |f}| | }|| }|||||}|dddd}| }t t |d|| |f}|||||}|dddd}| }tt j||gddS )Nr:   r   rY   rZ   rn   axis)	npbroadcast_toarangereshape	transposeflattenrx   
from_numpystack)r   r   r   hpos_idsh_divw_divwpos_idsrj   rj   rk   rot_pos_ids  s*   z#Qwen3_VisionTransformer.rot_pos_idsgrid_thwc                    sr   t dd |D } fdd|D }tj|ddj jdd} j|\}}|| d	}|| d	}||fS )
Nc                 s   s     | ]\}}}t ||V  qd S r   max)r   r   r   r   rj   rj   rk   	<genexpr>  s    z6Qwen3_VisionTransformer.rot_pos_emb.<locals>.<genexpr>c                    sB   g | ]\}}}|d kr  || jn  || j|d qS )r:   )r   r   repeat)r   tr   r   r   rj   rk   r     s    z7Qwen3_VisionTransformer.rot_pos_emb.<locals>.<listcomp>r   r   Tnon_blockingr:   )r   rx   cattor   r   get_cos_sinr   )rg   r   max_grid_sizepos_idscossincos_combinedsin_combinedrj   r   rk   rot_pos_emb  s   
z#Qwen3_VisionTransformer.rot_pos_embc           #      C   s   | j }| j}| jj}g }|D ]\}}}tjd|d |tj| jd}	tjd|d |tj| jd}
|	tj	}|
tj	}tj
|d |d d}tj
|d |d d}|	| }|
| }tj||dd\}}tj||dd\}}tj||dd\}}|| }|| }|| }d| | }t||||g}t||||g}|| }|| dd}tj||||gdd	ddd}|j| jd
}| |} | |9 } | jdd	}!|!|| ||| ||}!|!ddddddd|}!|!|ddd|}"||" qtj|dd	S )Nr   r:   )r   r   r   ij)indexing   rn   r   r   rY   rZ   )r   r   r   embedding_dimrx   linspacefloat32r   r   longclampmeshgridr   r   r   sumpermuteexpandappendr   )#rg   r   r   m_size
hidden_dimoutputsr   r   r   h_idxsw_idxsh_floorw_floorh_ceilw_ceildhdwdh_griddw_gridh_floor_gridw_floor_gridh_ceil_gridw_ceil_gridw11w10w01w00h_gridw_grid
h_grid_idxindicesweightsembedscombinedrepeatedrj   rj   rk   fast_pos_embed_interpolate  sP   	 
z2Qwen3_VisionTransformer.fast_pos_embed_interpolater   c                 C   sH   t jg |jd}| jtjks| jtjkr"|dd  |d d   }|S )N)r   r:   rn   )rx   zerosr   r   r9   r   r   r   )rg   r   r   rj   rj   rk   compute_attn_mask_seqlen  s
   z0Qwen3_VisionTransformer.compute_attn_mask_seqlenrl   c                 C   sp  |j | j| jdd}| |}t|tr|}tj|tjd}n|	 }|
 }| |}|| }| |\}}t|d d df |d d df  |d d df jdtjd}ttjdtjd|g}t|}|d}| |}	|j | jdd}g }
t| jD ]$\}}||||||	d	}|| jv r| j|}| j| |}|
| q| |}tj|g|
 dd
}|S )NT)r   r   r   r  r:   rY   r   )r   r   r   r   r   )r   r   r   r   
isinstancelistr   arrayint32tolistnumpyr,  r   r   cumsumconcatenater-  rx   r   	unsqueezer.  	enumerater   r   indexr   r  r   r   )rg   rl   r   hidden_statesgrid_thw_list
pos_embedsr   r   r   r   deepstack_feature_lists	layer_numblkdeepstack_merger_idxdeepstack_featurerj   rj   rk   rs     sN   


4






zQwen3_VisionTransformer.forwardr(  c                 C   s   g d}t | jdd}t }|D ]9\}}|D ]\}}}	||vr!q|||}|| }
|
j}||
||	  n|| }
t|
dt}||
| || q|S )N))	attn.qkv.zattn.q.q)rB  zattn.k.k)rB  zattn.v.vF)remove_duplicateweight_loader)dictnamed_parameterssetreplacerG  getattrr!   add)rg   r(  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamrG  rj   rj   rk   load_weightsJ  s"   
z$Qwen3_VisionTransformer.load_weights)r   Nr|   )rt   ru   rv   r   floatr   r   re   propertyrx   r   r   staticmethodr   rw   ry   r   r0  r   r,  r.  rs   r   tuplerJ  rW  rz   rj   rj   rh   rk   r   8  sJ    b <

,2r   c                       sD  e Zd Zdd ZdedefddZdedefddZdede	fd	d
Z
dd Zddddededededee	B dB deeef fddZd-dededef fddZdedeeef def fddZdedeeef defd d!Zd"ee ejB d#ed$efd%d&Z		d.d'eeef d(ed)edB d*edB dee f
d+d,Z  ZS )/Qwen3VLProcessingInfoc                 C   s   | j tS r   )ctxget_hf_configr   r   rj   rj   rk   r^  g  s   z#Qwen3VLProcessingInfo.get_hf_configkwargsr`   c                 K   s    | j jtfd|ddi|S )Nuse_fastT)r]  get_hf_processorr   poprg   r_  rj   rj   rk   ra  j  s   
z&Qwen3VLProcessingInfo.get_hf_processorc                 K      | j di |jS Nrj   )ra  image_processorrc  rj   rj   rk   get_image_processorq     z)Qwen3VLProcessingInfo.get_image_processorc                 K   rd  re  )ra  video_processorrc  rj   rj   rk   get_video_processort  rh  z)Qwen3VLProcessingInfo.get_video_processorc                 C   s   t |  jjd|  dS )NT)video_needs_metadataexpected_hidden_size)rJ   r^  r   r   _get_expected_hidden_sizer   rj   rj   rk   get_data_parserw  s
   
z%Qwen3VLProcessingInfo.get_data_parserrY   T)
num_frames	do_resizeimage_widthimage_heightro  rp  rf  Nc                C   s  |d u r|dkr|   }n|d u r|  }t|t}|  }|j}|j}	|j}
|j}|rX|r6t	}||d}nt
}i }|d|||	|
 |jd |jd d|\}}t||d}nt||d}t||}t|| d}|j|	 }|j|	 }|| | }||
d  }||fS )	Nr:   )ro  temporal_factorshortest_edgelongest_edge)heightwidthfactor
min_pixels
max_pixels)rw  rv  rY   rj   )rj  rg  r/  r   r^  r   r\   r   r]   video_smart_resizeimage_smart_resizesizer/   r8   r   rv  rw  )rg   rq  rr  ro  rp  rf  is_video	hf_configr   r\   
merge_sizer]   r   extra_kwargsresized_heightresized_widthpreprocessed_sizepadded_num_framesgrid_tgrid_hgrid_wnum_patchesnum_vision_tokensrj   rj   rk   _get_vision_info~  sH   	





z&Qwen3VLProcessingInfo._get_vision_info
max_tokensstart_num_framesc                    s   t  j||dS )N)r  )rd   _get_max_video_frames)rg   r  r  rh   rj   rk   r    s   z+Qwen3VLProcessingInfo._get_max_video_framesseq_len	mm_countsc                    s   t  j||tdS )N)max_frames_per_video)rd   !get_num_frames_with_most_featuresDUMMY_VIDEO_NUM_FRAMES)rg   r  r  rh   rj   rk   r    s   z7Qwen3VLProcessingInfo.get_num_frames_with_most_featuresc                 C   s>   |   }|jd }| j||j d\}}| j||dd d}|S )Nru  rz  rY   rq  rr  ro  rf  )rj  r}  !get_image_size_with_most_featuresr]   get_num_video_tokens)rg   r  r  ri  video_max_pixelstarget_widthtarget_heightnum_video_soft_tokensrj   rj   rk   get_max_video_tokens  s   

z*Qwen3VLProcessingInfo.get_max_video_tokensr'  	video_fpsr  c                    sv   t |ts	| }t|  dkr ||d g t|     }fdd|D  fddtdt D S )Nr   rn   c                    s   g | ]}|  qS rj   rj   r   idx)r  rj   rk   r         z?Qwen3VLProcessingInfo._calculate_timestamps.<locals>.<listcomp>c                    s(   g | ]}| |  d    d qS )r:   rY   rj   r   i)r  
timestampsrj   rk   r     s    )r/  r0  r3  r   r   )rg   r'  r  r  rj   )r  r  r  rk   _calculate_timestamps  s   
z+Qwen3VLProcessingInfo._calculate_timestampsmetadataout_itemdo_sample_framessampled_fpsc                 C   s   |   }|j}|d }|d }|d u r|dd}|rM|r|n|j}|d }	t|	|d  | }
ttt|
|j|j|	}
t	
d|	d |
 t }| |||}|S )Nframes_indicesfpsr  Ftotal_num_framesr   r:   )rj  r  getr  rw   minr   
min_frames
max_framesr   r  roundastyper3  r  )rg   r  r  r  r  ri  r  r'  r  r  ro  r  rj   rj   rk   _get_video_second_idx  s0   
z+Qwen3VLProcessingInfo._get_video_second_idx)rY   NN) rt   ru   rv   r^  objectr   ra  r   rg  r   rj  rn  rw   r   r[  r/   r  r  r   r   r  r  r0  rx   ry   rX  r  rH  r
   r+   r  rz   rj   rj   rh   rk   r\  f  sp    


7
	



r\  c                   @   sz   e Zd Zdeeef defddZ	ddedeeef deeef dB defdd	Z	d
edededede
e f
ddZdS )Qwen3VLDummyInputsBuilderr  r`   c                 C   s0   | dd}| dd}d}d}|| ||  S )Nimager   video+<|vision_start|><|image_pad|><|vision_end|>+<|vision_start|><|video_pad|><|vision_end|>)r  )rg   r  
num_images
num_videosimage_tokenvideo_tokenrj   rj   rk   get_dummy_text  s
   z(Qwen3VLDummyInputsBuilder.get_dummy_textNr  
mm_optionsc                 C   s  | dd}| dd}|r| dnd }|r| dnd }| j \}}	d}
|rOt|ts0J |j}|rO||
kr@td||
 |dk rJtd| t|
|}
t	|
d}
| j
 }|jd }| jj||j d\}}| jj|||
|d	\}}|j|j}}|rt|tsJ |j}|r||krtd
|| t||}|j}|r||krtd|| t||}| j||	||d| j|||
|ddS )Nr  r   r  rY   z]video.num_frames override (%d) exceeds model's maximum number of frames (%d), will be ignoredzEvideo.num_frames override (%d) cannot be less than 2, will be ignoredru  r  r  zMvideo.width override (%d) exceeds model's maximum width (%d), will be ignoredzOvideo.height override (%d) exceeds model's maximum height (%d), will be ignored)rw  rv  r  	overrides)rw  rv  ro  r  )r  r  )r  infor  r/  r   ro  loggerwarningr  r   rj  r}  r]   r  rw  rv  _get_dummy_images_get_dummy_videos)rg   r  r  r  r  r  image_overridesvideo_overridestarget_image_widthtarget_image_heighttarget_num_framesnum_frames_overrideri  r  target_video_widthtarget_video_heighttarget_video_sizer   width_overrideheight_overriderj   rj   rk   get_dummy_mm_data  s   




	

z+Qwen3VLDummyInputsBuilder.get_dummy_mm_datarw  rv  ro  r  c          
      C   sj   t j|||dfdt jd}g }t|D ]}d|d |dd t|D ddd	}| |f}	||	 q|S )
NrZ      r  g       @c                 S   s   g | ]}|qS rj   rj   r  rj   rj   rk   r         z?Qwen3VLDummyInputsBuilder._get_dummy_videos.<locals>.<listcomp>opencvF)r  durationr  r  video_backendr  )r   fulluint8r   copyr  )
rg   rw  rv  ro  r  r  video_itemsr  video_metadata
video_itemrj   rj   rk   r  z  s   z+Qwen3VLDummyInputsBuilder._get_dummy_videosr   )rt   ru   rv   r   r   rw   r  r   r(   r  r0  r.   r  rj   rj   rj   rk   r    s.    

`r  c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )Qwen3VLMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr`   c                    sH  t |}| jjdi |}|dg  }rg }g }|D ]b}	|	\}
 t di |}d|vr4 dd|d< tdi  fdd D  t  }|
gg|d<  gg|d< t jd|||d}|d	}|j	|d
 }|
d|d}||d  ||d  qt t|t|d}nt  }t j||||d}t |fi |}t|S )Nvideosr  Fc                    s   i | ]}|d kr| | qS )r  rj   )r   rD  r  rj   rk   
<dictcomp>  s    zAQwen3VLMultiModalProcessor._call_hf_processor.<locals>.<dictcomp>r  r  )r  r  r  r  	input_idsr   r:   video_grid_thwpixel_values_videos)r  r  rj   )rH  r  ra  rb  r  r   rd   _call_hf_processor	tokenizerbatch_decoderK  r  rx   r   r   )rg   r  r  r  r  	processorr  video_grid_thw_lstpixel_values_videos_lstitemvideo_arrayvideo_mm_kwargsvideo_mm_datavideo_outputsr  video_placeholderprocessed_outputscombined_outputsrh   r  rk   r    sd   	
z-Qwen3VLMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s   t | j jj|S r   )rL   r  r^  r   r   )rg   r  r  rj   rj   rk   _get_mm_fields_config  s
   z0Qwen3VLMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc              
      s   j jdi  j jdi }j  j  }|j|j	|j|jd dt	f fdd}dt	f	f	dd}t
d j|dt
d	d
|dgS )NrY   item_idxc                    sB   d |  }|d j }t|tjsJ t|  } jg| S )Nr  image_grid_thw)datar/  rx   ry   rw   prodimage_token_id)r  r  r   
num_tokens)hf_processormerge_lengthr  rj   rk   get_image_replacement_qwen3vl  s
   
zUQwen3VLMultiModalProcessor._get_prompt_updates.<locals>.get_image_replacement_qwen3vlc                    s  d |  }|d j }t|tjsJ d |  \}}d}d}t|tr.||  }j||||}t	||d ksNJ dt	| d|d  dfd	d
|D }t
|dd      fdd
|D }	jj j}
|
d ur|
dkrt t	||
}t	|dkrg }	nAt	|dkr g}	n7 }t|| d}|t	|d  }|t	|d  }|g}	tdt	|D ]}||d |k rdnd }|	| qg }t|D ]#\}}|| |	|t	|	k r|nd }|	gg|  g  qt|S )Nr  r  r  r  r   zThe timestamps length(z ) should be equal video length (z).c                    s$   g | ]} j d |ddddqS )<z.1fz	 seconds>F)add_special_tokens)encode)r   	curr_time)r  rj   rk   r     s    ziQwen3VLMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement_qwen3vl.<locals>.<listcomp>r:   c                    s   g | ]} qS rj   rj   r   r   tokens_per_framerj   rk   r     r          rn   )r  r/  rx   ry   r  r7   rX  r  r  r   rw   r  r]  get_mm_configvideo_pruning_rater%   r   r   r  r8  extendr5   select_token_id)r  r  r   r  r  r  r  r  frames_idx_tokenper_frame_token_countsr  total_retainedfirst_frame_tokensremaining_tokensbase	remainder	frame_idxextraplaceholdertimestamp_tokenstokens_this_frame)	r  r  r  r  rg   r  video_token_idvision_end_token_idvision_start_token_idr	  rk   get_video_replacement_qwen3vl  sl   





zUQwen3VLMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement_qwen3vlr  )modalitytargetreplacementr  r  rj   )r  ra  rg  get_tokenizerr^  r  r  r  r  rw   r3   r  )rg   r  r  r  rf  r  r  r  rj   )
r  r  r  r  r  rg   r  r  r  r  rk   _get_prompt_updates  s*   


"=z.Qwen3VLMultiModalProcessor._get_prompt_updates)rt   ru   rv   r   r   r  r   r  r*   r  r0   r
   r,   r   r4   r$  rz   rj   rj   rh   rk   r    s8    


M


	
r  rn   r  	positionsintermediate_tensorsinputs_embedsdeepstack_input_embeds)dynamic_arg_dimsc                       sr   e Zd Zdddedef fddZ			ddejdB d	ejd
edB dejdB dedB dejeB fddZ	  Z
S )Qwen3LLMModelr|   r   vllm_configr   c                   sP   t  j||d |jjj}t js"t|dr$| jt	|j
ks&J dd S d S d S )Nr-  r   r   zLstart_layer should be greater than or equal to len(deepstack_visual_indexes))rd   re   model_configr  r   r   is_first_rankr   start_layerr   r   )rg   r-  r   r   rh   rj   rk   re   [  s   

zQwen3LLMModel.__init__Nr  r&  r'  r(  r)  r`   c                 C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }g }tt| j| j| jD ].\}	}
|	| jv r>|	||  |
|||\}}|d ur\|	t
dt|v r\||d|	   }q.t  jsht||dS | ||\}}t|dkrz||fS |S )Nr:  residualr   deepstack_input_embeds_)r:  r2  )r   r0  embed_input_idsr	   r8  layersr1  	end_layeraux_hidden_state_layersr  r   r   is_last_rankr6   r   )rg   r  r&  r'  r(  r)  r:  r2  aux_hidden_statesr   layerr   rj   rj   rk   rs   f  sF   	

zQwen3LLMModel.forward)NNN)rt   ru   rv   r   r   re   rx   ry   r6   rs   rz   rj   rj   rh   rk   r+  N  s$    r+  c                       s,   e Zd Zdddedef fddZ  ZS )Qwen3LLMForCausalLMr|   r,  r-  r   c                   s   t t|   |jjj}|j}|| _|| _t|t	|dd| _
t jr8|jr,| j
j| _nt|j|j|dd| _nt | _t|j| _| j
j| _d S )Nmodelr.  lm_head)r   r   )rd   rM   re   r/  r  text_configr   configr+  rS   r<  r   r8  tie_word_embeddingsembed_tokensr=  r    
vocab_sizer_   rP   r   logits_processormake_empty_intermediate_tensors)rg   r-  r   r?  r   rh   rj   rk   re     s*   


zQwen3LLMForCausalLM.__init__)rt   ru   rv   r   r   re   rz   rj   rj   rh   rk   r;    s    $r;  )r  dummy_inputsc                       s  e Zd Zg dddgdgdZdZeddd	d
dZedede	dedB fddZ
dddedef fddZdee	df ddfddZdee	df fddZde	dedB fdd Zd!ejddfd"d#Zde	ddfd$d%Zd&ededB fd'd(Zd&ededB fd)d*Zd+edeejdf fd,d-Zd.edeejdf fd/d0Zd1eejdf d+edeejdf fd2d3Zd4eejdf d.edeejdf fd5d6Zd&edefd7d8Z d9e!e	 d:e!e" de#ee	e	e	f  fd;d<Z$d=e%d>e	de!ej dB fd?d@Z&d=e%d>e	de!e	 dB fdAdBZ'd=e%d>e	de!e	 dB fdCdDZ(dEe!e	 dFeejdf dGej)dHe	deeejdf eje	f f
dIdJZ*d9e!e	 d:e!e" deeje	f fdKdLZ+d&ede,dB fdMdNZ-dOejdFe,dPejdeeje,f fdQdRZ.	djddSdTdEejdFe,dB dPejdB dUe/dejf
dVdWZ0		dkdEejdB dXejdYedB dOejdB d&edejeB fdZd[Z1d\ejdejdB fd]d^Z2d_e3eeejf  de4e fd`daZ5de6fdbdcZ7dde	de	fdedfZ8dge	de	fdhdiZ9  Z:S )lQwen3VLForConditionalGeneration)q_projk_projv_proj	gate_projup_projqkv)qkv_projgate_up_projrL  Tvisual.zlanguage_model.lm_head.zlanguage_model.model.)zmodel.visual.zlm_head.zmodel.language_model.)orig_to_new_prefixr   r  r`   Nc                 C   s$   | drdS | drdS td)Nr  r  r  r  z)Only image or video modality is supported)
startswith
ValueError)clsr   r  rj   rj   rk   get_placeholder_str  s
   

z3Qwen3VLForConditionalGeneration.get_placeholder_strr<  r,  r-  r   c                   sH  t    jj j}jj} | _|| _|jdk| _|j	| _	|
 | _
t jd| _| jr5t jjnd| _ jj| _| j| j | _| ddh* t jt dd|t|dd	| _| jro fd
dt| jD | _W d    n1 syw   Y  |  tt|dd| _W d    n1 sw   Y  | jj| _d S )Nr  r   r   r  r  rms_norm_epsr   visual)r   r   r   c                    s    g | ]}t jj jjqS rj   )rx   r-  scheduler_configmax_num_batched_tokensr>  r_   r  r?  r-  rj   rk   r     s    z<Qwen3VLForConditionalGeneration.__init__.<locals>.<listcomp>language_modelr.  )rd   re   r/  r  r   multimodal_configr?  mm_encoder_tp_moder   r  is_multimodal_pruning_enabledr   r   use_deepstackr   r   deepstack_num_levelr   
visual_dimmultiscale_dim_mark_tower_modelr   rL  rS   rV  r   r)  _mark_language_modelr;  rZ  rD  )rg   r-  r   r   r[  rh   rY  rk   re     sJ   




z(Qwen3VLForConditionalGeneration.__init__r5  .c                 C   s   || j j_d S r   )rZ  r<  r7  )rg   r5  rj   rj   rk   set_aux_hidden_state_layers  s   z;Qwen3VLForConditionalGeneration.set_aux_hidden_state_layersc                 C   s    t | jjj}d|d |d fS )NrY   rZ   )r   rZ  r<  r5  )rg   
num_layersrj   rj   rk   "get_eagle3_aux_hidden_state_layers  s   zBQwen3VLForConditionalGeneration.get_eagle3_aux_hidden_state_layersr   c                    s.   t dd sd S t fddtjD S )Nr)  c                    s&   i | ]}d | j | d  qS )r3  N)r)  r  r   rg   rj   rk   r  ,  s    zOQwen3VLForConditionalGeneration._get_deepstack_input_embeds.<locals>.<dictcomp>)rL  r6   r   r_  )rg   r   rj   rg  rk   _get_deepstack_input_embeds#  s   z;Qwen3VLForConditionalGeneration._get_deepstack_input_embedsr)  c                    sz   t dd sd S |d  jd dkr% fddtjD _tjD ]}j| d   ||  q*d S )Nr)  r:   r   c                    s4   g | ]}t j jjjjd  jjd  jdqS )r   r   r   )rx   r-  r?  r>  r_   r)  r   r   r  rg  rj   rk   r   ;  s    

zOQwen3VLForConditionalGeneration._set_deepstack_input_embeds.<locals>.<listcomp>)rL  r}  r)  r   r_  copy_)rg   r)  r  rj   rg  rk   _set_deepstack_input_embeds4  s   
	z;Qwen3VLForConditionalGeneration._set_deepstack_input_embedsc                 C   sF   t | dd sd S |dkrt| jD ]}| j| d |   qd S d S )Nr)  r   )rL  r   r_  r)  zero_)rg   r   r  rj   rj   rk   _clear_deepstack_input_embedsI  s   z=Qwen3VLForConditionalGeneration._clear_deepstack_input_embedsr_  c                 K   sh   | dd }| dd }| dd }|d u r|d u rd S |d ur'td||dS |d ur2td||dS d S )Npixel_valuesimage_embedsr  )typern  r  )rp  ro  r  )rb  rF   rD   )rg   r_  rn  ro  r  rj   rj   rk   _parse_and_validate_image_inputR  s$   z?Qwen3VLForConditionalGeneration._parse_and_validate_image_inputc                 K   sv   | dd }| dd }| dd }| dd }|d u r"|d u r"d S |d ur.td|||dS |d ur9td||dS d S )Nr  video_embedsr  second_per_grid_ts)rp  r  r  rs  )rp  rr  r  )rb  rI   rG   )rg   r_  r  rr  r  rs  rj   rj   rk   _parse_and_validate_video_inputj  s(   z?Qwen3VLForConditionalGeneration._parse_and_validate_video_inputimage_inputc                 C   s   |d }|j dksJ |d dkr|d | jj}n|d | jj}| jr2t| j|| ddS | j||d}| jj}|d	| |  }|	|S )
Nr  rY   rp  ro  rn  rope_3d	rope_typer   rn   )
ndimrp  rV  r   r   rV   r3  r   r  split)rg   ru  r   ro  rn  r  sizesrj   rj   rk   _process_image_input  s   
z4Qwen3VLForConditionalGeneration._process_image_inputvideo_inputc                 C   s   |d }|j dksJ |d dkr|d | jj}n |d | jj}| jr4| }t| j||ddS | j||d}| jj}|d	| |  }|	|S )
Nr  rY   rp  rr  r  rv  rw  ry  rn   )
rz  rp  rV  r   r   r3  rV   r   r  r{  )rg   r~  r   rr  r  r;  r  r|  rj   rj   rk   _process_video_input  s    

z4Qwen3VLForConditionalGeneration._process_video_inputimage_embeds_splitc           
      C   sj   | j j}|d }| }g }t||D ]\}}t|||j}	tj||	gdd}|	| q|}t
|S )a  
        Append mrope positions for each for images.
        This is necessary to recover correct mrope
        positions after video pruning

        Args:
            image_embeds_split: Tuple of image embeddings for
                each image item.
            image_input: Image input data.

        Returns:
            Tuple of image embeddings for each image item.
            Resulting embeddings will have extra 4 channels for
            computed mrope positions.
        r  r:   r   )rV  r   r3  zipr$   r   r   rx   r   r  r[  )
rg   r  ru  r  r   r;  image_embeds_outembr}  r&  rj   rj   rk   _postprocess_image_embeds_evs  s   z=Qwen3VLForConditionalGeneration._postprocess_image_embeds_evsvideo_embeds_splitc                 C   s0  |d }|j dksJ | }| jj}|d}|du r'tjt|tjd}n| }t	| j
jdd}g }t|||D ]X\}	}
}t|	|
| jj| jd}td	|	jd
 |  |
d
 |
d |
d | jd|    d  t|
||| d|	j}|	| }	|| }tj|	|gdd}	||	 q;t|S )a  
        Prunes video embeddings via Efficient Video Sampling (EVS)
        and then appends mrope positions for each retained embeddings

        Args:
            video_embeds_split: Tuple of video embeddings for each video item.
            video_input: Video input data.

        Returns:
            Tuple of video embeddings for each video item.
            Resulting embeddings will have extra 4 channels for
            computed mrope positions.
        r  rY   rs  Nr  tokens_per_secondg      ?)r   rC  z\EVS: Video tokens pruned from %d to %d (T=%d,H=%d,W=%d, pruning_rate=%.2f, reduction=%.1f%%)r   r:   d   )r  video_second_per_gridr   )rz  r3  rV  r   r  rx   onesr   r  rL  r?  r   r  r&   r  r  debugro   r  r  rX  meanr$   r   r   r   r  r[  )rg   r  r~  r   r;  r  rs  r  video_embeds_outr  r}  video_second_per_grid_tretention_maskr&  rj   rj   rk   _postprocess_video_embeds_evs  sT   

z=Qwen3VLForConditionalGeneration._postprocess_video_embeds_evsc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)rn  ro  r  )r  rr  r  rj   )rq  rt  )rg   r_  mm_input_by_modality	input_keyrj   rj   rk   %_parse_and_validate_multimodal_inputs  s   

zEQwen3VLForConditionalGeneration._parse_and_validate_multimodal_inputsinput_tokensmm_featuresc                 c   sZ   | j j}| j jj}t|dd dD ]}|jj}|jdkr>|jd j	 \}}}	|dks3J d| ||| |	| fV  q|jdkr|jd	 j	 \}}}	|| }
|	| }t
| d
od| jduod| jdk}|r| |j|}|dur|D ]
}|| |
|fV  qtqtd| j dt|D ]}|||}||
|fV  ||
| 7 }qqtd|j dS )a  
        Iterate over multimodal features and yield grid information.

        For videos with EVS (Efficient Video Sampling) enabled, this function
        computes the offset based on the pruned token count rather than relying
        on input_tokens.index(), which would fail when tokens are pruned.

        Args:
            input_tokens: List of token IDs in the prompt
            mm_features: List of multimodal feature specifications

        Yields:
            Tuple of (offset, grid_h, grid_w) for each frame/image
        c                 S   s   | j jS r   )mm_positionoffset)frj   rj   rk   <lambda>@  s    zAQwen3VLForConditionalGeneration.iter_mm_grid_hw.<locals>.<lambda>)keyr  r  r:   zImage must have 1 frame, got r  r  r  Nr  zEVS is enabled (pruning_rate=z[) but is_embed mask is missing from mm_position. This indicates a bug in prompt processing.zUnsupported modality: )r?  r  r   r   sortedr  r  r   r  r3  r   r   _extract_frame_offsets_from_maskr   r   r9  rR  )rg   r  r  r  r   
mm_featurer  r   r   r   
llm_grid_h
llm_grid_wis_evs_enabledframe_offsets
rel_offsetr   rj   rj   rk   iter_mm_grid_hw-  sH   



z/Qwen3VLForConditionalGeneration.iter_mm_grid_hwr  expected_framesc           	      C   s   t |dd}|du rdS tj|tjdd}tj|dd }| dkr(dS | dkr2|g}n%t|}tj|dkdd }| dkrL|g}nt	||
d }t||k rhtd	t|| dS |d| S )
a  Extract contiguous segments from EVS is_embed mask.

        The EVS (Efficient Video Sampling) mask marks which placeholder
        positions should be filled with video embeddings. This method splits
        the mask into contiguous segments, where each segment represents one
        retained frame.

        This is a pure function - it does not modify any state and always
        returns the same output for the same input (idempotent).

        Args:
            mm_position: MultiModal position containing the is_embed mask
            expected_frames: Expected number of frame segments

        Returns:
            List of tensors, each containing indices for one frame segment,
            or None if EVS is not enabled or validation fails.
        is_embedNr  rn   F)as_tupler   r:   z8EVS mask segments (%d) do not match expected frames (%d))rL  rx   	as_tensorr   rp   nonzeror   numeldifftensor_splitrM  r3  r   r  r  )	rg   r  r  is_embed_maskmask_tensortrue_indicessegmentsdiffssplit_pointsrj   rj   rk   _get_evs_mask_segmentsl  s0   
z6Qwen3VLForConditionalGeneration._get_evs_mask_segmentsc                 C   &   |  ||}|du rdS dd |D S )a  Return relative offsets for each EVS-retained frame.

        The prompt processor stores a boolean mask inside ``mm_position`` that
        marks which placeholder locations should be populated with video
        embeddings. By splitting that mask into contiguous runs we can recover
        the start of every retained frame without probing ``input_tokens``.

        Args:
            mm_position: MultiModal position containing the is_embed mask
            expected_frames: Expected number of frames

        Returns:
            List of starting offsets (relative to mm_position) for each frame,
            or None if EVS is not enabled.
        Nc                 S   s   g | ]
}t |d   qS )r   )rw   r  )r   segmentrj   rj   rk   r     s    zTQwen3VLForConditionalGeneration._extract_frame_offsets_from_mask.<locals>.<listcomp>r  rg   r  r  r  rj   rj   rk   r    s   z@Qwen3VLForConditionalGeneration._extract_frame_offsets_from_maskc                 C   r  )a  Return actual token count for each EVS-retained frame.

        This function calculates the actual number of tokens per frame by
        analyzing the is_embed mask, accounting for EVS pruning. Each frame
        may have a different token count due to content-aware pruning.

        Args:
            mm_position: MultiModal position containing the is_embed mask
            expected_frames: Expected number of frames

        Returns:
            List of token counts for each frame, or None if EVS is not enabled.
        Nc                 S      g | ]}t |qS rj   r   )r   segrj   rj   rk   r     r  zRQwen3VLForConditionalGeneration._get_actual_frame_token_counts.<locals>.<listcomp>r  r  rj   rj   rk   _get_actual_frame_token_counts  s   z>Qwen3VLForConditionalGeneration._get_actual_frame_token_countsr  multimodal_embeddingsmrope_positionsnum_computed_tokensc                 C   s   | j j}| j j}| j j}t|r|d jn|j}tj||tjd}	dd |D }
dd |D }t	|	||||||\}}t
|
||fS )a&  
        Update part of input mrope positions (starting with
        num_computed_tokens index). Original mrope_positions are computed
        for unpruned sequence and becomes incorrect once pruning occurs,
        so once we prune media tokens we should reflect this in the
        mrope_positions before we feed it to LLM.

        Args:
            input_ids: (N,) All input tokens of the prompt (Containing
                entire sequence).
            multimodal_embeddings: Tuple of multimodal embeddings.
            mrope_positions: Existing mrope positions (3, N) for entire
                sequence
            num_computed_tokens: A number of computed tokens so far.

        Returns:
            Tuple of (multimodal_embeddings, mrope_positions,
                mrope_position_delta).
        r   ri  c                 S   s    g | ]}|d d d df qS )Nrj   r   mmrj   rj   rk   r     s     zMQwen3VLForConditionalGeneration.recompute_mrope_positions.<locals>.<listcomp>c                 S   s,   g | ]}|d d dd f  dd qS )Nr  r:   r   )r  r  r  rj   rj   rk   r     s     )r?  r  r  r  r   r   rx   r  r  r'   r[  )rg   r  r  r  r  r  r  r  r   input_ids_tmm_embeddings_outmm_embeddings_posr&  mrope_positions_deltarj   rj   rk   r'     s,   
z9Qwen3VLForConditionalGeneration.recompute_mrope_positionsc                 C   sn  i }|D ]7}|j dkr;t| do| jd uo| jdk}|r;|jd j d }| |j|}|d us5J d|||jj< qg }d}	i }
| ||D ]\}}}||	 }t	|dkr_|d 
 d nd}d }|D ]}||krm|}qe|d ur||v s~J d	| d
||
vrd|
|< || }|
| }|t	|k sJ d| dt	| d|| }|
|  d7  < n|| }tt|d|f| }|| ||7 }td||fdd}|d d d |f | }|| || }	qH|	t	|k rt	|dkr|d 
 d nd}t	||	 }tt|d|f| }|| tj|dddd}|
 d t	|  }t||fS )Nr  r  r  r  r   zGEVS enabled but failed to extract frame token counts from is_embed maskrn   r:   zFound base_offset z" but not in frame_token_counts_mapzEVS frame index z out of range (total frames: )rZ   r   )r   r   r  r  r3  r  r  r  r  r   r   r   r   r   r  r'  r   r6  r  rx   r   )rg   r  r  frame_token_counts_mapr  r  r   token_countsllm_pos_ids_liststframe_counts_idxr  r  r  text_lenst_idxbase_offsetfeat_offsetcountsr  actual_frame_tokenstext_positionsgrid_indicesframe_positionsfinal_text_positionsllm_positionsmrope_position_deltarj   rj   rk   get_mrope_input_positions
  s|   


 




 
z9Qwen3VLForConditionalGeneration.get_mrope_input_positionsc                 K   s   | j di |}|sd S d}|D ]6}|| }|dkr.| |}| jr(| ||}|t|7 }|dkrF| |}| jr@| ||}|t|7 }q|S )Nrj   r  r  )r  r}  r]  r  r[  r  r  )rg   r_  r  r  r   multimodal_inputimage_embeddingsvideo_embeddingsrj   rj   rk   embed_multimodalg  s,   

z0Qwen3VLForConditionalGeneration.embed_multimodalr(  is_multimodalc           	      C   s   dd |D }t j|dd}t j|| j| jgdd\}}t j||dd}t j||dd}||d| j|d }t|||d}|	|j
d | j| j}|ddd}||fS )	Nc                 S   r  rj   r  )r   rl   rj   rj   rk   r     r  zMQwen3VLForConditionalGeneration._compute_deepstack_embeds.<locals>.<listcomp>r   r   rn   r:   r(  r  r  rY   )rx   r   r{  r`  ra  	new_zerosr}  r_  rR   rp   ro   r  )	rg   r(  r  r  visual_lensmultimodal_embeddings_catmultimodal_embeddings_main multimodal_embeddings_multiscaler)  rj   rj   rk   _compute_deepstack_embeds  s:   
z9Qwen3VLForConditionalGeneration._compute_deepstack_embedsFr  handle_oov_mm_tokenr  c                C   sz   | j || jj||d}|d u st|dkr|S t|}| jr)| j|||d\}}nd }t|||d}|d ur;| | |S )Nr  r   r  )	_embed_text_input_idsrZ  r4  r   rB   r^  r  rR   rk  )rg   r  r  r  r  r(  r)  rj   rj   rk   r4    s6   
z/Qwen3VLForConditionalGeneration.embed_input_idsr&  r'  c                 K   sl   |durd}|durt  jr| |d}nd}| jj|||||d}|dur4t  jr4| |d |S )a  Run forward pass for Qwen3VL.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Flattened (concatenated) position ids corresponding to a
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen3VL
                opensource models), the shape will be `(3, seq_len)`,
                otherwise it will be `(seq_len,).
            intermediate_tensors: Intermediate tensors from previous pipeline
                stages.
            inputs_embeds: Pre-computed input embeddings.
            **kwargs: Additional keyword arguments including:
                - pixel_values: Pixel values to be fed to a model.
                    `None` if no images are passed.
                - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in
                    LLM. `None` if no images are passed.
                - pixel_values_videos: Pixel values of videos to be fed to a
                    model. `None` if no videos are passed.
                - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in
                    LLM. `None` if no videos are passed.
        Nr   r%  )r   r0  rh  r}  rZ  r<  rm  )rg   r  r&  r'  r(  r_  r)  r:  rj   rj   rk   rs     s"    	z'Qwen3VLForConditionalGeneration.forwardr:  c                 C   s   | j |S r   )rZ  compute_logits)rg   r:  rj   rj   rk   r    s   z.Qwen3VLForConditionalGeneration.compute_logitsr(  c                 C   s   t | }|j|| jdS )N)mapper)rO   rW  hf_to_vllm_mapper)rg   r(  loaderrj   rj   rk   rW    s   z,Qwen3VLForConditionalGeneration.load_weightsc                 C   s   t jdddgddS )z<
        Get the module prefix in multimodal models
        rZ  zvisual.mergerzvisual.deepstack_merger_listrO  )rZ  	connectortower_model)r"   from_string_fieldr   rj   rj   rk   get_mm_mapping  s
   z.Qwen3VLForConditionalGeneration.get_mm_mappingnum_image_tokensc                 C   s   | j }|j}|j}||d  S NrY   r?  r   r   )rg   r  r  r   r  rj   rj   rk   get_num_mm_encoder_tokens$  s   z9Qwen3VLForConditionalGeneration.get_num_mm_encoder_tokensr  c                 C   s   | j }|j}|j}||d  S r  r  )rg   r  r  r   r  rj   rj   rk   get_num_mm_connector_tokens.  s   z;Qwen3VLForConditionalGeneration.get_num_mm_connector_tokensr   r  );rt   ru   rv   packed_modules_mappingsupports_encoder_tp_datarQ   r  classmethodr   rw   rT  r   re   r[  rd  rf  r6   rh  rx   ry   rk  rm  r  rE   rq  rH   rt  r}  r  r  r  rH  r  r0  r)   r   r  r-   r  r  r  
LongTensorr'   r  r;   r  r  r   r4  rs   r  r   rJ  rW  r"   r  r  r  rz   rj   rj   rh   rk   rF    sH   2
	





H
?
7




9
]
,
/
8
$


rF  )__doc__collections.abcr   r   r   r   r   	functoolsr   r   	itertoolsr	   typingr
   r4  r   rx   torch.nnr   torch.nn.functional
functionalr   transformersr   transformers.models.qwen2_vlr   6transformers.models.qwen2_vl.image_processing_qwen2_vlr   r|  transformers.models.qwen3_vlr   r   3transformers.models.qwen3_vl.configuration_qwen3_vlr   r   6transformers.models.qwen3_vl.video_processing_qwen3_vlr{  transformers.video_utilsr   vllm.compilation.decoratorsr   vllm.configr   vllm.config.multimodalr   r   vllm.distributedr   vllm.loggerr   %vllm.model_executor.layers.activationr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr    -vllm.model_executor.model_loader.weight_utilsr!   )vllm.model_executor.models.module_mappingr"   vllm.multimodalr#   vllm.multimodal.evsr$   r%   r&   r'   vllm.multimodal.inputsr(   r)   r*   r+   r,   r-   r.   vllm.multimodal.parser/   r0   vllm.multimodal.processingr1   r2   r3   r4   r5   vllm.sequencer6   vllm.utils.collection_utilsr7   vllm.utils.math_utilsr8   #vllm.v1.attention.backends.registryr9   
interfacesr;   r<   r=   r>   r?   r@   rA   rB   
qwen2_5_vlrC   rD   rE   rF   rG   rH   rI   qwen2_vlrJ   rK   rL   qwen3rM   rN   utilsrO   rP   rQ   rR   rS   visionrT   rU   rV   rt   r  r  r   rW   r{   r   r   r   r\  r  r  r+  r;  register_processorrF  rj   rj   rj   rk   <module>   s   $	($
	%44  0 +  =@!

