o
    
۾i                  	   @   s*  U d Z ddlZddlmZmZmZmZmZ ddlm	Z	 ddl
mZmZmZmZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl&m)Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z= ddl>m?Z? ddl@mAZA ddlBmCZCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZO ddlPmQZQmRZRmSZSmTZTmUZU ddlVmWZW ddlXmYZY dd lZm[Z[m\Z\ dd!l]m^Z^ d"d#l_m`Z`maZambZbmcZcmdZd d"d$l)meZemfZfmgZgmhZh d"d%limjZjmkZkmlZl e,emZnd&ZoG d'd( d(e[ZpG d)d* d*e[ZqepeqB Zreesd+< G d,d- d-e[ZtG d.d/ d/e[ZueteuB Zveesd0< G d1d2 d2ejwZxG d3d4 d4ejwZyG d5d6 d6ejwZzG d7d8 d8ejwZ{G d9d: d:ejwZ|G d;d< d<ejwZ}d=e~d>eeeejf geeeGf f fd?d@ZG dAdB dBeOZG dCdD dDeSZG dEdF dFeQe ZG dGdH dHeRe ZeAjeeedIG dJdK dKejweceaedebZG dLdM dMeZG dNdO dOeZG dPdQ dQeZG dRdS dSeZeAjeeedIG dTdU dUeZdS )VzBInference-only Qwen2-VL model compatible with HuggingFace weights.    N)CallableIterableIteratorMappingSequence)partial)	AnnotatedAnyLiteral	TypeAlias	rearrange)BatchFeature)Qwen2VLImageProcessorQwen2VLProcessor)Qwen2VLConfigQwen2VLVisionConfig)smart_resize)Qwen2VLVideoProcessor)
VllmConfig)BaseDummyOptions)parallel_state tensor_model_parallel_all_gather)utils)init_logger)	QuickGELU)MMEncoderAttention)Conv3dLayer)ColumnParallelLinearRowParallelLinear)QuantizationConfig)get_rope)ApplyRotaryEmb)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)	ImageItemModalityDataMultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems	VideoItem)DictEmbeddingItems	ImageSizeModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)TokenizerLike)TensorSchemaTensorShape)AttentionBackendEnum   )MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)get_vit_attn_backendis_vit_use_data_parallel!run_dp_sharded_mrope_vision_model   c                   @   N   e Zd ZU dZed ed< eeje	ddf ed< eeje	ddf ed< d	S )
Qwen2VLImagePixelInputsa  
    Dimensions:
        - np: The total number of patches over each image over each prompt in
              the batch
        - ni: Number of images
        - cps: Number of channels * patch_size * patch_size

    Historical context:
        - pixel_values shape: (num_patches, num_channels * patch_size *
          patch_size)
        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
          format
    pixel_valuestypenpcpsni   image_grid_thwN
__name__
__module____qualname____doc__r
   __annotations__r   torchTensorr:    r[   r[   W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/qwen2_vl.pyrK   y   s   
 rK   c                   @   rJ   )
Qwen2VLImageEmbeddingInputsa  
    Dimensions:
        - nf: Number of image features
        - hs: Hidden size
        - ni: Number of images

    Historical context:
        - image_embeds shape: (num_image_features, hidden_size)
        - num_image_features varies based on the number and resolution of the
          images.
        - hidden_size must match the hidden size of language model backbone.
        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
          format
    image_embedsrM   nfhsrP   rQ   rR   NrS   r[   r[   r[   r\   r]         
 r]   Qwen2VLImageInputsc                   @   rJ   )
Qwen2VLVideoPixelInputsa  
    Dimensions:
        - np: The total number of patches over each video over each prompt in
              the batch
        - ctps: Number of channels * temporal_patch_size * patch_size *
          patch_size
        - nv: Number of videos

    Historical context:
        - pixel_values_videos shape: (num_patches, num_channels *
          temporal_patch_size * patch_size * patch_size)
        - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
          format
    pixel_values_videosrM   rN   ctpsnvrQ   video_grid_thwNrS   r[   r[   r[   r\   rc      ra   rc   c                   @   rJ   )
Qwen2VLVideoEmbeddingInputsa  
    Dimensions:
        - nf: Number of video features
        - hs: Hidden size
        - nv: Number of videos

    Historical context:
        - video_embeds shape: (num_video_features, hidden_size)
        - num_video_features varies based on the number and resolution of the
          videos.
        - hidden_size must match the hidden size of language model backbone.
        - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
          format
    video_embedsrM   r_   r`   rf   rQ   rg   NrS   r[   r[   r[   r\   rh      ra   rh   Qwen2VLVideoInputsc                       sZ   e Zd Zeddfdededeej dedB de	f
 fdd	Z
d
ejdejfddZ  ZS )Qwen2VisionMLPN in_featureshidden_features	act_layerquant_configprefixc                    sP   t    t }t|||| d|d| _| | _t|||| d|d| _d S )Nz.fc1)rp   rq   
disable_tpz.fc2)super__init__rG   r   fc1actr   fc2)selfrm   rn   ro   rp   rq   use_data_parallel	__class__r[   r\   rt      s"   
zQwen2VisionMLP.__init__xreturnc                 C   s*   |  |\}}| |}| |\}}|S N)ru   rv   rw   )rx   r|   
x_parallel_r[   r[   r\   forward  s   
zQwen2VisionMLP.forward)rT   rU   rV   r   intrM   nnModuler    strrt   rY   rZ   r   __classcell__r[   r[   rz   r\   rk      s     rk   c                       s   e Zd Z		ddededededB deddf fd	d
Zdejde	ejdf fddZ
	ddejdejdejdejdedB dejfddZ  ZS )Qwen2VisionAttentionNrl   	embed_dim	num_headsprojection_sizerp   rq   r}   c                    s   t    t }|rdnt | _t | _t	||| _
t	|| j| _t|d| || d|d| _t|||| d|d| _t| j| j
| j
d | dd| _td	d
| _d S )Nr<   rQ   z.qkv)
input_sizeoutput_sizerp   rq   rr   z.projg      .attn)r   	head_sizescalerq   T)enforce_enable)rs   rt   rG   r   $get_tensor_model_parallel_world_sizetp_sizeget_tensor_model_parallel_ranktp_rank
dist_utilsdividehidden_size_per_attention_head!num_attention_heads_per_partitionr   qkvr   projr   attnr"   apply_rotary_emb)rx   r   r   r   rp   rq   ry   rz   r[   r\   rt     sD   

zQwen2VisionAttention.__init__r   .c           	         s   |j \}}}| jdkrt|}|jddd\}}}| jdkr;ttj| jd}||| j }||| j }||| j }||| j| j	f  fdd|||fD \}}}|||fS )Nr<   rQ      dim)num_partitionsc                 3   s    | ]}|j   V  qd S r~   )view.0r|   	new_shaper[   r\   	<genexpr>`      z1Qwen2VisionAttention.split_qkv.<locals>.<genexpr>)
shaper   r   chunkr   r   split_tensor_along_last_dimr   r   r   )	rx   r   seq_lenbsr   qkvsplitterr[   r   r\   	split_qkvG  s$   


zQwen2VisionAttention.split_qkvr|   
cu_seqlensrotary_pos_emb_cosrotary_pos_emb_sin
max_seqlenc                 C   s   |  |\}}| |\}}}	dd |||	fD \}}}	tj||gdd}
| |
||}tj|ddd\}}| j|||	||d}t|d }| 	|\}}|S )Nc                 s   s    | ]}t |d V  qdS )zs b ... -> b s ...Nr   r   r[   r[   r\   r   q  r   z/Qwen2VisionAttention.forward.<locals>.<genexpr>r   r   r   )querykeyvaluer   r   zb s h d -> s b (h d))
r   r   rY   catr   r   r   r   
contiguousr   )rx   r|   r   r   r   r   r   r   r   r   	qk_concat
qk_rotatedcontext_layeroutputr[   r[   r\   r   c  s(   	zQwen2VisionAttention.forward)Nrl   r~   )rT   rU   rV   r   r    r   rt   rY   rZ   tupler   r   r   r[   r[   rz   r\   r     s>    0"r   c                       s   e Zd Zedddfdedededeej de	egejf dB de
dB d	ed
df fddZ	ddejdejdejdejdedB d
ejfddZ  ZS )Qwen2VisionBlockNrl   r   r   	mlp_ratioro   
norm_layerrp   rq   r}   c           	         sx   t    |d u rttjdd}||| _||| _t|| }t||||| dd| _	t
||||| dd| _d S )Nư>epsr   )r   r   r   rp   rq   z.mlp)ro   rp   rq   )rs   rt   r   r   	LayerNormnorm1norm2r   r   r   rk   mlp)	rx   r   r   r   ro   r   rp   rq   mlp_hidden_dimrz   r[   r\   rt     s(   



zQwen2VisionBlock.__init__r|   r   r   r   r   c                 C   s6   || j | |||||d }|| | | }|S )Nr   r   r   r   )r   r   r   r   )rx   r|   r   r   r   r   r[   r[   r\   r     s   zQwen2VisionBlock.forwardr~   )rT   rU   rV   r   r   floatrM   r   r   r   r    r   rt   rY   rZ   r   r   r[   r[   rz   r\   r     sH    	&r   c                       sR   e Zd Z				ddedededed	d
f
 fddZdejd	ejfddZ  ZS )Qwen2VisionPatchEmbedrI   r   rQ     
patch_sizetemporal_patch_sizein_channelsr   r}   Nc                    s>   t    || _|| _|| _|||f}t||||dd| _d S )NF)kernel_sizestridebias)rs   rt   r   r   r   r   r   )rx   r   r   r   r   r   rz   r[   r\   rt     s   

zQwen2VisionPatchEmbed.__init__r|   c                 C   s:   |j \}}||d| j| j| j}| ||| j}|S N)r   r   r   r   r   r   )rx   r|   LCr[   r[   r\   r     s   
zQwen2VisionPatchEmbed.forward)rI   r   rQ   r   )	rT   rU   rV   r   rt   rY   rZ   r   r   r[   r[   rz   r\   r     s"    r   c                       sn   e Zd Z				ddededeegejf dB dededB d	ed
df fddZ	de
jd
e
jfddZ  ZS )Qwen2VisionPatchMergerNr   rl   d_modelcontext_dimr   spatial_merge_sizerp   rq   r}   c                    s   t    t }||d  | _|d u rttjdd}||| _tt	| j| jd|| d|dt
 t| j|d|| d|dg| _d S )Nr   r   r   Tz.mlp.0)r   rp   rq   rr   z.mlp.2)rs   rt   rG   hidden_sizer   r   r   ln_q
ModuleListr   GELUr   r   )rx   r   r   r   r   rp   rq   ry   rz   r[   r\   rt     s4   
	

zQwen2VisionPatchMerger.__init__r|   c                 C   sH   |  |}|d| j}| j\}}}||\}}||}||\}}|S r   )r   r   r   r   )rx   r|   mlp_fc1mlp_actmlp_fc2r   r   outr[   r[   r\   r     s   
zQwen2VisionPatchMerger.forward)Nr   Nrl   )rT   rU   rV   r   r   r   r   r    r   rt   rY   rZ   r   r   r[   r[   rz   r\   r     s*    %r   c                       s   e Zd Z			ddedededB deddf
 fd	d
Zede	j
fddZ
ede	jfddZdeee  dee	je	jf fddZde	jdedB fddZde	jde	jeee  B de	jfddZdeeee	jf  dee fddZ  ZS )Qwen2VisionTransformerr   Nrl   vision_confignorm_epsrp   rq   r}   c                    s   t    |j}|j}|j}|j}|j}	|j |j}
|j	|j
t | _|j| _|| _| _	 | _t||| d| _ttj|d  }t|ddddid| _t fdd	t|
D | _t|	  d
d| _t|t d| _d S )N)r   r   r   r   r   i    Tpartial_rotary_factorg      ?)r   max_positionis_neox_stylerope_parametersc                    s*   g | ]}t   d | dqS )z.blocks.)r   r   r   r   rp   rq   )r   )r   	layer_idxr   r   r   r   rq   rp   r[   r\   
<listcomp>9  s    	z3Qwen2VisionTransformer.__init__.<locals>.<listcomp>z.merger)r   r   r   rp   rq   )r   dtype)rs   rt   r   r   r   r   r   r   depthr   r   rG   ry   out_hidden_sizer   patch_embedr   r   r   r!   rotary_pos_embr   rangeblocksr   mergerrF   rY   get_default_dtypeattn_backend)rx   r   r   rp   rq   r   r   r   r   r   r   head_dimrz   r   r\   rt     sZ   
	zQwen2VisionTransformer.__init__c                 C      | j jjjS r~   )r   r   weightr   rx   r[   r[   r\   r   Q     zQwen2VisionTransformer.dtypec                 C   r   r~   )r   r   r   devicer   r[   r[   r\   r   U  r   zQwen2VisionTransformer.devicegrid_thwc                 C   s  g }d}|D ]c\}}}t |dd|}t |d|d}||| j | j|| j | jdddd }||| j | j|| j | jdddd }|t j	||gdd
|d t|||}qt j|dd}| j|\}	}
|	| d}|
| d}||fS )Nr   r<   r   r   rQ   r   )rY   arange	unsqueezeexpandreshaper   permuteflattenappendstackrepeatmaxr   r   get_cos_sin)rx   r   pos_idsmax_grid_sizethwhpos_idswpos_idscossincos_combinedsin_combinedr[   r[   r\   rot_pos_embY  s<    
z"Qwen2VisionTransformer.rot_pos_embr   c                 C   s6   d }| j tjtjhv r|dd  |d d   }|S )Nr<   r   )r   r;   
FLASH_ATTNROCM_AITER_FAr	  )rx   r   r   r[   r[   r\   compute_attn_mask_seqlen  s   z/Qwen2VisionTransformer.compute_attn_mask_seqlenr|   c           	      C   s  |j | j| jd}| |}t|tr|}tj|tjd}n|	 }|
 }| |\}}t|d d df |d d df  |d d df jdtjd}ttjdtjd|g}t|}|d}| |}|j | jdd}| jD ]}||||||d	}qt| |}|S )
N)r   r   )r   r<   r   r   )axisr   T)non_blockingr   )tor   r   r   
isinstancelistrN   arrayint32tolistnumpyr  r  cumsumconcatenatezerosrY   
from_numpyr  r  r   r   )	rx   r|   r   grid_thw_listr   r   r   r   blkr[   r[   r\   r     s4   

4




	zQwen2VisionTransformer.forwardweightsc                 C   s   g d}t | jdd}t }|D ]9\}}|D ]\}}}	||vr!q|||}|| }
|
j}||
||	  n|| }
t|
dt}||
| || q|S )N))qkv_projq_projr   )r*  k_projr   )r*  v_projr   F)remove_duplicateweight_loader)dictnamed_parameterssetreplacer/  getattrr#   add)rx   r)  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr/  r[   r[   r\   load_weights  s"   
z#Qwen2VisionTransformer.load_weights)r   Nrl   )rT   rU   rV   r   r   r    r   rt   propertyrY   r   r   r  r   r   rZ   r  r  r   r   r2  r?  r   r[   r[   rz   r\   r     sB    C

'	
,.r   r   r}   c                    s    dt ttjf f fdd}|S )N	hf_inputsc              
      s   |  dtd}|d}|    }|  dtd}|d}|    }ttd|td|tjdddtd|td|tjdddd	S )
NrR   )r   rQ   r   rg   imageT)keep_on_cpuvideo)rL   r^   rR   rd   ri   rg   )getrY   emptyprodr0  r*   flat_from_sizesbatched)rA  rR   image_pixel_grid_sizesimage_embed_grid_sizesrg   video_grid_sizesvideo_embed_grid_sizesr   r[   r\   _qwen2vl_field_config  s0   



z<_create_qwen2vl_field_factory.<locals>._qwen2vl_field_config)r   r   rY   rZ   )r   rO  r[   rN  r\   _create_qwen2vl_field_factory  s   rP  c                       s   e Zd Zdef fddZdeeejf e	e
 B deeef dB f fddZdeeejf e	e B deeef dB f fd	d
Z  ZS )Qwen2VLMultiModalDataParserr   c                    s   || _ t j|i | d S r~   )_spatial_merge_sizers   rt   )rx   r   argskwargsrz   r[   r\   rt     s   z$Qwen2VLMultiModalDataParser.__init__datar}   Nc                    0   t |trt|dddht| jdS t |S )NrB  r^   rR   modalityrequired_fieldsfields_factory)r  r0  r-   rP  rR  rs   _parse_image_datarx   rU  rz   r[   r\   r[       
z-Qwen2VLMultiModalDataParser._parse_image_datac                    rV  )NrD  ri   rg   rW  )r  r0  r-   rP  rR  rs   _parse_video_datar\  rz   r[   r\   r^    r]  z-Qwen2VLMultiModalDataParser._parse_video_data)rT   rU   rV   r   rt   r0  r   rY   rZ   r'   r&   r/   r	   r[  r,   r^  r   r[   r[   rz   r\   rQ    s    rQ  c                   @   sz  e Zd Zdd ZdedefddZdedefddZd	d
 Z	de
eedB f fddZdede
eef de
eef fddZddddedededededB deeef fddZdedededB defddZdededededB def
ddZ	d.d edB defd!d"Zdefd#d$Zd/d%ed&edefd'd(Zefdede
eef d)edefd*d+Zdede
eef defd,d-ZdS )0Qwen2VLProcessingInfoc                 C   s   | j tS r~   )ctxget_hf_configr   r   r[   r[   r\   ra    s   z#Qwen2VLProcessingInfo.get_hf_configrT  r}   c                 K   s    | j jtfd|ddi|S )Nuse_fastT)r`  get_hf_processorr   poprx   rT  r[   r[   r\   rc     s   
z&Qwen2VLProcessingInfo.get_hf_processorc                 K   s   | j di |jS Nr[   )rc  image_processorre  r[   r[   r\   get_image_processor'  s   z)Qwen2VLProcessingInfo.get_image_processorc                 C   s   t |  jj|  dS )N)expected_hidden_size)rQ  ra  r   r   _get_expected_hidden_sizer   r[   r[   r\   get_data_parser*  s   
z%Qwen2VLProcessingInfo.get_data_parserNc                 C   s
   d d dS NrB  rD  r[   r   r[   r[   r\   get_supported_mm_limits0  s   
z-Qwen2VLProcessingInfo.get_supported_mm_limitsr   	mm_countsc                 C   s   |   }| ||}||dS rl  )get_max_image_tokensget_max_video_tokens)rx   r   ro  max_image_tokensmax_video_tokensr[   r[   r\   get_mm_max_tokens_per_item3  s   
z0Qwen2VLProcessingInfo.get_mm_max_tokens_per_itemr<   T)
num_frames	do_resizeimage_widthimage_heightru  rv  rg  c                C   s   |d u r|   }|  }|j}|j}|j}	|j}
|r4t||||	 |jd |jd d\}}t||d}nt||d}|||
  }t	||
 d}|j
| }|j| }|| | }||	d  }||fS )Nshortest_edgelongest_edge)heightwidthfactor
min_pixels
max_pixelsr|  r{  r<   r   )rh  ra  r   r   r   r   r   sizer.   r	  r{  r|  )rx   rw  rx  ru  rv  rg  	hf_configr   r   
merge_sizer   resized_heightresized_widthpreprocessed_sizepadded_num_framesgrid_tgrid_hgrid_wnum_patchesnum_vision_tokensr[   r[   r\   _get_vision_info<  s0   	


z&Qwen2VLProcessingInfo._get_vision_infoc                C   s   | j ||d|d\}}|S )Nr<   rw  rx  ru  rg  r  )rx   rw  rx  rg  r   num_image_tokensr[   r[   r\   get_num_image_tokensg  s   
z*Qwen2VLProcessingInfo.get_num_image_tokensc                C   s   | j ||||d\}}|S Nr  r  )rx   rw  rx  ru  rg  r   num_video_tokensr[   r[   r\   get_num_video_tokensv  s   
z*Qwen2VLProcessingInfo.get_num_video_tokensr  c                 C   s   |   }|j}|j}|j}|d u r|  }|jd }|| }|||  }dtdtttf fdd}	d|}
}t|ddD ]}|	|\}
}||
 d	krL nq<t	|| ||
 d
S )Nrz  nr}   c                 S   s<   t t| ddD ]}| | dkr|| | f  S q	d| fS )Nr   r   r<   )r   mathisqrt)r  dr[   r[   r\   closest_factor_pair  s
   zTQwen2VLProcessingInfo.get_image_size_with_most_features.<locals>.closest_factor_pairr<   r   r      r  )
ra  r   r   r   rh  r  r   r   r   r.   )rx   r  r  r   r   r  rg  unitmax_seq_lenr  height_factorwidth_factorr   r[   r[   r\   !get_image_size_with_most_features  s"   

z7Qwen2VLProcessingInfo.get_image_size_with_most_featuresc                 C   s   |   \}}| j||d dS )N)rw  rx  rg  )r  r  )rx   target_widthtarget_heightr[   r[   r\   rp    s   z*Qwen2VLProcessingInfo.get_max_image_tokens
max_tokensstart_num_framesc                 C   s@   |   \}}|}	 |d }| j|||d d}||kr	 |S |}q	)NTr<   r  )r  r  )rx   r  r  r  r  ru  next_num_framesnext_max_tokensr[   r[   r\   _get_max_video_frames  s   z+Qwen2VLProcessingInfo._get_max_video_framesmax_frames_per_videoc                 C   s4   | dd}| |}t|t|d |}t|dS )NrD  r   r<   )rE  r  minr	  )rx   r   ro  r  
max_videosmax_total_framesr[   r[   r\   !get_num_frames_with_most_features  s   

z7Qwen2VLProcessingInfo.get_num_frames_with_most_featuresc                 C   s&   |   \}}| j||| ||d dS r  )r  r  r  )rx   r   ro  r  r  r[   r[   r\   rq    s   
z*Qwen2VLProcessingInfo.get_max_video_tokensr~   )r<   )rT   rU   rV   ra  objectr   rc  r   rh  rk  r   r   r   rn  rt  boolr   r.   r  r  r  r  rp  r  _MAX_FRAMES_PER_VIDEOr  rq  r[   r[   r[   r\   r_    s    




+


)	


r_  c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Qwen2VLDummyInputsBuilderro  r}   c                 C   s>   | dd}| dd}| j }|j}|j}|| ||  S )NrB  r   rD  )rE  inforc  image_tokenvideo_token)rx   ro  
num_images
num_videoshf_processorr  r  r[   r[   r\   get_dummy_text  s   
z(Qwen2VLDummyInputsBuilder.get_dummy_textNr   
mm_optionsc                 C   s   | dd}| dd}| j \}}| j||}|r!| dnd }	|r*| dnd }
| j||||	d| j|||||
ddS )NrB  r   rD  )r|  r{  r  	overrides)r|  r{  ru  r  r  rm  )rE  r  r  r  _get_dummy_images_get_dummy_videos)rx   r   ro  r  r  r  r  r  target_num_framesimage_overridesvideo_overridesr[   r[   r\   get_dummy_mm_data  s,   z+Qwen2VLDummyInputsBuilder.get_dummy_mm_datar~   )
rT   rU   rV   r   r   r   r  r   r(   r  r[   r[   r[   r\   r    s    
r  c                	   @   sX   e Zd Zdedeeef dedee	 fddZ
dedeeef deeef fdd	Zd
S )Qwen2VLMultiModalProcessormm_itemshf_processor_mm_kwargsout_mm_kwargsr}   c                    s   | j jd	i |}| j jd	i |}| j  }| }||j ||j d|jd dtdt	ffdd  fdddD S )
Nrm  r   item_idxrX  c                    sJ   | |  }|| d j }t|tjsJ t|   }| g| S )N	_grid_thw)rU  r  rY   rZ   r   rG  )r  rX  out_itemr   
num_tokens)merge_lengthr  placeholderr[   r\   get_replacement_qwen2vl+  s
   zOQwen2VLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_qwen2vlc              	      s(   g | ]}t || gt |d dqS ))rX  )rX  targetreplacement)r5   r   )r   rX  )r  r  r[   r\   r   3  s    
zBQwen2VLMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>r[   )
r  rc  rh  get_tokenizer	get_vocabr  r  r  r   r   )rx   r  r  r  r  rg  	tokenizervocabr[   )r  r  r  r  r\   _get_prompt_updates  s   

z.Qwen2VLMultiModalProcessor._get_prompt_updatesrA  c                 C   s   t | j jj|S r~   )rP  r  ra  r   r   )rx   rA  r  r[   r[   r\   _get_mm_fields_config<  s
   z0Qwen2VLMultiModalProcessor._get_mm_fields_configN)rT   rU   rV   r0   r   r   r	   r+   r   r6   r  r   r  r*   r  r[   r[   r[   r\   r    s"    

#

r  )r  dummy_inputsc                       s  e Zd ZeddddddZdZdee dee	e
e
e
e
ef  fd	d
Zdee
 dee de	eje
f fddZedede
dedB fddZdddedef fddZdededB fddZdededB fddZdede	ejdf fd d!Zd"ede	ejdf fd#d$Zdedefd%d&Zdedefd'd(Z		d=d)ejdB d*ejd+e dB d,ejdB dedeje B fd-d.Z!d/ejdejdB fd0d1Z"d2e#e	eejf  de$e fd3d4Z%de&fd5d6Z'd7e
de
fd8d9Z(d:e
de
fd;d<Z)  Z*S )>Qwen2VLForConditionalGenerationzlanguage_model.model.visual.zlanguage_model.lm_head.)zmodel.language_model.zmodel.visual.zlm_head.zmodel.orig_to_new_prefixTmm_featuresr}   c                 c   s    | j jj}t| j jdd}t|dd dD ]g}|jj}|jdkrD|jd j	 \}}}|dks7J d	| |d|| || dfV  q|jd
krv|jd j	 \}}}d}	|j
ddre|jd j }	|	| }
|||| || |
fV  qtd|j dS )a  
        Iterate over multimodal features and yield grid information.

        Args:
            mm_features: List of multimodal feature specifications

        Yields:
            Tuple of (offset, grid_t, grid_h, grid_w, t_factor) for each frame/image
        tokens_per_second      ?c                 S   s   | j jS r~   )mm_positionoffset)fr[   r[   r\   <lambda>j  s    zBQwen2VLForConditionalGeneration.iter_mm_grid_thw.<locals>.<lambda>)r   rB  rR   r<   zImage must have 1 frame, got rD  rg   second_per_grid_tsNzUnsupported modality: )configr   r   r4  sortedr  r  rX  rU  r!  rE  item
ValueError)rx   r  r   r  
mm_featurer  r  r  r  r  t_factorr[   r[   r\   iter_mm_grid_thw\  s6   


z0Qwen2VLForConditionalGeneration.iter_mm_grid_thwinput_tokensc                 C   sX  g }d}|  |D ]X\}}}}}	|| }
t|dkr"|d  d nd}|tt|
d|
f|  t|||f}|	dkrL|d |	 tj	|d< ||
dd|
 |  ||| |  }q	|t|k rt|dkrv|d  d nd}t|| }
|tt|
d|
f|  tj|dd
dd}| d t|  }t||fS )Nr   r   r<   rQ   r  )r  )r  lenr	  r  rN   broadcast_tor   indicesastypeint64r  r$  r  rY   r&  )rx   r  r  llm_pos_ids_liststr  
llm_grid_t
llm_grid_h
llm_grid_wr  text_lenst_idxgrid_indicesllm_positionsmrope_position_deltar[   r[   r\   get_mrope_input_positions  s8     z9Qwen2VLForConditionalGeneration.get_mrope_input_positionsrX  iNc                 C   s$   | drdS | drdS td)NrB  z+<|vision_start|><|image_pad|><|vision_end|>rD  z+<|vision_start|><|video_pad|><|vision_end|>z)Only image or video modality is supported)
startswithr  )clsrX  r  r[   r[   r\   get_placeholder_str  s
   

z3Qwen2VLForConditionalGeneration.get_placeholder_strrl   )rq   vllm_configrq   c                   s   t    |jj}|j}|jj}|jdk| _|| _|| _| 	|ddh t
|jt|dd|t|dd| _W d    n1 s@w   Y  | | t|t|dd	gd
| _W d    n1 saw   Y  | jj| _d S )NrU  rB  rD  rms_norm_epsr   visual)r   rp   rq   language_modelQwen2ForCausalLM)r  rq   architectures)rs   rt   model_configr  rp   multimodal_configmm_encoder_tp_modery   r  _mark_tower_modelr   r   r4  rE   r  _mark_language_modelrD   r  make_empty_intermediate_tensors)rx   r  rq   r  rp   r  rz   r[   r\   rt     s0   



z(Qwen2VLForConditionalGeneration.__init__rT  c                 K   h   | dd }| dd }| dd }|d u r|d u rd S |d ur'td||dS |d ur2td||dS d S )NrL   r^   rR   )rM   rL   rR   )rM   r^   rR   )rd  rK   r]   )rx   rT  rL   r^   rR   r[   r[   r\   _parse_and_validate_image_input  $   z?Qwen2VLForConditionalGeneration._parse_and_validate_image_inputc                 K   r  )Nrd   ri   rg   )rM   rd   rg   )rM   ri   rg   )rd  rc   rh   )rx   rT  rd   ri   rg   r[   r[   r\   _parse_and_validate_video_input  r	  z?Qwen2VLForConditionalGeneration._parse_and_validate_video_inputimage_input.c                 C      |d }|j dksJ |d dkr|d }n|d }| jr(t| j|| ddS | j||d}| jj}|d	| |  }||S )
NrR   r   rM   r^   rL   rope_3d	rope_typer   r   ndimry   rH   r  r!  r   rG  split)rx   r  r   r^   rL   r  sizesr[   r[   r\   _process_image_input  s   

z4Qwen2VLForConditionalGeneration._process_image_inputvideo_inputc                 C   r  )
Nrg   r   rM   ri   rd   r  r  r  r   r  )rx   r  r   ri   rd   r  r  r[   r[   r\   _process_video_input  s    

z4Qwen2VLForConditionalGeneration._process_video_inputc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)rL   r^   images)rd   ri   videosr[   )r  r
  )rx   rT  
modalities	input_keyr[   r[   r\   %_parse_and_validate_multimodal_inputs/  s   zEQwen2VLForConditionalGeneration._parse_and_validate_multimodal_inputsc           	      K   sv   | j di |}|sg S d}|D ](}|dkr%|d }| |}|t|7 }|dkr8|d }| |}|t|7 }q|S )Nr[   r  r  )r  r  r   r  )	rx   rT  r  multimodal_embeddingsrX  r  image_embeddingsr  video_embeddingsr[   r[   r\   embed_multimodalB  s   

z0Qwen2VLForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s$   |durd}| j j||||d}|S )aV  Run forward pass for Qwen2-VL.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Flattened (concatenated) position ids corresponding to a
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
                otherwise it will be `(seq_len,)`.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.
        N)r!  r"  r#  r$  )r  model)rx   r!  r"  r#  r$  rT  hidden_statesr[   r[   r\   r   Y  s   z'Qwen2VLForConditionalGeneration.forwardr&  c                 C   s   | j |S r~   )r  compute_logits)rx   r&  r[   r[   r\   r'  z  s   z.Qwen2VLForConditionalGeneration.compute_logitsr)  c                 C   s   t | }|j|| jdS )Nmapper)rB   r?  hf_to_vllm_mapper)rx   r)  loaderr[   r[   r\   r?    s   z,Qwen2VLForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r  zvisual.merger.r  )r  	connectortower_model)r$   from_string_fieldr   r[   r[   r\   get_mm_mapping  s
   z.Qwen2VLForConditionalGeneration.get_mm_mappingr  c                 C   s   | j }|j}|j}||d  S Nr   r  r   r   )rx   r  r  r   r  r[   r[   r\   get_num_mm_encoder_tokens  s   z9Qwen2VLForConditionalGeneration.get_num_mm_encoder_tokensr  c                 C   s   | j }|j}|j}||d  S r0  r1  )rx   r  r  r   r  r[   r[   r\   get_num_mm_connector_tokens  s   z;Qwen2VLForConditionalGeneration.get_num_mm_connector_tokens)NN)+rT   rU   rV   rC   r*  supports_encoder_tp_datar  r)   r   r   r   r   r  rY   rZ   r  classmethodr   r  r   rt   r  rb   r  rj   r
  r  r  r0  r  r=   r   r7   r   r'  r   r2  r?  r$   r/  r2  r3  r   r[   r[   rz   r\   r  F  s    	
&
'




!
$


r  c                   @   s   e Zd ZdS )Tarsier2MultiModalProcessorN)rT   rU   rV   r[   r[   r[   r\   r6    s    r6  c                       s6   e Zd Z	ddeeef dB ddf fddZ  ZS )Tarsier2ImageProcessorNr  r}   c                    s^   |d ur"d|v r"d|v r"|d |d d}t  jdd|i| d S t  jdd|i| d S )Nr~  r  )ry  rz  r  r[   )rs   rt   )rx   r  rT  remapped_sizerz   r[   r\   rt     s   zTarsier2ImageProcessor.__init__r~   )rT   rU   rV   r0  r   r   rt   r   r[   r[   rz   r\   r7    s    r7  c                       s&   e Zd Zdedef fddZ  ZS )Tarsier2Processorr   r  c                    s<   t di || _t jd| j|tdi |d d| d S )N)rg  r  video_processorchat_templater[   )r7  rg  rs   rt   r   )rx   r   r  rT  rz   r[   r\   rt     s   
zTarsier2Processor.__init__)rT   rU   rV   r0  r8   rt   r   r[   r[   rz   r\   r9    s    r9  c                   @   s:   e Zd ZdefddZdedefddZdefddZ	d	S )
Tarsier2ProcessingInfor}   c                 C   s   | j jj}t|}|S r~   )r`  r  r%  r   from_pretrained)rx   
model_pathcorrect_configr[   r[   r\   ra    s   

z$Tarsier2ProcessingInfo.get_hf_configrT  c                 K   s   t d| j |  d|S )N)r   r  r[   )r9  r`  get_hf_image_processor_configr  re  r[   r[   r\   rc    s   z'Tarsier2ProcessingInfo.get_hf_processorc                 C   s   t di | j S rf  )r7  r`  r@  r   r[   r[   r\   rh    s   z*Tarsier2ProcessingInfo.get_image_processorN)
rT   rU   rV   r   ra  r  r9  rc  r7  rh  r[   r[   r[   r\   r<    s    r<  c                   @   s>   e Zd ZeddidZdeeeej	f  de
e fddZdS )	 Tarsier2ForConditionalGenerationzvision_tower.r  r  r)  r}   c                 C   s6   g }| j d u r|dg t| |d}|j|| jdS )Nr  )skip_prefixesr(  )r  extendrB   r?  r*  )rx   r)  rB  r+  r[   r[   r\   r?    s
   
z-Tarsier2ForConditionalGeneration.load_weightsN)rT   rU   rV   rC   r*  r   r   r   rY   rZ   r2  r?  r[   r[   r[   r\   rA    s    (rA  )rW   r  collections.abcr   r   r   r   r   	functoolsr   typingr   r	   r
   r   r"  rN   rY   torch.nnr   einopsr   transformersr   transformers.models.qwen2_vlr   r   3transformers.models.qwen2_vl.configuration_qwen2_vlr   r   6transformers.models.qwen2_vl.image_processing_qwen2_vlr   6transformers.models.qwen2_vl.video_processing_qwen2_vlr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr    +vllm.model_executor.layers.rotary_embeddingr!   2vllm.model_executor.layers.rotary_embedding.commonr"   -vllm.model_executor.model_loader.weight_utilsr#   )vllm.model_executor.models.module_mappingr$   vllm.multimodalr%   vllm.multimodal.inputsr&   r'   r(   r)   r*   r+   r,   vllm.multimodal.parser-   r.   r/   r0   r1   vllm.multimodal.processingr2   r3   r4   r5   r6   vllm.sequencer7   vllm.tokenizersr8   vllm.utils.tensor_schemar9   r:   #vllm.v1.attention.backends.registryr;   
interfacesr=   r>   r?   r@   rA   rB   rC   rD   rE   visionrF   rG   rH   rT   loggerr  rK   r]   rb   rX   rc   rh   rj   r   rk   r   r   r   r   r   r   r   rZ   rP  rQ  r_  r  r  register_processorr  r6  r7  r9  r<  rA  r[   r[   r[   r\   <module>   s   $	"t51 G

'" P-.
  Y