o
    
۾i                     @   s  d Z ddlZddlZddlmZmZmZmZ ddlm	Z	 ddl
mZmZmZ ddlZddlZddlmZ ddlm  mZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lm Z  ddlm!Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z; ddl<m=Z=m>Z>m?Z? ddl@mAZAmBZBmCZCmDZDmEZE ddlFmGZG ddlHmIZImJZJ ddlKmLZL ddlMmNZN ddlOmPZPmQZQmRZRmSZSmTZT ddl!mUZUmVZVmWZW ddlXmYZY e$eZZ[d e\d!e\fd"d#Z]G d$d% d%ej^Z_G d&d' d'ej^Z`G d(d) d)ej^ZaG d*d+ d+ej^ZbG d,d- d-ej^ZcG d.d/ d/ej^ZdG d0d1 d1eIZeeeZfG d2d3 d3eIZgegZhd4e\eiB d5e\d6e\fd7d8Zjd4e\eiB d5e\d6e\fd9d:Zkd4e\eiB d5e\d6e\fd;d<Zl	=	>	?dQd@e\dAe\d5e\dBe\dCe\f
dDdEZmG dFdG dGej^ZnG dHdI dIeCZoG dJdK dKeBeo ZpG dLdM dMeAeo Zqe6jrepeoeqdNG dOdP dPej^eSeQeTeRZsdS )RzBInference-only Ernie VL model compatible with HuggingFace weights.    N)CallableIterableMappingSequence)partial)	AnnotatedAnyLiteral)	rearrange)BatchFeature)
VllmConfig)BaseDummyOptionsVideoDummyOptions)parallel_state)utils)init_logger)	QuickGELU)MMEncoderAttention)RMSNorm)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)ApplyRotaryEmb)default_weight_loader)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape)AttentionBackendEnum   )Ernie4_5_VLMoeForCausalLM)MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMappermaybe_prefix)get_vit_attn_backendhidden_sizetp_sizec                    sp   ddl m} fddtD }|j|t jd  fdd|D }dd t| D }tj	|dd	}|S )
zEAll-gather the input tensor interleavely across model parallel group.r   Nc                    s   g | ]}t  qS  )torch
zeros_like).0_)local_tensorr9   Y/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/ernie45_vl.py
<listcomp>b   s    z)all_gather_interleave.<locals>.<listcomp>)groupc                    s   g | ]}t |  d qS ))r:   split)r<   tensor)r7   r8   r9   r?   r@   g   s    c                 S   s   g | ]	}|D ]}|qqS r9   r9   )r<   pairrD   r9   r9   r?   r@   j   s
    rB   dim)
torch.distributeddistributedrange
all_gatherr   get_tp_groupdevice_groupzipr:   cat)r>   r7   r8   distgathered_tensorsgathered_tensors_splitordered_tensorsresult_tensorr9   )r7   r>   r8   r?   all_gather_interleave^   s   rU   c                       s   e Zd ZdZ		ddededededB ded	df fd
dZdej	d	e
ej	df fddZ	ddej	dej	dej	dej	dB d	ej	f
ddZ  ZS )Ernie4_5_VisionAttentionz)VisionAttention using VLLM framework APIsN 	embed_dim	num_headsprojection_sizequant_configprefixreturnc              	      s   t    t | _t | _t||| _	t|| j| _
t|| j	||d|| dd| _t|||| dd| _t| j
| j	| j	d | dd| _tddd	| _d S )
NTz.qkv)r7   	head_sizetotal_num_headstotal_num_kv_headsbiasr[   r\   z.proj)
input_sizeoutput_sizer[   r\   g      .attn)rY   r^   scaler\   )enforce_enableenable_fp32_compute)super__init__r   $get_tensor_model_parallel_world_sizer8   get_tensor_model_parallel_ranktp_rank
dist_utilsdividehidden_size_per_attention_head!num_attention_heads_per_partitionr   qkvr   projr   attnr   apply_rotary_emb)selfrX   rY   rZ   r[   r\   	__class__r9   r?   ri   t   sD   


	z!Ernie4_5_VisionAttention.__init__rq   .c           	         s   |j \}}}| jdkrt|| jj| j}|jddd\}}}| jdkr@ttj| jd}||| j	 }||| j	 }||| j	 }||| j
| jf  fdd|||fD \}}}|||fS )Nr,         rF   )num_partitionsc                 3   s    | ]}|j   V  qd S N)viewr<   x	new_shaper9   r?   	<genexpr>   s    z5Ernie4_5_VisionAttention.split_qkv.<locals>.<genexpr>)shaper8   rU   rq   r7   chunkr   rm   split_tensor_along_last_dimrl   rp   ro   )	ru   rq   seq_lenbsr=   qkvsplitterr9   r   r?   	split_qkv   s$   


z"Ernie4_5_VisionAttention.split_qkvr~   
cu_seqlensrotary_pos_emb
max_seqlenc                 C   s   |  |\}}| |\}}}dd |||fD \}}}|d ur>tj||gdd}	| |	| | }
tj|
ddd\}}| j|||||d}t	|d
 }| |\}}|S )Nc                 s   s    | ]
}t |d  V  qdS )zs b ... -> b s ...N)r
   
contiguousr}   r9   r9   r?   r      s    z3Ernie4_5_VisionAttention.forward.<locals>.<genexpr>r   rF   ry   )querykeyvaluer   r   zb s h d -> s b (h d))rq   r   r:   rO   rt   cossinr   rs   r
   r   rr   )ru   r~   r   r   r   r=   r   r   r   	qk_concat
qk_rotatedoutputcontext_layerr9   r9   r?   forward   s*   z Ernie4_5_VisionAttention.forward)NrW   r{   )__name__
__module____qualname____doc__intr   strri   r:   Tensortupler   r   __classcell__r9   r9   rv   r?   rV   q   s<    /!rV   c                       sZ   e Zd Zeddfdededeej dedB de	f
 fdd	Z
d
ejdejfddZ  ZS )Ernie4_5_VisionMLPNrW   in_featureshidden_features	act_layerr[   r\   c                    sF   t    t|||| dd| _| | _t|||| dd| _d S )Nz.fc1)r[   r\   z.fc2)rh   ri   r   fc1actr   fc2)ru   r   r   r   r[   r\   rv   r9   r?   ri      s   
zErnie4_5_VisionMLP.__init__r~   r]   c                 C   s*   |  |\}}| |}| |\}}|S r{   )r   r   r   )ru   r~   
x_parallelr=   r9   r9   r?   r      s   
zErnie4_5_VisionMLP.forward)r   r   r   r   r   typennModuler   r   ri   r:   r   r   r   r9   r9   rv   r?   r      s     r   c                       s   e Zd Zedddfdedededeej de	egejf dB de
dB d	ed
df fddZ	ddejdejdejdejdB d
ejf
ddZ  ZS )Ernie4_5_VisionBlockNrW   rG   rY   	mlp_ratior   
norm_layerr[   r\   r]   c           	         sx   t    |d u rttjdd}||| _||| _t|| }t||||| dd| _	t
||||| dd| _d S )Nư>epsrd   )rX   rY   rZ   r[   r\   .mlp)r   r[   r\   )rh   ri   r   r   	LayerNormnorm1norm2r   rV   rs   r   mlp)	ru   rG   rY   r   r   r   r[   r\   mlp_hidden_dimrv   r9   r?   ri     s(   



zErnie4_5_VisionBlock.__init__hidden_statesr   r   r   c                 C   s4   || j | ||||d }|| | | }|S )Nr   r   r   )rs   r   r   r   )ru   r   r   r   r   r9   r9   r?   r   %  s   zErnie4_5_VisionBlock.forwardr{   )r   r   r   r   r   floatr   r   r   r   r   r   ri   r:   r   r   r   r9   r9   rv   r?   r     sD    	'r   c                	       sN   e Zd Z				ddedededd	f fd
dZdejdejfddZ  ZS )Ernie4_5_VisionPatchEmbed   rx      rW   
patch_sizein_channelsrX   r]   Nc                    s:   t    || _|| _|| _tj|| | |dd| _d S )NF)ra   )rh   ri   r   r   rX   r   Linearrr   )ru   r   r   rX   r\   rv   r9   r?   ri   7  s   
z"Ernie4_5_VisionPatchEmbed.__init__r   c                 C   s"   | j jj}||}|  |}|S r{   )rr   weightdtypeto)ru   r   target_dtyper9   r9   r?   r   G  s   


z!Ernie4_5_VisionPatchEmbed.forward)r   rx   r   rW   )	r   r   r   r   ri   r:   r   r   r   r9   r9   rv   r?   r   6  s    r   c                       s@   e Zd Zddededdf fddZdedejfd	d
Z  Z	S )Ernie4_5_VisionRotaryEmbedding     @rG   thetar]   Nc                    s0   t    d|tjd|dtjd|   | _d S )Ng      ?r   ry   )startendstepr   )rh   ri   r:   arangefloat32inv_freq)ru   rG   r   rv   r9   r?   ri   P  s   
z'Ernie4_5_VisionRotaryEmbedding.__init__seqlenc                 C   s,   t j|| jj| jjd}t j|| jd}|S )Ndevicer   )inputvec2)r:   r   r   r   r   outer)ru   r   seqfreqsr9   r9   r?   r   V  s
   z&Ernie4_5_VisionRotaryEmbedding.forward)r   )
r   r   r   r   r   ri   r:   r   r   r   r9   r9   rv   r?   r   O  s    r   c                	       s   e Zd Z			ddededB deddf fdd	Zedej	fd
dZ	edej
fddZ
dejdejfddZdejdejdB fddZ	ddejdejdejfddZdee fddZ  ZS )Ernie4_5_VisionTransformerr   NrW   norm_epsr[   r\   r]   c                    s   t    |j}|j}|j}|j}|j |j}	|j|j	|| _| _ | _t
||  dd| _ttj|d  }
t|
d | _t fddt|	D | _| ks`J dtj|dd| _t|
t d	| _d S )
Nz.patch_embed)r   r   rX   r\   r   ry   c                    s*   g | ]}t   d | dqS )z.blocks.)rG   rY   r   r   r[   r\   )r   )r<   	layer_idxrX   r   r   rY   r\   r[   r9   r?   r@     s    	z7Ernie4_5_VisionTransformer.__init__.<locals>.<listcomp>z5vit's config.hidden must be equal to config.embed_dimr   )r^   r   )rh   ri   r   spatial_merge_sizer   r7   rX   depthrY   r   r   patch_embedr   r   r   r   r   
ModuleListrJ   blockslnr6   r:   get_default_dtypeattn_backend)ru   vision_configr   r[   r\   r   r   r   r7   r   head_dimrv   r   r?   ri   _  sD   
	
z#Ernie4_5_VisionTransformer.__init__c                 C      | j jjjS r{   )r   rr   r   r   ru   r9   r9   r?   r        z Ernie4_5_VisionTransformer.dtypec                 C   r   r{   )r   rr   r   r   r   r9   r9   r?   r     r   z!Ernie4_5_VisionTransformer.devicegrid_thwc                 C   s  g }|D ]]\}}}t |dd|}t |d|d}||| j | j|| j | jdddd }||| j | j|| j | jdddd }|t j	||gdd
|d qt j|dd}|d d dd f  }| |}	|	| d}
|
S )Nr,   rB   r   ry   rx   rF   )r:   r   	unsqueezeexpandreshaper   permuteflattenappendstackrepeatrO   maxr   )ru   r   pos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr   r9   r9   r?   rot_pos_emb  s8   "

z&Ernie4_5_VisionTransformer.rot_pos_embr   c                 C   s<   d }| j tjks| j tjkr|dd  |d d   }|S )Nr,   rB   )r   r+   
FLASH_ATTNROCM_AITER_FAr   )ru   r   r   r9   r9   r?   compute_attn_mask_seqlen  s
   z3Ernie4_5_VisionTransformer.compute_attn_mask_seqlenr   r   c                 C   s
  |  |}| |}||j}t|d d df |d d df  |d d df jdtjd}|d}|dkrJt	|||g}|d | |d< nt	||g}|j
dkr\|jdd}| |}t| jD ]\}}	|	||||d}qf| |}
|
j
d	kr|
jdd}
|
S )
Nr,   ry   r   )rG   r   rB   rF   r   rx   )r   r   r   r   r:   repeat_interleavecumsumint32	new_zerosrO   ndimr   r   	enumerater   r   squeeze)ru   r   r   num_padr   r   zerosr   iblkfinal_outputr9   r9   r?   r     s6   

,




z"Ernie4_5_VisionTransformer.forwardc                 C   sP   t | jdd}t }|D ]\}}|| }t|dt}||| || q|S NF)remove_duplicateweight_loaderdictnamed_parameterssetgetattrr   addru   weightsparams_dictloaded_paramsnameloaded_weightparamr  r9   r9   r?   load_weights  s   
z'Ernie4_5_VisionTransformer.load_weights)r   NrW   r   )r   r   r   r   r   r   ri   propertyr:   r   r   r   r   r   r   r  r  r   r9   r9   rv   r?   r   ^  s8    8 

)r   c                   @   N   e Zd ZU dZed ed< eeje	ddf ed< eeje	ddf ed< d	S )
Ernie4_5_VLImagePixelInputsz
    Dimensions:
        - np: The total number of patches over each image over each prompt in
              the batch
        - ni: Number of images
        - cps: Number of channels * patch_size * patch_size
    pixel_valuesr   npcpsnirx   image_grid_thwN
r   r   r   r   r	   __annotations__r   r:   r   r*   r9   r9   r9   r?   r      s
   
 r   c                   @   r  )
Ernie4_5_VLVideoPixelInputsz
    Dimensions:
        - np: The total number of patches over each image over each prompt in
              the batch
        - ni: Number of images
        - cps: Number of channels * temporal_patch_size * patch_size *
              patch_size
    pixel_values_videosr   r"  r#  r$  rx   video_grid_thwNr&  r9   r9   r9   r?   r(    s
   
 	r(  numberfactorr]   c                 C   s   t | | | S r{   )roundr+  r,  r9   r9   r?   round_by_factor&  s   r/  c                 C      t | | | S r{   )mathceilr.  r9   r9   r?   ceil_by_factor*     r3  c                 C   r0  r{   )r1  floorr.  r9   r9   r?   floor_by_factor.  r4  r6     @     heightwidth
min_pixels
max_pixelsc              
   C   sB  d}t | |t| | |kr5| |kr"t |t||}t|| |}nt |t| |}t|| |}|} |}t |t| |}t |t||}	||	 |krct| | | }
t| |
 |}t||
 |}	n||	 |k rt|| |  }
t| |
 |}t||
 |}	|||	 ks||	 |krtd| d|	 d| d| d	||	fS )N   zInvalid h_bar=z, w_bar=z': h_bar * w_bar must be >= min_pixels (z) and <= max_pixels (z).)r   minr/  r6  r1  sqrtr3  
ValueError)r:  r;  r,  r<  r=  	MAX_RATIO	new_width
new_heighth_barw_barbetar9   r9   r?   smart_resize2  s8   rH  c                       s^   e Zd Z	ddeddf fddZdd Zd	d
 Zdeeee	j
f  dee fddZ  ZS ) VariableResolutionResamplerModelrW   r\   r]   Nc              	      s~  t    || _|| _|| _|| _|| _|j| _| j| j | j | _| j| j | j | j | _	t
| j| jddt|dd | dd| _t | _t
| j| jddt|dd | dd| _tj| jdd| _| jrt
| j	| jddt|dd | dd| _t | _t
| j| jddt|dd | d	d| _tj| jdd| _t
| j| jddt|dd | d
d| _t|t|ddd| _d S )NTr[   z.spatial_linear1)ra   gather_outputr[   r\   z.spatial_linear2r   r   z.temporal_linear1z.temporal_linear2r   rms_norm_eps)r7   r   )rh   ri   in_dimout_dimconfigspatial_conv_sizetemporal_conv_sizeuse_temporal_convspatial_dimtemporal_dimr   r  spatial_linear1r   GELUspatial_geluspatial_linear2r   spatial_normtemporal_linear1temporal_gelutemporal_linear2temporal_normr   r   
after_norm)ru   rL  rM  rO  rP  rN  r\   rv   r9   r?   ri   [  s   
	

	
	

	
	
	z)VariableResolutionResamplerModel.__init__c                 C   s$   |j \}}|d||d  g}|S )NrB   ry   )r   r   )ru   r~   rO  SCr9   r9   r?   spatial_conv_reshape  s   
z5VariableResolutionResamplerModel.spatial_conv_reshapec                    s^    fdd}d
 fdd	} fdd} fdd	}||} j r)|||}||}||}|S )Nc                    sB     |  j}  | \} } | }  | \} } | } | S r{   )r`  rO  rT  rV  rW  rX  r~   r=   r   r9   r?   fwd_spatial  s   

z=VariableResolutionResamplerModel.forward.<locals>.fwd_spatialFc              
      s  |   }|d d df |d d dd f }}|d jd  }|d jd  }tj|j|jd}d|d< | d d |dd < g }	t	|||D ]!\}
}}t
d|
dD ]}|	t|||  ||d |   qYqNttj|	dd| j}	g }t	|||D ]'\}
}}t
|
dkrdnd|
dD ]}|t|||  ||d |   qqttj|dd| j}tj| d|	d}tj| d|d}tj||gdd} | S )	Nr   r,   rB   ry   r   )axis)rG   indexrF   )cpunumpyprodrO  r"  emptysizer   r  rN   rJ   r   r   r:   rD   concatenater   r   index_selectconcat)r~   r   	to_tensorgrid_thw_cpugrid_tgrid_hwgrid_hw_after_convtokens_per_img_or_vidbatch_offsetslice_offsetstemporoal_sizespatial_sizeb_offsettemp_offsetslice_offsets2x_timestep_1x_timestep_2r   r9   r?   fwd_placeholder  sZ   &

	zAVariableResolutionResamplerModel.forward.<locals>.fwd_placeholderc                    s4     | \} } | }  | \} } | } | S r{   )rY  rZ  r[  r\  ra  r   r9   r?   fwd_temporal  s
   

z>VariableResolutionResamplerModel.forward.<locals>.fwd_temporalc                    s     | \} } | } | S r{   )r   r]  ra  r   r9   r?   fwd_mlp  s   
z9VariableResolutionResamplerModel.forward.<locals>.fwd_mlp)F)rQ  )ru   r~   r   rb  r}  r~  r  r9   r   r?   r     s   
1
z(VariableResolutionResamplerModel.forwardr  c                 C   sZ   t | jdd}t }|D ]\}}||vrq|| }t|dt}||| || q|S r  r  r  r9   r9   r?   r    s   
z-VariableResolutionResamplerModel.load_weightsrW   )r   r   r   r   ri   r`  r   r   r   r:   r   r  r  r   r9   r9   rv   r?   rI  Z  s    U,OrI  c                   @   sX  e Zd Zdd ZdefddZdefddZdd	 Zd
ee	e
dB f fddZde
dee	e
f d
ee	e
f fddZdddde
de
de
dededB d
eee
f fddZde
de
dedB d
e
fddZde
de
de
dedB d
e
f
ddZd
efd d!Zd
e
fd"d#Zd$e
d
e
fd%d&Zde
dee	e
f d
e
fd'd(Zde
dee	e
f d
e
fd)d*ZdS )+Ernie4_5_VLProcessingInfoc                 C   s
   | j jjS r{   )ctxmodel_config	hf_configr   r9   r9   r?   get_hf_config     
z'Ernie4_5_VLProcessingInfo.get_hf_configkwargsc                 K   s   | j jdddi|S )Nuse_fastTr9   )r  get_hf_processorru   r  r9   r9   r?   r    s   z*Ernie4_5_VLProcessingInfo.get_hf_processorc                 K   s   | j di |jS )Nr9   )r  image_processorr  r9   r9   r?   get_image_processor  r4  z-Ernie4_5_VLProcessingInfo.get_image_processorc                 C   s   t d|  dS )NT)video_needs_metadataexpected_hidden_size)r"   _get_expected_hidden_sizer   r9   r9   r?   get_data_parser  s   z)Ernie4_5_VLProcessingInfo.get_data_parserr]   Nc                 C   s
   d d dS Nimagevideor9   r   r9   r9   r?   get_supported_mm_limits"  r  z1Ernie4_5_VLProcessingInfo.get_supported_mm_limitsr   	mm_countsc                 C   s   |   }| ||}||dS r  )get_max_image_tokensget_max_video_tokens)ru   r   r  max_image_tokensmax_video_tokensr9   r9   r?   get_mm_max_tokens_per_item%  s   
z4Ernie4_5_VLProcessingInfo.get_mm_max_tokens_per_itemr,   T)
num_frames	do_resizeimage_widthimage_heightr  r  r  c                C   s   |d u r|   }|  }|j}|j}|j}	|j}
|r0t||||	 |j|jd\}}t	||d}nt	||d}t
||
 d}|j| }|j| }|| | }||	d  }||fS )N)r:  r;  r,  r<  r=  )r;  r:  r,   ry   )r  r  r   r   rO  rP  rH  r<  r=  r    r   r:  r;  )ru   r  r  r  r  r  r  r   r   rO  rP  resized_heightresized_widthpreprocessed_sizerp  grid_hgrid_wnum_patchesnum_vision_tokensr9   r9   r?   _get_vision_info.  s.   	


z*Ernie4_5_VLProcessingInfo._get_vision_infoc                C   s   | j |||d\}}|S Nr  r  r  r  )ru   r  r  r  r=   num_image_tokensr9   r9   r?   get_num_image_tokensU  s   
z.Ernie4_5_VLProcessingInfo.get_num_image_tokensc                C   s   | j ||||d\}}|S Nr  r  r  r  r  )ru   r  r  r  r  r=   num_video_tokensr9   r9   r?   get_num_video_tokensc  s   
z.Ernie4_5_VLProcessingInfo.get_num_video_tokensc                 C   s   | j ddd d\}}|S )Ni r  r  )ru   max_image_sizer=   r9   r9   r?   !get_image_size_with_most_featuress  s   
z;Ernie4_5_VLProcessingInfo.get_image_size_with_most_featuresc                 C   s    |   \}}| j||d d}|S r  )r  r  )ru   target_widthtarget_heightr  r9   r9   r?   r  {  s   z.Ernie4_5_VLProcessingInfo.get_max_image_tokens
max_tokensc                 C   sT   |   \}}d}	 |d }| j|||d d}||krn|}q	|d dkr(|d8 }|S )Nr   Tr,   r  ry   )r  r  )ru   r  r  r  r  next_num_framesnext_max_tokensr9   r9   r?   _get_max_video_frames  s"   z/Ernie4_5_VLProcessingInfo._get_max_video_framesc                 C   sJ   | dd}| dd}|  | }| || }|t|d }t|dS )Nr  r   r  r,   ry   )getr  r  r   )ru   r   r  
max_images
max_videosr  max_total_framesmax_frames_per_videor9   r9   r?   !get_num_frames_with_most_features  s   
z;Ernie4_5_VLProcessingInfo.get_num_frames_with_most_featuresc                 C   s&   |   \}}| j||| ||d dS r  )r  r  r  )ru   r   r  r  r  r9   r9   r?   r    s   
z.Ernie4_5_VLProcessingInfo.get_max_video_tokens)r   r   r   r  objectr  r  r  r   r   r   r  r  boolr   r   r    r  r  r  r  r  r  r  r  r9   r9   r9   r?   r    s    




'





r  c                
   @   s   e Zd ZdejdedejfddZdedeeef deeef deeef de	f
d	d
Z
dedeeef dedee fddZde	deeef deeef fddZdS )Ernie4_5VLMultiModalProcessorr!  	mm_kwargsr]   c           
      C   s   | j  }|j}| j jdi |}tj|jtjdg d}tj|j	tjdg d}tj|j
tjd}|jd }	|ddg|	d}|ddg|	d}| sY| }| sa| }||tj | | }||j}|S )Nrc  )r,   rx   r,   r,   ry   r   rB   r9   )infor  r   r  r:   rD   
image_meanr   r   	image_stdrescale_factorr   r  r   is_contiguousr   r   r   )
ru   r!  r  r  r   r  image_mean_tensorimage_std_tensorr  patch_size_squaredr9   r9   r?   _pixel_values_norm  s@   



z0Ernie4_5VLMultiModalProcessor._pixel_values_normpromptmm_data
tok_kwargsc                 C   s  d|vr"d|vr"|dkr"| j  }||}tt|gddd}|S d|vr*g |d< d|vr2g |d< | j jdi |}t|dd}	|d rW|	sWtd	 d
d |d D |d< | j j	
|t|g|d |d dtdi ||}
|
d ur|
d }|d ur| |||
d< t|
 D ]L}|
| d u r|
|= q|dkr|
d }|
d }|d d df dk}|| |
d< ||  |
d< |
d jdd }|d | |
d< ||d  |
d< |
d= q|
S )NimagesvideosrW   )	input_idspt)tensor_typesupports_video_metadataFzgHF processor doesn't support video metadata. Timestamps will NOT be rendered. Please upgrade the model.c                 S   s"   g | ]}t |tr|d  n|qS r  )
isinstancer   )r<   r   r9   r9   r?   r@     s    zDErnie4_5VLMultiModalProcessor._call_hf_processor.<locals>.<listcomp>)textr  r  r   r   r,   r*  r%  rF   r!  r)  r9   )r  get_tokenizerencoder   r  r  r  loggerwarning_oncer  call_hf_processorr  listkeysrh  sum)ru   r  r  r  r  	tokenizer
prompt_idstokenizer_outputhf_processorr  processor_outputr!  r   r   pixel_values_allmaskimage_patch_numr9   r9   r?   _call_hf_processor  sl   	


z0Ernie4_5VLMultiModalProcessor._call_hf_processormm_itemshf_processor_mm_kwargsout_mm_kwargsc                    s`   | j jdi |dddddd jd dtdtf fdd	fd
ddD S )Nz<|image@placeholder|>z<|video@placeholder|>r  z<|IMAGE_PLACEHOLDER|>ry   item_idxmodalityc                    sh   | |  }|| d j }t|tjsJ |dkr&t| j  }nt|  } | | S )N	_grid_thwr  )datar  r:   r   r   rh  rP  )r  r  out_itemr   
num_tokens)after_placeholderr  merge_lengthr  r9   r?   get_replacement_ernie45vlC  s   
zTErnie4_5VLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_ernie45vlc              	      s&   g | ]}t | | t|d dqS ))r  )r  targetreplacement)r&   r   )r<   r  )before_placeholderr  r9   r?   r@   Q  s    
zEErnie4_5VLMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>r9   )r  r  rO  r   r   )ru   r  r  r  r9   )r  r  r  r  r  r  r?   _get_prompt_updates.  s   
z1Ernie4_5VLMultiModalProcessor._get_prompt_updates	hf_inputsc                 C   sd   | dtd}|d}| dtd}|d}ttd|tdtd|tddS )Nr%  )r   rx   rB   r*  r  r  )r!  r%  r)  r*  )r  r:   ri  rh  r  r   flat_from_sizesbatched)ru   r  r  r%  image_grid_sizesr*  video_grid_sizesr9   r9   r?   _get_mm_fields_configZ  s   

z3Ernie4_5VLMultiModalProcessor._get_mm_fields_configN)r   r   r   r:   r   r  r  r   r   r   r  r!   r   r   r   r'   r  r   r  r9   r9   r9   r?   r    sF    
%



M

,

r  c                   @   s   e Zd Zdeeef defddZ	ddedeeef deeef dB defdd	Z	dd
dedededede
dB f
ddZdS )Ernie4_5_VLDummyInputsBuilderr  r]   c                 C   sd   | dd}| dd}d}t|D ]}|d|d  d7 }qt|D ]}|d|d  d	7 }q#|S )
Nr  r   r  rW   zPicture r,   z2:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>zVideo z2:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>)r  rJ   )ru   r  
num_images
num_videosr  r	  r9   r9   r?   get_dummy_textr  s   z,Ernie4_5_VLDummyInputsBuilder.get_dummy_textNr   
mm_optionsc                 C   s   | dd}| dd}| j \}}| j||}|r!| dnd }	|r*| dnd }
| j||||	d| j|||||
ddS )Nr  r   r  )r;  r:  r  	overrides)r;  r:  r  r   r  r  )r  r  r  r  _get_dummy_images_get_dummy_videos)ru   r   r  r  r  r   r  r  target_num_framesimage_overridesvideo_overridesr9   r9   r?   get_dummy_mm_data  s,   z/Ernie4_5_VLDummyInputsBuilder.get_dummy_mm_data)r  r;  r:  r  r   r  c                C   s   |rD|j r|j |krtd|j | t||j }|jr.|j|kr(td|j| t||j}|jrD|j|kr>td|j| t||j}t|d}tj|||dfdtj	d}g }t
|D ]}d|d |d	d
 t
|D ddd}	| |	f}
||
 q\|S )Nz]video.num_frames override (%d) exceeds model's maximum number of frames (%d), will be ignoredzMvideo.width override (%d) exceeds model's maximum width (%d), will be ignoredzOvideo.height override (%d) exceeds model's maximum height (%d), will be ignoredry   rx      rc  g       @c                 S   s   g | ]}|qS r9   r9   )r<   r	  r9   r9   r?   r@     s    zCErnie4_5_VLDummyInputsBuilder._get_dummy_videos.<locals>.<listcomp>opencvF)fpsdurationtotal_num_framesframes_indicesvideo_backenddo_sample_frames)r  r  warningr?  r;  r:  r   r"  fulluint8rJ   copyr   )ru   r;  r:  r  r   r  r  video_itemsr	  video_metadata
video_itemr9   r9   r?   r    sN   	



z/Ernie4_5_VLDummyInputsBuilder._get_dummy_videosr{   )r   r   r   r   r   r   r  r   r   r	  r   r  r9   r9   r9   r?   r  q  s0    

(r  )r  dummy_inputsc                       s  e Zd Zg dddgdZeddddd	d
ddddddZededededB fddZ	dIde
deddf fddZdejdejdB fddZdejd ejdejfd!d"Zd#ejddfd$d%Zd&ee d'ee deejef fd(d)Zd*ededB fd+d,Zd*ededB fd-d.Zd/edeejd0f fd1d2Zd3edeejd0f fd4d5Zd*edefd6d7Zd*ededB fd8d9Z	dJdd:d;d#ejd<edB d=ejdB d>e dejf
 fd?d@Z!		dKd#ejdB dAejdBe"dB dCejdB fdDdEZ#dFe$eeejf  de%e fdGdHZ&  Z'S )L&Ernie4_5_VLMoeForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.lm_head.zlanguage_model.model.zresampler_model.)zlm_head.zmodel.z%language_model.model.resampler_model.zspatial_linear1.zspatial_linear2.zspatial_norm.ztemporal_linear1.ztemporal_linear2.ztemporal_norm.)zspatial_linear.0.zspatial_linear.2.zspatial_linear.3.ztemporal_linear.0.ztemporal_linear.2.ztemporal_linear.3.)orig_to_new_prefixorig_to_new_substrr  r	  r]   Nc                 C   s$   | drdS | drdS td)Nr  z1<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>r  z1<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>z)Only image or video modality is supported)
startswithrA  )clsr  r	  r9   r9   r?   get_placeholder_str  s
   

z:Ernie4_5_VLMoeForConditionalGeneration.get_placeholder_strrW   vllm_configr\   c              
      sl  t    |jj}|j}|jj}|| _|| _| |ddh0 t|j	t
|dd|t|dd| _t| jj| jj| jj| jj| jt|dd| _W d    n1 sQw   Y  | | t|t|d	d
| _W d    n1 spw   Y  d | _| jj| _t
| jdd rdd | jjt
| jdd t
| jdd t
| jdd t
| jdd fD }tj|tjd| _d S d | _d S )Nr  r  rK  r   vision_model)r   r[   r\   resampler_model)rN  r\   language_model)r'  r\   im_patch_idc                 S   s   g | ]}|d ur|qS r{   r9   )r<   token_idr9   r9   r?   r@   ,  s
    zCErnie4_5_VLMoeForConditionalGeneration.__init__.<locals>.<listcomp>image_start_token_idimage_end_token_idvideo_start_token_idvideo_end_token_idrc  )rh   ri   r  r  r[   multimodal_configrN  _mark_tower_modelr   r   r  r5   r(  rI  pixel_hidden_sizer7   rO  rP  r)  _mark_language_modelr-   r*  visual_token_maskmake_empty_intermediate_tensorsr+  r:   rD   long_visual_token_ids_tensor_cache)ru   r'  r\   rN  r[   r1  visual_token_idsrv   r9   r?   ri     sX   




z/Ernie4_5_VLMoeForConditionalGeneration.__init__r   c                 C   s   | j |S )zcompute logits)r*  compute_logits)ru   r   r9   r9   r?   r:  =  s   z5Ernie4_5_VLMoeForConditionalGeneration.compute_logitsr!  r   c                 C   s   |d ur?||dk }|  d dkrtd|   d|dd}tjt|d d dd f |d d df dg ddd}| ||}|S )	Nr   rx   zgrid_thw has z6 elements after filtering,which is not divisible by 3.rB   r,   )r,   r   r   r   )r   )numelrA  r   Fpadr:   r   r(  )ru   r!  r   image_featuresr9   r9   r?   _vision_forwardD  s   (z6Ernie4_5_VLMoeForConditionalGeneration._vision_forwardr  c                 C   sB   | j du r
d| _dS | j j|j|jd}t||dd| _dS )z@Set mask for visual tokens (image/video patches and delimiters).Nr   rB   r,   )r8  r5  r   r   r   r:   isinr   )ru   r  visual_token_ids_tensorr9   r9   r?   _set_visual_token_maskZ  s   

z=Ernie4_5_VLMoeForConditionalGeneration._set_visual_token_maskinput_tokensmm_featuresc           )   	   C   s  t |ddh}dd |dg D }dd |dg D }| j}|j}|j}|j}	|j}
|j}g }|s7|rg }d}|D ]0}||krFd}n||	krLd}||krZ|du rZ|	d q=||krh|du rh|	d	 q=|	d
 q=g }t
t|dd D ]\}}t|}|d d }|d d d }|	|||f qzd}d}|D ]\}}}t|dkr|d  d nd}|dkr|| \}}}|||
 ||
 }} }!t|ddd| |!  }"t| ddd|d|! }#t|!ddd|| d }$|	t|"|#|$g|  |d7 }q|d	kr||| \}}}|| ||
 ||
 }} }!t|D ]C}%t|%ddd| |!  }"t| ddddd|! }#t|!dddd| d }$|	t|"|#|$g|  q/|d7 }|d7 }q|| }&|	t|&dddd|  d}qnt|}&|	t|&dddd tj|dddd}'|' d t|  }(|'|(fS )Nr%  r*  c                 S      g | ]}|  qS r9   tolistr<   itemr9   r9   r?   r@   r      zTErnie4_5_VLMoeForConditionalGeneration.get_mrope_input_positions.<locals>.<listcomp>c                 S   rE  r9   rF  rH  r9   r9   r?   r@   s  rJ  FTr  r  r  c                 S   s   | d S )Nr,   r9   )r~   r9   r9   r?   <lambda>  s    zRErnie4_5_VLMoeForConditionalGeneration.get_mrope_input_positions.<locals>.<lambda>r   rB   r,   rx   rF   )r   gather_kwargsr  rN  r+  r/  r0  rO  rP  r   	itertoolsgroupbyr  r  lenr   r:   r   r|   r   r   r   rJ   rD   rO   r   rI  ))ru   rC  rD  r  r%  r*  r  image_token_idr/  r0  rO  rP  llm_pos_ids_listinput_token_typevideo_check_flgtokeninput_type_groupr   
group_iter
group_liststart_index	end_indexvideo_frame_nummm_data_idxmodality_type	start_idxend_idxst_idxr   r   r   
llm_grid_t
llm_grid_h
llm_grid_wt_indexh_indexw_indext_idxtext_lenllm_positionsmrope_position_deltar9   r9   r?   get_mrope_input_positionsi  s   














M z@Ernie4_5_VLMoeForConditionalGeneration.get_mrope_input_positionsr  c                 K   >   | dd }| dd }|d u rd S |d urtd||dS d S )Nr!  r%  )r   r!  r%  )popr   )ru   r  r!  r%  r9   r9   r?   _parse_and_validate_image_input     zFErnie4_5_VLMoeForConditionalGeneration._parse_and_validate_image_inputc                 K   rk  )Nr)  r*  )r   r)  r*  )rl  r(  )ru   r  r)  r*  r9   r9   r?   _parse_and_validate_video_input  rn  zFErnie4_5_VLMoeForConditionalGeneration._parse_and_validate_video_inputimage_input.c                 C   sj   |d }|j dksJ |d | jj}| j||d}| ||}| jj}|d| | }||	 S )Nr%  ry   r!  r!  r   rB   )
r  r   r(  r   r?  r)  r   rh  rC   rG  )ru   rp  r   r!  r>  image_embeds
merge_sizesizesr9   r9   r?   _process_image_input  s   z;Ernie4_5_VLMoeForConditionalGeneration._process_image_inputvideo_inputc                 C   sr   |d }|j dksJ |d | jj}| j||d}| ||}| jj}|d| jj	 | | }|
| S )Nr*  ry   r)  rq  rB   )r  r   r(  r   r?  r)  r   rh  rN  rP  rC   rG  )ru   rv  r   r)  video_featuresvideo_embedsrs  rt  r9   r9   r?   _process_video_input  s"   z;Ernie4_5_VLMoeForConditionalGeneration._process_video_inputc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)r!  rr  r  )r)  rx  r  r9   )rm  ro  )ru   r  
modalities	input_keyr9   r9   r?   %_parse_and_validate_multimodal_inputs4  s   zLErnie4_5_VLMoeForConditionalGeneration._parse_and_validate_multimodal_inputsc           	      K   sv   | j di |}|sd S d}|D ](}|dkr%|d }| |}|t|7 }|dkr8|d }| |}|t|7 }q|S )Nr9   r  r  )r|  ru  r   ry  )	ru   r  rz  multimodal_embeddingsr  rp  image_embeddingsrv  video_embeddingsr9   r9   r?   embed_multimodalG  s   

z7Ernie4_5_VLMoeForConditionalGeneration.embed_multimodalF)is_multimodalhandle_oov_mm_tokenr}  r  r  c                   sN   |d urt |dkr| | |d u s|d u rt |S t j||||dS )Nr   )r}  r  r  )rO  rB  rh   embed_input_ids)ru   r  r}  r  r  rv   r9   r?   r  ^  s   
z6Ernie4_5_VLMoeForConditionalGeneration.embed_input_ids	positionsintermediate_tensorsinputs_embedsc           
      K   s   ||||d}| j d urJ| j jd |jd kr?|jd | j jd  }tj|| j jd f| j j| j jd}tj| j |gdd| _ |d| j i d | _ | jj	di ||}	|	S )N)r  r  r  r  r   r,   )r   r   rF   r5  r9   )
r5  r   r:   r  r   r   rO   updater*  model)
ru   r  r  r  r  r  forward_kwargspadding_lenr=  r   r9   r9   r?   r   t  s,   	

z.Ernie4_5_VLMoeForConditionalGeneration.forwardr  c                 C   s   t | }|j|| jdS )N)mapper)r3   r  hf_to_vllm_mapper)ru   r  loaderr9   r9   r?   r    s   z3Ernie4_5_VLMoeForConditionalGeneration.load_weightsr  r{   )NN)(r   r   r   packed_modules_mappingr4   r  classmethodr   r   r&  r   ri   r:   r   r:  r?  rB  r  r   r   rj  r  Ernie4_5_VLImageInputsrm  Ernie4_5_VLVideoInputsro  ru  ry  r  r|  r.   r  r  r  r(   r   r   r  r  r   r9   r9   rv   r?   r    s    		5


 




,$r  )r7  r8  r9  )tr   rM  r1  collections.abcr   r   r   r   	functoolsr   typingr   r   r	   rg  r"  r:   torch.nnr   torch.nn.functional
functionalr<  einopsr
   transformersr   vllm.configr   vllm.config.multimodalr   r   vllm.distributedr   r   rm   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   2vllm.model_executor.layers.rotary_embedding.commonr   -vllm.model_executor.model_loader.weight_utilsr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser    r!   r"   vllm.multimodal.processingr#   r$   r%   r&   r'   vllm.sequencer(   vllm.utils.tensor_schemar)   r*   #vllm.v1.attention.backends.registryr+   ernie45_vl_moer-   
interfacesr.   r/   r0   r1   r2   r3   r4   r5   visionr6   r   r  r   rU   r   rV   r   r   r   r   r   r   r  r(  r  r   r/  r3  r6  rH  rI  r  r  r  register_processorr  r9   r9   r9   r?   <module>   s   r4 #
( 9 * 7f

