o
    
۾i                     @   s2  U d Z ddlmZmZmZmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlZddlmZ ddlm  mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZE ddlFmGZGmHZH ddlImJZJ ddlKmLZL ddlMmNZNmOZO ddlPmQZQmRZRmSZSmTZTmUZUmVZVmWZW ddlmXZXmYZYmZZZm[Z[ ddl\m]Z] ee^Z_G d d! d!eNZ`G d"d# d#eNZae`eaB Zbeecd$< G d%d& d&ejdZeG d'd( d(ejdZfG d)d* d*ejdZgG d+d, d,ejdZhG d-d. d.ejdZiG d/d0 d0ejdZjd1eekejlf fd2d3ZmG d4d5 d5e=ZnG d6d7 d7eAZoG d8d9 d9e?eo ZpG d:d; d;e@eo Zqe0jreqeoepd<G d=d> d>ejdeTeSeUeVeWeR	ZsdS )?zDInference-only HunYuan-VL model compatible with HuggingFace weights.    )CallableIterableMappingSequence)partial)	AnnotatedAnyLiteral	TypeAliasN)BatchFeature)
VllmConfig)BaseDummyOptions)parallel_state)utils)init_logger)
get_act_fn)MMEncoderAttention)RMSNorm)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)	ImageItemModalityDataMultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems)DictEmbeddingItems	ImageSizeModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)HunYuanVLConfigHunYuanVLVisionConfig)HunYuanVLProcessor)smart_resize)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsEagle3SupportsLoRASupportsMultiModal
SupportsPPSupportsQuantSupportsXDRoPE)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)is_vit_use_data_parallelc                   @   N   e Zd ZU dZed ed< eeje	ddf ed< eeje	ddf ed< d	S )
HunYuanVLImagePixelInputsz
    Dimensions:
        - np: Number of patches
        - ni: Number of images
        - cps: Number of channels * patch_size * patch_size
    pixel_valuestypenpcpsni   image_grid_thwN
__name__
__module____qualname____doc__r	   __annotations__r   torchTensorr1    rP   rP   ]/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/hunyuan_vision.pyr@   j      
 r@   c                   @   r?   )
HunYuanVLImageEmbeddingInputszu
    Dimensions:
        - nf: Number of image features
        - hs: Hidden size
        - ni: Number of images
    image_embedsrB   nfhsrE   rF   rG   NrH   rP   rP   rP   rQ   rS      rR   rS   HunYuanVLImageInputsc                       sd   e Zd Zdejddfdedededeej	gej	f de
dB d	ef fd
dZdej	fddZ  ZS )HunYuanVisionMLPTN in_featureshidden_featuresbiasact_fnquant_configprefixc                    sR   t    t }t||||| d|d| _t||||| d|d| _|| _d S )Nz.dense_h_to_4h)r\   r^   r_   
disable_tpz.dense_4h_to_h)super__init__r>   r   dense_h_to_4hr   dense_4h_to_hr]   )selfrZ   r[   r\   r]   r^   r_   use_data_parallel	__class__rP   rQ   rb      s&   
	
zHunYuanVisionMLP.__init__xc                 C   s&   |  |\}}| | |\}}|S N)rc   rd   r]   )re   ri   x_up_x_downrP   rP   rQ   forward   s   zHunYuanVisionMLP.forward)rI   rJ   rK   Fgeluintboolr   rN   rO   r   strrb   rn   __classcell__rP   rP   rg   rQ   rX      s&    rX   c                       sV   e Zd Z		ddededededB deddf fd	d
ZdejdejfddZ	  Z
S )HunYuanVisionAttentionNrY   	embed_dim	num_headsprojection_sizer^   r_   returnc              
      s   t    t }|rdnt | _t||| _t|| j| _	t
|| j||d|| d|d| _t|||| d|d| _| jd | _t| j	| j| j| dd	| _d S )
Nr2   T.qkv)hidden_size	head_sizetotal_num_headstotal_num_kv_headsr\   r^   r_   r`   z.o_proj)
input_sizeoutput_sizer^   r_   r`         z.attnr_   )ra   rb   r>   r   $get_tensor_model_parallel_world_sizetp_size
dist_utilsdividehidden_size_per_attention_head!num_attention_heads_per_partitionr   qkvr   o_projscaler   attn)re   rv   rw   rx   r^   r_   rf   rg   rP   rQ   rb      sH   
zHunYuanVisionAttention.__init__ri   c           	      C   sB   |  |\}}|jddd\}}}| |||}| |\}}|S )NrF   dim)r   chunkr   r   )	re   ri   r   rl   qkvoutoutputrP   rP   rQ   rn      s
   zHunYuanVisionAttention.forwardNrY   )rI   rJ   rK   rq   r   rs   rb   rN   rO   rn   rt   rP   rP   rg   rQ   ru      s*    2ru   c                       s   e Zd Zejdddfdedededeejgejf deege	j
f dB dedB d	ed
df fddZdejd
ejfddZ  ZS )HunYuanVisionBlockNrY   r   rw   mlp_hidden_dimr]   
norm_layerr^   r_   ry   c                    sn   t    |d u rttjdd}||| _||| _t||||| dd| _t	|||d|| dd| _
d S )Ngư>epsz
.self_attn)rv   rw   rx   r^   r_   Tz.mlp)r]   r\   r^   r_   )ra   rb   r   nn	LayerNorminput_layernormpost_attention_layernormru   	self_attnrX   mlp)re   r   rw   r   r]   r   r^   r_   rg   rP   rQ   rb      s(   



zHunYuanVisionBlock.__init__ri   c                 C   s,   ||  | | }|| | | }|S rj   )r   r   r   r   )re   ri   rP   rP   rQ   rn     s   zHunYuanVisionBlock.forward)rI   rJ   rK   ro   rp   rq   r   rN   rO   r   Moduler   rs   rb   rn   rt   rP   rP   rg   rQ   r      s6    	r   c                       sD   e Zd Zdef fddZdejdeee  dejfddZ	  Z
S )	HunYuanVisionPatchEmbedconfigc                    s   t    || _|j| _|j| _|j| _|j| _|j| _t	j
|j| j| j| jdd| _|j| j d | _| jd | _t| jd | _t	| j| j| _d | _d S )NT)in_channelsout_channelskernel_sizestrider\      r2   g      ?)ra   rb   r   r{   rv   
patch_sizenum_channelsspatial_merge_sizeinterpolate_moder   Conv2dpatch_embeddingmax_image_sizemax_num_patchesnum_positionsrq   position_edge	Embeddingposition_embeddingpatch_pos_embed)re   r   rg   rP   rQ   rb   '  s&   

z HunYuanVisionPatchEmbed.__init__rA   grid_thwry   c                 C   s&  | d}||| j| j| j}| |}|ddd}| jd u rFd| j| j| j	f}| j
jdd d d f |dddd | _g }|D ];}|\}}	}
|	d |
d }	}
tjj| j|	| j |
| j f| jdd}|| j	dddd|j}|| qJtj|dd	}|| }|S )
Nr   r   r2   rF   r   g?F)scale_factormodealign_cornersr   )sizereshaper   r   r   squeeze	unsqueezer   r   rv   r   weightpermutefloatr   
functionalinterpolater   	transposetodtypeappendrN   cat)re   rA   r   num_patchespatch_embedspatch_pos_shapepatch_pos_embed_listgridrl   h0w0r   
embeddingsrP   rP   rQ   rn   B  sH   



zHunYuanVisionPatchEmbed.forward)rI   rJ   rK   r-   rb   rN   rO   listrq   rn   rt   rP   rP   rg   rQ   r   &  s    
r   c                       s.   e Zd Z			d	 fdd	Zd
ddZ  ZS )HunYuanVisionPatchMergerr   h㈵>rY   c              	      s   t    || _|d }ttj||d ||dt tj|d |d dd| _t|d || _	t
t|d | | _t
t|| | _t
t|| | _t
t|| | _t||d| _t||d| _d S )Nr   r   )r   r      r2   )r   r   )ra   rb   r   r   
Sequentialr   GELUprojLinearr   	ParameterrN   randnimage_newlineimage_begin	image_end	image_sepr   
before_rms	after_rms)re   r   r   r   rms_norm_epsr_   	embed_stdrg   rP   rQ   rb   w  s(   

z!HunYuanVisionPatchMerger.__init__   r   c           
      C   s  |  |}|\}}|j}|ddd|jd d||}| |}|j\}}}}tj|| jd|dd	|||d
|gdd}|||dddd}| |}| jddd	|d|jd 
|}| jddd	|d|jd 
|}	tj|||	gdd}| |S )Nr   r   r2   r   r   )r   r   r   r   shaper   rN   r   r   expandr   r   r   r   r   )
re   ri   r   hwr   bcbeginendrP   rP   rQ   rn     s    
 
&
&&
z HunYuanVisionPatchMerger.forward)r   r   rY   )r   )rI   rJ   rK   rb   rn   rt   rP   rP   rg   rQ   r   v  s     r   c                	       s   e Zd Z		ddededB deddf fddZedej	fd	d
Z	edej
fddZ
dejdeee  dejfddZdeeeejf  dee fddZ  ZS )HunYuanVisionTransformerNrY   vision_configr^   r_   ry   c                    s  t    j}j| _j| _j| _ddlm} |d t	| _
W d    n1 s.w   Y  ttjjd |d t fddt|D | _W d    n1 s\w   Y  |d tjjjj d	d
| _W d    d S 1 sw   Y  d S )Nr   )set_model_tagr   r   r   c                    s8   g | ]}t jjjtj  d | dqS )z.layers.)r   rw   r   r]   r   r^   r_   )r   r{   num_attention_headsintermediate_sizer   
hidden_act).0	layer_idxr   r_   r^   r   rP   rQ   
<listcomp>  s    
z5HunYuanVisionTransformer.__init__.<locals>.<listcomp>r   z	.perceive)r   r   r_   )ra   rb   num_hidden_layersr{   r   rw   r   vllm.compilation.backendsr   r   r   r   r   r   r   
ModuleListrangelayersr   out_hidden_sizeperceive)re   r   r^   r_   r   r   rg   r   rQ   rb     s4   





"z!HunYuanVisionTransformer.__init__c                 C      | j jjjS rj   )r   r   r   r   re   rP   rP   rQ   r        zHunYuanVisionTransformer.dtypec                 C   r   rj   )r   r   r   devicer   rP   rP   rQ   r    r   zHunYuanVisionTransformer.deviceri   r   c                    sd  | d}dg}|j| j| jd}| ||}|D ]\}}}t|t|t|}}}|||  qtj|tj	d}tj
|dtj	d}|j| jdd}||d}|d}dd	 |D }	| jD ] |j|	d
d}
 fdd	|
D }
tj|
d
d}qb|d
d  |d d   }	|j|	d
d}g }t||D ]\}}|| j| |d
d  dd q|S )Nr   )r  r   )r   )r   r   T)r  non_blockingr   c                 S   s"   g | ]\}}}t |t | qS rP   )rq   )r   rl   r   r   rP   rP   rQ   r     s   " z4HunYuanVisionTransformer.forward.<locals>.<listcomp>r2   r   c                    s   g | ]} |qS rP   rP   )r   playerrP   rQ   r         )r   )r   r   r  r   r   rq   r   rN   tensorint32cumsumr   r   r   splitr   tolistzipr   
contiguousr   )re   ri   r   seq_len
cu_seqlenshidden_statestr   r   split_lengthspartssplit_itemsimage_embeds_listr   
split_itemrP   r  rQ   rn     s2   


z HunYuanVisionTransformer.forwardweightsc                 C   s   g d}t | jdd}t }|D ]9\}}|D ]\}}}	||vr!q|||}|| }
|
j}||
||	  n|| }
t|
dt}||
| || q|S )N))rz   z.q_projr   )rz   z.k_projr   )rz   z.v_projr   F)remove_duplicateweight_loader)dictnamed_parameterssetreplacer  getattrr   add)re   r  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr  rP   rP   rQ   load_weights  s"   
z%HunYuanVisionTransformer.load_weightsr   )rI   rJ   rK   r-   r   rs   rb   propertyrN   r   r  rO   r   rq   rn   r   tupler  r)  rt   rP   rP   rg   rQ   r     s0    -

,,r   	hf_inputsc                 C   sD   |  dtd}|d}ttd|td|tjddddS )NrG   )r   rF   r   imageT)keep_on_cpu)rA   rT   rG   )getrN   emptyprodr  r   flat_from_sizesbatched)r,  rG   image_grid_sizesrP   rP   rQ   _hunyuan_vl_field_config,  s   


r5  c                       sD   e Zd Zdeeejf ee B de	e
e
f dB f fddZ  ZS )HunYuanVLMultiModalDataParserdatary   Nc                    s*   t |trt|dddhtdS t |S )Nr-  rT   rG   )modalityrequired_fieldsfields_factory)
isinstancer  r!   r5  ra   _parse_image_data)re   r7  rg   rP   rQ   r<  7  s   
z/HunYuanVLMultiModalDataParser._parse_image_data)rI   rJ   rK   r  rs   rN   rO   r   r   r#   r   r<  rt   rP   rP   rg   rQ   r6  6  s    r6  c                   @   s   e Zd Zdd ZdedefddZdedefddZd	d
 Zde	e
edB f fddZdede	e
ef de	e
ef fddZddddedededededB deeef fddZdedededB defddZdefddZdefd d!ZdS )"HunYuanVLProcessingInfoc                 C   s   | j tS rj   )ctxget_hf_configr,   r   rP   rP   rQ   r?  G  s   z%HunYuanVLProcessingInfo.get_hf_configkwargsry   c                 K   s    | j jtfd|ddi|S )Nuse_fastT)r>  get_hf_processorr.   popre   r@  rP   rP   rQ   rB  J  s   
z(HunYuanVLProcessingInfo.get_hf_processorc                 K   s   | j di |jS )NrP   )rB  image_processorrD  rP   rP   rQ   get_image_processorT  s   z+HunYuanVLProcessingInfo.get_image_processorc                 C   s   t |  dS )N)expected_hidden_size)r6  _get_expected_hidden_sizer   rP   rP   rQ   get_data_parserZ  s   z'HunYuanVLProcessingInfo.get_data_parserNc                 C   s   dd iS )Nr-  rP   r   rP   rP   rQ   get_supported_mm_limits_  s   z/HunYuanVLProcessingInfo.get_supported_mm_limitsr  	mm_countsc                 C   s   |   }d}||dS )Nr   )r-  video)get_max_image_tokens)re   r  rK  max_image_tokensmax_video_tokensrP   rP   rQ   get_mm_max_tokens_per_itemb  s   
z2HunYuanVLProcessingInfo.get_mm_max_tokens_per_itemr2   T)
num_frames	do_resizeimage_widthimage_heightrQ  rR  rE  c                C   s   |d u r|   }|  }|j}|j}|j}	|r-t||||	 |j|jd\}
}t||
d}nt||d}d}|j	| }|j
| }|| |	 ||	 d  d }||fS )N)heightwidthfactor
min_pixels
max_pixels)rV  rU  r2   r   )rF  r?  r   r   r   r/   rX  rY  r"   rU  rV  )re   rS  rT  rQ  rR  rE  	hf_configr   r   r   resized_heightresized_widthpreprocessed_sizegrid_tgrid_hgrid_wnum_vision_tokensrP   rP   rQ   _get_vision_infol  s0   	


z(HunYuanVLProcessingInfo._get_vision_infoc                C   s   | j |||d\}}|S NrS  rT  rE  rb  )re   rS  rT  rE  rl   num_image_tokensrP   rP   rQ   get_num_image_tokens  s   
z,HunYuanVLProcessingInfo.get_num_image_tokensc                 C   s   | j ddd d\}}|S )Ni   i    rd  re  )re   r   rl   rP   rP   rQ   !get_image_size_with_most_features  s   
z9HunYuanVLProcessingInfo.get_image_size_with_most_featuresc                 C   s   |   \}}| j||d dS rc  )rh  rg  )re   target_widthtarget_heightrP   rP   rQ   rM    s   z,HunYuanVLProcessingInfo.get_max_image_tokens)rI   rJ   rK   r?  objectr.   rB  rF  rI  r   rs   rq   rJ  rP  rr   r+  r"   rb  rg  rh  rM  rP   rP   rP   rQ   r=  F  s^    







(
r=  c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )HunYuanVLDummyInputsBuilderrK  ry   c                 C   s(   | dd}| jjtd}|j}|| S )Nr-  r   )typ)r/  inforB  r.   image_token)re   rK  
num_imageshf_processorro  rP   rP   rQ   get_dummy_text  s   z*HunYuanVLDummyInputsBuilder.get_dummy_textNr  
mm_optionsc                 C   s.   | dd}| j \}}d| j|||diS )Nr-  r2   )rV  rU  rp  )r/  rn  rh  _get_dummy_images)re   r  rK  rs  rp  ri  rj  rP   rP   rQ   get_dummy_mm_data  s   z-HunYuanVLDummyInputsBuilder.get_dummy_mm_datarj   )
rI   rJ   rK   r   rs   rq   rr  r   r   ru  rP   rP   rP   rQ   rl    s    
rl  c                
   @   s   e Zd Zdedeeef deeef deeef def
ddZded	eee	f d
e
dee fddZded	eeef deeef fddZdS )HunYuanVLMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsry   c                 C   s<   | j j| j jdi |tdd|i|tdi ||S )NtextrP   )rn  r>  call_hf_processorrB  r  )re   rw  rx  ry  rz  rP   rP   rQ   _call_hf_processor  s
   z/HunYuanVLMultiModalProcessor._call_hf_processormm_itemshf_processor_mm_kwargsout_mm_kwargsc                    sb   | j jd	i |}| j jd	i |}d|ji|jdtdtffdd  fdddD S )
Nr-  item_idxr8  c                    sd   | |  }|| d j }t|tjsJ |\}}}t|  t|  d  d }| g| S )N	_grid_thwr2   r   )r7  r;  rN   rO   rq   )r  r8  out_itemr   rl   r_  r`  
num_tokens)
merge_sizer  placeholderrP   rQ   get_replacement_hunyuan_vl  s   

zTHunYuanVLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_hunyuan_vlc              	      s(   g | ]}t || gt |d dqS ))r8  )r8  targetreplacement)r)   r   )r   r8  )r  r  rP   rQ   r     s    
zDHunYuanVLMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>)r-  rP   )rn  rB  rF  image_token_idr  rq   rs   )re   r~  r  r  rq  rE  rP   )r  r  r  r  rQ   _get_prompt_updates  s   z0HunYuanVLMultiModalProcessor._get_prompt_updatesr,  c                 C   s   t |S rj   )r5  )re   r,  r  rP   rP   rQ   _get_mm_fields_config  s   z2HunYuanVLMultiModalProcessor._get_mm_fields_configN)rI   rJ   rK   rs   r   rk  r   r}  r$   r   r    r   r*   r  r   r  rP   rP   rP   rQ   rv    s8    





#

rv  )rn  dummy_inputsc                       s  e Zd ZedddddZdZdee dee de	j
fd	d
ZededededB fddZdddedef fddZdededB fddZdedee	j
df fddZdedefddZdedefdd Zd!eedf ddfd"d#Zdeedf fd$d%Zd&e	j
dB d'e	j
d(edB d)e	j
dB dede	j
eB fd*d+Zd,e	j
de	j
dB fd-d.Zd/eeee	j
f  de e fd0d1Z!de"fd2d3Z#  Z$S )4!HunYuanVLForConditionalGenerationzvisual.zlanguage_model.model.)zvit.vit.zvit.zmodel.)orig_to_new_prefixTinput_tokensmm_featuresry   c                 C   s  t |dh}dd |dg D }| j}|j}|jj}t|jd }t	
|}	t	|	|kd}
t	t|	}t	t|	}t	t|	}t	t|	}tt|
D ]a}|
| d }|| \}}}||| || }}}|d | }||||  t	d|d dd|dd ||||  t	d|ddd|d d ||||| < qT|d	krt	||||g}|S |d
krt	|||g}|S )NrG   c                 S   s   g | ]}|  qS rP   )r  )r   itemrP   rP   rQ   r   )  r  zPHunYuanVLForConditionalGeneration.get_xdrope_input_positions.<locals>.<listcomp>xdrope_sectionr2   r   r   r   r   rF   )r   gather_kwargsr/  r   image_start_token_idr   r   lenrope_scalingrN   r  argwherer   aranger   copy_r   r   stack)re   r  r  r@  rG   rZ  r  r   xd_numinput_tokens_tensorimage_start_indicesp_indexw_indexh_indext_indeximage_indexposr  r   r   rl   
llm_grid_h
llm_grid_w	token_numllm_positionsrP   rP   rQ   get_xdrope_input_positions   s\   


z<HunYuanVLForConditionalGeneration.get_xdrope_input_positionsr8  iNc                 C   s   | drdS td)Nr-  ul   <｜hy_place▁holder▁no▁100｜><｜hy_place▁holder▁no▁102｜><｜hy_place▁holder▁no▁101｜>z Only image modality is supported)
startswith
ValueError)clsr8  r  rP   rP   rQ   get_placeholder_strY  s   
z5HunYuanVLForConditionalGeneration.get_placeholder_strrY   r   vllm_configr_   c                   s   t    |jj}|| _| |dh t|j|jt	|dd| _
W d    n1 s+w   Y  | | t|t	|dddgd| _W d    n1 sMw   Y  | jj| _d S )Nr-  visual)r^   r_   language_model.modelHunYuanDenseV1ForCausalLMHunYuanMoEV1ForCausalLM)r  r_   architectures)ra   rb   model_configrZ  r   _mark_tower_modelr   r   r^   r=   r  _mark_language_modelr<   language_modelmake_empty_intermediate_tensors)re   r  r_   r   rg   rP   rQ   rb   `  s*   


z*HunYuanVLForConditionalGeneration.__init__r@  c                 K   s   | dd }| dd }| dd }|d u r|d u rd S t|tr(tj|dd}t|jdkr@|jd }|d|}|dd}|d urKtd||dS |d urVt	d||d	S d S )
NrA   rT   rG   r   r   rF   r   )rB   rA   rG   )rB   rT   rG   )
rC  r;  r   rN   r   r  r   r   r@   rS   )re   r@  rA   rT   rG   last_dimrP   rP   rQ   _parse_and_validate_image_input{  s0   

zAHunYuanVLForConditionalGeneration._parse_and_validate_image_inputimage_input.c                 C   sZ   |d }|j dksJ | }|d dkr |d | jj}|S |d }| j||d}|S )NrG   r   rB   rT   rA   )r   )ndimr  rB   r  r   )re   r  r   grid_thw_listrT   rA   rP   rP   rQ   _process_image_input  s   z6HunYuanVLForConditionalGeneration._process_image_inputc                 K   s6   i }|D ]}|dv rd|vr| j di ||d< q|S )N)rA   rT   r-  rP   )r  )re   r@  mm_input_by_modality	input_keyrP   rP   rQ   %_parse_and_validate_multimodal_inputs  s   
zGHunYuanVLForConditionalGeneration._parse_and_validate_multimodal_inputsc                 K   sP   | j di |}|sg S d}|D ]}|| }|dkr%| |}|t|7 }q|S )NrP   r-  )r  r  r+  )re   r@  r  multimodal_embeddingsr8  multimodal_inputimage_embeddingsrP   rP   rQ   embed_multimodal  s   
z2HunYuanVLForConditionalGeneration.embed_multimodalr   c                 C   s   || j j_d S rj   )r  modelaux_hidden_state_layers)re   r   rP   rP   rQ   set_aux_hidden_state_layers  s   z=HunYuanVLForConditionalGeneration.set_aux_hidden_state_layersc                 C   s    t | jjj}d|d |d fS )Nr   rF   )r  r  r  r   )re   
num_layersrP   rP   rQ   "get_eagle3_aux_hidden_state_layers  s   zDHunYuanVLForConditionalGeneration.get_eagle3_aux_hidden_state_layers	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)r  r  r  r  )r  )re   r  r  r  r  r@  r  rP   rP   rQ   rn     s   z)HunYuanVLForConditionalGeneration.forwardr  c                 C   s   | j |S rj   )r  compute_logits)re   r  rP   rP   rQ   r    s   z0HunYuanVLForConditionalGeneration.compute_logitsr  c                 C   s*   t | | jjr	dgnd d}|j|| jdS )Nzlm_head.)skip_prefixes)mapper)r:   r   tie_word_embeddingsr)  hf_to_vllm_mapper)re   r  loaderrP   rP   rQ   r)    s
   z.HunYuanVLForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r  zvisual.perceiver  )r  	connectortower_model)r   from_string_fieldr   rP   rP   rQ   get_mm_mapping  s
   z0HunYuanVLForConditionalGeneration.get_mm_mapping)%rI   rJ   rK   r;   r  supports_encoder_tp_datar   rq   r   rN   rO   r  classmethodrs   r  r   rb   rk  rW   r  r+  r  r  r  r3   r  r  r  r+   rn   r  r   r  r)  r   r  rt   rP   rP   rg   rQ   r    sh    	
9
 


$r  )trL   collections.abcr   r   r   r   	functoolsr   typingr   r   r	   r
   rN   torch.nnr   torch.nn.functionalr   ro   transformersr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   r   r    vllm.multimodal.parser!   r"   r#   r$   r%   vllm.multimodal.processingr&   r'   r(   r)   r*   vllm.sequencer+   *vllm.transformers_utils.configs.hunyuan_vlr,   r-   -vllm.transformers_utils.processors.hunyuan_vlr.   3vllm.transformers_utils.processors.hunyuan_vl_imager/   vllm.utils.tensor_schemar0   r1   
interfacesr3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   visionr>   rI   loggerr@   rS   rW   rM   r   rX   ru   r   r   r   r   rs   rO   r5  r6  r=  rl  rv  register_processorr  rP   rP   rP   rQ   <module>   s|    $	
$>)P8~
m9

