o
    
۾i                     @   sf  d Z ddlZddlmZmZ ddlmZ ddlZddl	Z	ddl	m
Z
 ddlmZ ddlmZ ddlmZmZmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z: ddl;m<Z<m=Z=m>Z>m?Z?m@Z@ dedeAfddZBdedeCfddZDG dd  d e
jEZFG d!d" d"e
jEZGG d#d$ d$e
jEZHG d%d& d&e
jEZIG d'd( d(e
jEZJedd)ddd*d+G d,d- d-e
jEZKG d.d/ d/e
jEe9e:e8ZLG d0d1 d1eLe7ZMG d2d3 d3eLZNG d4d5 d5eNZOG d6d7 d7eMZPdS )8zAInference-only HunYuan model compatible with HuggingFace weights.    N)CallableIterable)islice)nn)PretrainedConfig)support_torch_compile)CacheConfig
VllmConfigget_current_vllm_config)get_ep_groupget_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)
SiluAndMul)	Attention)SharedFusedMoE)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors)AttentionType   )MixtureOfExpertsSupportsEagle3SupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayeris_pp_missing_parametermake_layersmaybe_prefixconfigreturnc                 C   sR   t | dd }t|tr|dkS t|tr'|r'tdd |D r%t|dkS dS dS )Nnum_expertsr!   c                 s   s    | ]}t |tV  qd S N)
isinstanceint).0e r3   Y/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/hunyuan_v1.py	<genexpr>U   s    z_is_moe.<locals>.<genexpr>F)getattrr/   r0   listallmax)r+   r-   r3   r3   r4   _is_moeO   s   
r:   c                 C   s   t | ddsdS t | ddS )Nuse_claFr!   cla_share_factor)r6   )r+   r3   r3   r4   _get_cla_factor\   s   r=   c                       sT   e Zd Z				ddededededB d	ed
ededdf fddZdd Z  Z	S )
HunYuanMLPNF Thidden_sizeintermediate_size
hidden_actquant_configbiasprefixreduce_resultsr,   c                    sj   t    t||gd ||| dd| _t||||| d|d| _|dkr/td| dt | _d S )	N   .gate_up_proj)
input_sizeoutput_sizesrD   rC   rE   z
.down_proj)rI   output_sizerD   rC   rE   rF   siluzUnsupported activation: z!. Only silu is supported for now.)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)selfr@   rA   rB   rC   rD   rE   rF   	__class__r3   r4   rN   c   s*   


zHunYuanMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S r.   )rO   rR   rP   )rS   xgate_up_r3   r3   r4   forward   s   
zHunYuanMLP.forward)NFr?   T)
__name__
__module____qualname__r0   strr   boolrN   rY   __classcell__r3   r3   rT   r4   r>   b   s.    	 r>   c                          e Zd Z						ddededed	ed
ededB dededB dededdf fddZ		dde
jde
jdee
j dB de
jfddZ  ZS )HunYuanAttention    NFr?   r+   r@   	num_headsnum_kv_headsmax_position_embeddingsrC   rD   cache_configrE   layer_idr,   c              	      s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _t	|drN|j
rN|j
| _
nt	|drX|j| _
n| j| j | _
| j| j
 | _| j| j
 | _| j
d | _|| _t|dd| _|
| _t|| j
| j| j|||	 dd	| _t| j| j
 ||||	 d
d| _t| j
||jdd| _t| j| j
| j| j|||	 dd| _| jrt| j
|jd| _t| j
|jd| _d S d S )Nr   r!   head_dimattention_head_dim      use_qk_normF	.qkv_proj)r@   	head_sizetotal_num_headstotal_num_kv_headsrD   rC   rE   .o_projrI   rK   rD   rC   rE   Tmax_positionrope_parametersis_neox_style.attn)re   rg   rC   rE   eps) rM   rN   r@   r   ro   rd   rp   r9   re   hasattrri   rj   q_sizekv_sizescalingrf   r6   rl   rh   r   qkv_projr   o_projr   ru   
rotary_embr   attnr   rms_norm_epsquery_layernormkey_layernormrS   r+   r@   rd   re   rf   rC   rD   rg   rE   rh   tp_sizerT   r3   r4   rN      sr   







zHunYuanAttention.__init__	positionshidden_states	kv_statesc                 C   s   |  |\}}|j| j| j| jgdd\}}}| |||\}}|}	| jrA| |d| j| j	
 }| |d| j| j	
 }| |||}
|
|jd d}
| |
\}}||	|ffS )Nrc   dimr   )r~   splitr{   r|   r   rl   r   viewrd   ri   
contiguousr   re   r   shaper   )rS   r   r   r   qkvrX   qkvori_kattn_outputoutputr3   r3   r4   rY      s    zHunYuanAttention.forwardrb   NFNr?   rc   r.   rZ   r[   r\   r   r0   r   r^   r   r]   rN   torchTensortuplerY   r_   r3   r3   rT   r4   ra      sP    	
Ura   c                       r`   )HunYuanCrossAttentionrb   NFr?   rc   r+   r@   rd   re   rf   rC   rD   rg   rE   rh   r,   c              
      s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _t	|drK|j
| _
nt	|drU|j| _
n| j| j | _
| j| j
 | _| j| j
 | _| j
d | _|| _t|dd| _|
| _t|||||	 dd	| _t| j| j
 ||||	 d
d| _t| j
||jdd| _t| j| j
| j| j|||	 dtjd| _| jrt| j
|jd| _ t| j
|jd| _!d S d S )Nr   r!   ri   rj   rk   rl   F.q_projrD   rC   rE   rq   rr   Trs   rw   )re   rg   rC   rE   	attn_typerx   )"rM   rN   r@   r   ro   rd   rp   r9   re   rz   ri   rj   r{   r|   r}   rf   r6   rl   rh   r   q_projr   r   r   ru   r   r   r    ENCODER_DECODERr   r   r   r   r   r   rT   r3   r4   rN      sp   






zHunYuanCrossAttention.__init__r   r   r   c                 C   s   |d usJ |\}}|}|  |\}}t|}	| |||	\}}| jr@| |d| j| j	 }| 
|d| j| j	 }| |||}
|
|jd d}
| |
\}}|||ffS )Nrc   r   )r   r   
empty_liker   rl   r   r   rd   ri   r   r   re   r   r   r   )rS   r   r   r   r   r   r   r   rX   k_tmpr   r   r3   r3   r4   rY   F  s"   
zHunYuanCrossAttention.forwardr   r.   r   r3   r3   rT   r4   r      sP    	
Tr   c                       sV   e Zd Z				ddededB deded	ef
 fd
dZde	j
de	j
fddZ  ZS )HunYuanSparseMoeBlockNrc   r?   Fr+   rC   rh   rE   enable_eplbc                    s  t    t | _t j| _t j| _| j	 | _
|j| _| j|jkr0td| j d|j dt|jtrK|dks<J t|j|ksEJ |j| }n|j}|j}|jd urdt|jtr_|jn|j| }t }|jj}	|| _| j| _|	j| _| j| j | _| j| j
 | _| j| j | _| j| j | _t |j!|jdd | dd| _"|j#dkrt|j$tr|dksJ t|j$|ksJ |j$| }
n|j$}
t%|j!|j|
 |j&|d| dd	| _'nd | _'t(| j'| j||j!|d|d
k|| d| j| jd| _)d S )NzTensor parallel size z' is greater than the number of experts .r   Fz.gater   z.shared_mlp)r@   rA   rB   rC   rF   rE   r!   z.experts)shared_expertsr-   top_kr@   rA   rF   renormalizerC   rE   r   num_redundant_experts)*rM   rN   r   r   r   device_groupep_grouprank_in_groupep_ranksizeep_sizer-   n_routed_expertsrQ   r/   moe_topkr7   lenrA   moe_intermediate_sizer0   r
   parallel_configeplb_configr   n_logical_expertsr   n_redundant_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr   r@   gateuse_mixed_mlp_moenum_shared_expertr>   rB   
shared_mlpr   experts)rS   r+   rC   rh   rE   r   r   rA   vllm_configr   r   rT   r3   r4   rN   b  s   








	zHunYuanSparseMoeBlock.__init__r   r,   c                 C   sn   |j }|j d }|d|}| |\}}| j||d}| jd ur)|d |d  }| jdkr2t|}||S )Nrc   )r   router_logitsr   r!   )r   r   r   r   r   r   r   )rS   r   
orig_shape
hidden_dimr   rX   final_hidden_statesr3   r3   r4   rY     s   



zHunYuanSparseMoeBlock.forward)Nrc   r?   F)rZ   r[   r\   r   r   r0   r]   r^   rN   r   r   rY   r_   r3   r3   rT   r4   r   a  s"    _r   c                       s   e Zd Z					ddededB dedB ded	ed
eddf fddZ		dde
jde
jde
jdB dee
j dB dee
je
jf f
ddZ  ZS )HunYuanDecoderLayerNr?   rc   Fr+   rg   rC   rE   rh   r   r,   c                    s  t    |dksJ || _|j| _t|jtr|jn|j| | _t|dd}t|ddp2t|dd}t|}	|dkrD||	 dkrDt	j
nt	j}
|
t	jkrft|| j|jt|d|j||||| d|d	
| _n&|
t	j
krt|| j|jt|d|j||||| d|d	
| _ntd
|
 t|rt|||| d|d| _nt| j| j|j|t|dd| dd| _t|j|jd| _t|j|jd| _d S )Nr   rf   rb   attention_biasFrD   num_key_value_headsz
.self_attn)
r+   r@   rd   re   rf   rC   rD   rg   rE   rh   zUnsupported attention type: z.mlp)r+   rC   rh   rE   r   mlp_bias)r@   rA   rB   rC   rD   rE   rx   )rM   rN   rh   r@   r/   rA   r0   r6   r=   r    r   DECODERra   num_attention_heads	self_attnr   RuntimeErrorr:   r   mlpr>   rB   r   r   input_layernormpost_attention_layernorm)rS   r+   rg   rC   rE   rh   r   rf   r   
cla_factorattention_typerT   r3   r4   rN     s   
	






	zHunYuanDecoderLayer.__init__r   r   residualr   c                 C   s`   |d u r|}|  |}n|  ||\}}| j|||d\}}| ||\}}| |}|||fS )N)r   r   r   )r   r   r   r   )rS   r   r   r   r   ori_kv_statesr3   r3   r4   rY   *  s   


zHunYuanDecoderLayer.forward)NNr?   rc   Fr.   )rZ   r[   r\   r   r   r   r]   r0   r^   rN   r   r   r   rY   r_   r3   r3   rT   r4   r     sB    Yr   rc   )	input_idsr   intermediate_tensorsinputs_embeds)dynamic_arg_dimsc                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
dejfddZdeeeeeef  fddZdeeeejf  fddZ  ZS )HunYuanModelr?   rE   r   rE   c                   s   t    |jj|j |j|jj}|jj|j	| _	| _
| _j| _j| _t js4jr?t jr?t| jjd| _nt | _tj fdd| dd\| _| _| _t jritjjd| _nt | _ttdf  | _ d S )N)rC   c                    s"   t t| dd  | dS )Nr   rc   )r+   rh   rg   rC   rE   r   )r   r0   r   r   rg   r+   r   rC   r3   r4   <lambda>k  s    z'HunYuanModel.__init__.<locals>.<lambda>z.layersr   rx   .)!rM   rN   model_config	hf_configrg   rC   r   r   r   r   r+   pad_token_idpadding_idx
vocab_sizer   is_first_ranktie_word_embeddingsis_last_rankr   r@   embed_tokensr'   r)   num_hidden_layersstart_layer	end_layerlayersr   r   normr   r0   aux_hidden_state_layers)rS   r   rE   r   rT   r   r4   rN   N  s>   

zHunYuanModel.__init__r   r,   c                 C   s
   |  |S r.   )r   rS   r   r3   r3   r4   embed_input_ids{     
zHunYuanModel.embed_input_idsNr   r   r   c                 C   s  t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j}d }g }	tt| j| j| j	D ],\}
}|
| j
v rE|	||  |||||\}}}t| jddr_|
| dkr_|}q5d }q5t  jsmt||dS | ||\}}t|	dkr||	fS |S )Nr   r   r;   Fr   r   r   )r   r   r   r=   r+   	enumerater   r   r   r   r   appendr6   r   r   r   r   )rS   r   r   r   r   r   r   r   prev_kv_statesaux_hidden_statesilayerr   rX   r3   r3   r4   rY   ~  sB   



zHunYuanModel.forwardr   c           
      C   s   | j j}t| j d| j j}|| }| j j}t| j dr | j j}nt| j dr+| j j}n| j j| }|||d ||}tj	||ddfdd\}}}	|d|}|d|}|	d|}	t
|||	fS )Nr   ri   rj   rG   r!   r   rc   )r+   r   r6   r@   rz   ri   rj   reshaper   r   concat)
rS   r   r   re   num_key_value_groupsr@   rj   r   r   r   r3   r3   r4   _split_qkv_weight  s&   

zHunYuanModel._split_qkv_weightc                 C   s*   t | jrtj| ddd| jj| jdS g S )N	gate_projrP   up_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer-   r   )r:   r+   r   make_expert_params_mappingr-   r   rS   r3   r3   r4   get_expert_mapping  s   
	zHunYuanModel.get_expert_mappingweightsc           !   
   C   s  t | j}g d}| jj}t| jd| jj}dddddgd fdd||d  d	|fd
|fd|fg| jfg}t|  }t }|  }	|D ]\}
}d|
v rMqCd|
v rW|
	dd}
d|
v ra|
	dd}
d|
v sid|
v rjqC| jj
rsd|
v rsqC| jd ur| j|
 }r|| }t|dt}|d }||| qCd}|D ]\\}}}||
vrqd|
v rq|dkrtd|
}|rt|ddd }|dkr|| dkrq|
	||}
|
dr|
|vrqt|
| rq||
 }|j}|||| ||
 d} |rqC|D ]m\}}}}}||
vrq|
	||}
|
dr|
|vrqt|
| r q|jd | dks,J |jd | }||
 }|j}d}|D ]'\}}|||  }|rX|||||| | n
||||| | |}q> n|
drt|
|vrtqCd}|	D ]@}|\}}}}||
vrqxd}|
	||}t|| rqx|| }ttdtf |j}||||||dd } | r|}
 n2qx|rqCt|
|}
|
d u rqCt|
| rqCd!|
v r|
	d"d#}
||
 }t|dt}||| ||
 qC|S )$N))rm   r   r   )rm   z.k_projr   )rm   z.v_projr   )rH   z
.gate_projr   )rH   z.up_projr!   r   rH   z.gate_and_up_projrG   )r!   r!   )r   r!   rm   r   r   r   zrotary_emb.inv_freqgate_proj_biaszgate_proj.biasup_proj_biaszup_proj.biaszrotary_emb.cos_cachedzrotary_emb.sin_cachedzlm_head.weightweight_loaderr   Fzmlp.expertsr   zlayers\.\d+r   rc   r!   z.biasT.)shard_id	expert_idreturn_successzmlp.gate.wg.zwg.r?   )r=   r+   r   r6   r   dictnamed_parameterssetr  replacer   rC   get_cache_scaler   researchr0   groupr   endswithr(   r  addr   typingcastr   r^   r   )!rS   r  r   stacked_params_mappingr   re   split_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
scale_nameparamr  is_found
param_nameweight_namer  matchrh   densplit_paramfuncunitsoffsetnum
new_offsetis_expert_weightmappingr  name_mappedsuccessr3   r3   r4   load_weights  s  
	










zHunYuanModel.load_weightsr.   )rZ   r[   r\   r	   r]   rN   r   r   r   r   rY   r   r7   r   r0   r  r   r.  r_   r3   r3   rT   r4   r   C  s$    -
2$r   c                       s  e Zd Zg dddgdZdddedef fd	d
Zdeedf ddfddZ	deedf fddZ
		d&dejdB dejdedB dejdB dejeB f
ddZdejdejdB fddZdedejdejdefdd Zd!eeeejf  dee fd"d#Zdejdejfd$d%Z  ZS )'HunyuanV1ModelBase)r   k_projv_projr   r   )r~   rO   r?   r   r   rE   c                   s   t    |jj}|j}|| _|| _t|dd| _t j	rEt
|j|j|t|dd| _|jr5| jjj| j_t|dd}t|j|d| _d S t | _d S )Nmodelr   rE   lm_head)rC   rE   logit_scaleg      ?)scale)rM   rN   r   r   rC   r+   r   r2  r   r   r   r   r@   r*   r4  r   r   weightr6   r   logits_processorr'   )rS   r   rE   r+   rC   r5  rT   r3   r4   rN     s(   
zHunyuanV1ModelBase.__init__r   .r,   Nc                 C   s   || j _d S r.   )r2  r   )rS   r   r3   r3   r4   set_aux_hidden_state_layers     z.HunyuanV1ModelBase.set_aux_hidden_state_layersc                 C   s   t | jj}d|d |d fS )NrG      )r   r2  r   )rS   
num_layersr3   r3   r4   "get_eagle3_aux_hidden_state_layers  s   z5HunyuanV1ModelBase.get_eagle3_aux_hidden_state_layersr   r   r   r   c                 C   s   |  ||||}|S r.   )r2  )rS   r   r   r   r   model_outputr3   r3   r4   rY     s   zHunyuanV1ModelBase.forwardr   c                 C   s   |  | j|}|S r.   )r8  r4  )rS   r   logitsr3   r3   r4   compute_logits  s   z!HunyuanV1ModelBase.compute_logits
batch_sizedtypedevicec                 C   s6   t tj|| jjf||dtj|| jjf||ddS )N)rB  rC  r   )r   r   zerosr+   r@   )rS   rA  rB  rC  r3   r3   r4   make_empty_intermediate_tensors  s   z2HunyuanV1ModelBase.make_empty_intermediate_tensorsr  c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r&   r+   r   r.  )rS   r  loaderr3   r3   r4   r.    s
   
zHunyuanV1ModelBase.load_weightsc                 C   s   | j |S r.   )r2  r   r   r3   r3   r4   r     r:  z"HunyuanV1ModelBase.embed_input_ids)NN)rZ   r[   r\   packed_modules_mappingr	   r]   rN   r   r0   r9  r=  r   r   r   rY   r@  rB  rC  rE  r   r  r.  r   r_   r3   r3   rT   r4   r/    sL    


$r/  c                       s`   e Zd Zdddedef fddZdeded	d
fddZd	ee	eeeef  fddZ
  ZS )HunYuanMoEV1Baser?   r   r   rE   c                   s   t  j||d g | _d| _g | _d }| jjD ] }t|trqt|t	s&J t|j
tr7|j
}| j|j
j q|d u r@tdt| j| _|j| _|j| _|j| _|j| _|j| _d S )Nr3  r!   z*No HunYuanMoE layer found in model.layers.)rM   rN   expert_weightsnum_expert_groups
moe_layersr2  r   r/   r'   r   r   r   r   r   r   r   num_moe_layersr   num_logical_expertsr   num_physical_expertsr   num_local_physical_expertsr   num_routed_expertsr   r   )rS   r   rE   example_layerr   rT   r3   r4   rN     s*   
zHunYuanMoEV1Base.__init__rO  rP  r,   Nc                 C   sh   | j |ksJ || _|| _ || j | _| jjD ]}t|jtr1|j}||_	||_
| j|_|j  qd S r.   )rP  rO  rN  r   r2  r   r/   r   r   r   r   r   r   update_expert_map)rS   rO  rP  r   moer3   r3   r4    update_physical_experts_metadata  s   
z1HunYuanMoEV1Base.update_physical_experts_metadatac                 C   s
   | j  S r.   )r2  r  r  r3   r3   r4   r    r   z#HunYuanMoEV1Base.get_expert_mapping)rZ   r[   r\   r	   r]   rN   r0   rU  r7   r   r  r_   r3   r3   rT   r4   rI    s    
&rI  c                       s,   e Zd Zdddedef fddZ  ZS )HunYuanDenseV1Baser?   r   r   rE   c                   s   t  j||d d S )Nr3  )rM   rN   )rS   r   rE   rT   r3   r4   rN     s   zHunYuanDenseV1Base.__init__)rZ   r[   r\   r	   r]   rN   r_   r3   r3   rT   r4   rV    s    $rV  c                   @      e Zd ZdS )HunYuanDenseV1ForCausalLMNrZ   r[   r\   r3   r3   r3   r4   rX        rX  c                   @   rW  )HunYuanMoEV1ForCausalLMNrY  r3   r3   r3   r4   r[     rZ  r[  )Q__doc__r  collections.abcr   r   	itertoolsr   regexr  r   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r	   r
   vllm.distributedr   r   r   r   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   vllm.v1.attention.backendr    
interfacesr"   r#   r$   r%   utilsr&   r'   r(   r)   r*   r^   r:   r0   r=   Moduler>   ra   r   r   r   r   r/  rI  rV  rX  r[  r3   r3   r3   r4   <module>   s^   	(kltn
  AZ1