o
    پiy                     @   s  d Z ddlZddlmZmZmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z, ddl-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4 e Z5dede6fddZ7dede8fddZ9G dd de	j:Z;G dd de	j:Z<d d! Z=d"d# Z>G d$d% d%e	j:Z?G d&d' d'e	j:Z@G d(d) d)e	j:ZAG d*d+ d+e	j:ZBG d,d- d-eBZCeBeCgZDdS ).zAInference-only HunYuan model compatible with HuggingFace weights.    N)AnyDictIterableOptionalTuple)nn)PretrainedConfig)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)ExpertDistributionRecorder)
SiluAndMul)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)FusedMoE)TopK)QuantizationConfig)RadixAttention)get_rope)create_sampler)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loaderkv_cache_scales_loadermaybe_remap_kv_scale_name)is_hipconfigreturnc                 C   sD   t | dd r t| jtr| jdkst| jtr t| jdkr dS dS )Nnum_experts   TF)getattr
isinstancer$   intlistmaxr"    r,   M/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/hunyuan.py_is_moe<   s   


r.   c                 C   s   t | ddsdS t | ddS )Nuse_claFr%   cla_share_factor)r&   r+   r,   r,   r-   _get_cla_factorF   s   r1   c                       sT   e Zd Z				ddedededee d	ed
ededdf fddZdd Z	  Z
S )
HunYuanMLPNF Thidden_sizeintermediate_size
hidden_actquant_configbiasprefixreduce_resultsr#   c                    sj   t    t||gd ||| dd| _t||||| d|d| _|dkr/td| dt | _d S )	N   .gate_up_proj)
input_sizeoutput_sizesr8   r7   r9   z
.down_proj)r=   output_sizer8   r7   r9   r:   siluzUnsupported activation: z!. Only silu is supported for now.)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)selfr4   r5   r6   r7   r8   r9   r:   	__class__r,   r-   rB   N   s*   


zHunYuanMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)rC   rF   rD   )rG   xgate_up_r,   r,   r-   forwardo   s   
zHunYuanMLP.forward)NFr3   T)__name__
__module____qualname__r(   strr   r   boolrB   rN   __classcell__r,   r,   rH   r-   r2   L   s.    	!r2   c                       sJ   e Zd Z		ddedee def fddZdej	d	ej	fd
dZ
  ZS )HunYuanSparseMoeBlockNr"   r7   layer_idc                    s^  t    t | _| j|jkrtd| j d|j dt|jtr7|dks(J t	|j|ks1J |j| }n|j}|j
}|jd urPt|jtrK|jn|j| }t|||dkrYdndd| _t|j|j|d||d	| _t|j|jdd d
| _|jdkrt|jtr|dksJ t	|j|ksJ |j| }n|j}t|j|j
| |j|dd| _d S d | _d S )NzTensor parallel size z' is greater than the number of experts .r   r%   TF)top_krW   renormalize)r$   r4   r5   r:   rW   r7   )r8   r7   )r4   r5   r6   r7   r:   )rA   rB   r
   tp_sizer$   rE   r'   moe_topkr)   lenr5   moe_intermediate_sizer(   r   topkr   r4   expertsr   gateuse_mixed_mlp_moenum_shared_expertr2   r6   
shared_mlp)rG   r"   r7   rW   rY   r5   rc   rH   r,   r-   rB   x   sb   



	

zHunYuanSparseMoeBlock.__init__hidden_statesr#   c           	      C   s   |j }|j d }|d|}d }| jd ur| |}| |\}}| ||}| ||}|d ur5|| }| jdkr>t|}||S )NrV   r%   )shapeviewrd   ra   r_   r`   r[   r   )	rG   re   
orig_shape
hidden_dimshared_outputrouter_logitsrM   topk_outputfinal_hidden_statesr,   r,   r-   rN      s   




zHunYuanSparseMoeBlock.forward)NrV   )rO   rP   rQ   r   r   r   r(   rB   torchTensorrN   rT   r,   r,   rH   r-   rU   v   s    ErU   c                 C   s0   t | dr
t| jS t | drt| jS td)Nhead_dimattention_head_dimz8Missing head dim config, try set head_dim in config.json)hasattrr(   rp   rq   rE   r+   r,   r,   r-   get_head_dim   s
   



rs   c                 C   s   | j | j }t| drF|| jkr't| ds'td| d| j d| j d t| drH| j| jkrJtd| j d| j d	d
| j d d S d S d S )Nrq   rp   z0HunYuan model config error: calculated head_dim z != attention_head_dim z
Please Add head_dim:z1 in config.json to make sure correctly inference.z%HunYuan model config error: head_dim(z) != attention_head_dim()z
Please change head_dim:)r4   num_attention_headsrr   rq   rE   rp   )r"   calc_head_dimr,   r,   r-   check_head_dim   s$   

rw   c                       s   e Zd Z								dded	ed
edededeeee	f  dedee
 dededededdf fddZ	ddejdejdedeeej  dejf
ddZ  ZS )HunYuanAttention'  N    Fr3   rG   rV   r"   r4   	num_headsnum_kv_heads
rope_thetarope_scalingmax_position_embeddingsr7   r8   r9   attention_typerW   r#   c              	      s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _t	|| _
t| | j| j
 | _| j| j
 | _| j
d | _|| _|| _t|dd| _|| _|| _|dkrt|| j
| j| j|	||
 dd| _n|d	krt|||	||
 d
d| _ntdt| j| j
 ||	||
 dd| _d}|d ur| dkrd}t| j
| j
||||d| _t| j| j
| j| j||
 dd| _ | jrt!| j
|j"d| _#t!| j
|j"d| _$d S d S )Nr   r%   g      use_qk_normFrG   	.qkv_proj)r4   	head_sizetotal_num_headstotal_num_kv_headsr8   r7   r9   cross.q_proj)r8   r7   r9   Not support attnention typez.o_proj)r=   r?   r8   r7   r9   Tgguf)
rotary_dimmax_positionbaser~   is_neox_stylez.attn)r|   rW   r9   eps)%rA   rB   r4   r
   r   r{   r   r*   r|   rs   rp   rw   q_sizekv_sizescalingr}   r   r&   r   r   rW   r   qkv_projr   q_projRuntimeErrorr   o_projget_namer   
rotary_embr   attnr   rms_norm_epsquery_layernormkey_layernorm)rG   r"   r4   r{   r|   r}   r~   r   r7   r8   r9   r   rW   r[   r   rH   r,   r-   rB      s   



	

	zHunYuanAttention.__init__	positionsre   forward_batch	kv_statesc                 C   sD  | j dkrC| |\}}|j| j| j| jgdd\}}}	| |||\}}|}
| jrB| |d| j	
 }| |d| j	
 }nJ| j dkr|d usNJ |\}
}	|
}| |\}}t|}| |||\}}| jr| |d| j| j	
 }| |d| j| j	
 }ntd| |||	|}| |\}}||
|	ffS )NrG   rV   dimr   r   )r   r   splitr   r   r   r   r   reshaperp   
contiguousr   r   rn   
empty_likerg   r{   r|   r   r   r   )rG   r   re   r   r   qkvrM   qkvori_kk_tmpattn_outputoutputr,   r,   r-   rN   \  s8   
 

zHunYuanAttention.forward)ry   Nrz   NFr3   rG   rV   rJ   )rO   rP   rQ   r   r(   floatr   r   rR   r   r   rS   rB   rn   ro   r   r   rN   rT   r,   r,   rH   r-   rx      s`    	
irx   c                       s   e Zd Z			ddedee dededdf
 fd	d
Z	dde	j
de	j
dedee	j
 deee	j
  dee	j
e	j
f fddZ  ZS )HunYuanDecoderLayerNr3   rV   r"   r7   r9   rW   r#   c                    sb  t    |dksJ || _|j| _t|jtr|jn|j| | _t|dd}t|dd }|d ur<t|dd r<|j|d< t|dd}t|dd	pMt|d
d	}t	|}	|dkr^||	 dkr^dnd}
t
|| j|jt|d|j|||||| d|
|d| _t|rt|||d| _nt| j| j|j|t|dd	| dd| _t|j|jd| _t|j|jd| _d S )Nr   r}   ry   r~    original_max_position_embeddingsr   rz   attention_biasFr8   r   rG   num_key_value_headsz
.self_attn)r"   r4   r{   r|   r}   r~   r   r7   r8   r9   r   rW   )r"   r7   rW   mlp_biasz.mlp)r4   r5   r6   r7   r8   r9   r   )rA   rB   rW   r4   r'   r5   r(   r&   r   r1   rx   ru   	self_attnr.   rU   mlpr2   r6   r   r   input_layernormpost_attention_layernorm)rG   r"   r7   r9   rW   r}   r~   r   r   
cla_factorr   rH   r,   r-   rB     sp   




zHunYuanDecoderLayer.__init__r   re   r   residualr   c                 C   sb   |d u r|}|  |}n|  ||\}}| j||||d\}}| ||\}}| |}|||fS )N)r   re   r   r   )r   r   r   r   )rG   r   re   r   r   r   ori_kv_statesr,   r,   r-   rN     s   	


zHunYuanDecoderLayer.forward)Nr3   rV   rJ   )rO   rP   rQ   r   r   r   rR   r(   rB   rn   ro   r   r   rN   rT   r,   r,   rH   r-   r     s:    Lr   c                       s   e Zd Z		ddedee deddf fddZd	ej	dej	fd
dZ
	dd	eej	 dej	dedeej	 dej	f
ddZ  ZS )HunYuanModelNr3   r"   r7   r9   r#   c                    sp   t     | _ j| _ j| _ j| _t| j j| _	t
 fddt jD | _t j jd| _d S )Nc                    s   g | ]	}t  |d qS ))r"   rW   r7   )r   ).0rW   r"   r7   r,   r-   
<listcomp>  s    z)HunYuanModel.__init__.<locals>.<listcomp>r   )rA   rB   r"   pad_token_idpadding_idx
vocab_sizeorg_vocab_sizer   r4   embed_tokensr   
ModuleListrangenum_hidden_layerslayersr   r   norm)rG   r"   r7   r9   rH   r   r-   rB     s   
zHunYuanModel.__init__	input_idsc                 C   s
   |  |S rJ   )r   )rG   r   r,   r,   r-   get_input_embeddings  s   
z!HunYuanModel.get_input_embeddingsr   r   input_embedsc                 C   sn   |d ur|}n|  |}d }d }tt| jD ]}| j| }	|	|||||\}}}
	 d }q| ||\}}|S rJ   )r   r   r]   r   r   )rG   r   r   r   r   re   r   prev_kv_statesilayerr   rM   r,   r,   r-   rN   
  s$   


zHunYuanModel.forward)Nr3   rJ   )rO   rP   rQ   r   r   r   rR   rB   rn   ro   r   r   rN   rT   r,   r,   rH   r-   r     s2    r   c                       s   e Zd Zg dddgdZdddZdgZd	d
ddddZ	d$dedee	 ddf fddZ
	d$dejdejdedejdejf
ddZdejfddZdeeeejf  fdd Zd!eddfd"d#Z  ZS )%HunYuanMoEV1ForCausalLM)r   k_projv_proj	gate_projup_proj)r   rC   input_embeddingsoutput_embeddings)r   lm_headr   )r   r   )r   r%   )r   r;   )rC   r   )rC   r%   )r   r   r   r   r   Nr"   r7   r#   c                    s   t    || _t||dd| _|j| _t|j|j|d| _	|j
r(| jjj| j	_|j| _t|| _t| t|dd}t||d| _t | _d S )Nmodel)r9   )r7   logit_scaleg      ?)r   )rA   rB   r"   r   r   r   unpadded_vocab_sizer   r4   r   tie_word_embeddingsr   weightrs   rp   rw   r&   r   logits_processorr   sampler)rG   r"   r7   r   rH   r,   r-   rB   F  s"   

z HunYuanMoEV1ForCausalLM.__init__r   r   r   r   c                 C   s"   |  ||||}| ||| j|S rJ   )r   r   r   )rG   r   r   r   r   re   r,   r,   r-   rN   b  s   
zHunYuanMoEV1ForCausalLM.forwardr   c                 C   s   | j j}t| j d| j j}|| }|||d | j| j}tj||ddfdd\}}}|d| j}|d| j}|d| j}t|||fS )Nr   r;   r%   r   rV   )	r"   ru   r&   r   rp   r4   rn   r   concat)rG   r   ru   r|   num_key_value_groupsr   r   r   r,   r,   r-   _split_qkv_weightn  s   z)HunYuanMoEV1ForCausalLM._split_qkv_weightweightsc              	   C   s6  t | j}g d}| jj}t| jd| jj}dddddgd fdd||d  d	|fd
|fd|fg| jfg}t| jrEtjddd| jjd}ni }t	| 
 }|D ]H\}	}
d|	v rYqOd|	v rc|	dd}	d|	v rm|	dd}	d|	v sud|	v rvqO| jjrd|	v rqOd}|D ]Q\}}}||	vrqd|	v rq|dkrtd|	}|rt|ddd }|dkr|| dkrq|	||}	|	d r|	|vrq||	 }|j}|||
| d!} |rqO|D ]c\}}}}}||	vrq|	||}	|	d r|	|vrq|
jd | dksJ |
jd | }||	 }|j}d}|D ]'\}}|||  }|r.||||
|| | n
|||
|| | |}q nZ|	d rJ|	|vrJqO|D ]&}|\}}}}||	vr[qL|	||}	||	 }|j}|||
|	||d"  n%t|	|}	|	d u r~qOd#|	v r|	d$d%}	||	 }t|d&t}|||
 qOd S )'N))r   r   r   )r   z.k_projr   )r   z.v_projr   )r<   z
.gate_projr   )r<   z.up_projr%   r   r<   z.gate_and_up_projr;   )r%   r%   )r   r%   r   r   r   r   r   rD   r   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer$   zrotary_emb.inv_freqgate_proj_biaszgate_proj.biasup_proj_biaszup_proj.biaszrotary_emb.cos_cachedzrotary_emb.sin_cachedzlm_head.weightFzmlp.expertsr   zlayers\.\d+r   rX   rV   r%   z.biasT)shard_id	expert_idzmlp.gate.wg.zwg.r3   weight_loader)r1   r"   ru   r&   r   r.   r   make_expert_params_mappingr$   dictnamed_parametersreplacer   researchr(   groupr   endswithr   rf   r    r   )rG   r   r   stacked_params_mappingru   r|   split_params_mappingexpert_params_mappingparams_dictnameloaded_weightis_found
param_nameweight_namer   matchrW   paramr   densplit_paramfuncunitsoffsetnum
new_offsetmappingr   r,   r,   r-   load_weights  s   
	






z$HunYuanMoEV1ForCausalLM.load_weightsquantization_param_pathc                 C   s~   t  }t }t|||| jj| jjjD ])\}}t| jj	| t
js(| jj	| j}t r/|d9 }t|dr9||j_qtdd S )Nr;   kv_scalez8Self attention has no KV cache scaling factor attribute!)r
   r	   r   r"   r   rI   
model_typer'   r   r   r   Identityr   r!   rr   r   	_kv_scaler   )rG   r  r[   tp_rank	layer_idxscaling_factorlayer_self_attnr,   r,   r-   load_kv_cache_scales  s&   

z,HunYuanMoEV1ForCausalLM.load_kv_cache_scalesrJ   )rO   rP   rQ   packed_modules_mappingembedding_modulesembedding_padding_modules#bitsandbytes_stacked_params_mappingr   r   r   rB   rn   ro   r   rN   r   r   r   rR   r  r  rT   r,   r,   rH   r-   r   +  sP    !
 r   c                   @   s   e Zd ZdS )HunYuanDenseV1ForCausalLMN)rO   rP   rQ   r,   r,   r,   r-   r  ,  s    r  )E__doc__r   typingr   r   r   r   r   rn   r   transformersr   sglang.srt.distributedr	   r
   r   #sglang.srt.eplb.expert_distributionr   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   r   "sglang.srt.layers.logits_processorr   &sglang.srt.layers.moe.fused_moe_tritonr   sglang.srt.layers.moe.topkr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   sglang.srt.layers.samplerr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   r   r    sglang.srt.utilsr!   expert_distribution_recorderrS   r.   r(   r1   Moduler2   rU   rs   rw   rx   r   r   r   r  
EntryClassr,   r,   r,   r-   <module>   sL   
*\ cE  