o
    پiz                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZmZmZmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ d dlmZmZmZmZmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8m9Z9m:Z: d dl;m<Z= d dl>m?Z@ d dlAmBZB d dlCmDZD d dlEmFZFmGZGmHZH G dd dejIZJG dd  d ejIZKG d!d" d"ejIZLG d#d$ d$ejIZMG d%d& d&ejIZNeNZOdS )'    )Iterable)OptionalN)	rearrange)nn)KimiLinearConfig)divideget_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)FusedRMSNormGatedfused_kda_gate)get_attention_tp_rankget_attention_tp_size)RMSNorm)ColumnParallelBatchedLinearColumnParallelLinear"MergedColumnParallelRepeatedLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)get_moe_impl_class)FusedMoE)TopKTopKOutputFormat)QuantizationConfig)RadixLinearAttention)PPMissingLayer)ParallelLMHeadVocabParallelEmbedding)get_is_capture_mode)ForwardBatchPPProxyTensors)default_weight_loadermaybe_remap_kv_scale_namesharded_weight_loader)DeepseekV2AttentionMLA)LlamaMLP)maybe_prefix)make_layers)BumpAllocator
add_prefixset_weight_attrsc                       s^   e Zd Z				ddedee dededeej	j
 f
 fd	d
ZdejdejfddZ  ZS )KimiMoEN r   configquant_configprefix	layer_idx
alt_streamc                    s6  t    |j}|j}|j}|j}	|j}
t | _|j	| _	|j
| _
|| _|| _|jdkr4td|j dt||	dd | dd| _tt|	| j_t||j|j|j|j| j|| j	td|d| _t|j|
d	|j|j| jj|| j	| jj|d u r|tj nd d

| _!| j
d ur|| j
 }t"|j||j|dd| _#d S d S )NsiluzUnsupported activation: z!. Only silu is supported for now.Fz.gatebiasr1   r2   experts)num_expertstop_khidden_sizeintermediate_sizelayer_idr1   routed_scaling_factorr2   T)
r:   renormalizeuse_grouped_topknum_expert_group
topk_groupcorrection_biasr1   r>   %apply_routed_scaling_factor_on_outputoutput_format)r;   r<   
hidden_actr1   reduce_results)$super__init__r;   r<   moe_intermediate_sizer9   moe_renormalizer	   tp_sizer>   num_shared_expertsr3   r4   rF   
ValueErrorr   gater   	Parametertorchemptye_score_correction_biasr   n_routed_expertsnum_experts_per_tokenr,   r8   r   rA   rB   )should_fuse_routed_scaling_factor_in_topkr   STANDARDtopkKimiMLPshared_experts)selfr0   r1   r2   r3   r4   r;   r<   rJ   r9   rK   	__class__ Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/kimi_linear.pyrI   7   sn   



zKimiMoE.__init__hidden_statesreturnc           
      C   s>  |j \}}|d|}d }| jd urd| jd urd|j d dkrdt rdtj }| j| | 	|
 }tj| j | |\}}| ||}| ||}	W d    n1 sXw   Y  || j n$| jd uru|j d dkru| 	|}| |\}}| ||}| ||}	|d ur|	| }	| jdkrt|	}	|	||S )Nr      )shapeviewr4   rM   r!   rQ   cudacurrent_streamwait_streamrZ   clonestreamrO   rX   r8   rL   r
   )
r[   r`   
num_tokensr;   shared_outputrg   router_logits_topk_outputfinal_hidden_statesr^   r^   r_   forward   s6   





zKimiMoE.forward)Nr/   r   N)__name__
__module____qualname__r   r   r   strintrQ   rf   StreamrI   Tensorrq   __classcell__r^   r^   r\   r_   r.   6   s"    
Ir.   c                       s   e Zd Z			ddedededee ded	ed
df fddZ	de
jfddZde
jfddZde
jde
jdeded
df
ddZ  ZS )KimiDeltaAttentionNh㈵>r/   r3   r;   r0   r1   rms_norm_epsr2   ra   c                    s  t    t | _t | _|| _|| _|jd | _	|jd | _
|jd | _|jd | _|jd | _|j| _|| _|| _| j
| j dksEJ t| j
| j| _| j	| j
 }|jd | _|d u | _| jr|||| j
g| _| j	| j	g| _t| j| j| j|| dd| _d| | j | j
| j d| j	 g| _td| j	||jd	| _ndt }	t| j| j	| j
| jd
||	| j| j| dd
| _ t!| j| j	d
|| dd| _"t#| j	|d
|| dd| _$t#| j| j
d
|| dd| _%t!| j| j	d
|| dd| _&t#| j	|d
|| dd| _'t()t*j+t|| jt*j,d	| _-t.| j-dt/di t#| j|d
t*j,| dd| _0t#| j|d
t*j,| dd| _1t#| j|d
t*j,| dd| _2| j0j3j45d| j0j3_4| j1j3j45d| j1j3_4| j2j3j45d| j2j3_4t()t*j+dd| jdt*j,d	| _6t.| j6dt/di t7| j	|dd| _8t9|| jd
|| dd| _:| j0j3;| j0j3<d| j0j3<d| _=| j1j3;| j1j3<d| j1j3<d| _>| j2j3;| j2j3<d| j2j3<d| _?| j=| j>| j?f}
| j0j@| j1j@| j2j@f}tA| j| j| j | j| j | j| j | j| j| j|
|| j6| j-d| _Bd S )Nhead_dim	num_headsr   short_conv_kernel_size.fused_qkvbfg_a_projr1   r2         )dtypeF	.qkv_proj)r7   r1   tp_rankrL   v_head_sizer2   	.f_a_projr6   	.f_b_proj.b_proj	.g_a_proj	.g_b_projweight_loaderz	.q_conv1d)
input_sizeoutput_sizer7   params_dtyper2   z	.k_conv1dz	.v_conv1drc   sigmoid)eps
activationz.o_proj)r=   num_q_headsnum_k_headsnum_v_heads
head_q_dim
head_k_dim
head_v_dimconv_weightsr7   A_logdt_bias)CrH   rI   r	   rL   r   attn_tp_sizer;   r0   linear_attn_configr}   r~   r   r   r   
v_head_dimr   r3   r2   r   local_num_heads	conv_sizedo_fuse_qkvbfg
qkvb_sizesfg_sizesr   fused_qkvbfg_a_projsplit_sizesr   r   fused_fg_b_projr   r   qkv_projr   f_a_projr   f_b_projb_projg_a_projg_b_projr   rP   rQ   rR   float32r   r-   r&   q_conv1dk_conv1dv_conv1dweightdata	unsqueezer   r   o_normr   o_projre   sizeq_conv_weightsk_conv_weightsv_conv_weightsr7   r   attn)r[   r3   r;   r0   r1   r|   r2   kwargsprojection_sizeattn_tp_rankr   r7   r\   r^   r_   rI      s2  







zKimiDeltaAttention.__init__r`   c                 C   sX   |  |\}}| |d }| | |d d }| | |d d }||||fS )Nr   )r   r   r   r   r   r   )r[   r`   qkvrn   betaforget_gateg_proj_statesr^   r^   r_   forward_qkvbfge  s   z!KimiDeltaAttention.forward_qkvbfgc                 C   sP   |  |}tj|| jdd\}}}| |dd| jdd\}}||||fS )Nrb   dimr   r   rc   )r   rQ   splitr   r   re   r}   	transpose)r[   r`   fused_statesr   r   fg_a_statesr   r   r^   r^   r_   forward_qkvbfg_fusedt  s   
z'KimiDeltaAttention.forward_qkvbfg_fused	positionsforward_batchzero_allocatorc                 C   s   | j r| |\}}}}n	| |\}}}}| }|j s3t|| j| j| j	d}|
 }|d}|d}| j||||d}	t|d| jd}
| |	|
}	t|	d}	| |	d S )N)g_biasr   )	mixed_qkvabz... (h d) -> ... h d)dz1 n h d -> n (h d))r   r   r   floatforward_mode	is_decoder   r   r}   r   r   r   r   r   r   r   )r[   r`   r   r   r   r   r   r   r   core_attn_out	norm_gater^   r^   r_   rq     s2   



zKimiDeltaAttention.forward)Nr{   r/   )rr   rs   rt   rv   r   r   r   r   ru   rI   rQ   rx   r   r   r"   r+   rq   ry   r^   r^   r\   r_   rz      sB    	 ?rz   c                       s   e Zd Z			ddededee dedeej	j
 ddf fd	d
Zdejdejdedeej dedeejejf fddZ  ZS )KimiDecoderLayerNr/   r0   r3   r1   r2   r4   ra   c                    s  t    |j| _|| _|j| _||r$t||j||| dd| _nt|| j|j	|| d||j
|j|j|j|jdd| _| jrf|jd urf||jkrf||j dkrft|||| d| jd| _| j| _nt| j|j|j|| dd| _t|j|jd	| _t|j|jd	| _d S )
Nz
.self_attn)r3   r;   r0   r1   r2   T)r=   r;   r~   r1   r2   r0   qk_nope_head_dimqk_rope_head_dimr   q_lora_rankkv_lora_rank	skip_roper   z.mlp)r0   r1   r3   r2   r4   )r;   r<   rF   r1   r2   r   )rH   rI   r;   r4   is_moeis_kda_layerrz   	self_attnKimiMLAAttentionnum_attention_headsr   r   r   r   r   r9   first_k_dense_replacemoe_layer_freqr.   block_sparse_moemlprY   r<   rF   r   r|   input_layernormpost_attention_layernorm)r[   r0   r3   r1   r2   r4   r\   r^   r_   rI     sd   





zKimiDecoderLayer.__init__r   r`   r   residualr   c                 C   s\   |d u r|}|  |}n|  ||\}}| j||||d}| ||\}}| |}||fS )N)r`   r   r   r   )r   r   r   r   )r[   r   r`   r   r   r   r^   r^   r_   rq     s   	
zKimiDecoderLayer.forward)Nr/   N)rr   rs   rt   r   rv   r   r   ru   rQ   rf   rw   rI   rx   r"   r+   tuplerq   ry   r^   r^   r\   r_   r     s<    
Ar   c                       sp   e Zd Z		ddedee def fddZ		ddej	dB d	ej	d
e
dej	dB dee dej	fddZ  ZS )KimiLinearModelNr/   r0   r1   r2   c                    s   t     _ j_ j_t _jjr&t	 j j
| dd_nt _tj _t j fddjjjj| dd\___jjr[t j
 jd_nt _t } j| dksmJ d	d S )
Nz.embed_tokensr2   c                    s   t |  |jdS )N)r3   r0   r1   r2   r4   )r   r4   )idxr2   r0   r1   r[   r^   r_   <lambda>,  s    z*KimiLinearModel.__init__.<locals>.<lambda>z.layers)pp_rankpp_sizer2   r   r   z3num_attention_heads must be divisible by world_size)rH   rI   r0   pad_token_idpadding_idx
vocab_sizer   pp_groupis_first_rankr    r;   embed_tokensr   rQ   rf   rw   r4   r*   num_hidden_layersrank_in_group
world_sizelayersstart_layer	end_layeris_last_rankr   r|   normr	   r   )r[   r0   r1   r2   r   r\   r   r_   rI     s6   

zKimiLinearModel.__init__	input_idsr   r   inputs_embedspp_proxy_tensorsra   c              	   C   s0  t  jr|d ur|}n| |}d }n|d usJ |d }|d }| j| j }|j}	t|d tj|	d}
g }t	| j| jD ]*}t
 |}| | j| }||||||
d\}}W d    n1 sbw   Y  q=| jjsst||dS |jd dkr|d u r| |}n| ||\}}t|dkr|S ||fS )Nr`   r   r   )buffer_sizer   device)r   r`   r   r   r   )r`   r   r   )r   r   r   r   r   r  r+   rQ   r   ranger   with_current_layerr   r   r   r#   rd   r   len)r[   r  r   r   r  r  r`   r   total_num_layersr  r   aux_hidden_statesictxlayerrn   r^   r^   r_   rq   B  sT   


zKimiLinearModel.forwardNr/   NN)rr   rs   rt   r   r   r   ru   rI   rQ   rx   r"   r#   rq   ry   r^   r^   r\   r_   r     s2    6r   c                       s   e Zd Z		ddedee deddf fddZe	 		dd	ej
d
ej
dedeej
 dee dej
fddZdeeeej
f  fddZ  ZS )KimiLinearForCausalLMNr/   r0   r1   r2   ra   c                    s   t    || _|| _t||t|dd| _t | _| jj	r/t
| jj| jj|t|dd| _nt | _t| jdd}t||d| _d S )Nmodelr   lm_headr   logit_scaleg      ?)r0   r  )rH   rI   r0   r1   r   r)   r  r   r   r   r   r   r;   r  r   getattrr   logits_processor)r[   r0   r1   r2   r  r\   r^   r_   rI     s"   

zKimiLinearForCausalLM.__init__r  r   r   r  r  c                 C   s0   |  |||||}| jjr| ||| j|S |S )N)r  r   r   r  r  )r[   r  r   r   r  r  r`   r^   r^   r_   rq     s   	
zKimiLinearForCausalLM.forwardweightsc              	   C   s  g d}| j jrtjddd| j jd}ng }t|  }t }|D ]}|d d \}}t|dkr5|d ni }	d|v r<q!d|v sDd	|v rEq!|D ]g\}
}}||vrQqGd
|v rZ||vrZqG|
dv r|t	|
dd }| j |snqG| jj| j}t|dds|qG|dv rt	|
dd }| j |sqG|||
}|dr||vrqG|| }|j}||||  nTt|D ]$\}\}
}}}||vrq|||
}|| }|j}||||||d  n+|dr||vr| j jsq!t||}|d u rq!|| }t|dt}|||fi |	 || q!| j jD ]E}| jj| j}|jjdd|j|j fj
|j|jgdd\}}|dd dd|_ | dd|_!t"|jdrQ|jj#|_$qd S )N)).gate_up_projz
.gate_projr   )r  z.up_projrc   )r   .q_projr   )r   .k_projrc   )r   .v_projr   )r   r   r   )r   r      )r   r      ).fused_fg_b_projr   r   )r  r   rc   )r   r  q)r   r  k)r   r  vw1w2w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer9   r   zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmlp.experts.>   r  r   .r   F>   r  r  r  z.bias)	expert_idshard_idr   r   rb   rc   r   weight_scale)%r0   r   r   make_expert_params_mappingr9   dictnamed_parameterssetr  rv   r   r   r  r   r   r  replaceendswithr   	enumerateis_linear_attnr%   r$   addfull_attention_layer_ids	kv_b_projr   	unflattenr   r   r   
contiguousw_kcw_vchasattrr*  w_scale)r[   r  stacked_params_mappingexpert_params_mappingparams_dictloaded_paramsargsnameloaded_weightr   
param_nameweight_namer)  r=   r  paramr   r   r(  r   r8  r9  r^   r^   r_   load_weights  s   


z"KimiLinearForCausalLM.load_weightsr  r  )rr   rs   rt   r   r   r   ru   rI   rQ   no_gradrx   r"   r#   rq   r   r   rF  ry   r^   r^   r\   r_   r  ~  s:    $r  )Pcollections.abcr   typingr   rQ   einopsr   r   sglang.srt.configs.kimi_linearr   sglang.srt.distributedr   r   r	   r
   #sglang.srt.eplb.expert_distributionr   #sglang.srt.layers.attention.fla.kdar   r   sglang.srt.layers.dp_attentionr   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   r   r   "sglang.srt.layers.logits_processorr   "sglang.srt.layers.moe.ep_moe.layerr   ,sglang.srt.layers.moe.fused_moe_triton.layerr   sglang.srt.layers.moe.topkr   r   *sglang.srt.layers.quantization.base_configr   (sglang.srt.layers.radix_linear_attentionr   sglang.srt.layers.utilsr   *sglang.srt.layers.vocab_parallel_embeddingr   r    +sglang.srt.model_executor.cuda_graph_runnerr!   ,sglang.srt.model_executor.forward_batch_infor"   r#   $sglang.srt.model_loader.weight_utilsr$   r%   r&   sglang.srt.models.deepseek_v2r'   r   sglang.srt.models.llamar(   rY   sglang.srt.models.transformersr)   sglang.srt.utilsr*   sglang.srt.utils.commonr+   r,   r-   Moduler.   rz   r   r   r  
EntryClassr^   r^   r^   r_   <module>   sJ    p  ^n 2