o
    پiV                     @   s  d dl Zd dlZd dlmZmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZmZ d dlmZ d dl m!Z!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7m8Z8m9Z9m:Z:m;Z; d dl<m=Z> d dl?m@Z@mAZA d dlBmCZC d dlDmEZEmFZFmGZG d dlHmIZI d dlJmKZK d dlLmMZM d dlNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZVmWZWmXZX eW ZYeV ZZeX Z[e5 Z\eSdoeYZ]eR Z^eU Z_eT Z`eZrd d lambZb ne_re^rneYr'd d!lcmdZb n	 eeefZgG d"d# d#e	jhZiG d$d% d%e	jhZjG d&d' d'e	jhZkG d(d) d)e	jhZlG d*d+ d+e	jhZmG d,d- d-e	jhZnengZodS ).    N)IterableListOptionalTuple)nn)LongcatFlashConfig)$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)ModelConfigForExpertLocation)deep_gemm_wrapper)
SiluAndMul)LayerCommunicatorLayerScatterModes)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)zero_experts_compute_triton)	DeepEPMoEget_moe_impl_class)FusedMoE)StandardTopKOutputTopK)%filter_moe_weight_param_global_expert)QuantizationConfig)is_fp8_fnuz)block_quant_dequantblock_quant_to_tensor_quantchannel_quant_to_tensor_quantnormalize_e4m3fn_to_e4m3fnuzrequant_weight_ue8m0_inplace)block_dequant)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)maybe_executor_submitshould_async_load$should_deepgemm_weight_requant_ue8m0)default_weight_loader)DeepseekV2AttentionMLA)get_global_server_args)
BumpAllocator
add_prefixbind_or_assigncpu_has_amx_supportget_bool_env_varget_device_smis_cpuis_cudais_hipis_npuSGLANG_USE_AITER)awq_dequantize)awq_dequantize_tritonc                       sN   e Zd Z			ddedededee ded	ed
df fddZdd Z	  Z
S )LongcatFlashMLPNF hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixreturnc              	      sj   t    t||gd d|td|d| _t||d||td|d| _|dkr/td| d	t | _	d S )
N   Fgate_up_proj)biasrB   rD   	down_proj)rH   rB   rC   rD   siluUnsupported activation: !. Only silu is supported for now.)
super__init__r   r1   rG   r   rI   
ValueErrorr   act_fn)selfr?   r@   rA   rB   rC   rD   	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/longcat_flash.pyrN      s*   
	
zLongcatFlashMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)rG   rP   rI   )rQ   xgate_up_rT   rT   rU   forward   s   
zLongcatFlashMLP.forward)NFr>   )__name__
__module____qualname__intstrr   r   boolrN   rZ   __classcell__rT   rT   rR   rU   r=      s(     r=   c                       s4   e Zd Zdejdfdef fddZdd Z  ZS )LongcatFlashRouterr   r>   rD   c              	      sb   t    |j| _| j| | _|| _t|j| j|j|d td|d| _t	
tj| j|d| _d S )N
classifier)rH   params_dtyperB   rD   )dtype)rM   rN   n_routed_expertsrounter_params_dtyper   r?   router_biasr1   rc   r   	Parametertorchzerose_score_correction_bias)rQ   configzero_expert_numrg   rD   rR   rT   rU   rN      s   

zLongcatFlashRouter.__init__c                 C   s   |  || j\}}|S rV   )rc   torg   )rQ   hidden_stateslogitsrY   rT   rT   rU   rZ      s   zLongcatFlashRouter.forward)	r[   r\   r]   rj   float32r_   rN   rZ   ra   rT   rT   rR   rU   rb      s    rb   c                	       sV   e Zd Z		ddededee def fddZd	e	j
d
e	j
fddZdd Z  ZS )LongcatFlashMoENr>   rm   layer_idrB   rD   c              
      s  t    || _|| _|j| _|j| _|j| _|j	| _	|j
| _
|jdkr)tj| _ntj| _t | _| j|jkrDtd| j d|j d|jdkrRtd|j dt| j| j	| jtd|d	| _t| jd
d
| jjj|d| _| jj| j_t|| j| j| j|j|j|td|d| _d S )Nrr   zTensor parallel size z' is greater than the number of experts .rJ   rK   rL   router)rm   rn   rg   rD   F)top_krenormalizeuse_grouped_topkcorrection_biasrt   experts)num_expertsrw   rt   r?   r@   rB   rD   ) rM   rN   rm   rt   routed_scaling_factorrf   r|   moe_topkrw   rn   zero_expert_typerg   rj   rr   bfloat16r   tp_sizerO   rA   rb   r1   rv   r   rl   datatopkforward_nativerZ   r   r?   moe_intermediate_sizer{   )rQ   rm   rt   rB   rD   rR   rT   rU   rN      sZ   




zLongcatFlashMoE.__init__rp   rE   c                 C   s   |j \}}|d|}| |}| ||\}}}| jd ur)t||| j| j|d}t|||}	| ||	}
|
| j	9 }
| jd urN|j d dkrN|
|
|
j7 }
| jdkrWt|
}
|
||S )N)expert_indicesexpert_scalesr|   r   rp   r      )shapeviewrv   r   r   r   r|   r   r{   r}   ro   devicer   r	   )rQ   rp   
num_tokens
hidden_dimrouter_logitstopk_weightstopk_idxrY   zero_expert_resulttopk_outputfinal_hidden_statesrT   rT   rU   rZ     s.   





zLongcatFlashMoE.forwardc                    s    fdd j  D S )Nc                    s.   g | ]\}}|d vrt || jjr|jqS ))rz   )r   r{   num_local_expertsr   ).0namerW   rQ   rT   rU   
<listcomp>'  s    
z3LongcatFlashMoE.get_moe_weights.<locals>.<listcomp>)r{   named_parametersr   rT   r   rU   get_moe_weights&  s   
zLongcatFlashMoE.get_moe_weightsNr>   )r[   r\   r]   r   r^   r   r   r_   rN   rj   TensorrZ   r   ra   rT   rT   rR   rU   rs      s    =rs   c                       s   e Zd Z			ddededee dedeej	j
 ddf fd	d
Zdejdejdedeej dedejfddZdd Z  ZS )LongcatFlashDecoderLayerNr>   rm   rt   rB   rD   
alt_streamrE   c                    sJ  t     _ j__|_t fddtdD _	t fddtdD _
t fddtdD _t fddtdD _tj tdd_t _t _ fd	dtdD _fd
dtdD _tjj jdddd_tjj
d jd j	d jd_d S )Nc                    s   g | ]U}t di d  d jd jd jd jd jd jd jd jd	d
d j	ddt
 dg v r<d
ndd | dddtd| djqS )rm   r?   	num_headsqk_nope_head_dimqk_rope_head_dim
v_head_dimq_lora_rankkv_lora_rank
rope_thetarope_scalingNmax_position_embeddingsrB   	self_attndisable_quant_modulert   rF   rC   FrD   z
self_attn.r   rT   )r.   r?   num_attention_headsr   r   r   r   r   r   r   getattrr1   r   r   irm   rt   rD   rB   rQ   rT   rU   r   A  sN    	
z5LongcatFlashDecoderLayer.__init__.<locals>.<listcomp>rF   c                       g | ]
}t  j jd qS epsr   r?   rms_norm_epsr   rm   rT   rU   r   ]      c                    r   r   r   r   r   rT   rU   r   `  r   c                    sD   g | ]}t  j j jd t dg v rdntd| dqS )mlpsr   Nzmlps.)r?   r@   rA   rB   rD   )r=   r?   r@   rA   r   r1   r   )rm   rD   rB   rT   rU   r   d  s    mlp)rt   rm   rB   rD   c              	      s,   g | ]}t jjd  |  jddddqS )rF   Frt   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparse)r   init_newrt   num_hidden_layersr   )rm   rQ   rT   rU   r   ~  s    	c                    s6   g | ]}t  j|  j|  j|  j| jd qS )layer_scatter_modesinput_layernormpost_attention_layernormqkv_latent_func)r   mlp_layer_scatter_modesr   r   r   prepare_qkv_latentr   r   rT   rU   r     s    
Tr   r   r   )rM   rN   rm   r?   rt   r   r   
ModuleListranger   r   r   r   rs   r1   r   r   attn_tp_sizer   attn_tp_rankr   mlp_layer_communicatorr   r   r   moe_layer_scatter_modesr   r   moe_layer_communicator)rQ   rm   rt   rB   rD   r   rR   r   rU   rN   3  s`   
	


z!LongcatFlashDecoderLayer.__init__	positionsrp   forward_batchresidualzero_allocatorc                 C   s   | j |||\}}|jd dkr| jd ||||d}| j |||\}}| }| }| |}| j |||\}}| |||||\}}|| }||fS )Nr   r   rp   r   r   )	r   prepare_attnr   r   prepare_mlpcloner   postprocess_layerforward_mlp)rQ   r   rp   r   r   r   moe_hidden_statesmoe_residualrT   rT   rU   rZ     s0   	

z LongcatFlashDecoderLayer.forwardc                 C   s   | j d |}t|}| jd |||\}}|jd dkr)| jd ||||d}| jd |||\}}| j d |}t|}| jd |||\}}||fS )Nr   r   r   )r   r	   r   r   r   r   r   r   )rQ   rp   r   r   r   r   rT   rT   rU   r     s*   


z$LongcatFlashDecoderLayer.forward_mlp)Nr>   N)r[   r\   r]   r   r^   r   r   r_   rj   cudaStreamrN   r   r)   r0   rZ   r   ra   rT   rT   rR   rU   r   1  s>    
o
&r   c                       sv   e Zd ZdZ		ddedee deddf fdd	Zde	j
fd
dZ	dde	j
de	j
dede	j
de	j
f
ddZ  ZS )LongcatFlashModelFNr>   rm   rB   rD   rE   c                    sv   t     j_t j jt d_tj	 _
t fddt jD _t j jd_g _d S )N)use_attn_tp_groupc              	      s,   g | ]}t  |td | jdqS )zlayers.)rB   rD   r   )r   r1   r   )r   rt   rm   rD   rB   rQ   rT   rU   r     s    z.LongcatFlashModel.__init__.<locals>.<listcomp>r   )rM   rN   
vocab_sizer(   r?   r   embed_tokensrj   r   r   r   r   r   r   r   layersr   r   normlayers_to_capturerQ   rm   rB   rD   rR   r   rU   rN     s   

zLongcatFlashModel.__init__c                 C   s   | j S rV   )r   r   rT   rT   rU   get_input_embeddings  s   z&LongcatFlashModel.get_input_embeddings	input_idsr   r   input_embedsc              	   C   s  t | j}|d ur|jn|j}t|d |jrdnd tj|d}|d u r*| |}n|}d }	g }
t|D ]3}|| j	v rB|

||	  t | | j| }|||||	|\}}	W d    n1 sbw   Y  q4|jd dkr|	d u ry| |}n| ||	\}}t |
dkr|S ||
fS )NrF   r   )buffer_sizere   r   r   )lenr   r   r0   can_run_tborj   rr   r   r   r   appendr
   with_current_layerr   r   )rQ   r   r   r   r   total_num_layersr   r   rp   r   aux_hidden_statesr   layerrY   rT   rT   rU   rZ     s:   




zLongcatFlashModel.forwardr   rV   )r[   r\   r]   fall_back_to_pt_during_loadr   r   r   r_   rN   rj   r   r   r)   rZ   ra   rT   rT   rR   rU   r     s4    r   c                       s   e Zd Zi Z		d!dedee deddf fddZde	j
fd	d
Ze 	d"dejdejdedejdejf
ddZd"ddZdd Zdeeeejf  fddZdd Zdd Zedd Zd"deee  fdd Z  ZS )#LongcatFlashForCausalLMNr>   rm   rB   rD   rE   c                    s   t    t|do|jd u| _| jrddg| jd< || _t | _|| _	t
||td|d| _t|j|j|td|t jd| _t|| _d	| _d S )
Nr   q_a_projkv_a_proj_with_mqafused_qkv_a_proj_with_mqamodel)rD   lm_head)rB   rD   r   F)rM   rN   hasattrr   fuse_qkv_a_projpacked_modules_mappingrm   r   r   rB   r   r1   r   r'   r   r?   r/   enable_dp_lm_headr   r   logits_processorcapture_aux_hidden_statesr   rR   rT   rU   rN   ?  s,   



z LongcatFlashForCausalLM.__init__c                 C   s   | j jS rV   )r   r   r   rT   rT   rU   r   b  s   z,LongcatFlashForCausalLM.get_input_embeddingsr   r   r   r   c                 C   s6   |  ||||}d }| jr|\}}| ||| j||S rV   )r   r   r   r   )rQ   r   r   r   r   rp   r   rT   rT   rU   rZ   e  s   zLongcatFlashForCausalLM.forwardc              	   C   s  |d u rt | jj}n t }|D ]}d|v r*t|dd }|| jjk r*|| q|D ]}t dD ]}| jj| j	| }t
|jdritsJtrXt|jj|jj|jjj}nt|jj|jj|jjdddj}n|jj}d}|jtjtjfv rt
| jdr| jjd ur| jj}	t
|jdsJ trt||jjd d	\}
}}n|}
|jj}tr|	d d
kr|	d d
krtjrtjst ddr|}d}n4t!|
||	tj"}n+t#|
||	\}}||_$ntrt||jj%d d	\}
}}n|}
|jj%}t&|
|\}}||_$|jtj'kr6t
| jdr(| jj}	|	d ur't
|jdsJ |}
|jj}t(|
||	)tj"}n|)tj"|jj%)tj" }|*dd|j+|j, fj|j+|j,gdd\}}|st-|j.|/dd0 /dd|_.t-|j1|0 /dd|_1t
|jdr|j$d u rt-|j$|jj%|_$tr| j$d9  _$t2rt3r|jtjkr|j.)tj"|j$ |_.|j1)tj"|j$ |_1nP|j+|	d  }|j,|	d  }|*dd|| fj||gdd\}}t-|j4|/dd0 |_4t-|j5|0 |_5t-|j.|/dd0 |_.t-|j1|0 |_1d|_6| jj7r|j8j j9| jj:| jj; d 9  _9| jj<r.|j=j j9| jj:| jj> d 9  _9q4q-dt_?t@tA| jdd drD| B  d S d S )N	kv_b_projru   rF   qweightr   Fweight_block_sizeweight_scale_inv)weightweight_scaleinput_scale   r   SGL_USE_DEEPGEMM_BMMfalseTr   dimr  g       @g      ?)r   )Cr   rm   r   setr^   splitaddr   r   r   r   r   _is_cuda_is_hipr;   r   scalesqzerosTr  re   rj   float8_e4m3fnfloat8_e4m3fnuzrB   r   _is_fp8_fnuzr$   r  r   ENABLE_JIT_DEEPGEMMDEEPGEMM_BLACKWELLr4   r!   r   r"   w_scaler  r#   int8int8_block_dequantro   	unflattenr   r   r2   w_kc	transpose
contiguousw_vc_is_cpu_is_cpu_amx_available	w_scale_k	w_scale_vuse_deep_gemm_bmmmla_scale_q_loraq_a_layernormr   r?   r   mla_scale_kv_lorakv_a_layernormr   DEEPGEMM_SCALE_UE8M0r,   r   _weight_requant_ue8m0)rQ   weight_names	layer_idsr   rt   r   r   wr#  r   r  r  rY   block_scalescaler  r  num_tiles_knum_tiles_nws_kcws_vcrT   rT   rU   post_load_weightsw  sN  

	











  z)LongcatFlashForCausalLM.post_load_weightsc                 C   sJ  | j j}t| jjD ]m}| jj| }tdD ]`}|j| }|j|j	g}| jj
d ur6||j ||j n||j ||j |D ]}t|drSt|j|j| qD|j| }t|ts`J |j|jfD ]}t|drut|j|j| qfqq
t| jjD ]$}|jj}	t|	tr|	j|	jf|	j|	jffD ]}
t|
d |
d | qq~d S )NrF   r  r   r   ) rB   r   r   rm   r   r   r   r   r   o_projr   r   r   q_b_projr   q_projr   r%   r  r  r   
isinstancer=   rG   rI   r   r{   r   
w13_weightw13_weight_scale_inv	w2_weightw2_weight_scale_inv)rQ   r   rt   r   r   r   module_listmoduler   r{   r,  rT   rT   rU   r)  *  sP   








z-LongcatFlashForCausalLM._weight_requant_ue8m0weightsc                 C   s  ddg}t jddd| jjd}t| jdo| jjd u}|ri nd }tj }g }t	| 
 }g }	|D ]j\}
}t|}d|
v rBq4|	|
 d	|
v rLq4|D ]8\}}}||
vrXqNd
|
v ra|
|vraqN|
||}
|
drq|
|vrqqN||
 }|j}t|||||||fd  n|D ]+}|\}}}}||
vrq|
||}
||
 }|j}t|||||||
f||dd  n|
dr|
|vrq4|rUd|
v sd|
v rU|||
< d|
v r|
n|
dd}d|
v r|
n|
dd}||v rT||v rT|| }|| }d}| jd ur| j dks| j dks| j dkrd}tj||g|d}d|
v r/|
ddn|
dd}|| }t|dt}t||||||fd || || q4d|
v s_d|
v r||
|vr|dD ]}||
v rz|
|d  dd}
 nqf|
|vrt|
 d q4||
 }t|dt}t||||||fd q4tj|D ]}|  qW d    n	1 sw   Y  | j|	d d S ) N)rG   	gate_projr   )rG   up_projr   r?  rI   r@  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer|   r   mtpzrotary_emb.inv_freqzmlp.experts.z.bias)executorfutures	use_asyncfunc	func_args)shard_id	expert_id)rE  rF  rG  rH  rI  func_kwargsr   r   r   awq
awq_marlin	moe_wna16r   r  r   weight_loaderk_scalev_scale)rQ  rR  _projattn_mqaz not found in params_dict.)r*  )r   make_expert_params_mappingrm   rf   r   r   
concurrentrF  ThreadPoolExecutordictr   r+   r   replaceendswithrP  r*   rB   get_namerj   catr   r-   poploggerwarningas_completedresultr3  )rQ   r>  stacked_params_mappingexpert_params_mappingr   cached_a_projrE  rF  params_dictr*  r   loaded_weightuse_async_loading
param_nameweight_namerJ  paramrP  mappingrK  q_a_proj_namekv_a_proj_nameq_a_proj_weightkv_a_proj_weightcat_dimfused_weightr.  futurerT   rT   rU   load_weightsW  s  












	  z$LongcatFlashForCausalLM.load_weightsc                 C   s   | j jj| jjfS rV   )r   r   r  r   r   rT   rT   rU   get_embed_and_head  s   z*LongcatFlashForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S rV   )r   r   r  r   rj   r   empty_cachesynchronize)rQ   embedheadrT   rT   rU   set_embed_and_head	  s   

z*LongcatFlashForCausalLM.set_embed_and_headc                 C   s   t |j|jdS )N)r   num_logical_experts)r   r   rf   )clsrm   rT   rT   rU   $get_model_config_for_expert_location  s   z<LongcatFlashForCausalLM.get_model_config_for_expert_locationr+  c                 C   sL   |d u rd| _ | jj}d|d |d g| j_d S d| _ dd |D | j_d S )NTrF      c                 S   s   g | ]}|d  qS )r   rT   )r   valrT   rT   rU   r     s    zHLongcatFlashForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)r   rm   r   r   r   )rQ   r+  r   rT   rT   rU   set_eagle3_layers_to_capture  s   z4LongcatFlashForCausalLM.set_eagle3_layers_to_capturer   rV   )r[   r\   r]   r   r   r   r   r_   rN   r   	Embeddingr   rj   no_gradr   r)   rZ   r3  r)  r   r   rs  rt  ry  classmethodr|  r   r^   r  ra   rT   rT   rR   rU   r   ;  sJ    #
 4- 0
 r   )pconcurrent.futuresrV  loggingtypingr   r   r   r   rj   r   sglang.srt.configsr   sglang.srt.distributedr   r	   #sglang.srt.eplb.expert_distributionr
   sglang.srt.eplb.expert_locationr   sglang.srt.layersr   sglang.srt.layers.activationr   sglang.srt.layers.communicatorr   r   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   $sglang.srt.layers.moe.ep_moe.kernelsr   "sglang.srt.layers.moe.ep_moe.layerr   r   ,sglang.srt.layers.moe.fused_moe_triton.layerr   sglang.srt.layers.moe.topkr   r   sglang.srt.layers.moe.utilsr   *sglang.srt.layers.quantization.base_configr   )sglang.srt.layers.quantization.fp8_kernelr    (sglang.srt.layers.quantization.fp8_utilsr!   r"   r#   r$   r%   )sglang.srt.layers.quantization.int8_utilsr&   r  *sglang.srt.layers.vocab_parallel_embeddingr'   r(   ,sglang.srt.model_executor.forward_batch_infor)   sglang.srt.model_loader.utilsr*   r+   r,   $sglang.srt.model_loader.weight_utilsr-   sglang.srt.models.deepseek_v2r.   sglang.srt.server_argsr/   sglang.srt.utilsr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r  r  _is_npur  
_use_aiterr   r  
_device_sm
sgl_kernelr;   )sglang.srt.layers.quantization.awq_tritonr<   	getLoggerr[   r^  Moduler=   rb   rs   r   r   r   
EntryClassrT   rT   rT   rU   <module>   sv    0
+i ;P   
j