o
    پir                     @   s  d dl Zd dlZd dlmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZA e@ ZBe? ZCeA ZDe ZEe<doeBZFe; ZGe> ZHe= ZIeCrd dlJmKZK neHreGrn
eBrd dlLmMZK n	 eNeOZPG dd dejQZRG dd dejQZSG dd de5ZTeTgZUdS )     N)IterableOptionalTuple)nn)LongcatFlashConfig)'get_global_expert_distribution_recorder)deep_gemm_wrapper)LayerCommunicatorLayerScatterModes)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)ReplicatedLinear)LogitsProcessor)QuantizationConfig)is_fp8_fnuz)block_quant_dequantblock_quant_to_tensor_quantchannel_quant_to_tensor_quantnormalize_e4m3fn_to_e4m3fnuzrequant_weight_ue8m0_inplace)block_dequant)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)$should_deepgemm_weight_requant_ue8m0)default_weight_loader)DeepseekV2AttentionMLA)LongcatFlashForCausalLMLongcatFlashMLP)
BumpAllocator
add_prefixbind_or_assigncpu_has_amx_supportget_bool_env_varget_device_smis_cpuis_cudais_hipis_npuSGLANG_USE_AITER)awq_dequantize)awq_dequantize_tritonc                       sx   e Zd Z			ddededee dedeej	j
 ddf fd	d
Zdejdejdedeej dedejfddZ  ZS )LongcatFlashDenseDecoderLayerN configlayer_idquant_configprefix
alt_streamreturnc                    s2  t    || _|j| _|| _|| _tdi d|d|jd|jd|jd|j	d|j
d|jd|jd	|jd
d d|jd|d|dddtd|d| j| _t|j|j|j|td|d| _t|j|jd| _t|j|jd| _t | _t | _tj| j|jdddd| _ t!| j | j| jd| _"d S )Nr0   hidden_size	num_headsqk_nope_head_dimqk_rope_head_dim
v_head_dimq_lora_rankkv_lora_rank
rope_thetarope_scalingmax_position_embeddingsr2   r1   reduce_resultsFr3   	self_attnr4   mlps)r6   intermediate_size
hidden_actr2   r3   eps)r1   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparse)layer_scatter_modesinput_layernormpost_attention_layernorm )#super__init__r0   r6   r1   r4   r   num_attention_headsr8   r9   r:   r;   r<   r=   r?   r"   rA   r    rC   rD   mlpr   rms_norm_epsrL   rM   r   attn_tp_sizer   attn_tp_rankr
   init_newnum_hidden_layersrK   r	   layer_communicator)selfr0   r1   r2   r3   r4   	__class__rN   Y/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/longcat_flash_nextn.pyrP   p   s~   
	

z&LongcatFlashDenseDecoderLayer.__init__	positionshidden_statesforward_batchresidualzero_allocatorc                 C   sn   | j |||\}}|jd dkr| j||||d}| j |||\}}| |}| j |||\}}||fS )Nr   )r]   r^   r_   ra   )rX   prepare_attnshaperA   prepare_mlprR   postprocess_layer)rY   r]   r^   r_   r`   ra   rN   rN   r\   forward   s$   	
z%LongcatFlashDenseDecoderLayer.forward)Nr/   N)__name__
__module____qualname__r   intr   r   strtorchcudaStreamrP   Tensorr   r!   rf   __classcell__rN   rN   rZ   r\   r.   n   s<    
<r.   c                       sr   e Zd Z		ddedee deddf fddZdej	fd	d
Z
	ddej	dej	dedej	dej	f
ddZ  ZS )LongcatFlashModelNextNNr/   r0   r2   r3   r5   c                    s   t    |j| _tj | _t|j|jt	 t
d|d| _t|j|jd| _t|j|jd| _td|j |jd|t
ddd| _t|d	|| jd
| _t|j|jd| _d S )Nembed_tokens)use_attn_tp_groupr3   rE      Feh_projr/   )biasr2   r3   r   )r2   r4   )rO   rP   
vocab_sizerl   rm   rn   r4   r   r6   r   r"   rr   r   rS   enormhnormr   ru   r.   decoderfinal_layernorm)rY   r0   r2   r3   rZ   rN   r\   rP      s,   

zLongcatFlashModelNextN.__init__c                 C   s   | j S N)rr   )rY   rN   rN   r\   get_input_embeddings   s   z+LongcatFlashModelNextN.get_input_embeddings	input_idsr]   r_   input_embedsc                 C   s  d}|d ur	|j n|j }t|d |jrdnd tj|d}|d u r'| |}n|}|jd dkrF| tj| 	|| 
|jjfdd\}}	d }
t   | ||||
|\}}
W d    n1 scw   Y  |j s|
d ur{| ||
\}}	|S | |}|S )N   rt   )buffer_sizedtypedevicer   dim)r   r!   can_run_tborl   float32rr   rc   ru   catrx   ry   	spec_infor^   r   disable_this_regionrz   forward_modeis_idler{   )rY   r~   r]   r_   r   total_num_layersr   ra   r^   _r`   rN   rN   r\   rf      s>   




zLongcatFlashModelNextN.forward)Nr/   r|   )rg   rh   ri   r   r   r   rk   rP   rl   ro   r}   r   rf   rp   rN   rN   rZ   r\   rq      s2    !rq   c                	   @   s~   e Zd Z	ddedee ddfddZe dej	dej	d	e
dej	fd
dZdd Zdd Zdeeeej	f  fddZdS )LongcatFlashForCausalLMNextNNr0   r2   r5   c                 C   s^   t j|  || _dt|dg v rd n|| _t|| j| _t|j	|j
| jd| _t|| _d S )Nmtpdisable_quant_module)r2   )r   ModulerP   r0   getattrr2   rq   modelr   rw   r6   lm_headr   logits_processor)rY   r0   r2   rN   rN   r\   rP     s   z%LongcatFlashForCausalLMNextN.__init__r~   r]   r_   c                 C   s    |  |||}| ||| j|S r|   )r   r   r   )rY   r~   r]   r_   r^   rN   rN   r\   rf   1  s   
z$LongcatFlashForCausalLMNextN.forwardc                 C   s  | j jj}t|jdr.tstrt|jj|jj	|jj
j}nt|jj|jj	|jj
dddj}n|jj}d}|jtjtjfv rt| jdr| jjd ur| jj}t|jdsUJ trdt||jjd d\}}}n|}|jj}tr|d dkr|d dkrtjrtjstd	d
r|}d}n4t|||tj}n+t|||\}}	|	|_ntrt||jjd d\}}}n|}|jj}t||\}}	|	|_|jtj krt| jdr| jj}|d urt|jdsJ |}|jj}t!|||"tj}n|"tj|jj"tj }|#dd|j$|j% fj&|j$|j%gdd\}
}|sst'|j(|
)dd* )dd|_(t'|j+|* )dd|_+t|jdrO|jd u rOt'|j|jj|_trO| jd9  _t,rrt-rr|jtjkrr|j("tj|j |_(|j+"tj|j |_+nP|j$|d  }|j%|d  }|#dd|| fj&||gdd\}}t'|j.|)dd* |_.t'|j/|* |_/t'|j(|
)dd* |_(t'|j+|* |_+d|_0| j1j2r|j3j j4| j1j5| j1j6 d 9  _4| j1j7r|j8j j4| j1j5| j1j9 d 9  _4t:t;| jdd dr | <  d S d S )Nqweightr   Fweight_block_sizeweight_scale_inv)weightweight_scaleinput_scale   r   SGL_USE_DEEPGEMM_BMMfalseTr   r   rt   r   g       @g      ?)r   )=r   rz   rA   hasattr	kv_b_proj_is_cuda_is_hipr,   r   scalesqzerosTr   r   rl   float8_e4m3fnfloat8_e4m3fnuzr2   r   _is_fp8_fnuzr   r   r   ENABLE_JIT_DEEPGEMMDEEPGEMM_BLACKWELLr%   r   bfloat16r   w_scaler   r   int8int8_block_dequantto	unflattenr8   r:   splitr#   w_kc	transpose
contiguousw_vc_is_cpu_is_cpu_amx_available	w_scale_k	w_scale_vuse_deep_gemm_bmmr0   mla_scale_q_loraq_a_layernormdatar6   r;   mla_scale_kv_lorakv_a_layernormr<   r   r   _weight_requant_ue8m0)rY   rA   wr   r   r   r   r   block_scalescaler   r   num_tiles_knum_tiles_nws_kcws_vcrN   rN   r\   post_load_weights=  s  
	










z.LongcatFlashForCausalLMNextN.post_load_weightsc                 C   s   | j j}| jj}|j}|j|jg}| jjd ur$|	|j
 |	|j n|	|j |	|j |D ]}t|drAt|j|j| q2|j}t|tsLJ |j|jfD ]}t|drat|j|j| qRd S )Nr   )r2   r   r   rz   rA   r   o_projr0   r;   appendfused_qkv_a_proj_with_mqaq_b_projkv_a_proj_with_mqaq_projr   r   r   r   rB   
isinstancer    gate_up_proj	down_proj)rY   r   layerrA   module_listmodulerR   rN   rN   r\   r     s8   



z2LongcatFlashForCausalLMNextN._weight_requant_ue8m0weightsc                 C   s&  ddg}t | jdo| jjd u}|ri nd }d}g d}i dddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1	}tj }g }	t|  }
g }|D ]\}}d2|vrzqp||v r|| }|d3rd4| }|d5s|d6s|d7r|d8 | }||sqpd9|v sd:|v rqpd;}|D ]}||v r|	|d<}d=} nq|r|	|d>}|
| d?|v rqp|D ]6\}}}||vrqd@|v r||
vrq|	||}|dAr||
vrq|
| }|j}|	
|||||  n|dAr||
vrqp|rdB|v s(dC|v r|||< dB|v r3|n|	dCdB}dC|v r@|n|	dBdC}||v r||v r|| }|| }dD}| jd urz| j dEksx| j dFksx| j dGkrzdH}tj||g|dI}dB|v r|	dBdJn|	dCdJ}|
| }t|dKt}|	
|||| || || qpdL|v sdM|v r||
vrdND ]}||v r|	|dD  dOdP} nq||
vrt| dQ qp|
| }t|dKt}|	
|||| qpW d    n	1 sw   Y  |   d S )RN)r   	gate_projr   )r   up_projr   r;   zmodel.layers.0)zshared_head.normru   rx   ry   r{   zmodel.mtp.embed_tokens.weightzembed_tokens.weightz!model.mtp.layers.0.eh_proj.weightzeh_proj.weightz+model.mtp.layers.0.eh_proj.weight_scale_invzeh_proj.weight_scale_invz!model.mtp.layers.0.enorm.m.weightzenorm.weightz!model.mtp.layers.0.hnorm.m.weightzhnorm.weightz)model.mtp.layers.0.input_layernorm.weightzlayers.0.input_layernorm.weightz2model.mtp.layers.0.post_attention_layernorm.weightz(layers.0.post_attention_layernorm.weightz2model.mtp.layers.0.self_attn.kv_a_layernorm.weightz(layers.0.self_attn.kv_a_layernorm.weightz6model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weightz,layers.0.self_attn.kv_a_proj_with_mqa.weightz@model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_invz6layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_invz-model.mtp.layers.0.self_attn.kv_b_proj.weightz#layers.0.self_attn.kv_b_proj.weightz7model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_invz-layers.0.self_attn.kv_b_proj.weight_scale_invz*model.mtp.layers.0.self_attn.o_proj.weightz layers.0.self_attn.o_proj.weightz4model.mtp.layers.0.self_attn.o_proj.weight_scale_invz*layers.0.self_attn.o_proj.weight_scale_invz1model.mtp.layers.0.self_attn.q_a_layernorm.weightz'layers.0.self_attn.q_a_layernorm.weightz,model.mtp.layers.0.self_attn.q_a_proj.weightz"layers.0.self_attn.q_a_proj.weightz6model.mtp.layers.0.self_attn.q_a_proj.weight_scale_invz,layers.0.self_attn.q_a_proj.weight_scale_invz"layers.0.self_attn.q_b_proj.weightz,layers.0.self_attn.q_b_proj.weight_scale_invzlayers.0.mlp.down_proj.weightz'layers.0.mlp.down_proj.weight_scale_invzlayers.0.mlp.gate_proj.weightz'layers.0.mlp.gate_proj.weight_scale_invzlayers.0.mlp.up_proj.weightz%layers.0.mlp.up_proj.weight_scale_invzlayers.0.final_layernorm.weight)	z,model.mtp.layers.0.self_attn.q_b_proj.weightz6model.mtp.layers.0.self_attn.q_b_proj.weight_scale_invz9model.mtp.layers.0.transformer_layer.mlp.down_proj.weightzCmodel.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_invz9model.mtp.layers.0.transformer_layer.mlp.gate_proj.weightzCmodel.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_invz7model.mtp.layers.0.transformer_layer.mlp.up_proj.weightzAmodel.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_invzmodel.mtp.norm.weightz.mtp.zlayers.0zmodel.rx   ry   ru   .zshared_head.headrr   Tr   Fzmodel.decoderzrotary_emb.inv_freqzmlp.experts.z.biasq_a_projr   r   awq
awq_marlin	moe_wna16r   r   r   weight_loaderk_scalev_scale)r   r   _projattn_mqaz not found in params_dict.)r   r0   r;   
concurrentfuturesThreadPoolExecutordictnamed_parameters
startswithreplacer   endswithr   submitr2   get_namerl   r   r   r   poploggerwarningr   )rY   r   stacked_params_mappingfuse_qkv_a_projcached_a_projnextn_layer_prefixnextn_spec_weight_namesweight_names_mappingexecutorr   params_dictweight_namesnameloaded_weight
is_decoderweight_name
param_nameshard_idparamr   q_a_proj_namekv_a_proj_nameq_a_proj_weightkv_a_proj_weightcat_dimfused_weightr   rN   rN   r\   load_weights  sP  
	















  	z)LongcatFlashForCausalLMNextN.load_weightsr|   )rg   rh   ri   r   r   r   rP   rl   no_gradro   r   rf   r   r   r   r   rk   r  rN   rN   rN   r\   r     s.    
  !r   )Vconcurrent.futuresr   loggingtypingr   r   r   rl   r   sglang.srt.configsr   #sglang.srt.eplb.expert_distributionr   sglang.srt.layersr   sglang.srt.layers.communicatorr	   r
   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   )sglang.srt.layers.quantization.fp8_kernelr   (sglang.srt.layers.quantization.fp8_utilsr   r   r   r   r   )sglang.srt.layers.quantization.int8_utilsr   r   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.model_loader.utilsr   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.deepseek_v2r   sglang.srt.models.longcat_flashr   r    sglang.srt.utilsr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r   r   _is_npur   
_use_aiterr   r   
_device_sm
sgl_kernelr,   )sglang.srt.layers.quantization.awq_tritonr-   	getLoggerrg   r   r   r.   rq   r   
EntryClassrN   rN   rN   r\   <module>   s^    0
\Q   
