o
    پi                     @   s  d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z	d dl
m  mZ d dl	mZ d dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z m!Z! d dl"m#Z#m$Z$m%Z% d d	l&mZ d dl'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z;m<Z<m=Z=m>Z> d dl?m@ZA d dlBmCZC d dlDmEZE d dlFmGZG d dlHmIZImJZJ d dlKmLZLmMZM d dlNmOZO d dlPmQZQmRZRmSZS d dlTmUZU d dlVmWZW d dlXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZf d d lgmhZh ec ZSe` Zied Zje8 Zke]d!o/eSZle\ Zme_ Zne^ Zoeb Zpelo@epZqeqrE	 eirOd d"lrmsZs nenrVemrVneSr`d d#ltmuZs nd d"lvmsZs eSrj	 ea Zwe` osee ZxG d$d% d%eQZydZze{e|Z}e_ Znd&d' Z~d(ed)e	jjd*efd+d,Zd-efd.d/ZG d0d1 d1ejZG d2d3 d3ejZG d4d5 d5ejZG d6d7 d7eZG d8d9 d9ejZG d:d; d;ejZG d<d= d=ejZG d>d? d?ejZG d@dA dAejZG dBdC dCeZegZdS )D    N)CallableIterableOptionalSetTupleUnion)nn)PretrainedConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)deep_gemm_wrapper)
SiluAndMul)RMSNorm)layernorm_fn)LayerCommunicatorLayerScatterModes)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)	DeepEPMoEget_moe_impl_class)FusedMoE)TopK)QuantizationConfig)is_fp8_fnuz)block_quant_dequantblock_quant_to_tensor_quantchannel_quant_to_tensor_quantnormalize_e4m3fn_to_e4m3fnuzrequant_weight_ue8m0_inplace)block_dequant)RadixAttention)get_rope_wrapper)PPMissingLayer)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loader)DeepseekV2AttentionMLADeepseekV2MLP_is_hip)WeightsMapper)get_global_server_args)BumpAllocator
add_prefixbind_or_assigncpu_has_amx_supportget_bool_env_varget_device_smis_cpuis_cudais_flashinfer_availableis_gfx95_supportedis_hipis_npuis_sm100_supportedmake_layers)	rank0_logSGLANG_USE_AITER)awq_dequantize)awq_dequantize_tritonc                       s   e Zd Z fddZ  ZS )DsV3MLAc                    s.   t  jdi | |d r| jj| j_d S d S )Nrope_scaling )super__init__
rotary_embforward_cudaforward)selfkwargs	__class__rJ   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/bailing_moe_linear.pyrL   t   s   zDsV3MLA.__init__)__name__
__module____qualname__rL   __classcell__rJ   rJ   rR   rT   rH   s   s    rH   c                 C   s(   | d u rdS |dkr| d | dkS dS )NFr      rJ   )	layer_idxlayer_group_sizerJ   rJ   rT   is_linear_layer   s
   r\   namemodelreturnc                 C   s   t |trdS dS )NTF)
isinstancer+   )r]   r^   rJ   rJ   rT   is_pp_missing_parameter   s   
ra   aliasc                 C   s   dt fdd}|S )Nfuncc                    s(   d ddt jdt jdtf fdd}|S )Nprefixparamloaded_weightre   c                   s    | |g|R i |}|S NrJ   )rf   rg   re   argsrQ   valuerc   rJ   rT   
inner_func   s   z=weight_loader_with_alias.<locals>.wrapper.<locals>.inner_func)torchTensorstr)rc   rl   rJ   rk   rT   wrapper   s   z)weight_loader_with_alias.<locals>.wrapper)r   )rb   rp   rJ   rJ   rT   weight_loader_with_alias   s   rq   c                       sV   e Zd Z			ddededee deddf
 fd	d
Z		ddedefddZ	  Z
S )
BailingMLPTN hidden_sizeintermediate_sizequant_configre   r_   c                    sR   t    t||gd d|| dd| _t||d||| dd| _t | _d S )N   Fz.gate_up_projbiasrv   re   z
.down_proj)ry   rv   reduce_resultsre   )rK   rL   r   gate_up_projr   	down_projr   act_fn)rP   rt   ru   rz   rv   re   rR   rJ   rT   rL      s"   
zBailingMLP.__init__Fshould_allreduce_fusionuse_reduce_scatterc                 C   s2   |  |\}}| |}| j||p|d\}}|S )N)skip_all_reduce)r{   r}   r|   )rP   xr~   r   _rJ   rJ   rT   rO      s   

zBailingMLP.forward)TNrs   FF)rU   rV   rW   intr   r!   ro   rL   boolrO   rX   rJ   rJ   rR   rT   rr      s,    rr   c                       s:   e Zd Z		d	deej def fddZdd Z  Z	S )
BailingMoEGateNrs   params_dtypere   c                    sv   t    |d u rt }|| _ttj|j|j	f| jd| _
t|ddr6ttj|jftjd| _d S d | _d S )N)dtypemoe_router_enable_expert_biasF)rK   rL   rm   get_default_dtyper   r   	Parameteremptynum_expertsrt   weightgetattrfloat32expert_bias)rP   configr   re   rR   rJ   rT   rL      s   



zBailingMoEGate.__init__c                 C   s&   t || jj| jd |j}|S rh   )Flineartor   r   )rP   hidden_stateslogitsrJ   rJ   rT   rO      s   zBailingMoEGate.forwardNrs   )
rU   rV   rW   r   rm   r   ro   rL   rO   rX   rJ   rJ   rR   rT   r      s    r   c                	       s^   e Zd Z			ddedee dedef fdd	Z	
	
dde	j
dedede	j
fddZ  ZS )
BailingMoENr   moer   rv   layer_idre   c              
      s(  t    || _t | _t | _|j| _t	|dd| _
|j| _|j| _t	|dd| _t	|dd| _t	|dd | _t	|dd }|d u rGtj| _n|d	krPtj| _ntj| _t	|d
d| _t	|dd| _| jdksl| jdkr| jdkrd| j  k r~| jksJ  J d| _n	d  | _| _d| _|j| _t|| jtd|d| _| jjd ur| jjjnd | _| jd ur| jdkr| jd u s| jdkr| jd usJ dt| j| j| j
| j| j| j| jd| _ t!|}|| j| j| j| j| j|| j| dd| _"| jdkr| j| j }t#| j|d| d|d| _$d S d S )Nnorm_topk_probFnum_shared_expertsr   routed_scaling_factor      ?score_functionrouter_dtypefp32n_group
topk_groupTgate)r   r   re   softmaxsigmoidzdscore_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None))top_kuse_grouped_topkrenormalizenum_expert_groupr   correction_biasr   z.experts)r   r   r   rt   ru   rv   r   re   z.shared_experts)rt   ru   rz   re   rv   )%rK   rL   r   r   tp_sizer   tp_ranknum_experts_per_tokr   r   norm_expert_probrt   moe_intermediate_sizeru   r   r   r   rm   r   r   bfloat16r   r   r   r   r   r7   r   r   datar   r    topkr   expertsrr   shared_experts)rP   r   rv   r   re   r   moe_clsru   rR   rJ   rT   rL      s   



 
	zBailingMoE.__init__Fr   r~   r   r_   c           
      C   s|   |j \}}|d|}| jdkr| |}| |}| ||}| ||}	| jdkr/|	| }	| jdkr<|s<|s<t|	}	|	S )Nr   rY   )	shapeviewr   r   r   r   r   r   r   )
rP   r   r~   r   
num_tokensrt   shared_outputrouter_logitstopk_outputfinal_hidden_statesrJ   rJ   rT   rO   L  s   




zBailingMoE.forward)Nr   r   r   )rU   rV   rW   r	   r   r!   r   ro   rL   rm   rn   r   rO   rX   rJ   rJ   rR   rT   r      s0    `r   c                       sH   e Zd Z					d fdd	Zedejjdejddfd	d
Z	  Z
S )BailingGroupRMSNormGateh㈵>NTc              	      s(   t  j||||||dd | j| j_d S )Nr   )eps
group_sizenorm_before_gatedevicer   
activation)rK   rL   weight_loaderr   )rP   rt   r   r   r   r   r   rR   rJ   rT   rL   d  s   		z BailingGroupRMSNormGate.__init__rf   rg   r_   c                 C   sH   t  }t }|jd | }t|| |d | }| j||   d S )Nr   rY   )r   r   r   slicer   copy_
contiguous)rf   rg   r   r   
shard_sizeshardrJ   rJ   rT   r   x  s   z%BailingGroupRMSNormGate.weight_loader)r   NTNN)rU   rV   rW   rL   staticmethodrm   r   r   rn   r   rX   rJ   rJ   rR   rT   r   c  s    r   c                	       sx   e Zd Z			ddedee dedef fdd	Ze	d
e
jde
jddfddZde
jde
jdede
jfddZ  ZS )BailingMoELinearAttentionNr   linear_attnr   rv   r   re   c                    s  t    || _|j| _|j| _|j| _t|dd | _| jd u r'|j| j | _| j| j | _	| jd | _
t | _t | _| j| j dksFJ | j| j | _|j| _t|dd| _| j| j | _| j| j | _| j| j | _t|dd| _t|dd	| _td
| j  | jdkrdnd| _t|dd| _t|dr|j| _nt|dr|j| _nd| _t| j| j| j| j|jp|j || d| j| jd	| _!| jrt"| j|j#d| _$t"| j|j#d| _%t&| j| j	d|| d| j| jd| _'t(| j	| j|j|| d| j| jdd| _)t*| j| j| j
| j||| dd| _+t|dd| _,t-t|dd| _#| j| j,ks2J d| j,| j dks?J dt.| j	| j | j#| j	| j, d| _/t|drZ|j0}nt|d rit1| j|j2 }n| j}t3| j|| j| j|j4dt5 j6t7j8d!| _9d S )"Nhead_dim      r   
rope_theta'	 use_qk_normFlinear_backendseg_laz&linear_backend in bailing_moe_linear: minimaxTlinear_ropeuse_linear_silulinear_silu	.qkv_proj)ry   rv   re   r   r   r   z.output_gatez	.out_proj)ry   rv   re   r   r   rz   .attnnum_kv_headsr   rv   re   group_norm_sizerY   rms_norm_epsr   zQtp_size must be less than or equal to group_norm_size that can use local rms normz,group_norm_size must be divisible by tp_size)rt   r   r   
rotary_dimpartial_rotary_factor)r   max_positionbaserI   is_neox_styler   r   ):rK   rL   r   rt   num_attention_headstotal_num_headstotal_kv_headsr   r   hidden_inner_sizescalingr   r   r   r   tp_headsmax_position_embeddingsr   tp_kv_headsq_size_per_rankkv_size_per_rankr   r   loggerdebuglinear_scaler   hasattrr   r   r   use_biasuse_qkv_biasquery_key_valuer   r   query_layernormkey_layernormr   g_projr   denser)   attnr   floatr   g_normr   r   r   r*   rI   r5   r   rm   r   rM   )rP   r   rv   r   re   r   rR   rJ   rT   rL     s   






	



z"BailingMoELinearAttention.__init__rf   rg   r_   c                 C   s$   |   |  ks
J | j| d S rh   )sizer   r   rf   rg   rJ   rJ   rT   weight_direct_load  s   z,BailingMoELinearAttention.weight_direct_loadr   	positionsforward_batchc                 K   s  |  |\}}|tj}| jrt|}tj|| j| j	| j	gdd\}}}	| j
rc|d| j| j}|d| j| j}t|| jjjd | jdd}t|| jjjd | jdd}|d| j}|d| j	}| jro| |||\}}||jd | j| jf}||jd | j| jf}|	|jd | j| jf}	| jr|| j }| |||	||j}
| |\}}| jdkr| |
|}
n| |
}
t ||
 }
|
j|j}
| !|
\}
}|
S )Nr   dimT)ry   r   is_rms_normr   rY   )"r   r   rm   r   r   r   silusplitr   r   r   reshaper   r   r   r   r   r   r   r   r   r   rM   r   r   r   r   r   r   r   r   r   r   r   )rP   r   r  r  rQ   qkvr   qkvhiddenr   rJ   rJ   rT   rO     sX   



z!BailingMoELinearAttention.forward)Nr   r   )rU   rV   rW   r	   r   r!   r   ro   rL   r   rm   rn   r  r.   rO   rX   rJ   rJ   rR   rT   r     s0    |r   c                       s   e Zd Z			ddedee dededdf
 fdd	Zd
e	j
de	j
dee	j
e	j
f fddZde	j
de	j
dede	j
fddZ  ZS )BailingMoEAttentionNmhar   rv   r   re   r_   c              	      s"  t    || _|j| _t }|j| _| j| dksJ | j| | _|j| _	| j	|kr5| j	| dks4J n	|| j	 dks>J t
d| j	| | _t|dd | _| jd u rZ| j| j | _| j| j | _| j| j | _| jd | _t|dd| _| jr|J dt|dd| _t| j| j| j| j	|jp|j|| d	d
| _| jrt| j|jd| _t| j|jd| _t| j| j | j|j|| dd
| _t|dr|j| _nt|drt| j|j  | _n| j| _|j!| _!t|dd| _"t#| j| j| j!| j"|j$t% j&d| _'t(| j| j| j| j||| dd| _)d S )Nr   rY   r   r   !using_split_qkv_in_self_attentionFz"split_qkv is not supported for nowr   r   rx   r   z.o_projr   r   r   r   )r   r   r   rI   r   r   r   )*rK   rL   r   rt   r   r   r   	num_headsnum_key_value_headstotal_num_kv_headsmaxr   r   r   q_sizekv_sizer   	split_qkvr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r*   rI   r5   r   rM   r)   r   )rP   r   rv   r   re   r   rR   rJ   rT   rL   O  s   



	



zBailingMoEAttention.__init__r  r  c                 C   sP   | d| j}| |}||j}| d| j}| |}||j}||fS )Nr   )r	  r   r   r   r   r   )rP   r  r  	q_by_head	k_by_headrJ   rJ   rT   _apply_qk_norm  s   

z"BailingMoEAttention._apply_qk_normr   r  r  c                 K   sx   |  |\}}|j| j| j| jgdd\}}}	| jr"| ||\}}| |||\}}| |||	|}
| |
\}}|S )Nr   r  )	r   r  r  r  r   r  rM   r   r   )rP   r   r  r  rQ   r
  r   r  r  r  attn_outputoutputrJ   rJ   rT   rO     s    zBailingMoEAttention.forward)NNr  )rU   rV   rW   r	   r   r!   r   ro   rL   rm   rn   r   r  r.   rO   rX   rJ   rJ   rR   rT   r  M  s>    O
r  c                       s   e Zd Z				ddedee deded	ed
df fddZ		ddeded	ed
efddZ
dejdejdedeej ded
eejejf fddZedejdejd
dfddZ  ZS )BailingMoELinearDecoderLayerNr   layerFr   rv   r   re   is_nextnr_   c                    sh  t    || _t|dddk| _d }|jdkr%t||| j|d d| _ns|jdkr| jrztd%i d|d|j	d	|j
d
|jd|jd|jdt|drP|jnd d|jdt|ddd|jddd|d|dddtd|d|| _ntd| d t||| j|d d| _ntd|j |j| _|j	| _	| || j}| || jd }| || jd }	| jdkrt| j	|j|td|d| _n$|s| j|jkrt||| jtd|d| _nt| j	|j|td|d| _tt|d d!}
t | j	|
d"| _!t | j	|
d"| _"t#j$||j%|||	d#| _&|jdkr#| jr#| jj'nd }t(| j&| j!| j"d|d$| _)d S )&Nfull_attention_typemlar   z
.attention)rv   r   re   rY   r   rt   r  qk_nope_head_dimqk_rope_head_dim
v_head_dimq_lora_rankkv_lora_rankr   r   rI   r   i   rv   r   rz   Fre   	attention
alt_streamzlayer z use gqazUnsupported attention type: mlp)rt   ru   rv   re   r   r   r   )r   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparse)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatterqkv_latent_funcrJ   )*rK   rL   r   r   use_mlaattention_typer   r(  rH   rt   r   r#  r$  r%  r   r&  r'  rI   r7   r   r   r  
ValueErrorr   
expert_num_is_layer_sparserr   ru   r*  first_k_dense_replacer   r   r   r0  r1  r   init_newnum_hidden_layersr/  prepare_qkv_latentr   layer_communicator)rP   r   rv   r   re   r   r)  is_moe_layeris_previous_moe_layeris_next_layer_moe_layerr   r3  rR   rJ   rT   rL     s   










z%BailingMoELinearDecoderLayer.__init__c                 C   s   |p|j d uo||jkS rh   )r   r9  )rP   r   r   r   rJ   rJ   rT   r8  (  s   z-BailingMoELinearDecoderLayer._is_layer_sparser   r  r  residualzero_allocatorc           	      K   s   | j |||\}}|j s$| jr| j||||d}n| j|||d}| j |||\}}| j |}| j |}| 	|||}| j 
|||\}}||fS )N)r  r   r  rB  )r   r  r  )r=  prepare_attnforward_modeis_idler4  r(  prepare_mlp)should_fuse_mlp_allreduce_with_next_layershould_use_reduce_scatterr*  postprocess_layer)	rP   r   r  r  rA  rB  rQ   r~   r   rJ   rJ   rT   rO   /  sB   	
z$BailingMoELinearDecoderLayer.forwardrf   rg   c                 C   s,   |   |  ks
J | j|tj d S rh   )r   r   r   r   rm   r   r   rJ   rJ   rT   shared_moe_coefficient_loadere  s   z:BailingMoELinearDecoderLayer.shared_moe_coefficient_loader)Nr   r  FF)rU   rV   rW   r	   r   r!   r   ro   r   rL   r8  rm   rn   r.   r6   tuplerO   r   rJ  rX   rJ   rJ   rR   rT   r    sb    m

6r  c                       s   e Zd Z		ddedee deddf fddZ			dd	eej	 d
ej	dee
 deej	 dee deej	ef fddZ  ZS )BailingMoELinearModelNrs   r   rv   re   r_   c                    sr  t    t _ _ j_ j_ j_	t
 dd_fddtj	D _tdd jD }tdd jD }td| d	| d
 j	j dks_J dj	 dj jjrrtjjt  jd_nt _ fdd}tj	|jjjj| dd\___i }t dr j|d< jjrt jfi |_nt _d_ d S )Nr[   rY   c                    s    g | ]}t | jrd ndqS )r   rY   )r\   r[   ).0irP   rJ   rT   
<listcomp>  s    z2BailingMoELinearModel.__init__.<locals>.<listcomp>c                 s   s    | ]	}|d krdV  qdS )r   rY   NrJ   rN  trJ   rJ   rT   	<genexpr>      z1BailingMoELinearModel.__init__.<locals>.<genexpr>c                 s   s    | ]	}|d krd V  qdS )rY   NrJ   rR  rJ   rJ   rT   rT    rU  zLayer config: z linear attention layers, z full attention layersr   znum_layers=z% must be divided by layer_group_size=)	enable_tporg_num_embeddingsc                    s<   | }t  }j| |_|d}t|fi |d|iS )N)rv   r   re   )copydeepcopydecoder_attention_typesr5  r  )idxre   rZ   layer_configdecoder_kwargsr   rv   rP   rJ   rT   layer_fn  s   

z0BailingMoELinearModel.__init__.<locals>.layer_fnz.layers)pp_rankpp_sizere   r   r   r   )!rK   rL   r
   pp_groupr   
vocab_sizert   	embed_dimr;  r+  r   r[   rangerZ  sumrD   is_first_rankr-   r   word_embeddingsr+   rC   rank_in_group
world_sizelayersstart_layer	end_layerr   r   is_last_rankr   normembed_scale)rP   r   rv   re   
num_linearnum_fullr_  norm_kwargsrR   r^  rT   rL   q  sT   





zBailingMoELinearModel.__init__	input_idsr  r  inputs_embedspp_proxy_tensorsc              	   C   s.  | j jr|d u r| |}n|}d }n|d usJ |d }|d }| j| j }|d ur.|jn|j}	t|d |jr:dnd tj	|	d}
t
| j| jD ](}t | | j| }||||||
d\}}W d    n1 slw   Y  qI| j js}t||dS |j s|d u r| |}|S | ||\}}|S )Nr   rA  rw   rY   )buffer_sizer   r   )r   r  r  rA  rB  )r   rA  )rb  rg  rh  rm  rl  r   r6   can_run_tborm   r   re  r   with_current_layerrk  rn  r/   rD  rE  ro  )rP   rt  r  r  ru  rv  r   rA  total_num_layersr   rB  rO  r  r   rJ   rJ   rT   rO     sJ   
	

zBailingMoELinearModel.forwardr   )NNN)rU   rV   rW   r	   r   r!   ro   rL   rm   rn   r.   r/   r   rO   rX   rJ   rJ   rR   rT   rM  o  s8    FrM  c                       s  e Zd ZddgddgdZedddd	d
ddddddddddZddddee deddf fddZ	e
dd Ze
dd  Zd!d" Zd6d$d%Zed&d' Zd7d(d)Zd*d+ Z		d8d,ejd-ejd.ed/eej d0ee deejef fd1d2Z	#d7d3eeeejf  dee fd4d5Z  ZS )9BailingMoELinearForCausalLMq_a_projkv_a_proj_with_mqa	gate_projup_proj)fused_qkv_a_proj_with_mqar{   zattention.out_projzlayers.7.attention.o_projzlayers.15.attention.o_projzlayers.23.attention.o_projzlayers.31.attention.o_projzlayers.39.attention.o_projzlayers.47.attention.o_projzlayers.55.attention.o_projzlayers.63.attention.o_projzlayers.71.attention.o_projzlayers.79.attention.o_projzattention.qkv_projzattention.output_gate)zattention.densezlayers.7.attention.out_projzlayers.15.attention.out_projzlayers.23.attention.out_projzlayers.31.attention.out_projzlayers.39.attention.out_projzlayers.47.attention.out_projzlayers.55.attention.out_projzlayers.63.attention.out_projzlayers.71.attention.out_projzlayers.79.attention.out_projzattention.query_key_valuezattention.g_proj)orig_to_new_substrNrs   )rv   re   rv   re   r_   c                   s   t    t | _|| _|| _t| j|td|d| _| jj	r;|j
r%| jnt|j|jtj|t jd| _t|| _d S t | _d S )Nr^   rd   )r   rv   use_attn_tp_group)rK   rL   r
   rb  r   rv   rM  r7   r^   rn  tie_word_embeddingsrh  r,   rc  rt   rm   r   r5   enable_dp_lm_headlm_headr   logits_processorr+   )rP   r   rv   re   rR   rJ   rT   rL     s(   
z$BailingMoELinearForCausalLM.__init__c                 C      | j jS rh   )r^   rl  rP  rJ   rJ   rT   rl       z'BailingMoELinearForCausalLM.start_layerc                 C   r  rh   )r^   rm  rP  rJ   rJ   rT   rm  !  r  z%BailingMoELinearForCausalLM.end_layerc                 C   s   | j jj| jjfS )zUsed by the eagle_worker.)r^   rh  r   r  rP  rJ   rJ   rT   get_embed_and_head%  s   z.BailingMoELinearForCausalLM.get_embed_and_headFc                 C   s  |r| j jg}n4|d u rt| jj| jj}n&t }|D ] }d|v r;t|dd }|| jjk r;|| jjkr;|	| qt
d|  |D ]}|sR| jj| jn| jjj}t|ds]qFt|jdrtsgtrut|jj|jj|jjj}nt|jj|jj|jjdddj}n|jj}d}|jtjtjfv rt| jdr| jjd ur| jj}	t|jd	sJ trt ||jj!d d
\}
}}n|}
|jj!}tr|	d dkr|	d dkrt"j#rt"j$st%ddr|}d}n5t&|
||	tj'}n,t(|
||	\}}||_)n trt ||jj*d d
\}
}}n|}
|jj*}t+|
|\}}||_)|jtj,krUt| jdrG| jj}	|	d urFt|jd	s6J |}
|jj!}t-|
||	.tj'}n|.tj'|jj*.tj' }|/dd|j0|j1 fj|j0|j1gdd\}}|st2|j3|4dd5 4dd|_3t2|j6|5 4dd|_6t|jdr|j)d u rt2|j)|jj*|_)tr| j)d9  _)t7rt8r|jtjkr|j3.tj'|j) |_3|j6.tj'|j) |_6qF|j0|	d  }|j1|	d  }|/dd|| fj||gdd\}}t2|j9|4dd5 |_9t2|j:|5 |_:t2|j3|4dd5 |_3t2|j6|5 |_6d|_;qFt"j#r?t"j<rAt| jdrC| jjd urE| =| d S d S d S d S d S )N	kv_b_proj.rw   zweight loading layer_ids: qweightr   Fweight_block_sizeweight_scale_inv)r   weight_scaleinput_scale   rY   SGL_USE_DEEPGEMM_BMMfalseTr   r  r  g       @)>r   r;  re  r^   rl  rm  setr   r  addr   r   rk  r(  decoderr   r  _is_cudar3   rF   r  scalesqzerosTr   r   rm   float8_e4m3fnfloat8_e4m3fnuzrv   r  _is_fp8_fnuzr&   r  r   ENABLE_JIT_DEEPGEMMDEEPGEMM_BLACKWELLr:   r#   r   r$   w_scaler  r%   int8int8_block_dequantr   	unflattenr#  r%  r8   w_kc	transposer   w_vc_is_cpu_is_cpu_amx_available	w_scale_k	w_scale_vuse_deep_gemm_bmmDEEPGEMM_SCALE_UE8M0_weight_requant_ue8m0)rP   r   weight_names	layer_idsr]   r   	self_attnwr  r  r   r  r   block_scalescaler  r  num_tiles_knum_tiles_nws_kcws_vcrJ   rJ   rT   post_load_weights)  s@  


	








z-BailingMoELinearForCausalLM.post_load_weightsc                 C   s:   t |dd}ddlm} ||j|j|dkrd dS |dS )Nr   r   )ModelConfigForExpertLocation)r+  num_logical_experts
num_groups)r   sglang.srt.eplb.expert_locationr  r;  r   )clsr   r  r  rJ   rJ   rT   $get_model_config_for_expert_location  s   
z@BailingMoELinearForCausalLM.get_model_config_for_expert_locationc                 C   s  | j j}tt| jj| jj| jj}|rdn| jj}t|D ]}|r'| jj	}n| jj
| }|jj|jjg}| jjd urJ||jj ||jj n||jj ||jj |D ]
}t|j|j| qZ||v sk|rt|jdd }	|	d ur|	j|	jfD ]
}t|j|j| q||jj}
t|
tr|
j|
jfD ]}t|d |d | qq|j}t|t sJ |j|jfD ]
}t|j|j| qqd S )NrY   r   r   )!rv   r  listre  r   r9  r;  moe_layer_freqr^   r  rk  r  r  o_projr&  appendr  q_b_projr}  q_projr'   r   r  r   r*  r{   r|   r   r`   r   w13_weight_fp8w2_weight_fp8r2   )rP   r   r  
moe_layersr;  r   r  module_listmoduler   r   r  r*  rJ   rJ   rT   r    sf   




z1BailingMoELinearForCausalLM._weight_requant_ue8m0c                 C   r  rh   )r^   rZ  rP  rJ   rJ   rT   get_decoder_attention_types)  s   z7BailingMoELinearForCausalLM.get_decoder_attention_typesrt  r  r  ru  rv  c                 C   s6   | j |||||d}| jjr| || | j|S |S )N)rt  r  ru  r  rv  )r^   rb  rn  r  r   r  )rP   rt  r  r  ru  rv  r   rJ   rJ   rT   rO   ,  s   z#BailingMoELinearForCausalLM.forwardweightsc              	      sz  dt dtjdd f fdd}|r3t| jdr/| jj}|dks"J d| jjdkr*d	n| jj}ntd
ddg}tj	ddd| jj
d}|rMd| }g d}	t|   t }
g }t| jdoc| jjd u}|rhi nd }|D ]\}}|drwqld }d|v rt|dd }d|v sd|v s| jjrd|v rql|| |r||sqld|v sd|v rqld}|	D ]}||v r||d}d} nq|r||d}|D ]9\}}}||vrqd |v rq|||}|d!r| vrq| vrqt|| rq̈ | }|j}||||  n(|D ]5}|\}}}}||vrq|||}| vr$qt|| r,q | }|j}||||||d"  n|d!rJ| vrJqld#|v rPql|rd$|v s]d%|v r|||< d$|v rh|n|d%d$}d%|v ru|n|d$d%}||v r||v r|| }|| }d	}| jd ur| j d&ks| j d'ks| j d(krd}tj||g|d)}d$|v r|d$d*n|d%d*}| vrql | }t|d+t}||| || || nE| vr|d,d-}| vrqlt|| rqld.|v rd#|vrt|| jj r||||  |
!| ql | }t|d+t}||| |
!| ql| j"||d/ |
S )0Nr]   rg   r_   c                    s>   t | |rd S  |  }t|dtj}t| |}||| d S )Nr   )ra   r   r   r  rq   )r]   rg   rP   rf   r   params_dictrJ   rT   load_linear_attn_weightE  s   

zIBailingMoELinearForCausalLM.load_weights.<locals>.load_linear_attn_weightnum_nextn_predict_layersrY   zOnly 1 nextn layer is supportedr   z-num nextn_predict_layers is not in the config)r{   r~  r   )r{   r  rY   r~  r|   r  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   zmodel.layers.)final_layernormeh_projenormhnormr&  z	model.mtpr  rw   v_headinv_freqr  zshared_head.headembed_tokensTr^   Fzmodel.decoderzmlp.expertsz.bias)shard_id	expert_idsloper|  r}  awq
awq_marlin	moe_wna16r  r  r   z.dense.z.o_proj.r(  )r   r  )#ro   rm   rn   r   r   r  r;  r6  r   make_expert_params_mappingr   dictnamed_parametersr  r&  
startswithr   r  r  r  replaceendswithra   r   rv   get_namecatr   r0   popr\   r^   r[   r  r  )rP   r  r   r  num_nextn_layersnextn_layer_idstacked_params_mappingexpert_params_mappingnextn_layer_prefixnextn_spec_weight_namesloaded_paramsr  fuse_qkv_a_projcached_a_projr]   rg   rZ   
is_decoderweight_name
param_namer  rf   r   mappingr  q_a_proj_namekv_a_proj_nameq_a_proj_weightkv_a_proj_weightcat_dimfused_weightrJ   r  rT   load_weightsB  sF  

























z(BailingMoELinearForCausalLM.load_weights)FNrK  )NN)rU   rV   rW   packed_modules_mappingr4   hf_to_sglang_mapperr   r!   ro   rL   propertyrl  rm  r  r  classmethodr  r  r  rm   rn   r.   r/   r   rO   r   r   r   r  rX   rJ   rJ   rR   rT   r{    sz    


 5


A
r{  c                   @   s   e Zd ZdS )BailingMoeV2_5ForCausalLMN)rU   rV   rW   rJ   rJ   rJ   rT   r    s    r  )rX  loggingtypingr   r   r   r   r   r   rm   torch.nn.functionalr   
functionalr   transformersr	   sglang.srt.distributedr
   r   r   r   #sglang.srt.eplb.expert_distributionr   sglang.srt.layersr   sglang.srt.layers.activationr   /sglang.srt.layers.attention.fla.layernorm_gatedr   RMSNormGatedr   sglang.srt.layers.communicatorr   r   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormsglang.srt.layers.linearr   r   r   r   "sglang.srt.layers.logits_processorr   "sglang.srt.layers.moe.ep_moe.layerr   r   ,sglang.srt.layers.moe.fused_moe_triton.layerr   sglang.srt.layers.moe.topkr    *sglang.srt.layers.quantization.base_configr!   )sglang.srt.layers.quantization.fp8_kernelr"   (sglang.srt.layers.quantization.fp8_utilsr#   r$   r%   r&   r'   )sglang.srt.layers.quantization.int8_utilsr(   r  !sglang.srt.layers.radix_attentionr)   "sglang.srt.layers.rotary_embeddingr*   sglang.srt.layers.utilsr+   *sglang.srt.layers.vocab_parallel_embeddingr,   r-   ,sglang.srt.model_executor.forward_batch_infor.   r/   $sglang.srt.model_loader.weight_utilsr0   sglang.srt.models.deepseek_v2r1   r2   r3   sglang.srt.models.utilsr4   sglang.srt.server_argsr5   sglang.srt.utilsr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   sglang.srt.utils.commonrD   r  _is_npur  
_use_aiterr  r  
_device_sm_is_gfx95_supported_use_aiter_gfx95
sgl_kernelrF   )sglang.srt.layers.quantization.awq_tritonrG   vllm._custom_ops_is_flashinfer_available_is_sm100_supportedrH   
LoraConfig	getLoggerrU   r   r\   ro   Moduler   ra   rq   rr   r   r   r   r   r  r  rM  r{  r  
EntryClassrJ   rJ   rJ   rT   <module>   s    @

	
	+v" Im 6v    :