o
    پiV                     @   s  d Z ddlZddlmZmZmZmZmZ ddlZddl	m
  mZ ddlm
Z
 ddlmZ ddlmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZDmEZE ddlFmGZG ddlHmIZImJZJ ddlKmLZL ddlMmNZNmOZOmPZP ddlQmRZR dd lSmTZTmUZUmVZVmWZW dZXeYeZZ[eU Z\G d!d" d"e
j]Z^G d#d$ d$e
j]Z_G d%d& d&e
j]Z`G d'd( d(e
j]ZaG d)d* d*e
j]ZbG d+d, d,e
j]ZcG d-d. d.e
j]ZdG d/d0 d0edZeG d1d2 d2edZfedeeefgZgdS )3zSGLang BailingMoE model.    N)IterableListOptionalTupleUnion)nn)PretrainedConfig)get_pp_group$get_tensor_model_parallel_world_sizeparallel_state tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)ModelConfigForExpertLocation)ExpertLocationDispatchInfo)
SiluAndMul)LayerCommunicatorLayerScatterModesenable_moe_dense_fully_dp)get_attention_dp_sizeget_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)get_deepep_modeget_moe_a2a_backend/should_use_flashinfer_cutlass_moe_fp4_allgather)get_moe_impl_class)FusedMoE)DeepEPDispatcher)TopK)%filter_moe_weight_param_global_expert)QuantizationConfig)RadixAttention)get_rope)PPMissingLayer)ParallelLMHeadVocabParallelEmbedding)get_is_capture_mode)ForwardBatchPPProxyTensors)default_weight_loader)apply_qk_normcreate_fused_set_kv_buffer_argenable_fused_set_kv_buffer)get_global_server_args)
add_prefixis_cudais_non_idle_and_non_emptymake_layersc                       s   e Zd Z					ddededee dee ded	ee d
ee ddf fddZ				dde
jdee dedede
jf
ddZ  ZS )BailingMoEMLPNT intermediate_sizeconfigquant_configreduce_resultsprefixtp_ranktp_sizereturnc              
      sz   t    || _t|j|gd |j|td|||d| _t||j|j||td|||d| _	|j
dkr7tdt | _d S )N   gate_up_projbiasr;   r=   r>   r?   	down_proj)rD   r<   r;   r=   r>   r?   siluz7Unsupported activation. Only silu is supported for now.)super__init__r?   r   hidden_sizeuse_biasr3   rB   r   rE   
hidden_act
ValueErrorr   act_fn)selfr9   r:   r;   r<   r=   r>   r?   	__class__ Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/bailing_moe.pyrH   ]   s0   

	
zBailingMoEMLP.__init__Fhidden_statesforward_batchshould_allreduce_fusionuse_reduce_scatterc                 C   sN   | j dkr|jd dkr|S | |\}}| |}| j||p |d\}}|S )N   r   )skip_all_reduce)r?   shaperB   rM   rE   )rN   rS   rT   rU   rV   gate_up_rQ   rQ   rR   forward   s   

zBailingMoEMLP.forward)NTr8   NNNFF)__name__
__module____qualname__intr   r   r%   boolstrrH   torchTensorr,   r\   __classcell__rQ   rQ   rO   rR   r7   \   sJ    	(r7   c                       s:   e Zd Z		d	deej def fddZdd Z  Z	S )
BailingMoEGateNr8   params_dtyper=   c                    sv   t    |d u rt }|| _ttj|j|j	f| jd| _
t|ddr6ttj|jftjd| _d S d | _d S )N)dtypemoe_router_enable_expert_biasF)rG   rH   rd   get_default_dtyperh   r   	Parameteremptynum_expertsrI   weightgetattrfloat32expert_bias)rN   r:   rh   r=   rO   rQ   rR   rH      s   



zBailingMoEGate.__init__c                 C   s&   t || jj| jd |j}|S N)Flineartoro   ri   )rN   rS   logitsrQ   rQ   rR   r\      s   zBailingMoEGate.forwardNr8   )
r^   r_   r`   r   rd   ri   rc   rH   r\   rf   rQ   rQ   rO   rR   rg      s    rg   c                       s   e Zd Z			ddededee deejj	 de
f
 fdd	Z		
	
ddejdee dededejf
ddZdd ZdejfddZdejfddZdejdejfddZ	
	
d dejdededejfddZdejdedejfddZ  ZS )!BailingMoESparseMoeBlockNr8   layer_idr:   r;   
alt_streamr=   c                    s  t    || _|| _t | _|j| _|j| _|j	| _	|j
| _
t|dd| _t|dd | _|jdkr;td|j dt|dd }|d u rId | _n|dkrRtj| _ntj| _t jd	ks^J t|d
d	| _t|dd	| _| jd	ksv| jd	kr| jd	krd	| j  k r| jksJ  J d| _n	d  | _| _d| _|jt j | _t|| jtd|d| _| jjd ur| jjjnd | _| jd ur| jdkr| jd u s| jdkr| jd usJ dt | j| j| j| j| j| j| jd| _!t"|| j| j| j|j	|j#|| jtd|d| _$|j
d ur7t%|dr|j&}n|j#}||j
9 }t'd|||dtd|dt( ) r2t*d	ddni | _+t( ) r^t | _,t-t./ j0| jd| j|j| j |j	|j1t2 ddd
| _3d S d S )Nrouted_scaling_factorg      ?score_functionrF   zUnsupported activation: z!. Only silu is supported for now.router_dtypefp32r   n_group
topk_groupTFgate)r:   rh   r=   softmaxsigmoidzdscore_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None))top_krenormalizeuse_grouped_topknum_expert_groupr   correction_biasr|   experts)rn   r   rz   rI   r9   r;   r|   r=   #moe_shared_expert_intermediate_sizeshared_experts)r9   r:   r;   r<   r=   rW   )r>   r?   )
grouprouter_topkpermute_fusionrn   num_local_expertsrI   rh   deepep_modeasync_finishreturn_recv_hookrQ   )4rG   rH   rz   r{   r
   r?   num_experts_per_tokr   norm_topk_probrI   num_shared_expertsrp   r|   r}   rK   rL   r~   rd   rq   bfloat16r2   ep_num_redundant_expertsr   r   r   rn   rg   r3   r   rr   datar   r#   topkr    moe_intermediate_sizer   hasattrr   r7   r   	is_deepepdictr   ep_sizer"   r   get_tp_groupdevice_grouptorch_dtyper   deepep_dispatcher)rN   rz   r:   r;   r{   r=   r~   r9   rO   rQ   rR   rH      s   



 


z!BailingMoESparseMoeBlock.__init__FrS   rT   rU   rV   r@   c                 C   s$   t   s| |||S | ||S rs   )r   r   forward_normalforward_deepep)rN   rS   rT   rU   rV   rQ   rQ   rR   r\   4  s   
z BailingMoESparseMoeBlock.forwardc                    s    fdd j  D S )Nc                    s.   g | ]\}}|d vrt || jjr|jqS ))r   )r$   r   r   r   ).0namexrN   rQ   rR   
<listcomp>E  s    
z<BailingMoESparseMoeBlock.get_moe_weights.<locals>.<listcomp>)r   named_parametersr   rQ   r   rR   get_moe_weightsD  s   
z(BailingMoESparseMoeBlock.get_moe_weightsc                 C   s   d }| j dkr| |}|S )Nr   )r   r   )rN   rS   shared_outputrQ   rQ   rR   _forward_shared_expertsN  s   

z0BailingMoESparseMoeBlock._forward_shared_expertsc                 C   s"   |  |}| ||}| ||S rs   )r   r   r   )rN   rS   router_logitstopk_outputrQ   rQ   rR   _forward_router_expertsT  s   
z0BailingMoESparseMoeBlock._forward_router_expertsc                 C   sp   t j }| j| | | }t j| j | |}W d    n1 s)w   Y  || j ||fS rs   )	rd   cudacurrent_streamr{   wait_streamr   clonestreamr   )rN   rS   r   r   router_outputrQ   rQ   rR   forward_normal_dual_streamZ  s   
z3BailingMoESparseMoeBlock.forward_normal_dual_streamc                 C   s   |j \}}|d|}| jd ur"|j d dkr"t r"| |\}}n
| |}| |}| jdkr5|| }| jdkrE|sE|sEt	 sEt
|}|||S )Nr   rW   )rY   viewr{   r+   r   r   r   r   r?   r   r   )rN   rS   rU   rV   
num_tokensrI   final_hidden_statesr   rQ   rQ   rR   r   h  s,   






z'BailingMoESparseMoeBlock.forward_normalc                 C   s   d }|j }t||r)| |}| jdkr| |}| j|||jtj| j	dd}n| j
|j}| j||d}|d ur?||7 }|S )Nr   )rz   )num_token_non_paddedexpert_location_dispatch_info)rS   r   )forward_moder5   r   r   r   r   r   r   init_newrz   empty_topk_outputdevicer   )rN   rS   rT   r   r   r   r   r   rQ   rQ   rR   r     s,   



	z'BailingMoESparseMoeBlock.forward_deepepNNr8   r]   )FF)r^   r_   r`   ra   r   r   r%   rd   r   Streamrc   rH   re   r,   rb   r\   r   r   r   r   r   r   rf   rQ   rQ   rO   rR   ry      sp    
 



!ry   c                       sn   e Zd Z					ddededee ded	ed
ee	j
j f fddZde	jde	jdede	jfddZ  ZS )BailingMoEAttentionr   NTr8   r:   rz   r;   r<   r=   r{   c           	         s  t    |j| _|j| _|j| _t | _t	 }t
 }| j| dks$J | j|kr3| j| dks2J n	|| j dks<J | j| jksDJ | j| | _|jpR| j| j | _| j| j | _td| j| | _td| j| j | _| jd | _t|dd| _t| j| j| j| j|jp|j|td|||d	| _| jrt| j|jd| _t| j|jd| _t| j| j | j|j||td	|||d
| _t|drt | j|j! | _"nt|dr|j"| _"n| j| _"t#| j| j"|j$|j%|j&d| _'t(| j| j| j| j|td|d| _)|| _*d S )Nr   rW   g      use_qk_normFquery_key_valuerC   epsdense)rD   r;   r<   r=   r>   r?   partial_rotary_factor
rotary_dim)r   max_positionbaserope_scalingattn)num_kv_headsrz   r=   )+rG   rH   rI   num_attention_headstotal_num_headsnum_key_value_headstotal_kv_headsr   dp_sizer   r   	num_headshead_dimq_sizemaxr   kv_sizescalerp   r   r   rJ   use_qkv_biasr3   r   r   rms_norm_epsquery_layernormkey_layernormr   r   r   ra   r   r   r'   max_position_embeddings
rope_thetar   
rotary_embr&   r   r{   )	rN   r:   rz   r;   r<   r=   r{   attn_tp_rankattn_tp_sizerO   rQ   rR   rH     s   
	






	zBailingMoEAttention.__init__	positionsrS   rT   r@   c              	   C   s   |j d dkr	|S | |\}}|j| j| j| jgdd\}}}| jr3t||| j| j| j	| j
d\}}| j|||t|rDt|| j|dnd d\}}| j||||t| d}	| |	\}
}|
S )Nr   r   )dim)qkq_normk_normr   r{   )valuelayerrT   )fused_set_kv_buffer_arg)save_kv_cache)rY   r   splitr   r   r   r/   r   r   r   r{   r   r1   r0   r   r   )rN   r   rS   rT   qkvr[   r   r   vcontext_layerattn_outputrQ   rQ   rR   r\     sD    

zBailingMoEAttention.forward)r   NTr8   N)r^   r_   r`   r   ra   r   r%   rb   rc   rd   r   r   rH   re   r,   r\   rf   rQ   rQ   rO   rR   r     s8    
[r   c                       s   e Zd Z				ddededee dedeej	j
 f
 fd	d
ZdedededefddZ	ddejdejdedeej deeej  dejfddZ  ZS )BailingMoEBlockr   Nr8   r:   rz   r;   r=   r{   c                    sZ  t    || _|j}t||jd| _t | _t	|||dt
d||d| _|| _t | _t | _| j||dd| _| j||d dd}| j||d dd}tj||j| j||d| _| j|jd k| _| jrtt||||t
d|d	| _nt r|d
\}	}
nd\}	}
t|j||t
d||	|
d| _t||jd| _t| j| j| jd| j| jjd kd| _d S )Nr   F	attention)r<   r=   r{   )rz   is_nextnrW   )rz   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparsemlp)rz   r:   r;   r{   r=   )r   rW   NN)r9   r:   r;   r=   r>   r?   T)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatteris_last_layer) rG   rH   r:   rI   r   r   r   r   r   r   r3   r   rz   r   r   r   r   _is_layer_sparser   r   r   num_hidden_layersr   r  ry   r   r   r7   r9   r   r   layer_communicator)rN   r:   rz   r;   r=   r{   rI   r   r   mlp_tp_rankmlp_tp_sizerO   rQ   rR   rH   1  sx   




	zBailingMoEBlock.__init__r   r@   c                 C   s   |p|j d uo||jkS rs   )rn   first_k_dense_replace)rN   r:   rz   r   rQ   rQ   rR   r    s   z BailingMoEBlock._is_layer_sparser   rS   rT   residualcaptured_last_layer_outputsc                 C   s   | j j||||d\}}|jd dkr| j|||d}| j j|||d\}}| j |}| j |}| ||||}|rCd|_||fS | j 	|||\}}||fS )Nr
  r   )r   rS   rT   )rS   r	  rT   T)
r  +prepare_attn_and_capture_last_layer_outputsrY   r   prepare_mlp)should_fuse_mlp_allreduce_with_next_layershould_use_reduce_scatterr   _sglang_needs_allreduce_fusionpostprocess_layer)rN   r   rS   rT   r	  r
  rU   rV   rQ   rQ   rR   r\     sF   		
zBailingMoEBlock.forward)r   Nr8   Nrs   )r^   r_   r`   r   ra   r   r%   rc   rd   r   r   rH   rb   r  re   r,   r   r\   rf   rQ   rQ   rO   rR   r   0  sN    
N
r   c                       s~   e Zd Z			ddedee deejj de	f fddZ
		dd	ejd
ejdedejdee deejef fddZ  ZS )BailingMoEModelNr8   r:   r;   r{   r=   c                    s   t    t | _| _j| _j| _| jjr)t	| j| jt
d|t d| _nt | _tjj| _tj fdd| jj| jjt
d|d\| _| _| _| jjr`t| jjd| _ntdd	| _g | _d S )
Nword_embeddingsr;   r=   use_attn_tp_groupc                    s   t | | dS )N)rz   r:   r;   r=   r{   )r   )idxr=   r{   r:   r;   rQ   rR   <lambda>  s    z*BailingMoEModel.__init__.<locals>.<lambda>layers)pp_rankpp_sizer=   r   T)return_tuple)rG   rH   r	   pp_groupr:   
vocab_sizerI   	embed_dimis_first_rankr*   r3   r   r  r(   rd   r   Dropoutembedding_dropoutr6   r  rank_in_group
world_sizer  start_layer	end_layeris_last_rankr   r   normlayers_to_capture)rN   r:   r;   r{   r=   rO   r  rR   rH     s4   


zBailingMoEModel.__init__	input_idsr   rT   input_embedspp_proxy_tensorsr@   c                 C   s6  | j jr|d u r| |}n|}d }n|d usJ |d }|d }g }t| j| jD ]B}	t |	2 |	| jv rE|	|d u r@|n||  | j
|	 }
|
||||t|
ddrW|nd d\}}W d    n1 sgw   Y  q*| j jsxt||dS |j s|d u r| |}n| ||\}}t|dkr|S ||fS )NrS   r	  _is_layer_to_captureFr  )rS   r	  r   )r  r   r  ranger%  r&  r   with_current_layerr)  appendr  rp   r'  r-   r   is_idler(  len)rN   r*  r   rT   r+  r,  rS   r	  aux_hidden_statesir   r[   rQ   rQ   rR   r\     sP   



zBailingMoEModel.forwardr   r   )r^   r_   r`   r   r   r%   rd   r   r   rc   rH   re   r,   r-   r   r\   rf   rQ   rQ   rO   rR   r    s8    
2r  c                       s   e Zd Z		d!dedee def fddZedd	 Z	ed
d Z
dd Zdd Ze 		d"dejdejdedejdee dejfddZd#deeeejf  fddZedd Zd$deee  fdd Z  ZS )%BailingMoEForCausalLMNr8   r:   r;   r=   c                    s   t    t | _|| _|| _trtj	 nd }t
|||tddd| _|jr-| jj| _nt|j|j|td|t jd| _t|| _d| _d S )Nmodelr8   )r{   r=   lm_headr  F)rG   rH   r	   r  r:   r;   _is_cudard   r   r   r  r3   r6  tie_word_embeddingsr  r7  r)   r  rI   r2   enable_dp_lm_headr   logits_processorcapture_aux_hidden_states)rN   r:   r;   r=   r{   rO   rQ   rR   rH   &  s,   


zBailingMoEForCausalLM.__init__c                 C      | j jS rs   )r6  r%  r   rQ   rQ   rR   r%  I     z!BailingMoEForCausalLM.start_layerc                 C   r=  rs   )r6  r&  r   rQ   rQ   rR   r&  M  r>  zBailingMoEForCausalLM.end_layerc                 C   s   | j jj| jjfS )Used by the eagle_worker.)r6  r  ro   r7  r   rQ   rQ   rR   get_embed_and_headQ  s   z(BailingMoEForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  dS )r?  N)r6  r  ro   r7  rd   r   empty_cachesynchronize)rN   embedheadrQ   rQ   rR   set_embed_and_headU  s   

z(BailingMoEForCausalLM.set_embed_and_headr*  r   rT   r+  r,  r@   c                 C   sF   | j |||||d}d }| jr|\}}| jjr!| ||| j||S |S )N)r,  )r6  r<  r  r'  r;  r7  )rN   r*  r   rT   r+  r,  rS   r3  rQ   rQ   rR   r\   ^  s   	zBailingMoEForCausalLM.forwardFweightsc              	   C   s  |r%t | jdr!| jj}|dksJ d| jjdkrdn| jj}ntdddg}|r4d| }g d	}tjd
dd| jjd}t| 	 }	|D ]\}
}d|
v s[d|
v s[| jj
r\d|
v r\qGt | jdr|| jjr|d|
v r|dd lm  m} |j|dddd}|r|
|sqGd|
v sd|
v rqGd}|D ]}||
v r|
|d}
d} nq|r|
|d}
|D ]2\}}}||
vrqd|
v rq|
||}
|
dr|
|	vrq|
|	vrq|	|
 }|j}||||  nM|D ])}|\}}}}||
vrq|
||}
|
|	vrq|	|
 }|j}||||
||d  n!|
dr|
|	vrqG|
|	vrqG|	|
 }t|dt}||| qG|s>d d! t| jjD | _d S d S )"Nnum_nextn_predict_layersrW   zOnly 1 nextn layer is supportedr   z-num_nextn_predict_layers is not in the config)rB   	gate_projr   )rB   up_projrW   zmodel.layers.)final_layernormeh_projenormhnormrH  rE   rI  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namern   v_headinv_freqr7  	norm_headzlm_head.weightrA   gHz>)r   pr   zshared_head.headembed_tokensTr6  Fzmodel.decoderzmlp.expertsz.bias)shard_id	expert_idweight_loaderc                 S   s2   i | ]\}}t |tst |jtr||j qS rQ   )
isinstancer(   r   ry   r   )r   rz   r   rQ   rQ   rR   
<dictcomp>  s    

z6BailingMoEForCausalLM.load_weights.<locals>.<dictcomp>)r   r:   rG  r  rL   r!   make_expert_params_mappingrn   r   r   r9  rS  torch.nn.functionalr   
functional	normalize
startswithreplaceendswithrX  rp   r.   	enumerater6  r  routed_experts_weights_of_layer)rN   rF  r   num_nextn_layersnextn_layer_idstacked_params_mappingnextn_layer_prefixnextn_spec_weight_namesexpert_params_mappingparams_dictr   loaded_weightrt   
is_decoderweight_name
param_namerV  paramrX  mappingrW  rQ   rQ   rR   load_weightsz  s   





z"BailingMoEForCausalLM.load_weightsc                 C   s.   t |dd}t|j|j|dkrd dS |dS )Nr   r   )r   num_logical_experts
num_groups)rp   r   r  rn   )clsr:   rs  rQ   rQ   rR   $get_model_config_for_expert_location  s   
z:BailingMoEForCausalLM.get_model_config_for_expert_location	layer_idsc                 C   sR   | j jsd S d| _|d u r| jj}d|d |d g| j_d S dd |D | j_d S )NTrA      c                 S   s   g | ]}|d  qS )rW   rQ   )r   valrQ   rQ   rR   r     s    zFBailingMoEForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)r  r'  r<  r:   r  r6  r)  )rN   rv  r   rQ   rQ   rR   set_eagle3_layers_to_capture
  s   z2BailingMoEForCausalLM.set_eagle3_layers_to_capturerx   r   )Frs   )r^   r_   r`   r   r   r%   rc   rH   propertyr%  r&  r@  rE  rd   no_gradre   r,   r-   r\   r   r   rq  classmethodru  r   ra   ry  rf   rQ   rQ   rO   rR   r5  %  sJ    #

	 
 r5  c                   @      e Zd ZdS )BailingMoeForCausalLMNr^   r_   r`   rQ   rQ   rQ   rR   r~        r~  c                   @   r}  )BailingMoeV2ForCausalLMNr  rQ   rQ   rQ   rR   r    r  r  )h__doc__loggingtypingr   r   r   r   r   rd   r\  r   r]  rt   transformersr   sglang.srt.distributedr	   r
   r   r   #sglang.srt.eplb.expert_distributionr   sglang.srt.eplb.expert_locationr   (sglang.srt.eplb.expert_location_dispatchr   sglang.srt.layers.activationr   sglang.srt.layers.communicatorr   r   r   sglang.srt.layers.dp_attentionr   r   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.moer   r   r   "sglang.srt.layers.moe.ep_moe.layerr    ,sglang.srt.layers.moe.fused_moe_triton.layerr!   &sglang.srt.layers.moe.token_dispatcherr"   sglang.srt.layers.moe.topkr#   sglang.srt.layers.moe.utilsr$   *sglang.srt.layers.quantization.base_configr%   !sglang.srt.layers.radix_attentionr&   "sglang.srt.layers.rotary_embeddingr'   sglang.srt.layers.utilsr(   *sglang.srt.layers.vocab_parallel_embeddingr)   r*   +sglang.srt.model_executor.cuda_graph_runnerr+   ,sglang.srt.model_executor.forward_batch_infor,   r-   $sglang.srt.model_loader.weight_utilsr.   sglang.srt.models.utilsr/   r0   r1   sglang.srt.server_argsr2   sglang.srt.utilsr3   r4   r5   r6   
LoraConfig	getLoggerr^   loggerr8  Moduler7   rg   ry   r   r   r  r5  r~  r  
EntryClassrQ   rQ   rQ   rR   <module>   sf   
8 v 	 h t