o
    پi                     @   st  d Z ddlZddlmZmZmZmZ ddlZddlm	  m
Z ddlm	Z	 ddlmZ ddlmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= ddl>m?Z? ddl@mAZAmBZB ddlCmDZD ddlEmFZFmGZG ddlHmIZI ddlJmKZKmLZLmMZM ddlNmOZO ddlPmQZQmRZRmSZSmTZT dZUeVeWZXeR ZYG d d! d!e	jZZ[G d"d# d#e	jZZ\G d$d% d%e	jZZ]G d&d' d'e	jZZ^G d(d) d)e	jZZ_G d*d+ d+e	jZZ`G d,d- d-e	jZZaeaZbdS ).zSGLang LLaDA2MoeModelLM model.    N)IterableOptionalTupleUnion)nn)PretrainedConfig)get_pp_group$get_tensor_model_parallel_world_sizeparallel_state tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)ModelConfigForExpertLocation)ExpertLocationDispatchInfo)
SiluAndMul)LayerCommunicatorLayerScatterModesenable_moe_dense_fully_dp)get_attention_dp_sizeget_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)get_deepep_modeget_moe_a2a_backend)get_moe_impl_class)FusedMoE)DeepEPDispatcher)TopK)QuantizationConfig)AttentionTypeRadixAttention)get_rope)PPMissingLayer)ParallelLMHeadVocabParallelEmbedding)get_is_capture_mode)ForwardBatchPPProxyTensors)default_weight_loader)apply_qk_normcreate_fused_set_kv_buffer_argenable_fused_set_kv_buffer)get_global_server_args)
add_prefixis_cudais_non_idle_and_non_emptymake_layersc                       s   e Zd Z					ddededee dee ded	ee d
ee ddf fddZ			dde
jdee dede
jfddZ  ZS )LLaDA2MoeMLPNT intermediate_sizeconfigquant_configreduce_resultsprefixtp_ranktp_sizereturnc              
      sz   t    || _t|j|gd |j|td|||d| _t||j|j||td|||d| _	|j
dkr7tdt | _d S )N   gate_up_projbiasr9   r;   r<   r=   	down_proj)rB   r:   r9   r;   r<   r=   siluz7Unsupported activation. Only silu is supported for now.)super__init__r=   r   hidden_sizeuse_biasr1   r@   r   rC   
hidden_act
ValueErrorr   act_fn)selfr7   r8   r9   r:   r;   r<   r=   	__class__ L/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/llada2.pyrF   X   s0   

	
zLLaDA2MoeMLP.__init__Fhidden_statesforward_batchuse_reduce_scatterc                 C   sJ   | j dkr|jd dkr|S | |\}}| |}| j||d\}}|S )N   r   )skip_all_reduce)r=   shaper@   rK   rC   )rL   rQ   rR   rS   gate_up_rO   rO   rP   forward}   s   

zLLaDA2MoeMLP.forward)NTr6   NNNF)__name__
__module____qualname__intr   r   r"   boolstrrF   torchTensorr*   rY   __classcell__rO   rO   rM   rP   r5   W   sD    	(r5   c                       s:   e Zd Z		d	deej def fddZdd Z  Z	S )
LLaDA2MoeGateNr6   params_dtyper;   c                    sv   t    |d u rt }|| _ttj|j|j	f| jd| _
t|ddr6ttj|jftjd| _d S d | _d S )N)dtypemoe_router_enable_expert_biasF)rE   rF   ra   get_default_dtypere   r   	Parameteremptynum_expertsrG   weightgetattrfloat32expert_bias)rL   r8   re   r;   rM   rO   rP   rF      s   



zLLaDA2MoeGate.__init__c                 C   s&   t || jj| jd |j}|S N)Flineartorl   rf   )rL   rQ   logitsrO   rO   rP   rY      s   zLLaDA2MoeGate.forwardNr6   )
r[   r\   r]   r   ra   rf   r`   rF   rY   rc   rO   rO   rM   rP   rd      s    rd   c                       s   e Zd Z			ddededee deejj	 de
f
 fdd	Z		
ddejdee dedejfddZdd ZdejfddZdejfddZdejdejfddZ	
ddejdedejfddZdejdedejfddZ  ZS ) LLaDA2MoeSparseMoeBlockNr6   layer_idr8   r9   
alt_streamr;   c                    s  t    || _|| _t | _|j| _|j| _|j	| _	|j
| _
t|dd| _t|dd | _|jdkr;td|j dt|dd }|d u rId | _n|dkrRtj| _ntj| _t jd	ks^J t|d
d	| _t|dd	| _| jd	ksv| jd	kr| jd	krd	| j  k r| jksJ  J d| _n	d  | _| _d| _|jt j | _t|| jtd|d| _| jjd ur| jjjnd | _| jd ur| jdkr| jd u s| jdkr| jd usJ dt | j| j| j| j| j| j| jd| _!t"|| j| j| j|j	|j#|| jtd|d| _$|j
d ur7t%|dr|j&}n|j#}||j
9 }t'd|||dtd|dt( ) r2t*d	ddni | _+t( ) r^t | _,t-t./ j0| jd| j|j| j |j	|j1t2 ddd
| _3d S d S )Nrouted_scaling_factorg      ?score_functionrD   zUnsupported activation: z!. Only silu is supported for now.router_dtypefp32r   n_group
topk_groupTFgate)r8   re   r;   softmaxsigmoidzdscore_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None))top_krenormalizeuse_grouped_topknum_expert_groupr~   correction_biasry   experts)rk   r   rw   rG   r7   r9   ry   r;   #moe_shared_expert_intermediate_sizeshared_experts)r7   r8   r9   r:   r;   rT   )r<   r=   )
grouprouter_topkpermute_fusionrk   num_local_expertsrG   re   deepep_modeasync_finishreturn_recv_hookrO   )4rE   rF   rw   rx   r	   r=   num_experts_per_tokr   norm_topk_probrG   num_shared_expertsrm   ry   rz   rI   rJ   r{   ra   rn   bfloat16r0   ep_num_redundant_expertsr   r~   r   rk   rd   r1   r   ro   datar   r!   topkr   moe_intermediate_sizer   hasattrr   r5   r   	is_deepepdictr   ep_sizer    r
   get_tp_groupdevice_grouptorch_dtyper   deepep_dispatcher)rL   rw   r8   r9   rx   r;   r{   r7   rM   rO   rP   rF      s   



 


z LLaDA2MoeSparseMoeBlock.__init__FrQ   rR   rS   r>   c                 C   s"   t   s| ||S | ||S rp   )r   r   forward_normalforward_deepep)rL   rQ   rR   rS   rO   rO   rP   rY   .  s   
zLLaDA2MoeSparseMoeBlock.forwardc                 C   s   dd | j  D S )Nc                 S   s   g | ]\}}|d vr|j qS ))r   )r   ).0namexrO   rO   rP   
<listcomp>:  s
    z;LLaDA2MoeSparseMoeBlock.get_moe_weights.<locals>.<listcomp>)r   named_parametersrL   rO   rO   rP   get_moe_weights9  s   z'LLaDA2MoeSparseMoeBlock.get_moe_weightsc                 C   s   d }| j dkr| |}|S )Nr   )r   r   )rL   rQ   shared_outputrO   rO   rP   _forward_shared_experts@  s   

z/LLaDA2MoeSparseMoeBlock._forward_shared_expertsc                 C   s"   |  |}| ||}| ||S rp   )r   r   r   )rL   rQ   router_logitstopk_outputrO   rO   rP   _forward_router_expertsF  s   
z/LLaDA2MoeSparseMoeBlock._forward_router_expertsc                 C   sp   t j }| j| | | }t j| j | |}W d    n1 s)w   Y  || j ||fS rp   )	ra   cudacurrent_streamrx   wait_streamr   clonestreamr   )rL   rQ   r   r   router_outputrO   rO   rP   forward_normal_dual_streamL  s   
z2LLaDA2MoeSparseMoeBlock.forward_normal_dual_streamc                 C   s   |j \}}|d|}| jd ur"|j d dkr"t r"| |\}}n
| |}| |}| jdkr5|| }| jdkr@|s@t	|}|||S )Nr   rT   )
rV   viewrx   r)   r   r   r   r   r=   r   )rL   rQ   rS   
num_tokensrG   final_hidden_statesr   rO   rO   rP   r   Z  s    





z&LLaDA2MoeSparseMoeBlock.forward_normalc                 C   s   d }|j }t||r)| |}| jdkr| |}| j|||jtj| j	dd}n| j
|j}| j||d}|d ur?||7 }|S )Nr   rw   )num_token_non_paddedexpert_location_dispatch_info)rQ   r   )forward_moder3   r   r   r   r   r   r   init_newrw   empty_topk_outputdevicer   )rL   rQ   rR   r   r   r   r   r   rO   rO   rP   r   u  s,   



	z&LLaDA2MoeSparseMoeBlock.forward_deepepNNr6   rZ   )F)r[   r\   r]   r^   r   r   r"   ra   r   Streamr`   rF   rb   r*   r_   rY   r   r   r   r   r   r   rc   rO   rO   rM   rP   rv      sd    
 


rv   c                       sn   e Zd Z					ddededee ded	ed
ee	j
j f fddZde	jde	jdede	jfddZ  ZS )LLaDA2MoeAttentionr   NTr6   r8   rw   r9   r:   r;   rx   c           	         s  t    |j| _|j| _|j| _t | _t	 }t
 }| j| dks$J | j|kr3| j| dks2J n	|| j dks<J | j| jksDJ | j| | _|jpR| j| j | _| j| j | _td| j| | _td| j| j | _| jd | _t|dd| _t| j| j| j| j|jp|j|td|||d	| _| jrt| j|jd| _t| j|jd| _t| j| j | j|j||td	|||d
| _t|drt | j|j! | _"nt|dr|j"| _"n| j| _"t#| j| j"|j$|j%|j&d| _'t(| j| j| j| j|t)j*td|d| _+|| _,d S )Nr   rT   g      use_qk_normTquery_key_valuerA   epsdense)rB   r9   r:   r;   r<   r=   partial_rotary_factor
rotary_dim)r   max_positionbaserope_scalingattn)num_kv_headsrw   	attn_typer;   )-rE   rF   rG   num_attention_headstotal_num_headsnum_key_value_headstotal_kv_headsr   dp_sizer   r   	num_headshead_dimq_sizemaxr   kv_sizescalerm   r   r   rH   use_qkv_biasr1   r   r   rms_norm_epsquery_layernormkey_layernormr   r   r   r^   r   r   r%   max_position_embeddings
rope_thetar   
rotary_embr$   r#   ENCODER_ONLYr   rx   )	rL   r8   rw   r9   r:   r;   rx   attn_tp_rankattn_tp_sizerM   rO   rP   rF     s   
	







zLLaDA2MoeAttention.__init__	positionsrQ   rR   r>   c              	   C   s   |j d dkr	|S | |\}}|j| j| j| jgdd\}}}| jr3t||| j| j| j	| j
d\}}| j|||t|rDt|| j|dnd d\}}| j||||t| d}	| |	\}
}|
S )Nr   r   )dim)qkq_normk_normr   rx   )valuelayerrR   )fused_set_kv_buffer_arg)save_kv_cache)rV   r   splitr   r   r   r-   r   r   r   rx   r   r/   r.   r   r   )rL   r   rQ   rR   qkvrX   r   r   vcontext_layerattn_outputrO   rO   rP   rY     sD    

zLLaDA2MoeAttention.forward)r   NTr6   N)r[   r\   r]   r   r^   r   r"   r_   r`   ra   r   r   rF   rb   r*   rY   rc   rO   rO   rM   rP   r     s8    
\r   c                       s   e Zd Z				ddededee dedeej	j
 f
 fd	d
ZdededefddZdejdejdedeej dejf
ddZ  ZS )LLaDA2MoeBlockr   Nr6   r8   rw   r9   r;   rx   c                    s>  t    |j}t||jd| _t | _t|||dt	d||d| _
|| _t | _t | _| j||d| _| j||d d}| j||d d}tj||j| j||d| _| j|jd k| _| jrnt||||t	d|d	| _nt rvd
\}	}
nd\}	}
t|j||t	d||	|
d| _t||jd| _t| j| j| jdd| _d S )Nr   F	attention)r:   r;   rx   r   rT   )rw   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparsemlp)rw   r8   r9   rx   r;   )r   rT   NN)r7   r8   r9   r;   r<   r=   T)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatter)rE   rF   rG   r   r   r   r   r   r   r1   r   rw   r   r   r   r   _is_layer_sparser   r   r   num_hidden_layersr   is_last_layerrv   r   r   r5   r7   r   r   layer_communicator)rL   r8   rw   r9   r;   rx   rG   r   r   mlp_tp_rankmlp_tp_sizerM   rO   rP   rF     sh   


	zLLaDA2MoeBlock.__init__r>   c                 C   s   |j d uo	||jkS rp   )rk   first_k_dense_replace)rL   r8   rw   rO   rO   rP   r  d  s   zLLaDA2MoeBlock._is_layer_sparser   rQ   rR   residualc                 C   st   | j j|||d\}}| j|||d}| j j|||d\}}| j |}| |||}| j j|||d\}}||fS )N)rQ   r  rR   )r   rQ   rR   )r  prepare_attnr   prepare_mlpshould_use_reduce_scatterr   postprocess_layer)rL   r   rQ   rR   r  rS   rO   rO   rP   rY   i  s2   


zLLaDA2MoeBlock.forward)r   Nr6   N)r[   r\   r]   r   r^   r   r"   r`   ra   r   r   rF   r_   r  rb   r*   rY   rc   rO   rO   rM   rP   r     s8    
Fr   c                       s~   e Zd Z			ddedee deejj de	f fddZ
		dd	ejd
ejdedejdee deejef fddZ  ZS )LLaDA2MoeModelNr6   r8   r9   rx   r;   c                    s   t    t | _| _j| _j| _| jjr)t	| j| jt
d|t d| _nt | _tjj| _tj fdd| jj| jjt
d|d\| _| _| _| jjrat| jjd| _d S tdd	| _d S )
Nword_embeddingsr9   r;   use_attn_tp_groupc                    s   t | | dS )N)rw   r8   r9   r;   rx   )r   )idxr;   rx   r8   r9   rO   rP   <lambda>  s    z)LLaDA2MoeModel.__init__.<locals>.<lambda>layers)pp_rankpp_sizer;   r   T)return_tuple)rE   rF   r   pp_groupr8   
vocab_sizerG   	embed_dimis_first_rankr(   r1   r   r  r&   ra   r   Dropoutembedding_dropoutr4   r  rank_in_group
world_sizer  start_layer	end_layeris_last_rankr   r   norm)rL   r8   r9   rx   r;   rM   r  rP   rF     s2   

zLLaDA2MoeModel.__init__	input_idsr   rR   input_embedspp_proxy_tensorsr>   c              	   C   s   | j jr|d u r| |}n|}d }n|d usJ |d }|d }t| j| jD ]&}t | | j| }	|	||||\}}W d    n1 sIw   Y  q(| j j	sZt
||dS |j sr|d u rj| |}|S | ||\}}
|S )NrQ   r  )rQ   r  )r  r  r  ranger   r!  r   with_current_layerr  r"  r+   r   is_idler#  )rL   r$  r   rR   r%  r&  rQ   r  ir   rX   rO   rO   rP   rY     s>   



zLLaDA2MoeModel.forwardr   r   )r[   r\   r]   r   r   r"   ra   r   r   r`   rF   rb   r*   r+   r   rY   rc   rO   rO   rM   rP   r    s8    
0r  c                       s   e Zd Z		ddedee def fddZedd	 Z	ed
d Z
dd Zdd Ze 		ddejdejdedejdee dejfddZdeeeejf  fddZedd Z  ZS )LLaDA2MoeModelLMNr6   r8   r9   r;   c                    s   t    t | _|| _|| _trtj	 nd }t
|||tddd| _|jr-| jj| _nt|j|j|td|t jd| _t|dd| _d S )Nmodelr6   )rx   r;   lm_headr  T)return_full_logits)rE   rF   r   r  r8   r9   _is_cudara   r   r   r  r1   r,  tie_word_embeddingsr  r-  r'   r  rG   r0   enable_dp_lm_headr   logits_processor)rL   r8   r9   r;   rx   rM   rO   rP   rF     s*   
zLLaDA2MoeModelLM.__init__c                 C      | j jS rp   )r,  r   r   rO   rO   rP   r        zLLaDA2MoeModelLM.start_layerc                 C   r3  rp   )r,  r!  r   rO   rO   rP   r!    r4  zLLaDA2MoeModelLM.end_layerc                 C   s   | j jj| jjfS )Used by the eagle_worker.)r,  r  rl   r-  r   rO   rO   rP   get_embed_and_head  s   z#LLaDA2MoeModelLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  dS )r5  N)r,  r  rl   r-  ra   r   empty_cachesynchronize)rL   embedheadrO   rO   rP   set_embed_and_head  s   

z#LLaDA2MoeModelLM.set_embed_and_headr$  r   rR   r%  r&  r>   c                 C   s2   | j |||||d}| jjr| ||| j|S |S )N)r&  )r,  r  r"  r2  r-  )rL   r$  r   rR   r%  r&  rQ   rO   rO   rP   rY   !  s   	
zLLaDA2MoeModelLM.forwardweightsc              	   C   s  ddg}t jddd| jjd}t|  }|D ]\}}d|v s+d|v s+| jjr,d	|v r,qt| jd
rL| jjrLd|v rLdd l	m
  m} |j|dddd}|D ]2\}}	}
|	|vrXqNd|v r]qN||	|}|drm||vrmqN||vrrqN|| }|j}||||
  nJ|D ])}|\}}	}}
|	|vrq||	|}||vrq|| }|j}|||||
|d  n|dr||vrq||vrq|| }t|dt}||| qdd t| jjD | _d S )N)r@   	gate_projr   )r@   up_projrT   r=  rC   r>  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerk   v_headinv_freqr-  	norm_headzlm_head.weightr   r?   gHz>)r   pr   zmlp.expertsz.bias)shard_id	expert_idweight_loaderc                 S   s2   i | ]\}}t |tst |jtr||j qS rO   )
isinstancer&   r   rv   r   )r   rw   r   rO   rO   rP   
<dictcomp>  s    

z1LLaDA2MoeModelLM.load_weights.<locals>.<dictcomp>)r   make_expert_params_mappingr8   rk   r   r   r0  r   rD  torch.nn.functionalr   
functional	normalizereplaceendswithrH  rm   r,   	enumerater,  r  routed_experts_weights_of_layer)rL   r<  stacked_params_mappingexpert_params_mappingparams_dictr   loaded_weightrq   
param_nameweight_namerF  paramrH  mappingrG  rO   rO   rP   load_weights8  s   


zLLaDA2MoeModelLM.load_weightsc                 C   s.   t |dd}t|j|j|dkrd dS |dS )Nr}   r   )r   num_logical_experts
num_groups)rm   r   r  rk   )clsr8   r]  rO   rO   rP   $get_model_config_for_expert_location  s   
z5LLaDA2MoeModelLM.get_model_config_for_expert_locationru   r   )r[   r\   r]   r   r   r"   r`   rF   propertyr   r!  r6  r;  ra   no_gradrb   r*   r+   rY   r   r   r[  classmethodr_  rc   rO   rO   rM   rP   r+    sF     

	^r+  )c__doc__loggingtypingr   r   r   r   ra   rL  r   rM  rq   transformersr   sglang.srt.distributedr   r	   r
   r   #sglang.srt.eplb.expert_distributionr   sglang.srt.eplb.expert_locationr   (sglang.srt.eplb.expert_location_dispatchr   sglang.srt.layers.activationr   sglang.srt.layers.communicatorr   r   r   sglang.srt.layers.dp_attentionr   r   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.moer   r   "sglang.srt.layers.moe.ep_moe.layerr   ,sglang.srt.layers.moe.fused_moe_triton.layerr   &sglang.srt.layers.moe.token_dispatcherr    sglang.srt.layers.moe.topkr!   *sglang.srt.layers.quantization.base_configr"   !sglang.srt.layers.radix_attentionr#   r$   "sglang.srt.layers.rotary_embeddingr%   sglang.srt.layers.utilsr&   *sglang.srt.layers.vocab_parallel_embeddingr'   r(   +sglang.srt.model_executor.cuda_graph_runnerr)   ,sglang.srt.model_executor.forward_batch_infor*   r+   $sglang.srt.model_loader.weight_utilsr,   sglang.srt.models.utilsr-   r.   r/   sglang.srt.server_argsr0   sglang.srt.utilsr1   r2   r3   r4   
LoraConfig	getLoggerr[   loggerr/  Moduler5   rd   rv   r   r   r  r+  
EntryClassrO   rO   rO   rP   <module>   s^   
7 h 
uY 6