o
    پi                     @   s>  d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z	d dl
m  mZ d dl	mZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZm Z  d d
l!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZAmBZB d dlCmDZDmEZE d dlFmGZG d dlHmIZI d dlJmKZKmLZLmMZMmNZN dZOe PeQZReL ZSG dd dejTZUG dd dejTZVG dd  d ejTZWG d!d" d"ejTZXG d#d$ d$ejTZYG d%d& d&ejTZZeZZ[dS )'    N)AnyDictIterableOptionalTupleUnion)nn)"get_moe_expert_parallel_world_sizeget_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)ExpertLocationDispatchInfo)
SiluAndMul)LayerCommunicatorLayerScatterModes)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)GemmaRMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)get_moe_a2a_backend/should_use_flashinfer_cutlass_moe_fp4_allgather)get_moe_impl_class)FusedMoE)StandardTopKOutputTopK)RoutingMethodType%filter_moe_weight_param_global_expert)QuantizationConfig)RadixAttention)get_rope)PPMissingLayer)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loader)get_global_server_args)
add_prefixis_cudais_non_idle_and_non_emptymake_layersc                       sN   e Zd Z			ddededee dee deddf fd	d
Zdd Z	  Z
S )
Step3p5MLPN hidden_sizeintermediate_sizeswiglu_limitquant_configprefixreturnc                    sb   t    || _|| _t||gd d|td|d| _t||d|td|d| _t	 | _
|| _d S )N   Fgate_up_projbiasr8   r9   	down_proj)super__init__r5   r6   r   r/   r<   r   r?   r   act_fnlimit)selfr5   r6   r7   r8   r9   	__class__ M/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/step3p5.pyrA   A   s&   

zStep3p5MLP.__init__c                 C   s   | j d ur7| |\}}|jddd\}}t|}|jd | j d}|j| j  | j d}| || \}}|S | |\}}| |}| |\}}|S )Nr;   dim)minmax)rC   r<   chunkFsiluclampr?   rB   )rD   xgate_up_gateupoutputrG   rG   rH   forward]   s   


zStep3p5MLP.forward)NNr4   )__name__
__module____qualname__intr   floatr%   strrA   rX   __classcell__rG   rG   rE   rH   r3   @   s$    r3   c                       s   e Zd Z		d&dedee def fddZ			d'd	ej	d
ee
 dededej	f
ddZdd Z		d(d	ej	dededej	fddZd	ej	d
e
dej	fddZdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Z  ZS ))Step3p5MoEMLPNr4   layer_idr8   r9   c                    sB  t    t | _|| _|j| _|j| _|j| _| jr*t	j
tj|jtjddd| _| j|jkr=td| j d|j d|j| | _| jdkrK| jnd | _t|jddd	| jd|d
| _t||jt j |j||j|j|td|tj| jd	| _t|j|jdd td|d| _ t! " rt# | _$|jt j | _|j| _%d S d S )N)dtypeF)requires_gradzTensor parallel size z' is greater than the number of experts .r   Tsigmoid)top_krenormalizeuse_grouped_topkscoring_funccorrection_bias%apply_routed_scaling_factor_on_outputra   experts)	num_expertsrf   ra   r5   r6   r8   r9   routing_method_typegemm1_clamp_limitrU   r=   )&r@   rA   r   tp_sizera   need_fp32_gatemoe_router_scaling_factorrouted_scaling_factoruse_moe_router_biasr   	Parametertorchzerosmoe_num_expertsfloat32router_bias
ValueErrorswiglu_limitsrC   r"   	moe_top_ktopkr   r.   ep_num_redundant_expertsr5   moe_intermediate_sizer/   r#   Renormalizerl   r   rU   r   	is_deepepr	   ep_sizerf   )rD   configra   r8   r9   rE   rG   rH   rA   m   sp   



zStep3p5MoEMLP.__init__Fhidden_statesforward_batchshould_allreduce_fusionuse_reduce_scatterr:   c                 C   s.   t   st   s| |||S | ||S N)r   r   is_ascend_fuseepforward_normalforward_deepep)rD   r   r   r   r   rG   rG   rH   rX      s   	zStep3p5MoEMLP.forwardc                    s    fdd j  D S )Nc                    s.   g | ]\}}|d vrt || jjr|jqS ))rj   )r$   rl   num_local_expertsdata).0namerR   rD   rG   rH   
<listcomp>   s    
z1Step3p5MoEMLP.get_moe_weights.<locals>.<listcomp>)rl   named_parametersr   rG   r   rH   get_moe_weights   s   
zStep3p5MoEMLP.get_moe_weightsc           
      C   s   |j \}}|d|}| jr!t|tj| jj	 tj}n| |\}}| 
||}| jdkr@t|j| j |j|jd}| ||}	| jdkrV|sV|sVt sVt|	}	|	||S )NrI         ?)topk_weightstopk_idsrouter_logits   )shapeviewrq   rv   matmultory   rU   weighttr~   rs   r!   r   r   r   rl   rp   r   r   )
rD   r   r   r   
num_tokens
hidden_dimr   rT   topk_outputfinal_hidden_statesrG   rG   rH   r      s0   



zStep3p5MoEMLP.forward_normalc                 C   s\   |j d dkr| |\}}| j|||jtj| jdd}n| j|j}| j	||d}|S )Nr   ra   )num_token_non_paddedexpert_location_dispatch_info)r   r   )
r   rU   r~   r   r   init_newra   empty_topk_outputdevicerl   )rD   r   r   r   rT   r   r   rG   rG   rH   r      s    	zStep3p5MoEMLP.forward_deepepc                 C   s0   t |jj|jr| |j\|_}d S d |_d S r   )r1   r   forward_modehidden_states_mlp_inputrU   r   )rD   staterT   rG   rG   rH   op_gate  s
   

zStep3p5MoEMLP.op_gatec                 C   s   | d}|j}|d ur7t | j | j|||jjtj	| jdd|_
W d    d S 1 s0w   Y  d S | j|j|_
d S )Nr   r   )r   r   r   r   )popr   r   with_current_layerra   r~   r   r   r   r   r   r   r   )rD   r   r   r   rG   rG   rH   op_select_experts  s    

"zStep3p5MoEMLP.op_select_expertsc                 C   s8   | j dkr| jjj|d|d|dd d S d S )Nr   r   r   tbo_subbatch_index)r   r   r   )r   rl   
dispatcher
dispatch_ar   getrD   r   rG   rG   rH   op_dispatch_a!  s   

zStep3p5MoEMLP.op_dispatch_ac                 C   sZ   | j dkr+t | j | jjj|dd|_W d    d S 1 s$w   Y  d S d S Nr   r   )r   )	r   r   r   ra   rl   r   
dispatch_br   dispatch_outputr   rG   rG   rH   op_dispatch_b)  s   

"zStep3p5MoEMLP.op_dispatch_bc                 C   s   | j j|jd|_d S )N)r   )rl   run_moe_corer   combine_inputr   rG   rG   rH   
op_experts2  s   zStep3p5MoEMLP.op_expertsc                 C   s:   | j dkr| jjj|d|dd |d d S d S )Nr   r   r   )r   r   r   )r   rl   r   	combine_ar   r   r   rG   rG   rH   op_combine_a7  s   
zStep3p5MoEMLP.op_combine_ac                 C   s*   | j dkr| jjj|dd|_d S d S r   )r   rl   r   	combine_br   hidden_states_after_combiner   rG   rG   rH   op_combine_b?  s
   
zStep3p5MoEMLP.op_combine_bc                 C   s   | d|_d S )Nr   )r   hidden_states_mlp_outputr   rG   rG   rH   	op_outputE  s   zStep3p5MoEMLP.op_outputNr4   )NFF)FF)rY   rZ   r[   r\   r   r%   r^   rA   rv   Tensorr+   boolrX   r   r   r   r   r   r   r   r   r   r   r   r_   rG   rG   rE   rH   r`   l   sf    H

"
		r`   c                !       s   e Zd Z												d"d	ed
ededededeeeef  dee dedee	 dedede
dededeejj ddf  fddZdd Zdejdejdedejfd d!Z  ZS )#Step3p5Attentionr   @B N   r   FrI   r4   r5   	num_headsnum_kv_headsra   
rope_thetarope_scalinghead_dimmax_position_embeddingsr8   rms_norm_epspartial_rotary_factoruse_head_wise_attn_gatesliding_window_sizer9   
alt_streamr:   c                    s  t    || _t | _|| _t }t }| j| dksJ | j| | _|| _	| j	|kr6| j	| dks5J n	|| j	 dks?J t
d| j	| | _|pN|| j | _| j| j | _| j| j | _| jd | _|| _|| _t | _t| j|
d| _t| j|
d| _t|| j| j| j	d|	||td|d	| _t| j| j |d|	||td|d| _|| _| jrt|| jd||td	|d
| _t| j| j||||dd| _ t!| j| j| j| j||td|d| _"|| _#d S )Nr   r   g      ࿩epsFqkv_proj)r>   r8   tp_rankrp   r9   o_projg_proj)r>   r   rp   r9   T)
rotary_dimmax_positionbaser   r   is_neox_styleattn)r   r   ra   r9   )$r@   rA   r5   r   rp   total_num_headsr   r   r   total_num_kv_headsrM   r   r   q_sizekv_sizescalingr   r   r   r   r   q_normk_normr   r/   r   r   r   r   r   r   r'   
rotary_embr&   r   r   )rD   r5   r   r   ra   r   r   r   r   r8   r   r   r   r   r9   r   attn_tp_rankattn_tp_sizerE   rG   rH   rA   J  s   



		
	zStep3p5Attention.__init__c           
      C   s   |  |\}}|j| j| j| jgdd\}}}|j|j}}	| |d| j|}| |d| j|	}| 	|||\}}|||fS )NrI   rJ   )
r   splitr   r   r   r   reshaper   r   r   )
rD   	positionsr   qkvrT   qkvq_shapek_shaperG   rG   rH   forward_prepare_native  s    
z'Step3p5Attention.forward_prepare_nativer   r   r   c                 C   s   | j ||d\}}}| jr| |\}}| ||||}	| jr8|	|	jd | j| j|d	  }
|
j|	j }	| 
|	\}
}|
S )N)r   r   r   rI   )r   r   r   r   r   r   r   r   	unsqueezere   r   )rD   r   r   r   r   r   r   gate_statesrT   attn_outputrW   rG   rG   rH   rX     s&   zStep3p5Attention.forward)r   r   NNr   NNr   FrI   r4   N)rY   rZ   r[   r\   r]   r   r   r^   r   r%   r   rv   cudaStreamrA   r   r   r+   rX   r_   rG   rG   rE   rH   r   I  sp    	

c	r   c                       s   e Zd Z				ddededee dedeej	j
 d	df fd
dZ	ddedeej dee d	dfddZ	ddejdejdedeej deej d	eejejf fddZ  ZS )Step3p5DecoderLayerr   Nr4   r   ra   r8   r9   r   r:   c                    s$  t    |j| _|j}|j}|| |vrd }n|j}|j}	|j}
|j}dd |j	
dD }|j| _|j| _||v | _|j}|jrV|j| d urV|j| dkrV|j| }nd }d| _|| dk}|rs|j| _|jd | _|jd | _t| j| j| j||k r|n|| |	| |||
| j|j| ||j|jtd	||d
| _d| _| jrt|||td|d| _t| j|j||td|d| _d| _nt| j|j||td|d| _ t!|j|jd| _"t!|j|jd| _#t$j%|||jk r|jnddddd| _&t'| j&| j"| j#d| _(|| _)t*j+,ddk| _-d| _.d S )Nc                 S   s   g | ]}t |qS rG   )r\   )r   rR   rG   rG   rH   r     s    z0Step3p5DecoderLayer.__init__.<locals>.<listcomp>,r   rI   sliding_attentionnum_attention_headsnum_attention_groups	self_attn)r5   r   r   ra   r   r   r   r   r   r   r8   r   r   r9   r   Fmlp)ra   r8   r9   share_expert)r5   r6   r7   r8   r9   Tr   r   )ra   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparse)layer_scatter_modesinput_layernormpost_attention_layernorm SGLANG_DUMP_STEP3P5_INTERMEDIATE1)/r@   rA   r5   layer_typesyarn_only_typesr   r   r   r   moe_layers_enumr   r   r   num_key_value_headsis_moe_layernum_hidden_layersswiglu_limits_sharedsliding_windowattention_other_settingr   partial_rotary_factorsr   r   r/   r   use_moer`   moer3   share_expert_dimr  r6   r   r   r  r  r   r   r  r   layer_communicatorra   osenvironr   dump_intermediate
_dump_step)rD   r   ra   r8   r9   r   r  r  r   r   r   r   moe_layers_listr  swiglu_limit_sharedenable_sliding_windowrE   rG   rH   rA     s   

	
zStep3p5DecoderLayer.__init__r   tensorstep_idc                 C   s   | j r|d u st|sd S d}z6tj|dd t }|d ur$d| nd}tj|d| j | d| d| d	}t	|
  | W d S  tyX   td
|| j Y d S w )Nz/sgl-workspace/sglT)exist_ok_stepr4   step3p5_layerrT   _tpz.ptz%Failed to dump tensor %s for layer %s)r  rv   	is_tensorr  makedirsr   pathjoinra   savedetachcpu	Exceptionlogger	exception)rD   r   r   r!  dump_dirr   	step_partr(  rG   rG   rH   _dump_tensorJ  s"   
z Step3p5DecoderLayer._dump_tensorr   r   r   residualpost_residual_additionc           	      C   s  | j j||||d\}}d }| jr"| j}|  jd7  _| d|| |jd dkr1| j|||d}| d|| || }|}| d|| | |}| d|| | jrc| 	|}| 
|}|| }n| |}| d	|| | j |||\}}| d
|| ||fS )N)r4  r   
attn_inputr   )r   r   r   r   post_attn_residual	mlp_input
mlp_outputlayer_output)r  prepare_attnr  r  r2  r   r   r  r  r  r  r   postprocess_layer)	rD   r   r   r   r3  r4  	dump_stepshare_output
moe_outputrG   rG   rH   rX   a  sD   	





zStep3p5DecoderLayer.forward)r   Nr4   Nr   )rY   rZ   r[   Step3p5Configr\   r   r%   r^   rv   r   r   rA   r   r2  r+   r   rX   r_   rG   rG   rE   rH   r     sT    
{
r   c                       s   e Zd Z		ddee deddf fddZdejdejfd	d
Z	de
jfddZ		ddejdejdedejdee deejef fddZ  ZS )Step3p5ModelNr4   r8   r9   r:   c                    s   t    | _j| _j| _t | _trt	j
 nd  | jjr=tjjt  td|t jd ur7t	jnd d| _nt | _tj fdd| jj| jjtd|d\| _| _| _| jjrmtjjd| _ d S tdd	| _ d S )
Nembed_tokens)r8   	enable_tpr9   params_dtypec                    s   t | | dS )N)ra   r   r8   r9   r   )r   )idxr9   r   r   r8   rG   rH   <lambda>  s    z'Step3p5Model.__init__.<locals>.<lambda>layers)pp_rankpp_sizer9   r   T)return_tuple)!r@   rA   r   pad_token_idpadding_idx
vocab_sizer
   pp_group_is_cudarv   r   r   is_first_rankr*   r5   r   r/   r.   rl_on_policy_targetry   rA  r(   r2   r  rank_in_group
world_sizerG  start_layer	end_layeris_last_rankr   r   norm)rD   r   r8   r9   rE   rE  rH   rA     s8   

zStep3p5Model.__init__	input_idsc                 C   s,   t | jdr|  || jj S |  |S )N	scale_emb)hasattrr   get_input_embeddingsrY  )rD   rX  rG   rG   rH   get_input_embedding  s   z Step3p5Model.get_input_embeddingc                 C   s   | j S r   )rA  r   rG   rG   rH   r[    s   z!Step3p5Model.get_input_embeddingsr   r   input_embedspp_proxy_tensorsc                 C   s   | j jr|d u r| |}n|}d }n|d usJ |d }|d }t| j| jD ]}| j| }	|	||||\}}q(| j jsDt||dS d }
| j jsQt||dS |j	d dkrw|d u r^|n|| }
|d u ro| 
|}||
fS | 
||\}}||
fS )Nr   r3  )r   r3  r   )rN  rP  rA  rangerT  rU  rG  rV  r,   r   rW  )rD   rX  r   r   r]  r^  r   r3  ilayerhidden_states_before_normrT   rG   rG   rH   rX     sJ   


zStep3p5Model.forwardr   NN)rY   rZ   r[   r   r%   r^   rA   rv   r   r\  r   	Embeddingr[  r+   r,   r   rX   r_   rG   rG   rE   rH   r@    s6    1r@  c                       s   e Zd Zg dZddddddZ			d%d
edee deddf fddZ	de
jfddZe 		d&dejdejdedejdee dejfddZedd Zedd Zd'deeeejf  fdd Zd!d" Zd#d$ Z  ZS )(Step3p5ForCausalLM)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.)r   r   )r   r   )r   r;   )r<   r   )r<   r   )q_projk_projv_proj	gate_projup_projNr4   r   r8   r9   r:   c                    s  t    t | _|| _|| _t||td|d| _d| _	d| _
| jjrE| jjdkr3| j	r3| jj| _nt|j|j|t jtd|d| _nt | _| jjdkr| j	r| jjrf| jj| jjj| jjd d n| jjr| jj| jjjt| j jdd	}| jj| t|| _d S )
Nmodel)r8   r9   Fr   r   lm_head)r8   use_attn_tp_groupr9   )dst)sizerb   src) r@   rA   r
   rN  r   r8   r@  r/   rk  tie_word_embeddingsnum_fused_shared_expertsrV  rS  rA  rl  r)   rM  r5   r.   enable_dp_lm_headr(   rP  sendr   recvr   next
parametersrb   copy_r   logits_processor)rD   r   r8   r9   emb_token_weightrE   rG   rH   rA     sB   

	zStep3p5ForCausalLM.__init__c                 C   s
   | j  S r   )rk  r[  r   rG   rG   rH   r[  N  s   
z'Step3p5ForCausalLM.get_input_embeddingsrX  r   r   r]  r^  c                 C   s:   | j |||||d\}}| jjr| j||| j||dS |S )N)r^  )rb  )rk  rN  rV  ry  rl  )rD   rX  r   r   r]  r^  r   rb  rG   rG   rH   rX   Q  s    	
zStep3p5ForCausalLM.forwardc                 C      | j jS r   )rk  rT  r   rG   rG   rH   rT  m     zStep3p5ForCausalLM.start_layerc                 C   r{  r   )rk  rU  r   rG   rG   rH   rU  q  r|  zStep3p5ForCausalLM.end_layerFweightsc              	   C   s  dt dtt fdd}g d}| jdkr| jdksJ tjdd	d
| jj| j d}t| 	 }t
 }dt dt dtfdd}|D ]\}	}
t| jdrt| jdd}|r|	dr||	}|d ur|sj|| jjkriq?n|dkrrtd| jjdkrzdn| jj}||krq?|D ]1\}}}||	vrqd|	vrd|	v rq|	||}	|	|vrq||	 }|j}|||
| ||	  nd|	vsd|	v r|	|vrq?||	 }t|dt}|||
 ||	 q?d|	v r|	|vrq?||	 }|j}|||
 ||	 q?|D ]M}|\}}}}|| jjkrq||	|sq|dd }|	||d d }|	|d |}||vr,q|| }|j}|||
| |	||d || qq?t
| | }t|dks[J d| d S )Nweight_namer:   c                 S   sV   |  d}t|dkr)|d dkr)|d dkr)zt|d W S  ty(   Y d S w d S )Nrd      r   rk  r   rG  r;   )r   lenr\   r{   )r~  partsrG   rG   rH   _get_layer_id_from_weight_name~  s   
$zGStep3p5ForCausalLM.load_weights.<locals>._get_layer_id_from_weight_name))	.qkv_projz.q_projr   )r  z.k_projr   )r  z.v_projr   ).gate_up_projz
.gate_projr   )r  z.up_projr   r   r   ri  r?   rj  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerm   	name_pathweight_pathc                 S   sD   |  d}| d}t|dkst|dkrdS |d |d k}|S )Nrd      r;   F)r   r  )r  r  
name_partsweight_partsshard_id_matchesrG   rG   rH   match_expert_and_shard_ids  s   

zCStep3p5ForCausalLM.load_weights.<locals>.match_expert_and_shard_idsnum_nextn_predict_layerszmodel.layers.z8Only 1 nextn layer is supported for Step3p5 checkpoints.zgate.r  rz   weight_loaderrd   rI   )shard_id	expert_idz Some parameters are not loaded: )r^   r   r\   rr  r    make_expert_params_mappingr   rx   dictr   setr   rZ  getattr
startswithr  r{   replacer  addr-   r   keysr  )rD   r}  is_nextnr  stacked_params_mappingexpert_params_mappingparams_dictloaded_paramsr  r   loaded_weightnum_nextn_layersra   nextn_layer_id
param_namer~  r  paramr  mappingr  	part_namefake_weight_nameactual_param_nameprint_paramsrG   rG   rH   load_weightsu  s   	

		




 zStep3p5ForCausalLM.load_weightsc                 C   s   | j jj| jjfS r   )rk  rA  r   rl  r   rG   rG   rH   get_embed_and_head  s   z%Step3p5ForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S r   )rk  rA  r   rl  rv   r   empty_cachesynchronize)rD   embedheadrG   rG   rH   set_embed_and_head  s   

z%Step3p5ForCausalLM.set_embed_and_headr   rc  )F)rY   rZ   r[   #default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingr?  r   r%   r^   rA   r   rd  r[  rv   no_gradr   r+   r,   rX   propertyrT  rU  r   r   r  r  r  r_   rG   rG   rE   rH   re  	  sX    1

 re  )\loggingr  typingr   r   r   r   r   r   rv   torch.nn.functionalr   
functionalrO   sglang.srt.distributedr	   r
   r   r   r   #sglang.srt.eplb.expert_distributionr   (sglang.srt.eplb.expert_location_dispatchr   sglang.srt.layers.activationr   sglang.srt.layers.communicatorr   r   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.moer   r   "sglang.srt.layers.moe.ep_moe.layerr   ,sglang.srt.layers.moe.fused_moe_triton.layerr    sglang.srt.layers.moe.topkr!   r"   sglang.srt.layers.moe.utilsr#   r$   *sglang.srt.layers.quantization.base_configr%   !sglang.srt.layers.radix_attentionr&   "sglang.srt.layers.rotary_embeddingr'   sglang.srt.layers.utilsr(   *sglang.srt.layers.vocab_parallel_embeddingr)   r*   ,sglang.srt.model_executor.forward_batch_infor+   r,   $sglang.srt.model_loader.weight_utilsr-   sglang.srt.server_argsr.   sglang.srt.utilsr/   r0   r1   r2   r?  	getLoggerrY   r.  rO  Moduler3   r`   r   r   r@  re  
EntryClassrG   rG   rG   rH   <module>   sX     
, ^ 
 Dt  