o
    پiF                     @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
mZ ddlZddlm  mZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z: ddl;m<Z<m=Z= ddl>m?Z? ddl@mAZA ddlBmCZC ddlDmEZEmFZF ddlGmHZHmIZI ddlJmKZK ddlLmMZMmNZN ddlOmPZP ddlQmRZR dd lSmTZTmUZUmVZVmWZWmXZXmYZY eZe[Z\eW Z]eV Z^eU Z_G d!d" d"ej`ZaG d#d$ d$ej`ZbG d%d& d&ej`ZcG d'd( d(ej`ZdG d)d* d*ej`ZeG d+d, d,ej`ZfefZgdS )-zBInference-only Qwen2MoE model compatible with HuggingFace weights.    N)nullcontext)AnyDictIterableListOptionalTupleUnion)nn)PretrainedConfig)model_forward_maybe_tbo)"get_moe_expert_parallel_world_sizeget_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)ModelConfigForExpertLocation)ExpertLocationDispatchInfo)
SiluAndMul)LayerCommunicatorLayerScatterModesScatterMode)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)get_moe_a2a_backend)get_moe_impl_class)FusedMoE)TopK)RoutingMethodType%filter_moe_weight_param_global_expert)QuantizationConfig)RadixAttention)get_rope)PPMissingLayerget_layer_id)ParallelLMHeadVocabParallelEmbedding)get_is_capture_mode)ForwardBatchPPProxyTensors)default_weight_loader)get_global_server_args)
add_prefixcpu_has_amx_supportis_cpuis_cudamake_layersuse_intel_amx_backendc                       sr   e Zd Z					ddedededee ded	ed
ee dee ddf fddZ		ddedefddZ	  Z
S )Qwen2MoeMLPNT hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixtp_ranktp_sizereturnc	           	   
      sr   t    t||gd d|td|||d| _t||d||td|||d| _|dkr3td| d	t | _	d S )
N   Fgate_up_proj)biasr>   r@   rA   rB   	down_proj)rF   r>   r?   r@   rA   rB   siluzUnsupported activation: z!. Only silu is supported for now.)
super__init__r   r3   rE   r   rG   
ValueErrorr   act_fn)	selfr;   r<   r=   r>   r?   r@   rA   rB   	__class__ O/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/qwen2_moe.pyrJ   ^   s2   
	

zQwen2MoeMLP.__init__Fshould_allreduce_fusionuse_reduce_scatterc                 C   s2   |  |\}}| |}| j||p|d\}}|S )N)skip_all_reduce)rE   rL   rG   )rM   xrR   rS   gate_up_rP   rP   rQ   forward   s   

zQwen2MoeMLP.forward)NTr:   NN)FF)__name__
__module____qualname__intstrr   r'   boolrJ   rX   __classcell__rP   rP   rN   rQ   r9   ]   s@    	
(r9   c                       s   e Zd Z			ddededee deejj	 de
f
 fdd	Zd
d ZdejfddZdejdefddZdejfddZdejdejfddZ		ddejdee dedejfddZ  ZS )Qwen2MoeSparseMoeBlockNr:   layer_idconfigr>   
alt_streamr@   c              
      sn  t    t | _|| _|| _| j|jkr"td| j d|j dt|j	|j
|d| _t|| j|j	|jt j |j|j|td|tjd| _t|j|jdd td|d	| _|jd
kr}td|j|j|j|dtd|dt  rwtd
ddni | _nd | _trtrt|jddd td|d	| _ nt!j"j#|jddd| _ t  rt$ | _%|jt j | _|j	| _&d S d S )NzTensor parallel size z' is greater than the number of experts .)top_krenormalizera   experts)ra   re   num_expertsr;   r<   r>   r@   routing_method_typeFgate)rF   r>   r@   r   shared_expert)r;   r<   r=   r>   r?   r@      )rA   rB   shared_expert_gate)rF   rP   )'rI   rJ   r   rB   ra   rc   rh   rK   r$   num_experts_per_toknorm_topk_probtopkr"   r2   ep_num_redundant_expertsr;   moe_intermediate_sizer3   r%   RenormalizeNaiverg   r   rj   shared_expert_intermediate_sizer9   r=   r!   	is_deepepdictrk   _is_cpu_is_cpu_amx_availablerm   torchr
   Linearr   ep_sizere   )rM   ra   rb   r>   rc   r@   rN   rP   rQ   rJ      s~   


	


zQwen2MoeSparseMoeBlock.__init__c                    s    fdd j  D S )Nc                    s.   g | ]\}}|d vrt || jjr|jqS ))correction_bias)r&   rg   num_local_expertsdata).0namerU   rM   rP   rQ   
<listcomp>   s    
z:Qwen2MoeSparseMoeBlock.get_moe_weights.<locals>.<listcomp>)rg   named_parametersr   rP   r   rQ   get_moe_weights   s   
z&Qwen2MoeSparseMoeBlock.get_moe_weightshidden_statesc                 C   sf   d }| j d ur1|  |}| jd ur1t| jr'tjj|| jj| jjd|}|S t	
| || }|S )NT)rk   rm   r8   ry   ops
sgl_kernelfused_linear_sigmoid_mulweightrF   Fsigmoid)rM   r   shared_outputrP   rP   rQ   _forward_shared_experts   s$   



z.Qwen2MoeSparseMoeBlock._forward_shared_expertsforward_batchc                 C   s|   d }|j d dkr%| |\}}| |}| j|||jtj| jdd}n| j|j	}| j
||d}|d ur<|| |S )Nr   )ra   )num_token_non_paddedexpert_location_dispatch_info)r   topk_output)shaperj   r   rp   r   r   init_newra   empty_topk_outputdevicerg   add_)rM   r   r   r   router_logitsrW   r   final_hidden_statesrP   rP   rQ   _forward_deepep   s(   
	
z&Qwen2MoeSparseMoeBlock._forward_deepepc                 C   s&   |  |\}}| ||}| ||S N)rj   rp   rg   )rM   r   r   rW   r   rP   rP   rQ   _forward_router_experts  s   z.Qwen2MoeSparseMoeBlock._forward_router_expertsrC   c                 C   sp   t j }| j| | | }t j| j | |}W d    n1 s)w   Y  || j ||fS r   )	ry   cudacurrent_streamrc   wait_streamr   clonestreamr   )rM   r   r   r   router_outputrP   rP   rQ   forward_normal_dual_stream  s   
z1Qwen2MoeSparseMoeBlock.forward_normal_dual_streamFrS   c                 C   s   |j \}}|d|}t  r| ||S | jd ur-|j d dkr-t r-| |\}}n
| |}| 	|}|d ur?||7 }| j
dkrJ|sJt|}|||S )Nr   rl   )r   viewr!   ru   r   rc   r.   r   r   r   rB   r   )rM   r   r   rS   
num_tokens
hidden_dimr   r   rP   rP   rQ   rX   .  s$   





zQwen2MoeSparseMoeBlock.forward)NNr:   )NF)rY   rZ   r[   r\   r   r   r'   ry   r   Streamr]   rJ   r   Tensorr   r/   r   r   r   r^   rX   r_   rP   rP   rN   rQ   r`      sF    
N

r`   c                       s   e Zd Z								ddeded	ed
ededeeeef  dededee	 dee
eef  deddf fddZdejdejdedejfddZ  ZS )Qwen2MoeAttentionr   '  N    Tr:   r;   	num_headsnum_kv_headsra   
rope_thetarope_scalingmax_position_embeddingsqkv_biasr>   dual_chunk_attention_configr@   rC   c                    s\  t    || _t }t }|| _| j| dksJ | j| | _|| _| j|kr2| j| dks1J n	|| j dks;J td| j| | _	|| j | _
| j| j
 | _| j	| j
 | _| j
d | _|| _|| _t|| j
| j| j||	||td|d	| _t| j| j
 |d|	||dtd|d| _t| j
| j
||||
d	| _t| j| j
| j| j	||	td
|d| _d S )Nr   rl   g      qkv_proj)rF   r>   rA   rB   r@   Fo_proj)rF   r>   rA   rB   r?   r@   )
rotary_dimmax_positionbaser   r   attn)r   ra   r>   r@   )rI   rJ   r;   r   r   total_num_headsr   total_num_kv_headsmaxr   head_dimq_sizekv_sizescalingr   r   r   r3   r   r   r   r)   
rotary_embr(   r   )rM   r;   r   r   ra   r   r   r   r   r>   r   r@   attn_tp_rankattn_tp_sizerN   rP   rQ   rJ   S  sp   


zQwen2MoeAttention.__init__	positionsr   r   c                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )Nr   )dim)r   splitr   r   r   r   r   )rM   r   r   r   qkvrW   qkvattn_outputoutputrP   rP   rQ   rX     s    zQwen2MoeAttention.forward)r   r   Nr   TNNr:   )rY   rZ   r[   r\   floatr   r   r]   r   r'   rv   rJ   ry   r   r/   rX   r_   rP   rP   rN   rQ   r   R  sV    	
Qr   c                       s   e Zd Z			ddededee dedeej	j
 ddf fd	d
Z	ddejdejdedeej deeej  deejejf fddZ  ZS )Qwen2MoeDecoderLayerNr:   rb   ra   r>   r@   rc   rC   c                    s@  t    || _|j| _t|dd}t|dd }t|dd}t|dd}	t|dd }
t| j|j|j||||||
|	td	|d
| _	|| _
t | _t | _d| _d}d}tj||j| j||d| _| jrpt||||td|d| _nt|j|j|j|td|d| _t|j|jd| _t|j|jd| _t| j| j| jdd| _d S )Nr   r   r   r   r   r   Tr   	self_attn)r;   r   r   ra   r   r   r   r>   r   r   r@   )ra   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparsemlp)ra   rb   r>   rc   r@   )r;   r<   r=   r>   r@   eps)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatter)rI   rJ   rb   r;   getattrr   num_attention_headsnum_key_value_headsr3   r   ra   r   r   r   r   r   r   r   num_hidden_layersr   r`   r   r9   r<   r=   r   rms_norm_epsr   r   r   layer_communicator)rM   rb   ra   r>   r@   rc   r   r   r   r   r   r   r   rN   rP   rQ   rJ     sz   

zQwen2MoeDecoderLayer.__init__r   r   r   residualcaptured_last_layer_outputsc                 K   s   | j j|||fd|i|\}}|jd dkr| j|||d}| j |||\}}| j |}| |||}| j |||\}}||fS )Nr   r   )r   r   r   )r   +prepare_attn_and_capture_last_layer_outputsr   r   prepare_mlpshould_use_reduce_scatterr   postprocess_layer)rM   r   r   r   r   r   kwargsrS   rP   rP   rQ   rX      s6   
zQwen2MoeDecoderLayer.forward)Nr:   Nr   )rY   rZ   r[   r   r\   r   r'   r]   ry   r   r   rJ   r   r/   r   r   rX   r_   rP   rP   rN   rQ   r     s>    
Sr   c                       s   e Zd Zddedfdedee dedee	j
 deejj ddf fd	d
Zdee fddZ		ddejdejdedejdee deejef fddZ  ZS )Qwen2MoeModelNr:   rb   r>   r@   decoder_layer_typerc   rC   c                    s   t    | _j| _j| _t | _| jjr(t	jj
t td|d| _nt | _p/ttj fdd| jj| jjtd|d\| _| _| _| jjr\tj
jd| _ntdd	| _g | _d S )
Nembed_tokens)use_attn_tp_groupr@   c                    s   | | dS )N)ra   rb   r>   r@   rc   rP   )idxr@   rc   rb   r   r>   rP   rQ   <lambda>K  s    z(Qwen2MoeModel.__init__.<locals>.<lambda>layers)pp_rankpp_sizer@   r   T)return_tuple)rI   rJ   rb   pad_token_idpadding_idx
vocab_sizer   pp_groupis_first_rankr-   r;   r   r3   r   r*   r   r7   r   rank_in_group
world_sizer   start_layer	end_layeris_last_rankr   r   normlayers_to_capture)rM   rb   r>   r@   r   rc   rN   r   rQ   rJ   .  s2   


zQwen2MoeModel.__init__r   c                 C   s(   || _ | j D ]}t| j| dd qd S )N_is_layer_to_captureT)r   setattrr   )rM   r   ra   rP   rP   rQ   set_eagle3_layers_to_capture^  s   
z*Qwen2MoeModel.set_eagle3_layers_to_capture	input_idsr   r   input_embedspp_proxy_tensorsc                 C   sP  | j jr|d u r| |}n|}d }n|d usJ |d }|d }g }|jr7t| jdt ||||d\}}nAt| j	| j
D ]9}	t jrGt nt |	}
|
  | j|	 }|||||t|ddrb|nd d\}}W d    n1 srw   Y  q>| j jst||dS |jd	 d	kr|d u r| |}n| ||\}}t|d	kr|S ||fS )
Nr   r   T)r   
enable_tboinput_data_scatter_moder   r   r   r   r   F)r   )r   r   r   )r   r   r   can_run_tbor   r   r   model_input_outputranger   r   r2   enable_piecewise_cuda_graphr   r   with_current_layerr   r   r0   r   r   len)rM   r  r   r   r  r  r   r   aux_hidden_statesictxlayerrW   rP   rP   rQ   rX   c  sd   



zQwen2MoeModel.forwardNN)rY   rZ   r[   r   r   r   r'   r]   typer
   Modulery   r   r   rJ   r   r\   r   r   r/   r0   r	   rX   r_   rP   rP   rN   rQ   r   -  sD    
0
r   c                       s  e Zd ZdZ		d dedee deddf fdd	Ze	
 		d!d
e	jde	jdede	jdee de	jfddZe	
 	d"d
e	jde	jdedeeef de	jf
ddZedd Zedd Zdeeee	jf  fddZedd Zd"deee  fddZ  ZS )#Qwen2MoeForCausalLMFNr:   rb   r>   r@   rC   c                    s~   t    t | _|| _|| _trtj	 nd }t
||td||d| _t|j|j|td|t jd| _t|| _d| _d S )Nmodel)r@   rc   lm_head)r>   r@   r   F)rI   rJ   r   r   rb   r>   _is_cudary   r   r   r   r3   r  r,   r   r;   r2   enable_dp_lm_headr  r    logits_processorcapture_aux_hidden_states)rM   rb   r>   r@   rc   rN   rP   rQ   rJ     s(   


zQwen2MoeForCausalLM.__init__r  r   r   r  r  c                 C   sF   | j |||||d}d }| jr|\}}| jjr!| ||| j||S |S )N)r  )r  r  r   r   r  r  )rM   r  r   r   r  r  r   r  rP   rP   rQ   rX     s   	zQwen2MoeForCausalLM.forwardsplit_intervalc              	   C   s   |\}}|dkr|d u r| j ||_n||_t||D ]+}t | | j j| }	|	||j||j\|_|_W d    n1 sBw   Y  q|| j jj	kri| j 
|j|j\}
}|
|_| ||j| j|}|S d }|S )Nr   )r  r   r   r  r   r
  r   r   rb   r   r   r  r  )rM   r  r   r   r  r  startendr  r  r   rW   resultrP   rP   rQ   forward_split_prefill  s6   		z)Qwen2MoeForCausalLM.forward_split_prefillc                 C      | j jS r   )r  r   r   rP   rP   rQ   r        zQwen2MoeForCausalLM.start_layerc                 C   r  r   )r  r   r   rP   rP   rQ   r     r   zQwen2MoeForCausalLM.end_layerweightsc              	   C   s  g d}t jddd| jjd}t|  }|D ]\}}t|}|d ur6t| jdr6|| jj	k s5|| jj
kr6qd|v r;q|D ]2\}}	}
|	|vrGq=d|v rLq=||	|}|d	r\||vr\q=||vraq=|| }|j}||||
  nU|D ]$}|\}}	}}
|	|vrqr||	|}|| }|j}|||||
|d
  n.|d	r||vrq||vrq|| v r|| }t|dt}||| qtd| d qd S )N))r   q_projr   )r   k_projr   )r   v_projr   )rE   	gate_projr   )rE   up_projrl   r%  rG   r&  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerh   r   zrotary_emb.inv_freqzmlp.expertsz.bias)shard_id	expert_idweight_loaderz
Parameter z not found in params_dict)r#   make_expert_params_mappingrb   rh   rv   r   r+   hasattrr  r   r   replaceendswithr,  keysr   r1   loggerwarning)rM   r!  stacked_params_mappingexpert_params_mappingparams_dictr   loaded_weightra   
param_nameweight_namer*  paramr,  mappingr+  rP   rP   rQ   load_weights  sx   	
z Qwen2MoeForCausalLM.load_weightsc                 C   s   t |j|jd dS )N)r   num_logical_experts
num_groups)r   r   rh   )clsrb   rP   rP   rQ   $get_model_config_for_expert_locationk  s
   z8Qwen2MoeForCausalLM.get_model_config_for_expert_location	layer_idsc                 C   sZ   | j jsd S d| _|d u r | jj}| jd|d |d g d S | jdd |D  d S )NTrD      c                 S   s   g | ]}|d  qS )rl   rP   )r   valrP   rP   rQ   r     s    zDQwen2MoeForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)r   r   r  rb   r   r  r   )rM   rA  r   rP   rP   rQ   r   s  s   z0Qwen2MoeForCausalLM.set_eagle3_layers_to_capture)Nr:   r  r   )rY   rZ   r[   fall_back_to_pt_during_loadr   r   r'   r]   rJ   ry   no_gradr   r/   r0   rX   r   r\   r  propertyr   r   r   r<  classmethodr@  r   r   r_   rP   rP   rN   rQ   r    sd    
*

V
 r  )h__doc__logging
contextlibr   typingr   r   r   r   r   r   r	   ry   torch.nn.functionalr
   
functionalr   transformersr   *sglang.srt.batch_overlap.two_batch_overlapr   sglang.srt.distributedr   r   r   r   #sglang.srt.eplb.expert_distributionr   sglang.srt.eplb.expert_locationr   (sglang.srt.eplb.expert_location_dispatchr   sglang.srt.layers.activationr   sglang.srt.layers.communicatorr   r   r   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   "sglang.srt.layers.logits_processorr    sglang.srt.layers.moer!   "sglang.srt.layers.moe.ep_moe.layerr"   &sglang.srt.layers.moe.fused_moe_tritonr#   sglang.srt.layers.moe.topkr$   sglang.srt.layers.moe.utilsr%   r&   *sglang.srt.layers.quantization.base_configr'   !sglang.srt.layers.radix_attentionr(   "sglang.srt.layers.rotary_embeddingr)   sglang.srt.layers.utilsr*   r+   *sglang.srt.layers.vocab_parallel_embeddingr,   r-   +sglang.srt.model_executor.cuda_graph_runnerr.   ,sglang.srt.model_executor.forward_batch_infor/   r0   $sglang.srt.model_loader.weight_utilsr1   sglang.srt.server_argsr2   sglang.srt.utilsr3   r4   r5   r6   r7   r8   	getLoggerrY   r2  r  rw   rx   r  r9   r`   r   r   r   r  
EntryClassrP   rP   rP   rQ   <module>   s^   $ 
	4 B`{| ]