o
    
۾iuz                     @   s  d Z ddlZddlmZmZ ddlmZ ddlmZ ddlZddl	m
  mZ ddlm
Z
 ddlmZ ddlmZmZmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC eeDZEG dd de
jFZGG dd de
jFZHG dd de
jFZIG d d! d!e
jFZJeG d"d# d#e
jFZKG d$d% d%e
jFe;e:e9e8ZLdS )&zBInference-only Qwen3MoE model compatible with HuggingFace weights.    N)CallableIterable)islice)Any)nn)support_torch_compile)CacheConfig
VllmConfigget_current_vllm_config)get_ep_groupget_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)init_logger)
SiluAndMul)	Attention)SharedFusedMoE)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)sequence_parallel_chunk)IntermediateTensors   )MixtureOfExpertsSupportsEagle3SupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayerextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       s\   e Zd Z				ddededededB ded	ejj	dB d
eddf fddZ
dd Z  ZS )Qwen3MoeMLPNT hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsexpert_gateprefixreturnc                    sp   t    t||gd d|| dd| _t||d||| dd| _|dkr/td| d	t | _|| _	d S )
N   Fz.gate_up_projbiasr2   r5   z
.down_proj)r9   r2   r3   r5   siluzUnsupported activation: z!. Only silu is supported for now.)
super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fnr4   )selfr/   r0   r1   r2   r3   r4   r5   	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_moe.pyr<   S   s,   



zQwen3MoeMLP.__init__c                 C   sL   |  |\}}| |}| |\}}| jd ur$t| |d | }|S )Nr   )r=   r@   r>   r4   Fsigmoid)rA   xgate_up_outrD   rD   rE   forwardt   s   

zQwen3MoeMLP.forward)NTNr.   )__name__
__module____qualname__intstrr   booltorchr   Linearr<   rL   __classcell__rD   rD   rB   rE   r-   R   s.    
	!r-   c                       s@   e Zd Z	d
dedef fddZdejdejfdd	Z  Z	S )Qwen3MoeSparseMoeBlockr.   vllm_configr5   c                    s  t    |jj}|j}|j}t | _t j	| _
t j| _| j
 | _|j| _|j| _| j|jkr>td| j d|j dt }|jj}|j| _| j| _|j| _| j| j | _| j| j | _| j| j | _| j| j | _t|j|jd|| dd| _ t!|dd}|dkrt|jd	dd | d
d| _"t#|j||j$|d| j"| dd| _%nd | _"d | _%t&| j%| j | j|j'|j|j(d|j)|| d| j| j| jd| _*d S )NzTensor parallel size z' is greater than the number of experts .Fz.gater8   shared_expert_intermediate_sizer   r!   z.shared_expert_gatez.shared_expert)r/   r0   r1   r2   r3   r4   r5   z.experts)shared_expertsgatenum_expertstop_kr/   r0   r3   renormalizer2   r5   enable_eplbnum_redundant_expertsis_sequence_parallel)+r;   r<   model_confighf_text_configparallel_configr2   r   tp_sizer   device_groupep_grouprank_in_groupep_ranksizeep_sizer\   n_routed_expertsuse_sequence_parallel_moera   r?   r
   eplb_configr_   n_logical_expertsr`   n_redundant_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr   r/   r[   getattrshared_expert_gater-   r1   shared_expertr   num_experts_per_tokmoe_intermediate_sizenorm_topk_probexperts)rA   rW   r5   configrd   r2   rn   rY   rB   rD   rE   r<      s   






zQwen3MoeSparseMoeBlock.__init__hidden_statesr6   c           
      C   s   |  dks
J d|  dk}|j\}}|d|}| jr"t|}| |\}}| j||d\}}|d ur:|| n|}	| jrKt|	d}	|	d | }	n| jdkrV| j	|	}	|r]|	
dS |	S )Nr7   z4Qwen3MoeSparseMoeBlock only supports 1D or 2D inputsr!   )r}   router_logitsr   )dimshapeviewra   r   r[   r{   r   re   &maybe_all_reduce_tensor_model_parallelsqueeze)
rA   r}   is_input_1d
num_tokens
hidden_dimr   rJ   
shared_out	fused_outfinal_hidden_statesrD   rD   rE   rL      s0   


zQwen3MoeSparseMoeBlock.forwardr.   )
rM   rN   rO   r	   rQ   r<   rS   TensorrL   rU   rD   rD   rB   rE   rV      s    YrV   c                       s   e Zd Z								ddededed	eeef d
ededB dedede	dB de
dB dedeeef dB ddf fddZdejdejdejfddZ  ZS )Qwen3MoeAttention    Nư>Fr.   r/   	num_headsnum_kv_headsrope_parametersmax_position_embeddingshead_dimrms_norm_epsqkv_biascache_configr2   r5   dual_chunk_attention_configr6   c              	      s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _|pG|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _|| _t|| j	| j| j||
| dd| _t| j| j	 |d|
| dd| _t| j	|||d| _t| j| j	| jf| j|	|
| d	d
|rt||dni | _t| j	|d| _t| j	|d| _d S )Nr   r!   g      z	.qkv_projr8   Fz.o_proj)max_positionr   r   z.attn)r   r   r2   r5   )	layer_idxr   eps)r;   r<   r/   r   total_num_headsr   total_num_kv_headsmaxr   r   q_sizekv_sizescalingr   r   r   qkv_projr   o_projr   
rotary_embr   r(   attnr   q_normk_norm)rA   r/   r   r   r   r   r   r   r   r   r2   r5   r   re   rB   rD   rE   r<      st   



zQwen3MoeAttention.__init__	positionsr}   c                 C   s   |  |\}}|j| j| j| jgdd\}}}|jg |jd d |jd | j | jR  }| |}||j}|jg |jd d |jd | j | jR  }	| |	}	|	|j}| 	|||\}}| 
|||}
| |
\}}|S )Nr~   )r   )r   splitr   r   r   r   r   r   r   r   r   r   )rA   r   r}   qkvrJ   qkv	q_by_head	k_by_headattn_outputoutputrD   rD   rE   rL   M  s    0
0
zQwen3MoeAttention.forward)r   Nr   FNNr.   N)rM   rN   rO   rP   dictrQ   r   floatrR   r   r   r<   rS   r   rL   rU   rD   rD   rB   rE   r      sV    
	
Qr   c                
       s\   e Zd Zddededdf fddZdejd	ejd
ejdB deejejf fddZ	  Z
S )Qwen3MoeDecoderLayerr.   rW   r5   r6   Nc           
         s  t    |jj}|j}|j}|j| _t|dd}t|dd }t| j|j	|j
|j||jt|ddt|dd ||| d|d| _t|}t|d	sKg n|j}	||	vrk|jd
krk|d |j d
krkt|| dd| _nt|j|j|j|| dd| _t|j|jd| _t|j|jd| _d S )Nr   r   r   attention_biasFr   z
.self_attn)r/   r   r   r   r   r   r   r   r   r2   r5   r   mlp_only_layersr   r!   z.mlprW   r5   )r/   r0   r1   r2   r5   r   )r;   r<   rb   rc   r   r2   r/   ru   r   num_attention_headsnum_key_value_headsr   r   	self_attnr(   hasattrr   r\   decoder_sparse_steprV   mlpr-   r0   r1   r   input_layernormpost_attention_layernorm)
rA   rW   r5   r|   r   r2   r   r   r   r   rB   rD   rE   r<   c  sT   




zQwen3MoeDecoderLayer.__init__r   r}   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)r   r}   )r   r   r   r   )rA   r   r}   r   rD   rD   rE   rL     s   
zQwen3MoeDecoderLayer.forwardr   )rM   rN   rO   r	   rQ   r<   rS   r   tuplerL   rU   rD   rD   rB   rE   r   b  s    3r   c                       s   e Zd Zdeddededeejj	 f fddZ
dejd	ejfd
dZ		ddejdB dejdedB dejdB d	ejeB eejeej f B f
ddZd	eeeeeef  fddZdeeeejf  d	ee fddZ  ZS )Qwen3MoeModelr.   )r5   decoder_layer_typerW   r5   r   c                   s   t    jj}j}j}|j}|j| _|j| _	|j
| _
|| _|| _t|j
|j|| dd| _t|j fdd| dd\| _| _| _t|j|jd| _tdd	g|j| _d
| _d S )Nz.embed_tokensr2   r5   c                    s    | dS )Nr   rD   r5   r   rW   rD   rE   <lambda>  s    z(Qwen3MoeModel.__init__.<locals>.<lambda>z.layersr   r   r}   r   rD   )r;   r<   rb   rc   r2   rd   rn   r`   pad_token_idpadding_idx
vocab_sizer|   r   r/   embed_tokensr+   num_hidden_layersstart_layer	end_layerlayersr   r   normr*   make_empty_intermediate_tensorsaux_hidden_state_layers)rA   rW   r5   r   r|   r2   rd   rn   rB   r   rE   r<     s4   


zQwen3MoeModel.__init__	input_idsr6   c                 C   s
   |  |S N)r   rA   r   rD   rD   rE   embed_input_ids     
zQwen3MoeModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc                 C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }g }tt| j| j| j| jdD ] \}}	|| jv rI|d urB|| n|}
|	|
 |	|||\}}q1t  j
s]t||dS | ||\}}t|dkro||fS |S )Nr}   r   )start)r}   r   r   )r   is_first_rankr   	enumerater   r   r   r   r   appendis_last_rankr    r   len)rA   r   r   r   r   r}   r   aux_hidden_statesr   layeraux_hidden_staterJ   rD   rD   rE   rL     s4   


zQwen3MoeModel.forwardc                 C   s   t j| ddd| jj| jdS )N	gate_projr>   up_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer\   r`   )r   make_expert_params_mappingr|   r\   r`   rA   rD   rD   rE   get_expert_mapping  s   z Qwen3MoeModel.get_expert_mappingweightsc              
   C   sl  g d}d}t |  }t }|  }|D ]\}}| jd urP| j| }	rP||	 }
t|
dt}| dksAJ d|  d|	 }||
| |
|	 q|D ]T\}}}||vr\qRd|v raqR|||}||rq||vrqqRt|| rwqR|drt||}|d u rqR||vrqR|| }
t|
dt}|tkr||
| n||
||  nd	}|D ]D}|\}}}}||vrqd
}|||}t|| rq||r||vrq|| }
ttdtf |
j}||
||||d
d}|r|} n?q|rq||r||vrqt|| rq|dr|dd}||vrtd|| q|}|| }
t|
dt}||
| |
| q|S )N))r   q_projr   )r   k_projr   )r   v_projr   )r=   r   r   )r=   r   r!   )
z.bias_biasz.k_scale_k_scalez.v_scale_v_scalez.weight_scale_weight_scalez.input_scale_input_scaleweight_loaderr!   zKV scale numel z != 1zmlp.expertsscaleFT.)shard_id	expert_idreturn_successkv_scalez	.kv_scalez.attn.kv_scalez{Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.)r   named_parameterssetr   r2   get_cache_scaleru   r   numelr   addreplaceendswithr)   r   typingcastr   rR   r   loggerwarning_once)rA   r   stacked_params_mappingignore_suffixesparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
scale_nameparamr   
param_nameweight_namer   is_expert_weightmappingr   name_mappedsuccessremapped_kv_scale_namerD   rD   rE   load_weights  s   









zQwen3MoeModel.load_weightsNN)rM   rN   rO   r   r	   rQ   typerS   r   Moduler<   r   r   r    r   listrL   rP   r   r   r   r  rU   rD   rD   rB   rE   r     s4    
%
*,r   c                       s:  e Zd Zdg diZdddZdZddd	ed
ef fddZde	de	ddfddZ
dee	df ddfddZdee	df fddZdejdejfddZ		d)dejdB dejdedB dejdB dejeB f
dd Zd!ejdejdB fd"d#Zd$eeeejf  dee fd%d&Zdeeeee	ef  fd'd(Z  ZS )*Qwen3MoeForCausalLMr   )r   r   r   input_embeddingsoutput_embeddings)r   lm_headFr.   r   rW   r5   c                   sH  t    |jj}|j}|| _|| _t|dg rddg| jd< t|t	|dd| _
t|j|j|t	|dd| _| jjrB| j
jj| j_t|j| _| j
j| _g | _g | _d }| j
jD ] }t|traqYt|tshJ t|jtry|j}| j|jj qY|d u rtd	t | j| _!d
| _"d| _#|j$| _%|j&| _'|j(| _)|j*| _+|j,| _-d S )Nr   r   r   r=   modelr   r  r   z,No Qwen3MoE layer found in the model.layers.r!   r   ).r;   r<   rb   rc   r2   r|   ru   packed_modules_mappingr   r,   r  r   r   r/   r  tie_word_embeddingsr   weightr   logits_processorr   expert_weights
moe_layersr   
isinstancer'   r   r   rV   r   r{   RuntimeErrorr   num_moe_layersnum_expert_groupsnum_shared_expertsro   num_logical_expertsrq   num_physical_expertsrr   num_local_physical_expertsrl   num_routed_expertsrp   r`   )rA   rW   r5   r|   r2   example_layerr   rB   rD   rE   r<     sT   


zQwen3MoeForCausalLM.__init__r,  r-  r6   Nc                 C   sh   | j |ksJ || _|| _ || j | _| jjD ]}t|jtr1|j}||_	||_
| j|_|j  qd S r   )r-  r,  r+  r`   r  r   r&  r   rV   rr   rq   rp   r{   update_expert_map)rA   r,  r-  r   moerD   rD   rE    update_physical_experts_metadata  s   
z4Qwen3MoeForCausalLM.update_physical_experts_metadatar   .c                 C   s   || j _d S r   )r  r   )rA   r   rD   rD   rE   set_aux_hidden_state_layers     z/Qwen3MoeForCausalLM.set_aux_hidden_state_layersc                 C   s   t | jj}d|d |d fS )Nr7      )r   r  r   )rA   
num_layersrD   rD   rE   "get_eagle3_aux_hidden_state_layers  s   z6Qwen3MoeForCausalLM.get_eagle3_aux_hidden_state_layersr   c                 C   s   | j |S r   )r  r   r   rD   rD   rE   r     r4  z#Qwen3MoeForCausalLM.embed_input_idsr   r   r   c                 C   s   |  ||||}|S r   )r  )rA   r   r   r   r   r}   rD   rD   rE   rL   	  s   zQwen3MoeForCausalLM.forwardr}   c                 C   s   |  | j|}|S r   )r#  r  )rA   r}   logitsrD   rD   rE   compute_logits  s   z"Qwen3MoeForCausalLM.compute_logitsr   c                 C   s   t | }||S r   )r&   r  )rA   r   loaderrD   rD   rE   r    s   
z Qwen3MoeForCausalLM.load_weightsc                 C   s
   | j  S r   )r  r   r   rD   rD   rE   r      r   z&Qwen3MoeForCausalLM.get_expert_mappingr  )rM   rN   rO   r   embedding_modulesfall_back_to_pt_during_loadr	   rQ   r<   rP   r2  r   r3  r7  rS   r   r   r    rL   r9  r   r   r  r  r   rU   rD   rD   rB   rE   r    sL    	3


$&r  )M__doc__r  collections.abcr   r   	itertoolsr   r   rS   torch.nn.functionalr   
functionalrF   vllm.compilation.decoratorsr   vllm.configr   r	   r
   vllm.distributedr   r   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r    vllm.model_executor.models.utilsr   vllm.sequencer    
interfacesr"   r#   r$   r%   utilsr&   r'   r(   r)   r*   r+   r,   rM   r  r  r-   rV   r   r   r   r  rD   rD   rD   rE   <module>   sJ   $
-|gK 
|