o
    
۾iZ                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlZddlm	  m
Z ddlm	Z	 ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z; ee<Z=G dd de	j>Z?G dd de	j>Z@G dd de	j>ZAG d d! d!e	j>ZBeG d"d# d#e	j>ZCG d$d% d%e	j>e4e3ZDdS )&zBInference-only Qwen2MoE model compatible with HuggingFace weights.    )Iterable)islice)AnyN)nn)Qwen2MoeConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)init_logger)
SiluAndMul)	Attention)SharedFusedMoE)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       s\   e Zd Z				ddededededB ded	ejj	dB d
eddf fddZ
dd Z  ZS )Qwen2MoeMLPNT hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsexpert_gateprefixreturnc                    sp   t    t||gd d|| dd| _t||d||| dd| _|dkr/td| d	t | _|| _	d S )
N   Fz.gate_up_projbiasr*   r-   z
.down_proj)r1   r*   r+   r-   siluzUnsupported activation: z!. Only silu is supported for now.)
super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fnr,   )selfr'   r(   r)   r*   r+   r,   r-   	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/qwen2_moe.pyr4   K   s,   



zQwen2MoeMLP.__init__c                 C   sL   |  |\}}| |}| |\}}| jd ur$t| |d | }|S )Nr   )r5   r8   r6   r,   Fsigmoid)r9   xgate_up_outr<   r<   r=   forwardl   s   

zQwen2MoeMLP.forward)NTNr&   )__name__
__module____qualname__intstrr   booltorchr   Linearr4   rD   __classcell__r<   r<   r:   r=   r%   J   s.    
	!r%   c                       sJ   e Zd Z		ddededB def fddZdejd	ejfd
dZ	  Z
S )Qwen2MoeSparseMoeBlockNr&   configr*   r-   c                    s   t    t | _| j|jkrtd| j d|j dt|j|jdd | dd| _t|jddd | dd| _	|j
d	krRt|j|j
|j|d| j	| d
d| _nd | _t| j|j|j|j|jd|j|| dd	| _d S )NzTensor parallel size z' is greater than the number of experts .Fz.gater0   r   z.shared_expert_gater   z.shared_expert)r'   r(   r)   r*   r+   r,   r-   z.experts)	shared_expertsnum_expertstop_kr'   r(   r+   renormalizer*   r-   )r3   r4   r   tp_sizerR   r7   r   r'   gateshared_expert_gateshared_expert_intermediate_sizer%   r)   shared_expertr   num_experts_per_tokmoe_intermediate_sizenorm_topk_probexperts)r9   rO   r*   r-   r:   r<   r=   r4   x   sX   




zQwen2MoeSparseMoeBlock.__init__hidden_statesr.   c                 C   sr   |j }|j d }|d|}| |\}}| j||d}| jd ur)|d |d  }| jdkr4| j|}||S )N)r^   router_logitsr   r   )shapeviewrV   r]   rY   rU   &maybe_all_reduce_tensor_model_parallel)r9   r^   
orig_shape
hidden_dimr`   rB   final_hidden_statesr<   r<   r=   rD      s   



zQwen2MoeSparseMoeBlock.forward)Nr&   )rE   rF   rG   r   r   rI   r4   rK   TensorrD   rM   r<   r<   r:   r=   rN   w   s    8rN   c                       s   e Zd Z						ddedededeeef dB ded	edB d
edB dedeeef dB ddf fddZ	de
jde
jde
jfddZ  ZS )Qwen2MoeAttentionN    r&   r'   	num_headsnum_kv_headsrope_parametersmax_position_embeddingscache_configr*   r-   dual_chunk_attention_configr.   c
              	      s`  t    || _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr/| j|
 dks.J n	|
| j dks8J td| j|
 | _|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _|	| _t|| j	| j| jd|| dd| _t| j| j	 |d|| dd| _t| j	|||	d	| _t| j| j	| jf| j||| d
d|	rt||	dni | _d S )Nr   r   g      Tz	.qkv_projr0   Fz.o_proj)max_positionrl   ro   z.attn)rk   rn   r*   r-   )	layer_idxro   )r3   r4   r'   r   total_num_headsrj   total_num_kv_headsmaxrk   head_dimq_sizekv_sizescalingrm   ro   r   qkv_projr   o_projr   
rotary_embr   r    attn)r9   r'   rj   rk   rl   rm   rn   r*   r-   ro   rU   r:   r<   r=   r4      sp   



zQwen2MoeAttention.__init__	positionsr^   c           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )Nr_   )dim)ry   splitrv   rw   r{   r|   rz   )
r9   r}   r^   qkvrB   qkvattn_outputoutputr<   r<   r=   rD     s    zQwen2MoeAttention.forward)Nri   NNr&   N)rE   rF   rG   rH   dictrI   r   r   r   r4   rK   rg   rD   rM   r<   r<   r:   r=   rh      sF    	
Krh   c                       sh   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	dB dej	fddZ
  ZS )Qwen2MoeDecoderLayerNr&   rO   rn   r*   r-   r.   c           	         s   t    |j| _t|dd }t|dd}t| j|j|j|j|||| d|d	| _t	|}t
|ds5g n|j}||vrV|jdkrV|d |j dkrVt||| d	d
| _nt|j|j|j|| d	d| _t|j|jd| _t|j|jd| _d S )Nro   rm   ri   z
.self_attn)	r'   rj   rk   rl   rm   rn   r*   r-   ro   mlp_only_layersr   r   z.mlp)rO   r*   r-   )r'   r(   r)   r*   r-   eps)r3   r4   r'   getattrrh   num_attention_headsnum_key_value_headsrl   	self_attnr    hasattrr   rR   decoder_sparse_steprN   mlpr%   r(   r)   r   rms_norm_epsinput_layernormpost_attention_layernorm)	r9   rO   rn   r*   r-   ro   rm   rq   r   r:   r<   r=   r4     sH   

zQwen2MoeDecoderLayer.__init__r}   r^   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)r}   r^   )r   r   r   r   )r9   r}   r^   r   r<   r<   r=   rD   Q  s   
zQwen2MoeDecoderLayer.forward)NNr&   )rE   rF   rG   r   r   r   rI   r4   rK   rg   rD   rM   r<   r<   r:   r=   r     s0    2r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB deje	B f
ddZ
deeeeeef  fddZdeeeejf  dee fddZ  ZS )Qwen2MoeModelr&   r-   vllm_configr-   c                   s   t    |jj|j |jj| _| _tjj	| dd| _
tj fdd| dd\| _| _| _tj	jd| _tdd	gj	| _d S )
Nz.embed_tokensr*   r-   c                    s   t  | dS )N)rO   rn   r*   r-   )r   r   rn   rO   r*   r<   r=   <lambda>|  s    z(Qwen2MoeModel.__init__.<locals>.<lambda>z.layersr   r   r^   r   )r3   r4   model_config	hf_configrn   r*   
vocab_sizerO   r   r'   embed_tokensr#   num_hidden_layersstart_layer	end_layerlayersr   r   normr"   make_empty_intermediate_tensors)r9   r   r-   r:   r   r=   r4   j  s*   



zQwen2MoeModel.__init__	input_idsr.   c                 C   s
   |  |S N)r   r9   r   r<   r<   r=   embed_input_ids     
zQwen2MoeModel.embed_input_idsNr}   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS | 	||\}}|S )Nr^   r   )r^   r   )
r
   is_first_rankr   r   r   r   r   is_last_rankr   r   )	r9   r   r}   r   r   r^   r   layerrB   r<   r<   r=   rD     s    
zQwen2MoeModel.forwardc                 C   s   t j| ddd| jjdS )N	gate_projr6   up_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerR   )r   make_expert_params_mappingrO   rR   r9   r<   r<   r=   get_expert_mapping  s   z Qwen2MoeModel.get_expert_mappingweightsc              	   C   s  g d}t |  }t }|  }|D ]\}}|D ]=\}}	}
|	|vr#qd|v r(q||	|}|ds8|dr=||vr=qt|| rCq||vrHq|| }|j}||||
  n|D ]9}|\}}	}}
|	|vrfqY||	|}t|| rrqY|ds||dr||vrqY|| }|j}|||||
|d  nP|ds|dr||vrqt|| rq|dr|dd}||vrt	d	|| q|}d
|v rt
|jdkr|d d d f }|| }t|dt}||| || q|S )N))ry   q_projr   )ry   k_projr   )ry   v_projr   )r5   r   r   )r5   r   r   zmlp.expertsz.bias_bias)shard_id	expert_idkv_scalez	.kv_scalez.attn.kv_scalez{Found kv_scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv_scale is not loaded.zmlp.shared_expert_gater   weight_loader)r   named_parameterssetr   replaceendswithr!   r   loggerwarning_oncelenra   r   r   add)r9   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer   paramr   mappingr   remapped_kv_scale_namer<   r<   r=   load_weights  s   	




zQwen2MoeModel.load_weightsNN)rE   rF   rG   r	   rI   r4   rK   rg   r   r   rD   listtuplerH   r   r   r   r   rM   r<   r<   r:   r=   r   h  s$    
,r   c                       s   e Zd ZdZdg diZdddedef fdd	Zd
ej	dej	fddZ
		dd
ej	dB dej	dedB dej	dB dej	eB f
ddZdej	dej	dB fddZdeeeej	f  dee fddZdeeeeeef  fddZ  ZS )Qwen2MoeForCausalLMFry   )r   r   r   r&   r   r   r-   c                   s   t    |jj}|j}|| _|| _t|dg s|jdkr$ddg| jd< t	|t
|dd| _t|j|j|t
|dd	| _| jjrG| jjj| j_t|j| _| jj| _d S )
Nr   r   r   r   r5   model)r   r-   lm_headr   )r3   r4   r   r   r*   rO   r   rX   packed_modules_mappingr   r$   r   r   r   r'   r   tie_word_embeddingsr   weightr   logits_processorr   )r9   r   r-   rO   r*   r:   r<   r=   r4   $  s.   



zQwen2MoeForCausalLM.__init__r   r.   c                 C   s   | j |S r   )r   r   r   r<   r<   r=   r   A  s   z#Qwen2MoeForCausalLM.embed_input_idsNr}   r   r   c                 C   s   |  ||||}|S r   )r   )r9   r   r}   r   r   r^   r<   r<   r=   rD   D  s   zQwen2MoeForCausalLM.forwardr^   c                 C   s   |  | j|}|S r   )r   r   )r9   r^   logitsr<   r<   r=   compute_logitsP  s   z"Qwen2MoeForCausalLM.compute_logitsr   c                 C   s   t | }||S r   )r   r   )r9   r   loaderr<   r<   r=   r   W  s   
z Qwen2MoeForCausalLM.load_weightsc                 C   s
   | j  S r   )r   r   r   r<   r<   r=   r   [  r   z&Qwen2MoeForCausalLM.get_expert_mappingr   )rE   rF   rG   fall_back_to_pt_during_loadr   r	   rI   r4   rK   rg   r   r   rD   r   r   r   r   r   r   rH   r   rM   r<   r<   r:   r=   r     s4    

$&r   )E__doc__collections.abcr   	itertoolsr   typingr   rK   torch.nn.functionalr   
functionalr>   transformersr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   r   utilsr   r    r!   r"   r#   r$   rE   r   Moduler%   rN   rh   r   r   r   r<   r<   r<   r=   <module>   sD    	-NYJ 2