o
    
۾in                     @   s  d Z ddlZddlmZmZ ddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ eeAZBG dd de	jCZDG dd de	jCZEG dd  d e	jCZFG d!d" d"e	jCZGeG d#d$ d$e	jCZHG d%d& d&e	jCe8e7e6ZIdS )'zBInference-only ErineMoE model compatible with HuggingFace weights.    N)CallableIterable)islice)Any)nn)PretrainedConfig)support_torch_compile)CacheConfig
VllmConfigget_current_vllm_config)get_ep_groupget_pp_group$get_tensor_model_parallel_world_size)init_logger)
SiluAndMul)	Attention)SharedFusedMoE)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors)set_default_rope_theta   )MixtureOfExpertsSupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayerextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sT   e Zd Z				ddedededed	edB d
ededdf fddZdd Z  Z	S )Ernie4_5_MoeMLPFNT hidden_sizeintermediate_size
hidden_actuse_biasquant_configreduce_resultsprefixreturnc                    sj   t    t||gd ||| dd| _t|||||| dd| _|dkr/td| dt | _d S )	N   z.gate_up_projbiasr2   r4   z
.down_proj)r8   r2   r3   r4   siluzUnsupported activation: z!. Only silu is supported for now.)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)selfr.   r/   r0   r1   r2   r3   r4   	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/ernie45_moe.pyr;   R   s*   


zErnie4_5_MoeMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r<   r?   r=   )r@   xgate_up_rC   rC   rD   forwardr   s   
zErnie4_5_MoeMLP.forward)FNTr-   )
__name__
__module____qualname__intstrboolr   r;   rI   __classcell__rC   rC   rA   rD   r,   Q   s.    	 r,   c                	       sP   e Zd Z			ddededB dedef fdd	Zd
ej	dej	fddZ
  ZS )Ernie4_5_MoeMoENr-   Fconfigr2   r4   enable_eplbc           	         s  t    t|}|| _t | _t|dd | _t j	| _
t j| _| j
 | _|j| _| j| _t }|jj}|| _|j| _| j| _| j| j | _| j| j | _| j| j | _| j| j | _t|dddk| _| j|jkrytd| j d|j dt|j |jdt!j"d | dd| _#t$%t!j&|jt!j"d	| j#_'| jr|j(|j }t)|j ||j*|| d
dd| _+nd | _+t,| j+|j|j-|j |j(dd|| d| j#j'| j| jt!j"d| _.d S )Nmoe_num_shared_expertsr   zTensor parallel size z' is greater than the number of experts .Fz.gate)r8   params_dtyper2   r4   dtypez.shared_experts)r.   r/   r0   r2   r4   r3   Tz.experts)shared_expertsnum_expertstop_kr.   r/   r3   renormalizer2   r4   e_score_correction_biasrS   num_redundant_expertsrouter_logits_dtype)/r:   r;   r'   	layer_idxr   tp_sizegetattrrT   r   device_groupep_grouprank_in_groupep_ranksizeep_sizemoe_num_expertsn_routed_expertsn_shared_expertsr   parallel_configeplb_configrS   r^   n_redundant_expertsn_logical_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endhas_shared_expertsr>   r   r.   torchfloat32gater   	Parameteremptyr]   moe_intermediate_sizer,   r0   rY   r   moe_kexperts)	r@   rR   r2   r4   rS   r`   vllm_configrm   r/   rA   rC   rD   r;   z   s   




	

	zErnie4_5_MoeMoE.__init__hidden_statesr5   c                 C   s   |j }|j d }|d|}| |jtjd\}}| j||d}| jr-|d |d  }n|d }| jdkr<| j	|}||S )NrW   )r~   router_logitsr   r!   )
shapeviewrw   toru   rv   r|   rt   ra   &maybe_all_reduce_tensor_model_parallel)r@   r~   
orig_shape
hidden_dimr   rH   final_hidden_statesrC   rC   rD   rI      s   


zErnie4_5_MoeMoE.forward)Nr-   F)rJ   rK   rL   r   r   rN   rO   r;   ru   TensorrI   rP   rC   rC   rA   rD   rQ   y   s    UrQ   c                       s   e Zd Z							ddededed	eeef d
edB dededede	dB de
dB deddf fddZdejdejdejfddZ  ZS )Ernie4_5_MoeAttentionN   h㈵>Fr-   r.   	num_headsnum_kv_headsrope_parametershead_dimmax_position_embeddingsrms_norm_epsqkv_biascache_configr2   r4   r5   c              	      sb  t    t|dkrt|nd}|| _|| _t }|| _| j| dks&J | j| | _|| _	| j	|kr>| j	| dks=J n	|| j	 dksGJ t
d| j	| | _|pV|| j | _| j| j | _| j| j | _| jd | _|| _t|| j| j| j	||
| dd| _t| j| j |d|
| dd| _t| j||dd| _t| j| j| j| j|	|
| d	d
| _d S )Nr   r!   g      z	.qkv_projr7   Fz.o_proj)max_positionr   is_neox_stylez.attn)r   r   r2   r4   )r:   r;   lenr'   r`   r.   r   total_num_headsr   total_num_kv_headsmaxr   r   q_sizekv_sizescalingr   r   qkv_projr   o_projr   
rotary_embr   attn)r@   r.   r   r   r   r   r   r   r   r   r2   r4   r`   ra   rA   rC   rD   r;      sb   



zErnie4_5_MoeAttention.__init__	positionsr~   c           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )Nr   )dim)r   splitr   r   r   r   r   )
r@   r   r~   qkvrH   qkvattn_outputoutputrC   rC   rD   rI   2  s    zErnie4_5_MoeAttention.forward)Nr   r   FNNr-   )rJ   rK   rL   rM   dictrN   r   floatrO   r	   r   r;   ru   r   rI   rP   rC   rC   rA   rD   r      sP    
	
Jr   c                       sn   e Zd Z				ddededB dedB deded	df fd
dZde	j
de	j
de	j
dB d	e	j
fddZ  ZS )Ernie4_5_MoeDecoderLayerNr-   FrR   r   r2   r4   rS   r5   c                    sN  t    |j| _t|dd t|dd}t| j|j|jt|dd |j||j	t|dd||| dd	| _
t|}|| _t|d
d}t|dd}	t|d|jd }
t|dd}t|d|dk}|r~|d | dkr~||	kr~||
kr~t||| d|d| _nt|j|j|jt|dd|| dd| _t|j|j	d| _t|j|j	d| _d S )Ni  )default_thetar   r   r   r1   Fz
.self_attn)r.   r   r   r   r   r   r   r   r   r2   r4   ri   r   moe_layer_start_indexmoe_layer_end_indexr!   moe_layer_intervaluse_moez.mlp)rR   r2   r4   rS   )r.   r/   r0   r1   r2   r4   eps)r:   r;   r.   r    rb   r   num_attention_headsnum_key_value_headsr   r   	self_attnr'   r`   num_hidden_layersrQ   mlpr,   r/   r0   r   input_layernormpost_attention_layernorm)r@   rR   r   r2   r4   rS   r   r`   ri   r   r   r   r   rA   rC   rD   r;   D  sb   




	z!Ernie4_5_MoeDecoderLayer.__init__r   r~   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)r   r~   )r   r   r   r   )r@   r   r~   r   rC   rC   rD   rI     s   
z Ernie4_5_MoeDecoderLayer.forward)NNr-   F)rJ   rK   rL   r   r	   r   rN   rO   r;   ru   r   rI   rP   rC   rC   rA   rD   r   C  s6    Ar   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB deje	B f
ddZ
deeeeeef  fddZdeeeejf  dee fddZ  ZS )Ernie4_5_MoeModelr-   r4   r}   r4   c                   s   t    |jj|j |jj| _j| _| _	|j
}|j}|j|j| _t jr:tjj| dd| _nt | _tj fdd| dd\| _| _| _t jrdtjjd| _nt | _tdd	gj| _d S )
Nz.embed_tokensr2   r4   c                    s   t  | dS )N)rR   r   r2   r4   rS   )r   r   r   rR   rS   r2   rC   rD   <lambda>  s    z,Ernie4_5_MoeModel.__init__.<locals>.<lambda>z.layersr   r   r~   r   )r:   r;   model_config	hf_configr   r2   pad_token_idpadding_idx
vocab_sizerR   rl   rm   rS   r^   r   is_first_rankr   r.   embed_tokensr&   r*   r   start_layer	end_layerlayersis_last_rankr   r   normr)   make_empty_intermediate_tensors)r@   r}   r4   rl   rm   rA   r   rD   r;     s<   



zErnie4_5_MoeModel.__init__	input_idsr5   c                 C   s
   |  |S rE   )r   r@   r   rC   rC   rD   embed_input_ids     
z!Ernie4_5_MoeModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS | 	||\}}|S )Nr~   r   )r~   r   )
r   r   r   r   r   r   r   r   r   r   )	r@   r   r   r   r   r~   r   layerrH   rC   rC   rD   rI     s    
zErnie4_5_MoeModel.forwardc                 C   s   t j| ddd| jj| jdS )N	gate_projr=   up_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerZ   r^   )r   make_expert_params_mappingrR   ri   r^   r@   rC   rC   rD   get_expert_mapping  s   z$Ernie4_5_MoeModel.get_expert_mappingweightsc              
   C   s  g d}t |  }t }|  }|D ]\}}| jjr!|dr!qd|v r&qd|v r5|dd}|d}|D ]<\}}	}
|	|vrAq7d|v rJ||vrJq7||	|}|d	sZ|d
r_||vr_q7t	|| req7|| }|j
}||||
  nd}|D ]I}|\}}	}}
|	|vrqxd}||	|}t	|| rqx|d	s|d
r||vrqx|| }ttdtf |j
}|||||
|dd}|r|} n2qx|rq|d	s|d
r||vrqt	|| rqt||}|d u rq|| }t|dt}||| || q|S )N))r   q_projr   )r   k_projr   )r   v_projr   )r<   r   r   )r<   r   r!   zlm_head.weightmtpr]   moe_staticsrw   r   zmlp.experts.z.bias_biasFT.)shard_id	expert_idreturn_successweight_loader)r   named_parameterssetr   rR   tie_word_embeddingsendswithreplacesqueezer(   r   typingcastr   rO   r   rb   r   add)r@   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer   paramr   is_expert_weightmappingr   name_mappedsuccessrC   rC   rD   load_weights  s   	





zErnie4_5_MoeModel.load_weightsNN)rJ   rK   rL   r
   rN   r;   ru   r   r   r   rI   listtuplerM   r   r   r   r   rP   rC   rC   rA   rD   r     s$    /
,r   c                       s  e Zd Zg dddgdZdZddded	ef fd
dZdededdfddZ	de
jde
jfddZ		d"de
jdB de
jdedB de
jdB de
jeB f
ddZde
jde
jdB fddZdeeee
jf  dee fddZdeeeeeef  fd d!Z  ZS )#Ernie4_5_MoeForCausalLM)r   r   r   r   r   )r   r<   Fr-   r   r}   r4   c                   s  t    |jj |j} | _|| _t|t|dd| _t	 j
r/t j j|t|dd| _nt | _| jjr>| jjj| j_t j| _| jj| _g | _ fddt jD }t|| _d| _g | _d }| jjD ] }t|trqqit|tsxJ t|j t!r|j }| j"|j j# qi|d u rt$%d d	| _&d	| _'d	| _(d	| _)d	| _*d	| _+d S |j,| _&|j-| _'|j.| _(|j/| _)|j0| _*|j1| _+d S )
Nmodel)r}   r4   lm_headr   c                    s6   g | ]}| j kr| jkr|d   j dkr|qS )r!   r   )r   r   r   ).0irR   rC   rD   
<listcomp>  s    

z4Ernie4_5_MoeForCausalLM.__init__.<locals>.<listcomp>r!   z/No Ernie4_5_MoeMoE layer found in model.layers.r   )2r:   r;   r   r   r2   rR   r   r+   r  r   r   r   r   r.   r  r&   r   r   weightr   logits_processorr   expert_weightsranger   r   num_moe_layersnum_expert_groups
moe_layersr   
isinstancer   r   rQ   appendr|   loggerwarningnum_logical_expertsnum_physical_expertsnum_local_physical_expertsnum_routed_expertsnum_shared_expertsr^   ro   rp   rq   rj   rk   rn   )r@   r}   r4   r2   moe_layers_indicesexample_moer   rA   r  rD   r;     sf   




	


z Ernie4_5_MoeForCausalLM.__init__r  r  r5   Nc                 C   sh   | j |ksJ || _|| _ || j | _| jjD ]}t|jtr1|j}||_	||_
| j|_|j  qd S rE   )r  r  r  r^   r  r   r  r   rQ   rq   rp   rn   r|   update_expert_map)r@   r  r  r   moerC   rC   rD    update_physical_experts_metadata  s   
z8Ernie4_5_MoeForCausalLM.update_physical_experts_metadatar   c                 C   s   | j |S rE   )r  r   r   rC   rC   rD   r     s   z'Ernie4_5_MoeForCausalLM.embed_input_idsr   r   r   c                 C   s   |  ||||}|S rE   )r  )r@   r   r   r   r   r~   rC   rC   rD   rI     s   zErnie4_5_MoeForCausalLM.forwardr~   c                 C   s   |  | j|}|S rE   )r
  r  )r@   r~   logitsrC   rC   rD   compute_logits  s   z&Ernie4_5_MoeForCausalLM.compute_logitsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r%   rR   r   r   )r@   r   loaderrC   rC   rD   r     s
   
z$Ernie4_5_MoeForCausalLM.load_weightsc                 C   s
   | j  S rE   )r  r   r   rC   rC   rD   r     r   z*Ernie4_5_MoeForCausalLM.get_expert_mappingr   )rJ   rK   rL   packed_modules_mappingfall_back_to_pt_during_loadr
   rN   r;   rM   r  ru   r   r   r   rI   r  r   r  r   r   r   r   rP   rC   rC   rA   rD   r  q  sH    E


$&r  )J__doc__r   collections.abcr   r   	itertoolsr   r   ru   r   transformersr   vllm.compilation.decoratorsr   vllm.configr	   r
   r   vllm.distributedr   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   vllm.transformers_utils.configr    
interfacesr"   r#   r$   utilsr%   r&   r'   r(   r)   r*   r+   rJ   r  Moduler,   rQ   r   r   r   r  rC   rC   rC   rD   <module>   sF   $
(n\\ R