o
    
۾ii                     @   s  d Z ddlZddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z1 ddl2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z: ddl;m<Z< ddl=m>Z> ee?Z@G dd dejAZBG dd dejAZCG dd dejAZDe
dddddd G d!d" d"ejAZEG d#d$ d$ejAe.e,e-ZFdS )%z?Inference-only AfMoE model compatible with HuggingFace weights.    N)CallableIterable)islice)nn)support_torch_compile)CacheConfig
VllmConfigget_current_vllm_config)get_ep_groupget_pp_group$get_tensor_model_parallel_world_size)init_logger)	Attention)SharedFusedMoE)RMSNorm)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)SupportsEagle3SupportsLoRA
SupportsPP)LlamaMLP)AutoWeightsLoaderPPMissingLayerWeightsMapperextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefix)IntermediateTensors)AttentionTypec                       sL   e Zd Z			ddedB dedef fddZd	ejd
ejfddZ	  Z
S )AfmoeMoEN Fquant_configprefixenable_eplbc                    sT  t    t | _|j| _|j| _|j| _t j| _	| j	
 | _| j	 | _|j| _|j| _|jdkr<td|j dtj|j|jdtjd| _ttj|jtjd| _t }|jj}|| _ |j!| _"| j| _#| j#| j" | _$| j$| j | _%| j| j% | _&| j&| j% | _'d | _(|jdkr|j)|j }t*|j||j|d| dd	| _(t+di d
| j(d|jd|j,d|jd|j)ddd| jdkr| jn3dd|ddd|j-d|j.d| dd| jd| jd| jd| j d| j"dtj| _/d S d|ddd|j-d|j.d| dd| jd| jd| jd| j d| j"dtj| _/d S ) NsiluzUnsupported activation: z!. Only silu is supported for now.F)biasdtyper0   r   z.shared_experts)hidden_sizeintermediate_size
hidden_actr+   reduce_resultsr,   shared_expertsnum_expertstop_kr2   r3   r5   renormalizesigmoidr+   use_grouped_topkTnum_expert_group
topk_groupr,   z.expertsscoring_funcrouted_scaling_factore_score_correction_biasr-   num_redundant_expertsrouter_logits_dtype )0super__init__r   tp_sizeroute_scale
score_func
route_normr
   device_groupep_grouprankep_ranksizeep_sizer7   n_routed_expertsnum_shared_expertsn_shared_expertsr4   
ValueErrorr   Linearr2   torchfloat32gate	Parameteremptyexpert_biasr	   parallel_configeplb_configr-   rA   n_redundant_expertsn_logical_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr6   moe_intermediate_sizeAfmoeMLPr   num_experts_per_tokn_groupr=   experts)selfconfigr+   r,   r-   vllm_configr\   r3   	__class__rC   T/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/afmoe.pyrE   >   s   





	


	


zAfmoeMoE.__init__hidden_statesreturnc                 C   sz   |j \}}|d|}| |jtjd}| j||d}| jd ur*|\}}|| }n|}| jdkr7| j	|}|||S )Nr1   )rn   router_logits   )
shapeviewrW   torU   rV   rg   r6   rF   &maybe_all_reduce_tensor_model_parallel)rh   rn   
num_tokens
hidden_dimrq   fused_moe_outshared_outputfinal_hidden_statesrC   rC   rm   forward   s   



zAfmoeMoE.forward)Nr*   F)__name__
__module____qualname__r   strboolrE   rU   Tensorr|   __classcell__rC   rC   rk   rm   r)   =   s    Vr)   c                       s   e Zd Zddddddejfdedededed	ed
edB dededB dedB de	de	ddf fddZ
dejdejdejfddZ  ZS )AfmoeAttention   Ngh㈵>r*   	layer_idxr2   	num_headsnum_kv_headsmax_position_embeddingshead_dimrms_norm_epscache_configr+   r,   	attn_typero   c                    s  t    || _|| _t }|| _| j| dksJ | j| | _|| _| j|kr2| j| dks1J n	|| j dks;J td| j| | _	|pJ|| j | _
| j| j
 | _| j	| j
 | _| j
d | _|| _|j| dk| _| jrq|jnd | _t| j| j
| j| jd|
| dd| _t| j| j
 | jd|
| dd| _t|| j| j
 d|
| d	d| _t| j
|jd
| _t| j
|jd
| _| jrt| j
||jdd| _nd | _t| j| j
| j| j	|	|
| j| d|d	| _ d S )Nr   rr   g      sliding_attentionFz	.qkv_proj)r/   r+   r,   z.o_projz
.gate_projepsT)max_positionrope_parametersis_neox_stylez.attn)r   r   r+   per_layer_sliding_windowr,   r   )!rD   rE   r   r2   r   total_num_headsr   total_num_kv_headsmaxr   r   q_sizekv_sizescalingr   layer_typesis_local_attentionsliding_windowr   qkv_projr   o_projr   	gate_projr   r   q_normk_normr   r   
rotary_embr   attn)rh   ri   r   r2   r   r   r   r   r   r   r+   r,   r   rF   rk   rC   rm   rE      s~   



	
	
zAfmoeAttention.__init__	positionsrn   c                 C   s   |  |\}}| |\}}|j| j| j| jgdd\}}}| |d| j| j|j	}| 
|d| j| j|j	}| jrO| jd urO| |||\}}| |||}	|	t| }	| |	\}
}|
S )Nrp   )dim)r   r   splitr   r   r   reshaper   r   rs   r   r   r   r   r   rU   r:   r   )rh   r   rn   qkv_rW   qkvattn_outputoutputrC   rC   rm   r|     s     zAfmoeAttention.forward)r}   r~   r   r(   DECODERintfloatr   r   r   rE   rU   r   r|   r   rC   rC   rk   rm   r      sP    	
`r   c                       st   e Zd Z				ddedB dedB dededdf
 fd	d
Zdej	dej	dej	dB de
ej	ej	f fddZ  ZS )AfmoeDecoderLayerNr*   Fr   r+   r,   r-   ro   c                    s   t    |j| _t|dd}t|| _t|| j| j|j|j||j	|j
||| dd| _| j|jk| _| jrDt||| d|d| _nt|j|j|j|| dd| _t|j|j
d| _t|j|j
d| _t|j|j
d| _t|j|j
d| _d S )	Nr   r   z
.self_attn)ri   r   r2   r   r   r   r   r   r   r+   r,   z.mlp)ri   r+   r,   r-   )r2   r3   r4   r+   r,   r   )rD   rE   r2   getattrr"   r   r   num_attention_headsnum_key_value_headsr   r   	self_attnnum_dense_layersmoe_enabledr)   mlprd   r3   r4   r   input_layernormpost_attention_layernormpre_mlp_layernormpost_mlp_layernorm)rh   ri   r   r+   r,   r-   r   rk   rC   rm   rE   (  sL   


zAfmoeDecoderLayer.__init__r   rn   residualc                 C   sl   |d u r|}|  |}n|  ||\}}| j||d}| |}| ||\}}| |}| |}||fS )N)r   rn   )r   r   r   r   r   r   )rh   r   rn   r   rC   rC   rm   r|   _  s   


zAfmoeDecoderLayer.forward)NNr*   F)r}   r~   r   r   r   r   r   rE   rU   r   tupler|   r   rC   rC   rk   rm   r   '  s2    7r   rp   )	input_idsr   intermediate_tensorsinputs_embeds)dynamic_arg_dimsc                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB deje	B e
ejeej f B f
ddZdedejdejde	fddZdee
eeeef  fddZdee
eejf  dee fddZ  ZS )
AfmoeModelr*   r,   rj   r,   c                   s   t    |jj|j |j|jj| _j	| _	j
| _
t jr0tj	j| dd| _nt | _tj fdd| dd\| _| _| _t jrZtjjd| _nt | _ttdf  | _tdd	gj| _d S )
Nz.embed_tokensr   c                    s   t  | dS )N)ri   r   r+   r,   r-   )r   r   r   ri   r-   r+   rC   rm   <lambda>  s    z%AfmoeModel.__init__.<locals>.<lambda>z.layersr   .rn   r   )rD   rE   model_config	hf_configr   r+   r[   r-   ri   
vocab_sizemup_enabledr   is_first_rankr   r2   embed_tokensr    r%   num_hidden_layersstart_layer	end_layerlayersis_last_rankr   r   normr   r   aux_hidden_state_layersr$   make_empty_intermediate_tensors)rh   rj   r,   rk   r   rm   rE     s2   



zAfmoeModel.__init__r   ro   c                 C   s
   |  |S N)r   rh   r   rC   rC   rm   embed_input_ids     
zAfmoeModel.embed_input_idsNr   r   r   c                 C   s   t  jr|d ur|}n| |}| jr|| jjd  }d }n|d us$J |d }|d }g }tt| j| j	| j
D ]\}}	|| jv rO||d urL|| n| |	|||\}}q9t  jsct||dS | ||\}}
t|dkru||fS |S )Ng      ?rn   r   rn   r   r   )r   r   r   r   ri   r2   	enumerater   r   r   r   r   appendr   r'   r   len)rh   r   r   r   r   rn   r   aux_hidden_statesidxlayerr   rC   rC   rm   r|     s6   

zAfmoeModel.forward
batch_sizer0   devicec                 C   s6   t tj|| jjf||dtj|| jjf||ddS )N)r0   r   r   )r'   rU   zerosri   r2   )rh   r   r0   r   rC   rC   rm   r     s   z*AfmoeModel.make_empty_intermediate_tensorsc                 C   s   t j| ddd| jjdS )Nr   	down_projup_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer7   )r   make_expert_params_mappingri   r7   rh   rC   rC   rm   get_expert_mapping  s   zAfmoeModel.get_expert_mappingweightsc              
   C   s  g d}t |  }t }|  }|D ]\}}|D ];\}}	}
|	|vs&d|v r'qd|v r0||vr0q||	|}|dr@||vr@qt|| rFq|| }|j}||||
  nkd}|D ]:}|\}}	}}
|	|vrfqYd}||	|}t|| rtqY|| }t	t
dtf |j}|||||
|dd}|r|} n-qY|rq|dr||vrqt||}|d u rqt|| rq|| }t|d	t}||| || q|S )
N))r   q_projr   )r   k_projr   )r   v_projr   )gate_up_projr   r   )r   r   rr   zself_attn.gate_projzmlp.experts.z.biasFT.)shard_id	expert_idreturn_successweight_loader)dictnamed_parameterssetr   replaceendswithr#   r   typingcastr   r   r   r   r   add)rh   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer   paramr   is_expert_weightmappingr   name_mappedsuccessrC   rC   rm   load_weights  sx   	




zAfmoeModel.load_weightsNN)r}   r~   r   r   r   rE   rU   r   r   r'   r   listr|   r   r0   r   r   r   r   r   r  r   rC   rC   rk   rm   r   {  s6    	*
-
,r   c                       sb  e Zd Zg dddgdZeddidZdZd	d
dedef fddZ	de
jde
jde
jddfddZde
jde
jfddZdeedf ddfddZdeedf fddZ		d,de
jdB de
jd edB d!e
jdB de
jeB ee
jee
j f B f
d"d#Zd$e
jde
jdB fd%d&Zd'eeee
jf  dee fd(d)Zdeeeeeef  fd*d+Z  ZS )-AfmoeForCausalLM)r   r   r   r   r   )r   r   z.router.gate.weightz.gate.weight)orig_to_new_suffixFr*   r   rj   r,   c                   s8  t    |jj}|j}|| _|| _t|t|dd| _t	 j
r+t|j|j|d| _nt | _t|j| _| jj| _g | _|j|j | _|j| _g | _d }| jjD ]}t|trYqQt|ts`J |jrn|j}| j|jj  qQ|d u r|| jdkr|t!d|d ur|j"| _#|j$| _%|j&| _'|j(| _)|j*| _+|j,| _-d S d S )Nmodel)rj   r,   )r+   r   z(No AfmoeMoE layer found in model.layers.).rD   rE   r   r   r+   ri   r   r&   r  r   r   r   r   r2   lm_headr    r   logits_processorr   expert_weightsr   r   num_moe_layersrf   num_expert_groups
moe_layersr   
isinstancer   r   r   r   rg   RuntimeErrorr^   num_logical_expertsr_   num_physical_expertsr`   num_local_physical_expertsrP   num_routed_expertsrR   rQ   r]   rA   )rh   rj   r,   ri   r+   example_moer   rk   rC   rm   rE   t  sN   




zAfmoeForCausalLM.__init__expert_load_viewlogical_to_physical_maplogical_replica_countro   Nc                 C   s:   t | jD ]\}}| j|  |j||||d qd S )N)moe_layer_idxr  r  r   )r   r  r  r   get_expert_weightsset_eplb_state)rh   r  r  r   r   r   rC   rC   rm   r#    s   zAfmoeForCausalLM.set_eplb_stater   c                 C   s   | j |S r   )r  r   r   rC   rC   rm   r        z AfmoeForCausalLM.embed_input_idsr   .c                 C   s   || j _d S r   )r  r   )rh   r   rC   rC   rm   set_aux_hidden_state_layers  r$  z,AfmoeForCausalLM.set_aux_hidden_state_layersc                 C   s   t | jj}d|d |d fS )N      )r   r  r   )rh   
num_layersrC   rC   rm   "get_eagle3_aux_hidden_state_layers  s   z3AfmoeForCausalLM.get_eagle3_aux_hidden_state_layersr   r   r   c                 C   s   |  ||||}|S r   )r  )rh   r   r   r   r   rn   rC   rC   rm   r|     s   zAfmoeForCausalLM.forwardrn   c                 C   s   |  | j|}|S r   )r  r  )rh   rn   logitsrC   rC   rm   compute_logits  s   zAfmoeForCausalLM.compute_logitsr   c                 C   s   t | }|j|| jdS )N)mapper)r   r  hf_to_vllm_mapper)rh   r   loaderrC   rC   rm   r    s   zAfmoeForCausalLM.load_weightsc                 C   s
   | j  S r   )r  r   r   rC   rC   rm   r     r   z#AfmoeForCausalLM.get_expert_mappingr  )r}   r~   r   packed_modules_mappingr!   r-  fall_back_to_pt_during_loadr   r   rE   rU   r   r#  r   r   r   r%  r)  r'   r  r|   r+  r   r   r  r   r   rC   rC   rk   rm   r  _  sP    /

$&r  )G__doc__r   collections.abcr   r   	itertoolsr   rU   r   vllm.compilation.decoratorsr   vllm.configr   r   r	   vllm.distributedr
   r   r   vllm.loggerr   $vllm.model_executor.layers.attentionr   5vllm.model_executor.layers.fused_moe.shared_fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   %vllm.model_executor.models.interfacesr   r   r    vllm.model_executor.models.llamar   rd    vllm.model_executor.models.utilsr   r    r!   r"   r#   r$   r%   r&   vllm.sequencer'   vllm.v1.attention.backendr(   r}   loggerModuler)   r   r   r   r  rC   rC   rC   rm   <module>   sL   (
n|T ]