o
    -i2f                     @   s  d Z ddlZddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z- ddl.m/Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z; ddl<m=Z= ee>Z?G dd dej@ZAG dd dej@ZBG dd dej@ZCedddddd G d!d" d"ej@ZDG d#d$ d$ej@e-e,ZEdS )%z?Inference-only AfMoE model compatible with HuggingFace weights.    N)CallableIterable)islice)nn)	Attention)support_torch_compile)CacheConfig
VllmConfigget_current_vllm_config)get_ep_groupget_pp_group$get_tensor_model_parallel_world_size)init_logger)SharedFusedMoE)RMSNorm)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)SupportsLoRA
SupportsPP)LlamaMLP)AutoWeightsLoaderPPMissingLayerWeightsMapperextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefix)IntermediateTensors)AttentionTypec                       sL   e Zd Z			ddedB dedef fddZd	ejd
ejfddZ	  Z
S )AfmoeMoEN Fquant_configprefixenable_eplbc                    sD  t    t | _|j| _|j| _|j| _t j| _	| j	
 | _| j	 | _|j| _|j| _|jdkr<td|j dtj|j|jdtjd| _ttj|jtjd| _t }|jj}|| _ |j!| _"| j| _#| j#| j" | _$| j$| j | _%| j| j% | _&| j&| j% | _'d | _(|jdkr|j)|j }t*|j||j|d| dd	| _(t+di d
| j(d|jd|j,d|jd|j)ddd| jdkr| jn/dd|ddd|j-d|j.d| dd| jd| jd| jd| j d| j"| _/d S d|ddd|j-d|j.d| dd| jd| jd| jd| j d| j"| _/d S )NsiluzUnsupported activation: z!. Only silu is supported for now.F)biasdtyper/   r   z.shared_experts)hidden_sizeintermediate_size
hidden_actr*   reduce_resultsr+   shared_expertsnum_expertstop_kr1   r2   r4   renormalizesigmoidr*   use_grouped_topkTnum_expert_group
topk_groupr+   z.expertsscoring_funcrouted_scaling_factore_score_correction_biasr,   num_redundant_experts )0super__init__r   tp_sizeroute_scale
score_func
route_normr   device_groupep_grouprankep_ranksizeep_sizer6   n_routed_expertsnum_shared_expertsn_shared_expertsr3   
ValueErrorr   Linearr1   torchfloat32gate	Parameteremptyexpert_biasr
   parallel_configeplb_configr,   r@   n_redundant_expertsn_logical_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr5   moe_intermediate_sizeAfmoeMLPr   num_experts_per_tokn_groupr<   experts)selfconfigr*   r+   r,   vllm_configrZ   r2   	__class__rA   ]/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/afmoe.pyrC   :   s   





	


	


zAfmoeMoE.__init__hidden_statesreturnc                 C   sz   |j \}}|d|}| |jtjd}| j||d}| jd ur*|\}}|| }n|}| jdkr7| j	|}|||S )Nr0   )rl   router_logits   )
shapeviewrU   torS   rT   re   r5   rD   &maybe_all_reduce_tensor_model_parallel)rf   rl   
num_tokens
hidden_dimro   fused_moe_outshared_outputfinal_hidden_statesrA   rA   rk   forward   s   



zAfmoeMoE.forward)Nr)   F)__name__
__module____qualname__r   strboolrC   rS   Tensorrz   __classcell__rA   rA   ri   rk   r(   9   s    Ur(   c                       s   e Zd Zddddddejfdedededed	ed
edB dededB dedB de	de	ddf fddZ
dejdejdejfddZ  ZS )AfmoeAttention   Ngh㈵>r)   	layer_idxr1   	num_headsnum_kv_headsmax_position_embeddingshead_dimrms_norm_epscache_configr*   r+   	attn_typerm   c                    s  t    || _|| _t }|| _| j| dksJ | j| | _|| _| j|kr2| j| dks1J n	|| j dks;J td| j| | _	|pJ|| j | _
| j| j
 | _| j	| j
 | _| j
d | _|| _|j| dk| _| jrq|jnd | _t| j| j
| j| jd|
| dd| _t| j| j
 | jd|
| dd| _t|| j| j
 d|
| d	d| _t| j
|jd
| _t| j
|jd
| _| jrt| j
||jdd| _nd | _t| j| j
| j| j	|	|
| j| d|d	| _ d S )Nr   rp   g      sliding_attentionFz	.qkv_proj)r.   r*   r+   z.o_projz
.gate_projepsT)max_positionrope_parametersis_neox_stylez.attn)r   r   r*   per_layer_sliding_windowr+   r   )!rB   rC   r   r1   r   total_num_headsr   total_num_kv_headsmaxr   r   q_sizekv_sizescalingr   layer_typesis_local_attentionsliding_windowr   qkv_projr   o_projr   	gate_projr   r   q_normk_normr   r   
rotary_embr   attn)rf   rg   r   r1   r   r   r   r   r   r   r*   r+   r   rD   ri   rA   rk   rC      s~   



	
	
zAfmoeAttention.__init__	positionsrl   c                 C   s   |  |\}}| |\}}|j| j| j| jgdd\}}}| |d| j| j|j	}| 
|d| j| j|j	}| jrO| jd urO| |||\}}| |||}	|	t| }	| |	\}
}|
S )Nrn   )dim)r   r   splitr   r   r   reshaper   r   rq   r   r   r   r   r   rS   r9   r   )rf   r   rl   qkv_rU   qkvattn_outputoutputrA   rA   rk   rz     s     zAfmoeAttention.forward)r{   r|   r}   r'   DECODERintfloatr   r   r~   rC   rS   r   rz   r   rA   rA   ri   rk   r      sP    	
`r   c                       st   e Zd Z				ddedB dedB dededdf
 fd	d
Zdej	dej	dej	dB de
ej	ej	f fddZ  ZS )AfmoeDecoderLayerNr)   Fr   r*   r+   r,   rm   c                    s   t    |j| _t|dd}t|| _t|| j| j|j|j||j	|j
||| dd| _| j|jk| _| jrDt||| d|d| _nt|j|j|j|| dd| _t|j|j
d| _t|j|j
d| _t|j|j
d| _t|j|j
d| _d S )	Nr   r   z
.self_attn)rg   r   r1   r   r   r   r   r   r   r*   r+   z.mlp)rg   r*   r+   r,   )r1   r2   r3   r*   r+   r   )rB   rC   r1   getattrr!   r   r   num_attention_headsnum_key_value_headsr   r   	self_attnnum_dense_layersmoe_enabledr(   mlprb   r2   r3   r   input_layernormpost_attention_layernormpre_mlp_layernormpost_mlp_layernorm)rf   rg   r   r*   r+   r,   r   ri   rA   rk   rC   #  sL   


zAfmoeDecoderLayer.__init__r   rl   residualc                 C   sl   |d u r|}|  |}n|  ||\}}| j||d}| |}| ||\}}| |}| |}||fS )N)r   rl   )r   r   r   r   r   r   )rf   r   rl   r   rA   rA   rk   rz   Z  s   


zAfmoeDecoderLayer.forward)NNr)   F)r{   r|   r}   r   r   r~   r   rC   rS   r   tuplerz   r   rA   rA   ri   rk   r   "  s2    7r   rn   )	input_idsr   intermediate_tensorsinputs_embeds)dynamic_arg_dimsc                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdejde	dB dejdB deje	B f
ddZ
dedejdejde	fddZdeeeeeef  fddZdeeeejf  dee fddZ  ZS )
AfmoeModelr)   r+   rh   r+   c                   s   t    |jj|j |j|jj| _j	| _	j
| _
t jr0tj	j| dd| _nt | _tj fdd| dd\| _| _| _t jrZtjjd| _nt | _tddgj| _d S )	Nz.embed_tokensr   c                    s   t  | dS )N)rg   r   r*   r+   r,   )r   r   r   rg   r,   r*   rA   rk   <lambda>  s    z%AfmoeModel.__init__.<locals>.<lambda>z.layersr   rl   r   )rB   rC   model_config	hf_configr   r*   rY   r,   rg   
vocab_sizemup_enabledr   is_first_rankr   r1   embed_tokensr   r$   num_hidden_layersstart_layer	end_layerlayersis_last_rankr   r   normr#   make_empty_intermediate_tensors)rf   rh   r+   ri   r   rk   rC     s0   



zAfmoeModel.__init__r   rm   c                 C   s
   |  |S N)r   rf   r   rA   rA   rk   embed_input_ids     
zAfmoeModel.embed_input_idsNr   r   r   c           	      C   s   t  jr|d ur|}n| |}| jr|| jjd  }d }n|d us$J |d }|d }t| j| j| j	D ]
}||||\}}q5t  j
sKt||dS | ||\}}|S )Ng      ?rl   r   rl   r   )r   r   r   r   rg   r1   r   r   r   r   r   r&   r   )	rf   r   r   r   r   rl   r   layerr   rA   rA   rk   rz     s$   
zAfmoeModel.forward
batch_sizer/   devicec                 C   s6   t tj|| jjf||dtj|| jjf||ddS )N)r/   r   r   )r&   rS   zerosrg   r1   )rf   r   r/   r   rA   rA   rk   r     s   z*AfmoeModel.make_empty_intermediate_tensorsc                 C   s   t j| ddd| jjdS )Nr   	down_projup_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer6   )r   make_expert_params_mappingrg   r6   rf   rA   rA   rk   get_expert_mapping  s   zAfmoeModel.get_expert_mappingweightsc              
   C   s  g d}t |  }t }|  }|D ]\}}|D ];\}}	}
|	|vs&d|v r'qd|v r0||vr0q||	|}|dr@||vr@qt|| rFq|| }|j}||||
  nkd}|D ]:}|\}}	}}
|	|vrfqYd}||	|}t|| rtqY|| }t	t
dtf |j}|||||
|dd}|r|} n-qY|rq|dr||vrqt||}|d u rqt|| rq|| }t|d	t}||| || q|S )
N))r   q_projr   )r   k_projr   )r   v_projr   )gate_up_projr   r   )r   r   rp   zself_attn.gate_projzmlp.experts.z.biasFT.)shard_id	expert_idreturn_successweight_loader)dictnamed_parameterssetr   replaceendswithr"   r   typingcastr   r   r   r   r   add)rf   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer   paramr   is_expert_weightmappingr   name_mappedsuccessrA   rA   rk   load_weights  sx   	




zAfmoeModel.load_weightsNN)r{   r|   r}   r	   r~   rC   rS   r   r   r&   rz   r   r/   r   r   listr   r   r   r   r  r   rA   rA   ri   rk   r   v  s6    	(
"
,r   c                       s  e Zd Zg dddgdZeddidZdZd	d
dedef fddZ	de
jde
jde
jddfddZde
jde
jfddZ		d&de
jde
jdedB de
jdB de
jeB f
ddZde
jde
jdB fdd Zd!eeee
jf  dee fd"d#Zdeeeeeef  fd$d%Z  ZS )'AfmoeForCausalLM)r   r   r   r   r   )r   r   z.router.gate.weightz.gate.weight)orig_to_new_suffixFr)   r   rh   r+   c                   s8  t    |jj}|j}|| _|| _t|t|dd| _t	 j
r+t|j|j|d| _nt | _t|j| _| jj| _g | _|j|j | _|j| _g | _d }| jjD ]}t|trYqQt|ts`J |jrn|j}| j|jj  qQ|d u r|| jdkr|t!d|d ur|j"| _#|j$| _%|j&| _'|j(| _)|j*| _+|j,| _-d S d S )Nmodel)rh   r+   )r*   r   z(No AfmoeMoE layer found in model.layers.).rB   rC   r   r   r*   rg   r   r%   r  r   r   r   r   r1   lm_headr   r   logits_processorr   expert_weightsr   r   num_moe_layersrd   num_expert_groups
moe_layersr   
isinstancer   r   r   appendre   RuntimeErrorr\   num_logical_expertsr]   num_physical_expertsr^   num_local_physical_expertsrN   num_routed_expertsrP   rO   r[   r@   )rf   rh   r+   rg   r*   example_moer   ri   rA   rk   rC   b  sN   




zAfmoeForCausalLM.__init__expert_load_viewlogical_to_physical_maplogical_replica_countrm   Nc                 C   s:   t | jD ]\}}| j|  |j||||d qd S )N)moe_layer_idxr  r  r  )	enumerater  r  r  get_expert_weightsset_eplb_state)rf   r  r  r  r   r   rA   rA   rk   r    s   zAfmoeForCausalLM.set_eplb_stater   c                 C   s   | j |S r   )r  r   r   rA   rA   rk   r     s   z AfmoeForCausalLM.embed_input_idsr   r   r   c                 C   s   |  ||||}|S r   )r  )rf   r   r   r   r   rl   rA   rA   rk   rz     s   zAfmoeForCausalLM.forwardrl   c                 C   s   |  | j|}|S r   )r
  r	  )rf   rl   logitsrA   rA   rk   compute_logits  s   zAfmoeForCausalLM.compute_logitsr   c                 C   s   t | }|j|| jdS )N)mapper)r   r  hf_to_vllm_mapper)rf   r   loaderrA   rA   rk   r    s   zAfmoeForCausalLM.load_weightsc                 C   s
   | j  S r   )r  r   r   rA   rA   rk   r     r   z#AfmoeForCausalLM.get_expert_mappingr  )r{   r|   r}   packed_modules_mappingr    r!  fall_back_to_pt_during_loadr	   r~   rC   rS   r   r  r   r&   rz   r  r   r   r   r  r  r   r   r   rA   rA   ri   rk   r  M  sL    /

$&r  )F__doc__r   collections.abcr   r   	itertoolsr   rS   r   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r	   r
   vllm.distributedr   r   r   vllm.loggerr   5vllm.model_executor.layers.fused_moe.shared_fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   %vllm.model_executor.models.interfacesr   r    vllm.model_executor.models.llamar   rb    vllm.model_executor.models.utilsr   r   r    r!   r"   r#   r$   r%   vllm.sequencer&   vllm.v1.attention.backendr'   r{   loggerModuler(   r   r   r   r  rA   rA   rA   rk   <module>   sL   (
m|T P