o
    -i[                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/ G dd deZ0G dd dej1j2Z3d-ddZ4dej5dej5de6d e7fd!d"Z8G d#d$ d$ej9Z:G d%d& d&ej9Z;G d'd( d(ej9Z<eG d)d* d*ej9Z=G d+d, d,ej9e(e)Z>dS ).zInference-only PhiMoE model.    )Iterable)isliceN)nn)PretrainedConfig)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)FusedMoE)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       s\   e Zd ZdZdgZ										
																	d fdd	Z  ZS )PhiMoEConfigphimoepast_key_values }      8         Nsilu   {Gz?h㈵>Tr      F           MbP?c                    s   || _ |	| _|| _|| _|| _|| _|| _|| _|| _|d u r!|}|d u r)|| }|| _	|| _
|| _|
| _|| _|| _|d u rJ|dd}d|d}|| _|| _|| _|| _|| _|| _t jd||||d| d S )N
rope_thetag    .Adefault)	rope_typer0   )pad_token_idbos_token_ideos_token_idtie_word_embeddings )
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headssliding_windowattention_biaslm_head_biasnum_key_value_headshead_dim
hidden_actinitializer_rangerms_norm_eps	use_cachepopattention_dropoutnum_experts_per_toknum_local_expertsoutput_router_logitsrouter_aux_loss_coefrouter_jitter_noisesuper__init__)selfr8   r:   r;   r<   r=   rA   rB   rC   r9   rD   rE   rF   r3   r4   r5   r6   rope_parametersr>   rH   rI   rJ   rK   rL   rM   r?   r@   kwargsr0   	__class__r7   ^/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/phimoe.pyrO   G   sH   

zPhiMoEConfig.__init__)r#   r$   r%   r&   r&   r'   Nr(   r)   r*   r+   TNr   r,   FNNr-   r,   r.   Fr/   r-   FF)__name__
__module____qualname__
model_typekeys_to_ignore_at_inferencerO   __classcell__r7   r7   rS   rU   r    C   s<    r    c                   @   sL   e Zd Zedejdejdejdejdejf
ddZedejfd	d
ZdS )mpscores
multiplierselected_expertsmasked_gatesmask_for_onec                 C   s   |  ||| || S N)save_for_backward)ctxr]   r^   r_   r`   ra   r7   r7   rU   forward   s   	z
mp.forwardgrad_at_outputc                 C   s@   | j \}}}|| }||d }|jd||d |d d d d fS )N)dimindexsrc)saved_tensorsmulscatter_add_)rd   rf   r^   r_   r`   grad_at_scores_expandedr7   r7   rU   backward   s   zmp.backwardN)rV   rW   rX   staticmethodtorchTensorre   ro   r7   r7   r7   rU   r\      s"    r\   {Gz?c                 C   s`  t  # | jddd\}}|  j|d}||  | d| k}W d    n1 s*w   Y  | |td}|}t j|dd}|jd|d}|}t 	| d|td}	t  # |	jddd\}}|  j|d}||  | d| k}W d    n1 s}w   Y  |	|td}
|}t j|
dd}
|
jd|d}t j
||fdd}t j
||fdd}||fS )	Nrg   T)rh   keepdim)minr,   z-infrh   )rh   ri   )rq   no_gradmaxabsclampmasked_fillfloatsoftmaxgatherscatterconcat)r]   
jitter_epsmask_logits_thresholdmax_indfactorr`   r_   multiplier_or^   masked_scoresmasked_gates_top2selected_experts_top2multiplier_top2r7   r7   rU   sparsemixer   sD   

	

	r   hidden_statesgating_outputtopkrenormalizec                 C   sP   | j d |j d ksJ d|dksJ d|du sJ dt|\}}||fS )Nr   zNumber of tokens mismatchr,   zOnly top-2 routing is supportedFz Renormalization is not supported)shaper   )r   r   r   r   topk_weightstopk_idsr7   r7   rU   phimoe_routing_function   s
   r   c                       sp   e Zd ZdZ				ddededededejdB d	edB d
edB def fddZ	dej
dej
fddZ  ZS )PhiMoEa  A tensor-parallel MoE implementation for PhiMoE that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    N num_expertstop_kr:   r;   params_dtypequant_configtp_sizeprefixc	           	         sV   t    || _t||d|d | dd| _t|||||dd||t| dd| _d S )NFz.gate)biasr   r   r   Tz.experts)r   r   r:   r;   r   reduce_resultsr   r   r   custom_routing_functionr   )rN   rO   r:   r   gater   r   experts)	rP   r   r   r:   r;   r   r   r   r   rS   r7   rU   rO      s.   
	zPhiMoE.__init__r   returnc                 C   s8   |j }|d| j}| |\}}| ||}||S )Nrg   )r   viewr:   r   r   )rP   r   
orig_shaperouter_logits_final_hidden_statesr7   r7   rU   re   $  s
   
zPhiMoE.forward)NNNr   )rV   rW   rX   __doc__intrq   dtyper   strrO   rr   re   r[   r7   r7   rS   rU   r      s0    	&r   c                       sz   e Zd Z					ddedededededB d	ed
edB dedB deddf fddZde	j
de	j
de	j
fddZ  ZS )PhiMoEAttentionNr)   r   r:   	num_headsnum_kv_headsrQ   rB   max_positioncache_configr   r   r   c
              	      sD  t    || _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr/| j|
 dks.J n	|
| j dks8J td| j|
 | _|d u rI|| }|| _	| j| j	 | _
| j| j	 | _| j	d | _t|| j	| j| jd||	 dd| _t| j| j	 |d||	 dd| _t| j	||dd| _t| j| j	| j| j|||	 d	d
| _d S )Nr   r   g      Tz	.qkv_proj)r   r   r   z.o_proj)r   rQ   is_neox_stylez.attn)r   r   r   r   )rN   rO   r:   r   total_num_headsr   total_num_kv_headsrx   r   rB   q_sizekv_sizescalingr   qkv_projr   o_projr   
rotary_embr   attn)rP   r:   r   r   rQ   rB   r   r   r   r   r   rS   r7   rU   rO   /  s`   

	
zPhiMoEAttention.__init__	positionsr   c           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )Nrg   rv   )r   splitr   r   r   r   r   )
rP   r   r   qkvr   qkvattn_outputoutputr7   r7   rU   re   r  s    zPhiMoEAttention.forward)Nr)   NNr   )rV   rW   rX   r   dictr   r   r   rO   rq   rr   re   r[   r7   r7   rS   rU   r   .  sD    	
Cr   c                       sh   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	dB dej	fddZ
  ZS )PhiMoEDecoderLayerNr   configr   r   r   r   c                    s   t    |j| _t| j|j|j|jt|d| j|j |||j| dd	| _	t
|j|j|j|j|| dd| _tj|j|jdd| _tj|j|jdd| _d S )NrB   z
.self_attn)	r:   r   r   r   rB   r   r   rQ   r   z.block_sparse_moe)r   r   r:   r;   r   r   Tepselementwise_affine)rN   rO   r:   r   r=   r9   rA   getattrrQ   	self_attnr   rJ   rI   r;   block_sparse_moer   	LayerNormrE   input_layernormpost_attention_layernorm)rP   r   r   r   r   rS   r7   rU   rO     s:   


zPhiMoEDecoderLayer.__init__r   r   residualc                 C   sL   |}|  |}| j||d}|| }|}| |}| |}|| }||fS )N)r   r   )r   r   r   r   )rP   r   r   r   r7   r7   rU   re     s   


zPhiMoEDecoderLayer.forward)NNr   )rV   rW   rX   r    r   r   r   rO   rq   rr   re   r[   r7   r7   rS   rU   r     s0    &r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdejde	dB dejdB deje	B f
ddZ
deeeeeef  fddZdeeeejf  dee fddZ  ZS )PhiMoEModelr   r   vllm_configr   c                   s   t    |jj|j |jj| _| _| _t| jj	| _
tj fdd| dd\| _| _| _tjj	jdd| _tddgj	| _d S )	Nc                    s   t  | dS )Nr   )r   r   r   r   r   r7   rU   <lambda>  s    z&PhiMoEModel.__init__.<locals>.<lambda>z.layersr   Tr   r   r   )rN   rO   model_config	hf_configr   r   r8   r   r   r:   embed_tokensr   r<   start_layer	end_layerlayersr   r   rE   normr   make_empty_intermediate_tensors)rP   r   r   rS   r   rU   rO     s,   



zPhiMoEModel.__init__	input_idsr   c                 C   s
   |  |S rb   )r   rP   r   r7   r7   rU   embed_input_ids     
zPhiMoEModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc                 C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS | 	|}|S )Nr   r   )r   r   )
r
   is_first_rankr   r   r   r   r   is_last_rankr   r   )rP   r   r   r   r   r   r   layerr7   r7   rU   re     s(   


zPhiMoEModel.forwardc                 C   s   t j| ddd| jjdS )Nw1w2w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   )r   make_expert_params_mappingr   rJ   rP   r7   r7   rU   get_expert_mapping  s   zPhiMoEModel.get_expert_mappingweightsc              	   C   s  g d}t |  }t }|  }|D ]\}}| jd urE| j| }rE|| }	t|	dt}
| dkr6|n|d }|
|	| |	| q|D ].\}}}||vrQqG|
||}|dra||vraqGt|| rgqG|| }	|	j}
|
|	||  nV|D ]*}|\}}}}||vrqx|
||}t|| rqx|| }	|	j}
|
|	||||d  n)|dr||vrqt|| rqt||}|d u rq|| }	t|	dt}
|
|	| |	| q|S )N))r   q_projr   )r   k_projr   )r   v_projr   weight_loaderr   z.bias)shard_id	expert_id)r   named_parameterssetr   r   get_cache_scaler   r   rh   addreplaceendswithr   r   r   )rP   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
scale_nameparamr   
param_nameweight_namer   mappingr   r7   r7   rU   load_weights  st   







zPhiMoEModel.load_weightsrb   )rV   rW   rX   r	   r   rO   rq   rr   r   r   re   listtupler   r   r   r   r  r[   r7   r7   rS   rU   r     s"    
!,	r   c                       s   e Zd ZdZdg diZdddZddd	ed
ef fddZde	j
de	j
fddZ		dde	j
de	j
dedB de	j
dB de	j
eB f
ddZde	j
de	j
fddZdeeee	j
f  dee fddZdeeeeeef  fddZ  ZS ) PhiMoEForCausalLMFr   )r   r   r   input_embeddingsoutput_embeddings)r   lm_headr   r   r   r   c                   sl   t    |jj}|| _|j| _t|t|dd| _t	|j
|jd dt|dd| _t|j
| _| jj| _d S )Nmodel)r   r   Tr  )r   r   r   )rN   rO   r   r   r   r   r   r   r  r   r8   r:   r  r   logits_processorr   )rP   r   r   r   rS   r7   rU   rO   n  s"   

zPhiMoEForCausalLM.__init__r   r   c                 C   s   | j |S rb   )r  r   r   r7   r7   rU   r     s   z!PhiMoEForCausalLM.embed_input_idsNr   r   r   c                 C   s   |  ||||}|S rb   )r  )rP   r   r   r   r   r   r7   r7   rU   re     s   zPhiMoEForCausalLM.forwardr   c                 C   s   |  | j|}|S rb   )r  r  )rP   r   logitsr7   r7   rU   compute_logits  s   z PhiMoEForCausalLM.compute_logitsr   c                 C   s   t | }||S rb   )r   r  )rP   r   loaderr7   r7   rU   r    s   
zPhiMoEForCausalLM.load_weightsc                 C   s
   | j  S rb   )r  r   r   r7   r7   rU   r     r   z$PhiMoEForCausalLM.get_expert_mapping)NN)rV   rW   rX   fall_back_to_pt_during_loadpacked_modules_mappingembedding_modulesr	   r   rO   rq   rr   r   r   re   r  r   r  r   r  r  r   r   r[   r7   r7   rS   rU   r  ]  s2    

$&r  )rs   )?r   collections.abcr   	itertoolsr   rq   r    transformers.configuration_utilsr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   r   $vllm.model_executor.layers.fused_moer   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   
interfacesr   r   utilsr   r   r   r   r   r    autogradFunctionr\   r   rr   r   boolr   Moduler   r   r   r   r  r7   r7   r7   rU   <module>   sN   	J
&4
9QA 