o
    پiJ                     @   s  d dl mZmZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z,m-Z- G dd deZ.d(ddZ/dej0dej0de1de2fddZ3G dd dej4Z5G d d! d!ej4Z6G d"d# d#ej4Z7G d$d% d%ej4Z8G d&d' d'ej4Z9e9Z:dS ))    )IterableOptionalTupleUnionN)nn)PretrainedConfig)$get_tensor_model_parallel_world_size)get_attention_tp_rankget_attention_tp_size)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessorLogitsProcessorOutput)FusedMoE)TopK)PoolerPoolingType)QuantizationConfig)RadixAttention)get_rope)DEFAULT_VOCAB_PADDING_SIZEParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loadermaybe_remap_kv_scale_name)
add_prefixmake_layersc                       sV   e Zd ZdZ											
																d fdd	Z  ZS )PhiMoEConfigphimoe }      8         Nsilu   {Gz?h㈵>T      F    .A           MbP?c                    s   || _ |	| _|| _|| _|| _|| _|| _|| _|| _|d u r!|}|d u r)|| }|| _	|| _
|| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _t jd||||d| d S )N)pad_token_idbos_token_ideos_token_idtie_word_embeddings )
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headssliding_windowattention_biaslm_head_biasnum_key_value_headshead_dim
hidden_actinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_dropoutnum_experts_per_toknum_local_expertsoutput_router_logitsrouter_aux_loss_coefrouter_jitter_noisesuper__init__)selfr5   r7   r8   r9   r:   r>   r?   r@   r6   rA   rB   rC   r0   r1   r2   r3   rD   r;   rE   rF   rG   rH   rI   rJ   r<   r=   kwargs	__class__r4   L/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/phimoe.pyrL   &   sD   
zPhiMoEConfig.__init__)r!   r"   r#   r$   r$   r%   Nr&   r'   r(   r)   TNr*   r+   Fr,   Nr-   r+   r.   Fr/   r-   FF)__name__
__module____qualname__
model_typerL   __classcell__r4   r4   rO   rQ   r   "   s:    r   {Gz?c                 C   s  | j ddd\}}|  j|d}||  | d| k}| |td}|}tj|dd}|jd|d}|}t| d|td}	|	j ddd\}}|  j|d}||  | d| k}|	|td}
|}tj|
dd}
|
jd|d}tj	||fdd}tj	||fdd}||fS )	NT)dimkeepdim)minr+   z-infrY   )rY   index)
maxabsclampmasked_fillfloattorchsoftmaxgatherscatterconcat)scores
jitter_epsmask_logits_thresholdmax_indfactormasked_gatesselected_expertsmultiplier_o
multipliermasked_scoresmasked_gates_top2selected_experts_top2multiplier_top2r4   r4   rQ   sparsemixerj   s<   
	
ru   hidden_statesgating_outputtopkrenormalizec                 C   sP   | j d |j d ksJ d|dksJ d|du sJ dt|\}}||fS )Nr   zNumber of tokens mismatchr+   zOnly top-2 routing is supportedFz Renormalization is not supported)shaperu   )rv   rw   rx   ry   topk_weightstopk_idsr4   r4   rQ   phimoe_routing_function   s
   r}   c                       sj   e Zd ZdZ		ddededededed	ee d
ef fddZ	dde	j
dee de	j
fddZ  ZS )PhiMoEa  A tensor-parallel MoE implementation for PhiMoE that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    N num_expertstop_kr7   r8   layer_idquant_configprefixc                    s^   t    || _t | _t||dd d| _t|dtd| _	t
|||||d|td|d| _d S )NF)biasr   )r   ry   custom_routing_functionTexperts)r   r   r   r7   r8   reduce_resultsr   r   )rK   rL   r7   r   tp_sizer   gater   r}   rx   r   r   r   )rM   r   r   r7   r8   r   r   r   rO   r4   rQ   rL      s0   

zPhiMoE.__init__rv   forward_batchreturnc                 C   sD   |j }|d| j}| |\}}| ||}| ||}||S )NrX   )rz   viewr7   r   rx   r   )rM   rv   r   
orig_shaperouter_logits_topk_outputfinal_hidden_statesr4   r4   rQ   forward   s   
zPhiMoE.forwardNr   N)rR   rS   rT   __doc__intr   r   strrL   rc   Tensorr   r   rV   r4   r4   rO   rQ   r~      s6    (r~   c                       s   e Zd Z								ddeded	ed
ee dededededee dee de	ddf fddZ
dejdejdedejfddZ  ZS )PhiMoEAttentionNr'   '  r   Fr   r7   	num_headsnum_kv_headsr?   max_positionrD   r   r<   r   rope_scalingr   r   c                    sj  t    || _t }t }|| _| j| dksJ | j| | _|| _| j|kr2| j| dks1J n	|| j dks;J td| j| | _	|d u rL|| }|| _
| j| j
 | _| j	| j
 | _| j
d | _|| _|
| _t|| j
| j| j||	||td|d	| _t| j| j
 |||	||td|d| _t| j
| j
|t| j| jd| _t| j| j
| j| j	||	td|d	| _d S )
Nr   r*   g      qkv_proj)r   r   tp_rankr   r   o_proj)
rotary_dimr   baser   attn)r   r   r   r   )rK   rL   r7   r	   r
   total_num_headsr   total_num_kv_headsr^   r   r?   q_sizekv_sizescalingrD   r   r   r   r   r   r   r   r   
rotary_embr   r   )rM   r7   r   r   r?   r   rD   r   r<   r   r   r   attn_tp_rankattn_tp_sizerO   r4   rQ   rL      sp   


	zPhiMoEAttention.__init__	positionsrv   r   c                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )NrX   r\   )r   splitr   r   r   r   r   )rM   r   rv   r   qkvr   qkvattn_outputoutputr4   r4   rQ   r   :  s    zPhiMoEAttention.forward)Nr'   r   r   FNNr   )rR   rS   rT   r   r   rb   boolr   dictr   rL   rc   r   r   r   rV   r4   r4   rO   rQ   r      sV    	
Pr   c                       sp   e Zd Z		ddededee deddf
 fdd	Zd
e	j
de	j
dee	j
 dedee	j
e	j
f f
ddZ  ZS )PhiMoEDecoderLayerNr   configr   r   r   r   c                    s   t    |j| _t|dd}t| j|j|j|jt|d| j|j |||j||j	t
d|d| _t|j|j|j|j||t
d|d| _tj|j|jdd	| _tj|j|jdd	| _d S )
NrD   r   r?   	self_attn)r7   r   r   r   r?   rD   r   r<   r   r   r   block_sparse_moe)r   r   r7   r8   r   r   r   Tepselementwise_affine)rK   rL   r7   getattrr   r:   r6   r>   r<   r   r   r   r~   rG   rF   r8   r   r   	LayerNormrB   input_layernormpost_attention_layernorm)rM   r   r   r   r   rD   rO   r4   rQ   rL   J  sB   
	

zPhiMoEDecoderLayer.__init__r   rv   residualr   c                 C   sR   |}|  |}| j|||d}|| }|}| |}| j||d}|| }||fS )N)r   rv   r   r   )r   r   r   r   )rM   r   rv   r   r   r4   r4   rQ   r   s  s   

zPhiMoEDecoderLayer.forwardr   )rR   rS   rT   r   r   r   r   r   rL   rc   r   r   r   r   rV   r4   r4   rO   rQ   r   H  s2    )r   c                       sf   e Zd Z		ddedee def fddZ	ddej	d	ej	d
e
deej	 deej	 f
ddZ  ZS )PhiMoEModelNr   r   r   r   c                    sv   t     | _| _ j| _t j jtd|d| _t	 j
 fddtd|d| _tj j jdd| _d S )	Nembed_tokens)r   r   c                    s   t  t|dd |dS )N.rX   r   )r   r   r   )idxr   r   r   r4   rQ   <lambda>  s    z&PhiMoEModel.__init__.<locals>.<lambda>layersr   Tr   )rK   rL   r   r   r5   r   r7   r   r   r   r9   r   r   r   rB   normrM   r   r   r   rO   r   rQ   rL     s$   

zPhiMoEModel.__init__	input_idsr   r   input_embedsr   c                 C   sJ   |d u r
|  |}n|}d }| jD ]}|||||d\}}q| |}|S )Nr   )r   r   r   )rM   r   r   r   r   rv   r   layerr4   r4   rQ   r     s   

zPhiMoEModel.forwardr   r   )rR   rS   rT   r   r   r   r   rL   rc   r   r   r   r   rV   r4   r4   rO   rQ   r     s,    "r   c                       s   e Zd Z		ddedee def fddZe	 		dd	ej
d
ej
dedeej
 dedefddZdeeeej
f  fddZ  ZS )PhiMoEForCausalLMNr   r   r   r   c              
      s   t    || _|| _t||td|d| _t|j|j	|jt
|dtd|d| _| jjr3| jjj| j_t|| _ttjdd| _d S )Nmodel)r   r   r   Tlm_head)org_num_embeddingspadding_sizer   r   r   )pooling_type	normalize)rK   rL   r   r   r   r   r   r   r5   r7   r   r   r3   r   weightr   logits_processorr   r   LASTpoolerr   rO   r4   rQ   rL     s&   
	
zPhiMoEForCausalLM.__init__Fr   r   r   inputs_embedsget_embeddingr   c                 C   s2   |  ||||}|s| ||| j|S | ||S r   )r   r   r   r   )rM   r   r   r   r   r   rv   r4   r4   rQ   r     s   	
zPhiMoEForCausalLM.forwardweightsc              	   C   s&  g d}t jddd| jjd}t|  }|D ]y\}}|D ](\}}}	||vr'q|||}|dr7||vr7q|| }
|
j}||
||	  nJ|D ]$}|\}}}}	||vrUqH|||}|| }
|
j}||
|||	|d  n#|drw||vrwqt	||}|d u rq|| }
t
|
dt}||
| qd S )	N))r   q_projr   )r   k_projr   )r   v_projr   w1w2w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   z.bias)shard_id	expert_idweight_loader)r   make_expert_params_mappingr   rG   r   named_parametersreplaceendswithr   r   r   r   )rM   r   stacked_params_mappingexpert_params_mappingparams_dictnameloaded_weight
param_nameweight_namer   paramr   mappingr   r4   r4   rQ   load_weights  s\   

zPhiMoEForCausalLM.load_weightsr   )NF)rR   rS   rT   r   r   r   r   rL   rc   no_gradr   r   r   r   r   r   r   r   rV   r4   r4   rO   rQ   r     s6    $r   )rW   );typingr   r   r   r   rc   r    transformers.configuration_utilsr   sglang.srt.distributedr   sglang.srt.layers.dp_attentionr	   r
   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   r   &sglang.srt.layers.moe.fused_moe_tritonr   sglang.srt.layers.moe.topkr   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   r   sglang.srt.utilsr   r   r   ru   r   r   r   r}   Moduler~   r   r   r   r   
EntryClassr4   r4   r4   rQ   <module>   sF    
H4
<`G5k