o
    پiB                     @   s  d Z ddlmZmZmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. G dd dej/Z0G dd dej/Z1G dd dej/Z2G dd dej/Z3G dd dej/Z4G dd  d ej/Z5e5Z6dS )!z Inference-only XVERSE MoE model.    )AnyDictIterableOptionalTupleN)nn)PretrainedConfig)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)
SiluAndMul)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)	fused_moe)MoeRunnerConfig)TopK)QuantizationConfig)RadixAttention)get_rope)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loader)
add_prefixc                       sN   e Zd Z			ddedededee ded	ed
df fddZdd Z	  Z
S )	XverseMLPNT hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixreturnc              	      sj   t    t||gd d|td|d| _t||d||td|d| _|dkr/td| d	t | _	d S )
N   Fgate_up_projbiasr#   r%   	down_proj)r*   r#   r$   r%   siluzUnsupported activation: z!. Only silu is supported for now.)
super__init__r   r   r(   r   r+   
ValueErrorr   act_fn)selfr    r!   r"   r#   r$   r%   	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/xverse_moe.pyr.   5   s*   
	
zXverseMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r(   r0   r+   )r1   xgate_up_r4   r4   r5   forwardU   s   
zXverseMLP.forward)NTr   )__name__
__module____qualname__intstrr   r   boolr.   r:   __classcell__r4   r4   r2   r5   r   3   s(     r   c                       sR   e Zd Z		ddedee def fddZdd	 Zd
e	j
de	j
fddZ  ZS )	XverseMoENr   configr#   r%   c              	      s  t     | _t | _t | _ j| _ j	| _
| j| jkr+td| j d| j dt fddt| jD | _|   tdd| _t j| jdd td	d
| _t| j
t| jddd| _ jd ur j j }t j| jdtdd| _d S d S )NzTensor parallel size z' is greater than the number of experts .c                    s2   g | ]}t  j j jd td| dqS )Fzexperts.r    r!   r"   r#   r$   r%   )r   r    r!   r"   r   ).0irC   r%   r#   r4   r5   
<listcomp>q   s    	z&XverseMoE.__init__.<locals>.<listcomp>T)inplaceFrouterr)   norm_topk_prob)top_krenormalizeshared_expertsrE   )r-   r.   rC   r	   rankr
   tp_sizenum_expertsn_routed_experts	moe_top_krM   r/   r   
ModuleListrangeexpertspack_paramsr   moe_runner_configr   r    r   rK   r   getattrtopknum_shared_expertsr!   r   r"   rO   )r1   rC   r#   r%   r!   r2   rH   r5   r.   ^   sR   

	
zXverseMoE.__init__c                 C   s   g }g }| j D ]}||jj ||jj qtj|| _tj	| j|}t
||D ]\}}||_q,| jjt|g|d jR  | _tj|| _tj	| j|}t
||D ]\}}||_qX| jjt|g|d jR  | _d S )Nr   )rW   appendr(   weightr+   torch_utils_flatten_dense_tensorsw1_unflatten_dense_tensorszipdataviewlenshapew2)r1   rb   ri   expertw1sre   paramw2sr4   r4   r5   rX      s   
 $zXverseMoE.pack_paramshidden_statesr&   c           	      C   s   |j \}}|d|}| jjd ur| |}| |\}}| ||}t|| j| j	|| j
}| jjd ur8|| }t|}|||S )N)rh   rf   rC   r\   rO   rK   r[   r   rb   ri   rY   r   )	r1   rn   
num_tokens
hidden_dimshared_outputrouter_logitsr9   topk_outputfinal_hidden_statesr4   r4   r5   r:      s"   

zXverseMoE.forwardNr   )r;   r<   r=   r   r   r   r?   r.   rX   r_   Tensorr:   rA   r4   r4   r2   r5   rB   \   s    9rB   c                       s   e Zd Z						ddededed	ed
edeeeef  dedee	 deddf fddZ
dejdejdedejfddZ  ZS )XverseAttentionr   '  N    r   r    	num_headsnum_kv_headslayer_id
rope_thetarope_scalingmax_position_embeddingsr#   r%   r&   c
              
      sJ  t    || _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr/| j|
 dks.J n	|
| j dks8J td| j|
 | _|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _|| _t|| j	| j| jd|td|	d| _t| j| j	 |d|td|	d| _t| j	| j	|||d| _t| j| j	| j| j||td	|	d
| _d S )Nr      g      Fqkv_projr)   o_proj)
rotary_dimmax_positionbaser   attn)r|   r}   r#   r%   )r-   r.   r    r
   total_num_headsr{   total_num_kv_headsmaxr|   head_dimq_sizekv_sizescalingr~   r   r   r   r   r   r   r   
rotary_embr   r   )r1   r    r{   r|   r}   r~   r   r   r#   r%   rQ   r2   r4   r5   r.      sb   



zXverseAttention.__init__	positionsrn   forward_batchc                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )Nro   )dim)r   splitr   r   r   r   r   )r1   r   rn   r   qkvr9   qkvattn_outputoutputr4   r4   r5   r:   	  s    zXverseAttention.forward)r   ry   Nrz   Nr   )r;   r<   r=   r>   floatr   r   r?   r   r   r.   r_   rw   r   r:   rA   r4   r4   r2   r5   rx      sJ    	
Frx   c                       sf   e Zd Z		ddededee deddf
 fdd	Zd
e	j
de	j
dedee	j
 de	j
f
ddZ  ZS )XverseDecoderLayerNr   rC   r}   r#   r%   r&   c           	         s   t    |j| _t|dd}t|dd }t|dd}t|d|j}t| j|j||||||td|d	| _|jd urFt	||td	|d
| _
nt|j|j|j|td	|d| _
t|j|jd| _t|j|jd| _d S )Nr~   ry   r   r   rz   num_key_value_heads	self_attn)	r    r{   r|   r}   r~   r   r   r#   r%   mlp)rC   r#   r%   )r    r!   r"   r#   r%   eps)r-   r.   r    rZ   num_attention_headsrx   r   r   rR   rB   r   r   r!   r"   r   rms_norm_epsinput_layernormpost_attention_layernorm)	r1   rC   r}   r#   r%   r~   r   r   r   r2   r4   r5   r.     sH   


zXverseDecoderLayer.__init__r   rn   r   residualc                 C   sZ   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}| |}||fS )N)r   rn   r   )r   r   r   r   )r1   r   rn   r   r   r4   r4   r5   r:   F  s   
zXverseDecoderLayer.forwardrv   )r;   r<   r=   r   r>   r   r   r?   r.   r_   rw   r   r:   rA   r4   r4   r2   r5   r     s2    -r   c                	       s\   e Zd ZdZ		ddedee deddf fdd	Zd
e	j
de	j
dede	j
fddZ  ZS )XverseModelFNr   rC   r#   r%   r&   c                    sn   t     j| _ j| _t j jtdd| _t	
 fddt jD | _t j jd| _d S )Nembed_tokensr%   c              	      s(   g | ]}t  |td | dqS )zlayers.r#   r%   )r   r   )rF   r}   rH   r4   r5   rI   s  s    z(XverseModel.__init__.<locals>.<listcomp>r   )r-   r.   pad_token_idpadding_idx
vocab_sizer   r    r   r   r   rU   rV   num_hidden_layerslayersr   r   normr1   rC   r#   r%   r2   rH   r5   r.   c  s   
zXverseModel.__init__	input_idsr   r   c           	      C   sR   |  |}d }tt| jD ]}| j| }|||||\}}q| ||\}}|S r6   )r   rV   rg   r   r   )	r1   r   r   r   rn   r   rG   layerr9   r4   r4   r5   r:     s   


zXverseModel.forwardrv   )r;   r<   r=   fall_back_to_pt_during_loadr   r   r   r?   r.   r_   rw   r   r:   rA   r4   r4   r2   r5   r   _  s,    r   c                	       s|   e Zd Z		ddedee deddf fddZe	 d	ej
d
ej
dedej
fddZdeeeej
f  fddZ  ZS )XverseMoeForCausalLMNr   rC   r#   r%   r&   c                    sV   t    || _|| _t||td|d| _t|j|j	|td|d| _
t|| _d S )Nmodelr   lm_headr   )r-   r.   rC   r#   r   r   r   r   r   r    r   r   logits_processorr   r2   r4   r5   r.     s   
zXverseMoeForCausalLM.__init__r   r   r   c                 C   s    |  |||}| ||| j|S r6   )r   r   r   )r1   r   r   r   rn   r4   r4   r5   r:     s   
zXverseMoeForCausalLM.forwardweightsc                 C   s   g d}t |  }|D ]g\}}d|v rq|D ]5\}}}||vr!q|||}|dr1||vr1qd|v s9d|v r>||vr>q|| }	|	j}
|
|	||  n&|drW||vrWqd|v s_d|v rd||vrdq|| }	t|	dt}
|
|	| qd S )N))r   q_projr   )r   k_projr   )r   v_projr   )r(   	gate_projr   )r(   up_projr   zrotary_emb.inv_freqz.biaszmlp.experts.zmlp.shared_experts.weight_loader)dictnamed_parametersreplaceendswithr   rZ   r   )r1   r   stacked_params_mappingparams_dictnameloaded_weight
param_nameweight_nameshard_idrl   r   r4   r4   r5   load_weights  s8   
z!XverseMoeForCausalLM.load_weightsrv   )r;   r<   r=   r   r   r   r?   r.   r_   no_gradrw   r   r:   r   r   r   rA   r4   r4   r2   r5   r     s.    $r   )7__doc__typingr   r   r   r   r   r_   r   transformersr   sglang.srt.distributedr	   r
   r   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   "sglang.srt.layers.logits_processorr   0sglang.srt.layers.moe.fused_moe_triton.fused_moer    sglang.srt.layers.moe.moe_runnerr   sglang.srt.layers.moe.topkr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   Moduler   rB   rx   r   r   r   
EntryClassr4   r4   r4   r5   <module>   s6   )eVH1N