o
    پiaY                     @  s  d Z ddlmZ ddlZddlmZmZmZ ddlZddl	m
  mZ ddlm
Z
 ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 d(ddZ4G dd de
j5Z6G dd de
j5Z7G d d! d!e
j5Z8G d"d# d#e
j5Z9G d$d% d%e
j5Z:G d&d' d'e
j5Z;e;Z<dS ))aQ  Inference-only AfMoE model compatible with HuggingFace weights.

AfMoE is a Mixture-of-Experts model with:
- Gated attention with sigmoid gating
- Q/K normalization with RMSNorm
- Dual normalization (pre/post for both attention and MLP)
- Sliding window attention for local layers
- muP (maximal update parameterization) scaling support
    )annotationsN)IterableOptionalTuple)nn)PretrainedConfig)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)
SiluAndMul)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)	fused_moe)MoeRunnerConfig)TopK)QuantizationConfig)RadixAttention)get_rope)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loader)
add_prefixconfigr   returnOptional[int]c                 C  s,   t | dd }|d u rd S |dkrd S |d S )Nsliding_windowr      )getattr)r   r!    r$   K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/afmoe.py!get_attention_sliding_window_size@   s   r&   c                      s0   e Zd Z			dd fddZdddZ  ZS )AfmoeMLPNT hidden_sizeintintermediate_size
hidden_actstrquant_configOptional[QuantizationConfig]reduce_resultsboolprefixr   Nonec              	     sj   t    t||gd d|td|d| _t||d||td|d| _|dkr/td| d	t | _	d S )
N   Fgate_up_projbiasr.   r2   	down_proj)r7   r0   r.   r2   siluzUnsupported activation: z!. Only silu is supported for now.)
super__init__r   r   r5   r   r8   
ValueErrorr   act_fn)selfr)   r+   r,   r.   r0   r2   	__class__r$   r%   r;   L   s*   
	
zAfmoeMLP.__init__xtorch.Tensorc                 C  s*   |  |\}}| |}| |\}}|S N)r5   r=   r8   )r>   rA   gate_up_r$   r$   r%   forwardk   s   
zAfmoeMLP.forward)NTr(   )r)   r*   r+   r*   r,   r-   r.   r/   r0   r1   r2   r-   r   r3   )rA   rB   r   rB   __name__
__module____qualname__r;   rF   __classcell__r$   r$   r?   r%   r'   J   s    r'   c                      sF   e Zd ZedddZ		dd  fddZd!ddZd"ddZ  ZS )#AfmoeMoEhidden_statesrB   gating_outputtopkr*   renormalizer1   
score_funcr-   expert_biasOptional[torch.Tensor]r   !Tuple[torch.Tensor, torch.Tensor]c                C  s   | tj}|dkr=t|}|d ur2|j |j|jd}|| }	tj|	|ddd }
|jd|
d}n+tj||dd\}}
n |d urL||j |j|jd }tj	|dd}tj||dd\}}
|rn|j
ddd	jd
d}|| }| tj|
 tjfS )Nsigmoiddtype)kdimr"   )rZ   indexrZ   T)rZ   keepdimg#B;)min)totorchfloat32rU   devicerW   rO   gatherFsoftmaxsumclampint32)rM   rN   rO   rP   rQ   rR   logitsscoresr7   scores_for_choicetopk_idstopk_weightsprobsdenomr$   r$   r%   _custom_routing_functiont   s"   

z!AfmoeMoE._custom_routing_functionNr(   r   r   r.   r/   r2   c              
     s  t     | _t | _t | _t dd | _| jd u r t	d j
| _| j| jkr7t	d| j d| j dt dd| _t dd	| _tt d
d| _t dd| _t dd| _| jd uoe| jdk| _t dd| _t j| jdd tdd| _tjtj| jtjddd| _t fddt| jD | _ | !  | jr j"| j }t# j| j$dtdd| _%nd | _%d }d }| jr| j}n| jdkrt&j't(j)| j| jd}| jdkr| jnd}t*| j|| j| jr| jnd | jr| jnd ||| jd| _+d S )Nnum_expertsz&AfmoeConfig must define `num_experts`.zTensor parallel size z' is greater than the number of experts .rQ   re   
route_normTroute_scale      ?n_groupr"   
topk_groupnum_shared_expertsr   Fgater6   rV   )requires_gradc                   s2   g | ]}t  j j jd td| dqS )Fzexperts.r)   r+   r,   r.   r0   r2   )r'   r)   moe_intermediate_sizer,   r   ).0idxr   r2   r.   r$   r%   
<listcomp>   s    	z%AfmoeMoE.__init__.<locals>.<listcomp>shared_expertsr{   rU   )rQ   rR   )top_krP   use_grouped_topknum_expert_grouprw   custom_routing_functioncorrection_biasrouted_scaling_factor),r:   r;   r   r   rankr	   tp_sizer#   n_routed_expertsr<   num_experts_per_tokr   rQ   rs   floatrt   rv   rw   r   rx   r   r)   r   ry   r   	Parameterr`   zerosra   rR   
ModuleListrangeexpertspack_paramsr|   r'   r,   r   	functoolspartialrL   rp   r   rO   )r>   r   r.   r2   r+   custom_routing_fnr   rP   r?   r   r%   r;      s   


	
	
zAfmoeMoE.__init__r3   c                 C  s   g }g }| j D ]}||jj ||jj qtj|| _tj	| j|}t
||D ]\}}||_q,| jjt|g|d jR  | _tj|| _tj	| j|}t
||D ]\}}||_qX| jjt|g|d jR  | _d S )Nr   )r   appendr5   weightr8   r`   _utils_flatten_dense_tensorsw1_unflatten_dense_tensorszipdataviewlenshapew2)r>   r   r   expertw1sr   paramw2sr$   r$   r%   r      s   
 $zAfmoeMoE.pack_paramsc           	   	   C  s   |j \}}|d|}d }| jd ur| |}| |\}}| ||}tj|| j| j|td| j	dd}|d ur=|| }t
|}|||S )NrX   T)inplacer   )r   r   topk_outputmoe_runner_config)r   r   r   ry   rO   r   r   r   r   rt   r
   )	r>   rM   
num_tokens
hidden_dimshared_outputrouter_logitsrE   r   final_hidden_statesr$   r$   r%   rF     s*   


zAfmoeMoE.forward)rM   rB   rN   rB   rO   r*   rP   r1   rQ   r-   rR   rS   r   rT   Nr(   )r   r   r.   r/   r2   r-   )r   r3   )rM   rB   r   rB   )	rH   rI   rJ   staticmethodrp   r;   r   rF   rK   r$   r$   r?   r%   rL   r   s    "
]rL   c                      s:   e Zd Z			dd  fddZd!ddZd"ddZ  ZS )#AfmoeAttentionr   Nr(   r   r   r)   r*   	num_headsnum_kv_headslayer_idr.   r/   r2   r-   r   r3   c                   s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _t	|d|| j | _
| j| j
 | _| j| j
 | _| j
d | _t	|dd}	t	|dd }
t	|dd	}t| j
| | _t	|d
d}t	|dd }|d uo|| dk| _| jrt|nd}t|| j
| j| jd|td|d| _t| j| j
 |d|td|d| _t|| j| j
 d|td|d| _t| j
| j||	|
dd| _t| j| j
| j| j|||td|d| _t	|dd}t| j
|d| _t| j
|d| _|| _ d S )Nr   r"   head_dimg      
rope_thetai'  rope_scalingpartial_rotary_factorru   max_position_embeddingsi    layer_typessliding_attentionrX   Fqkv_projr6   o_proj	gate_projT)
rotary_dimmax_positionbaser   is_neox_styleattn)r   r   sliding_window_sizer.   r2   rms_norm_epsh㈵>eps)!r:   r;   r)   r	   total_num_headsr   total_num_kv_headsmaxr   r#   r   q_sizekv_sizescalingr*   r   is_local_attentionr&   r   r   r   r   r   r   r   r   
rotary_embr   r   r   q_normk_normr!   )r>   r   r)   r   r   r   r.   r2   r   r   r   r   r   r   r!   r   r?   r$   r%   r;   !  s   


	


zAfmoeAttention.__init__qrB   rY   rT   c                 C  sH   |  |d| j}| |d| j}||j}||j}||fS )NrX   )r   reshaper   r   r   r   )r>   r   rY   q_headsk_headsr$   r$   r%   _apply_qk_norm{  s
   zAfmoeAttention._apply_qk_norm	positionsrM   forward_batchr   c                 C  s   |  |\}}|j| j| j| jgdd\}}}| ||\}}| jr+| |||\}}| ||||}	| |\}
}|	t	
|
 }	| |	\}}|S )NrX   r\   )r   splitr   r   r   r   r   r   r   r`   rU   r   )r>   r   rM   r   qkvrE   r   rY   vattn_output	gate_valsoutputr$   r$   r%   rF     s    zAfmoeAttention.forward)r   Nr(   )r   r   r)   r*   r   r*   r   r*   r   r*   r.   r/   r2   r-   r   r3   )r   rB   rY   rB   r   rT   r   rB   rM   rB   r   r   r   rB   )rH   rI   rJ   r;   r   rF   rK   r$   r$   r?   r%   r     s    
Z	r   c                      s.   e Zd Z		dd fddZdddZ  ZS )AfmoeDecoderLayerNr(   r   r   r   r*   r.   r/   r2   r-   r   r3   c           	   
     s0  t    || _|j| _|| _t||j|j|j||td|d| _	d}t
|dr.||jk}n$t|dd d urRt
|drRt
|drR|j}|j}||koQ|| | dk}|r`t||td	|d
| _nt|j|j|j|td	|d| _t|dd}t|j|d| _t|j|d| _t|j|d| _t|j|d| _d S )N	self_attn)r   r)   r   r   r   r.   r2   Fnum_dense_layersrq   first_k_dense_replacemoe_layer_freqr   mlp)r   r.   r2   )r)   r+   r,   r.   r2   r   r   r   )r:   r;   r   r)   r   r   num_attention_headsnum_key_value_headsr   r   hasattrr   r#   r   r   rL   r   r'   r+   r,   r   input_layernormpost_attention_layernormpre_mlp_layernormpost_mlp_layernorm)	r>   r   r   r.   r2   use_moer   freqr   r?   r$   r%   r;     sT   



zAfmoeDecoderLayer.__init__r   rB   rM   r   r   c                 C  s\   |}|  |}| |||}| |}|| }|}| |}| |}| |}|| }|S rC   )r   r   r   r   r   r   )r>   r   rM   r   attn_residualmlp_residualr$   r$   r%   rF     s   




zAfmoeDecoderLayer.forwardr   )
r   r   r   r*   r.   r/   r2   r-   r   r3   r   rG   r$   r$   r?   r%   r     s
    7r   c                      s@   e Zd ZdZ		dd fddZ	ddddZdddZ  ZS )
AfmoeModelFNr(   r   r   r.   r/   r2   r-   r   r3   c                   sj   t     | _ j| _ j| _t j j| _t	
 fddt jD | _t j jd| _d S )Nc              	     s(   g | ]}t  |td | dqS )zlayers.r.   r2   )r   r   )r}   r   r   r$   r%   r     s    z'AfmoeModel.__init__.<locals>.<listcomp>r   )r:   r;   r   pad_token_idpadding_idx
vocab_sizer   r)   embed_tokensr   r   r   num_hidden_layerslayersr   r   normr>   r   r.   r2   r?   r   r%   r;     s   
zAfmoeModel.__init__	input_idsrB   r   r   r   input_embedsrS   c                 C  s\   |d u r
|  |}n|}t| jddr|| jjd  }| jD ]}||||}q| |}|S )Nmup_enabledFg      ?)r   r#   r   r)   r   r   )r>   r   r   r   r   rM   layerr$   r$   r%   rF     s   

zAfmoeModel.forwardnn.Embeddingc                 C  s   | j S rC   )r   r>   r$   r$   r%   get_input_embeddings  s   zAfmoeModel.get_input_embeddingsr   r   r   r.   r/   r2   r-   r   r3   rC   
r   rB   r   rB   r   r   r   rS   r   rB   r   r   )rH   rI   rJ   fall_back_to_pt_during_loadr;   rF   r  rK   r$   r$   r?   r%   r     s    !r   c                      sP   e Zd Z		d d! fddZd"ddZ	d#d$ddZd%ddZd&ddZ  ZS )'AfmoeForCausalLMNr(   r   r   r.   r/   r2   r-   r   r3   c                   sV   t    || _|| _t||td|d| _t|j|j	|td|d| _
t|| _d S )Nmodel)r2   lm_headr   )r:   r;   r   r.   r   r   r  r   r   r)   r  r   logits_processorr   r?   r$   r%   r;      s   
zAfmoeForCausalLM.__init__r   c                 C  s   | j jS rC   )r  r   r   r$   r$   r%   r  4  s   z%AfmoeForCausalLM.get_input_embeddingsr   rB   r   r   r   r   rS   c                 C  s"   |  ||||}| ||| j|S rC   )r  r	  r  )r>   r   r   r   r   rM   r$   r$   r%   rF   7  s   
zAfmoeForCausalLM.forwardr    c                 C  s
   t | jS rC   )r&   r   r   r$   r$   r%   r&   C  s   
z2AfmoeForCausalLM.get_attention_sliding_window_sizeweights"Iterable[Tuple[str, torch.Tensor]]c                 C  s   g d}t |  }|D ]a\}}d|v rqd|v r|dd}d}|D ]3\}}}	||vr-q#d|v r6|dv r6q#|||}
|
|vrDd} n||
 }t|d	t}||||	 d} |rZq||v rm|| }t|d	t}||| qd S )
N))r   q_projr   )r   k_projrY   )r   v_projr   )r5   r   r   )r5   up_projr"   zrotary_emb.inv_freqz.mlp.router.gate.z
.mlp.gate.Fz.self_attn.>   r  r   Tweight_loader)dictnamed_parametersreplacer#   r   )r>   r
  stacked_params_mappingparams_dictnameloaded_weighthandled
param_nameweight_nameshard_idnew_namer   r  r$   r$   r%   load_weightsF  s<   	
zAfmoeForCausalLM.load_weightsr   r  r  rC   r  )r   r    )r
  r  r   r3   )	rH   rI   rJ   r;   r  rF   r&   r  rK   r$   r$   r?   r%   r    s    

r  )r   r   r   r    )=__doc__
__future__r   r   typingr   r   r   r`   torch.nn.functionalr   
functionalrd   transformersr   sglang.srt.distributedr   r	   r
   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   r   "sglang.srt.layers.logits_processorr   &sglang.srt.layers.moe.fused_moe_tritonr    sglang.srt.layers.moe.moe_runnerr   sglang.srt.layers.moe.topkr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   r&   Moduler'   rL   r   r   r   r  
EntryClassr$   r$   r$   r%   <module>   s@   


( .yN8[