o
    پi|                     @   s0  d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z: ddl;m<Z<m=Z= ddl>m?Z? ddl@mAZAmBZB ddlCmDZD ddlEmFZF ddlGmHZHmIZImJZJmKZK eLeMZNeJ ZOG dd  d ejPZQG d!d" d"ejPZRG d#d$ d$ejPZSG d%d& d&ejPZTG d'd( d(ejPZUG d)d* d*ejPZVeVZWdS )+zCInference-only ExaoneMoE model compatible with HuggingFace weights.    N)Iterable)AnyDictOptionalTupleUnion)nn)PretrainedConfig)"get_moe_expert_parallel_world_sizeget_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)ModelConfigForExpertLocation)ExpertLocationDispatchInfo)
SiluAndMul)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessorLogitsProcessorOutput)get_moe_a2a_backend)get_moe_impl_class)FusedMoE)TopK)RoutingMethodType)QuantizationConfig)RadixAttention)get_rope)PPMissingLayer)ParallelLMHeadVocabParallelEmbedding)get_is_capture_mode)ForwardBatchPPProxyTensors)default_weight_loader)get_global_server_args)	LazyValue
add_prefixis_cudamake_layersc                       st   e Zd Z					ddedededee ded	ed
ee dee ddf fddZ			ddedefddZ	  Z
S )ExaoneMoEMLPNT hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixtp_ranktp_sizereturnc	              
      s   t    |}	|}
|r't|dr'|jr'td||jv rd }	td||jv r'd }
t||gd d|	td|||d| _t||d|
|td|||d| _|d	krUt	d
| dt
 | _d S )Nignore	gate_proj	down_proj   Fgate_up_projbiasr5   r7   r8   r9   )rA   r5   r6   r7   r8   r9   siluzUnsupported activation: z!. Only silu is supported for now.)super__init__hasattrr;   r-   r   r?   r   r=   
ValueErrorr   act_fn)selfr2   r3   r4   r5   r6   r7   r8   r9   gateup_quant_configdown_quant_config	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/exaone_moe.pyrD   K   s@   
	

zExaoneMoEMLP.__init__Fshould_allreduce_fusionuse_reduce_scatterc                 C   s2   |  |\}}| |}| j||p|d\}}|S )N)skip_all_reduce)r?   rG   r=   )rH   xforward_batchrO   rP   gate_up_rM   rM   rN   forwardy   s   

zExaoneMoEMLP.forward)NTr1   NN)NFF)__name__
__module____qualname__intstrr   r!   boolrD   rV   __classcell__rM   rM   rK   rN   r0   J   sB    	
1r0   c                       s   e Zd Z			ddededee deejj	 de
f
 fdd	Zd
d ZdejdejfddZdejdefddZdejdejfddZdejdejfddZ		ddejdee dedejfddZ  ZS )ExaoneMoESparseMoEBlockNr1   layer_idconfigr5   
alt_streamr7   c                    sz  t    t | _t | _|| _|j| _|| _|j	| _
| j|j	kr.td| j d|j	 dt|j|j	dd td|d| _ttj|j	tjd| _t||j	t j |j|j|j| j|td|tjd	| _t|j|jd
|j |j!| j| jd
dd	| _"|j#d ur|j|j# }t$d|j||j%|dtd|dt& ' rt(dddni | _)t& ' rt | _*|j	t j | _	|j| _+d S d S )NzTensor parallel size z' is greater than the number of experts .Fgate)rA   r5   r7   )dtypeexperts)num_expertstop_kr2   r3   r_   r5   r7   routing_method_typeTsigmoid)	rg   renormalizeuse_grouped_topknum_expert_group
topk_groupcorrection_biasrouted_scaling_factor%apply_routed_scaling_factor_on_outputscoring_funcshared_experts)r2   r3   r4   r5   r6   r7   r      )r8   r9   rM   ),rC   rD   r   r9   r
   moe_ep_sizer_   ro   ra   rf   n_routed_expertsrF   r   r2   r-   rc   r   	Parametertorchemptyfloat32e_score_correction_biasr   r+   ep_num_redundant_expertsnum_experts_per_tokmoe_intermediate_sizer    RenormalizeNaivere   r   norm_topk_probn_grouprm   topknum_shared_expertsr0   r4   r   	is_deepepdictrr   ep_sizerg   )rH   r_   r`   r5   ra   r7   r3   rK   rM   rN   rD      s   


	
z ExaoneMoESparseMoEBlock.__init__c                 C   s   dd | j  D S )Nc                 S   s   g | ]\}}|d vr|j qS ))rn   )data).0namerR   rM   rM   rN   
<listcomp>   s
    z;ExaoneMoESparseMoEBlock.get_moe_weights.<locals>.<listcomp>)re   named_parametersrH   rM   rM   rN   get_moe_weights   s   z'ExaoneMoESparseMoEBlock.get_moe_weightshidden_statesr:   c                 C   s   |  |}|S N)rr   )rH   r   shared_outputrM   rM   rN   _forward_shared_experts   s   
z/ExaoneMoESparseMoEBlock._forward_shared_expertsrS   c                 C   s|   d }|j d dkr%| |\}}| |}| j|||jtj| jdd}n| j|j	}| j
||d}|d ur<|| |S )Nr   )r_   )num_token_non_paddedexpert_location_dispatch_info)r   topk_output)shaperc   r   r   r   r   init_newr_   empty_topk_outputdevicere   add_)rH   r   rS   r   router_logitsrU   r   final_hidden_statesrM   rM   rN   _forward_deepep   s(   
	
z'ExaoneMoESparseMoEBlock._forward_deepepc                 C   s&   |  |\}}| ||}| ||S r   )rc   r   re   )rH   r   r   rU   r   rM   rM   rN   _forward_router_experts   s   z/ExaoneMoESparseMoEBlock._forward_router_expertsc                 C   sp   t j }| j| | | }t j| j | |}W d    n1 s)w   Y  || j ||fS r   )	rw   cudacurrent_streamra   wait_streamr   clonestreamr   )rH   r   r   r   router_outputrM   rM   rN   forward_normal_dual_stream  s   
z2ExaoneMoESparseMoEBlock.forward_normal_dual_streamFrP   c                 C   s   |j \}}|d|}t  r| ||S | jd ur-|j d dkr-t r-| |\}}n
| |}| 	|}|d ur?|| }| j
dkrJ|sJt|}|||S )Nr   rs   )r   viewr   r   r   ra   r'   r   r   r   r9   r   )rH   r   rS   rP   
num_tokens
hidden_dimr   r   rM   rM   rN   rV     s$   





zExaoneMoESparseMoEBlock.forward)NNr1   )NF)rW   rX   rY   rZ   r	   r   r!   rw   r   Streamr[   rD   r   Tensorr   r(   r   r   r   r\   rV   r]   rM   rM   rK   rN   r^      sF    
R
r^   c                       s   e Zd Z								dded	ed
ededededeeee	f  de
dedee de
deddf fddZdejdejdedejfddZ  ZS )ExaoneMoEAttentionr   @B NT    Fr1   r`   r2   	num_headsnum_kv_headsr_   
rope_thetarope_scalingrope_is_neox_stylemax_position_embeddingsr5   rA   r7   r:   c                    s  t    || _t }t }|| _| j| dksJ | j| | _|| _| j|kr2| j| dks1J n	|| j dks;J td| j| | _	t
|d| j| j | _| j| j | _| j	| j | _| jd | _|	| _|
}|
}|
rt|
dr|
jrtd||
jv r~d }td||
jv rd }t|| j| j| j||td|||d		| _t| j| j |||td|||d	| _t| j|jd
| _t| j|jd
| _|
d ur|
 dkrd}|j| dk| _d|jv| _t| j| j|	|||d| _ t!| j| j| j| j	|td|| jr|jnd d| _"|| _#d S )Nr   rs   head_dimg      r;   q_projo_projqkv_projr@   epsggufFsliding_attention)
rotary_dimmax_positionbaser   is_neox_styleattn)r   r_   r7   sliding_window_size)$rC   rD   r2   r   r   total_num_headsr   total_num_kv_headsmaxr   getattrr   q_sizekv_sizescalingr   rE   r;   r-   r   r   r   r   r   rms_norm_epsq_normk_normget_namelayer_typessliding_windowapply_rope_all_layersr#   
rotary_embr"   r   r_   )rH   r`   r2   r   r   r_   r   r   r   r   r5   rA   r7   attn_tp_rankattn_tp_sizeqkv_quant_configo_quant_configrK   rM   rN   rD   6  s   



	
zExaoneMoEAttention.__init__	positionsr   rS   c                 C   s   |  |\}}|j| j| j| jgdd\}}}|d| j}| |}|d| j| j }|d| j}| |}|d| j	| j }| j
sI| jrR| |||\}}| ||||}	| |	\}
}|
S )Nr   )dim)r   splitr   r   reshaper   r   r   r   r   r   r   r   r   r   )rH   r   r   rS   qkvrU   qkvattn_outputoutputrM   rM   rN   rV     s    

zExaoneMoEAttention.forward)r   r   NTr   NFr1   )rW   rX   rY   r	   rZ   floatr   r   r[   r   r\   r!   rD   rw   r   r(   rV   r]   rM   rM   rK   rN   r   5  sZ    	
hr   c                       s   e Zd Z				ddededee dedeej	j
 d	df fd
dZdejdejdedeej d	eejejf f
ddZ  ZS )ExaoneMoEDecoderLayerr   Nr1   r`   r_   r5   r7   ra   r:   c                    s*  t    |j| _|| _t|dd}t|dd }|d ur't|dd r'|j|d< t|dd}t|dd}	t|d	d
p>t|dd
}
t | _t | _	t
|| j|j|j|||||	||
td|d| _|j| rqt||||td|d| _nt| j|j|j|td|d| _t|j|jd| _t|j|jd| _d S )Nr   r   r    original_max_position_embeddingsr   Tr   i   attention_biasFrA   	self_attn)r`   r2   r   r   r_   r   r   r   r   r5   rA   r7   mlp)r_   r`   r5   ra   r7   )r2   r3   r4   r5   r7   r   )rC   rD   r2   r`   r   r   r   r   r   r   r   num_attention_headsnum_key_value_headsr-   r   is_moe_layerr^   r   r0   r3   r4   r   r   input_layernormpost_attention_layernorm)rH   r`   r_   r5   r7   ra   r   r   r   r   r   rK   rM   rN   rD     sd   



zExaoneMoEDecoderLayer.__init__r   r   rS   residualc                 C   sZ   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}| |}||fS )N)r   r   rS   )r   r   r   r   )rH   r   r   rS   r   rM   rM   rN   rV     s   
zExaoneMoEDecoderLayer.forward)r   Nr1   N)rW   rX   rY   r	   rZ   r   r!   r[   rw   r   r   rD   r   r(   r   rV   r]   rM   rM   rK   rN   r     s:    
@r   c                       s   e Zd ZdZ			ddedee dedeej	j
 ddf
 fd	d
Z		ddejdejdedejdee deejef fddZ  ZS )ExaoneMoEModelFNr1   r`   r5   r7   ra   r:   c                    s   t    | _j| _j| _t | _| jjr%t	jj
t  d| _nt | _tj fdd| jj| jjtd|d\| _| _| _| jjrTtj
jd| _ntdd| _g | _d S )	N)	enable_tpc                    s   t | | dS )N)r_   r`   r5   r7   ra   )r   )idxr7   ra   r`   r5   rM   rN   <lambda>/  s    z)ExaoneMoEModel.__init__.<locals>.<lambda>layers)pp_rankpp_sizer7   r   T)return_tuple)rC   rD   r`   pad_token_idpadding_idx
vocab_sizer   pp_groupis_first_rankr&   r2   r   embed_tokensr$   r/   num_hidden_layersrank_in_group
world_sizer-   r   start_layer	end_layeris_last_rankr   r   normlayers_to_capturerH   r`   r5   r7   ra   rK   r   rN   rD     s.   


zExaoneMoEModel.__init__	input_idsr   rS   input_embedspp_proxy_tensorsc              	   C   s  | j jr|d u r| |}n|}d }n|d usJ |d }|d }g }t| j| jD ]2}	t |	" |	| jv r?|	||  | j
|	 }
|
||||\}}W d    n1 sWw   Y  q*| j jsht||dS |jd dkr|d u ry| |}n| ||\}}t|dkr|S ||fS )Nr   r   )r   r   r   )r   r   r   ranger   r   r   with_current_layerr   appendr   r   r)   r   r   len)rH   r  r   rS   r  r  r   r   aux_hidden_statesilayerrU   rM   rM   rN   rV   C  s@   


zExaoneMoEModel.forward)Nr1   NNN)rW   rX   rY   fall_back_to_pt_during_loadr	   r   r!   r[   rw   r   r   rD   r   r(   r)   r   rV   r]   rM   rM   rK   rN   r     s>    
1r   c                       s&  e Zd Z		d'dedee deddf fddZed	d
 Z	e
 		d(de
jde
jdede
jdee defddZe
 	d)de
jde
jdedeeef de
jf
ddZedd Zedd Zdd Zdd Z	d*deeee
jf  defd d!Zed"d# Zd)d$eee  fd%d&Z  ZS )+ExaoneMoEForCausalLMNr1   r`   r5   r7   r:   c                    s   t    t  _| _| _trtj	 nd }t
||td||d _ jjr. jj _nt|j|j|td|t jd _t| _d _t fdd _d S )Nmodel)r5   r7   ra   lm_head)r5   r7   use_attn_tp_groupFc                      s    fddt  j jD S )Nc                    s4   i | ]}t  jj| jtr| jj| j qS rM   )
isinstancer  r   r   r^   r   )r   r_   r   rM   rN   
<dictcomp>  s    zCExaoneMoEForCausalLM.__init__.<locals>.<lambda>.<locals>.<dictcomp>)r  r   r   rM   r   rM   rN   r     s   
 z/ExaoneMoEForCausalLM.__init__.<locals>.<lambda>)rC   rD   r   r   r`   r5   _is_cudarw   r   r   r   r-   r  tie_word_embeddingsr   r  r%   r   r2   r+   enable_dp_lm_headr   logits_processorcapture_aux_hidden_statesr,    _routed_experts_weights_of_layerr  rK   r   rN   rD   s  s2   



zExaoneMoEForCausalLM.__init__c                 C      | j jS r   )r  valuer   rM   rM   rN   routed_experts_weights_of_layer     z4ExaoneMoEForCausalLM.routed_experts_weights_of_layerr  r   rS   r  r  c                 C   sF   | j |||||d}d }| jr|\}}| jjr!| ||| j||S |S )N)r  )r  r  r   r   r  r  )rH   r  r   rS   r  r  r   r	  rM   rM   rN   rV     s&   	zExaoneMoEForCausalLM.forwardsplit_intervalc                 C   s   |\}}|dkr|d u r| j ||_n||_t||D ]}| j j| }	|	||j||j\|_|_q|| j jjkrS| j |j|j\}
}|
|_| 	||j| j
|}|S d }|S )Nr   )r  r   r   r  r   r   r`   r   r   r  r  )rH   r  r   rS   r  r  startendr
  r  r   rU   resultrM   rM   rN   forward_split_prefill  s0   	z*ExaoneMoEForCausalLM.forward_split_prefillc                 C   r  r   )r  r   r   rM   rM   rN   r     r  z ExaoneMoEForCausalLM.start_layerc                 C   r  r   )r  r   r   rM   rM   rN   r     r  zExaoneMoEForCausalLM.end_layerc                 C   s   | j jj| jjfS r   )r  r   weightr  r   rM   rM   rN   get_embed_and_head  s   z'ExaoneMoEForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S r   )r  r   r#  r  rw   r   empty_cachesynchronize)rH   embedheadrM   rM   rN   set_embed_and_head  s   

z'ExaoneMoEForCausalLM.set_embed_and_headFweightsis_mtpc              	   C   s  g d}t jddd| jjd}t|  }|D ]\}}|r3d|vr"q|dv r-|dd	}n|dd
}|s:d|v r:qd|v sBd|v rCqd|v sKd|v rLq|drV||vrVq|D ]2\}}	}
|	|vrbqXd|v rgqX||	|}|drw||vrwqX||vr|qX|| }|j	}||||
  nU|D ]$}|\}}	}}
|	|vrq||	|}|| }|j	}||||||
d  n.|dr||vrq||vrq||
 v r|| }t|dt}||| qtd| d qd S )N))r   r   r   )r   k_projr   )r   v_projr   )r?   r<   r   )r?   up_projrs   r<   r=   r.  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerf   mtp)zmtp.fc.weightz mtp.pre_fc_norm_embedding.weightzmtp.pre_fc_norm_hidden.weightzmtp.r1   r  zrotary_emb.inv_freq	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmodel.vision_towerzmlp.expertsz.bias)	expert_idshard_idweight_loaderz
Parameter z not found in params_dict)r   make_expert_params_mappingr`   rf   r   r   replace
startswithendswithr6  keysr   r*   loggerwarning)rH   r*  r+  stacked_params_mappingexpert_params_mappingparams_dictr   loaded_weight
param_nameweight_namer5  paramr6  mappingr4  rM   rM   rN   load_weights  s   	z!ExaoneMoEForCausalLM.load_weightsc                 C   s   t |j|jd dS )N)
num_layersnum_logical_experts
num_groups)r   r   rf   )clsr`   rM   rM   rN   $get_model_config_for_expert_locationY  s
   z9ExaoneMoEForCausalLM.get_model_config_for_expert_location	layer_idsc                 C   sR   t  jsd S d| _|d u r| jj}d|d |d g| j_d S dd |D | j_d S )NTr>      c                 S   s   g | ]}|d  qS )rs   rM   )r   valrM   rM   rN   r   n  s    zEExaoneMoEForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)r   r   r  r`   r   r  r   )rH   rL  rG  rM   rM   rN   set_eagle3_layers_to_capturea  s   z1ExaoneMoEForCausalLM.set_eagle3_layers_to_capture)Nr1   r  r   )F) rW   rX   rY   r	   r   r!   r[   rD   propertyr  rw   no_gradr   r(   r)   r   rV   r   rZ   r"  r   r   r$  r)  r   r\   rF  classmethodrK  listrO  r]   rM   rM   rK   rN   r  r  st    '

(

	
_
 r  )X__doc__loggingcollections.abcr   typingr   r   r   r   r   rw   r   transformersr	   sglang.srt.distributedr
   r   r   r   #sglang.srt.eplb.expert_distributionr   sglang.srt.eplb.expert_locationr   (sglang.srt.eplb.expert_location_dispatchr   sglang.srt.layers.activationr   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   "sglang.srt.layers.logits_processorr   r   sglang.srt.layers.moer   "sglang.srt.layers.moe.ep_moe.layerr   &sglang.srt.layers.moe.fused_moe_tritonr   sglang.srt.layers.moe.topkr   sglang.srt.layers.moe.utilsr    *sglang.srt.layers.quantization.base_configr!   !sglang.srt.layers.radix_attentionr"   "sglang.srt.layers.rotary_embeddingr#   sglang.srt.layers.utilsr$   *sglang.srt.layers.vocab_parallel_embeddingr%   r&   +sglang.srt.model_executor.cuda_graph_runnerr'   ,sglang.srt.model_executor.forward_batch_infor(   r)   $sglang.srt.model_loader.weight_utilsr*   sglang.srt.server_argsr+   sglang.srt.utilsr,   r-   r.   r/   	getLoggerrW   r<  r  Moduler0   r^   r   r   r   r  
EntryClassrM   rM   rM   rN   <module>   sX   
? - \^  