o
    پi                     @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddlZddlmZ ddl
mZ ddlmZ ddlmZ ddlmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z;m<Z< ddl=m>Z>m?Z? ddl@mAZAmBZB ddlCmDZD ddlEmFZFmGZGmHZHmIZImJZJ eKeLZMejNdejOdejOfddZPejNdejOdejOfd d!ZQd"e
jRd#e
jRd$e
jRfd%d&ZS	'	(d@d"e
jRd#e
jRd)e
jRd*e
jRd+e
jRd,eTd-eUd$e
jRfd.d/ZVG d0d1 d1ejWZXG d2d3 d3ejWZYG d4d5 d5ejWZZG d6d7 d7ejWZ[G d8d9 d9ejWZ\G d:d; d;ejWZ]d<ed=e^d$eeT fd>d?Z_e]Z`dS )AzDInference-only MiniMax M2 model compatible with HuggingFace weights.    N)nullcontext)IterableOptionalSetTupleUnion)nn)PretrainedConfig)model_forward_maybe_tbo)"get_moe_expert_parallel_world_sizeget_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)ExpertLocationDispatchInfo)LayerCommunicatorLayerScatterModesScatterMode)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)get_moe_impl_class)FusedMoE)TopK)get_moe_a2a_backend)QuantizationConfig)RadixAttention)get_rope)PPMissingLayer)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loadermaybe_remap_kv_scale_name)get_global_server_args)BumpAllocator
add_prefixget_compiler_backendis_non_idle_and_non_emptymake_layersBLOCK_SIZE1BLOCK_SIZE2c
                 C   s   t d}
| |
|  }||
|  }t d|}||k }t d|	}||k }t j|| |dd}t j|| |dd}|t j}t j|| dd}|t j}t j|| dd}t ||
 | t ||
 | | d S )Nr           maskother)axis)tl
program_idarangeloadtofloat32sumstore)x1_ptrx2_ptr	stride_x1	stride_x2
sum_sq_ptrBD1D2r.   r/   row_idx1_rowx2_rowoffsets1mask1offsets2mask2x1x2x1_f32sum_sq1x2_f32sum_sq2 rR   P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/minimax_m2.pyrmsnorm_sumsq_kernel_serialO   s   
rT   c           #      C   sr  t d}| ||
  }|||  }|||
  }|||  }t || }t || | }t || | | }t ||	 | | }t d|}t d|}||k }||	k }t j|| |dd}t j|| |dd}t j|| |dd}t j|| |dd} |t j| |t j |j}!|t j| | t j |j}"t j|| |!|d t j|| |"|d d S )Nr   r0   r1         ?)r2   )	r5   r6   r8   rsqrtr7   r9   r:   dtyper<   )#r=   r>   w1_ptrw2_ptrrA   out1_ptrout2_ptrrB   rC   rD   r?   r@   tp_worldepsr.   r/   rE   rF   rG   out1_rowout2_rowrO   rQ   inv_rms1inv_rms2rH   rJ   rI   rK   rL   w1rM   w2out1out2rR   rR   rS   rmsnorm_apply_kernel_serialr   s*   
$$rf   rL   rM   returnc                 C   s   | j r|j sJ | j\}}|j\}}||ksJ | d}|d}|| d d d }tj|| jtjd}	t|}
t|}|f}t	| | ||||	||||
|
 |	S )Nr         devicerW   )
is_cudashapestridetorchemptyrk   r:   tritonnext_power_of_2rT   )rL   rM   rB   rC   B2rD   r?   r@   B_paddedsum_sqr.   r/   gridrR   rR   rS   rms_sumsq_serial   s0   





rw      h㈵>rb   rc   ru   r\   r]   c                 C   s   | j r|j r|j r|j r|j sJ | j\}}|j\}	}
||	ks!J | d}|d}tj||| j| jd}tj||
|j|jd}t|}t|
}|f}t	| | |||||||||
|||||| ||fS )Nr   rj   )
rl   rm   rn   ro   rp   rk   rW   rq   rr   rf   )rL   rM   rb   rc   ru   r\   r]   rB   rC   rs   rD   r?   r@   rd   re   r.   r/   rv   rR   rR   rS   rms_apply_serial   s<   "	





rz   c                       s   e Zd ZdZddededdf fddZed	ej	d
e
jddfddZe
jde d	dde
jdee
j dee
jee
je
jf f fddZedd dd de
jde
jde
jf
ddZ  ZS )MiniMaxM2RMSNormTPz:RMSNorm with Tensor Parallel support for QK normalization.ư>hidden_sizer]   rg   Nc                    sJ   t    t | _t | _tt	t
|| j | _| j| j_|| _d S N)super__init__r   r\   r   tp_rankr   	Parameterro   onesintweightweight_loadervariance_epsilon)selfr}   r]   	__class__rR   rS   r      s   


zMiniMaxM2RMSNormTP.__init__paramloaded_weightc                 C   sD   t  }t }|jd | }t|| |d | }| j||  dS )z.Custom weight loader that handles TP sharding.r   rx   N)r   r   rm   slicedatacopy_)r   r   r\   r   
shard_sizeshardrR   rR   rS   r     s
   z MiniMaxM2RMSNormTP.weight_loaderT)dynamicbackendxresidualc                 C   sz   |du sJ d|j }|tj}|djddtjd}| jdkr)t|| j }|t|| j	  }|| j
 |}|S )z0Forward pass with TP-aware variance computation.Nz/RMSNormTP does not support residual connection.   T)dimkeepdimrW   rx   )rW   r9   ro   r:   powmeanr\   r   rV   r   r   )r   r   r   
orig_dtypevariancerR   rR   rS   forward  s   
zMiniMaxM2RMSNormTP.forwardq_normk_normqkc                 C   sD   t ||}| jdkrt|}t||| j|j|| j| j\}}||fS )Nrx   )rw   r\   r   rz   r   r   )r   r   r   r   ru   rR   rR   rS   
forward_qk'  s   


zMiniMaxM2RMSNormTP.forward_qk)r|   r~   )__name__
__module____qualname____doc__r   floatr   staticmethodr   r   ro   Tensorr   compiler+   r   r   r   r   r   __classcell__rR   rR   r   rS   r{      s@    
r{   c                	       s   e Zd ZdZ		d'dededee def fdd	Z	e
d
ejdejddfddZdejdedejfddZdejdejfddZdejdedejfddZdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Z  ZS )(MiniMaxM2MoEzDMiniMax MoE implementation using DeepEP for Expert Parallel support.N configlayer_idquant_configprefixc              
      s  t    t | _| j|jkrtd| j d|j dt|dd| _| jr9t	t
j|jt
jd| _tj| j_nd | _t||jt j |j|j|j||td|d| _t|jd	|j| jd
d| _t|j|jdt
jd td|d| _|| _t   rt! | _"|j| _#d S d S )NzTensor parallel size z' is greater than the number of experts .use_routing_biasF)rW   experts)num_expertstop_kr}   intermediate_sizer   r   r   TrU   )r   renormalizescoring_funccorrection_biasrouted_scaling_factorgate)biasparams_dtyper   r   )$r   r   r   tp_sizenum_local_experts
ValueErrorgetattrr   r   r   ro   rp   r:   e_score_correction_biasr   ebias_weight_loaderr   r   r(   ep_num_redundant_expertsnum_experts_per_tokr}   r   r*   r   r   r   topkr   r   r   r   	is_deepepr   ep_sizer   )r   r   r   r   r   r   rR   rS   r   B  s^   


	
zMiniMaxM2MoE.__init__r   r   rg   c                 C   s,   |   |  ks
J | j|tj d S r~   )sizer   r   r9   ro   r:   )r   r   rR   rR   rS   r   |  s   z MiniMaxM2MoE.ebias_weight_loaderhidden_statesforward_batchc                 C   s    t   r| ||S | |S r~   )r   r   forward_deepepforward_normal)r   r   r   rR   rR   rS   r     s   

zMiniMaxM2MoE.forwardc                 C   sb   |j \}}|d|}| |tj\}}| ||}| ||}| jdkr+t	|}|||S )Nr   rx   )
rm   viewr   r9   ro   r:   r   r   r   r   )r   r   
num_tokens
hidden_dimrouter_logits_topk_outputfinal_hidden_statesrR   rR   rS   r     s   

zMiniMaxM2MoE.forward_normalc                 C   sf   |j d dkr"| |tj\}}| j|||jtj| j	dd}n| jj
|jd}| j||d}|S )Nr   r   )num_token_non_paddedexpert_location_dispatch_info)rk   )r   r   )rm   r   r9   ro   r:   r   r   r   init_newr   empty_topk_outputrk   r   )r   r   r   r   r   r   r   rR   rR   rS   r     s    	zMiniMaxM2MoE.forward_deepepc                 C   s0   t |jj|jr| |j\|_}dS d|_dS )z.Gate operation for TBO - compute router logitsN)r,   r   forward_modehidden_states_mlp_inputr   r   )r   stater   rR   rR   rS   op_gate  s
   

zMiniMaxM2MoE.op_gatec                 C   s   | d}|j}|durDt jrt nt | j}| | j|||j	j
tj| jdd\|_|_}W d   dS 1 s=w   Y  dS tjd| jfdtj|jd|_tjd| jftj|jd|_dS )z"Expert selection operation for TBOr   Nr   )r   r   r   r   r   r   )rW   rk   )popr   r(   enable_piecewise_cuda_graphr   r   with_current_layerr   r   r   r   r   r   topk_weights_localtopk_idx_localro   fullr   r   rk   rp   r:   )r   r   r   r   ctxr   rR   rR   rS   op_select_experts  s2   
"
zMiniMaxM2MoE.op_select_expertsc                 C   sD   | j dkr | jjj|d|d|d|j|dd dS dS )z3Dispatch A operation for TBO - start async dispatchrx   r   r   r   tbo_subbatch_indexr   topk_idxtopk_weightsr   r   N)r   r   deepep_dispatcher
dispatch_ar   r   getr   r   rR   rR   rS   op_dispatch_a  s   

zMiniMaxM2MoE.op_dispatch_ac                 C   sl   | j dkr4t jrt nt | j}| | jjj	|
dd|_W d   dS 1 s-w   Y  dS dS )z6Dispatch B operation for TBO - complete async dispatchrx   r   r   N)r   r(   r   r   r   r   r   r   r   
dispatch_br   dispatch_output)r   r   r   rR   rR   rS   op_dispatch_b  s   

"zMiniMaxM2MoE.op_dispatch_bc                 C   s   | j j|jd|_dS )zExpert computation for TBO)r   N)r   moe_implr   hidden_states_experts_outputr   rR   rR   rS   
op_experts  s   zMiniMaxM2MoE.op_expertsc                 C   sJ   | j dkr#| jjj|d|jj|jj|j|	dd |d dS dS )z1Combine A operation for TBO - start async combinerx   r   r   r   r   N)
r   r   r   	combine_ar   r   r   r   r   r   r   rR   rR   rS   op_combine_a  s   
zMiniMaxM2MoE.op_combine_ac                 C   s*   | j dkr| jjj|dd|_dS dS )z4Combine B operation for TBO - complete async combinerx   r   r   N)r   r   r   	combine_br   hidden_states_after_combiner   rR   rR   rS   op_combine_b  s   
zMiniMaxM2MoE.op_combine_bc                 C   s   | d}||_dS )z+Output operation for TBO - final MLP outputr   N)r   hidden_states_mlp_output)r   r   r   rR   rR   rS   	op_output  s   

zMiniMaxM2MoE.op_outputNr   )r   r   r   r   r	   r   r   r   strr   r   r   r   ro   r   r   r$   r   r   r   r   r   r   r   r   r   r   r   r   rR   rR   r   rS   r   ?  sL    :

		r   c                       s   e Zd ZdZ			ddededee ded	df
 fd
dZ	de
jde
jdefddZdd Zde
jde
jded	e
jfddZdd Zdd Z  ZS )MiniMaxM2AttentionzHMiniMax Attention implementation with QK normalization and partial RoPE.r   Nr   r   r   r   r   rg   c              
      s  t    |j| _t }|j| _| j| dksJ | j| | _|j| _| j|kr2| j| dks1J n	|| j dks;J t	d| j| | _
t|d| j| j | _| j| j | _| j
| j | _| jd | _t|dd| _t|dd| _t|d	| j| _t|d
d| _t|dd| _t| j| j| j| jd|td|d| _t| j| j | jdd|td|d| _t|dd }t| j| j| j| j|d| _| jr| jdkrt| j| j |jd| _t| j| j |jd| _nt d| j t!| j| j| j| j
||td|d| _"d S )Nr   rx   head_dimg      
rope_thetai'  max_position_embeddingsi    
rotary_dimuse_qk_normFqk_norm_type	per_layerqkv_proj)r   r   r   o_proj)r   reduce_resultsr   r   rope_scaling)r  max_positionbaser  r]   zUnsupported qk_norm_type: attn)num_kv_headsr   r   r   )#r   r   r}   r   num_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxr  r   r  q_sizekv_sizescalingr  r  r  r  r  r   r*   r	  r   r
  r    
rotary_embr{   rms_norm_epsr   r   r   r   r  )r   r   r   r   r   r   r  r   rR   rS   r     s   




	

zMiniMaxM2Attention.__init__	positionsr   r   c           
      C   s   |  |\}}|j| j| j| jgdd\}}}| jr+t| j| j|	 |	 \}}n	|	 |	 }}| 
|||\}}||||f}	d ||	fS )Nr   )r   )r	  splitr  r  r  r{   r   r   r   
contiguousr  )
r   r  r   r   qkvr   r   r   vinner_staterR   rR   rS   forward_preparex  s    

z"MiniMaxM2Attention.forward_preparec                 C   s&   |\}}}| j | }| |\}}|S r~   )r  r
  )r   intermediate_stater   r"  attn_outputoutputrR   rR   rS   forward_core  s   

zMiniMaxM2Attention.forward_corec                 C   s   | j |||d}| |S Nr  r   r   )r#  r'  )r   r  r   r   srR   rR   rS   r     s   
zMiniMaxM2Attention.forwardc                 C   s    | j |j|d|jd|_d S )N!hidden_states_after_comm_pre_attnr)  )r#  r  r   r   attn_intermediate_stater   rR   rR   rS   
op_prepare  s
   zMiniMaxM2Attention.op_preparec                 C   s   |  |d|_d S )Nr,  )r'  r   hidden_states_after_attnr   rR   rR   rS   op_core  s   
zMiniMaxM2Attention.op_core)r   Nr   )r   r   r   r   r	   r   r   r   r   r   ro   r   r$   r#  r'  r   r-  r/  r   rR   rR   r   rS   r    sF    c

r  c                       s   e Zd ZdZ		ddededee deddf
 fd	d
Z	de
jde
jdedee
j de
jf
ddZ	dde
jde
jdedee
j dedee fddZdd Zdd Zdd Z  ZS )MiniMaxM2DecoderLayerz6MiniMax Decoder Layer implementation with MoE support.Nr   r   r   r   r   rg   c                    s   t    |j| _|| _d| _t|||td|d| _t|||td|d| _	t
|jt|ddd| _t
|jt|ddd| _d}d}tj||j| j||d| _t| j| j| jdd	| _d S )
NT	self_attnr   r   r   r   mlpr  r|   r  )r   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparse)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatter)r   r   r}   r   r5  r  r*   r1  r   block_sparse_moer   r   r9  r:  r   r   num_hidden_layersr8  r   layer_communicator)r   r   r   r   r   r6  r7  r   rR   rS   r     sJ   
zMiniMaxM2DecoderLayer.__init__r  r   r   r   c                 C   s`   | j |||\}}| j|||d}| j |||\}}| ||}| j |||\}}||fS r(  )r>  prepare_attnr1  prepare_mlpr<  postprocess_layer)r   r  r   r   r   rR   rR   rS   r     s    zMiniMaxM2DecoderLayer.forwardzero_allocatorr   c                 C   s2   | j |||\|_|_|t||||d dS )z3Communication prepare for attention - TBO operation)r   r  rB  r   N)r>  r?  r+  residual_after_input_lnupdatedict)r   r   r  r   r   r   rB  r   rR   rR   rS   op_comm_prepare_attn  s   
z*MiniMaxM2DecoderLayer.op_comm_prepare_attnc                 C   s*   | j |d|d|j\|_|_dS )z-Communication prepare for MLP - TBO operationr.  rC  N)r>  r@  r   r   r   residual_after_comm_pre_mlpr   rR   rR   rS   op_comm_prepare_mlp  s   z)MiniMaxM2DecoderLayer.op_comm_prepare_mlpc                 C   s   | d}| ||j|_d S )Nr   )r   r<  r   r   )r   r   r   rR   rR   rS   op_mlp%  s   

zMiniMaxM2DecoderLayer.op_mlpc                 C   sB   | j |d|d|j\}}t|j|||j|j|jd}|S )z3Communication postprocess for layer - TBO operationr   rG  )r  r   r   r   rB  r   )r>  rA  r   r   rE  r  rB  r   )r   r   r   r   r&  rR   rR   rS   op_comm_postprocess_layer+  s   z/MiniMaxM2DecoderLayer.op_comm_postprocess_layerr   r~   )r   r   r   r   r	   r   r   r   r   r   ro   r   r$   r   r)   rF  rH  rI  rJ  r   rR   rR   r   rS   r0    sV    4
)

r0  c                       s   e Zd ZdZdZ		ddedee deddf fd	d
Z	de
jde
jfddZ		dde
jde
jdede
jdee dee
jeee
jee
j f f fddZ  ZS )MiniMaxM2ModelzMiniMax Model implementation.FNr   r   r   r   rg   c                    s   t    t dd| _ j| _t | _t j j| _	dt
dtjf fdd}t j|| jj| jjtd|d\| _| _| _| jjrNt j jd	| _ntd
d| _g | _d S )Npad_token_idr   r   rg   c                    s   t  | |dS )Nr2  )r0  )idxr   r   r   rR   rS   layer_fnT  s   z)MiniMaxM2Model.__init__.<locals>.layer_fnlayers)pp_rankpp_sizer   r  T)return_tuple)r   r   r   padding_idx
vocab_sizer   pp_groupr#   r}   embed_tokensr   r   Moduler-   r=  rank_in_group
world_sizer*   rP  start_layer	end_layeris_last_rankr   r  normr!   layers_to_capture)r   r   r   r   rO  r   rN  rS   r   C  s(   

zMiniMaxM2Model.__init__	input_idsc                 C   s
   |  |S r~   )rW  r   r`  rR   rR   rS   get_input_embeddingsk  s   
z#MiniMaxM2Model.get_input_embeddingsr  r   input_embedspp_proxy_tensorsc              	   C   sH  | j jr|d u r| |}n|}d }n|d usJ |d }|d }g }|jr7t| jdt ||||d\}}nDt| j	| j
D ]<}	t jrGt nt |	}
|
# |	| jv r\|||  | j|	 }|||||d\}}W d    n1 suw   Y  q>| j jst||dS |d ur| ||\}}n| |}t|dkr|S ||fS )Nr   r   T)rP  
enable_tboinput_data_scatter_moder  r   r   r   )r  r   r   r   )r   r   r   )rV  is_first_rankrb  can_run_tbor
   rP  r   model_input_outputranger[  r\  r(   r   r   r   r   r_  appendr]  r%   r^  len)r   r`  r  r   rc  rd  r   r   aux_hidden_statesir   layerr   rR   rR   rS   r   n  s\   




zMiniMaxM2Model.forwardr   )NN)r   r   r   r   fall_back_to_pt_during_loadr	   r   r   r   r   ro   r   rb  r$   r%   r   r   listr   r   rR   rR   r   rS   rK  >  s<    (rK  c                       s   e Zd ZdZ		ddedee deddf fdd	Zd
e	j
de	j
fddZddeee  fddZdd Ze	 	dd
e	j
de	j
dede	j
de	j
f
ddZdeeee	j
f  fddZedd Z  ZS )MiniMaxM2ForCausalLMz.MiniMax M2 model for causal language modeling.Nr   r   r   r   rg   c                    sn   t    || _|| _t||td|d| _t jr)t	|j
|jd td|d| _nt | _t|| _d| _d S )Nmodel)r   lm_head)r   r   F)r   r   r   r   rK  r*   rs  r   r]  r"   rU  r}   rt  r!   r   logits_processorcapture_aux_hidden_states)r   r   r   r   r   rR   rS   r     s    



zMiniMaxM2ForCausalLM.__init__r`  c                 C   s   | j |S r~   )rs  rb  ra  rR   rR   rS   rb    s   z)MiniMaxM2ForCausalLM.get_input_embeddings	layer_idsc                 C   sR   t  jsd S d| _|d u r| jj}d|d |d g| j_d S dd |D | j_d S )NTr   rh   c                 S   s   g | ]}|d  qS )rx   rR   ).0valrR   rR   rS   
<listcomp>  s    zEMiniMaxM2ForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)r   r]  rv  r   r=  rs  r_  )r   rw  r4  rR   rR   rS   set_eagle3_layers_to_capture  s   z1MiniMaxM2ForCausalLM.set_eagle3_layers_to_capturec                 C   s   | j jj| jjfS r~   )rs  rW  r   rt  )r   rR   rR   rS   get_embed_and_head  s   z'MiniMaxM2ForCausalLM.get_embed_and_headr  r   rc  c                 C   s6   |  ||||}d }| jr|\}}| ||| j||S r~   )rs  rv  ru  rt  )r   r`  r  r   rc  r   rm  rR   rR   rS   r     s   	zMiniMaxM2ForCausalLM.forwardweightsc              	   C   sh  g d}t jddd| jjd}t|  }t }|D ]\}}d|v r#qt| j|}|dur.q|D ]1\}	}
}|
|vr:q0d|v rC||vrCq0||
|	}|	d	rS||vrSq0|| }|j
}||||  nJ|D ]$}|\}	}
}}|
|vrqqd||
|	}|| }|j
}||||||d
  n#|	d	r||vrqt||}|du rq|| }t|dt}||| || q|S )z@Load model weights with proper mapping for MiniMax architecture.))r	  q_projr   )r	  k_projr   )r	  v_projr!  )gate_up_proj	gate_projr   )r  up_projrx   rb   rc   w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   zrotary_emb.inv_freqNzmlp.experts.z.bias)shard_id	expert_idr   )r   make_expert_params_mappingr   r   rE  named_parametersset#get_spec_layer_idx_from_weight_namereplaceendswithr   r'   r   r&   add)r   r}  stacked_params_mappingexpert_params_mappingparams_dictloaded_paramsnamer   
spec_layer
param_nameweight_namer  r   r   mappingr  rR   rR   rS   load_weights  sl   

z!MiniMaxM2ForCausalLM.load_weightsc                 C   s   ddl m} ||j|jd dS )Nr   )ModelConfigForExpertLocation)r4  num_logical_experts
num_groups)sglang.srt.eplb.expert_locationr  r=  r   )clsr   r  rR   rR   rS   $get_model_config_for_expert_locationM  s   z9MiniMaxM2ForCausalLM.get_model_config_for_expert_locationr   r~   )r   r   r   r   r	   r   r   r   r   ro   r   rb  rq  r   r{  r|  no_gradr$   r   r   r   r  classmethodr  r   rR   rR   r   rS   rr    s@    Wrr  r   r  c                 C   sP   t | dr&| jdkr&| j}t| jD ]}|d||  dr%||   S qd S )Nnum_mtp_modulesr   zmodel.layers.r   )hasattrr  r=  rj  
startswith)r   r  	layer_idxrn  rR   rR   rS   r  X  s   r  )rx   ry   )ar   logging
contextlibr   typingr   r   r   r   r   ro   rq   triton.languagelanguager5   r   transformersr	   *sglang.srt.batch_overlap.two_batch_overlapr
   sglang.srt.distributedr   r   r   r   r   #sglang.srt.eplb.expert_distributionr   (sglang.srt.eplb.expert_location_dispatchr   sglang.srt.layers.communicatorr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   "sglang.srt.layers.moe.ep_moe.layerr   ,sglang.srt.layers.moe.fused_moe_triton.layerr   sglang.srt.layers.moe.topkr   sglang.srt.layers.moe.utilsr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr    sglang.srt.layers.utilsr!   *sglang.srt.layers.vocab_parallel_embeddingr"   r#   ,sglang.srt.model_executor.forward_batch_infor$   r%   $sglang.srt.model_loader.weight_utilsr&   r'   sglang.srt.server_argsr(   sglang.srt.utilsr)   r*   r+   r,   r-   	getLoggerr   loggerjit	constexprrT   rf   r   rw   r   r   rz   rX  r{   r   r  r0  rK  rr  r   r  
EntryClassrR   rR   rR   rS   <module>   s   
	
".,
-K T  o ,
