o
    پig                     @   sJ  d dl Z d dlmZmZmZmZmZmZ d dlZd dl	m
  mZ d dlm
Z
 d dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z>m?Z? d dl@mAZAmBZB d dlCmDZDmEZE d dlFmGZGmHZH d dlImJZJ d dlKmLZLmMZMmNZNmOZO dZPe QeRZSG dd de
jTZUG dd de
jTZVG dd  d e
jTZWG d!d" d"e
jTZXG d#d$ d$e
jTZYG d%d& d&e
jTZZG d'd( d(e
jTZ[e[Z\dS ))    N)AnyDictIterableOptionalTupleUnion)nn)model_forward_maybe_tbo)"get_moe_expert_parallel_world_sizeget_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)ModelConfigForExpertLocation)ExpertLocationDispatchInfo)
SiluAndMul)LayerCommunicatorLayerScatterModesScatterModeenable_moe_dense_fully_dp)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)get_moe_a2a_backendget_moe_runner_backend/should_use_flashinfer_cutlass_moe_fp4_allgather)	DeepEPMoEget_moe_impl_class)TopKTopKOutputFormat)QuantizationConfig)RadixAttention)get_rope)PPMissingLayerget_layer_id)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loaderkv_cache_scales_loader)get_global_server_args)	LazyValue
add_prefixis_non_idle_and_non_emptymake_layersc                       sx   e Zd Z					ddedededee ded	ed
ee dee ddf fddZ			dde	dedefddZ
  ZS )	MiMoV2MLPNT hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixtp_ranktp_sizereturnc	           	   
      sx   t    || _t||gd d|td|||d| _t||d||td|||d| _|dkr6td| d	t	 | _
d S )
N   Fgate_up_proj)biasr:   r<   r=   r>   	down_proj)rB   r:   r;   r<   r=   r>   siluUnsupported activation: !. Only silu is supported for now.)super__init__r>   r   r2   rA   r   rC   
ValueErrorr   act_fn)	selfr7   r8   r9   r:   r;   r<   r=   r>   	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/mimo_v2_flash.pyrH   U   s4   
	

zMiMoV2MLP.__init__Fforward_batchshould_allreduce_fusionuse_reduce_scatterc                 C   sN   | j dkr|jd dkr|S | |\}}| |}| j||p |d\}}|S )N   r   )skip_all_reduce)r>   shaperA   rJ   rC   )rK   xrP   rQ   rR   gate_up_rN   rN   rO   forward}   s   

zMiMoV2MLP.forward)NTr6   NNNFF)__name__
__module____qualname__intstrr   r%   boolrH   r,   rY   __classcell__rN   rN   rL   rO   r5   T   sF    	
+r5   c                       s4   e Zd Z		d	dedef fddZdd Z  ZS )
MoEGater6   Fr<   is_nextnc                    s   t    || _tj| _ttj|j	|j
f| jd| _|jdkrD|d ur3| dkr3t  r3tjn| j}ttj|j	|d| _d S d | _d S )N)dtypenoaux_tcmodelopt_fp4)rG   rH   rc   torchfloat32rd   r   	Parameteremptyn_routed_expertsr7   weighttopk_methodget_namer   is_flashinfer_trtllmbfloat16e_score_correction_bias)rK   configr:   r<   rc   correction_bias_dtyperL   rN   rO   rH      s$   



zMoEGate.__init__c                 C   s   t || j| jd }|S N)Flineartord   rl   )rK   hidden_stateslogitsrN   rN   rO   rY      s   zMoEGate.forward)r6   F)r[   r\   r]   r_   r`   rH   rY   ra   rN   rN   rL   rO   rb      s    rb   c                       s   e Zd Z			d(dededee dedef
 fd	d
Z	dd Z
			d)dejdee dededejf
ddZ		d*dejdededejfddZdejdedejfddZdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Z  ZS )+	MiMoV2MoENr6   Frr   layer_idr:   r<   rc   c                    sh  t    t | _|| _|| _| j|jkr"td| j d|j d|jdkr0td|j dt	||t
d||d| _t|}||jt j |j|j|j| j|d	t
d
|d| _t|j|jd|j|j| jj|d	| jj|d u rrtjnd d
| _t  st  rt | _ |jt j | _!|j| _"|j| _|j| _#| jjd ur| jjj$nd | _%t  pt  | _&d S )NzTensor parallel size z' is greater than the number of experts .rD   rE   rF   gate)rr   r:   r<   rc         ?experts)num_expertstop_kr7   r8   r{   r:   routed_scaling_factorr<   T)
r   renormalizeuse_grouped_topknum_expert_group
topk_groupcorrection_biasr:   r   %apply_routed_scaling_factor_on_outputoutput_format)'rG   rH   r   r>   rr   r{   rk   rI   r9   rb   r2   r}   r"   r0   ep_num_redundant_expertsnum_experts_per_tokr7   moe_intermediate_sizer   r#   norm_topk_probn_groupr   rq   )should_fuse_routed_scaling_factor_in_topkr$   STANDARDtopkr   	is_deepepis_mooncaker
   ep_sizer   r   r   datar   _enable_a2a_moe)rK   rr   r{   r:   r<   rc   experts_typerL   rN   rO   rH      sx   



zMiMoV2MoE.__init__c                 C   s   dd | j  D S )Nc                 S   s   g | ]\}}|d vr|j qS ))r   )r   ).0namerV   rN   rN   rO   
<listcomp>  s
    z-MiMoV2MoE.get_moe_weights.<locals>.<listcomp>)r   named_parametersrK   rN   rN   rO   get_moe_weights  s   zMiMoV2MoE.get_moe_weightsrx   rP   rQ   rR   r?   c                 C   s    | j s
| |||S | ||S rt   )r   forward_normalforward_deepep)rK   rx   rP   rQ   rR   rN   rN   rO   rY     s   zMiMoV2MoE.forwardc                 C   sd   |j d dkr| |}| ||}n| j|j}| ||}| jdkr0|s0|s0t s0t|}|S )Nr   rS   )	rU   r}   r   empty_topk_outputdevicer   r>   r    r   )rK   rx   rQ   rR   router_logitstopk_outputfinal_hidden_statesrN   rN   rO   r     s   

zMiMoV2MoE.forward_normalc                 C   sX   |j d dkr| |}| j|||jtj| jdd}n| j|j}| j	||d}|S )Nr   r{   )num_token_non_paddedexpert_location_dispatch_info)rx   r   )
rU   r}   r   r   r   init_newr{   r   r   r   )rK   rx   rP   r   r   r   rN   rN   rO   r   6  s   
	zMiMoV2MoE.forward_deepepc                 C   s,   t |jj|jr| |j|_d S d |_d S rt   )r3   rP   forward_modehidden_states_mlp_inputr}   r   rK   staterN   rN   rO   op_gateM  s
   

zMiMoV2MoE.op_gatec                 C   s   | d}|j}|d ur7t | j | j|||jjtj	| jdd|_
W d    d S 1 s0w   Y  d S | j|j|_
d S )Nr   r   )rx   r   r   r   )popr   r   with_current_layerr{   r   rP   r   r   r   r   r   r   )rK   r   r   rx   rN   rN   rO   op_select_expertsV  s    

"zMiMoV2MoE.op_select_expertsc                 C   s8   | j dkr| jjj|d|d|dd d S d S )NrS   r   r   tbo_subbatch_index)rx   r   r   )r   r   
dispatcher
dispatch_ar   getr   rN   rN   rO   op_dispatch_ah  s   

zMiMoV2MoE.op_dispatch_ac                 C   sZ   | j dkr+t | j | jjj|dd|_W d    d S 1 s$w   Y  d S d S NrS   r   )r   )	r   r   r   r{   r   r   
dispatch_br   dispatch_outputr   rN   rN   rO   op_dispatch_bp  s   

"zMiMoV2MoE.op_dispatch_bc                 C   s   | j j|jd|_d S )N)r   )r   run_moe_corer   combine_inputr   rN   rN   rO   
op_expertsy  s   zMiMoV2MoE.op_expertsc                 C   s:   | j dkr| jjj|d|dd |d d S d S )NrS   r   r   )r   r   r   )r   r   r   	combine_ar   r   r   rN   rN   rO   op_combine_a~  s   
zMiMoV2MoE.op_combine_ac                 C   s*   | j dkr| jjj|dd|_d S d S r   )r   r   r   	combine_br   hidden_states_after_combiner   rN   rN   rO   op_combine_b  s
   
zMiMoV2MoE.op_combine_bc                 C   s   | d|_d S )Nr   )r   hidden_states_mlp_outputr   rN   rN   rO   	op_output     zMiMoV2MoE.op_output)Nr6   FrZ   )FF)r[   r\   r]   MiMoV2FlashConfigr^   r   r%   r_   r`   rH   r   rg   Tensorr,   rY   r   r   r   r   r   r   r   r   r   r   ra   rN   rN   rL   rO   rz      sp    R



		rz   c                #       s   e Zd Z													d)d	ed
ededee dee dee dedededededeeee	f  dedee
 dededdf" fddZdd Zdd Zd ejd!ejd"efd#d$Zd%d& Zd ejd!ejd"edejfd'd(Z  ZS )*MiMoV2AttentionNFr   @B    r~   r6   r7   	num_headsnum_kv_headshead_dim
v_head_dimv_scalesliding_window_sizeattention_biasattention_sink_biasr{   
rope_thetarope_scalingmax_position_embeddingsr:   partial_rotary_factorr<   r?   c                    s  t    || _t }t }|| _| j| dksJ | j| | _|| _| j|kr2| j| dks1J n	|| j dks;J td| j| | _	|| _
|d urM|n|| _| j| j
 | _| j	| j
 | _| j	| j | _|| _| j
d | _t|| j
| j| j| j||||td|dd| _t| j| j |d|||dtd|d	| _t| j
| j
||||d
| _t| j| j
| j| j	|
| j||td|d	| _|	rtjjt| jdd| _d S d | _d S )Nr   rS   g      qkv_projT)v_head_sizerB   r:   r=   r>   r<   skip_block_quant_checkFo_proj)rB   r:   r=   r>   r;   r<   )
rotary_dimmax_positionbaser   r   attn)r   r{   r   r   r:   r<   )requires_grad)rG   rH   r7   r   r   total_num_headsr   total_num_kv_headsmaxr   r   r   q_sizek_sizev_sizer   scalingr   r2   r   r   r   r'   
rotary_embr&   r   rg   r   ri   rj   r   )rK   r7   r   r   r   r   r   r   r   r   r{   r   r   r   r:   r   r<   attn_tp_rankattn_tp_sizerL   rN   rO   rH     s   


	zMiMoV2Attention.__init__c                 C   s    | j |j|d|jd|_d S )N!hidden_states_after_comm_pre_attn	positionsrx   rP   )forward_preparer   r   rP   attn_intermediate_stater   rN   rN   rO   
op_prepare  s
   zMiMoV2Attention.op_preparec                 C   s   |  |d|_d S )Nr   )forward_corer   hidden_states_after_attnr   rN   rN   rO   op_core  s   
zMiMoV2Attention.op_corer   rx   rP   c           
      C   s   |j d dkr||d fS | |\}}|j| j| j| jgdd\}}}| |||\}}| jd ur6|| j }||||f}	d ||	fS )Nr   r   dim)rU   r   splitr   r   r   r   r   )
rK   r   rx   rP   qkvrX   qkvinner_staterN   rN   rO   r     s   
 


zMiMoV2Attention.forward_preparec                 C   s:   |\}}}|d u r|S | j |d| ji}| |\}}|S )Nsinks)r   r   r   )rK   intermediate_staterx   rP   r   attn_outputoutputrX   rN   rN   rO   r     s   
zMiMoV2Attention.forward_corec                 C   s|   |  |\}}|j| j| j| jgdd\}}}| |||\}}| jd ur*|| j }| j||||| jd}	| 	|	\}
}|
S )Nr   r   )r   )
r   r   r   r   r   r   r   r   r   r   )rK   r   rx   rP   r   rX   r   r   r   r   r   rN   rN   rO   rY     s    

zMiMoV2Attention.forward)NNNr   FFr   r   Nr   Nr~   r6   )r[   r\   r]   r^   r   floatr`   r   r_   r   r%   rH   r   r   rg   r   r,   r   r   rY   ra   rN   rN   rL   rO   r     s    	
e
r   c                       s   e Zd Z			ddededee deddf
 fd	d
Zde	j
de	j
dedee	j
 dee	j
e	j
f f
ddZdedefddZdefddZ	d de	j
de	j
dedee	j
 dee f
ddZdd Zdd Zdd Z  ZS )!MiMoV2DecoderLayerr   Nr6   rr   r{   r:   r<   r?   c              	      s~  t    || _|j| _|| _t|dd}t|dd }t|dd}|  rytd&i d| jd|jd|j	d	|j
d
t|dd dt|dd d|jd|jdt|ddd|dt|d|d|d|d|dt|dddtd|| _nOtd&i d| jd| jjd|jd	|jd
t|d
d dt|dd ddd|jdt|ddd|d|d|d|d|dt|dddtd|| _| || _| |d }| |d }	| jrt||td||d| _nt rd\}
}nd \}
}t| j|j|j|td||
|d!| _t|j|jd"| _t|j|jd"| _tj||j | j||	d#| _!t"| j!| j| jd$| j| jj d kd%| _#d S )'Nr   r   r   r   r   r7   r   r   r   r   swa_v_head_dimr   attention_value_scaler   r   r   add_swa_attention_sink_biasFr{   swa_rope_thetar:   r   r~   r<   	self_attnr   add_full_attention_sink_biasrS   mlp)rr   r:   r<   r{   )r   rS   NN)r7   r8   r9   r:   r<   r=   r>   eps)r{   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparseT)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatteris_last_layerrN   )$rG   rH   rr   r7   r{   getattris_swa_layerr   swa_num_attention_headsswa_num_key_value_headsswa_head_dimr   r   r2   r  num_attention_headsnum_key_value_headsr   is_moe_layerr  rz   r  r   r5   r8   r9   r   layernorm_epsilonr  r  r   r   num_hidden_layersr  r   layer_communicator)rK   rr   r{   r:   r<   r   r   r   r  r  mlp_tp_rankmlp_tp_sizerL   rN   rO   rH   4  s   
	
	



zMiMoV2DecoderLayer.__init__r   rx   rP   residualc                 C   s   | j |||\}}|jd dkr| j|||d}| j |||\}}| j |}| j |}| ||||}|r@d|_||fS | j 	|||\}}||fS )Nr   r   T)
r  prepare_attnrU   r  prepare_mlp)should_fuse_mlp_allreduce_with_next_layershould_use_reduce_scatterr  _sglang_needs_allreduce_fusionpostprocess_layer)rK   r   rx   rP   r!  rQ   rR   rN   rN   rO   rY     s:   zMiMoV2DecoderLayer.forward	layer_idxc                 C   sH   t | jdo#d|  kot| jjk n  o#t| jjt o#| jj| S )Nmoe_layer_freqr   )hasattrrr   lenr)  
isinstancer^   )rK   r(  rN   rN   rO   r    s   
zMiMoV2DecoderLayer.is_moe_layerc                 C   s   | j j| j dkS )NrS   )rr   hybrid_layer_patternr{   r   rN   rN   rO   r       zMiMoV2DecoderLayer.is_swa_layerr   c                 C   s0   | j |||\|_|_|t|||d d S )N)rP   r   r   )r  r"  r   residual_after_input_lnupdatedict)rK   r   r   rx   rP   r!  r   rN   rN   rO   op_comm_prepare_attn  s   

z'MiMoV2DecoderLayer.op_comm_prepare_attnc                 C   s*   | j |d|d|j\|_|_d S )Nr   r/  )r  r#  r   rP   r   residual_after_comm_pre_mlpr   rN   rN   rO   op_comm_prepare_mlp  s   z&MiMoV2DecoderLayer.op_comm_prepare_mlpc                 C   s   | d}| ||j|_d S )Nr   )r   r  rP   r   )rK   r   rx   rN   rN   rO   op_mlp  s   
zMiMoV2DecoderLayer.op_mlpc                 C   sN   | j |d|d|j\}}t|j|||j|jd}|jh dd |S )Nr   r3  )r   rx   r!  rP   r   >   r   rP   r   )expect_keys)r  r'  r   rP   r1  r   r   clear)rK   r   rx   r!  r   rN   rN   rO   op_comm_postprocess_layer  s    z,MiMoV2DecoderLayer.op_comm_postprocess_layer)r   Nr6   rt   )r[   r\   r]   r   r^   r   r%   r_   rH   rg   r   r,   r   rY   r`   r  r  r2  r4  r5  r8  ra   rN   rN   rL   rO   r   3  sV    i
/

	r   c                       s   e Zd Zddefdedee dedee	j
 ddf
 fdd	Zd
ejdejfddZde	jfddZ		dd
ejdejdedejdee deejef fddZdeddfddZ  ZS )MiMoV2ModelNr6   rr   r:   r<   decoder_layer_typer?   c                    s   t     | _ j| _ j| _t | _| jjr)t	 j j
t td|d| _nt | _p0tt j fdd| jj| jjtd|d\| _| _| _| jjr]t j
 jd| _d S tdd	| _d S )
Nembed_tokens)r:   use_attn_tp_groupr<   c                    s   |  |dS )N)r{   rr   r:   r<   rN   )idxr<   rr   r:  r:   rN   rO   <lambda>.  s    z&MiMoV2Model.__init__.<locals>.<lambda>layers)layer_fnpp_rankpp_sizer<   r	  T)return_tuple)rG   rH   rr   pad_token_idpadding_idx
vocab_sizer   pp_groupis_first_rankr+   r7   r   r2   r;  r(   r   r4   r  rank_in_group
world_sizer@  start_layer	end_layeris_last_rankr   r  norm)rK   rr   r:   r<   r:  rL   r>  rO   rH     s2   

zMiMoV2Model.__init__	input_idsc                 C   s,   t | jdr|  || jj S |  |S )N	scale_emb)r*  rr   get_input_embeddingsrQ  rK   rP  rN   rN   rO   get_input_embedding=  s   zMiMoV2Model.get_input_embeddingc                 C   s   | j S rt   )r;  r   rN   rN   rO   rR  C  s   z MiMoV2Model.get_input_embeddingsr   rP   input_embedspp_proxy_tensorsc              	   C   sl  | j jr|d u r| |}n|}d }n|d usJ |d }|d }|jrd| j}| j}	| jdkrA| jd }
|
||||\}}|d }t| j||	 d|| jkrRt	 n| j|d  j
j||||d\}}nt| j| jD ]}| j| }
|
||||\}}qkd }| j jst||dS |jd dkr|jr|d u r|n|| }|d u r| |}||fS | ||\}}||fS )Nrx   r!  r   rS   T)r@  
enable_tboinput_data_scatter_moder   rP   rx   r!  )rx   r!  )rH  rI  r;  can_run_tborL  rM  r@  r	   r   model_input_outputr  layer_output_moderangerN  r-   rU    return_hidden_states_before_normrO  )rK   rP  r   rP   rU  rV  rx   r!  tbo_start_layertbo_end_layerlayerihidden_states_before_normrX   rN   rN   rO   rY   F  sn   





zMiMoV2Model.forwardquantization_param_pathc                 C   sv   t  }t }t|||| jj| jjjD ]%\}}t| j| t	j
s&| j| j}t|jdr5||j_||j_qtdd S )Nk_scalez8Self attention has no KV cache scaling factor attribute!)r   r   r/   rr   r  rM   
model_typer,  r@  r   Identityr  r*  r   rd  r   RuntimeError)rK   rc  r   r   r(  scaling_factorlayer_self_attnrN   rN   rO   load_kv_cache_scales  s$   
z MiMoV2Model.load_kv_cache_scalesr  )r[   r\   r]   r   r   r   r%   r_   typer   ModulerH   rg   r   rT  	EmbeddingrR  r,   r-   r   rY   rj  ra   rN   rN   rL   rO   r9    sB    +
Pr9  c                       s   e Zd Zg dZddddddZ			d-d
edee deddf fddZ	e
dd ZdejdejfddZdejfddZe 		d.dejdejdedejdee dejfddZe
dd Ze
dd  Zd!eeeejf  fd"d#Zd$d% Zd&d' Zd(eddfd)d*Zed+d, Z  Z S )/MiMoV2FlashForCausalLM)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.)r   r   )r   rS   )r   r@   )rA   r   )rA   rS   )q_projk_projv_proj	gate_projup_projNr6   rr   r:   r<   r?   c                    s   t    t  _| _| _t||td|d _ jj	r0t
|j|j|td|t jd _nt  _t| _t fdd _d S )Nmodel)r:   r<   lm_head)r:   r<   r<  c                      s   dd t  jjD S )Nc                 S   s(   i | ]\}}t |jtr||j qS rN   )r,  r  rz   r   )r   r{   r`  rN   rN   rO   
<dictcomp>  s    

zEMiMoV2FlashForCausalLM.__init__.<locals>.<lambda>.<locals>.<dictcomp>)	enumeratert  r@  rN   r   rN   rO   r?    s    
z1MiMoV2FlashForCausalLM.__init__.<locals>.<lambda>)rG   rH   r   rH  rr   r:   r9  r2   rt  rN  r*   rG  r7   r0   enable_dp_lm_headru  r(   r   logits_processorr1    _routed_experts_weights_of_layer)rK   rr   r:   r<   rL   r   rO   rH     s(   

	


zMiMoV2FlashForCausalLM.__init__c                 C      | j jS rt   )rz  valuer   rN   rN   rO   routed_experts_weights_of_layer     z6MiMoV2FlashForCausalLM.routed_experts_weights_of_layerrP  c                 C   s   | j |S rt   )rt  rT  rS  rN   rN   rO   rT    s   z*MiMoV2FlashForCausalLM.get_input_embeddingc                 C   r{  rt   )rt  r;  r   rN   rN   rO   rR    s   z+MiMoV2FlashForCausalLM.get_input_embeddingsr   rP   rU  rV  c                 C   s:   | j |||||d\}}| jjr| j||| j||dS |S )N)rV  )rb  )rt  rH  rN  ry  ru  )rK   rP  r   rP   rU  rV  rx   rb  rN   rN   rO   rY     s    	
zMiMoV2FlashForCausalLM.forwardc                 C   r{  rt   )rt  rL  r   rN   rN   rO   rL  	  r~  z"MiMoV2FlashForCausalLM.start_layerc                 C   r{  rt   )rt  rM  r   rN   rN   rO   rM    r~  z MiMoV2FlashForCausalLM.end_layerweightsc              	   C   s   g d}t jddd| jjd}t|  }|D ]\}}t|}|d ur6t| jdr6|| jj	k s5|| jj
kr6qd|v s>d|v r?qd	|v sGd
|v rHq| jjrid|v ri| jjdkrh| jjrhttdd |d }|}nqd|v rnq|D ]1\}	}
}|
|vrzqpd|v r||vrqp||
|	}|dr||vrqp|| }|j}||||  nk|D ]$}|\}	}
}}|
|vrq||
|	}|| }|j}||||||d  nD|dr||vrq|| v r|| }d|v rt |  }|j||||    qt|dt}||| qtd| d qd S )N))r   ro  r   )r   rp  r   )r   rq  r   )rA   rr  r   )rA   rs  rS   rr  rC   rs  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   rL  zrotary_emb.inv_freq	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedzlm_head.weightrS   c                 S   s   | d dkS )Nr   zmodel.embed_tokens.weightrN   )rV   rN   rN   rO   r?  =  s    z5MiMoV2FlashForCausalLM.load_weights.<locals>.<lambda>mtpzmlp.experts.z.bias)shard_id	expert_idr   weight_loaderz
Parameter z not found in params_dict)r!   make_expert_params_mappingrr   rk   r1  r   r)   r*  rt  rL  rM  tie_word_embeddingsrH  rK  rN  nextfilterreplaceendswithr  keysr   numelr   copy_r  r.   loggerwarning)rK   r  stacked_params_mappingexpert_params_mappingparams_dictr   loaded_weightr{   embed_token_weights
param_nameweight_namer  paramr  mappingr  startrN   rN   rO   load_weights  s   

z#MiMoV2FlashForCausalLM.load_weightsc                 C   s   | j jj| jjfS rt   )rt  r;  rl   ru  r   rN   rN   rO   get_embed_and_headz  r.  z)MiMoV2FlashForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S rt   )rt  r;  rl   ru  rg   cudaempty_cachesynchronize)rK   embedheadrN   rN   rO   set_embed_and_head}  s   

z)MiMoV2FlashForCausalLM.set_embed_and_headrc  c                 C   s   | j | d S rt   )rt  rj  )rK   rc  rN   rN   rO   rj    r   z+MiMoV2FlashForCausalLM.load_kv_cache_scalesc                 C   s    t |jt|ddt|dd dS )Nrk   rS   r   )r  num_logical_experts
num_groups)r   r  r  )clsrr   rN   rN   rO   $get_model_config_for_expert_location  s
   

z;MiMoV2FlashForCausalLM.get_model_config_for_expert_location)Nr6   r  )!r[   r\   r]   #default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingr   r   r%   r_   rH   propertyr}  rg   r   rT  r   rm  rR  no_gradr,   r-   rY   rL  rM  r   r   r  r  r  rj  classmethodr  ra   rN   rN   rL   rO   rn    sb    $


irn  )]loggingtypingr   r   r   r   r   r   rg   torch.nn.functionalr   
functionalru   *sglang.srt.batch_overlap.two_batch_overlapr	   sglang.srt.distributedr
   r   r   r   #sglang.srt.eplb.expert_distributionr   sglang.srt.eplb.expert_locationr   (sglang.srt.eplb.expert_location_dispatchr   sglang.srt.layers.activationr   sglang.srt.layers.communicatorr   r   r   r   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.moer   r   r    "sglang.srt.layers.moe.ep_moe.layerr!   r"   sglang.srt.layers.moe.topkr#   r$   *sglang.srt.layers.quantization.base_configr%   !sglang.srt.layers.radix_attentionr&   "sglang.srt.layers.rotary_embeddingr'   sglang.srt.layers.utilsr(   r)   *sglang.srt.layers.vocab_parallel_embeddingr*   r+   ,sglang.srt.model_executor.forward_batch_infor,   r-   $sglang.srt.model_loader.weight_utilsr.   r/   sglang.srt.server_argsr0   sglang.srt.utilsr1   r2   r3   r4   r   	getLoggerr[   r  rl  r5   rb   rz   r   r   r9  rn  
EntryClassrN   rN   rN   rO   <module>   sV    
;" ` $ _  g