o
    پi                    @  s  d Z ddlmZ ddlZddlZddlmZ ddlmZm	Z	m
Z
mZmZmZmZ ddlZddlm  mZ ddlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZA ddlBmCZCmDZDmEZEmFZF ddlGmHZH ddlImJZJmKZKmLZLmMZMmNZN ddlOmPZP ddlQmRZRmSZSmTZTmUZU ddlVmWZW ddlXmYZYmZZZm[Z[ ddl\m]Z] ddl^m_Z_ ddl`maZa dd lbmcZcmdZdmeZe dd!lfmgZgmhZh dd"limjZjmkZk dd#llmmZm dd$lnmoZo dd%lpmqZqmrZrmsZs dd&ltmuZu dd'lvmwZw dd(lxmyZy dd)lzm{Z{m|Z| dd*l}m~Z~mZ dd+lmZ dd,lmZmZ dd-lmZ dd.lmZmZmZmZmZmZmZmZmZmZmZmZmZ dd/lmZ dd0lmZ dd1lmZmZmZmZmZmZmZmZ erdd2lmZ dd3lmZmZ dd4lmZmZmZ dd5lmZmZmZ erdd6lmZmZmZ n&erernerdd7lmZ nerdd8lmZmZmZmZmZmZ n	 eeZg d9ZG d:d; d;ejZG d<d= d=ejZG d>d? d?ejZG d@dA dAejeZG dBdC dCejZG dDdE dEejZG dFdG dGejeZG dHdI dIeƃZG dJdK dKeƃZeeegZdS )Lz Inference-only DeepseekV2 model.    )annotationsN)nullcontext)AnyDictIterableListOptionalTupleUnion)nn)PretrainedConfig)SboFlagscompute_overlap_args)MaybeTboDeepEPDispatchermodel_forward_maybe_tbo)is_in_piecewise_cuda_graph)get_nsa_index_head_dimget_nsa_index_n_headsget_nsa_index_topkis_deepseek_nsa)divide"get_moe_expert_parallel_world_sizeget_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather tensor_model_parallel_all_reduce)envs)'get_global_expert_distribution_recorder)ModelConfigForExpertLocation)ExpertLocationDispatchInfo)deep_gemm_wrapper)
SiluAndMul)PackWeightMethod)Indexer)can_cp_splitcp_all_gather_rerange_outputcp_split_and_rebuild_datacp_split_and_rebuild_positionis_nsa_enable_prefill_cpnsa_use_prefill_cpprepare_input_dp_with_cp_dsa)LayerCommunicatorLayerScatterModesenable_moe_dense_fully_dpget_attn_tp_context)NSACPLayerCommunicator)get_attention_cp_rankget_attention_cp_sizeget_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)get_moe_a2a_backendget_moe_runner_backend/should_use_flashinfer_cutlass_moe_fp4_allgather)get_moe_impl_class)FusedMoE)KTEPWrapperMethod)BaseDispatcherCombineInputDispatchOutput)TopKTopKOutputFormat)RoutingMethodType%filter_moe_weight_param_global_expert)QuantizationConfig)	Fp8Config)	fp8_dtypeper_tensor_quant_mla_fp8.per_token_group_quant_mla_deep_gemm_masked_fp8)RadixAttention)get_rope_wrapper)PPMissingLayer)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)AttentionBackendRegistry)AttnForwardMethodDeepseekMHAForwardMixin)DeepseekV2WeightLoaderMixin)&FORWARD_ABSORB_CORE_ATTENTION_BACKENDS
_device_sm_get_llama_4_scaling_is_cpu_is_cpu_amx_available_is_cublas_ge_129_is_cuda_is_gfx95_supported_is_hip_is_npu
_use_aiter_use_aiter_gfx95yarn_get_mscale)get_global_server_args)SpeculativeAlgorithm)BumpAllocator	LazyValue
add_prefixget_bool_env_varis_non_idle_and_non_emptylog_info_on_rank0make_layersuse_intel_amx_backend)Gbatched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant)fused_flatten_fp8_group_quantfused_rms_fp8_group_quant)batched_gemm_afp4wfp4_pre_quantfused_flatten_mxfp4_quantfused_rms_mxfp4_quant)aiter_dsv3_router_gemmfused_qk_rope_cat_and_cache_mla(get_dsv3_gemm_output_zero_allocator_size)bmm_fp8dsv3_fused_a_gemmdsv3_router_gemm)!decode_attention_fwd_grouped_rope)forward_dsa_core_npuforward_dsa_prepare_npuforward_mha_core_npuforward_mha_prepare_npuforward_mla_core_npuforward_mla_prepare_npu)fa3nsa
flashinfercutlass_mla
trtllm_mlaascendc                      s>   e Zd Z					dd fddZ				ddddZ  ZS ) DeepseekV2MLPNT hidden_sizeintintermediate_size
hidden_actstrquant_configOptional[QuantizationConfig]reduce_resultsboolprefixtp_rankOptional[int]tp_sizereturnNonec	           	   
     s   t    || _t||gd d|td|||d| _t||d||td|||d| _t| jds<t| jdr<| jj	| j_
t| jdsNt| jdrN| jj	| j_
|d	krZtd
| dt | _d S )N   Fgate_up_projbiasr   r   r   r   	down_projr   r   r   r   r   r   weightweight_packedsiluUnsupported activation: !. Only silu is supported for now.)super__init__r   r7   ri   r   r9   r   hasattrr   r   
ValueErrorr!   act_fn)	selfr   r   r   r   r   r   r   r   	__class__ Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/deepseek_v2.pyr      sD   
	

zDeepseekV2MLP.__init__Fshould_allreduce_fusionuse_reduce_scattergemm_output_zero_allocatorrg   c           	      C  s   | j dkr|jd dkr|S |d ur;|jd dkr;| jjjtjkr;||jd | jj 	|jd | jj}|d |f}| |\}}| 
|}| j||pM|d\}}|S )N   r      )skip_all_reduce)r   shaper   r   dtypetorchuint8allocateoutput_size_per_partitionviewr   r   )	r   xforward_batchr   r   r   ygate_up_r   r   r   forward  s$   


zDeepseekV2MLP.forward)NTr   NN)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   NFFN)r   r   r   r   r   rg   __name__
__module____qualname__r   r   __classcell__r   r   r   r   r      s    3r   c                      s4   e Zd Z		dd fddZ				ddddZ  ZS )MoEGater   Fr   r   is_nextnr   c                   s   t    || _tt|j|jf| _	|j
dkr<|d ur,| dkr,t  r,tjntj}ttj|j|d| _nd | _trJtrJtdgd| _t | _d S )Nnoaux_tcmodelopt_fp4r   r   )weight_names)r   r   r   r   	Parameterr   emptyn_routed_expertsr   r   topk_methodget_namer<   is_flashinfer_trtllmbfloat16float32e_score_correction_biasr[   r\   r"   quant_methodr(   nsa_enable_prefill_cp)r   configr   r   r   correction_bias_dtyper   r   r   r   '  s(   

zMoEGate.__init__Nr   rg   r   rR   c                 C  s   t | rtjj|| jd dS t jrt	|| jd S |d ur-t
|r-t	|| jd }|S tr\|jd dkr\|jd dkr\| jjd dksM| jjd dkr\tdkr\t|| jtjd	}|S trn|jd dkrnt|| j|}|S t	|| jd }|S )
NTr      r      r   i  Z   )	out_dtype)rn   r   ops
sgl_kernelweight_packed_linearr   re   enable_deterministic_inferenceFlinearr)   r^   r   rY   rz   r   rc   ru   )r   hidden_statesr   r   logitsr   r   r   r   D  s:    

zMoEGate.forward)r   F)r   r   r   r   NN)r   rg   r   rR   r   r   r   r   r   r   &  s     r   c                      s   e Zd Z				d<d= fddZdd Z				d>d?ddZ			d@dAdd Z			d@dAd!d"Z	dBdCd#d$ZdDd&d'Z		dEdFd(d)Z
d*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Zd8d9 Zd:d; Z  ZS )GDeepseekV2MoENr   Fr   r   layer_idr   r   r   r   r   
alt_streamOptional[torch.cuda.Stream]r   r   c           
        s  t    t | _t | _|j| _|j| _t j	rdn|j| _
|| _|| _|| _|| _| j|jkr>td| j d|j d|jdkrLtd|j dt||td||d	| _d }| jd
krk| j
dkrkdt| j }t||j| j
 t j | j
|j| j
 |j|j| j|| jt|dtjtd|d
| _t|j| j
 | j|j d|j!| j
|j"| jj#|| j| jj$||d u rt% & st'j(nd d| _)d| _*d| _+d | _,|jd urq| j
dkrq|j|j }t-d|j||j|dtd|dt. / st. 0 st. 1 st. 2 st. 3 st4 rt5dd
dni | _6t7| j6j8j9do*| j6j8j9j:; dv }	|	 o7| j6j8j<j=t>j?k| _*|	 oE| j6j8j<j=t>j@k| _+| j+rqtArX|jBCddkrXn| j6j8j9j:jD| j6jEj9j:jDksiJ | j6j8j9j:jD| _,|j| _Ft. / st. 0 st. 1 st. 2 rt | _G|jt j | _H|j | _I|j"| _"|j!| _J| jj#d ur| jj#jKnd | _Lt. / pt. 0 pt. 1 pt. 2 pt. 3 | _MtNO | _Pd S )Nr   zTensor parallel size z' is greater than the number of experts .r   r   r   gate)r   r   r   r   r         ?routing_method_typeexperts)
num_expertsnum_fused_shared_expertstop_kr   r   r   r   routed_scaling_factorr   r   T)r   r   renormalizeuse_grouped_topknum_expert_groupr   
topk_groupcorrection_biasr   r   %apply_routed_scaling_factor_on_output#fused_shared_experts_scaling_factoroutput_formatFshared_experts)r   r   r   r   r   r   )r   r   r      awq	moe_wna16
awq_marlinr   zcompressed-tensorsr   )Qr   r   r   r   r   moe_ep_sizer   n_shared_expertsre   disable_shared_experts_fusionr   r   r   r   r   r   r   r   r   ri   r   floatr>   ep_num_redundant_expertsnum_experts_per_tokr   moe_intermediate_sizegetattrrF   
DeepSeekV3r   rD   norm_topk_probn_groupr   r   )should_fuse_routed_scaling_factor_in_topkr<   r   rE   STANDARDtopkshared_experts_is_int8shared_experts_is_fp8 shared_experts_weight_block_sizer   r;   	is_deepepis_mooncakeis_moriis_ascend_fuseepis_flashinferr=   dictr   r   r   r   r   r   r   r   r   int8float8_e4m3fnrb   quantization_configgetweight_block_sizer   r   ep_sizer   r   r   datar   _enable_a2a_moer   fuse_shared_experts_inside_sbo_fuse_shared_experts_inside_sbo)
r   r   r   r   r   r   r   r   r   is_packed_weightr   r   r   r   q  s,  
	



	



zDeepseekV2MoE.__init__c                   s    fdd j  D S )Nc                   s.   g | ]\}}|d vrt || jjr|jqS ))r   )rG   r   num_local_expertsr  ).0namer   r   r   r   
<listcomp>%  s    
z1DeepseekV2MoE.get_moe_weights.<locals>.<listcomp>)r   named_parametersr&  r   r&  r   get_moe_weights$  s   
zDeepseekV2MoE.get_moe_weightsr   torch.Tensorr   Optional[ForwardBatch]r   r   r   rg   r   c                 C  sf   | j s-ddlm} | jd ur%| jdkr%|jd dkr%| r%| ||||S | ||||S | ||S )Nr   get_is_capture_mode)	r  +sglang.srt.model_executor.cuda_graph_runnerr-  r   r   r   forward_normal_dual_streamforward_normalforward_deepep)r   r   r   r   r   r   r-  r   r   r   r   .  s(   

zDeepseekV2MoE.forwardc           
      C  s   t j }| j| | ||}t j| j( | ||}| ||}| 	||}	t
r4t| j	jtr9|	| j9 }	W d    n1 sCw   Y  || j |	|7 }	| jdkrb|sb|sbt sbt|	}	|	S )Nr   )r   cudacurrent_streamr   wait_stream_forward_shared_expertsstreamr   r  r   r^   
isinstancer   r@   r   r   r=   r   )
r   r   r   r   r   r3  shared_outputrouter_logitstopk_outputfinal_hidden_statesr   r   r   r/  O  s0   


z(DeepseekV2MoE.forward_normal_dual_streamc           
        s  t drtjjr|S jd dkr.js!  }	|}n	d j	
j}jr^d d fdd}dfdd}jj|jj||}	tshtrotjjtrt|	j9 }	d ur||	7 }	jdkr|s|st st|	}	|	S )Nr   r   
dispatcherrA   combine_inputrB   c                   sX   j tj  tjj   W d    n1 s!w   Y    d S N)r   r4  r   r2  r3  r6  r5  remove)r<  r=  )r   r   pre_combine_hook_handler   r8  r   r   _pre_combine_hook  s   z7DeepseekV2MoE.forward_normal.<locals>._pre_combine_hookr   r*  c                   s   t j j    d S r>  )r   r2  r3  r4  r   r?  r<  r   )post_combine_hook_handler   r8  r   r   _post_combine_hook  s   z8DeepseekV2MoE.forward_normal.<locals>._post_combine_hookr   r<  rA   r=  rB   r<  rA   r   r*  )r   rn   r   r   forward_cpur   r!  r5  r   r  empty_topk_outputdevicer   r<  register_pre_combine_hookregister_post_combine_hookr^   rb   r7  r   r@   r   r   r=   r   )
r   r   r   r   r   r9  r:  rA  rD  r;  r   )r   r   rC  r@  r   r8  r   r0  p  sZ   

zDeepseekV2MoE.forward_normalc                 C  s   |  |}| ||}| j||d}t| jjt| jjks J tjj	
|| jjj| jjj|| jd| j| j| jr=| jjjn	| jrE| jjjnd | jrN| jjjn	| jrV| jjjnd | jr]| jnd d}| jdkrl|slt|}|S )Nr   r:  Tr   )r   r  r   rn   r   r   r   r   r   r   shared_expert_cpur   r   r  r  weight_scaleweight_scale_invr  r   r   )r   r   r   r9  r:  fused_experts_outr;  r   r   r   rG    sJ   

"zDeepseekV2MoE.forward_cpurR   c                   s  d j oj }|ot }|ot }jd dkrpj|d}|s`jd ur[jt	j
  t	j
j j j }W d    n1 sUw   Y  nj||jtjjdd}njj}|rd d fdd}	dfdd}
dfdd}tjjtsJ jj|	 jj|
jj|nf|rd dfdd}
dfdd}dfdd}jj|
jj|jj|n2tj ! rdfdd}
dfdd}dfdd}jj|
jj|jj|j|d}jd dkr:|s:jd ur:t	j
 "| d ur^}jj#sIt$rR|%| |}|S |j%|j&d |}|S jj#skt$sk|j&9 }|S ) Nr   )r   r   )num_token_non_paddedexpert_location_dispatch_infor<  rA   c                   s      D ]}|  qd S r>  )r5  r?  )r<  handle)deepep_dispatch_hook_handler   r   r8  r   r   _deepep_dispatch_hook  s   

z;DeepseekV2MoE.forward_deepep.<locals>._deepep_dispatch_hookdispatch_outputrC   c                   <   t |j\}}}| j||d jj||d    d S N)combine_overlap_argsmeta_overlap_args)down_gemm_overlap_argsr[  r   r   set_overlap_argsr   r?  r<  rW  rZ  r\  r[  post_dispatch_hook_handler   r   r   _post_dispatch_hook$  s   
z9DeepseekV2MoE.forward_deepep.<locals>._post_dispatch_hookr   r*  c                      |    j      d S r>  clear_overlap_argsr   r?  rB  rC  r   r   r   rD  4     
z8DeepseekV2MoE.forward_deepep.<locals>._post_combine_hookc                   rX  rY  r]  r_  r`  r   r   rb  K     
r=  rB   c                   sb   | j d }d ur|  t| j d   W d    n1 s&w   Y    d S )Nrecord_event_after_downcompute_num_sms)r[  r  recordr    configure_deep_gemm_num_smsr5  r?  r<  r=  e)r   r@  r   r8  r   r   rA  ]  s   z7DeepseekV2MoE.forward_deepep.<locals>._pre_combine_hookc                   rc  r>  rd  rB  rf  r   r   rD  p  rg  c                   rX  rY  r]  r_  r`  r   r   rb    rh  c                   s(   | j d }d ur|     d S )Nri  )r[  r  rk  r?  rm  )r@  r   r   rA    s
   c                   rc  r>  rd  rB  rf  r   r   rD    rg  rL  alpha)r<  rA   )r<  rA   rW  rC   rF  rE  )'r!  r   r   )enable_dispatch_shared_one_stream_overlap(enable_combine_shared_two_stream_overlapr   r   r   r4  r   r2  r3  r6  r5  record_streamrecord_eventr  rR  r   init_newr   rH  rI  r7  r   r<  r   register_deepep_dispatch_hookregister_post_dispatch_hookrK  rJ  r   3SGLANG_BLACKWELL_OVERLAP_SHARED_EXPERTS_OUTSIDE_SBOr  
wait_eventr  rb   add_r   )r   r   r   sbo_enabled_flagsbo_overlap_dispatch_flagsbo_overlap_combine_flagr9  shared_eventr:  rV  rb  rD  rA  r;  r   r   )rU  r   rC  ra  r@  r   r8  r   r1    s   




		


zDeepseekV2MoE.forward_deepepc                 C  s*   |j d dkr| jdkr| j||dS d S )Nr   )r   )r   r   r   )r   r   r   r   r   r   r5    s
   z%DeepseekV2MoE._forward_shared_expertsc                 C  s,   t |jj|jr| |j|_d S d |_d S r>  )rk   r   forward_modehidden_states_mlp_inputr   r9  r   stater   r   r   op_gate  s
   

zDeepseekV2MoE.op_gatec                 C  s<   | d}| jdkrt|jj|r| ||_d S d |_d S Nr  r   )popr   rk   r   r  r   r8  )r   r  r  r   r   r   op_shared_experts  s   

zDeepseekV2MoE.op_shared_expertsc                 C  s   | d}|j}|d ur7t | j | j|||jjtj	| jdd|_
W d    d S 1 s0w   Y  d S | j|j|_
d S )Nr9  rQ  )r   r9  rR  rS  )r  r  r   with_current_layerr   r  r   rR  r   ru  r:  rH  rI  )r   r  r9  r   r   r   r   op_select_experts  s    

"zDeepseekV2MoE.op_select_expertsc                 C  s4   | j dkr| jjj|j|d|dd d S d S )Nr   r:  tbo_subbatch_index)r   r:  r  )r  r   r<  
dispatch_ar  r  r  r  r   r   r   op_dispatch_a  s   

zDeepseekV2MoE.op_dispatch_ac                 C  sZ   | j dkr+t | j | jjj|dd|_W d    d S 1 s$w   Y  d S d S Nr   r  )r  )	r  r   r  r   r   r<  
dispatch_br  rW  r  r   r   r   op_dispatch_b  s   

"zDeepseekV2MoE.op_dispatch_bc                 C  s   | j j|jd|_d S )N)rW  )r   run_moe_corerW  r=  r  r   r   r   
op_experts  s   zDeepseekV2MoE.op_expertsc                 C  s:   | j dkr| jjj|d|dd |d d S d S )Nr   r=  r  )r=  r  rW  )r  r   r<  	combine_ar  r  r  r   r   r   op_combine_a  s   
zDeepseekV2MoE.op_combine_ac                 C  s*   | j dkr| jjj|dd|_d S d S r  )r  r   r<  	combine_br  hidden_states_after_combiner  r   r   r   op_combine_b  s
   
zDeepseekV2MoE.op_combine_bc                 C  sJ   | d}| d }d ur|}|j|| jd |}n|| j9 }||_d S )Nr  r8  ro  )r  rz  r   hidden_states_mlp_output)r   r  r;  r8  r   r   r   r   	op_output  s   


zDeepseekV2MoE.op_output)Nr   NF)r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r*  r   r+  r   r   r   r   r   rg   r   r*  )FFN)
r   r*  r   r   r   r   r   rg   r   r*  F)r   r*  r   r   r   r*  )r   r*  r   rR   r   r*  r>  )r   rg   )r   r   r   r   r)  r   r/  r0  rG  r1  r5  r  r  r  r  r  r  r  r  r  r   r   r   r   r   r   o  sH     4$$Q
8 W
			r   c                      s   e Zd Z									dNdO fd"d#ZdPd'd(Zd)d* Zd+d, Z	dQdRd4d5Z	dQdRd6d7Zd8d9 Z	dSd:d;Z
dTd<d=Zd>d? Z	dQdRd@dAZdBdC ZdUdDdEZdUdFdGZdHdI ZdJdK ZedLdM Z  ZS )VDeepseekV2AttentionMLA'  N    Tr   Fr   r   r   r   	num_headsqk_nope_head_dimqk_rope_head_dim
v_head_dimq_lora_rankkv_lora_rank
rope_thetar  rope_scalingOptional[Dict[str, Any]]max_position_embeddingsr   r   r   r   r   r   r   r   r   	skip_roper   r   c                   sd  t    || _|| _|| _|| _|| | _|| _|| _|| _	|| _
t }t }t|| _t | _| jr;| js;J d| jrE| jrEt | _|| _|| dksPJ || | _| jd | _|	| _|| _t j| _|
rld|
d< | jd urt| j| j| j	 | j d|td|d| _t| j|jd	| _t || j| j d| !|td
|||d| _"n&t | j| j| j d|td|||d| _#t| j| j	| j d|td|d| _$| jrt%|dd }t&d;i d|dt'|dt(|d|dt)|d|d|d|	ddddd|
d|dtd|d|d |d!|| _*t | j	| j| j| j  d|td"|||d| _+t,| j| j | jd||td#|||d$| _-t| j	|jd	| _.|st%|d%d& }t/||||	|
|t j0d'| _1|
r|
2d(d}|
d) }t3|t4|}| j| | | _n
| j1j5| j1_6nd | _1|
d u| _7t8| j| j	| j | jd*|| j	|td+|d,| _9t8| j| j| j | j| j|| j|td-|d,| _:|| _;d | j:_+d | _<d | _=d.| _>d | _?d | _@d| _At jB| _Bd | _CtDd/d0| _EtF| d}|rtGrtHrtId1d2gd*d3gd*d3ggd4| _J|otF| jjJdo| jjJj
K d5v }|oI| oI| jjLjMtNjOkoI| jjLjPd d6koI| jjLjPd* d7koItQoId8tR  koGd9k n  | _S|oY| oY| jjLjMtNjTk| _U|oi| oi| jjLjMtNjVk| _Wd | _X| jWrtGrtHrt%| jjJd:dt%| j"jJd:dksJ t%| jjJd:d}|r| jjJj
jX| j"jJj
jXksJ | jjJj
jX| _X| Y  d S )<Nz.CP currently only supports deepseek v3.2 modelr   g      deepseek_yarn	rope_typeFfused_qkv_a_proj_with_mqa)r   r   r   epsq_b_projr   q_projkv_a_proj_with_mqaindexer_rope_interleaver   index_n_headsindex_head_dimrope_head_dim
index_topkr  r  r  	scale_fmtue8m0
block_size   r  is_neox_styler   indexerr   r   r   	kv_b_projo_projr   rope_interleaveT)
rotary_dimmax_positionbaser  r  rI  mscale_all_dimfactorr   attn_mqa)num_kv_headsr   r  r   r   attn_mhar   SGLANG_ROCM_FUSED_DECODE_MLAfalsew_kcw_vcr   )r   transpose_dimsr   i@  r   r   x   block_quantr   )Zr   r   r   r   r  r  qk_head_dimr  r  r  r   r2   r3   r   use_nsar(   r   r1   cp_sizer  num_local_headsscalingr  r  re   kv_cache_dtyper8   ri   r  r5   rms_norm_epsq_a_layernormr6   _get_q_b_proj_quant_configr  r  r  r  r#   r   r   r   r  r  r9   r  kv_a_layernormrN   rI  
rotary_embr  rd   r  forward_nativer   use_deepseek_yarn_roperM   r  r  r   r  r  w_scale	w_scale_k	w_scale_vuse_deep_gemm_bmmflashinfer_mla_disable_raggedcurrent_attention_backendrj   rocm_fused_decode_mlar   r[   r\   r"   r   r   r   r   r   r   r   r^   rY   use_min_latency_fused_a_gemmr  qkv_proj_with_rope_is_int8r  qkv_proj_with_rope_is_fp8r  init_mha_forward)r   r   r   r  r  r  r  r  r  r  r  r  r   r   r   r   r   r  attn_tp_rankattn_tp_sizer  r  scaling_factormscalehas_fused_projr"  use_block_quantr   r   r   r   -  s  









	
	
















zDeepseekV2AttentionMLA.__init__r   rR   rU   c                 C  sj   |j  r
t j}n|j  s|j  r$t jdkrt j}n	t j}nt j}|| _t	
|}|| |S )Ndecode)r  is_decode_or_idlere   decode_attention_backendis_target_verifyis_draft_extendspeculative_attention_modeprefill_attention_backendr  rT   get_handler)r   r   attention_backendhandlerr   r   r   dispatch_attn_forward_method+  s   





z3DeepseekV2AttentionMLA.dispatch_attn_forward_methodc                 C  s$   | j |j|d|j|jd|_d S )N!hidden_states_after_comm_pre_attn)	positionsr   r   zero_allocator)forward_preparer  r  r   r  attn_intermediate_stater  r   r   r   
op_prepareA  s   z!DeepseekV2AttentionMLA.op_preparec                 C  s   |  |d|_d S )Nr  )forward_corer  hidden_states_after_attnr  r   r   r   op_coreI  s   
zDeepseekV2AttentionMLA.op_corer  r*  r   r  rg   llama_4_scalingOptional[torch.Tensor]c                 C  s   | j |||||d}| |S )Nr  r   r   r  r  )r  r  )r   r  r   r   r  r  sr   r   r   r   N  s   
zDeepseekV2AttentionMLA.forwardc                 C  s  | j jd u r| j| j _t|tr*t js)|d jd dkr)| jjr%J d|d S nt jsC|jd dkrC| jjr=J d|d |d fS | 	|}|t
jkrV| ||||}ns|t
jkrd| ||||}ne|t
jkrr| ||||}nW|t
jkr| |||||}nH|t
jkr| ||||}n:|t
jkr| ||||}n,|t
jkrt| ||||}n|t
jkrt| ||||}n|t
jkrt| ||||}ntd |||fS )Nr   z-short-circuiting allreduce will lead to hangs)r  r  r7  tupler.   input_scatteredr   r  r   r  rU   MHAforward_normal_prepareMHA_CHUNKED_KV!forward_normal_chunked_kv_prepareMHA_ONE_SHOTforward_normal_one_shot_prepareMLAforward_absorb_prepareMLA_FUSED_ROPE%forward_absorb_fused_mla_rope_prepareMLA_FUSED_ROPE_CPU)forward_absorb_fused_mla_rope_cpu_prepareMHA_NPUr   MLA_NPUr   DSA_NPUr}   NotImplementedError)r   r  r   r   r  r  attn_forward_methodinner_stater   r   r   r  _  sv   















z&DeepseekV2AttentionMLA.forward_preparec                 C  s   |\}}}}|d u r|S |t jkr| j| S |t jkr | j| S |t jkr*| j| S |t jkr4| j| S |t j	kr>| j
| S |t jkrH| j| S |t jkrUt| g|R  S |t jkrbt| g|R  S |t jkrot| g|R  S tr>  )rU   r  forward_normal_corer  forward_normal_chunked_kv_corer  forward_normal_one_shot_corer
  forward_absorb_corer  "forward_absorb_fused_mla_rope_corer  &forward_absorb_fused_mla_rope_cpu_corer  r~   r  r   r  r|   r  )r   intermediate_stater   r  r   r  r   r   r   r    s.   















z#DeepseekV2AttentionMLA.forward_corec                 C  s`   | j d usJ t|ts'|jd dkr'|jd dkr'| jr't|| jjj}|S | |d }|S )Nr   r   r   )	r  r7  r  r   r  ry   r  r   T)r   r   r   
qkv_latentr   r   r   prepare_qkv_latent  s   
z)DeepseekV2AttentionMLA.prepare_qkv_latentc                 C  s\   | j dkrt jdkpt jdko|jjtjkS | j dko-|j	 p&|j
 o-|jjtjkS )zq
        Check if we should skip rope and do fused rope+quantize for TRTLLM MLA decode in fp8_e4m3 path.
        r   trtllmr   )r  re   nsa_decode_backendnsa_prefill_backendattn_backendr  r   r  r  r  r  	data_type)r   r   r   r   r   _fuse_rope_for_trtllm_mla  s   



z0DeepseekV2AttentionMLA._fuse_rope_for_trtllm_mlac                 C  s   | d|dd | jf< | d|d| jd f< t| | j|tj }|dd | jf d}|d| jd f d}||fS )Nr   .)	squeezer  r%   
contiguousr  r   r2  r3  	unsqueeze)r   latent_cacher   k_nopek_pelatent_cache_outputr   r   r   rebuild_cp_kv_cache  s   z*DeepseekV2AttentionMLA.rebuild_cp_kv_cachec                 C  s  ddl m} d }d }| jd urdt  j| j| j| j gdd\}	}
|
dd | jf }| jd urg| rgt	j
 }| j| | |	}	t	j
| j | |}W d    n1 s[w   Y  || j nstr| jjjt	jkrt|	| jj| jj|| jj| jj^}	}}}nSd }tr| jjjt	jkr| jrt|	| jj| jj|| jj| jjdt	jd dd
\}}}}|}	n&t|	| jj| jj|| jj| jjdt	jd d	d
\}	}}}n
| |	}	| |}| jr|d u r|	}| jd ur@| r@|j r@|d ur@t	j
 }| j| t	j
| j |d
}| |	d d| j| j}	W d    n	1 s)w   Y  | j ||||| j!d}|| j nJ|d
}| |	d d| j| j}	|d urc| j ||||| j!d}n&| "|d d| j| j}	| #|d }
|
dd | jf }| |d
}|	j| j$| jgdd\}}|
d| jd f d
}| j%rt&|'dd
\}}}}}|(| j|| jf}t)*||f| j+| j,f||| |d d d |d d f }nt-rOtr| j+jt	jkr|'dd
}t	j.|j/d |j/d
 | j+j/d |j0t	j1d}t2|| j+'dd| j,'ddt	j1| nptr8| j+jt	jkr8t3|| j+'dd| j4dd d	dt	j1d}nQt	5|6t	j1'dd
| j+6t	j1| j4 }n:| j+jt	jkr~t7|'dd
t8rjt	j9dt	j:|j0dn|;d
\}}t<|| j+|| j4t	j1}nt	5|'dd
| j+}|'dd
}| j=d ur| >|st?rt@r| jr| =|||\}}tA|r| B|
|||\}}|||||||||f	S )Nr   r,  dim.r  T)
group_sizedtype_quantres1output_unquantized_inp1Fr   )r   q_lorar  r   r   r   rI  r   XWQr  r1  YQtranspose_bmtranspose_bm_inr   r   r   rI  )Cr.  r-  r  r.   fetch_qkv_latentsplitr  r  r   r   r2  r3  r4  r  r6  r  rc   r  r   r   r   rt   variance_epsilonr  r  rq   r  r  r(  r   r  r  r  r   r  r  r  r  rL   	transpose	new_emptyr    grouped_gemm_nt_f8f8bf16_maskedr  r  r`   r   r   rI  r   rr   ro   r  bmmtorK   r]   zerosr   r   rx   r  r%  rb   r_   r)   r-  )r   r  r   r   r  r  r-  r5  topk_indicesqr)  r*  r3  r   	q_quantedq_nopeq_per+  
q_nope_valq_nope_scalemasked_m
expected_m	aligned_m
q_nope_outr   r   r   r   r    sv  

	












z-DeepseekV2AttentionMLA.forward_absorb_preparec
                 C  s$  d}
| j tv r3i }| |r| jj| jj|	d}| j||||f||d||d ur.t|dni }ngtri| jj	}| jj
}| jdkrDtn|j}t|||||j| jj|j|||| jj| jj|d\}}}}d}
ntj||gdd	}tj||gdd	}|	d ur||	9 }| j||||fd
|
i|d urt|dni }|d| j| j}| jrt|dd\}}}}}|| j|| jf}t||f| j | j!f||| |d d d |d d f dd"dd}n&t#rtr!| j jtj$kr!|dd}tj%|j&d |j&d | j j&d |j'tj(d}t)|| j dd| j!ddtj(| n5tr@| j*jtj+kr@t,|| j dd| j-dd ddtj(d}nt.|/tj(dd| j /tj(| j- }| j0j1jtj$krj|dd}t2|}n| j0j1jtj+kr|dd}t3|dtj+d}n|dd"dd}n|| j jtj+krt4|ddt5rtj6dtj7|j'dn|8d\}}t9|| j || j-tj(}|dd"dd}nCt: rt.|dd| j dd"dd}n+tj%|j&d | j| j f|j|j'd}tj.|dd| j |d| j| jddd | 0|\}}|S )NT)cos_sin_cacheis_neoxr  )q_ropek_rope)rI  fp8_e4m3)q_out_dtypeFr.  r/  save_kv_cacher   r   r   r6  r7  r  r8  )r1  r2  r>  r?  )out);r  rX   r%  r  rT  r  r  r  rc   	cos_cache	sin_cacher  rJ   r   rv   token_to_kv_poolget_key_bufferr   out_cache_lock_scaler   catr   r  r  r  rL   rC  rD  r  r    rE  r  r  flattenr`   r   r   r   rI  r   rr   r  r  ro   r  rF  rG  r  r   rs   rp   rK   r]   rH  r   r   rx   r   )r   rM  r+  rS  r*  r   r  r  rI  r  rZ  
extra_argsattn_outputcossinr  rJ  r   kattn_output_valattn_output_scalerP  rQ  rR  attn_bmm_outputr   outputr   r   r   r    s.  



(




z*DeepseekV2AttentionMLA.forward_absorb_corec                 C  s*  t dddk}t|tr|d n|}|jd }||| j| j| j }| j	d urR| 
|d j| j	| j| j gdd\}	}
| |	}	| |	d d| j| j}	n| |d d| j| j}	| |d }
|	j| j| jgdd\}}trt|tjdd| jtj| j }n/| jjtjkrt|dd|dtjd\}}t|| j|| jtj}nt|dd| j}|dd|dd | jf< |
dd | jf }|  |! "d}|
"d}||dd | jf< |s|d| jd f }| #|||\}}||d| jd f< ||d| jd f< d }nt$|d| jd f }||d| jd f< tj%|| j| jf|	j|	j&d	}|j'j(\}}}}}}}| j#j)}|j'j*}| j+j,}|d u rgtj%|j-| j|| jd ftj.|	j&d	}|j/0| j+|j1|d  |j/2| j+j3}|dd | jf }||||||||||||||||fS )
N#SGLANG_FUSED_MLA_ENABLE_ROPE_FUSION1r   r.  r/  r   r   .r?  )4osgetenvr7  r  r   rD  r  r  r  r  r  rA  r  r  r   r  r  r  r  r`   r   rF  rG  r   rC  r  r  r   r  rK   r   rx   r  r'  r(  r  
empty_liker   rI  r#  forward_metadatarT  num_kv_splitsr  r  
batch_sizer   r^  set_kv_bufferr`  r_  r   )r   r  r   r   r  enable_rope_fusionhidden_states_tensorq_lenq_inputrJ  r)  rL  rM  rS  rN  rO  v_inputk_inputr+  k_pe_outputre  attn_logitsr   	kv_indptr
kv_indicesrT  num_kv_splitsm_scalekey_cache_bufval_cache_bufr   r   r   r    s   








z<DeepseekV2AttentionMLA.forward_absorb_fused_mla_rope_preparec                 C  s   | j d ur	t| sJ dtjj|| jj| jj| j	| j
j| jj|| jj| jj| j| j| jr2| jjn| jr9| jjnd | jrA| jjn| jrH| jjnd d| j| j | j| j\}}}|||||fS )Nzdforward_absorb_fused_mla_rope_cpu_prepare requires q_lora_rank is not None and use_intel_amx_backendT)r  rn   r   r   r   qkv_proj_with_rope_fused_weightr  r   r  r  r  r  r  rT  rB  r  r  rN  rO  r  r  r  )r   r  r   r   r  ry  r{  rz  r   r   r   r    sF   



&z@DeepseekV2AttentionMLA.forward_absorb_fused_mla_rope_cpu_preparec                 C  s,  t |||||||| j| jj||	|
||| jj|| jjd |r2||d| jd f< |j| j|j	|d  |
d| j| j}trTt|tjdd| jtj| j }n/| jjtjkrxt|dd|dtjd\}}t|| j|| jtj}nt|dd| j}|dddd}| |\}}|S )N)	logit_capuse_roper  .r.  r   r   r   r   )r{   r  r  r  r  r  r  r^  ru  r`  r   r  r`   r   rF  rG  r   rC  r  r  r   r  rK   r   rx   rc  r  )r   ry  r  r  re  r~  r  r|  rT  r  r}  r  r  rv  r{  r   r  rk  ri  rj  rl  r   r   r   r   r  ;  s^   

z9DeepseekV2AttentionMLA.forward_absorb_fused_mla_rope_corec                 C  s   | j d ur	t| sJ d| ||||}|d| j| j}| jd}| jd}|d}	tj	|	t
|| g|jd}
|
|	||gdd}tjj||dd| jdd  |
}| |\}
}|
S )Nzaforward_absorb_fused_mla_rope_cpu_core requires q_lora_rank is not None and use_intel_amx_backendr.  r   r   r   T)r  rn   r  r   r  r  r  sizer   r   r   r   
transpose_r   r   bmm_cpurC  r  )r   ry  r{  rz  r   r  re  BNMrl  rk  r   r   r   r   r    s,   

z=DeepseekV2AttentionMLA.forward_absorb_fused_mla_rope_cpu_corec                 C  s   t j rtdddgdS | S )NTr  )is_checkpoint_fp8_serializedr  )r   "SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTNr  rI   )r   r   r   r   r    s   
z1DeepseekV2AttentionMLA._get_q_b_proj_quant_config)	r  Nr  NTNr   NF)$r   r   r   r   r  r   r  r   r  r   r  r   r  r   r  r   r  r  r  r  r  r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   )r   rR   r   rU   r>  )
r  r*  r   r*  r   rR   r  rg   r  r  )r   r*  r   rR   )r   rR   r   r   )r  r*  r   r*  r   rR   r  rg   )r   r   r   r   r  r  r  r   r  r  r  r%  r-  r  r  r  r  r  r  staticmethodr  r   r   r   r   r   r  +  sD     
H

 a 
;
y3I%r  c                      sj   e Zd Z					d.d/ fddZd0ddZ		d1d2d"d#Z	d3d4d&d'Zd(d) Zd*d+ Zd,d- Z	  Z
S )5DeepseekV2DecoderLayerNFr   r   r   r   r   r   r   moe_quant_config_overrider   r   r   r   r   r   r   r   c              	     s  t    |j| _|| _t|dr2|jd}|d us"J d| |jd}	|	dkr/|jnd }
n|j}|j}
|j	}t
t j| _t | _|| _|| _td"i d|d| jd|jd	|jd
|jd|jdt|drp|jnd d|jd|d|
d|d|d|dddtd|d|| _| j||d| _| j|d dd}| j|d dd}tj||rdn|j| j||d| _ | jrt!||p|td|| j||d| _"nt# rd\}}nd\}}t$|j|j%|j&|td|||d| _"t'|j|j(d| _)t'|j|j(d| _*| jr$t+| j | j)| j*d |p| j| jjd k| jj,d!| _-d S t.| j | j)| j*d |p7| j| jjd k| jj,d!| _-d S )#Nrope_parametersr  z rope_theta not found in config: r  defaultr   r   r  r  r  r  r  r  r  r  r   r   r   Fr   	self_attnr   )r   r   )r   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparsemlp)r   r   r   r   r   r   )r   r   r   )r   r   r   r   r   r   r   r  T)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatteris_last_layerqkv_latent_funcr   )/r   r   r   r   r   r  r  r  r  r  rf   from_stringre   speculative_algorithmr(   r   r   r   r  num_attention_headsr  r  r  r  r  ri   r  _is_layer_sparser  r,   ru  num_hidden_layersr  r   r  r-   r   r   r   r5   r  r  r  r/   r  layer_communicatorr+   )r   r   r   r   r  r   r   r   r  r  r  r  r  r  mlp_tp_rankmlp_tp_sizer   r   r   r     s   





	

zDeepseekV2DecoderLayer.__init__c                 C  s,   |p| j jd uo|| j jko|| j j dkS )Nr   )r   r   first_k_dense_replacemoe_layer_freq)r   r   r   r   r   r   r  (	  s   
z'DeepseekV2DecoderLayer._is_layer_sparser  r*  r   r   rR   residualr  r  rg   r   r  c                 C  s2  t r t| jdd d ur t| jjdd d ur | jjjjtjkr dn$t rCt| jdd d urCt| jjdd d urC| jjjjttdd krCdnd}| j	||||\}}| j|||||d}| j
|||\}}| j|}	| j|}
t| jtrxd }| |||	|
|}| js|	rd|_|	s| j|||\}}||fS )	Nr  r   mxfp4r  fp8r   r   T)r_   r  r  r  r   r   r   r   r  prepare_attnprepare_mlp)should_fuse_mlp_allreduce_with_next_layershould_use_reduce_scatterr7  r  r   r   _sglang_needs_allreduce_fusionpostprocess_layer)r   r  r   r   r  r  r   r  quant_formatr   r   r   r   r   r   /	  s   



zDeepseekV2DecoderLayer.forwardr  r   c                 C  s2   | j |||\|_|_|t||||d d S )N)r   r  r  r  )r  r  r  residual_after_input_lnupdater  )r   r  r  r   r   r  r  r  r   r   r   op_comm_prepare_attn	  s   
z+DeepseekV2DecoderLayer.op_comm_prepare_attnc                 C  s*   | j |d|d|j\|_|_d S )Nr  r  )r  r  r  r   r  residual_after_comm_pre_mlpr  r   r   r   op_comm_prepare_mlp	  s   z*DeepseekV2DecoderLayer.op_comm_prepare_mlpc                 C  sB   | d}t r| js|jd dks| ||j|_d S ||_d S r  )r  r-   r  r   r  r   r  )r   r  r   r   r   r   op_mlp	  s   


zDeepseekV2DecoderLayer.op_mlpc                 C  sR   | j |d|d|j\}}t|j|||j|j|jd}|jh dd |S )Nr  r  )r  r   r  r   r  r  >   r  r   r  r  )expect_keys)	r  r  r  r   r  r  r  r  clear)r   r  r   r  rl  r   r   r   op_comm_postprocess_layer	  s"   	z0DeepseekV2DecoderLayer.op_comm_postprocess_layer)NNFr   N)r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r  r*  r   r*  r   rR   r  r  r  rg   r   rg   r  r  r   r*  r>  )r  r*  r   r*  r   rR   r  r  r  rg   r  r   )r   r   r   r   r  r   r  r  r  r  r   r   r   r   r   r    s     
q`	r  c                      sB   e Zd ZdZ		dd fddZdddZ		ddddZ  ZS ) DeepseekV2ModelFNr   r   r   r   r   r   r   r   r   c              
     s
  t     j_ j_ j_t _t _	j	r!t
 _nd _jjr4t j jt d_nt _ts?tj rDtj nd _t j fddjjjjtd|tdd dd dd\_ _!_"jj#r{t$ j j%d	_&ntd
d_&d_'t(r j)dkrjj*dkrt+fddt,t-j D }d}t,t-j D ]2}t.j | j/t0rt1 }|2 p|3 p|4 }|rdnt5 }	 j6 j7 }
t8|
d |	}|} nqt9 j)||jj*_'g _:t1 2 st1 4 rd
_;nd_;t< dd _=d S )N)use_attn_tp_groupc                   s   t  | |jdS )N)r   r   r   r   r   )r  r   )idxr   r   r   r   r   r   <lambda>	  s    z*DeepseekV2Model.__init__.<locals>.<lambda>layersc                 S  s   t | jtr
| jjS | jS r>  )r7  r  r   r   )layerr   r   r   r  	  s   

c                 S  s.   t | trddgt| drddgS g S g S )N
w13_weight	w2_weightw13_blockscale_swizzledw2_blockscale_swizzled)r7  r?   r   )moduler   r   r   r  
  s   	
)submodule_accessorwhitelist_param_names_creator)pp_rankpp_sizer   offloader_kwargsr  T)return_tupler   r   r   c                   s"   g | ]}t  j| jtrd qS r>  )r7  r  r  r   )r$  ir&  r   r   r'  "
  s    z,DeepseekV2Model.__init__.<locals>.<listcomp>r   r   Fr  )>r   r   pad_token_id
padding_id
vocab_sizer  r   pp_groupr(   r   r1   r  is_first_rankrQ   r   r4   embed_tokensrO   r^   r   SGLANG_NPU_USE_MULTI_STREAMr  r   r2  Streamr   rm   r  rank_in_group
world_sizeri   r  r  start_layer	end_layeris_last_rankr5   r  normgemm_output_zero_allocator_sizerc   r   embedding_dimsumrangelenr7  r  r   r;   r  r  r  r   r  r  r   rw   layers_to_captureenable_a2a_moer  llama_4_scaling_config)r   r   r   r   num_moe_layersallocate_sizer  a2a_backend
is_a2a_moer   r   &share_expert_output_size_per_partitionr   r  r   r   	  s   


%


zDeepseekV2Model.__init__r*  c                 C  s   | j S r>  )r  r&  r   r   r   get_input_embeddingsP
  s   z$DeepseekV2Model.get_input_embeddings	input_idsr  r   rR   input_embedspp_proxy_tensorsOptional[PPProxyTensors]#Union[torch.Tensor, PPProxyTensors]c              
   C  s  | j | j }|d ur|jn|j}t|d |jrdnd tj|d}t| d}	|	r6| jdkr6t| jtj|dnd }
| j	j
rK|d u rF| |}n|}d }n|d usQJ |d }|d }t|rk| j	j
rft||}t||}d }| jd urt| jd | jd	 |d
}| j}| j }|jr| j|kr| j|k r| j}n	| j|k rd }}g }t||D ]T}t jrt nt |}|; || jv r| jr|| jkrt|| dd}|| n|||  | j| }|||||||
|\}}W d    n1 sw   Y  q|| j krt| j|| j  d||||| j|d  jj|d\}}| j	j s(t!||dS |j"# sA|d u r9| $|}n| $||\}}| j	j rVt|rVt%|| j&|tj'( }t)|dkr_|S ||fS )Nr   r   )buffer_sizer   rI  r  r   r   r   original_max_position_embeddingsbeta)r  scaling_betar  r/  T)r  
enable_tbor  r   r   r  input_data_scatter_moder  )r   r  )*r  r  rI  rg   can_run_tbor   r   r   r  r  r  r  r)   r&   r'   r  rZ   r  r  re   enable_piecewise_cuda_graphr   r   r  r  r  r   appendr  r   r  layer_output_moder  rS   r  is_idler  r%   r  r2  r3  r  )r   r  r  r   r  r  total_num_layersrI  r  has_gemm_output_zero_allocatorr   r   r  r  normal_start_layernormal_end_layeraux_hidden_statesr  ctxaux_hidden_stater  r   r   r   r   r   S
  s   














zDeepseekV2Model.forwardNr   r   r   r   r   r   r   r   r   )r   r*  r   )r  r*  r  r*  r   rR   r  r*  r  r  r   r  )r   r   r   fall_back_to_pt_during_loadr   r  r   r   r   r   r   r   r  	  s    
r  c                      s   e Zd Zi Z		d3d4 fddZedd Z	d5d6ddZd7ddZe	
 		d8d9ddZed d! Zed"d# Zd:d;d'd(Zd)d* Zd+d, Zed-d. Zd<d=d1d2Z  ZS )>DeepseekV2ForCausalLMNr   r   r   r   r   r   r   r   r   c                   sB  t    t|do|jd u _ jrddg jd< t  _| _t	  _
| _   t| _t||td|d _ jjra jjdkrO|jrO jj _nt|j|j|td|t jd	 _nt  _t| _t fd
d _ d _!t"  _# j#rt$  _%t&  _'nd   _% _'t|dr|jnd }t( )|t| d S )Nr  q_a_projr  r  model)r   r   lm_head)r   r   r  c                     s   dd t  jjD S )Nc                 S  s(   i | ]\}}t |jtr||j qS r   )r7  r  r   r)  )r$  r   r  r   r   r   
<dictcomp>  s    

zDDeepseekV2ForCausalLM.__init__.<locals>.<lambda>.<locals>.<dictcomp>)	enumerater  r  r   r&  r   r   r    s    
z0DeepseekV2ForCausalLM.__init__.<locals>.<lambda>F)*r   r   r   r  fuse_qkv_a_projpacked_modules_mappingr   r  r   r   r   r   "determine_num_fused_shared_expertsr   r  r  ri   r  r  r  tie_word_embeddingsr  r	  rP   r  r   re   enable_dp_lm_headrO   r:   logits_processorrh    _routed_experts_weights_of_layercapture_aux_hidden_statesr(   r   r0   cp_rankr1   r  r.   init_context)r   r   r   r   r  r   r&  r   r   
  sN   



	


zDeepseekV2ForCausalLM.__init__c                 C     | j jS r>  )r  valuer&  r   r   r   routed_experts_weights_of_layer!     z5DeepseekV2ForCausalLM.routed_experts_weights_of_layerDeepseekV3ForCausalLMarchitecturec                 C  s  d| _ t jr	d S d }| jjd |ks| jjdks| jjdkr"d}nFtr,tj	
ddk r9tr6tj	
ddk r9d}n/t dkrKtrHtj	
ddk rKd	}n|d u r\t  sYt  r\d
}n| jrh| j dkrhd}|d ur}dt _d| _ tt| d d S | jj| _ d S )Nr   r   r   z/Config does not support fused shared expert(s).r2  )   r   )	      zOnly Deepseek V3/R1 on NV-platform with capability >= 80 or AMD-platform with capability >= gfx942(MI30x) can use shared experts fusion optimization.zOnly Deepseek V3/R1 on AMD-platform with capability >= gfx942(MI30x) can use shared experts fusion optimization under expert parallelism.z]Deepseek V3/R1 cannot use shared experts fusion optimization under deepep expert parallelism.w4afp8z^Deepseek V3/R1 W4AFP8 model uses different quant method for routed experts and shared experts.Tz0 Shared experts fusion optimization is disabled.)r   re   r  r   architecturesr   r  r^   r   r2  get_device_capabilityr`   r   r;   r  r  r   r   rl   logger)r   r  disable_reasonr   r   r   r  %  sH   
z8DeepseekV2ForCausalLM.determine_num_fused_shared_expertsnn.Embeddingc                 C  r  r>  )r  r  r&  r   r   r   r  Q  s   z*DeepseekV2ForCausalLM.get_input_embeddingsr  r*  r  r   rR   r  r  r  c                 C  s   | j rtt|| j| j|rtt|| j| j|j |_	t
 | | |||||}W d    n1 s7w   Y  d }| jrE|\}}| jjrS| ||| j||S |S r>  )r   r$   r  r  r  r*   r  seq_lens_cputolistnsa_cp_metadatar.   maybe_input_scatteredr  r  r  r  r  r	  )r   r  r  r   r  r  r   r   r   r   r   r   T  s*   	
zDeepseekV2ForCausalLM.forwardc                 C  r  r>  )r  r  r&  r   r   r   r  u  r  z!DeepseekV2ForCausalLM.start_layerc                 C  r  r>  )r  r  r&  r   r   r   r  y  r  zDeepseekV2ForCausalLM.end_layerFweights"Iterable[Tuple[str, torch.Tensor]]c                 C  s   |  || d S r>  )do_load_weights)r   r)  r   r   r   r   load_weights}  s   z"DeepseekV2ForCausalLM.load_weightsc                 C  s   | j jj| jjfS r>  )r  r  r   r	  r&  r   r   r   get_embed_and_head  s   z(DeepseekV2ForCausalLM.get_embed_and_headc                 C  s8   | j j`| j`|| j j_|| j_tj  tj  d S r>  )r  r  r   r	  r   r2  empty_cachesynchronize)r   embedheadr   r   r   set_embed_and_head  s   

z(DeepseekV2ForCausalLM.set_embed_and_headc                 C  s   t |j|j|jdS )N)r  num_logical_experts
num_groups)r   r  r   r  )clsr   r   r   r   $get_model_config_for_expert_location  s
   z:DeepseekV2ForCausalLM.get_model_config_for_expert_location	layer_idsOptional[List[int]]c                 C  sX   | j jsd S |d u rd| _| jj}d|d |d g| j_d S d| _dd |D | j_d S )NTr      c                 S  s   g | ]}|d  qS r>  r   )r$  valr   r   r   r'    s    zFDeepseekV2ForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)r  r  r  r   r  r  r  )r   r7  r  r   r   r   set_eagle3_layers_to_capture  s   z2DeepseekV2ForCausalLM.set_eagle3_layers_to_capturer  r  )r  )r  r   )r   r$  r   )r  r*  r  r*  r   rR   r  r*  r  r  r   r*  r  )r)  r*  r>  )r7  r8  )r   r   r   r  r   propertyr  r  r  r   no_gradr   r  r  r,  r-  r2  classmethodr6  r;  r   r   r   r   r   r  
  s0    @

, 


r  c                   @     e Zd ZdS )r  Nr   r   r   r   r   r   r   r        r  c                   @  r?  )DeepseekV32ForCausalLMNr@  r   r   r   r   rB    rA  rB  )__doc__
__future__r   loggingro  
contextlibr   typingr   r   r   r   r   r	   r
   r   torch.nn.functionalr   
functionalr   transformersr   -sglang.srt.batch_overlap.single_batch_overlapr   r   *sglang.srt.batch_overlap.two_batch_overlapr   r   0sglang.srt.compilation.piecewise_context_managerr   sglang.srt.configs.model_configr   r   r   r   sglang.srt.distributedr   r   r   r   r   r   sglang.srt.environr   #sglang.srt.eplb.expert_distributionr   sglang.srt.eplb.expert_locationr   (sglang.srt.eplb.expert_location_dispatchr   sglang.srt.layersr    sglang.srt.layers.activationr!   sglang.srt.layers.amx_utilsr"   +sglang.srt.layers.attention.nsa.nsa_indexerr#   %sglang.srt.layers.attention.nsa.utilsr$   r%   r&   r'   r(   r)   r*   sglang.srt.layers.communicatorr+   r,   r-   r.   %sglang.srt.layers.communicator_nsa_cpr/   sglang.srt.layers.dp_attentionr0   r1   r2   r3   r4   sglang.srt.layers.layernormr5   sglang.srt.layers.linearr6   r7   r8   r9   "sglang.srt.layers.logits_processorr:   sglang.srt.layers.moer;   r<   r=   "sglang.srt.layers.moe.ep_moe.layerr>   ,sglang.srt.layers.moe.fused_moe_triton.layerr?   #sglang.srt.layers.moe.kt_ep_wrapperr@   +sglang.srt.layers.moe.token_dispatcher.baserA   rB   rC   sglang.srt.layers.moe.topkrD   rE   sglang.srt.layers.moe.utilsrF   rG   *sglang.srt.layers.quantization.base_configrH   "sglang.srt.layers.quantization.fp8rI   )sglang.srt.layers.quantization.fp8_kernelrJ   rK   rL   !sglang.srt.layers.radix_attentionrM   "sglang.srt.layers.rotary_embeddingrN   sglang.srt.layers.utilsrO   *sglang.srt.layers.vocab_parallel_embeddingrP   rQ   ,sglang.srt.model_executor.forward_batch_inforR   rS   ;sglang.srt.models.deepseek_common.attention_backend_handlerrT   ;sglang.srt.models.deepseek_common.attention_forward_methodsrU   rV   8sglang.srt.models.deepseek_common.deepseek_weight_loaderrW   'sglang.srt.models.deepseek_common.utilsrX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   sglang.srt.server_argsre    sglang.srt.speculative.spec_inforf   sglang.srt.utilsrg   rh   ri   rj   rk   rl   rm   rn   Xaiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quantro    aiter.ops.triton.fused_fp8_quantrp   rq   /sglang.srt.layers.quantization.rocm_mxfp4_utilsrr   rs   rt   #sglang.srt.layers.rocm_linear_utilsru   rv   rw   r   rx   ry   rz   ;sglang.srt.layers.attention.triton_ops.rocm_mla_decode_roper{   Esglang.srt.hardware_backend.npu.modules.deepseek_v2_attention_mla_npur|   r}   r~   r   r   r   	getLoggerr   r"  Moduler   r   r   r  r  r  r  r  rB  
EntryClassr   r   r   r   <module>   s   $ $	<("	

OI     A              F