o
    پi                    @  s  d dl mZ d dlZd dlmZmZmZmZmZm	Z	 d dl
Z
d dlm  mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8m9Z9 d dl:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC d dlDmEZE d dlFmGZGmHZH d dlImJZJ d dlKmLZLmMZMmNZNmOZOmPZP d dlQmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z] erd dl^m_Z_m`Z` d dlambZb d dlcmdZd eV ZeeU ZfeW ZgeR ZheT Zie7 ZjeSdoeeZkejlm o&eeZnens-ekrAd dlompZpmqZq d dlrmsZs d dltmuZu d d!gZvewexZyG d"d# d#e3ZzG d$d% d%e2Z{G d&d' d'e1Z|G d(d) d)eEZ}dS )*    )annotationsN)TYPE_CHECKINGAnyDictListOptionalUnion)Module)	Parameter)$get_tensor_model_parallel_world_sizeget_tp_group)use_symmetric_memory)envs)CPUQuantMethod!_amx_process_weight_after_loading)is_allocation_symmetric)	MoeRunnerMoeRunnerBackendMoeRunnerConfig)DeepGemmMoeQuantInfo)FlashInferTrtllmFp8MoeQuantInfo)TritonMoeQuantInfo)RoutingMethodTypeget_moe_runner_backend)BlockQuantScaleParameterModelWeightParameterPerTensorScaleParameter)FusedMoEMethodBaseLinearMethodBaseQuantizationConfigQuantizeMethodBase)	fp8_dtypeis_fp8_fnuzper_token_group_quant_fp8scaled_fp8_quant)	apply_fp8_linearcan_auto_enable_marlin_fp8cutlass_fp8_supporteddispatch_w8a8_block_fp8_linearinput_to_float8mxfp8_group_quantizenormalize_e4m3fn_to_e4m3fnuzrequant_weight_ue8m0_inplacetriton_mxfp8_blockscaled_linear)BaseKVCacheMethod)apply_fp8_marlin_linearprepare_fp8_layer_for_marlin)UnquantizedLinearMethod)all_close_1dconvert_to_channelwiseis_layer_skippedper_tensor_dequantizerequantize_with_max_scale)cpu_has_amx_supportget_bool_env_varis_cpuis_cudais_hipis_npuis_sm90_supportedis_sm100_supportedlog_info_on_rank0print_warning_onceset_weight_attrsuse_intel_amx_backend)CombineInputDispatchOutput)
TopKOutput)W4AFp8ConfigSGLANG_INT4_WEIGHT)ActivationType	QuantType)	fused_moe)shuffle_weightstaticdynamicc                   @  sx   e Zd ZdZ					d)d*ddZd+ddZed,ddZd-ddZed.ddZ	ed/dd Z
d0d%d&Zd.d'd(ZdS )1	Fp8ConfigzConfig class for FP8.FrM   Nis_checkpoint_fp8_serializedboolactivation_schemestrignored_layersOptional[List[str]]weight_block_size	List[int]	use_mxfp8returnNonec                 C  s   || _ |r
ttd |tvrtd| || _|pg | _|| _|d urF|s*tdt|dkr:tdt| d|dkrFtd| d	| jr\|d u rRd
dg}n
|d
dgkr\td|| _	d S )NzDetected fp8 checkpoint.zUnsupported activation scheme zLThe block-wise quantization only supports fp8-serialized checkpoint for now.   zFThe quantization block size of weight must have 2 dimensions, but got z dimensions.rM   zUThe block-wise quantization only supports dynamic activation scheme for now, but got z activation scheme.       z)MXFP8 requires weight_block_size=[1, 32].)
rO   r?   loggerACTIVATION_SCHEMES
ValueErrorrQ   rS   rW   lenrU   )selfrO   rQ   rS   rU   rW    rb   V/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/fp8.py__init__r   s6   




zFp8Config.__init__c                 C     | j rdS dS )Nmxfp8fp8rW   ra   rb   rb   rc   get_name      zFp8Config.get_nameList[torch.dtype]c                 C  s   t jt jgS N)torchbfloat16halfclsrb   rb   rc   get_supported_act_dtypes   s   z"Fp8Config.get_supported_act_dtypesintc                 C  re   )Nd   P   rh   ri   rb   rb   rc   get_min_capability   rk   zFp8Config.get_min_capability	List[str]c                 C     g S rm   rb   rq   rb   rb   rc   get_config_filenames   s   zFp8Config.get_config_filenamesconfigDict[str, Any]c                 C  s   |  |dg}d|v }d|v p|}|  |dg}| |ddgd }|r2d|dd	v r2d
d |D }| |dgd }|rI|d urItd ddg}| |||||dS )Nquant_methodrf   rg   rQ   rS   modules_to_not_convertmistral3
model_type c                 S  s   g | ]}| d dqS )zmodel.r   )replace).0layerrb   rb   rc   
<listcomp>   s    z)Fp8Config.from_config.<locals>.<listcomp>rU   zQMXFP8 ignoring incoming weight_block_size in config.json; it is fixed to [1, 32].r[   r\   )rO   rQ   rS   rU   rW   )get_from_keysget_from_keys_orgetr]   warning)rr   r{   r}   rW   rO   rQ   rS   rU   rb   rb   rc   from_config   s2   
zFp8Config.from_configr   torch.nn.ModuleprefixOptional[QuantizeMethodBase]c                 C  sp   ddl m} ddlm} ddlm} t||r$t|| jr t	 S t
| S t||r-t| S t||r6t| S d S )Nr   )
LinearBase)FusedMoE)RadixAttention)sglang.srt.layers.linearr   &sglang.srt.layers.moe.fused_moe_tritonr   !sglang.srt.layers.radix_attentionr   
isinstancer4   rS   r1   Fp8LinearMethodFp8MoEMethodFp8KVCacheMethod)ra   r   r   r   r   r   rb   rb   rc   get_quant_method   s   


zFp8Config.get_quant_methodc                 C  ry   rm   rb   ri   rb   rb   rc   get_scaled_act_names   s   zFp8Config.get_scaled_act_names)FrM   NNF)rO   rP   rQ   rR   rS   rT   rU   rV   rW   rP   rX   rY   )rX   rR   )rX   rl   )rX   rt   )rX   rx   )r{   r|   rX   rN   )r   r   r   rR   rX   r   )__name__
__module____qualname____doc__rd   rj   classmethodrs   rw   rz   r   r   r   rb   rb   rb   rc   rN   o   s$    
$

rN   c                   @  sb   e Zd ZdZd(ddZ	d)d*ddZ	d)d+ddZd,ddZd,ddZd,dd Z		!d-d.d&d'Z
d!S )/r   a  Linear method for FP8.

    It supports the following quantization schemes:
    - Per-channel weight quantization + per-token activation quantization
    - Per-tensor weight quantization + per-tensor activation quantization
    - Blockwise weight quantization + blockwise activation quantization

    It supports the following checkpoint formats:
    - FP8 checkpoint
    - FP16/BF16 checkpoint. In this case, the weights will be quantized to FP8 during the weight loading.

    Notes:
    - The activation quantization scheme can be static or dynamic. The dynamic activation quantization is more commonly used.
    - On NV platforms, the per-channel weight quantization is used by default, if block quantization is not enabled.

    Args:
        quant_config: The quantization config.
    quant_configUnion[Fp8Config, W4AFp8Config]c                 C  s|   || _ t | _d| _trtd}t }|p|| _t| j dd| _| jp(| j jd u| _	t
 | _| j j| _tj | _d| _d S )NFSGLANG_FORCE_FP8_MARLINrW   )r   r'   
use_marlin_is_cudar8   r&   getattrrW   rU   block_quantr(   w8a8_block_fp8_linearrO   r   SGLANG_USE_AITER_FP8_PER_TOKENr   use_aiter_fp8_per_tokenuse_per_token_if_dynamic)ra   r   force_marlinauto_enablerb   rb   rc   rd      s   

zFp8LinearMethod.__init__F
input_sizert   input_size_per_partitionoutput_sizeoutput_size_per_partitionoutput_partition_sizesrV   skip_block_quant_checkrP   c                 C  s   t  }| jjd | jjd }}	|rtd d S |dkr3|| |kr3||	 dkr3td| d|	 d|dkr=|| |ksCt|dkrY|D ]}
|
| dkrXtd|
 d| dqEd S d S )	Nr   r[   z8Skipping block quantization checks for weight partition.z"Weight input_size_per_partition = 3 is not divisible by weight quantization block_k = .zWeight output_partition_size = 3 is not divisible by weight quantization block_n = )r   r   rU   r@   r_   r`   )ra   r   r   r   r   r   r   tp_sizeblock_nblock_koutput_partition_sizerb   rb   rc   validate_block_quant_shapes  s>   	

z+Fp8LinearMethod.validate_block_quant_shapesr   r   params_dtypetorch.dtypec                 K  s<  t |}	||_||_|	|_||_|d}
| jr(| jj\}}| 	||||	|| | j
r.tjn|}ttj|	||ddd|
d}|d| | j
r| jrt| jdrZ| jjdksYJ nt| jd	rh| jjdkshJ | jrr| j
srtd
| jrxtjntj}|tjkrtjntj}t||	| d | || d | |ddd|
d}| j|_|tjkrttjj|d d < |d| nttjt|tjd|
d}ttjj|d d < |d| t| jdr| jjdkst| jd	r| jjdkrttjt|tjd|
d}ttjj|d d < |d| d S |dd  d S d S )Nweight_loaderdtyper[   r   )data	input_dim
output_dimr   weightrQ   rM   linear_activation_schemez;MXFP8 requires fp8-serialized checkpoint for linear layers.weight_scale_inv)r   r   weight_scalerL   input_scale)sumlogical_widthsr   r   
orig_dtyper   r   r   rU   r   rO   rn   float8_e4m3fnr   emptyregister_parameterhasattrrQ   r   rW   r_   uint8float32zerosr   format_ue8m0finfominr   r`   )ra   r   r   r   r   r   r   r   extra_weight_attrsr   r   r   r   weight_dtyper   scale_dtype
scale_initscalerb   rb   rc   create_weights)  s   




zFp8LinearMethod.create_weightsr	   rX   rY   c                 C  s  t rt|j|jd d\}}}d |_njtr-tsJ dt|dg tj	j
|jjdd|_d S | jrF| js:| | d S |jd d|j_d S ddlm} dd	lm} |t| jd
d drs| j|u rs|jjsst|j|j| jj d|j_|jj|jj}}|j|j_|j|j_d S )Nr   r   r   z8Fp8LinearMethod on CPU requires that CPU has AMX supportr   Frequires_gradTr   ),deepgemm_w8a8_block_fp8_linear_with_fallback$should_deepgemm_weight_requant_ue8m0rU   rU   )_is_fp8_fnuzr+   r   r   r   _is_cpu_is_cpu_amx_availabler   rn   nnr
   r   rW   rO   _quantize_mxfp8_weightsrequires_grad_r   (sglang.srt.layers.quantization.fp8_utilsr   sglang.srt.model_loader.utilsr   r   r   r   r,   rU   )ra   r   r   r   _r   r   rb   rb   rc   )process_weights_after_loading_block_quant  s\   

z9Fp8LinearMethod.process_weights_after_loading_block_quantc                 C  sx   |j j}t|\}}||j _|j d t|dr)|jd ur)||j_|jd n
|dt|dd d|j_d |_	d S )NFr   r   T)
r   r   r*   r   r   r   r   r
   r   r   )ra   r   r   qweightr   rb   rb   rc   r     s   
z'Fp8LinearMethod._quantize_mxfp8_weightsc                 C  s^  | j r
| | nt|jjdd|_| js^| js!| js!trC| j	rCt
|j|jjd \}}|  }trB| j	rBd| _t| d}nt|j\}}t| dd|_t|dd|_d |_nt|jjdd|_t| jdrs| jjdkst| jdr| jjdkrt|jjdd|_| js| jstr| j	r|j}t|j|j}tr| j	rd| _trt||d	\}}}t| d}n(|j}|j}trt|||jd
\}}}|d urt|dd|_t|||jd\}}t| dd|_t|dd|_t| jdr| jjdkst| jdr| jjdkrt|j dd|_| jr-| j r"| jj|_t|| j   |`d S d S )NFr   T   r   rQ   rL   r   )r   r   r   )r   r   r   )r   r   r
   r   r   rO   r'   r   
_use_aiterr   r#   shapet
contiguousr   rK   r)   r   r   r   r   rQ   r   r3   r   r   r+   r6   maxrU   r0   )ra   r   r   r   r   r   r   rb   rb   rc   process_weights_after_loading  s   








z-Fp8LinearMethod.process_weights_after_loadingNxtorch.TensorbiasOptional[torch.Tensor]c              	   C  s
  | j rt||j|j|j|j|j|dS | jr5t|t	r*t
|d |j|j|d |dS t
||j|jd |dS | jrut|rNtjj||j|j| jj||jdS t|t	rf| j|d |j| jj|j|d |dS | j||j| jj|jd |dS t||j|j|j|| j| jdS )N)inputr   r   	workspacesize_nsize_kr   r   r[   )r   r   r   r   r   T)r   r   
block_sizer   r   r   )r   r   r   r   r   r'   r   )r   r/   r   r   r   r   r   rW   r   tupler-   r   r   rB   rn   ops
sgl_kernelfp8_scaled_mm_cpur   rU   r   r   r%   r   r'   r   )ra   r   r   r   rb   rb   rc   applyI  s~   



		zFp8LinearMethod.apply)r   r   F)r   rt   r   rt   r   rt   r   rt   r   rV   r   rP   )r   r   r   rt   r   rV   r   rt   r   rt   r   r   r   rP   r   r	   rX   rY   rm   )r   r   r   r   r   r   rX   r   )r   r   r   r   rd   r   r   r   r   r   r  rb   rb   rb   rc   r      s    
0
c
<
tr   c                   @  s   e Zd ZdZd;ddZed<dd	Z	
d=d>ddZd?ddZd@dAddZ	d?ddZ
dBdd ZdBd!d"ZdCd&d'ZdDd+d,Zd?d-d.Z	/	
dEdFd8d9Zd:S )Gr   au  MoE method for FP8.
    Supports loading FP8 checkpoints with static weight scale and
    dynamic/static activation scale.

    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
    activation scaling. The weight scaling factor will be initialized after
    the model weights are loaded.

    Args:
        quant_config: The quantization config.
    r   rN   c                 C  sr   || _ t| j dd| _| jp| j jd u| _d| _t  r3t s$J d| js+J dt	 s5t
 s7J d S d S d S )NrW   FzEcutlass_fp8 MoE requires CUDA 12.0+ with SM90 or CUDA 12.4+ with SM89z+cutlass_fp8 MoE requires block quantization)r   r   rW   rU   r   	with_biasr   
is_cutlassr'   r>   r=   ra   r   rb   rb   rc   rd     s   
zFp8MoEMethod.__init__rX   rP   c                  C  sP   ddl m}  ddlm} t }| rdS | r&| jo%|  p%| 	 S dS )z7Check if MoE will actually use DeepGEMM runner for FP8.r   )deep_gemm_wrapper)get_moe_a2a_backendTF)
sglang.srt.layersr  sglang.srt.layers.moe.utilsr	  r   is_deep_gemmis_autoENABLE_JIT_DEEPGEMM	is_deepepis_mooncake)r  r	  moe_runner_backendrb   rb   rc   &is_deepgemm_moe_runner_backend_enabled  s   z3Fp8MoEMethod.is_deepgemm_moe_runner_backend_enabledFr   r	   num_expertsrt   hidden_sizeintermediate_size_per_partitionr   r   r  c                 K  s  || _ ddlm} | jjrtrtjntj}t	 }	| j
rN| jjd | jjd }
}||
 dkr9td| d|
 d|	dkrN|| dkrNtd| d| dtrwtrwtjjtj|d	| |d
 |ddd}tjjtj|||d
 |ddd}n tjjtj|d	| ||ddd}tjjtj||||ddd}|d| t|| |d| t|| | j r|jjrd	| n|}tjjtj||tjddd}|d| t|| tjjtj||tjddd}|d| t|| | j
rf| jrtjntj}|tjkrtjntj}tjj||d	||
 d |
  || d | |ddd}tjj||||
 d |
 || d | |ddd}| j|_| j|_|d| |d| | jjdksZJ t  re| | nXtjjtj|d	tjddd}tjjtj|tjddd}|d| |d| trtjjtj|d	| tjddd}tjjtj||tjddd}|d| |d| || j
rd|j j!ind|j"j!i | jjrt|| t|| trtr|d|j#j!i t|| t|| | jjdkr=| jjs	tdtjjtj|tjddd}|d| t|| tjjtj|tjddd}|d| t|| d S d |_$d |_%d S )Nr   )FusedMoeWeightScaleSupportedr[   z,The output_size of gate's and up's weight = r   r   z"The input_size of down's weight = r   rZ      r   Fr   
w13_weight	w2_weightw13_weight_biasw2_weight_biasw13_weight_scale_invw2_weight_scale_invrM   w13_weight_scalew2_weight_scalew13_weight_scale1w2_weight_scale1r}   rL   zJFound static activation scheme for checkpoint that was not serialized fp8.w13_input_scalew2_input_scale)&r  r   r  r   rO   _use_hip_int4rn   uint32r   r   r   rU   r_   _is_hipr   r
   r   r   rA   moe_runner_configis_gatedr   rW   r   r   onesr   rQ   r   r  #_ensure_cutlass_buffers_initializedupdateBLOCKvalueTENSORCHANNELr"  r#  )ra   r   r  r  r  r   r  r   r  r   r   r   r  r  
w13_up_dimr  r  r   r   r  r  r   r!  r"  r#  rb   rb   rc   r     sN  


	
	





	











zFp8MoEMethod.create_weightsrY   c                 C  s  t r^t|j|jd d\}}}t|j|jd d\}}}tjj|dd|_tjj|dd|_d |_	tjj|dd|_tjj|dd|_d |_
tr\t|j d|j_t|j d|j_d S d S trvt|j d|j_t|j d|j_d S trts~J dt|ddg d S | jr| j|| jj d d S d	d
lm} d	dlm} |  }	|t| jdd dr|	r|jjst||sJ d| jj}
t|j|j|
 t|j|j|
 d|j_d|j_d S d S d S d S )Nr   Fr   r   z5Fp8MoEMethod on CPU requires that CPU has AMX supportr  r  )quantizer   )	DeepEPMoEr   rU   r   z-DeepGemm MoE is only supported with DeepEPMoET) r   r+   r  r  r  r  rn   r   r
   r"  r#  r   rK   r   r   r   r   r   rW   _process_mxfp8_moe_weightsr   rO   "sglang.srt.layers.moe.ep_moe.layerr2  r   r   r  r   r   r   rU   r,   )ra   r   r  r  r   r  r  r2  r   will_use_deepgemmrU   rb   rb   rc   r     s   

	



	

z6Fp8MoEMethod.process_weights_after_loading_block_quantTr1  c           
        sT  t rt s	tdddd}dd  d fdddfdd}|rIt  r8||jj\}}||jj\}}n-||jj\}}||jj\}}n|jj}|jj}|jjj|j	j}|jjj|j
j}ddd}	|	|j| |	|j| |	|j	| |	|j
| |jd |jd |j	d |j
d d|j	_d|j
_d |_d |_d S )Nz&MXFP8 MoE quantization requires SM100.r   r   c                 S  sd  ddl m} |  } | j\}}}|d dksJ d|d| d| }tj|dftj| jd}||d d df< d|d d d	f< ||d d d
f< tj	d|| |tj| jd}|d d d }tj	d|| |tj| jd}	tj
|tjd}
tj|| |d ftj| jd}|||||	|
| |
| }
||||d }||kr|d d d |d d f }|
|fS )Nr   )(es_sm100_mxfp8_blockscaled_grouped_quantr\   k=" must be divisible by 32 for MXFP8r      r   devicer[   rZ         r   )r   r6  r   r   viewrn   r   int32r;  arange
empty_liker   r   view_as)r   r6  r  mkweight_flatproblem_sizesexpert_offsets	aligned_mblockscale_offsetsr   r   rb   rb   rc   ,_quantize_and_swizzle_with_cutlass_es_kernel  sR   
z]Fp8MoEMethod._process_mxfp8_moe_weights.<locals>._quantize_and_swizzle_with_cutlass_es_kernelc                 S  sT   ddl m}m} ddlm} |jd|d\}}| dd} ||| |fi |} | S )Nr   )convert_layoutwrap_torch_tensor)layoutr[   )mx_axis	num_warpsr   )triton_kernels.tensorrK  rL  triton_kernels.tensor_detailsrM  (make_default_matmul_mxfp4_w_scale_layout	transpose)r   rO  rK  rL  rM  scale_layoutscale_layout_optsrb   rb   rc   _swizzle_mxfp8_sf  s   zBFp8MoEMethod._process_mxfp8_moe_weights.<locals>._swizzle_mxfp8_sfweight_shapetuple[int, int, int]r   c                   sR   | \}}}|d d d }| |||d }d} ||}|j |||d }|S )Nr<  r=  r\   r  )r>  r   )rX  r   r  rC  rD  rH  rO  )rW  rb   rc   _swizzle_with_triton_kernel&  s   

zLFp8MoEMethod._process_mxfp8_moe_weights.<locals>._swizzle_with_triton_kernelc                   sj   |   } | j\}}}|d dksJ d|d| d|  }t|\}}|| } | j|}||fS )Nr\   r   r7  r8  r   )r   r   r>  r*   rB  )r   r   rD  rE  r   r   )rZ  rb   rc   (_quantize_and_swizzle_with_triton_kernel1  s   
zYFp8MoEMethod._process_mxfp8_moe_weights.<locals>._quantize_and_swizzle_with_triton_kernelparamr
   	new_valuerX   rY   c                 S  s6   | j j|jkr| j j|jkr| j | d S || _ d S rm   )r   r   r   copy_)r\  r]  rb   rb   rc   _copy_or_rebindX  s   
z@Fp8MoEMethod._process_mxfp8_moe_weights.<locals>._copy_or_rebindFT)r   r   )rX  rY  r   r   )r\  r
   r]  r   rX   rY   )r   r>   RuntimeErrorr   r  r  r   r  r   r  r  r   r   r"  r#  )
ra   r   r1  rJ  r[  w13_qw13_sw2_qw2_sr_  rb   )rW  rZ  rc   r3    sR   

-



	
z'Fp8MoEMethod._process_mxfp8_moe_weightsc                 C  sR  t rtr| | d S | jr| | d S | jjstj|j	j
td}tj|jj
td}tjjtj|jtj|jddd|_t|jD ]<}t|j	j
|d d d d f \||d d d d f< |j|< t|jj
|d d d d f \||d d d d f< |j|< qBtjj|dd|_	tjj|dd|_t r| | d S | jjdkr|jd u s|jd u rtdt|jrt|jstd tjj|j dd|_tjj|j dd|_tr/t |j	|j|j\}}}t |j|j|j\}}}tjj|dd|_	tjj|dd|_|d urtjj|dd|_tjj|dd|_tjj|dd|_|d ur/tjj|dd|_|jd us7J |j!}	|jjdd	j"}
t|jD ]>}d
}tdD ]4}t#|j	| |||	 d d f |j| | }t||
| \|j	| |||	 d d f< }||	7 }qOqGtjj|
dd|_t r| | t$ % rd
dl&m'} || d S )Nr   r:  Fr   rL   zJQuantConfig has static quantization, but found activation scales are None.zjFound input_scales that are not equal for fp8 MoE layer. Using the maximum across experts for each layer. r[   dimr   rZ   )+align_fp8_moe_weights_for_flashinfer_trtllm)(r&  r$  process_weights_hip_int4r   r   r   rO   rn   rA  r  r   r!   r  r   r
   r)  num_local_expertsr   r;  r  ranger$   r  !process_weights_hip_scale_paddingrQ   r"  r#  r_   r2   r@   r   r   r+   r  valuesr5   r   is_flashinfer_trtllm2sglang.srt.layers.moe.moe_runner.flashinfer_trtllmrg  )ra   r   r  r  expertr  r"  r  r#  
shard_sizemax_w13_scales	expert_idstartshard_id	dq_weightr   rg  rb   rb   rc   r   n  s   

 





z*Fp8MoEMethod.process_weights_after_loadingc           	      C  s8  t jjt|jjddd|_t j  t jjt|jjddd|_t j  |j	d us-J |j
}|j	jddj}t|jD ]4}d}|| }tdD ]'}|j	| | |krl|j	| | | }|j| |||   |9  < ||7 }qIq=t jj|dd|_	t|jD ]}|j|  || 9  < |j|  |j| 9  < qd S )Nr   Fr   r[   re  r   rZ   )rn   r   r
   rK   r  r   cudaempty_cacher  r  r  r   rl  rj  ri  r   r!  r  )	ra   r   rp  rq  rr  rs  max_w13_scale_fp8rt  int4_rescalerb   rb   rc   rh    s@   




z%Fp8MoEMethod.process_weights_hip_int4c                 C  s   ddl m} trFtjjt|jjddd|_tj	
  tjjt|jjddd|_tj	
  | j|jd9  _| j|jd9  _d S tdr|tjjt|jjd|fdddd|_tj	
  tjjt|jjd|fdddd|_tj	
  d S d S )	Nr   )padding_sizer   Fr   r   SGLANG_MOE_PADDINGconstant)0sglang.srt.layers.moe.fused_moe_triton.fused_moerz  r   rn   r   r
   rK   r  r   rv  rw  r  r   r  	unsqueezer!  r  r8   Fpad)ra   r   rz  rb   rb   rc   rk    s4   


z.Fp8MoEMethod.process_weights_hip_scale_paddingr   r'  r   c                 C  sX   || _ t }| r|  rtj}ntj}| s!| s!|	 r)t
||| _d S 	 d S rm   )r'  r   r  r  r   	DEEP_GEMMTRITONr  	is_tritonrm  r   runner)ra   r   r'  r  rb   rb   rc   create_moe_runner=  s   zFp8MoEMethod.create_moe_runnerdispatch_outputrD   rC   c                 C  s,  ddl m} |j}| j}t|rDddlm} |j\}}}	||j||\}}t	j
j||j|j||dtj|j|jd d | jjd}
||
dS tr[| |||j|j|j}|d ur[||dS t  rddlm} tt t  d t	 |}W d    n1 sw   Y  |j\}}}	t!| jd	d}|||j"d
d|j"d
d|j"d
d|j"d
d||| j#| j$| j%| j&| j'| j(| j)| j*| j+| j,| j-| j.| j/d||||fd}
||
dS | j0j12 rS|j}|j}| j3r| jj}|j}|j}nXd}||g}|j4d
 d
 | d
 }|j4d d
 | d
 }|j56d
j7|d
d6dj7|dd}|j4d
 d
 | d
 }|j4d d
 | d
 }|j86d
j7|d
d6dj7|dd}t9||d|||d}n| j0j1: rt;t!|d}t;t!|d}t;t!|d}t<|j|j||| ||jj4d t;t!|dt=j>| j3| jjd u rd n| jjd
 | j3r|jnd | j3r|jnd | j3s|j?nd | j3st!|dd nd | j3st!|dd nd | j3st!|dd nd d}n<| j0j1@ rtA|j|jt!|dd t!|dd d| j3r|jn|j5| j3r|jn|j8|j?|jB| jjd
}ntCd| j0j1 | j0D||S )Nr   )StandardCombineInput)apply_topk_weights_cpuFT)hidden_states)cutlass_fused_experts_fp8)disabledrW   r[   rZ   )use_fp8_blockscalerW   output	enable_esr=  re  )r  r  use_fp8	w13_scalew2_scaleblock_shaper  ri  moe_ep_rankrouting_method_typeoutput1_scales_scalaroutput1_scales_gate_scalaroutput2_scales_scalar)r  r  global_num_expertslocal_expert_offsetlocal_num_expertsintermediate_sizer  r   weight_block_kr  r  r"  r  r  r  r  r  )
r  r  b13b2use_fp8_w8a8r  r  	a13_scalea2_scaler  zUnsupported runner backend: %s)E&sglang.srt.layers.moe.token_dispatcherr  r  r'  rB   sglang.srt.layers.moe.topkr  topk_outputapply_router_weight_on_inputrn   r   r   fused_experts_cpur  r  r   	FP8_W8A16r  r  r   rU   r&  maybe_apply_hip_fused_experts
activation
no_combiner   r  !sglang.srt.layers.moe.cutlass_moer  r   r   r   rA  r   rT  ab_strides1
c_strides1ab_strides2
c_strides2r   a_ptrb_ptrout_ptra_scales_ptrb_scales_ptrrG  problem_sizes1problem_sizes2r  runner_backendr  r   r   r  r~  repeat_interleaver  r   rm  rt   r   r   
DeepSeekV3r"  r  r   r#  NotImplementedErrorrun)ra   r   r  r  r   r'  r  topk_weightstopk_idsr   r  retr  symm_outputrW   r  r  r  r  r  scale_block_sizew13_scale_nw13_scale_k
w2_scale_n
w2_scale_k
quant_infor  ri  r  rb   rb   rc   r  R  s<  








'




zFp8MoEMethod.applyc                 C  s`  t | ddrd S |jj}|jjd }|jjd }|j}tj|f||tjd| _	tj|fd| |tjd| _
tj|f||tjd| _tj|f||tjd| _tjd|tjd| _tj||tjd| _tj||tjd| _tj||tjd| _tj||tjd| _tj||tjd| _tj|d |tjd| _tj|d|tjd| _tj|d|tjd| _d	| _d S )
N_cutlass_buffers_readyFr   r[   )r;  r   rZ   i_ r9  T)r   r  r;  r   r  r  rn   fullint64r  r  r  r  r   r   r   r  r  r  r  r  r?  rG  r  r  r  )ra   r   r;  r  r  r  rb   rb   rc   r*    sP   


z0Fp8MoEMethod._ensure_cutlass_buffers_initializedsilur   r   r  rE   r  rR   r  r   c           	      C  s   |\}}}t r-|rJ d|dt||j|j||tj|j|j|dkr(tj	d	S tj
d	S trt|r9J d|d| jrXt||j|j|||j|jtj|dkrQtj	ntj
|jd
S t||j|j||tj|j|j|dkrmtj	ntj
|jd
S d S )Nzno_combine=z is not supported.r  )
quant_typew1_scaler  r  )r  r  r  r  expert_mask)r  r  r  r  r  )r$  rJ   r  r  rI   	per_Tokenr   r!  rH   SiluGelur   r   r  r  per_128x128expert_mask_gpu)	ra   r   r   r  r  r  r  r  r   rb   rb   rc   r  C  s`   

z*Fp8MoEMethod.maybe_apply_hip_fused_expertsNr   rN   )rX   rP   r  )r   r	   r  rt   r  rt   r  rt   r   r   r  rP   r  )T)r   r	   r1  rP   rX   rY   )r   r	   )r   r   r'  r   )r   r   r  rD   rX   rC   )r  F)r   r   r   r   r  rE   r  rR   r  rP   rX   r   )r   r   r   r   rd   staticmethodr  r   r   r3  r   rh  rk  r  r  r*  r  rb   rb   rb   rc   r     s,    
 
TS 

 

+
!
 
E2r   c                      s"   e Zd ZdZd fddZ  ZS )r   zI
    Supports loading kv-cache scaling factors from FP8 checkpoints.
    r   rN   c                   s   t  | d S rm   )superrd   r  	__class__rb   rc   rd     s   zFp8KVCacheMethod.__init__r  )r   r   r   r   rd   __classcell__rb   rb   r  rc   r     s    r   )~
__future__r   loggingtypingr   r   r   r   r   r   rn   torch.nn.functionalr   
functionalr  torch.nnr	   torch.nn.parameterr
   sglang.srt.distributedr   r   <sglang.srt.distributed.device_communicators.pynccl_allocatorr   sglang.srt.environr   sglang.srt.layers.amx_utilsr   r   sglang.srt.layers.dp_attentionr   sglang.srt.layers.moer   r   r   *sglang.srt.layers.moe.moe_runner.deep_gemmr   rn  r   'sglang.srt.layers.moe.moe_runner.tritonr   r  r   r   sglang.srt.layers.parameterr   r   r   *sglang.srt.layers.quantization.base_configr   r   r   r    )sglang.srt.layers.quantization.fp8_kernelr!   r"   r#   r$   r   r%   r&   r'   r(   r)   r*   r+   r,   r-   'sglang.srt.layers.quantization.kv_cacher.   /sglang.srt.layers.quantization.marlin_utils_fp8r/   r0   &sglang.srt.layers.quantization.unquantr1   $sglang.srt.layers.quantization.utilsr2   r3   r4   r5   r6   sglang.srt.utilsr7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   r  rC   rD   r  rE   %sglang.srt.layers.quantization.w4afp8rF   r&  r   _is_npur   r   r   r$  SGLANG_USE_AITERr   r   aiterrH   rI   aiter.fused_moerJ   aiter.ops.shufflerK   r^   	getLoggerr   r]   rN   r   r   r   rb   rb   rb   rc   <module>   st    ,8
g   B       v