o
    پi                     @  s  U d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ erOd dlmZ d dlmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 e2e3Z4e. Z5e, Z6e Z7e(doe5Z8e8rd dl9Z9d dl9m:Z:m;Z; d dl<m=Z= e;e9j>j?Z@e6rd dlAmBZBmCZC ejDEddddZFe(dZGe(dZHdaIdd ZJeJ ZKedddd ZL	ddd$d%ZMG d&d' d'eZNdaOd(ePd)< dd+d,ZQe+ r
e- r
d d-lRmSZS e/ re- rd d.lRmTZT dd0d1ZUdd3d4ZVdd5d6ZWdd:d;ZXdd<d=ZY		dddBdCZZ		dddDdEZ[		dddFdGZ\		dddHdIZ]ddMdNZ^		dddOdPZ_		dddQdRZ`edddSdT ZaddWdXZbddZd[Zc			ddd^d_ZdddbdcZeefddfdgZfddjdkZgddldmZhdndo ZiddrdsZjddudvZkdwdx Zlddd{d|Zmdd}d~Zndd Zodd ZpdddZqdddZrdddZsdd Ztdd ZudddeL dyddyfdddZvdddZwdddeL dyddyfdddZxdddZydS )    )annotationsN)Enum)	lru_cache)TYPE_CHECKINGCallableListOptionalTuple)envs)deep_gemm_wrapper) sglang_per_token_group_quant_fp8)MXFP4QuantizeUtil)torch_release)
ServerArgs)	fp8_dtypefp8_maxis_fp8_fnuz mxfp8_block_scaled_matmul_tritonper_token_group_quant_fp8scaled_fp8_quantsglang_per_token_quant_fp8static_quant_fp8triton_scaled_mmw8a8_block_fp8_matmul_deepgemmw8a8_block_fp8_matmul_triton)
ceil_alignceil_divget_bool_env_varget_cuda_versionget_device_capabilityis_blackwell_supportedis_cudais_flashinfer_availableis_hipis_sm90_supportedis_sm100_supported	offloaderSGLANG_USE_AITER)gemm_a8w8_bpreshuffleget_hip_quant)gemm_a8w8_blockscale)fp8_blockwise_scaled_mmfp8_scaled_mmzsgl_kernel::fp8_scaled_mmc                 C  s&   | j d }|j d }| j||f|dS )Ndtype)shape	new_empty)mat_amat_bscales_ascales_b	out_dtypebiasMN r;   \/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/fp8_utils.py_fp8_scaled_mm_abstractC   s   

r=    USE_VLLM_CUTLASS_W8A8_FP8_KERNELUSE_TRITON_W8A8_FP8_KERNELc                   C  s   t rt dko
tdkS dS )N)	      )      F)_is_hipr   r   r;   r;   r;   r<   use_rowwise_torch_scaled_mmS   s   rE      )maxsizec                  C  sD   t sdS t \} }t }| dkr|dkS | dkr |dkr |dkS dS )NFr@   )   r      )rH   rA   )_is_cudar   r   )majorminorcuda_versionr;   r;   r<   cutlass_fp8_supported`   s   
rN   weighttorch.Tensorweight_scaleinput_scaleOptional[torch.Tensor]return9Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]c                 C  sZ   | j tjksJ | tj}d}d|||k< |tj} |d }|d ur(|d }| ||fS )Nir          @)r0   torchfloat8_e4m3fnviewint8float8_e4m3fnuz)rO   rQ   rR   weight_as_int8ROCM_FP8_NAN_AS_INTr;   r;   r<   normalize_e4m3fn_to_e4m3fnuzm   s   
r^   c                   @  sr   e Zd ZdZdZdZdZdZdZdZ	dZ
dddZdddZdddZdddZdddZdddZdddZdS )Fp8GemmRunnerBackendz+Enum for FP8 GEMM runner backend selection.autoflashinfer_trtllmflashinfer_deepgemmcutlass	deep_gemmtritonaiterrT   boolc                 C  
   | t jkS N)r_   AUTOselfr;   r;   r<   is_auto      
zFp8GemmRunnerBackend.is_autoc                 C  rh   ri   )r_   FLASHINFER_TRTLLMrk   r;   r;   r<   is_flashinfer_trtllm   rn   z)Fp8GemmRunnerBackend.is_flashinfer_trtllmc                 C  rh   ri   )r_   FLASHINFER_DEEPGEMMrk   r;   r;   r<   is_flashinfer_deepgemm   rn   z+Fp8GemmRunnerBackend.is_flashinfer_deepgemmc                 C  rh   ri   )r_   CUTLASSrk   r;   r;   r<   
is_cutlass   rn   zFp8GemmRunnerBackend.is_cutlassc                 C  rh   ri   )r_   	DEEP_GEMMrk   r;   r;   r<   is_deep_gemm   rn   z!Fp8GemmRunnerBackend.is_deep_gemmc                 C  rh   ri   )r_   TRITONrk   r;   r;   r<   	is_triton   rn   zFp8GemmRunnerBackend.is_tritonc                 C  rh   ri   )r_   AITERrk   r;   r;   r<   is_aiter   rn   zFp8GemmRunnerBackend.is_aiterNrT   rg   )__name__
__module____qualname____doc__rj   ro   rq   rs   ru   rw   ry   rm   rp   rr   rt   rv   rx   rz   r;   r;   r;   r<   r_      s     





r_   zFp8GemmRunnerBackend | NoneFP8_GEMM_RUNNER_BACKENDrg   c                   C  s   t  pt S )zPReturn True if CUTLASS block FP8 is supported (Hopper or newer with CUDA 12.0+).)r$   r    r;   r;   r;   r<   )_check_cutlass_block_fp8_hardware_support   s   r   )gemm_fp8_nt_groupwise)fp8_blockscale_gemm_sm90r   c                  C  s   t  } |  st| S t S )z
    Dispatch to the appropriate FP8 block linear implementation.

    This function selects the backend based on:
    1. The --fp8-gemm-backend server argument (preferred)
    2. Auto-detection based on hardware capabilities
    )get_fp8_gemm_runner_backendrm   _dispatch_explicit_backend_dispatch_auto_backendbackendr;   r;   r<   dispatch_w8a8_block_fp8_linear   s   r   r   c                 C  s   |   rt r
t stdtS |  r t rt stdtS |  r-t	 s+tdt
S |  r9ts7tdtS |  rFtjsDtdtS |  rLtS td|  )z.Dispatch based on explicitly selected backend.zFlashInfer FP8 GEMM requested via --fp8-gemm-backend=flashinfer_trtllm, but FlashInfer is not available or not supported on this hardware. FlashInfer FP8 GEMM requires Blackwell GPUs and FlashInfer to be installed.zFlashInfer DeepGEMM with swapAB requested via --fp8-gemm-backend=flashinfer_deepgemm, but it's not available. This backend requires Hopper (SM90) GPUs and FlashInfer to be installed.zCUTLASS block FP8 requested via --fp8-gemm-backend=cutlass, but hardware does not support it. CUTLASS block FP8 requires Hopper (SM90+) GPUs with CUDA 12.0+.zAITER backend requested via --fp8-gemm-backend=aiter, but AITER is not available. AITER requires AMD GPUs with SGLANG_USE_AITER=1 environment variable set.zDeepGEMM backend requested via --fp8-gemm-backend=deep_gemm, but DeepGEMM is not available. This usually means the deep_gemm package is not installed or has been disabled via SGLANG_ENABLE_JIT_DEEPGEMM=0.zUnknown FP8 GEMM backend: )rp   r    r"   RuntimeError3flashinfer_gemm_w8a8_block_fp8_linear_with_fallbackrr   r$   7flashinfer_deepgemm_w8a8_block_fp8_linear_with_fallbackrt   r   +cutlass_w8a8_block_fp8_linear_with_fallbackrz   
_use_aiteraiter_w8a8_block_fp8_linearrv   r   ENABLE_JIT_DEEPGEMM,deepgemm_w8a8_block_fp8_linear_with_fallbackrx   triton_w8a8_block_fp8_linear
ValueErrorr   r;   r;   r<   r      sB   r   c                   C  s0   t jrtS t rt rtS t rtS trt	S t
S )z<Auto-select the best backend based on hardware capabilities.)r   r   r   r    r"   r   r   r   r   r   r   r;   r;   r;   r<   r      s   	r   server_argsr   Nonec                 C  s`   | j }|dkrtj rd}ntj rd}ntj s!tj r*td| d t|adS )z"Initialize FP8 GEMM configuration.r`   ra   rc   zFP8 GEMM backend set to 'z' via --fp8-gemm-backend overrides environment variables SGLANG_ENABLE_FLASHINFER_FP8_GEMM and SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. Using server argument value.N)	fp8_gemm_runner_backendr
   !SGLANG_ENABLE_FLASHINFER_FP8_GEMMget SGLANG_SUPPORT_CUTLASS_BLOCK_FP8loggerwarningr_   r   )r   r   r;   r;   r<   initialize_fp8_gemm_config  s   


r   c                   C  s   t du rtja t S )z(Get the current FP8 GEMM runner backend.N)r   r_   rj   r;   r;   r;   r<   r   .  s   r   input
block_size	List[int]r8   c                 C  s   |d u sJ |  d| jd }|jd }|dk r!t| |||||S g | jd d |jd }t||d dd\}	}
t|	||
||jdd}|d urM||7 }|j|jd	j | S )
Nr.   rF      r   Tcolumn_major_scalestrtllm)r7   r   r/   )rY   r1   r   r   r   r0   to)r   rO   r   rQ   rR   r8   input_2dk_dimoutput_shapeq_inputx_scaleoutputr;   r;   r<   r   6  s,   


	r   c                 C  s   |du sJ | j }|tjk}|jd d dko|jd d dk}|r$|s:|j tjkr1t||j|}t| |||||S | d| jd }	g | jdd |jd }
t|	|d||d}|durb||7 }|j|
 S )ao  
    FlashInfer DeepGEMM backend for SM90 (Hopper) with swapAB optimization.

    Uses flashinfer.gemm.fp8_blockscale_gemm_sm90 which automatically selects
    the swapAB kernel for small M dimensions (M < 32) for better performance
    during decoding/low batch size scenarios.

    For SM90 (Hopper), this uses the DeepGEMM JIT with automatic swapAB selection.
    Nr   @   rF      r.   )rR   rQ   r7   )	r0   rW   bfloat16r1   int32_unpack_ue8m0_scale_for_tritonr   rY   r   )r   rO   r   rQ   rR   r8   output_dtypedtype_supportedshape_supportedr   r   r   r;   r;   r<   r   a  s0   
$
r   c                 C  s   |d u sJ |j d d dko|j d d dk}|s#t| |||||S | d| j d }g | j d d |j d }t||d dd\}	}
t|	|j|
|j|jd}|d urY||7 }|j|jdj| S )	Nr   r   rF   r.   Tr   )r7   r/   )r1   r   rY   r   r+   Tr0   r   )r   rO   r   rQ   rR   r8   r   r   r   r   r   r   r;   r;   r<   r     s"   $

r   c                 C  s   |d u sJ | j }|tjk}|jd d dko|jd d dk}|r$|s:|j tjkr1t||j|}t| |||||S | d| jd }	g | jd d |jd }
t|	|d ddt	j
d\}}t||||||d}|d urq||7 }|j|d	j|
 S )
Nr   r   rF   r   r.   T)r   scale_tma_alignedscale_ue8m0r   r/   )r0   rW   r   r1   r   r   r   rY   r   r   DEEPGEMM_SCALE_UE8M0r   r   )r   rO   r   rQ   rR   r8   r   r   r   r   r   r   r   r   r;   r;   r<   r     s6   
$
r   	sf_packedweight_shapeTuple[int, int]c                 C  s   | j tjksJ t| jdksJ |\}}|\}}t||}t||}| j\}	}
|
d }|  tj|	|}|	tjd> tj
}|	|krYtjd||| jd}|d|}n|	|kr^ntd| j d| d| |d	d	d	|f  }|S )
a`  
    Unpack UE8M0 packed scale tensor back to float32 format for triton kernel.

    The UE8M0 format packs scales as:
    - Shape: (N, K//block_k//4) with dtype int32
    - Each int32 contains 4 uint8 scale values

    Triton expects:
    - Shape: (N//block_n, K//block_k) with dtype float32

    Args:
        sf_packed: Packed scale tensor with shape (N, packed_k_groups) and dtype int32
        weight_shape: (N, K) shape of the weight tensor
        block_size: [block_n, block_k] quantization block size

    Returns:
        Unpacked scale tensor with shape (n_groups, k_groups) and dtype float32
    rB   rA      r   devicez(Unexpected scale shape: sf_packed.shape=z, weight_shape=z, block_size=N)r0   rW   r   lenr1   r   
contiguousrY   uint8r   float32aranger   index_selectr   )r   r   r   r:   Kblock_nblock_kn_groupsk_groups	mn_repeatk_div_4k_packedsf_u8sf_fp32indicesr;   r;   r<   r     s0   



r   c                 C  s   |  d| jd }g | jd d |jd }|d ur |}|}	n
t|tjjd\}}	t|||	||d ur6tjn| j	d}
|d urC|
|7 }
|
j
|d urLtjn|j	dj | S )Nr.   r   quant_dtyper/   )rY   r1   aiter_per1x128_quantrf   dtypesfp8r*   rW   r   r0   r   r   rO   r   rQ   rR   r8   r   r   r   r   r   r;   r;   r<   r   %  s(   	r   c                 C  s   |d u sJ |  d| jd }g | jd d |jd }t||d dd\}}	t|||	|||jd}
|d ur;|
|7 }
|
j|jdj | S )Nr.   r   rF   Fr   r   r/   )rY   r1   r   r   r0   r   r   r;   r;   r<   r   I  s   

r   c               
   C  s6   z	ddl m}  W | S  ty } ztd|d }~ww )Nr   )downcast_to_mxfpz>MXFP8 quantization requires triton_kernels with MXFP8 support.)$triton_kernels.numerics_details.mxfpr   	Exceptionr   )r   errr;   r;   r<   _get_triton_mxfp8_downcast`  s   r   x!Tuple[torch.Tensor, torch.Tensor]c                 C  s   |   dksJ d|    d|  sJ d| j\}}|d dks+J d|dt }|| tjd	d
\}}| | fS )zJQuantize a 2D contiguous tensor to MXFP8 with UE8M0 scales per group (32).rB   zExpected 2D input, got Dz3MXFP8 quantization requires a contiguous 2D tensor.    r   k=z must be divisible by 32rF   )axis)dimis_contiguousr1   r   rW   rX   r   )r   _kr   r   scale_u8r;   r;   r<   mxfp8_group_quantizek  s    
r   r   c                 C  s   |   dksJ d|    d|  } | j\}}|d dks'J d|dt|d}|d dkrN|d | }tj||fd	| j| jd
}tj| |gdd} |d }| 	|d|d} | 	|dd|d} | 
ddddd }|	d||ddS )NrB   zExpected 2D scale tensor, got r   rA   r   z	k_groups=z3 must be divisible by 4 (K must be multiple of 128)r      r0   r   r   r      rF   r   )r   r   r1   r   rW   fullr0   r   catrY   permute)r   mr   scale_mpad_rowspadscale_kpackedr;   r;   r<   _pack_mxfp8_scalesv  s*    


r   r   Optional[torch.dtype]c              
   C  sF  t rt s	td| d| jd  }g | jd d |jd }d}|jd d dkr/dnd}	d}
|j\}}|j\}}||ksKJ d|d||d dksYJ d|d||	 dksiJ d	|d
|	 |jtjkssJ d|jtj	ks}J d|d u rt
|\}}n|}|}|jtj	ksJ d|j||d fksJ |d u r|jtjtjtjfv r|j}ntj}|| dkrt||| | }tj|tj||f|j|jdgdd}tj||d fd|j|jd}tj||gdd}t|}t|}t||| ||||	|
d}|d |d d f }|d ur||7 }|j|dj| S )Nz4MXFP8 dense linear requires Blackwell GPUs (SM100+).r.   r   r   r   r   z does not match k_w=z# must be divisible by 128 for MXFP8zn=z must be divisible by zMXFP8 weight must be FP8 E4M3.z'MXFP8 weight_scale must be UE8M0 uint8.z&MXFP8 input_scale must be UE8M0 uint8.r   r   r0   r   r   )r   block_mr   r   r/   )rJ   r%   r   rY   r1   r   r0   rW   rX   r   r   float16r   r   r   r   zerosr   r   r   r   r   )r   rO   rQ   rR   r8   r   r   r   r   r   r   r   r   nk_wr   
x_scale_u8r   	pad_scalea_scale_packedb_scale_packedr   r;   r;   r<   triton_mxfp8_blockscaled_linear  sn   


 


r   w_blockw_scalec                 C  s   | j tjksJ |j tjksJ | j\}}}}|j\}}}	|dks#J ||ks)J ||ks/J ||	ks5J tj| ||dgd}
|
|||d S )z
    :param w_block: (batch, n, k, 16), uint8, pack two mxfp4 into one byte
    :param w_scale: (batch, n, k), uint8
    :return: (batch, n, k * 32), float32
       r   )quantized_datascaler0   block_sizes)r0   rW   r   r1   r   
dequantizereshape)r   r  r7   batchr   r   pack_dimbatch_n_k_out_rawr;   r;   r<   dequant_mxfp4  s   
r  r0   torch.dtypec           	      C  s   |   \}}t| |  jdd}trt}t}nt	|}|j
}|| }|  | j| |d}|| |  fS )zTThis function quantizes input values to float8 values with tensor-wise quantization.g-q=min)r  max)aminmaxrW   maximumabsfloatclamp_is_fp8_fnuzr   r   finfor  r   r   
reciprocal)	r   r0   min_valmax_valamaxfp_maxr  r  	x_scl_satr;   r;   r<   input_to_float8  s    
r   	x_q_blockx_sc           	        s   |d |d  | j \ d  }  d   ||j d ks'J |j d ks0J | tj fddt|D }tD ]!}t|D ]}|| | || |  || | ddddf< qPqJtrrtnt| jd\}}||fS )aH  This function converts block-wise quantization to tensor-wise quantization.
    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
    and the block size.
    The outputs are tensor-wise quantization tensor and tensor-wise quantization scale.
    Note only float8 is supported for now.
    r   rF   c                   s,   g | ]  fd dt D qS )c                   sD   g | ]} t d   |  t |d    f qS )rF   r  ).0i)r   r   jr   r   
x_dq_blockr;   r<   
<listcomp>   s    z:block_quant_to_tensor_quant.<locals>.<listcomp>.<listcomp>)range)r#  r   r   r   k_tilesr   r&  )r%  r<   r'    s    z/block_quant_to_tensor_quant.<locals>.<listcomp>Nr/   )	r1   r   rW   r   r(  rJ   r   r   r0   )	r!  r"  r   n_tilesx_dq_block_tilesr$  r%  
x_q_tensorr  r;   r)  r<   block_quant_to_tensor_quant  s&   
2
r.  c           
      C  sd   |d |d }}| j ^ }}}|j|ddj|dd}	|	dd|d|f }	| tj|	 |S )zThis function converts block-wise quantization to unquantized.
    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
    and the block size.
    The output is an unquantized tensor with dtype.
    r   rF   r-   r   r.   .N)r1   repeat_interleaver   rW   r   )
r!  r"  r   r0   r   r   r   r   r   x_scale_repeatr;   r;   r<   block_quant_dequant6  s   r1  c                 C  sR   t | tjjs	J t |tjjsJ t| |j||\}}t| | ||_	d S ri   )

isinstancerW   nn	Parameterrequant_weight_ue8m0r   r   r&   update_paramdata)rO   weight_scale_invweight_block_size
new_weightnew_weight_scale_invr;   r;   r<   requant_weight_ue8m0_inplaceM  s   
r<  r8  r9  c           	      C  sX   |ddgksJ | j ^ }}}t| ||tj}t||d\}}t||j d d}||fS )Nr   )weight_dequantr9  r-   mn)r1   r1  rW   r   quant_weight_ue8m0transform_scale_ue8m0)	rO   r8  r9  r   r   r   r=  out_wout_sr;   r;   r<   r5  Y  s   
r5  r=  c           
      C  s   |ddgksJ | j tjksJ d| j d| j| j^ }}}| d|f}t|\}}|g |||R }|g |t||d t||d R }	||	fS )Nr   zweight_dequant.dtype=z weight_dequant.shape=r.   r   rF   )r0   rW   r   r1   rY   per_block_cast_to_fp8r   )
r=  r9  
batch_dimsr   r   weight_dequant_flat
out_w_flat
out_s_flatrB  rC  r;   r;   r<   r@  s  s$   r@  c                 C  s   t | j|d| _d S )Nr>  )rA  r7  )paramr?  r;   r;   r<   transform_scale_ue8m0_inplace  s   rJ  Fuse_torch_implc                 C  sB   dd l }|rtn|jjj}| dtj|| jdd } || } | S )Nr   r-   r   r   )	deep_gemm.utils.layout8_get_mn_major_tma_aligned_packed_ue8m0_tensor_torch_implutilslayout,get_mn_major_tma_aligned_packed_ue8m0_tensorr   rW   r   r   )sfr?  rK  rd   rP  r;   r;   r<   rA    s   rA  c                 C  sT  ddl m}m} | jtjkr|  dv sJ | tjd? 	tj
}| jd | jd }}d}|  dkr=| dd	} }| jd }||d
}||d
}	tj|||	f| jtj
d}
||
d d d |d |f< |
djtjd|||	d
 }
tj||	d
 |f| jtjdj}|
|d d d d d d f< |d d d |d d f }|r|dS |S )Nr   )alignget_tma_aligned_size)rB   r   r   r-   r.   FrB   TrA   r   r/   )deep_gemm.utilsrR  rS  r0   rW   r  r   rY   intr   r   r1   	unsqueezer   r   mTsqueeze)r   rR  rS  ue8m0_tensorr?  r   
remove_dimb
aligned_mn	aligned_kpadded
transposed	aligned_xr;   r;   r<   rM    s*   


"rM  c                 C  sB   t | }t||dd}t| |ksJ d| d|d||S )NT)r?  rK  z
sf_packed=z sf_packed_recreated=z	 sf_fp32=)#_inverse_transform_scale_ue8m0_implrA  rW   all)r   r?  r   sf_packed_recreatedr;   r;   r<   inverse_transform_scale_ue8m0  s   rd  c                 C  s*  t | jdkrtjdd | D ddS d}t | jdks$J d| j| jtjks,J | j\}}|| }|d	 }|   tj	||}|
tjd
> tj}||||}|ddddddf }	t|	|ksddlm}
 td|
|	d|
|d|	d }	|	j||fksJ |	S )z
    NOTE: We assume k is aligned
    :param sf_packed: (scale_mn, scale_k/4) int32
    :return: (scale_mn, scale_k), float32
    r   c                 S  s   g | ]}t |qS r;   )ra  )r#  r   r;   r;   r<   r'    s    z7_inverse_transform_scale_ue8m0_impl.<locals>.<listcomp>r   r   r   rB   zsf_packed.shape=rA   r   NrF   )get_tensor_infoz=sf_unrepeated != sf_reshaped (get_tensor_info(sf_unrepeated)=z get_tensor_info(sf_reshaped)=))r   r1   rW   stackr0   r   r   flattenrY   r   r   r   rb  sglang.srt.debug_utils.dumperre  AssertionErrorrX  )r   r   mn_repeat_128r   r?  r   r   r   sf_reshapedsf_unrepeatedre  r;   r;   r<   ra    s,   
ra  c                 C  s   |   dksJ | j\}}tjt|dt|df| j| jd}| |d |d |f< |dd|dd d}|	 
 jdddd	}t|d
 }|d|  tj}||d |d |f  ||d|dfS )NrB   r   r   r.   rF   )rF   r   T)r   keepdimg-C6?g      |@g      ?r   )r   r1   rW   r   r   r0   r   rY   sizer  r  r  r  ceil_to_ue8m0r   rX   view_asr   )r   r   r   x_paddedx_viewx_amaxrQ  x_scaledr;   r;   r<   rD    s   
 rD  c              	   C  s   t dt t |  S )NrV   )rW   powceillog2r  )r   r;   r;   r<   rp    s   rp  x_q_channelc                 C  s6   |  tj| }trt|nt|| jd\}}||fS )Nr/   )r   rW   r   rJ   r   r   r0   )ry  r"  x_dq_channelr-  r  r;   r;   r<   channel_quant_to_tensor_quant  s   
r{  c                 C  s:   t | tu rt| dkr| d } t| dd|d j| S )NrB   r   )typetupler   rW   narrowrY   )r   input_2d_shaper   r;   r;   r<   _process_scaled_mm_output  s   r  c           	      C  s~   t d u rtjdtj|jda tj| |t t tjd}t|||}t|dd|d }|| |  }|d ur9|| }|j	|dS )NrF   r   )scale_ascale_br7   r   r/   )
TORCH_DEVICE_IDENTITYrW   onesr   r   
_scaled_mmr  r~  tr   )	qinputrO   r   rQ   r  r   r8   input_dtyper   r;   r;   r<   _apply_fallback_scaled_mm  s   r  input_scale_ubuse_per_token_if_dynamic
pad_outputOptional[bool]compressed_tensor_quantc
              	   C  s  |d u r| ot d }|rdnd }
| d| jd }g | jd d |jd }|	rE|
}|r:| |jd kr:d }t||||d\}}n<|d ur[| dksQJ t|||d\}}n&trdt|\}}ntrv| dkrvt|||d\}}nt	||jd d\}}|r| |jd kr|jd	 d
 d	ko|jd d
 d	k}|rt
r|d|jd }t||||| j|}nt||||| j|d}|j| S | dk}| dko| dk }|r|s|ststrtrt||j||| jd}|d ur||7 }t||j|S tj||| j|| |d}t||j|S |rB|rB|jd	kr/|jdkr/|d	}tj||| j|||d}t||j|S t|||||j||| jS )NSGLANG_ENABLE_TORCH_COMPILE   r.   rF   )num_token_paddingr  )repeat_scale)r  )
group_sizer   r  )r7   r8   rB   )XQWQr   r  r0   )r7   r  r  r8   )r   rY   r1   numelr   r   rJ   r   rD   r   use_triton_w8a8_fp8_kernelr   r0   r,   r   USE_ROWWISE_TORCH_SCALED_MMr   r(   r   r  rW   r  r  ndimrV  r  )r   rO   rQ   rR   r  r8   rN   r  r  r  output_paddingr   r   r  r  r   cutlass_compatible_br   per_tensor_weightsper_tensor_activationsr;   r;   r<   apply_fp8_linear9  s   

$
		
r  c                  C  sH   zt  \} }| d | }d|  kodk W S   W S  ty#   Y dS w )N
   P   Y   F)r   r   )rK   rL   smr;   r;   r<   can_auto_enable_marlin_fp8  s   
r  c
                 C  s   |  d| jd }
g | jd d |jd }tj|
tjjd\}}| dko-| dk }| dko9| dk }|r>|sLg | jd d |jd }t||||d | j	}|d ur_|| }|j | S )Nr.   rF   r   rB   r   )
rY   r1   rf   per_token_quant_hipr   r   r  r   r(   r0   )r   rO   rQ   rR   r  r8   rN   r  r  r  r   r   r   r   r  r  r   r;   r;   r<   apply_fp8_ptpc_linear  s   
r  layertorch.nn.Module
input_sizerU  output_sizeinput_size_per_partitionoutput_partition_sizes	list[int]c                 C  s   ddl m} t| d| }|d |d }}	|dkr1|| |kr1||	 dkr1td| d|	 d|dko<|t| |k}
t|dk}|
sG|ri|}|
sS|rS|dd	 }|D ]}|| dkrhtd
| d| dqUdS dS )z:Validate block quantization shapes for tensor parallelism.r   )$get_tensor_model_parallel_world_sizetp_sizerF   z"Weight input_size_per_partition = z3 is not divisible by weight quantization block_k = .Nr.   zWeight output_partition_size = z3 is not divisible by weight quantization block_n = )sglang.srt.distributedr  getattrr   sumr   )r  r  r  r  r  r   r  r  r   r   is_tp_splitis_merged_gemmsizes_to_checkoutput_partition_sizer;   r;   r<   validate_fp8_block_shape  s:   	r  ri   )rO   rP   rQ   rP   rR   rS   rT   rU   r{   )rT   r   )r   r_   rT   r   )r   r   rT   r   )rT   r_   )NN)r   rP   rO   rP   r   r   rQ   rP   rR   rS   r8   rS   rT   rP   )r   rP   r   r   r   r   rT   rP   )r   rP   rT   r   )r   rP   rT   rP   )NNN)r   rP   rO   rP   rQ   rP   rR   rS   r8   rS   r   r   rT   rP   )r   rP   r  rP   rT   rP   )r   rP   r0   r  rT   r   )r!  rP   r"  rP   r   r   rT   r   )
r!  rP   r"  rP   r   r   r0   r  rT   rP   )rO   rP   r8  rP   r9  r   )r=  rP   r9  r   )F)rK  rg   )r   rP   rT   rP   )r   rP   )ry  rP   r"  rP   rT   r   )r   rP   rO   rP   rQ   rP   rR   rS   r  rS   r8   rS   rN   rg   r  rg   r  r  r  rg   rT   rP   )r  r  r  rU  r  rU  r  rU  r  r  r   r  rT   r   )z
__future__r   loggingenumr   	functoolsr   typingr   r   r   r   r	   rW   sglang.srt.environr
   sglang.srt.layersr   )sglang.srt.layers.quantization.fp8_kernelr   +sglang.srt.layers.quantization.mxfp4_tensorr   sglang.srt.utils.commonr   sglang.srt.server_argsr   r   r   r   r   r   r   r   r   r   r   r   sglang.srt.utilsr   r   r   r   r   r    r!   r"   r#   r$   r%   r&   	getLoggerr|   r   rD   rJ   r  r   rf   r(   r)   %aiter.ops.triton.gemm_a8w8_blockscaler*   	QuantType	per_1x128r   
sgl_kernelr+   r,   libraryregister_faker=    use_vllm_cutlass_w8a8_fp8_kernelr  r  rE   r  rN   r^   r_   r   __annotations__r   flashinfer.gemmr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r.  r1  r<  r5  r@  rJ  rA  rM  rd  ra  rD  rp  r{  r  r  r  r  r  r  r;   r;   r;   r<   <module>   s    48



!



6

0;&
0B)




L

+



'

# 
0#