o
    پi                     @   s	  d dl Z d dlZd dlZd dlZd dl mZ d dlmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZ zd dlmZ W n   Y d dlmZ d dlmZmZmZmZmZmZmZmZ d dlmZ e Ze Z e Z!edomeZ"e rd d	l#m$Z$ d d
l%m&Z' z
d dl#m(Z( dZ)W n e*y   d dl#m+Z+ dZ)Y nw erdZ,e"rzd dl-m.Z.m/Z/m0Z0 W n e*y   e*dw zd dl1Z2dZ,W n e*y   dZ,Y nw e3e4Z5e de6fddZ7e7 rej8Z9dZ:n	ej;Z9e<e9j=Z:e: Z>edgddej?dej?dej?dej?dej?ddfddZ@ejAdejBfddZCejAdejBd ejBfd!d"ZDd#e9dddfd$ej?d%eEd&eFd'ejGd(e6d)e6d*e6de
ej?ej?f fd+d,ZHeHZId$ej?d%eEd-ejGd(e6d)e6d*e6d.e	ej? de
ej?ej?f fd/d0ZJ	#					dd$ej?d%eEd-ejGd&eFd(e6d)e6d*e6d1e6d.e	ej? de
ej?ej?f fd2d3ZKd(e6d)e6d*e6fd4d5ZL	#						dd$ej?d%eEd&eFd(e6d)e6d*e6d1e6d.e	ej? d6e	e6 fd7d8ZM	#						dd$ej?d%eEd-ejGd&eFd(e6d)e6d*e6d1e6d.e	ej? d6e	e6 fd9d:ZNe9fd$ej?d'ejGfd;d<ZOejAdejBd=ejBfd>d?ZP	dd$ej?d@ej?dAe6de
ej?ej?f fdBdCZQejAdDejBdEejBdFejBdGejBdHejBf
dIdJZRejAdDejBdEejBdFejBdGejBdHejBf
dKdLZSe jdMeEdNeEdOeEdPeEde	eeEef  f
dQdRZTdSdT ZUerrdUdV ZVdWdT ZUejWfdej?dej?dej?dej?dXeeE dYejGde
eEeEeEf fdZd[ZXdej?dej?dej?dej?dXeeE dYejGdej?fd\d]ZYejWfdej?dej?dej?dej?dXeeE dYejGdej?fd^d_ZZejWfdej?dej?dej?dej?dXeeE dYejGdej?fd`daZ[ejAdbejBdMejBdNejBdcejBddejBdeejBdfejBdgejBdhejBdiejBdjejBfdkdlZ\dmdndmdodpdqej?drej?dsej?dtej?dYejGdueEdOeEdPeEdveEdej?fdwdxZ]ejAdyejBfdzd{Z^ejAdyejBfd|d}Z_	~dd$ej?dej?d&eFde
ej?ej?f fddZ`ejAdejBdejBfddZadmd~e9fd$ej?d%eEd&eFd'ejGde
ej?ej?f f
ddZb	 erdd Zcdd Zddd Ze			ddej?de	ej? de	eE de6defej?ej?f f
ddZgn			ddej?de	ej? de	eE de6defej?ej?f f
ddZgejhdd dD g ddZiejAdNejBdfejBdejBddejBfddZje!seiejZj	ddej?dej?dej?d%eEdeEde
ej?ej?f fddZkejAdej?dej?dej?deEdejBdDejBdFejBfddZl	ddqej?dej?deEdej?fddZmd$ej?fddZnejAdejBdDejBdEejBdFejBdejBdejBfddZo					ddej?dej?dej?dej?depejG de	ej? deEdeEdeEdej?fddZqe re)rejrsddd Ztn
ejrsddd Ztejrsddd ZtdS dS )    N)	lru_cache)AnyDictListOptionalTuple)TensorDescriptor)deep_gemm_wrapper)
ceil_alignget_bool_env_varget_device_core_countget_device_nameis_cpuis_cudais_hiplog_info_on_rank0)register_custom_opSGLANG_USE_AITER)sgl_per_token_quant_fp8)per_tensor_quant_fp8)sgl_per_token_group_quant_8bitT)sgl_per_token_group_quant_fp8F)dynamic_per_tensor_quantdynamic_per_token_scaled_quantstatic_per_tensor_quantz6aiter is required when SGLANG_USE_AITER is set to Truereturnc                   C   s   t rdtjdjv S dS )Ngfx94r   F)_is_hiptorchcudaget_device_propertiesgcnArchName r"   r"   ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/fp8_kernel.pyis_fp8_fnuzY   s   r$         l@C)mutates_argsAAsBBsc                 C   s   t | |f||f| d S N)r	   gemm_nt_f8f8bf16)r(   r)   r*   r+   r&   r"   r"   r#   deep_gemm_fp8_fp8_bf16_ntj   s   r.   BLOCKc	                 C   s   t d}	| |	| 7 } ||	| 7 }||	7 }t d|}
|
|k }t j| |
 |ddt j}t t t ||}|| }d| }t 	|| |||j
j}t j||
 ||d t || dS )zA Triton-accelerated function to perform per-token-group quantization on a
    tensor.

    This function converts the tensor values into float8 values.
    r           maskother      ?r2   N)tl
program_idarangeloadtofloat32maximummaxabsclampdtype
element_tystore)y_ptry_q_ptry_s_ptry_strideNepsbit8_minbit8_maxr/   g_idcolsr2   y_absmaxy_sy_s_invy_qr"   r"   r#   _per_token_group_quant_8bitu   s   
rR   SCALE_UE8M0c              	   C   s
  t d}| |t j| 7 } ||t j| 7 }|| }|| }|| }||| | 7 }t d|	}||k }t j| | |ddt j}t t t 	||}|| }|
ret 
t t t 	|}t || |||jj}t j|| ||d t || dS )zA Triton-accelerated function to perform per-token-group
    quantization on a tensor.
    This function converts the tensor values into float8 values.
    r   r0   r1   r5   N)r6   r7   r:   int64r8   r9   r;   r<   r=   r>   exp2ceillog2r?   r@   rA   rB   )rC   rD   rE   
group_sizey_num_columnsy_s_col_striderH   rI   rJ   r/   rS   rK   blocks_per_row	scale_col	scale_rowrL   r2   rM   rN   rO   rQ   r"   r"   r#   $_per_token_group_quant_8bit_colmajor   s"   
r^   绽|=xrX   rH   r@   column_major_scalesscale_tma_alignedscale_ue8m0c                 C   s  | j d | dksJ d|  sJ dtr%|tjkrd}nd}| }n|tjkr0t|}	nt|}	|	j}|	j}tj	| | j
|d}
t| j | j
|||dd	}|  | }|}t|}tt|d
 dd}d}|rt|f | |
||| j d |d|||||||d n|rJ t|f | |
|||||||||d |rddlm} |dksJ ||d|
j d |
j d d||fdd}|
|fS )a'  Function to perform per-token-group quantization on an input tensor `x`.

    It converts the tensor values into signed float8 values and returns the
    quantized tensor along with the scaling factor used for quantization.

    Args:
        x: The input tensor with ndim >= 2.
        group_size: The group size used for quantization.
        eps: The minimum to avoid dividing zero.
        dtype: The dype of output tensor.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization.
    r   =the last dimension of `x` cannot be divisible by `group_size``x` is not contiguousg     _@r%   devicer@   Fx_shaperh   rX   ra   rb   rc            )rI   rJ   r/   	num_warps
num_stagesrS   )rI   rJ   r/   rn   ro   !transform_sf_into_required_layout   NT
num_groupsmnkrecipeis_sfa)shapeis_contiguousr   r   int8iinfofinfor=   min
empty_likerh   -create_per_token_group_quant_fp8_output_scalenumeltritonnext_power_of_2r^   striderR   	deep_gemmrq   )r`   rX   rH   r@   ra   rb   rc   rJ   rI   infox_qx_sMrG   r/   rn   ro   rq   r"   r"   r#   _per_token_group_quant_8bit_raw   s   


	
	r   	dst_dtypemasked_mc                 C   sH  ddl m} ddlm} |sJ |sJ |sJ |  dk}	|	r;| j\}
}| d} |d u s0J tj|
g| j	tj
d}tjg | jd d | jd d R | j	|d}tjg | jd d | jd d | R | j	tjd}|| |||||d |dks~J |||jd |jd	 |jd d
||fdd}|	r|d}|d}||fS )Nr   rp   )"silu_and_mul_masked_post_quant_fwd   rg   rd   )inputoutputoutput_scalequant_group_sizer   rc   rr   rl   Trs   )r   rq   $sglang.srt.layers.moe.ep_moe.kernelsr   dimry   	unsqueezer   tensorrh   int32zerosr;   squeeze)r`   rX   r   ra   rb   rc   r   rq   r   needs_unsqueeze
num_tokens_r   output_scale_for_kernelr   r"   r"   r#   -_per_token_group_quant_8bit_fuse_silu_and_mulB  sT   

 $		

r   fuse_silu_and_mulc	           	   	   C   s0   |rt | ||||||dS t| ||||||dS )N)r`   rX   r   ra   rb   rc   r   )r`   rX   rH   ra   rb   rc   r@   )r   r   )	r`   rX   r   rH   ra   rb   rc   r   r   r"   r"   r#   per_token_group_quant_8bit  s&   
r   c                 C   s6  |r>|r|sJ | ^ }}}||d }	}
t |	d}t |
d}tjg ||d |R |tjddddd |	d d f S |r|ro| d d d d }tj| d d | d | |f |tjdddd | d d d f S tj| d | f| d d  |tjdddS tj| d d | d | f |tjdS )Nrr      rg   rd   r   .   )r
   r   emptyint	transposer;   permute)rj   rh   rX   ra   rb   rc   x_batchx_q_mnx_q_kx_s_mnx_s_k
aligned_mn	aligned_kaligned_sizer"   r"   r#   r     sF   

r   	enable_v2c	                 C   s   | j d | dksJ d|  sJ dg | j d d | j d |r%dnd R }	tj|	| jtd}
t|	| j||||d}| j d dkritrZt| |
|||t	t
||||d	 |
|fS |r^J t| |
|||t	t
| |
|fS )
Nrd   r   re   rf   r   rl   rg   ri   )r   )ry   rz   r   r   rh   	fp8_dtyper   %enable_sgl_per_token_group_quant_8bitr   fp8_minfp8_maxr   )r`   rX   rH   ra   rb   rc   r   r   r   	out_shaper   r   r"   r"   r#    sglang_per_token_group_quant_fp8  sF   *	r   c
                 C   sf   ddl m}
 |tjkr&|rJ |rJ |rJ |d u sJ |
| ||||	dS t| ||||||||	d	S )Nr   )!sglang_per_token_group_quant_int8)r`   rX   rH   r@   r   )	r`   rX   rH   ra   rb   rc   r   r   r   )*sglang.srt.layers.quantization.int8_kernelr   r   r{   r   )r`   rX   r   rH   ra   rb   rc   r   r   r   r   r"   r"   r#   !sglang_per_token_group_quant_8bit  s0   
r   c                 C   sR   |   sJ dtj| | j|d}tj| jd d| jtjd}t| || ||fS )Nrf   rg   r   rl   )rz   r   r   rh   r   ry   r;   r   )r`   r@   r   r   r"   r"   r#   sglang_per_token_quant_fp89  s   r   REPEAT_SCALEc
                 C   s   t d}
| |
| 7 } ||
| 7 }|	r||
7 }t d|}||k }t j| | |ddt j}t |t j}d| }t || |||jj}t j	|| ||d |	r^t 	|| dS dS )zA Triton-accelerated function to perform quantization using the given scale on a
    tensor

    This function converts the tensor values into float8 values.
    r   r0   r1   r4   r5   N
r6   r7   r8   r9   r:   r;   r?   r@   rA   rB   )rC   rD   rE   y_s_repeat_ptrrF   rG   r   r   r/   r   rK   rL   r2   rM   rO   rP   rQ   r"   r"   r#   _static_quant_fp8L  s   
r   r   repeat_scalec           
      C   s   |   sJ d| dksJ dtj| | jtd}|  | jd  }| jd }|r8tj|df| jtjd}nd}t	
|}tt|d dd}d}	t|f | |||||tt||||	d	 |rb|n|}||fS )
a@  Function to perform static quantization using the given scale on an input tensor `x`.

    It converts the tensor values into signed float8 values and returns the
    quantized tensor along with the scaling factor used for quantization.

    Args:
        x: The input tensor with ndim >= 2.
        x_s: The quantization scale.
        repeat_scale: Whether to broadcast per-tensor scale to per-channel scale.
        dtype: The dype of output tensor.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization.
    rf   rl   zonly supports per-tensor scalerg   rd   Nrk   rm   )r   r   r/   r   rn   ro   )rz   r   r   r   rh   r   ry   r   r;   r   r   r~   r=   r   r   r   )
r`   r   r   r   r   rG   
x_s_repeatr/   rn   ro   r"   r"   r#   static_quant_fp8w  s>   

r   BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_Mneeds_maskingc           6      C   s   t jdd}t ||}t ||}|| }|| }|| }t|| |}|||  } || | }!| | t d| | }"|!| t d| | }#t d|}$| |"dddf |
 |$dddf |   }%||$dddf | |#dddf |   }&||"|  }'|#| }(||(|  })||	 }*t j||ft jd}+tdt ||D ]q},|rt j|%|$dddf ||,|  k dd}-t j|&|$dddf ||,|  k dd}.n
t |%}-t |&}.t |'}/t |)}0|+t 	|-|.|/dddf  |0dddf  7 }+|%|| 7 }%|&|| 7 }&|'|*| 7 }'|)|*| 7 })q|j
jt jkr#|+t j}1n|j
jt jkr2|+t j}1n|+t j}1| | t d| }2|!| t d| }3|||2dddf   ||3dddf   }4|2dddf |k |3dddf |k @ }5t j|4|1|5d dS )Triton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with block-wise quantization, and store the result in output
    tensor `C`.
    r   axisNr@   r0   r1   r5   r6   r7   cdivr~   r8   r   r;   ranger9   dotr@   rA   bfloat16r:   float16rB   )6r(   r*   r&   r)   r+   r   rG   Kgroup_ngroup_k	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnstride_As_mstride_As_kstride_Bs_kstride_Bs_nr   r   r   r   r   pid	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mpid_mpid_noffs_amoffs_bnoffs_ka_ptrsb_ptrsAs_ptrsoffs_bsnBs_ptrsscale_step_kaccumulatorrv   aba_sb_scoffs_cmoffs_cnc_ptrsc_maskr"   r"   r#   _w8a8_block_fp8_matmul  sT   &,,(*



0,(r   c           7      C   s  t jdd}t ||}t ||}|| }|| }|| }t|| |}|||  } || | }!| | t d| | }"|!| t d| | }#t d|}$| |"dddf |
 |$dddf |   }%||$dddf | |#dddf |   }&||"|  }'|#| }(||(|  })||	 }*t j||ft jd}+d},tdt |||, D ]}-|rt j|%|$dddf ||-|, |  k dd}.t j|&|$dddf ||-|, |  k dd}/n
t |%}.t |&}/t |'}0t |)}1|+t 	|.|/|0dddf  |1dddf  7 }+|%|| 7 }%|&|| 7 }&|'|*| 7 }'|)|*| 7 })|rPt j|%|$dddf ||-|, d |  k dd}.t j|&|$dddf ||-|, d |  k dd}/n
t |%}.t |&}/t |'}0t |)}1|+t 	|.|/|0dddf  |1dddf  7 }+|%|| 7 }%|&|| 7 }&|'|*| 7 }'|)|*| 7 })|rt j|%|$dddf ||-|, d	 |  k dd}.t j|&|$dddf ||-|, d	 |  k dd}/n
t |%}.t |&}/t |'}0t |)}1|+t 	|.|/|0dddf  |1dddf  7 }+|%|| 7 }%|&|| 7 }&|'|*| 7 }'|)|*| 7 })|r@t j|%|$dddf ||-|, d
 |  k dd}.t j|&|$dddf ||-|, d
 |  k dd}/n
t |%}.t |&}/t |'}0t |)}1|+t 	|.|/|0dddf  |1dddf  7 }+|%|| 7 }%|&|| 7 }&|'|*| 7 }'|)|*| 7 })q|j
jt jkr|+t j}2n|j
jt jkr|+t j}2n|+t j}2| | t d| }3|!| t d| }4|||3dddf   ||4dddf   }5|3dddf |k |4dddf |k @ }6t j|5|2|6d dS )r   r   r   Nr   r   r0   r1   rl   r   r   r5   r   )7r(   r*   r&   r)   r+   r   rG   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   UNROLL_FACTORrv   r   r   r   r   r   r   r   r   r   r"   r"   r#   !_w8a8_block_fp8_matmul_unrolledx4
  s   &,,



0""



0""



0""



0,(r   rG   r   block_nblock_kc                 C   s   t j rdS t dd}d|  d| d| d| d| d	}tjtjtj	t
d
|}tj|r_t|}ttd| d dd t| D W  d   S 1 sZw   Y  td| dS )a}  
    Return optimized configurations for the w8a8 block fp8 kernel.

    The return value will be a dictionary that maps an irregular grid of
    batch sizes to configurations of the w8a8 block fp8 kernel. To evaluate the
    kernel on a given batch size bs, the closest batch size in the grid should
    be picked and the associated configuration chosen to invoke the kernel.
    N r   zN=z,K=z,device_name=z,dtype=fp8_w8a8,block_shape=[z, z].jsonconfigszUsing configuration from z for W8A8 Block FP8 kernel.c                 S   s   i | ]	\}}t ||qS r"   )r   ).0keyvalr"   r"   r#   
<dictcomp>  s    z.get_w8a8_block_fp8_configs.<locals>.<dictcomp>ziUsing default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at %s)r   _dynamois_compilingr   replaceospathjoindirnamerealpath__file__existsopenr   loggerjsonr9   itemswarning)rG   r   r   r   device_namejson_file_nameconfig_file_pathfr"   r"   r#   get_w8a8_block_fp8_configs  s(   
$

 
r  c                 C   s   t S r,   )r   r   rG   METAr"   r"   r#   #select_w8a8_block_fp8_matmul_kernel  s   r  c                 C   s.   t | |d t ||d  }|t k d S Nr   r   )r   r   r   )r   rG   r  num_workgroupsr"   r"   r#   $use_w8a8_block_fp8_matmul_unrolledx4  s   r  c                 C   s   t | ||rtS tS r,   )r  r   r   r  r"   r"   r#   r    s   
block_sizeoutput_dtypec                 C   s  t |dksJ |d |d }}| jd |jd ksJ | jd d |jd d ks-J |  s3J |jtjkrJt| jd ||jd ksIJ n*|jtjkrrtt| jd |d|jd ksqJ d| jd|jd|nt	| 
 | jd  }|jdksJ | sJ |jdksJ |j\}	}
|jtjkrt|	||jd ksJ t|
||jd ksJ n=|jtjkr|	|jd ksJ d	|jd
|jd|tt|
|d|jd ksJ d	|jd
|jd|nt	| jd d |	f }| j||d}||	|
|fS )Nr   r   rl   rd   r   zA.shape=z
 As.shape=z block_size=zB.shape=z
 Bs.shape=r   )lenry   rz   r@   r   floatr   r   r   NotImplementedErrorr   ndim	new_empty)r(   r*   r)   r+   r  r   r   r   r   rG   r   C_shaper&   r"   r"   r#   prepare_block_fp8_matmul_inputs   s:    "&
, r'  c           
      C   sD   t | |||||\}}}}	|	jtjkrtjsJ t| ||||	 |	S r,   )r'  r@   r   r   r	   ENABLE_JIT_DEEPGEMMr.   )
r(   r*   r)   r+   r  r   r   rG   r   r&   r"   r"   r#   w8a8_block_fp8_matmul_deepgemm0  s   r)  c                    s$  t | |||||\ }}|\}}	t||d |d }
|
r-|
t|
  fddd }nd|d |d ddd	d
}t||d  dk} fdd}t |}|| | |||| |||	| d| d|d|d|d|d|d|d|d|dfi |d|i |S )a  This function performs matrix multiplication with block-wise quantization.

    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.

    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128].
        output_dytpe: The dtype of the returned tensor.

    Returns:
        torch.Tensor: The result of matmul.
    r   rl   c                    s   t |   S r,   )r>   r`   )r   r"   r#   <lambda>c  s    z.w8a8_block_fp8_matmul_triton.<locals>.<lambda>)r  @       r   r   )r   r   r   r   rn   ro   r   c                    "   t  | d t | d  fS r  r   r   r  r   rG   r"   r#   gridr  s   z*w8a8_block_fp8_matmul_triton.<locals>.gridr   rd   r   )r'  r  r~   keysboolr  r   )r(   r*   r)   r+   r  r   r   r&   r   r   r   configr   r2  kernelr"   r1  r#   w8a8_block_fp8_matmul_tritonB  sT   	r7  c                 C   s8   |t jkrtjrt| |||||dS t| |||||dS )N)r   )r   r   r	   r(  r)  r7  )r(   r*   r)   r+   r  r   r"   r"   r#   w8a8_block_fp8_matmul  s   r8  r   output_typeBLOCK_MBLOCK_NBLOCK_Krep_mrep_nrep_k
NUM_STAGESc           #   
   C   s  |dkrt j}n|dkrt j}n|dkrt j}t jdd}t ||	}|| }|| }||	 }||
 }d}d}|| }|| }d}d}t j|	|
ft jd}t jdt |||dD ]h}| ||g}|||g} |d||ddg}!|d||ddg}"|!	||ddd
dd	ddd	|	|| }!|"	||ddd
dd	ddd	|
|| }"t ||!d
| j|"d
|}||7 }||7 }||7 }qY|||g|| d S )Nr   rl   r   r   r-  r   )ro   r   r   e4m3)r6   r;   r   r   r7   r   r   r   r9   reshapetrans
dot_scaledTrB   r:   )#a_desca_scale_descb_descb_scale_descc_descr   rG   r   r9  r:  r;  r<  r=  r>  r?  r@  r   r   r   r   r   r   r   offs_k_aoffs_k_boffs_scale_moffs_scale_noffs_scale_kVEC_SIZEr   rv   r   r   scale_ascale_br"   r"   r#   !_mxfp8_block_scaled_matmul_kernel  sN   
rS  rr   rk   r   )block_mr   r   ro   r   a_scaler   b_scalerT  ro   c                C   sB  | j \}	}
|j \}}|
|ksJ |tjkrd}n|tjkr d}n|tjkr(d}ntd| |d }|d }|d d }t| ||g}t|||g}d||ddg}tj||d	}d||ddg}tj||d	}tj|	|f|| j	d
}t|||g}t
|	|t
|| df}t| ||||||	||
|||||||| |S )z6Block-scaled matmul for MXFP8 using Triton dot_scaled.r   rl   r   zUnsupported output dtype: rr   r-  r   rk   )block_shaper@   rh   )ry   r   r;   r   r   
ValueErrorr   from_tensorr   rh   r   r   rS  )r   rU  r   rV  r   rT  r   r   ro   r   r   rG   K_br9  r=  r>  r?  rF  rH  scale_block_shaperG  rI  r   rJ  r2  r"   r"   r#    mxfp8_block_scaled_matmul_triton  sR   




r]  
BLOCK_SIZEc                 C   s   t d}t d}	t d|}
|
|k }| |	| ||  7 } t j| |
 |ddt j}t t t ||}t 	|||  d S )Nr   rl   r0   r1   )
r6   r7   r8   r9   r:   r;   r<   r=   r>   
atomic_max)x_ptrx_s_ptr	head_size
x_stride_h
x_stride_srH   r   r^  seq_idhead_idoffsetr2   r`   rN   r"   r"   r#    _per_tensor_quant_mla_fp8_stage1.  s   

rh  c
                 C   s   t d}
t d}t d|	}||k }t |}d| }| || |
|  7 } ||| | |
|  7 }t j| | |ddt j}t || |||jj}t j	|| ||d d S )Nr   rl   r4   r0   r1   r5   r   )r`  ra  x_q_ptrnum_seqrb  rc  rd  r   r   r^  re  rf  rg  r2   r   x_s_invr`   r   r"   r"   r#    _per_tensor_quant_mla_fp8_stage2E  s   


rl  -q=x_s_outc           	      C   s   |   dks
J d|jdkr|jtjkr|j| jksJ | j|  td}| j\}}}t	
|}||f}t| | ||| d| d|t| t| | ||||| d| dtt|
 ||fS )z
    This function quantizes input values to float8 values with tensor-wise quantization
    and specialized for mla absorbed case.
    r   `x` is not a 3d-tensorrl   r   r   rl   )r   ry   r@   r   r;   rh   r%  sizer   r   r   rh  r   r   rl  r   )	r`   rn  rH   r   num_headrj  rb  r^  r2  r"   r"   r#   per_tensor_quant_mla_fp8b  s>   


rs  	NUM_GROUPc                 C   s   t d}t d}| || ||  7 } ||| ||  7 }|||	 | 7 }|dkr5t || t d t d|}||k }t|D ]J}t j| ||  | |ddt j}t 	t 
t ||}|| }t || |||jj}t j|||  | ||d t |||
  | qCdS )zA Triton-accelerated function to perform per-token-group
    quantization on a tensor for deep_gemm grouped_gemm_masked.
    This function converts the tensor values into float8 values.
    y and y_q: (b, t, k)
    y_s: (b, k//group_size, t)
    r   rl   r0   r1   r5   N)r6   r7   rB   num_programsr8   r   r9   r:   r;   r<   r=   r>   r?   r@   rA   )rC   rD   rE   masked_m_ptrrX   
y_stride_b
y_stride_ty_q_stride_by_q_stride_ty_s_stride_by_s_stride_grH   r   r   rt  r/   t_idb_idrL   r2   gidrM   rN   rO   rQ   r"   r"   r#   /_per_token_group_quant_mla_deep_gemm_masked_fp8  s&   

r  c                 C   s  |   dks
J d| j\}}}|d d d }|| }|| |ks*J d| d| j|||f|d}	| j|||ftjd}
| j|ftjd}t|}||f}t| | |	|
||| 	d| 	d	|		d|		d	|
	d|
	d	|t
 t
|| |	|
d	d
|||fS )z
    This function quantizes input values to float8 values with per-token-group-quantization
    for deep_gemm grouped_gemm_masked and specialized for mla absorbed case.
    r   ro     rk   zk % z must be zeror   r   rl   r   )r   ry   r%  r   r;   r   r   r   r  r   r   r   )r`   rX   rH   r@   r   mrv   	aligned_mnum_tiles_kr   r   r   r^  r2  r"   r"   r#   .per_token_group_quant_mla_deep_gemm_masked_fp8  s:   

r  c           	      C   sh   |j \}}d}| jdddj}tj||d}|t }|| t|| tt	t
}| | dS )zXNative PyTorch fallback for dynamic per-token FP8 quantization when vLLM is unavailable.rm  rl   T)r   keepdimr~   N)ry   r>   r=   valuesr   r?   r   copy_r   r:   r   )	r   r   scaler   rG   rH   absmax	scale_valoutput_datar"   r"   r#   #_native_dynamic_per_token_quant_fp8  s   

r  c                 C   sb   d}|   }tj||d}|t }|d|d t|| ttt	}| | dS )zYNative PyTorch fallback for dynamic per-tensor FP8 quantization when vLLM is unavailable.rm  r  rd   N)
r>   r=   r   r?   r   viewr  r   r:   r   )r   r   r  rH   r  r  r  r"   r"   r#   $_native_dynamic_per_tensor_quant_fp8  s   r  c                 C   s&   t || ttt}| | dS )zMNative PyTorch fallback for static FP8 quantization when vLLM is unavailable.N)r   r?   r   r   r:   r   r  )r   r   r  r  r"   r"   r#   _native_static_quant_fp8!  s   r  r   r  num_token_paddinguse_per_token_if_dynamicc                 C   s  | j dksJ d| j  d| j}|rt|| jd |d f}tj|| jtd}|d u r|rdtj|d df| jtjd}trHt	|| | ||fS t
rZtjj||  |d  ||fS t|| | ||fS tjd| jtjd}trzt|| | ||fS t
rtjj|| | ||fS t|| | ||fS | dksJ d|  trt|| | ||fS t
rtjj|| | ||fS t|| | ||fS )Nr   Expected 2D input tensor, got Dr   rl   rg   !Expected scalar scale, got numel=)r$  ry   r=   r   r   rh   r   r;   
_use_aiterr   	_has_vllmops_C"dynamic_per_token_scaled_fp8_quant
contiguousr  r   r   dynamic_scaled_fp8_quantr  r   r   static_scaled_fp8_quantr  r   r  r  r  ry   r   r"   r"   r#   scaled_fp8_quant'  sP   r  c                 C   s   | j dksJ d| j  d| j}|rt|| jd |d f}tj|| jtd}|d u r\|rFtj|d df| jtjd}t| || ||fS tj	d| jtjd}t
| ||dd ||fS | dkskJ d	|  t
| ||d
d ||fS )Nr   r  r  r   rl   rg   F)	is_staticr  T)r$  ry   r=   r   r   rh   r   r;   r   r   sgl_per_tensor_quant_fp8r   r  r"   r"   r#   r  Y  s0   c                 C   s(   g | ]}d D ]}t jd|i|dqqS ))r   r   rm   r:  )rn   )r   Config)r  rT  rn   r"   r"   r#   
<listcomp>  s    r  )   r-  r,  rr   )r   r<  M_ALIGNMENT)r   r  r  c	                 C   s  t d}	t d}
t ||
d  }t ||
 t j}t || t || |	| t d| }t t ||D ]}|| t d| }| ||  |d d d f |  |d d d f  }||k d d d f ||k d d d f @ }t j||dt j	}t j
t |dd}t j|dtdd}|d	|d d d f   t j}|||  |d d d f |  |d d d f  }t j|||d t ||}|||  |	|  | }t j||d	 ||k d q=d S )
Nr   rl   r   r5   r   g-C6?inf)r~   r=   g      |@)r6   r7   r9   r:   rT   multiple_ofr8   r   r   r;   r=   r>   r?   r"  
float8e4nvrB   )r   expert_offsetsproblem_sizesa_fp8sfar   r<  r  r:  k_offset	expert_idr  current_expert_offsetcoord_kicoord_mr   a_maskinpinp_amaxinp_fp8
a_fp8_ptrsrv   sfa_ptrsr"   r"   r#   ._per_token_group_quant_fp8_hopper_moe_mn_major  s.   

0( .r  rl   r  r  expert_tokens_alignmentc              	   C   s   |   dksJ |  sJ d| jd | dksJ dtj| | jtd}| jd | jd }}|| }tj||f| jtjd}	|jd }
||
f}t	| | ||||	||| ||	fS )Nr   z`A` is not contiguousrd   r   z=the last dimension of `A` cannot be divisible by `group_size`rg   rl   )
r   rz   ry   r   r   rh   r   r   r;   r  )r(   r  r  rX   r  a_qr   r   rv   r  num_expertsr2  r"   r"   r#   -per_token_group_quant_fp8_hopper_moe_mn_major  s,   

r  data_ptrtrans_data_ptrrv   c                 C   sb  t d}t d}t d}	t || }
t || d }||
 }t |
| t || | |
|  }||
|  }|	| t d| }||k }t d||t d D ]Y}|||  t d| }||k }|d d d f | |d d d f  }|d d d f |d d d f |  }|d d d f |d d d f @ }t j|| |d}t j|| ||d qUd S )Nr   rl   r   r5   )r6   r7   r9   r  r8   r   ru  rB   )r  r  r  rv   r  r   r   r  m_idk_idcurr_expert_offsetnext_expert_offsetnum_tokens_of_expertdata_start_ptrtrans_data_start_ptrk_coordk_maskstart_mm_coordm_maskoff	trans_offr2   datar"   r"   r#   _per_group_transpose  s*   



$$ r  c              	      sr   |   dksJ |  sJ d|  \ t| }|dd  fdd}t| | || |ddd	 |S )
Nr   z`a` is not contiguousr   rl   c                    s.   t  d  | d t  | d fS )Nrl   r   r   r/  r0  rv   r  r  r"   r#   r+    s   z%per_group_transpose.<locals>.<lambda>r  rm   )r   r   )r   rz   rq  r   r   r  )r   r  r  trans_ar2  r"   r  r#   per_group_transpose  s   
r  c                 C   sZ   |   }| j}|d dko|d td|d k}|d dko(|d td|d k}|p,|S Nr   rl   )r   ry   r=   )r`   stridessizesis_not_transposeis_transposer"   r"   r#   is_weak_contiguous  s
   ""r  ACCUMULATOR_DTYPEBLOCK_SIZE_SCALE_ABLOCK_SIZE_SCALE_Bc           =      C   s  t jdd}t ||}|| }|| }|}t j||f|d}|| t d|t j }||k }|| t d|t j }||k }t d|t j}|	|d d d f  |
|d d d f   } ||d d d f  ||d d d f   }!t d||dk| |  }"|"|k }#t d||dk| |  }$|$|k }%| |  }&||! }'||" }(||$ })tdt ||D ]M}*||k }+|d d d f |+d d d f @ },t j|&|,d}-|+d d d f |d d d f @ }.t j|'|.d}/t j	|-|/||d}||7 }|&||
 7 }&|'|| 7 }'q|#d d d f t dddk d d d f @ }0t |(d d d f |0}1|1
|df}1|1|t j }|%d d d f t dddk d d d f @ }2t |)d d d f |2}3|3
|df}3|3j|t j }||jj}4|r|}5||5 }6|5|k }7t |6|7}8|4|87 }4|| t d|t j }9|| t d|t j }:|9t j}9|:t j}:|||9d d d f   ||:d d d f   };|9d d d f |k |:d d d f |k @ }<t j|;|4|<d d S )Nr   r   r   rl   r5   )	out_dtype)r6   r7   r   r   r8   r:   rT   r   r9   r   broadcast_tor;   rE  typerA   rB   )=a_ptrb_ptrscale_a_ptrscale_b_ptrc_ptrbias_ptrr   rG   r   r   r   r   r   r   r   r  r   r   r   r  r  r   r   r   r   accumulator_dtyper   
offsets_ammasks_am
offsets_bnmasks_bn	offsets_k	offsets_a	offsets_boffsets_scale_ammasks_scale_amoffsets_scale_bnmasks_scale_bnr   r   scale_a_ptrsscale_b_ptrsrv   masks_kmasks_ar   masks_br   masks_scale_arQ  masks_scale_brR  r   offsets_bias	bias_ptrs	bias_maskbiasr   r   r   r   r"   r"   r#   scaled_mm_kernel  sv   ((

  ,,,(r  r-  weightrQ  rR  r  r  block_size_mblock_size_nblock_size_kc
                    sT  | j \ }
|j d dkr|
dkr dksJ |j d |
ks!J | j|jks)J | dkr5|ddn|}| dkrC|ddn|}|j|jkrO| sQJ |j d dkrf|j d dksh|j d  kshJ |j d dkr}|j d dks|j d ksJ |jsJ |d u s| sJ t| sJ t|sJ  fdd}tj f|| jd}dd }|	rوdk }t	d	t
 }|d	kr|rd
nd}n|dkrd
}n	|dkrd}nd}|\}}}||rdn|}||rdn|}|  rtjntj}t| | ||||| |
| d| d|d|d|d|d||||||d ||S )Nrl   r   rd   c                    r.  r  r/  r0  r1  r"   r#   r+    s   z"triton_scaled_mm.<locals>.<lambda>rX  c                 S   s   | j d dko| j d dkS r  )ry   r*  r"   r"   r#   r+    s    i    r-  )r,  r,  rk   )r,  rr   rk   r,  rr   )r,  rr   rr   )rr   rr   rr   )r   r   r   r  r  )ry   r@   r   rB  is_floating_pointr  r   r   rh   r=   r   r   r6   r;   r   r  r   r:   )r   r  rQ  rR  r  r  r  r   r  use_heuristicr   r2  result
has_scalar
is_small_Nnext_power_of_2_M
tile_shapeblock_size_sablock_size_sbr  r"   r1  r#   triton_scaled_mm  sn   

..


r  z*sgl_kernel::sgl_per_token_group_quant_8bitc                 C      d S r,   r"   r   output_qoutput_srX   rH   r   r   rc   r"   r"   r#   r        r   z)sgl_kernel::sgl_per_token_group_quant_fp8c                 C   r  r,   r"   r  r"   r"   r#   r     r  z#sgl_kernel::sgl_per_token_quant_fp8c                 C   r  r,   r"   )r   r  r  r"   r"   r#   r     s   )r_   FFFFN)r_   FFFFNN)F)rm  )NNFrp  )Nr-  r-  r-  T)u	functoolsr  loggingr  r   typingr   r   r   r   r   r   r   triton.languagelanguager6   triton.tools.tensor_descriptorr   sglang.srt.layersr	   sglang.srt.utilsr
   r   r   r   r   r   r   r   sglang.srt.utils.custom_opr   r   _is_cuda_is_cpur  
sgl_kernelr   &sglang.jit_kernel.per_tensor_quant_fp8r   r  r   r   ImportErrorr   r  aiterr   r   r   vllm._Cvllm	getLogger__name__r  r4  r$   float8_e4m3fnuzr   r   float8_e4m3fnr}   r=   r   Tensorr.   jit	constexprrR   r^   r   r"  r@   r   per_token_group_quant_fp8r   r   r   r   r   r   r   r   r   r   r  r  r  r   r'  r)  r7  r8  rS  r]  rh  rl  rs  r  r  r  r  r  tupler  autotunefp8_autotuner  r  r  r  r  r  r  r  libraryregister_faker   r"   r"   r"   r#   <module>   sd  (



*5
l
N	

!
/	
:	

,
-
8Z 1.	
0

Z
	
L	

?

.3
.4
%
	,

 &
 	
V



