o
    پiD3                     @   s  d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	 d dl
Z
d dlZd dlmZ d dlmZmZ e ZerUz
d dlmZ dZW n eyT   d dlmZ dZY nw eeZejdejd	ejfd
dZe
jdfddZejd	ejfddZ de
j!fde
j"de#de$de
j%de	e
j"e
j"f f
ddZ&de
j!dfde
j"de#de$de
j%dee' f
ddZ(ejdejdejdejdejfdd Z)e j*d!e#d"e#d#e#d$e#deee#ef  f
d%d&Z+e
j,fd'e
j"d(e
j"d)e
j"d*e
j"d+ee# d,e
j%de
j"fd-d.Z-dS )/    N)AnyDictListOptionalTuple)get_device_nameis_cuda)sgl_per_token_group_quant_8bitT)sgl_per_token_group_quant_int8FCAL_SUMBLOCKc	                 C   s   t d}	t d|}
|
|k }t j| |	|  |
 |ddt j}t t t |d}|d }|d|  }t j	j
j|t j}|rYt j|dd}t ||	 ||jj t j||	|  |
 ||d t ||	 ||jj d S )Nr           maskother绽|=   axisr   )tl
program_idarangeloadtofloat32maximummaxabsextracuda	libdeviceroundint8sumstoredtype
element_ty)x_ptrxq_ptr	scale_ptr	x_sum_ptrstride_x	stride_xqNr   r   row_idcolsr   xabsmaxscale_xx_qx_sum r6   ^/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/int8_kernel.py_per_token_quant_int8   s   
$r8   c           
      C   s   |   | jd  }| jd }tj| | jtjd}tj| jd d d | j|d}|r:tj| jd d | j| jd}nd }t	|}t
t|d dd}	|  sQJ t|f | |||| d|d||||	dd |rp|||fS ||fS )	Ndevicer&   )      r<      )r,   r-   r.   r   r   	num_warps
num_stages)numelshapetorch
empty_liker;   r#   emptyr&   tritonnext_power_of_2minr   is_contiguousr8   stride)
r1   scale_dtypecal_sumMr.   r4   scalesr5   r   r@   r6   r6   r7   per_token_quant_int8;   s4   
  

rP   c	                 C   s   t d}	| |	| 7 } ||	| 7 }||	7 }t d|}
|
|k }t j| |
 |ddt j}t t t ||}|| }t 	|| |||j
j}t j||
 ||d t || dS )zA Triton-accelerated function to perform per-token-group quantization on a
    tensor.

    This function converts the tensor values into int8 values.
    r   r   r   r   N)r   r   r   r   r   r   r   r   r   clampr&   r'   r%   )y_ptry_q_ptry_s_ptry_strider.   epsint8_minint8_maxr   g_idr0   r   y_absmaxy_sy_qr6   r6   r7   _per_token_group_quant_int8\   s   
r^   r   r1   
group_sizerV   r&   returnc                 C   s   | j d | dksJ d|  sJ dt|}|j}|j}tj| | j|d}|  | }|}	tj	| j dd | j d | f | jtj
d}
t|	}tt|d dd	}d}t|f | ||
||	||||||d
 ||
fS )aW  Function to perform per-token-group quantization on an input tensor `x`.

    It converts the tensor values into signed int8 values and returns the
    quantized tensor along with the scaling factor used for quantization.

    Args:
        x: The input tensor with ndim >= 2.
        group_size: The group size used for quantization.
        eps: The minimum to avoid dividing zero.
        dtype: The dype of output tensor. Note that only `torch.int8` is supported for now.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization.
    r9   r   =the last dimension of `x` cannot be divisible by `group_size``x` is not contiguousr:   Nr=   r<   r>   )rW   rX   r   r@   rA   )rC   rJ   rD   iinfor   rI   rE   r;   rB   rF   r   rG   rH   r^   )r1   r_   rV   r&   rc   rX   rW   r4   rN   r.   x_sr   r@   rA   r6   r6   r7   per_token_group_quant_int8   s@   

re   	enable_v2c           
   
   C   s   | j d | dksJ d|  sJ dt|}|j}|j}tj| | j|d}tj| j d d | j d | f | jtj	d}	t
rRt| ||	|||||d ||	fS |rVJ t| ||	|||| ||	fS )Nr9   r   ra   rb   r:   )rf   )rC   rJ   rD   rc   r   rI   rE   r;   rF   r   %enable_sgl_per_token_group_quant_8bitr	   r
   )
r1   r_   rV   r&   rf   rc   rX   rW   r4   rd   r6   r6   r7   !sglang_per_token_group_quant_int8   s*   
rh   BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_Mc           6      C   s  t jdd}t ||}t ||}|| }|| }|| }t|| |}|||  }|| | } || t d| | }!| | t d| | }"t d|}#| |!dddf |
 |#dddf |   }$||#dddf | |"dddf |   }%||!|  }&|"| }'||'|  }(t j||ft jd})tdt ||D ]l}*t j|$|#dddf ||*|  k dd}+t j|%|#dddf ||*|  k dd},|*| }-|-|	 }.t |&|.|  }/t |(|.|  }0|)t 	|+|,
t j|/dddf  |0dddf  7 })|$|| 7 }$|%|| 7 }%q|jjt jkr|)
t j}1n|jjt jkr)|)
t j}1n|)
t j}1|| t d| }2| | t d| }3|||2dddf   ||3dddf   }4|2dddf |k |3dddf |k @ }5t j|4|1|5d dS )zTriton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with block-wise quantization, and store the result in output
    tensor `C`.
    r   r   Nr&   r   r   r   )r   r   cdivrI   r   zerosr   ranger   dotr   r&   r'   bfloat16float16r%   )6ABCAsBsrN   r.   Kgroup_ngroup_k	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnstride_As_mstride_As_kstride_Bs_kstride_Bs_nri   rj   rk   rl   pid	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mpid_mpid_noffs_amoffs_bnoffs_ka_ptrsb_ptrsAs_ptrsoffs_bsnBs_ptrsaccumulatorkabk_startoffs_ksa_sb_scoffs_cmoffs_cnc_ptrsc_maskr6   r6   r7   _w8a8_block_int8_matmul   sL   %,,((8,(r   r.   ry   block_nblock_kc                 C   s   t  dd}d|  d| d| d| d| d}tjtjtjtd	|}tj|rUt	|}t
d
| dd t| D W  d   S 1 sPw   Y  t
d| dS )a}  
    Return optimized configurations for the w8a8 block fp8 kernel.

    The return value will be a dictionary that maps an irregular grid of
    batch sizes to configurations of the w8a8 block fp8 kernel. To evaluate the
    kernel on a given batch size bs, the closest batch size in the grid should
    be picked and the associated configuration chosen to invoke the kernel.
     _zN=z,K=z,device_name=z,dtype=int8_w8a8,block_shape=[z, z].jsonconfigsz7Using configuration from %s for W8A8 Block INT8 kernel.c                 S   s   i | ]	\}}t ||qS r6   )int).0keyvalr6   r6   r7   
<dictcomp>U  s    z/get_w8a8_block_int8_configs.<locals>.<dictcomp>NzjUsing default W8A8 Block INT8 kernel config. Performance might be sub-optimal! Config file not found at %s)r   replaceospathjoindirnamerealpath__file__existsopenloggerinfojsonr   itemswarning)r.   ry   r   r   device_namejson_file_nameconfig_file_pathfr6   r6   r7   get_w8a8_block_int8_configs9  s$   $
 
r   rt   ru   rw   rx   
block_sizeoutput_dtypec                    s  t |dksJ |d |d }}| jd |jd ksJ | jdd |jdd kr/|  s1J t| jd ||jd ksAJ |  | jd   |jdkrX| rX|jdksZJ |j\}t||jd kslJ t|||jd ksyJ | jdd f }	| j|	|d}
t||d |d }|r|t	|
  fddd	 }nd
|d |d dddd} fdd}t| | ||
|| |||| d| d|d|d|
d|
d|d|d|d|dfi | |
S )a  This function performs matrix multiplication with block-wise quantization.

    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.

    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128].
        output_dytpe: The dtype of the returned tensor.

    Returns:
        torch.Tensor: The result of matmul.
       r   r<   r9   Nrm   c                    s   t |   S )N)r   )r1   )rN   r6   r7   <lambda>  s    z(w8a8_block_int8_matmul.<locals>.<lambda>)r   @             )ri   rj   rk   rl   r@   rA   c                    s"   t  | d t | d  fS )Nri   rj   )rG   rn   )METArN   r.   r6   r7   grid  s   z$w8a8_block_int8_matmul.<locals>.gridr?   )lenrC   rJ   rG   rn   rB   ndim	new_emptyr   rI   keysr   rK   )rt   ru   rw   rx   r   r   r   r   ry   C_shaperv   r   configr   r6   r   r7   w8a8_block_int8_matmulc  s`   (  
	r   ).	functoolsr   loggingr   typingr   r   r   r   r   rD   rG   triton.languagelanguager   sglang.srt.utilsr   r   _is_cuda
sgl_kernelr	   rg   ImportErrorr
   	getLogger__name__r   jit	constexprr8   r   rP   r^   r#   Tensorr   floatr&   re   boolrh   r   	lru_cacher   rs   r   r6   r6   r6   r7   <module>   s    
	!,
>
#T/