o
    پi                     @   s   d dl mZmZ d dlZd dlZd dlmZ ejdej	dej	dej	dej	fddZ
	dd
ejdedee deejejf fddZdS )    )OptionalTupleN
group_sizeround_scaleBLOCK_MBLOCK_Nc	           "      C   s  t d}	t d}
d}d}d| }|	| }|
| }|t d| }|t d| }||k }||k }|dddf |dddf @ }| |dddf |  |dddf  }t j||ddt j}t |}t j|dd	}t |d
}|rt 	|| }t 
|}t |}n|| }|dddf }|| }t t |||}||dddf |  |dddf  }t j|||d |
}||||   | } |}!t j| ||!d dS )zs
    Triton kernel for activation quantization.

    Each block processes BLOCK_M rows and group_size columns.
    r      g      |g      |@g      ?Ng        )maskother)axisg-C6?)r	   )tl
program_idarangeloadtofloat32absmaxmaximumlog2ceilexp2minimumstore)"X_ptrY_ptrS_ptrMNr   r   r   r   pid_mpid_nfp8_minfp8_maxfp8_max_inv	row_start	col_startrowscolsrow_maskcol_maskr	   x_ptrsxx_absamaxlog_vallog_ceilscalescale_broadcastyy_ptrss_colss_ptrss_mask r7   a/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/nsa/triton_kernel.py_act_quant_kernel	   s>   

 (

(r9      r+   
block_size	scale_fmtreturnc                 C   s  |   sJ d| d| dksJ d| d| d}| d|}|d}tj| tjd}|d|}| jg |  dd || R dtji}|d|| }	d	}
|}t	||
t	||f}|du}t
| |||	|||||
||rydnd
d
 ||fS )a  
    Quantizes the input tensor `x` using block-wise quantization with Triton.

    Args:
        x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
        block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.
        scale_fmt (Optional[str], optional): The format of the scale. Default is None.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - The quantized tensor with dtype `torch.float8_e4m3fn`.
            - A tensor of scaling factors with dtype `torch.float32`.
    zInput tensor must be contiguousr   z@Last dimension size must be divisible by block_size (block_size=))dtypeNr@          )r   r   r   r   
num_stages)is_contiguoussizeviewtorch
empty_likefloat8_e4m3fn	new_emptyr   tritoncdivr9   )r+   r;   r<   r   x_flatr   r2   y_flatss_flatr   r   gridr   r7   r7   r8   	act_quantV   s8   


,
rR   )r:   N)typingr   r   rG   rK   triton.languagelanguager   jit	constexprr9   TensorintstrrR   r7   r7   r7   r8   <module>   s0    	M