o
    iS3                     @   s   d dl Z d dlmZ d dlZdadadadadada	dd Z
	ddejdejd	ejd
ejdedejfddZdejfdejdedejdeejejf fddZdejfdejdedeejejf fddZ	ddejdejdedejfddZdS )    N)TupleFc                     s^  t rd S da ddlm}  |  sdad S dadd ldd lm ddlm   fdddD }j|g d	d
j	dj
dj
dj
dj
dj
dj
ffddtjjddd	d(dtjdtjdtjdtjdtdtjffdd}|jd(dd }|aj	d!j
ffd"d#}|aj	d!j
ffd$d%}|aj	d!j
ffd&d'}|ad S ))NTr   )
has_tritonFConfigc              	      s4   g | ]}d D ]}dD ]} ||d|ddq
qqS ))    @      )            )BLOCK_SIZE_MBLOCK_SIZE_N   )
num_stages	num_warps ).0block_mblock_nr   r   r   Y/home/ubuntu/.local/lib/python3.10/site-packages/torchao/kernel/blockwise_quantization.py
<listcomp>/   s    
z%_lazy_init_triton.<locals>.<listcomp>)   r   r   r   )NKM_BUCKETBLOCK_SIZE_K)configskeyr   r   r   r   r   r   c                    s<   j dd} j dd} ||}||	  d|	 | }||
  d|
 | } d|}| |d d d f |  |d d d f  }||d d d f |  |d d d f  }|||  }||| |  } j|	|
f jd}t|D ]\} j||d d d f |||  k dd} j||d d d f |||  k dd} |} |}| |||d d d f  |d d d f  7 }||7 }||7 }|d7 }|d7 }qt||j	j
}||	  d|	 }||
  d|
 }||d d d f |  |d d d f  }|d d d f |k |d d d f |k @ } j|||d d S )Nr   axis   dtypeg        )maskotherr$   )
program_idcdivarangezerosfloat32rangeloaddottor#   
element_tystore)a_ptrb_ptrc_ptra_s_ptrb_s_ptrMr   r   r   r   r   r   pid_mpid_nkoffs_moffs_noffs_ka_ptrsb_ptrsa_s_ptrsb_s_ptrsaccumulatoriaba_sb_scc_ptrsr$   tlr   r   blockwise_fp8_gemm_kernel:   s6   ((((

0
((z4_lazy_init_triton.<locals>.blockwise_fp8_gemm_kernelzao::blockwise_fp8_gemmr   )mutates_argsr   rD   rF   rE   rG   
block_sizereturnc           	         s   |   sJ |  sJ |  sJ |  sJ | d}|  |  |dtt }| jg |  d d R dtji} fdd}| | |||| |||d
 |S )Nr   r#   c                    s      | d  | d fS )Nr   r   r(   )METAr7   r   tritonr   r   <lambda>{      zC_lazy_init_triton.<locals>._blockwise_fp8_gemm_op.<locals>.<lambda>)r   )	is_contiguoussizenumelmathceillog2	new_emptytorchbfloat16)	rD   rF   rE   rG   rN   r   r   rH   grid)rL   rT   )r7   r   r   _blockwise_fp8_gemm_opj   s   

(z1_lazy_init_triton.<locals>._blockwise_fp8_gemm_opc                 S   s6   | d}| jg |   d d |R dtji}|S )Nr   rP   r#   )rX   r]   r^   r_   )rD   rF   rE   rG   rN   r   rH   r   r   r   _   s   
(z_lazy_init_triton.<locals>._
BLOCK_SIZEc           	         s    j dd}||  d| } | |  j}  |d }|| }||jj} 	|| |  	|| | dS )a!  
        Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factor in `s_ptr`.

        Args:
            x_ptr (triton.Pointer): Pointer to the input tensor.
            y_ptr (triton.Pointer): Pointer to the output tensor where quantized values will be stored.
            s_ptr (triton.Pointer): Pointer to the output tensor where scaling factors will be stored.
            BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.

        Returns:
            None
        r   r         |@N)
r'   r)   r-   r/   r+   maxabsr#   r0   r1   )	x_ptry_ptrs_ptrrc   pidoffsxsyrJ   r   r   $_fp8_blockwise_act_quant_kernel_impl   s   z?_lazy_init_triton.<locals>._fp8_blockwise_act_quant_kernel_implc                    s   j dd} j dd} ||}||  d| }	||  d| }
|	dddf | |
dddf  }|	dddf |k |
dddf |k @ } j| | |d j}  |d }|| }||jj	} j
|| ||d  
|||  | | dS )aj  
        Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factors in `s_ptr`.

        Args:
            x_ptr (tl.pointer): Pointer to the input tensor.
            y_ptr (tl.pointer): Pointer to the output tensor where quantized values will be stored.
            s_ptr (tl.pointer): Pointer to the output tensor where scaling factors will be stored.
            M (int): Number of rows in the weight matrix.
            N (int): Number of columns in the weight matrix.
            BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.
        r   r   r!   Nr&   rd   )r'   r(   r)   r-   r/   r+   re   rf   r#   r0   r1   )rg   rh   ri   r7   r   rc   r8   r9   nr;   r<   rk   r$   rl   rm   rn   rJ   r   r   '_fp8_blockwise_weight_quant_kernel_impl   s   $(zB_lazy_init_triton.<locals>._fp8_blockwise_weight_quant_kernel_implc                    s    j dd} j dd} ||}||  d| }	||  d| }
|	dddf | |
dddf  }|	dddf |k |
dddf |k @ } j| | |d j} |||  | }|| } j|| ||d dS )a%  
        Dequantizes weights using the provided scaling factors and stores the result.

        Args:
            x_ptr (tl.pointer): Pointer to the quantized weights.
            s_ptr (tl.pointer): Pointer to the scaling factors.
            y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights.
            M (int): Number of rows in the weight matrix.
            N (int): Number of columns in the weight matrix.
            BLOCK_SIZE (tl.constexpr): Size of the block for tiling.

        Returns:
            None
        r   r   r!   Nr&   )r'   r(   r)   r-   r/   r+   r1   )rg   ri   rh   r7   r   rc   r8   r9   rp   r;   r<   rk   r$   rl   rm   rn   rJ   r   r   )_fp8_blockwise_weight_dequant_kernel_impl   s   $(zD_lazy_init_triton.<locals>._fp8_blockwise_weight_dequant_kernel_implr   )_triton_initializedtorch.utils._tritonr   _triton_availablerT   triton.languagelanguager   autotunejit	constexprr^   library	custom_opTensorintregister_fake_blockwise_fp8_gemm_impl_fp8_blockwise_act_quant_kernel"_fp8_blockwise_weight_quant_kernel$_fp8_blockwise_weight_dequant_kernel)r   fp8_gemm_configsra   rb   ro   rq   rr   r   )r   rL   rK   rT   r   _lazy_init_triton   s   
	
,r   r   rD   rF   rE   rG   rN   rO   c                 C   s"   t   ts	tdt| ||||S )Nunsupported without triton)r   rv   AssertionErrorr   )rD   rF   rE   rG   rN   r   r   r   blockwise_fp8_gemm   s   r   rl   r#   c                    s   t   ts	tdddl  sJ dd| dks&J d| d|tjtjfv s2J dtj	|d	}j
g  dd d| R d
tji} fdd}t| |||d ||fS )a  
    Quantizes the input tensor `x` using block-wise quantization with block size being BLOCK_SIZEx1.

    Args:
        x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
        block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.
        dtype (torch.dtype, optional): The dtype to use for the quantized tensor. Default is `torch.float8_e4m3fn`.


    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - The quantized tensor with dtype `dtype`.
            - A tensor of scaling factors with dtype `torch.float32`.
    r   r   NInput tensor must be contiguousrP   z@Last dimension size must be divisible by block_size (block_size=)6dtype must be torch.float8_e4m3fn or torch.float8_e5m2r"   r#   c                    s      | d fS Nrc   )r(   rY   metarT   rl   r   r   rU     s    z)fp8_blockwise_act_quant.<locals>.<lambda>rc   )r   rv   r   rT   rW   rX   r^   float8_e4m3fnfloat8_e5m2
empty_liker]   r+   r   rl   rN   r#   rn   rm   r`   r   r   r   fp8_blockwise_act_quant   s&   
2r   c                    s   t   ts	tdddl|  sJ d|  dksJ d| d| dkr1| d| dks9J d| d	|tjtj	fv sEJ d
|  \ tj
| |d}| j | | tjd} fdd}t| | || |d ||fS )aa  
    Quantizes the given weight tensor using block-wise quantization with block size being BLOCK_SIZExBLOCK_SIZE.

    Args:
        x (torch.Tensor): The weight tensor to be quantized.
        block_size (int, optional): The block size to use for quantization. Defaults to 128.
        dtype (torch.dtype, optional): The dtype to use for the quantized tensor. Defaults to `torch.float8_e4m3fn`.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - The quantized weight tensor with dtype `dtype`.
            - A tensor of scaling factors with dtype `torch.float32`.
    r   r   Nr      z#Input tensor must have 2 dimensionsr!   zABoth dimensions of x must be divisible by block_size (block_size=r   r   r"   c                          | d  | d fS r   rQ   r   rS   r   r   rU   <  rV   z,fp8_blockwise_weight_quant.<locals>.<lambda>r   )r   rv   r   rT   rW   dimrX   r^   r   r   r   r]   r+   r   r   r   rS   r   fp8_blockwise_weight_quant  s*   &
r   rm   c                    s   t   ts	tdddl|  r| sJ d|  dkr%| dks)J d|  \ tj| t	 d} fdd	}t
| | || |d
 |S )a  
    Dequantizes the given weight tensor using the provided scale tensor.

    Args:
        x (torch.Tensor): The quantized weight tensor of shape (M, N).
        s (torch.Tensor): The scale tensor of shape (M, N).
        block_size (int, optional): The block size to use for dequantization. Defaults to 128.

    Returns:
        torch.Tensor: The dequantized weight tensor of the same shape as `x`.

    Raises:
        AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2.
    r   r   Nz Input tensors must be contiguousr   z$Input tensors must have 2 dimensionsr"   c                    r   r   rQ   r   rS   r   r   rU   _  rV   z.fp8_blockwise_weight_dequant.<locals>.<lambda>r   )r   rv   r   rT   rW   r   rX   r^   r   get_default_dtyper   )rl   rm   rN   rn   r`   r   rS   r   fp8_blockwise_weight_dequantD  s    r   rs   )rZ   typingr   r^   rt   rv   r   r   r   r   r   r~   r   r   r   r#   r   r   r   r   r   r   r   <module>   sj    X

'
+