o
    پi                     @   sr   d dl mZ d dlZd dlZddlmZmZmZ ddlm	Z	m
Z
mZ ddd	Zdd
dZ						dddZdS )    )OptionalN   )gemm_kernel!gemm_kernel_descriptor_persistentgemm_kernel_persistent)check_device	check_dimcheck_input      ?        c                    s  t |  t| |g td|  td| |dur&t | t|g td| | jd |jd ks4J d| j|jks>J d|dur^| jd |jd ksPJ d|jd |jd ks^J d| j\ }|j\}| j}|ro|n	|tjkrv|ntj}|du s|j|ksJ d	|du rtj f| j	|d
n|}tj
dj}	du r|	nt|	 fdd}
t|
 | || || d| d|d|d|d|d||d |S )a  
    GEMM operation with SM constraint by Triton.
    C = alpha * (a @ b.T) + beta * C

    Args:
        a: The first input matrix. Shape: (M, K)
        b: The second input matrix. Shape: (K, N)
        c: The output matrix. Shape: (M, N). In-place epilogue is supported. Expected to be out_dtype (if not specified, same as a.dtype, but fp8 --> bf16).
        alpha: The scaling factor for the product of a and b.
        beta: The scaling factor for the output matrix c.
        out_dtype: The dtype of the output matrix. Default: fp8 --> bf16. Otherwise, same as a.dtype.
        num_sms: The number of SMs to use for the computation.
       Nr   r   'Incompatible dimensions between a and b#Incompatible dtypes between a and b'Incompatible dimensions between a and c'Incompatible dimensions between b and c+Incompatible dtypes between c and out_dtypedevicedtypecudac                    (   t t | d t| d  fS NBLOCK_SIZE_MBLOCK_SIZE_NmintritoncdivMETAMNnum_sms X/home/ubuntu/.local/lib/python3.10/site-packages/flashinfer/triton/sm_constraint_gemm.py<lambda>G   
   z!gemm_persistent.<locals>.<lambda>)alphabetaNUM_SMS)r	   r   r   shaper   torchfloat8_e4m3fnbfloat16emptyr   r   get_device_propertiesmulti_processor_countr   r   stride)abcr(   r)   	out_dtyper#   Kr   r*   gridr$   r    r%   gemm_persistent   s^   






"r9   c           	         s  t |  t| |g td|  td| |dur&t | t|g td| | jd |jd ks4J d| j|jks>J d|dur^| jd |jd ksPJ d|jd |jd ks^J d| j\ }|j\}| j}|ro|n	|tjkrv|ntj}|du s|j|ksJ d	|du rtj f| j	|d
n|} fdd}t
| | || || d| d|d|d|d|d||d |S )a  
    GEMM operation without SM constraint by Triton.
    C = alpha * (a @ b.T) + beta * C

    Args:
        a: The first input matrix. Shape: (M, K)
        b: The second input matrix. Shape: (K, N)
        c: The output matrix. Shape: (M, N). In-place epilogue is supported. Expected to be out_dtype (if not specified, same as a.dtype, but fp8 --> bf16).
        alpha: The scaling factor for the product of a and b.
        beta: The scaling factor for the output matrix c.
        out_dtype: The dtype of the output matrix. Default: fp8 --> bf16. Otherwise, same as a.dtype.
        num_sms: The number of SMs to use for the computation.
    r   Nr   r   r   r   r   r   r   r   c                    s"   t  | d t | d  fS r   )r   r   r   r!   r"   r$   r%   r&      s   zgemm.<locals>.<lambda>)r(   r)   )r	   r   r   r+   r   r,   r-   r.   r/   r   r   r2   )	r3   r4   r5   r(   r)   r6   r7   r   r8   r$   r:   r%   gemmb   sX   






"r;   Fc                    s  t |  t | t| |g td|  td| |dur*t | t|g td| | jd |jd ks8J d| j|jksBJ d|durb| jd |jd ksTJ d|jd |jd ksbJ d| j\ }|j\}| j}	|rs|n	|	tjkrz|	ntj}|	tjkr|d	ksJ d
d	ksJ d
n|dksJ d
dksJ d
|du rtj f| j	|dn|}tj
dj}
du r|
nt|
dtdtdtt fdd}t|  fdd}t| | || |||d|	tjkrdnddddd|d |S )a  
    GEMM operation with SM constraint by Triton.
    Requires TMA support and descriptor creation.
    C = alpha * (a @ b.T) + beta * C

    Note:
        - K and N must be greater than 16B.
        - Support float16, float8_e4m3fn, bfloat16.
        - float32 is not supported due to performance issues.

    Args:
        a: The first input matrix. Shape: (M, K)
        b: The second input matrix. Shape: (N, K)
        c: The output matrix. Shape: (M, N). In-place epilogue is supported. Expected to be out_dtype (if not specified, same as a.dtype, but fp8 --> bf16).
        alpha: The scaling factor for the product of a and b.
        beta: The scaling factor for the output matrix c.
        out_dtype: The dtype of the output matrix. Default: fp8 --> bf16. Otherwise, same as a.dtype.
        num_sms: The number of SMs to use for the computation.
        EPILOGUE_SUBTILE: Whether to use the epilogue subtile optimization.
    r   Nr   r   r   r   r   r      zLeast chunk size must be 16B   r   r   size	alignmentstreamc                 S   s   t j| dt jdS )Nr   r   )r,   r/   int8)r>   r?   r@   r$   r$   r%   alloc_fn   s   z,gemm_descriptor_persistent.<locals>.alloc_fnc                    r   r   r   r   r    r$   r%   r&      r'   z,gemm_descriptor_persistent.<locals>.<lambda>   @      )r*   r   r   BLOCK_SIZE_KGROUP_SIZE_M
num_stages	num_warpsEPILOGUE_SUBTILE)r	   r   r   r+   r   r,   r-   r.   r/   r   r   r0   r1   r   intr   r   set_allocatorr   float32)r3   r4   r5   r(   r)   r6   r#   rJ   r7   r   r*   rB   r8   r$   r    r%   gemm_descriptor_persistent   sj   








"
rN   )Nr
   r   NN)Nr
   r   N)Nr
   r   NNF)typingr   r,   r   kernels.sm_constraint_gemmr   r   r   utilsr   r   r	   r9   r;   rN   r$   r$   r$   r%   <module>   s    

TN