o
    'i>                      @   s  U d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
 zddlm  mZ W n ey? Z zededZ[ww ddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ G dd dejZ G dd dejZ!G dd dejZ"dej#fddZ$dej#fddZ%dej#fddZ&dej'fddZ(dd Z)de*de*de*fd d!Z+dej#dej#fd"d#Z,	$	dd%ej#d&e*d'e*d(e
e*e*f d)e	e* d*e-d+e	ej' dej#fd,d-Z.		$dd%ej#d&e*d'e*d.e
e*e*e*f d)e	e* d/e-fd0d1Z/ej0dd2d3d4 Z1d5e2de2fd6d7Z3ej0dd2de-fd8d9Z4ej0dd2d:ej'd;ej'de
e*e*e*f fd<d=Z5G d>d? d?Z6G d@dA dAZ7	$ddBe*dCe*dDe*dEe*dFe-de-fdGdHZ8dIe*dJe*de*fdKdLZ9dMe*dNe*dOej'fdPdQZ:dMe*dNe*dOej;fdRdSZ<dMe*dNe*dTe*dUe!dVe!dWe"dOej'dXej'dYe*dZe6de7fd[d\Z=ej0dd2d]e d^e*d_e*d'e*d)e*dUe!dVe!dWe"dOej'dXej'dEe*de
e*e*e*e*e*e6e7f fd`daZ>ej?ej@jAejBej@jCejDej@jEejFej@jGejHej@jAejIej@jCejJej@jKejLej@jMejNej@jOejPej@jQejRej@jSejTej@jAejUej@jAejVej@jAejWej@jAiZXeee2f eYdb< ejZj[ejZj[ejZj\ejZj]ejZj^dcZ_dej#dde
ej`def dfe
ej`def dge
ejadef dhejZdejbfdidjZcdej#dke*dle*dme*dne*doe*dpe*dejbfdqdrZddse!dej#dte*due*dMe*dTe*dve*d)e*dpe*dejbfdwdxZedse!dej#dye*due*dNe*dTe*dve*d)e*dpe*dejbfdzd{Zfdse"dej#dte*dye*dMe*dNe*dve*d)e*dpe*dejbfd|d}Zgdse!dej#d~e*due*de*dTe*d)e*dpe*dejbfddZhddd$dejRdej;diZii ZjG dd dZkdd Zlde2de2dekfddZmd^e*d_e*d'e*de*d)e*dUe!dVe!dWe"de2dej'fddZndej#dej#dej#dej#dej#dej#dUe!dVe!de2fddZodej#dej#dej#dej#dej#dej#dUe!dVe!de2ddfddZpd^e*d_e*d'e*de*de*d)e*dUe!dVe!dWe"de2dej'fddZqdej#dej#dej#dej#dej#dej#de*dUe!dVe!de2fddZrdej#dej#dej#dej#dej#dej#de*dUe!dVe!de2ddfddZseddg		dde
ej#ej#f de
ej#ej#f dej#dej#d.e	e
e*e*e*f  de2de-fddZtei etd		dde
ej#ej#f de
ej#ej#f dej#dej#d.e	e
e*e*e*f  de2ddfddZueddg		dde
ej#ej#f de
ej#ej#f dej#dej#de*d.e	e
e*e*e*f  de2de-fddZvei evd		dde
ej#ej#f de
ej#ej#f dej#dej#de*d.e	e
e*e*e*f  de2ddfddZwG dd dZxex ZydS )a*  
MIT License

Copyright (c) 2025 DeepSeek

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
    N)AnyDictOptionalTuplez^Could not import the 'cuda' module. Please install cuda-python that matches your CUDA version.   )ArtifactPath)checkCudaErrors)	get_cubin)FLASHINFER_CUBIN_DIR)ceil_divround_upsupported_compute_capabilitybackend_requirementc                   @   s&   e Zd ZdZdZdZdefddZdS )GemmTyper   r      returnc                 C   s   dddd| j  S )NzGemmType::NormalzGemmType::GroupedContiguouszGemmType::GroupedMasked)r   r   r   valueself r   Q/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/flashinfer/deep_gemm.py__str__=   s   zGemmType.__str__N)__name__
__module____qualname__NormalGroupedContiguousGroupedMaskedstrr   r   r   r   r   r   8   s
    r   c                   @   s2   e Zd ZdZdZdd Zdd Zdefdd	Zd
S )MajorTypeABr   r   c                 C      | j dkrdS dS )Nr   r   r   r   r   r   r   shape_directionI      zMajorTypeAB.shape_directionc                 C   r!   Nr   r"   r   r   r   r   r   non_contiguous_dimL   r$   zMajorTypeAB.non_contiguous_dimr   c                 C   s   ddd| j  S )Nzcute::UMMA::Major::Kzcute::UMMA::Major::MN)r   r   r   r   r   r   r   r   O   s   zMajorTypeAB.__str__N)	r   r   r   KMajorMNMajorr#   r'   r   r   r   r   r   r   r    E   s    r    c                   @   s   e Zd ZdZdZdd ZdS )MajorTypeCDr   r   c                 C   r!   r%   r   r   r   r   r   r'   W   r$   zMajorTypeCD.non_contiguous_dimN)r   r   r   NMajorMMajorr'   r   r   r   r   r*   S   s    r*   tc                 C   sj   |   dv sJ |   dkr!| d| d| d ks!J d| ddks1| ddks3J d S d S )Nr      r/   r   r&   r"   z-Grouped dimension cannot have abnormal strider   )dimstridesizer-   r   r   r   major_check[   s    (r4   c                 C   "   t |  | ddkrtjS tjS Nr"   r   )r4   r1   r    r(   r)   r3   r   r   r   get_major_type_abd      r7   c                 C   r5   r6   )r4   r1   r*   r+   r,   r3   r   r   r   get_major_type_cdi   r8   r9   dtypec                 C   s   t jdt jdt jdi|  S )Nr   r      )torchfloat8_e4m3fnbfloat16floatr:   r   r   r   get_element_sizen   s   rA   c                   C   s   dS N   r   r   r   r   r   %get_m_alignment_for_contiguous_layoutv   s   rD   xelement_sizer   c                 C   s&   d}|| dks
J || }t | |S )N   r   )r   )rE   rF   tma_alignment_bytes	alignmentr   r   r   get_tma_aligned_sizez   s   
rJ   c                 C   sL  | j tjkr|  dv sJ | tjd? tj}| jd | jd }}d}|  dkr5| 	dd} }| jd }t
|d	}t|d	}tj|||f| jtjd
}||d d d |d |f< |djtjd|||d	 }ttj||d	 |f| jtjd
dd}	||	d d d d d d f< |	d d d |d d f }
|r|
dS |
S )Nr.      r&   r"   Fr   r   Tr;   )devicer:   r@   r   )r:   r<   r?   r0   viewinttouint8shape	unsqueezerJ   r   zerosrL   	transposeemptysqueeze)rE   ue8m0_tensormnk
remove_dimb
aligned_mn	aligned_kpadded
transposed	aligned_xr   r   r   'get_col_major_tma_aligned_packed_tensor   s(   


"ra   FsfrX   rY   gran
num_groupstma_stride_check
type_checkc                 C   s  |d ur| j |ksJ | j tjtjfv sJ |  t|d ud ks$J |d ur1| d|ks1J | dt||d ks?J | dt||d | j tjkrPdnd ksWJ |r|d urn| d| d| d ksnJ | ddkswJ | dt|| 	 ksJ | S )Nr   r&   r   r"   r   r;   )
r:   r<   r?   rN   r0   r2   r   r1   rJ   rF   )rb   rX   rY   rc   rd   re   rf   r   r   r   check_sf_layout   s   
0"rh   recipeis_sfac              	   C   s  ||rdnd |d f}| j tjko|dkot dv p)| j tjko)|dko)t dv }|s5t| ||||d | j tjkrF|dkrFt dkrFt| j tjkre|dkret dv ret| } t| ||d|d	tjd
S | j tjkrv|dkrvt dkrvt| j tjkr|dkrt dv r| dtj	|| j
dd } t| } t| ||d|d	tjd
S |rt| ||d|d	tjd
S td| j d|dt  )Nr   r   r   )r   rC   100a103a)rC   rC   )rX   rY   rc   rd   90aT)rX   rY   rc   rd   re   rf   r&   rL   rC   zUnknown cases: sf.dtype=z, gran=z, arch=)r:   r<   rN   get_device_archrh   r?   NotImplementedErrorra   index_selectarangerL   AssertionError)rb   rX   rY   ri   rd   rj   rc   should_skip_transformr   r   r   !transform_sf_into_required_layout   sr   




rv   )maxsizec                  C   s2   t j \} }| dkrdnd}| d |  | S )N	   a 
   )r<   cudaget_device_capability)majorminorsuffixr   r   r   rp     s   rp   sc                 C   s(   t  }|| d | dd S )Nutf-8r      )hashlibmd5updateencode	hexdigest)r   r   r   r   r   hash_to_hex  s   r   c                   C   s   ddddt   S )NTF)rn   rl   rm   )rp   r   r   r   r   must_be_k_major  s   r   	sfa_dtype	sfb_dtypec              
   C   sT   | t jt jfv s
J dt jfddt jfddt jfddt jfddt jfdit |f S )Nrn   )r   rC   rC   rl   )r   r   rC   rm   )r<   r?   rN   rp   )r   r   r   r   r   get_default_recipe(  s   




r   c                   @   s:   e Zd ZdedefddZdefddZdefd	d
ZdS )MulticastConfignum_multicastis_multicast_on_ac                 C   s   || _ || _d S N)r   r   )r   r   r   r   r   r   __init__7  s   
zMulticastConfig.__init__block_mc                 C   s$   t  dksJ || jr| j S d S Nrn   r   rp   r   r   )r   r   r   r   r   get_ab_load_block_m;     z#MulticastConfig.get_ab_load_block_mblock_nc                 C   s$   t  dksJ || jrd S | j S r   r   )r   r   r   r   r   get_ab_load_block_n@  r   z#MulticastConfig.get_ab_load_block_nN)r   r   r   rN   boolr   r   r   r   r   r   r   r   6  s    r   c                   @   s&   e Zd ZdedededefddZdS )SharedMemoryConfig	smem_sizeswizzle_a_modeswizzle_b_modeswizzle_cd_modec                 C   sZ   || _ || _|| _|| _d| _| jdksJ | jdksJ | jdks$J | jdks+J d S )Nr   rG   )r   r   r   r   swizzle_sf_mode)r   r   r   r   r   r   r   r   r   G  s   zSharedMemoryConfig.__init__N)r   r   r   rN   r   r   r   r   r   r   F  s    r   	shape_dim	block_dimr   num_smsrequire_divisiblec                 C   s(   t | || dkp| }|o|| dkS Nr   r   )r   r   r   r   r   	divisibler   r   r   is_multicast_legal\  s   r   
block_size	elem_sizec                 C   s.   dD ]}| | | dkr|  S qt d dS )N)rC   @       rG   r   zInvalid mode)rt   )r   r   
mode_bytesr   r   r   get_swizzle_modei  s   r   r   r   ab_dtypec                 C   s8   d}| | dks
J t jdt jt| |t||fi| S )NrC   r   )r   r   )r<   r>   r=   r   )r   r   r   num_utccp_aligned_elemsr   r   r   get_sf_aligned_block_sizess  s   r   c                 C   s,   t | ||\}}d| |d  |d  dkS )Nr   r      )r   )r   r   r   
sf_block_m
sf_block_nr   r   r   is_tmem_size_legal  s   r   block_kmajor_amajor_bmajor_dcd_dtype
num_stagesmulticast_configc
                 C   s:  |t jksJ t|}
t|}|	| }|	|}t|tjkr!|n||
}t|tjkr-|n||
}t|t jkr9|n| |}d}t| || d }|| |
 }|| |
 }t	| ||\}}|d }|d }|d d d d }d}d}||7 }||| 7 }||| 7 }||| 7 }||| 7 }||7 }||7 }t
||||S )NrC   r   r;      r/   r   r   )r*   r+   rA   r   r   r   r    r(   minr   r   )r   r   r   r   r   r   r   r   r   r   ab_elem_sizecd_elem_sizeload_block_mload_block_nr   r   r   layout_ad_msmem_dsmem_a_per_stagesmem_b_per_stager   r   smem_scales_a_per_stagesmem_scales_b_per_stagesmem_barriersmem_tmem_ptrr   r   r   r   get_smem_config  sD   

r   	gemm_typemnc           "         s  |t jksJ |	t jt jfv sJ d }| tjkrt f}n	|tjkr$dnd}|tjkr3t	t
dddnt	t
ddd}dt| }fdd fd	d} fd
d}d\}}|D ]k}|D ]f}d}||||||}}|d u s|d u s||k rd}n4||kr|||}|||}||k}||kr|||ko||k O }|||ko||k O }|||ko||kO }|t|||M }|r||fn||f\}}qdq`|d ur|d usJ tdd}dt|ddo| tjkd}||krdndD ]}dkr|| rtd|dk} nqd\}}}t	tfddd}|D ]} t||||||||	| |
}|j|kr4| } nq|d us=J |d usDJ |||}tt|t|  |}!t|!|j|j }!|!kshJ |!||||||fS )N)rC   )rC      rG   i  r   rC   c                    s   | dkr S | S r   r   )rE   )r   r   r   <lambda>  s    z"get_best_configs.<locals>.<lambda>c                    s&   | rt t  | t |  S d S r   r   bmbn)r   r   rd   r   r   r   r     s    c                    s     t | t |   S r   r   r   )fix_wave_saturater   r   rd   r   r   r   r     s    )NNFTr   r   )AB)r   r   r   r   )NNi  c                    s   | t  d dkS )NrC   r   )max)r   )rY   r   r   r   (  s    )r            r;   r/   r   r   )r<   r=   r>   r?   r   r   rD   r    r(   tuplerangerA   r   r   r   r   filterr   r   r   r   )"r   r   r   rY   rd   r   r   r   r   r   r   block_msblock_nsr   get_num_wavesget_last_wave_utilbest_block_mbest_block_nr   r   success	num_wavesbest_num_wavesutil	best_utilbest_multicast_configis_legalibest_num_stagesbest_smem_configsm100_capacitystage_candidatesr   num_min_smsr   )r   rY   r   r   rd   r   r   get_best_configs  s   








r   tmap_type_map)r   rG   r   r   rC   	gmem_dims.gmem_strides	smem_dimsswizzle_typec                 C   sx   t |}t ||d ksJ t ||ksJ t| j }tt|||  |||tdf| tjj	|tj
jtjj}|S Nr   )lenr   r:   r   cbdcuTensorMapEncodeTileddata_ptr
cuuint32_tCUtensorMapInterleaveCU_TENSOR_MAP_INTERLEAVE_NONECUtensorMapL2promotion"CU_TENSOR_MAP_L2_PROMOTION_L2_256BCUtensorMapFloatOOBfill!CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE)r-   r   r   r   r   num_dimstensor_dtype
tensor_mapr   r   r   make_tma_xd_descp  s(   
r  gmem_inner_dimgmem_outer_dimsmem_inner_dimsmem_outer_dimgmem_outer_strideswizzle_modec           
      C   sx   |dkr||    dksJ ||    }t|t|f}t||    f}t|t|f}	t| |||	t| S r   )rF   r   
cuuint64_tr   r  swizzle_type_map)
r-   r  r  r	  r
  r  r  r   r   r   r   r   r   make_tma_2d_desc  s   
r  
major_typeshape_mshape_kouter_stridec	                 C   sb   |dkr| t jksJ ||| fd d |   \}	}
||fd d |   \}}t||	|
||||S r   )r    r(   r#   r  )r  r-   r  r  r   r   r  rd   r  r  r  r	  r
  r   r   r   make_tma_a_desc  s   
r  shape_nc	                 C   sZ   ||fd d |    }	|	d |	d | }
}||fd d |    \}}t||
|||||S )Nr   r   )r#   r  )r  r-   r  r  r   r   r  rd   r  	io_shapesr  r  r	  r
  r   r   r   make_tma_b_desc  s   r  c	           
      C   s0   | t jksJ d}	t|||| |t||	||S rB   )r*   r+   r  r   )
r  r-   r  r  r   r   r  rd   r  r   r   r   r   make_tma_cd_desc  s   r  shape_mnblock_mnc                 C   sJ   | t jksJ |dksJ t|| }t||t||d | |d||S )Nr   r;   r   )r    r)   rJ   rF   r  r   )r  r-   r  r  r  r   rd   r  r   r   r   make_tma_sf_desc  s   
r  Ttruefalsezcutlass::bfloat16_tr?   c                   @   s   e Zd ZdededdfddZdejfddZdd	d
Ze	de
eef defddZe	dejde
eef dejfddZdS )SM100FP8GemmRuntimepathsymbolr   Nc                 C   s$   || _ d | _d | _|| _tj| _d S r   )r  libkernelr   r   cuLibraryUnload_cleanup_func)r   r  r   r   r   r   r      s
   zSM100FP8GemmRuntime.__init__c              
   K   s`   | j d u r)t| jdd}tt|g g dg g d| _tt| jt| jdd| _ | 	| j |S )Nr   )encodingr   )
r"  bytesr  r   r   cuLibraryLoadFromFiler!  cuLibraryGetKernelr   launch)r   kwargsr  r   r   r   __call__(  s   
zSM100FP8GemmRuntime.__call__c              
   C   sl   | j d ur2t| dd }t|r4z|| j  W d S  ty1 } ztd|  W Y d }~d S d }~ww d S d S )Nr$  z5Failed to delete SM100FP8GemmRuntime with exception: )r!  getattrcallable	Exceptionprint)r   cleanuper   r   r   __del__7  s   
zSM100FP8GemmRuntime.__del__r*  c                 C   s  | d t jt jfv sJ dg d| d  d| d  dd| d v r)| d	 nd
 dd| d v r8| d nd
 dd| d v rG| d nd
 d| d  d| d  d| d  d| d  d| d  d| d  d| d  d| d  d| d  d| d  d| d  d| d  dt| d   d| d  dt| d   dt| d   d}|S )N
CD_DTYPE_Trz   a+  
#ifdef __CUDACC_RTC__
#include <deep_gemm/nvrtc_std.cuh>
#else
#include <cuda.h>
#include <string>
#endif

#include <deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh>

using namespace deep_gemm;

static void __instantiate_kernel() {
    auto ptr = reinterpret_cast<void*>(&sm100_fp8_gemm_1d1d_impl<
        MAJOR_Az
,
        MAJOR_Br   COMPILED_DIMSMr   r   NrY   KBLOCK_MBLOCK_NBLOCK_K
NUM_GROUPSSWIZZLE_A_MODESWIZZLE_B_MODESWIZZLE_CD_MODE
NUM_STAGESNUM_LAST_STAGESNUM_NON_EPILOGUE_THREADSNUM_EPILOGUE_THREADSNUM_MULTICASTIS_MULTICAST_ON_A	GEMM_TYPEWITH_ACCUMULATIONz
      >);
};
)r<   r>   r?   joinpytypes_to_ctypes)r*  coder   r   r   generateA  sZ   

 
!
"&zSM100FP8GemmRuntime.generater"  c                 C   s8  t ttjj|d | t|d  t }|d |j_d|j_	d|j_
t }tjj|_||_t }d|_|g|_|d |_d|_d|_|d |d  |_d|_d|_|d |_|d |_|d	  |d
 |d |d |d |d |d |d |d |d f
}tjtjtjtjd d d d d d f
}t|| ||fdS )N	SMEM_SIZEDEVICE_INDEXrE  r   NUM_SMSrC  rD  STREAMGROUPED_LAYOUTr7  r8  r9  TENSOR_MAP_ATENSOR_MAP_BTENSOR_MAP_SFATENSOR_MAP_SFBTENSOR_MAP_CTENSOR_MAP_Dr   ) r   r   cuKernelSetAttributeCUfunction_attribute/CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTESCUdeviceCUlaunchAttributeValue
clusterDimrE   yzCUlaunchAttributeCUlaunchAttributeID%CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSIONidr   CUlaunchConfignumAttrsattrsgridDimXgridDimYgridDimZ	blockDimX	blockDimY	blockDimZsharedMemByteshStreamr   ctypesc_void_pc_uint32cuLaunchKernelEx)r"  r*  attr_valattrconfig
arg_values	arg_typesr   r   r   r)  m  sd   	




zSM100FP8GemmRuntime.launch)r   N)r   r   r   r   r   r   CUresultr+  r2  staticmethodr   r   rL  CUkernelr)  r   r   r   r   r    s    

+(r  c                  C   sl   t D ]1} | tv r	qt |  \}}| d } ttjd |  | ttj |  }| s*J tt||t| < qd S )N.cubin/)	
KERNEL_MAPRUNTIME_CACHEr	   r   DEEPGEMMr
   existsr  r   )
cubin_namer   sha256r  r   r   r   load_all  s   r  namerK  c                 C   s   |  d| }d|  dt | }|tvrtd| |tv r$t| S t| \}}|d }ttjd | | ttj | }| sEJ t	t
||t|< t| S )Nz$$zkernel..zcubin not registered: r{  r|  )r   r}  
ValueErrorr~  r	   r   r  r
   r  r  r   )r  rK  	signaturer  r   r  r  r   r   r   load  s   r  r]   compiled_dimsoutput_dtypec
                 C   s   t jjddj}
ttj| ||||||t j|	|
\}
}}}}}}i d|d| d|d|dtjdd	d
d	d|d|d|d|d|d|d|dt||| d|j	d|j
|j|j|jd|	d}|
||||||f|fS )Nr|   ro   r6  r7  r8  r9  rG  rC  rC   rD  r4  r5  r=  r:  r;  r<  rA  rB  r>  r?  Fr@  rE  rF  rH  r3  )r<   r|   get_device_propertiesmulti_processor_countr   r   r   r=   r   r   r   r   r   r   )r   r   rY   r]   rd   r   r   r   r  r  r   r   r   r   r   r   smem_configr*  r   r   r   2m_grouped_fp8_gemm_nt_contiguous_static_kwargs_gen  s   	
r  ry   sfar[   sfbd	m_indicesc	                 C   sD  | j \}	}
|j \}}}tj}t|
d}t|	||
|||||||j
\\}}}}}}}}t|| |	|
|||| |	 d|j
d	}t||||
|||||	 ||jd	}t|||	|||||	 d|jd	}ttj||	|
||d|jd}ttj|||
||||jd}i ||||j||||||tj j|jjd}||fS )NrC   r   )rd   r  rQ  rO  rM  rR  rS  rT  rU  rV  rW  rP  rN  )rQ   r*   r+   r   r  r:   r  r   r1   r'   r   r  r   r   r  r   r  r    r)   r   r   r<   r|   current_streamcuda_streamrL   index)ry   r  r[   r  r  r  r   r   r  r   rY   rd   r   _r   r]   r   r   r   r   r   r   r  static_kwargstensor_map_atensor_map_btensor_map_dtensor_map_sfatensor_map_sfb
all_kwargsr   r   r   +m_grouped_fp8_gemm_nt_contiguous_kwargs_gen  s   




r  c	              
   C   sB   t | ||||||||	\}	}
t|	}td|}|di |
 d S Nfp8_m_grouped_gemmr   )r  r  rL  r  )ry   r  r[   r  r  r  r   r   r  r  r  rK  runtimer   r   r   &m_grouped_fp8_gemm_nt_contiguous_sm10xv  s   

r  
expected_mc                 C   s   t jjddj}ttj|||||||t j|
|\}}}}}}}|dkr+| | dks+J i d|	d| d|d|d	tjd
dddd|d|d|d|d|d|d|dt||| d|j	d|j
|j|j|jd|
d}|||||||f|fS )Nr|   ro   r   r   r6  r7  r8  r9  rG  rC  rC   rD  r4  r5  r=  r:  r;  r<  rA  rB  r>  r?  Fr  )r<   r|   r  r  r   r   r   r=   r   r   r   r   r   r   )r   r   rY   r  r]   rd   r   r   r   r  r  r   r   r   r   r   r   r  r*  r   r   r   .m_grouped_fp8_gemm_nt_masked_static_kwargs_gen  s   	
r  masked_mc
                 C   s>  | j \}
}}|j \}}}tj}t|d}t||||||
||||	|j\\}}}}}}}}t|| |||||| |	 |
|j
	}t|||||||||	 |
|j	}t||||||||	 |
|j	}ttj||||||
|j}ttj||||||
|j}i ||||j||||||tj j|jjd}||fS )NrC   r  )rQ   r*   r+   r   r  r:   r  r   r1   r'   r   r  r   r   r  r   r  r    r)   r   r   r<   r|   r  r  rL   r  )ry   r  r[   r  r  r  r  r   r   r  rd   r   rY   r  r   r   r]   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r   r   r   'm_grouped_fp8_gemm_nt_masked_kwargs_gen  s   



r  c
                 C   sD   t | |||||||||	
\}
}t|
}td|}|di | d S r  )r  r  rL  r  )ry   r  r[   r  r  r  r  r   r   r  r  r  rK  r  r   r   r   "m_grouped_fp8_gemm_nt_masked_sm10xA  s   

r  d   g   nka_fp8b_fp8c                 C   s  t | d }t |d }|tjkrtd| t r'|tjkr'td| | s4td|  | \}}	|\}
}|j\}}|
j\}}}|j\}}| }||ks`||ks`||ks`||krytd| d| d| d| d	| d
| d| |jt	j
krtd|j |
jt	j
krtd|
j |jt	jkrtd|j |jt	jkrtd|j t|tjkrtdt| dS )Nr    major_a must be KMajor, but got  major_b must be KMajor, but got z&m_indices must be contiguous, but got zShape mismatch. m = , m_ = , k = , k_ = , n = , n_ = z, m__ = !a must be float8_e4m3fn, but got !b must be float8_e4m3fn, but got d must be bfloat16, but got z!m_indices must be int32, but got d must be N-major, but got T)r7   r    r(   r  r   is_contiguousrQ   numelr:   r<   r=   r>   int32r9   r*   r+   )r  r  r  r  ri   r  r   r   ry   r  r[   r  r   rY   rd   r   k_m_n_m__r   r   r   4_check_group_deepgemm_fp8_nt_contiguous_problem_sizeV  s>   



 ,r  )common_checkc                 C   s   |  }t| d }t|d }| \}}	|\}
}|j\}}|
j\}}}|dkr)d S |d u r4t|	j|jn|}t|	|||dd}	t|||||dd}tjt|||dtjt|||ddt	  }|||	|
||| d S )Nr   T)rX   rY   ri   rj   FrX   rY   ri   rd   rj   r   r   r  rk   )
lowerr7   rQ   r   r:   rv   	functoolspartialr  rp   )r  r  r  r  ri   r  r   r   ry   r  r[   r  r   rY   rd   r   r  implr   r   r    m_grouped_fp8_gemm_nt_contiguous  s<   
r  c                 C   s  t | d }t |d }|tjkrtd| |tjkr$td| | s1td|  | \}	}
|\}}|	j\}}}|j\}}}|j\}}}| }||ks[||ks[||krktd| d| d| d| ||ksw||ksw||krtd	| d
| d| d| d| d| |dks|dks|dks|dks|dkrtd| d| d| d| d| 
|	jtj	krtd|	j |jtj	krtd|j |jtj
krtd|j |jtjkrtd|j t|tjkrtdt| dS )Nr   r  r  z%masked_m must be contiguous, but got z"num_groups mismatch. num_groups = z, num_groups_ = z, num_groups__ = z, num_groups___ = zm, n, k mismatch. m = r  r  r  r  r  zMexpected_m, m, n, k, num_groups must be greater than 0, but got expected_m = z, m = z, num_groups = r  r  r  z masked_m must be int32, but got r  T)r7   r    r(   r  r  rQ   r  r:   r<   r=   r>   r  r9   r*   r+   )r  r  r  r  r  ri   r  r   r   ry   r  r[   r  rd   r   rY   num_groups_r   r  num_groups__r  r  num_groups___r   r   r   0_check_m_grouped_fp8_gemm_nt_masked_problem_size  sR   


&( r  c                 C   s   |  }t| d }t|d }||  krtjksJ  J | s%J | \}	}
|\}}|	j\}}}|j\}}}|d u rDt|
j|jn|}t|
||||dd}
t|||||dd}t	j
t|||dt	j
t|||ddt  }||	|
||||| d S )Nr   Tr  Fr  rk   )r  r7   r    r(   r  rQ   r   r:   rv   r  r  r  rp   )r  r  r  r  r  ri   r  r   r   ry   r  r[   r  rd   r   rY   r  r   r  r  r   r   r   m_grouped_fp8_gemm_nt_masked  s@   r  c                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )	KernelMap@f161e031826adb8c4f0d31ddbd2ed77e4909e4e43cdfc9728918162a62fcccfbc                 C   s
   d | _ d S r   )indicer   r   r   r   r   2  s   
zKernelMap.__init__c                 C   sr   t jd d }t|| jsJ dt| }| sJ t|d}t|| _	W d    d S 1 s2w   Y  d S )Nr|  zkernel_map.jsonzCcubin kernel map file not found, nor downloaded with matched sha256r)
r   r  r	   KERNEL_MAP_HASHr
   r  openjsonr  r  )r   indice_pathr  fr   r   r   init_indices5  s   "zKernelMap.init_indicesc                 c   s*    | j d u r
|   | j D ]}|V  qd S r   r  r  )r   r  r   r   r   __iter__?  s   

zKernelMap.__iter__c                 C   s   | j d u r	|   | j | S r   r  )r   keyr   r   r   __getitem__E  s   

zKernelMap.__getitem__N)r   r   r   r  r   r  r  r  r   r   r   r   r  .  s    
r  )FN)NF)F)Nr  )z__doc__ro  enumr  r   r  typingr   r   r   r   cuda.bindings.driverbindingsdriverr   ImportErrorr1  r<   	artifactsr   
cuda_utilsr   jit.cubin_loaderr	   jit.envr
   utilsr   r   r   r   Enumr   r    r*   Tensorr4   r7   r9   r:   rA   rD   rN   rJ   ra   r   rh   rv   	lru_cacherp   r   r   r   r   r   r   r   r   r   r?   r   r   r   int8CUtensorMapDataTypeCU_TENSOR_MAP_DATA_TYPE_UINT8int16CU_TENSOR_MAP_DATA_TYPE_UINT16r  CU_TENSOR_MAP_DATA_TYPE_INT32int64CU_TENSOR_MAP_DATA_TYPE_INT64rP   uint16uint32CU_TENSOR_MAP_DATA_TYPE_UINT32uint64CU_TENSOR_MAP_DATA_TYPE_UINT64float32CU_TENSOR_MAP_DATA_TYPE_FLOAT32float16CU_TENSOR_MAP_DATA_TYPE_FLOAT16r>    CU_TENSOR_MAP_DATA_TYPE_BFLOAT16r=   float8_e4m3fnuzfloat8_e5m2float8_e5m2fnuzr   __annotations__CUtensorMapSwizzleCU_TENSOR_MAP_SWIZZLE_NONECU_TENSOR_MAP_SWIZZLE_32BCU_TENSOR_MAP_SWIZZLE_64BCU_TENSOR_MAP_SWIZZLE_128Br  r  r   CUtensorMapr  r  r  r  r  r  rJ  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r}  r   r   r   r   <module>   s   	$

$

T




	


B	
 














	

	

	

	

	
 	

A	
p	

	

E	

r	


1	
/=
1
