o
    پiX                     @   sF  d Z ddlZddlmZ ddlZddlZddlmZmZ ddl	m
Z ddlmZmZmZmZmZmZ dd	lmZmZ dd
lmZmZmZ defddZdefddZdefddZdefddZdefddZdefddZdefddZ defddZ!defddZ"ej#dfdej$d e%defd!d"Z&defd#d$Z'defd%d&Z(dS )'a3  
Copyright (c) 2024 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)product   )ArtifactPathCheckSumHash   )env)JitSpecgen_jit_specsm90a_nvcc_flagssm100a_nvcc_flagssm100f_nvcc_flagscurrent_compilation_context)	get_cubinget_meta_hash)dtype_cutlass_mapfilename_safe_dtype_mapwrite_if_differentreturnc                   C   s*   t dtjd tjd tjd gddgdS )Ngemmz
bmm_fp8.cuzgroup_gemm.cuzflashinfer_gemm_binding.cuz-lcublasz
-lcublasLt)extra_ldflags)r	   jit_envFLASHINFER_CSRC_DIR r   r   L/home/ubuntu/.local/lib/python3.10/site-packages/flashinfer/jit/gemm/core.pygen_gemm_module%   s   r   c                  C   s   t jd } tj| dd t jd g}tt jd H}t| }ddg}g d}|D ].\}}}|D ]&}	| d	|	 d
| d
| d
| d	 }
|	|
 |j
|	|||d}t|
| q2q+W d    n1 sdw   Y  tjg dd}td||ddg dgdS )Ngen_gemm_sm100_cutlass_fp4Texist_okzfp4_gemm_cutlass.cufp4_gemm_cutlass.jinja__nv_bfloat16half   @   r#   r#      r#   )r#   r#   r&   )r#   r&   r&   fp4_gemm_cutlass__.cutypecta_mcta_ncta_k
         supported_major_versionsfp4_gemm_cutlass-DENABLE_BF16-DENABLE_FP4-DFAST_BUILDextra_cuda_cflagsextra_cflagsr   FLASHINFER_GEN_SRC_DIRosmakedirsr   openjinja2Templatereadappendrenderr   r   get_nvcc_flags_listr	   gen_directorysource_pathsfkernel_inst_templ
dtype_listcta_m_n_k_listr,   r-   r.   dtype	dest_pathsource
nvcc_flagsr   r   r   !gen_gemm_sm100_module_cutlass_fp41   sN   

rR   c                  C   s  t jd } tj| dd t jd g}tt jd H}t| }ddg}g d}|D ].\}}}|D ]&}	| d	|	 d
| d
| d
| d	 }
|	|
 |j
|	|||d}t|
| q2q+W d    n1 sdw   Y  tt jd H}t| }ddg}g d}|D ].\}}}|D ]&}	| d	|	 d
| d
| d
| d	 }
|	|
 |j
|	|||d}t|
| qqW d    n1 sw   Y  tjg dd}td||ddg dgdS )Ngen_gemm_sm103_cutlass_fp4Tr   zfp4_gemm_cutlass_sm103.cuzfp4_gemm_cutlass_sm103.jinjar   r    ))r#   r#      )r#      rT   )r#   r&   rT   r'   r(   r)   r*   r   r!   r/   r3   fp4_gemm_cutlass_sm103r6   r7   r8   r9   r<   rG   r   r   r   !gen_gemm_sm103_module_cutlass_fp4a   sx   


rW   c                  C   s   t jd } tj| dd t jd g}tt jd G}t| }ddg}dg}|D ].\}}}|D ]&}	| d	|	 d
| d
| d
| d	 }
|	|
 |j
|	|||d}t|
| q1q*W d    n1 scw   Y  tjdgd}td||ddg dgdS )Ngen_gemm_sm120_cutlass_fp4Tr   zfp4_gemm_cutlass_sm120.cuzfp4_gemm_cutlass_sm120.jinjar   r    r#   r#   r#   r'   r(   r)   r*   r2   r3   fp4_gemm_cutlass_sm120r6   r7   r8   r9   r<   rG   r   r   r   !gen_gemm_sm120_module_cutlass_fp4   sP   

r[   c                  C      t jd } tj| dd t jd g}tt jd H}t| }ddg}g d}|D ].\}}}|D ]&}	| d	|	 d
| d
| d
| d	 }
|	|
 |j
|	|||d}t|
| q2q+W d    n1 sdw   Y  tjg dd}td||dg dgdS )Ngen_gemm_sm100_cutlass_fp8Tr   zfp8_gemm_cutlass.cuzfp8_gemm_cutlass.jinjar   r    )r$   r$   r#   r$   r#   r#   r$   r&   r#   r"   rY   r%   fp8_gemm_cutlass_r(   r)   r*   r/   r3   fp8_gemm_cutlassr6   r8   r9   r<   rG   r   r   r   !gen_gemm_sm100_module_cutlass_fp8   sL   

rc   c                  C   r\   )Ngen_gemm_sm100_cutlass_bf16Tr   zbf16_gemm_cutlass.cuzbf16_gemm_cutlass.jinjar   r    )r^   r_   r`   r"   rY   bf16_gemm_cutlass_r(   r)   r*   r/   r3   bf16_gemm_cutlassr6   r8   r9   r<   rG   r   r   r   "gen_gemm_sm100_module_cutlass_bf16  sF   

rg   c                  C   s  t jd } tj| dd g }dD ]w}tt j| d }t| }W d    n1 s.w   Y  t	j
t	jg}t	jt	jg}ddg}dd	g}t||||D ]8\}	}
}}t|	 }t|
 }| | d
| d
| d| d| d
 }|| |jt|	 t|
 ||d}t|| qNqd}tt j| d }t| }W d    n1 sw   Y  t	j
t	jg}t	jt	jg}dd	g}ddg}t||||D ]9\}}}}t| }t| }| | d
| d
| d| d| d
 }|| |jt| dt| ||d}t|| qdD ]0}t j| }| | }|| t|d}| }W d    n	1 s'w   Y  t|| qtjg dd}td||dS )Ngen_gemm_sm100Tr   )gemm_groupwisegroup_gemm_fp8_groupwisez_sm100_kernel_inst.jinjatruefalse   r   r(   _major_mmaz	_sm100.cu)dtype_in	dtype_outscale_major_kmma_smgroup_gemm_mxfp4_groupwise_swapzcutlass::float_e2m1_t)dtype_adtype_bdtype_drs   swap_ab)zgemm_groupwise_sm100.cuz!group_gemm_fp8_groupwise_sm100.cuz#group_gemm_mxfp4_groupwise_sm100.cuzgemm_sm100_binding.cuzgroup_gemm_sm100_binding.curr/   r3   
gemm_sm100r:   )r   r=   r>   r?   r@   r   rA   rB   rC   torchfloat8_e4m3fnfloat8_e5m2float16bfloat16r   r   rD   rE   r   r   r   rF   r	   )rH   rI   prefixrJ   rK   dtype_in_listdtype_out_listscale_major_k_listmma_sm_listrp   rq   rr   rs   name_dtype_inname_dtype_outrO   rP   dtype_a_listdtype_d_listswap_ab_listrv   rx   ry   name_dtype_aname_dtype_dfilenamesrc_pathrQ   r   r   r   gen_gemm_sm100_module2  s   
 
 



r   c               
   C   s.  t jd } | jddd g }d}tjtjg}tjtjg}ddg}tt j	| d }t
| }W d    n1 s<w   Y  t|||D ]3\}}	}
t| }t|	 }| | d| d| d	|
 d
 }|| |jt| t|	 |
d}t|| qGd}tt j	| d }t
| }W d    n1 sw   Y  t|||D ]3\}}	}
t| }t|	 }| | d| d| d	|
 d
 }|| |jt| t|	 |
d}t|| qdD ].}t j	| }| | }|| t|d}| }W d    n1 sw   Y  t|| qtjdgd}td||dS )Ngen_gemm_sm120T)parentsr   ri   rk   rl   z_sm120_kernel_inst.jinjar(   rn   z	_sm120.cu)rp   rq   rr   rj   )zgemm_groupwise_sm120.cuz!group_gemm_fp8_groupwise_sm120.cuzgemm_sm120_binding.cuzgroup_gemm_sm120_binding.curz   r2   r3   
gemm_sm120r|   )r   r=   mkdirr}   r~   r   r   r   r@   r   rA   rB   rC   r   r   rD   rE   r   r   r   rF   r	   )rH   rI   r   r   r   r   rJ   rK   rp   rq   rr   r   r   rO   rP   r   r   rQ   r   r   r   gen_gemm_sm120_module  s   





r   c               	   C   s   t j d} d}t j d}t|tj}|sJ d| t|}t|  d| d|}|s5J | dtdtjd	 gd
dddt j dgt tj	|  gdS )N/includeflashinferMetaInfo/checksums.txt!Failed to get checksums.txt from /.h.h not foundtrtllm_gemmztrtllm_gemm_runner.cu-DTLLM_GEN_EXPORT_INTERFACE-DTLLM_GEN_EXPORT_FLASHINFER-DTLLM_ENABLE_CUDA-DTLLM_GEN_GEMM_CUBIN_PATH=\"\"r:   extra_include_paths
r   TRTLLM_GEN_GEMMr   r   r   r	   r   r   r   FLASHINFER_CUBIN_DIRinclude_pathheader_namechecksum_pathchecksum	meta_hashmetainfor   r   r   gen_trtllm_gen_gemm_module  s0   
r   FrN   use_sm_100fc                 C   s*  | t jt jfvrtd|  d| t jkrdnd}d| }tjd|  }tj|dd tjd	 g}t	tjd
 }t
| }W d   n1 sLw   Y  g d}|D ])\}	}
}|d| d|	 d|
 d| d	 }|| |j|	|
||d}t|| qWt|||rddgt nttjtjgdS )a  
    Generate TGV GEMM module for SM100 architecture.

    Args:
        dtype: Data type for the GEMM operation (torch.bfloat16 or torch.float16)
        use_sm_100f: Whether to compile with SM100f flags (default: False), which makes the compiled kernel
            compatible with both B200 and B300 GPUs. However, it's only available with CUDA 12.9+.

    Returns:
        JitSpec for the TGV GEMM module
    zUnsupported dtype z*. Only bfloat16 and float16 are supported.bf16fp16	tgv_gemm_gen_tgv_gemm_Tr   ztgv_gemm.cuztgv_gemm.jinjaN))r$         )r$   r   r   )r$   r   r0   )r$   r   r2   )r$      r   )r$   r   r   )r$   r   r0   )r$       r   )r$   r   r   )r$   r$   r   )r#   r   r   r(   xr)   )r,   r-   	dma_stagerN   z--expt-relaxed-constexprz -DCUTLASS_ENABLE_GDC_FOR_SM100=1r   )r}   r   r   
ValueErrorr   r=   r>   r?   r   r@   rA   rB   rC   rD   rE   r   r	   r   r   FLASHINFER_INCLUDE_DIR)rN   r   	dtype_strmodule_namerH   rI   rJ   rK   cta_m_n_dma_listr,   r-   r   rO   rP   r   r   r   gen_tgv_gemm_sm10x_module  sJ   

 
r   c               	   C   sb  t jd } tj| dd g }tt jd }t| }W d    n1 s'w   Y  t	j
t	j
ft	jt	jft	jt	j
ft	jt	j
ft	jt	jft	jt	jffD ],\}}t| }t| }| d| d| d }|| |jt| t| d}	t||	 qLd	D ].}
t j|
 }| |
 }|| t|d
}| }	W d    n1 sw   Y  t||	 q{td|tdS )Ngen_gemm_sm90Tr   z!group_gemm_sm90_kernel_inst.jinjagroup_gemm_r(   z_sm90.cu)rp   rq   )zgroup_gemm_sm90.cuzflashinfer_gemm_sm90_binding.curz   	gemm_sm90r|   )r   r=   r>   r?   r@   r   rA   rB   rC   r}   r   r   r~   r   r   rD   rE   r   r   r	   r
   )rH   rI   rJ   rK   rp   rq   r   r   rO   rP   r   r   r   r   r   gen_gemm_sm90_moduleF  sH   










r   c               	   C   s   t j d} d}t j d}t|tj}|sJ d| t|}t|  d| d|}|s5J | dtdtjd	 gd
dddt j dgt tj	|  gdgdS )Nr   r   r   r   r   r   r   trtllm_low_latency_gemmz!trtllm_low_latency_gemm_runner.cur   r   r   r   r   z-lcuda)r:   r   r   r   r   r   r   r   "gen_trtllm_low_latency_gemm_modulep  s2   
r   ))__doc__r>   	itertoolsr   rA   r}   	artifactsr   r    r   r   corer   r	   r
   r   r   r   cubin_loaderr   r   utilsr   r   r   r   rR   rW   r[   rc   rg   r   r   r   r   rN   boolr   r   r   r   r   r   r   <module>   s:     0C.2.OU&
K*