o
    Ii"                     @   sj  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 edddgZd	d
ddZd	d
dZd	d
dZddgZg dZddgZddgZddgZddgZdZdZdZdZeG dd dZde	e fddZde	e fddZde	e fddZded eddfd!d"Zd#e
e ddfd$d%Ze d&kre j!d'd(d)Z"e"j#d*d+d,dd-d. e"$ Z%ee%j& dS dS )/    N)
namedtuple)	dataclass)Path)ListOptionalKerneltemplatefilenamezcutlass::half_tzcutlass::bfloat16_tzcutlass::float_e4m3_t)fp16bf16e4m3)r
   r   P   Z   )@   `            FTz#include "flash_fwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template void run_mha_fwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {SPLIT}, {PAGEDKV}, {SOFTCAP}, {PACKGQA}>(Flash_fwd_params &params, cudaStream_t stream);
#endif
a  #include "flash_fwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_SM8x
#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template void run_mha_fwd_<80, {DTYPE}, {HEAD_DIM}, {SPLIT}, {PAGEDKV}, {SOFTCAP}, {PACKGQA}>(Flash_fwd_params &params, cudaStream_t stream);
template void run_mha_fwd_<86, {DTYPE}, {HEAD_DIM}, {SPLIT}, {PAGEDKV}, {SOFTCAP}, {PACKGQA}>(Flash_fwd_params &params, cudaStream_t stream);
#endif
#endif
a#  #include "flash_bwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template<>
void run_mha_bwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {SOFTCAP}>(Flash_bwd_params &params, cudaStream_t stream) {{
    run_mha_bwd_hdim{HEAD_DIM}<{ARCH}, {DTYPE}, {SOFTCAP}>(params, stream);
}}
#endif
a  #include "flash_bwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_SM8x
#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template<>
void run_mha_bwd_<80, {DTYPE}, {HEAD_DIM}, {SOFTCAP}>(Flash_bwd_params &params, cudaStream_t stream) {{
    run_mha_bwd_hdim{HEAD_DIM}<80, {DTYPE}, {SOFTCAP}>(params, stream);
}}
template<>
void run_mha_bwd_<86, {DTYPE}, {HEAD_DIM}, {SOFTCAP}>(Flash_bwd_params &params, cudaStream_t stream) {{
    run_mha_bwd_hdim{HEAD_DIM}<86, {DTYPE}, {SOFTCAP}>(params, stream);
}}
#endif
#endif
c                   @   sr   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< ed	efd
dZed	efddZ	dS )r   smdtypehead_dimsplitpaged_kvsoftcappackgqa	directionreturnc              	   C   s  | j dkr[| jdkr9| jp| jp| j}tjt| jt| j	 | j
t| j t| j t| j t| dS tjt| j	 | j
t| j t| j t| j td dS | j dkr| jdkrztjt| jt| j	 | j
t| j dS tjt| j	 | j
t| j dS d S )	Nfwdr   )ARCHDTYPEHEAD_DIMSPLITPAGEDKVSOFTCAPPACKGQAT)r   r    r!   r"   r#   r$   bwd)r   r   r    r#   )r   r    r#   )r   r   r   r   r   KERNEL_IMPL_TEMPLATE_FWD_SM90formatstr	DTYPE_MAPr   r   lowerr   KERNEL_IMPL_TEMPLATE_FWD_SM8xKERNEL_IMPL_TEMPLATE_BWD_SM90KERNEL_IMPL_TEMPLATE_BWD_SM8x)selfr    r/   K/home/ubuntu/.local/lib/python3.10/site-packages/hopper/generate_kernels.pyr   ^   s0   



zKernel.templatec                 C   s^   d| j  d| j d| j | jrdnd | jrdnd | jrdnd | jr&dnd d	| j d
S )Nflash__hdim__paged _split_softcap_packgqa_sm.cu)r   r   r   r   r   r   r   r   )r.   r/   r/   r0   r	   |   s   ^zKernel.filenameN)
__name__
__module____qualname__int__annotations__r(   boolpropertyr   r	   r/   r/   r/   r0   r   S   s   
 r   c                  c   s    t t tttttt	D ]-\} }}}}}}|r&|dk s%|dkr&|s%|r&q|dks.| t
v r;t|| |||||ddV  qt t ttt	D ]\} }}}t|| |dd|dddV  qFd S )Nr   r   )r   r   r   r   r   r   r   r   Fr%   )	itertoolsproductr)   keysHEAD_DIMENSIONSr!   r"   r#   r$   SMDTYPE_MAP_FWD_SM8xr   DTYPE_MAP_BWD)r   r   r   r   r   r   r   r/   r/   r0   get_all_kernels   s   , rI   c              
   #   s    t t tttttD ]R\ dk rq fdd| D }t	|dkr_d  r5dnd r;dnd rAd	nd rGd
nd d d	}d
dd |D }t||V  qd S )Nr   c                    sV   g | ]'}|j d kr|j kr|jkr|jkr|jkr|jkr|jkr|qS r   )r   r   r   r   r   r   r   .0kr   r   r   r   r   r   r/   r0   
<listcomp>      V zbatch_hdim.<locals>.<listcomp>r   flash_fwd_hdimall_r4   r5   r6   r7   r8   r9   r:   
c                 S      g | ]	}d |j  dqS z
#include ""r	   rK   r/   r/   r0   rO          )rB   rC   r)   rD   r!   r"   r#   r$   rF   lenjoinKERNEL_BATCHkernels_allkernelsr	   r   r/   rN   r0   
batch_hdim   s   (Br^   c                 #   sB   t t tttttD ]P\ dkrq fdd| D }t	|dkr]d d  r8dnd r>d	nd d
rEdnd d d}d
dd |D }t||V  qt t ttD ]7\ dk rqqg fdd| D }t	|dkrd d  d d}d
dd |D }t||V  qgd S )Nr   c                    sV   g | ]'}|j d kr|j kr|jkr|jkr|jkr|jkr|jkr|qS rJ   )r   r   r   r   r   r   r   rK   r   r   r   r   r   r   r/   r0   rO      rP   z!batch_softcap.<locals>.<listcomp>r   flash_fwd_hdimr3   r4   r5   r6   _softcapallr8   r9   r:   rR   c                 S   rS   rT   rV   rK   r/   r/   r0   rO      rW   c                    s8   g | ]}|j d kr|j kr|jkr|jkr|qS )r%   )r   r   r   r   rK   )r   r   r   r/   r0   rO      s   8 flash_bwd_hdim_softcapall_smc                 S   rS   rT   rV   rK   r/   r/   r0   rO      rW   )rB   rC   r)   rD   rE   r!   r"   r$   rF   rX   rY   rZ   r[   r/   r_   r0   batch_softcap   s(   (>rd   kernelautogen_dirc                 C   s   d}|| j  || j  d S )Na  // Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
// Splitting the different template instantiations to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"

)r	   
write_textr   )re   rf   preluder/   r/   r0   write_kernel   s   ri   
output_dirc                 C   sz   | d urt | nt tj} | jddd tt }|D ]}t||  qt|D ]}t||  q't|D ]}t||  q3d S )NT)parentsexist_ok)	r   __file__parentmkdirlistrI   ri   r^   rd   )rj   r\   re   r/   r/   r0   main   s   
rq   __main__generate_kernelsz<Generate the flash_attention kernels template instantiations)progdescriptionz-oz--output_dirinstantiationszEWhere to generate the kernels  will default to the current directory )defaultrequiredhelp)'argparserB   collectionsr   dataclassesr   pathlibr   typingr   r   rZ   r)   rG   rH   rF   rE   r"   r!   r#   r$   r&   r+   r,   r-   r   rI   r^   rd   ri   r(   rq   r;   ArgumentParserparseradd_argument
parse_argsargsrj   r/   r/   r/   r0   <module>   sb   

-