o
    i(                     @   sj  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 edddgZd	d
ddZd	d
dZd	d
dZddgZg dZddgZddgZddgZddgZdZdZdZdZeG dd dZde	e fddZde	e fddZde	e fddZded eddfd!d"Zd#e
e ddfd$d%Ze d&kre j!d'd(d)Z"e"j#d*d+d,dd-d. e"$ Z%ee%j& dS dS )/    N)
namedtuple)	dataclass)Path)ListOptionalKerneltemplatefilenamezcutlass::half_tzcutlass::bfloat16_tzcutlass::float_e4m3_t)fp16bf16e4m3)r
   r   P   Z   )@   `            FTz#include "flash_fwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template void run_mha_fwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {HEAD_DIM_V}, {SPLIT}, {PAGEDKV}, {SOFTCAP}, {PACKGQA}>(Flash_fwd_params &params, cudaStream_t stream);
#endif
a  #include "flash_fwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_SM8x
#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template void run_mha_fwd_<80, {DTYPE}, {HEAD_DIM}, {HEAD_DIM_V}, {SPLIT}, {PAGEDKV}, {SOFTCAP}, {PACKGQA}>(Flash_fwd_params &params, cudaStream_t stream);
template void run_mha_fwd_<86, {DTYPE}, {HEAD_DIM}, {HEAD_DIM_V}, {SPLIT}, {PAGEDKV}, {SOFTCAP}, {PACKGQA}>(Flash_fwd_params &params, cudaStream_t stream);
#endif
#endif
a#  #include "flash_bwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template<>
void run_mha_bwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {SOFTCAP}>(Flash_bwd_params &params, cudaStream_t stream) {{
    run_mha_bwd_hdim{HEAD_DIM}<{ARCH}, {DTYPE}, {SOFTCAP}>(params, stream);
}}
#endif
a  #include "flash_bwd_launch_template.h"

#ifndef FLASHATTENTION_DISABLE_SM8x
#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
template<>
void run_mha_bwd_<80, {DTYPE}, {HEAD_DIM}, {SOFTCAP}>(Flash_bwd_params &params, cudaStream_t stream) {{
    run_mha_bwd_hdim{HEAD_DIM}<80, {DTYPE}, {SOFTCAP}>(params, stream);
}}
template<>
void run_mha_bwd_<86, {DTYPE}, {HEAD_DIM}, {SOFTCAP}>(Flash_bwd_params &params, cudaStream_t stream) {{
    run_mha_bwd_hdim{HEAD_DIM}<86, {DTYPE}, {SOFTCAP}>(params, stream);
}}
#endif
#endif
c                   @   sz   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< ed
efddZed
efddZ	dS )r   smdtypehead_dim
head_dim_vsplitpaged_kvsoftcappackgqa	directionreturnc              
   C   s"  | j dkr_| jdkr;| jp| jp| j}tjt| jt| j	 | j
| jt| j t| j t| j t| dS tjt| j	 | j
| jt| j t| j t| j td dS | j dkr| jdkr~tjt| jt| j	 | j
t| j dS tjt| j	 | j
t| j dS d S )	Nfwdr   )ARCHDTYPEHEAD_DIM
HEAD_DIM_VSPLITPAGEDKVSOFTCAPPACKGQAT)r    r!   r"   r#   r$   r%   r&   bwd)r   r    r!   r%   )r    r!   r%   )r   r   r   r   r   KERNEL_IMPL_TEMPLATE_FWD_SM90formatstr	DTYPE_MAPr   r   r   lowerr   KERNEL_IMPL_TEMPLATE_FWD_SM8xKERNEL_IMPL_TEMPLATE_BWD_SM90KERNEL_IMPL_TEMPLATE_BWD_SM8x)selfr    r1   M/home/ubuntu/vllm_env/lib/python3.10/site-packages/hopper/generate_kernels.pyr   _   s2   



zKernel.templatec                 C   sz   d| j  d| j | j| jkrd| j nd d| j | jrdnd | jr&dnd | jr-dnd | jr4dnd d	| j d
S )Nflash__hdim_ _paged_split_softcap_packgqa_sm.cu)	r   r   r   r   r   r   r   r   r   )r0   r1   r1   r2   r	   ~   s   zzKernel.filenameN)
__name__
__module____qualname__int__annotations__r*   boolpropertyr   r	   r1   r1   r1   r2   r   S   s   
 r   c                  c   s8   t t tttttt	D ]l\} }}}}}}|r&|dk s%|dkr&|s%|r&q|dks.| t
v r<t|| ||||||dd	V  |dkrR|dkrRt|| |d||||dd	V  |dkrz|dkrz| dv rzt|| |d||||dd	V  t|| |d	||||dd	V  qt t ttt	D ]\} }}}t|| ||d
d
|d
dd	V  qd S )Nr   r   )	r   r   r   r   r   r   r   r   r   r   r   r   )r   r
   r   i   Fr'   )	itertoolsproductr+   keysHEAD_DIMENSIONSr#   r$   r%   r&   SMDTYPE_MAP_FWD_SM8xr   DTYPE_MAP_BWD)r   r   r   r   r   r   r   r1   r1   r2   get_all_kernels   s   , rK   c              
   #   sN   t t tttttD ]\ dk rq fdd| D }t	|dkr_d  r5dnd r;dnd rAd	nd rGd
nd d d	}d
dd |D }t||V   fdd| D }t	|dkrd  rzdnd rdnd rd	nd rd
nd d d	}d
dd |D }t||V  qd S )Nr   c                    sb   g | ]-}|j d kr|j kr|jkr|jkr|jkr|jkr|jkr|j|jkr|qS r   	r   r   r   r   r   r   r   r   r   .0kr   r   r   r   r   r   r1   r2   
<listcomp>      b zbatch_hdim.<locals>.<listcomp>r   flash_fwd_hdimall_r7   r6   r8   r9   r:   r;   r<   
c                 S      g | ]	}d |j  dqS z
#include ""r	   rN   r1   r1   r2   rR          c                    sb   g | ]-}|j d kr|j kr|jkr|jkr|jkr|jkr|jkr|j|jkr|qS rL   rM   rN   rQ   r1   r2   rR      rS   flash_fwd_hdimdiff_c                 S   rV   rW   rY   rN   r1   r1   r2   rR      rZ   )rD   rE   r+   rF   r#   r$   r%   r&   rH   lenjoinKERNEL_BATCHkernels_allkernelsr	   r   r1   rQ   r2   
batch_hdim   s    (BBrb   c                 #   sB   t t tttttD ]P\ dkrq fdd| D }t	|dkr]d d  r8dnd r>d	nd d
rEdnd d d}d
dd |D }t||V  qt t ttD ]7\ dk rqqg fdd| D }t	|dkrd d  d d}d
dd |D }t||V  qgd S )Nr   c                    sV   g | ]'}|j d kr|j kr|jkr|jkr|jkr|jkr|jkr|qS rL   )r   r   r   r   r   r   r   rN   r   r   r   r   r   r   r1   r2   rR      s   V z!batch_softcap.<locals>.<listcomp>r   flash_fwd_hdimr5   r7   r6   r8   _softcapallr:   r;   r<   rU   c                 S   rV   rW   rY   rN   r1   r1   r2   rR      rZ   c                    s8   g | ]}|j d kr|j kr|jkr|jkr|qS )r'   )r   r   r   r   rN   )r   r   r   r1   r2   rR      s   8 flash_bwd_hdim_softcapall_smc                 S   rV   rW   rY   rN   r1   r1   r2   rR      rZ   )rD   rE   r+   rF   rG   r#   r$   r&   rH   r\   r]   r^   r_   r1   rc   r2   batch_softcap   s(   (>rh   kernelautogen_dirc                 C   s   d}|| j  || j  d S )Na  // Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
// Splitting the different template instantiations to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"

)r	   
write_textr   )ri   rj   preluder1   r1   r2   write_kernel   s   rm   
output_dirc                 C   sz   | d urt | nt tj} | jddd tt }|D ]}t||  qt|D ]}t||  q't|D ]}t||  q3d S )NT)parentsexist_ok)	r   __file__parentmkdirlistrK   rm   rb   rh   )rn   r`   ri   r1   r1   r2   main   s   
ru   __main__generate_kernelsz<Generate the flash_attention kernels template instantiations)progdescriptionz-oz--output_dirinstantiationszEWhere to generate the kernels  will default to the current directory )defaultrequiredhelp)'argparserD   collectionsr   dataclassesr   pathlibr   typingr   r   r^   r+   rI   rJ   rH   rG   r$   r#   r%   r&   r(   r-   r.   r/   r   rK   rb   rh   rm   r*   ru   r=   ArgumentParserparseradd_argument
parse_argsargsrn   r1   r1   r1   r2   <module>   sb   

/