o
    پi                  +   @   s  d Z ddlZddlmZ ddlZddlZddlmZ ddl	m
Z
mZmZmZmZ ddlmZmZ dd	lmZmZmZmZmZ d
dlmZ d
dlmZ dejdejdejdedededededefddZdejdejdejdejdedededededefddZ dedejdejdejdejdedededefdd Z!dedejdejdejdejdededede
fd!d"Z"dejdejdejdejdededed#edefd$d%Z#dejdejdejdejd&ed'ededed(ede
fd)d*Z$dedejdejdejdededededed+edefd,d-Z%dejdejdejd&ed.ed/ed0ed+edejd1ed2ed3edefd4d5Z&dedejdejdejdejdededededed+edefd6d7Z'dedejdejdejdejdededededefd8d9Z(dejdejdejdejdedededededefd:d;Z)dejdejdejdededededede
fd<d=Z*dedejdejdejdededededed+ede
fd>d?Z+dejdejdejd&ed.ed/ed0ed+edejd1ed2ed3ede
fd@dAZ,dejdejdejd&ed.ed/ed0ed+edejd1ed2ed3ede
fdBdCZ-		D	D		D	D	DdrdEedejdejdejdejd&edFee dGee dHee dIee dJedKedLed.ed/ed0ed1ed2ed3ed+ede
f*dMdNZ.		D	D		D	D	DdrdEedejdejdejdejd&edFee dGee dHee dIee dJedKedLed.ed/ed0ed1ed2ed3ed+ede
f*dOdPZ/dejdejdejdejdededededede
fdQdRZ0dedejdejdejdejdededededed+ede
fdSdTZ1dedejdejdejdejdedededede
fdUdVZ2dejdejdejdejdededededefdWdXZ3		D	DdsdEedejdejdejdededFee dGee dHee dIee dYedLedededede
f dZd[Z4		D	D	D	DdtdedEedejdejdejdededFee dGee dHee dIee dYedLedededed+ed\ede
f&d]d^Z5		D	DdsdEedejdejdejd_ejdededFee dGee dHee dIee dYedLedededede
f"d`daZ6		D	D	D	DdtdedEedejdejdejd_ejdededFee dGee dHee dIee dYedLedededed+ed\ede
f(dbdcZ7dejdejdejdejdedededededefdddeZ8dejdejdejdejdededededede
fdfdgZ9dhdi Z:		D	DdsdEedejdejdejd_ejdededFee dGee dHee dIee dYedLedededef djdkZ;dldm Z<dndo Z=de
fdpdqZ>dS )ua3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)List   )env)JitSpecgen_jit_specloggersm90a_nvcc_flagscurrent_compilation_context   )	get_cubinget_meta_hash)	dtype_mapfilename_safe_dtype_mapmask_mode_literalpos_encoding_mode_literalwrite_if_different   )generate_additional_params)enumerate_kernelsdtype_qdtype_kvdtype_ohead_dim_qkhead_dim_vopos_encoding_modeuse_sliding_windowuse_logits_soft_capreturnc                 C   s@   dt |   dt |  dt |  d| d| d| d| d| S )	N$single_decode_with_kv_cache_dtype_q_
_dtype_kv_	_dtype_o__head_dim_qk__head_dim_vo__posenc_	_use_swa__use_logits_cap_r   )r   r   r   r   r   r   r   r    r'   T/home/ubuntu/.local/lib/python3.10/site-packages/flashinfer/jit/attention/modules.pyget_single_decode_uri+   s    r)   	dtype_idxc	           	      C   sJ   dt |   dt |  dt |  dt |  d| d| d| d| d	| S )
N#batch_decode_with_kv_cache_dtype_q_r   r    _dtype_idx_r!   r"   r#   r$   r%   r&   	r   r   r   r*   r   r   r   r   r   r'   r'   r(   get_batch_decode_uriA   s$   r.   backendhead_dim_ckvhead_dim_kpeuse_profilerc                 C   sP   dt |  dt |  dt |  dt |  d| d| d| | dkr%d	 S d
 S )Nbatch_mla_attention_dtype_q_r   r    r,   _head_dim_ckv__head_dim_kpe_
_profiler_fa3_sm90 r&   )r/   r   r   r   r*   r0   r1   r2   r'   r'   r(   get_batch_mla_uriY   s"   
r:   c              
   C   s@  | dkrt dt| |||||||}tj| }	tj|	dd | dkrttjd }
t	|

 }W d    n1 s<w   Y  |	d }t||jt| t| t| t| ||d g }d	D ].}tj| }|	| }|| t|d
}
|

 }W d    n1 sw   Y  t|| q_nw| dkrttjd }
t	|

 }W d    n1 sw   Y  |	d }t||jt| t| t| t| ||d g }dD ].}tj| }|	| }|| t|d
}
|

 }W d    n1 sw   Y  t|| qnt d|  g }| dkr|t7 }|r|dg7 }t|||dS )Nauto4backend should not be auto when jit_args is providedTexist_okfa2zbatch_mla_config.jinjazbatch_mla_config.inc)r   r   r   r*   r0   r1   )zbatch_mla_plan.cuzbatch_mla_run.cuzbatch_mla_binding.curr7   zbatch_mla_sm90_config.inc)zbatch_mla_sm90_plan.cuzbatch_mla_sm90_run.cuzbatch_mla_sm90_binding.cuzUnsupported backend: -DFLASHINFER_ENABLE_PROFILERextra_cuda_cflags)
ValueErrorr:   jit_envFLASHINFER_GEN_SRC_DIRosmakedirsopenFLASHINFER_CSRC_DIRjinja2Templatereadr   renderr   appendr   r   )r/   r   r   r   r*   r0   r1   r2   urigen_directoryfconfig_templgenerated_config_pathsource_pathsfilenamesrc_path	dest_pathsourcerC   r'   r'   r(   gen_batch_mla_modulen   s   










rZ   arcc                 C   sD   dt |   dt |  dt |  dt |  d| d| d| d| S )	N'batch_decode_mla_with_kv_cache_dtype_q_r   r    r,   _head_dim_ckvr$   r%   _arc_r&   )r   r   r   r*   r0   r   r   r[   r'   r'   r(   get_batch_decode_mla_uri   s    r_   head_dimnum_qo_headsuse_tensor_coresc	                 C   s  t jdj}	|	dkrd}
nd}
|r3|	dkr3||
 dkr3| t jkr3|t jkr3|t jkr3td d}ntd d	}t| |||||||}tj	| }t
j|d
d ttjd }t| }W d    n1 sjw   Y  |d }t||jt|  t| t| t| ||d |
t| t| d	 g }|dkrddg}ng d}g }|D ].}tj| }|| }|| t|d}| }W d    n1 sw   Y  t|| qt||S )Nr   	      @      z2Use tensor-core SM80 version of MLA decode kernel.sm80z4Fall back to cuda-core version of MLA decode kernel.	cuda_coreTr=   zbatch_decode_mla_config.jinjazmla_config.inc)	r   r   r   r*   r0   r1   qo_tile_lenr   r   zbatch_decode_mla_cute_sm80.cubatch_decode_mla_binding.cu)zbatch_decode_mla_plan.cuzbatch_decode_mla_run.curj   r@   )torchcudaget_device_propertiesmajorfloat16r   infor_   rE   rF   rG   rH   rI   rJ   rK   rL   rM   r   rN   r   strlowerrO   r   )r   r   r   r*   r`   ra   r   r   rb   cuda_arch_majorri   r[   rP   rQ   rR   rS   rT   	filenamesrU   rV   rW   rX   rY   r'   r'   r(   gen_batch_decode_mla_module   sz   












ru   use_fp16_qk_reductionc
           
      C   sX   dt |  dt |  dt |  d| d| d| d| d| d	|	 | d
kr)d S d S )N%single_prefill_with_kv_cache_dtype_q_r   r    r!   r"   r#   r$   r%   _f16qk_r7   r8   r9   r&   )
r/   r   r   r   r   r   r   r   r   rv   r'   r'   r(   get_single_prefill_uri<  s.   
	ry   pos_encoding_mode_puse_sliding_window_puse_logits_soft_cap_ppos_encoding_mode_duse_sliding_window_duse_logits_soft_cap_dc                 C   s\   dt |   dt |  dt |  d| d| d| d| d|	 d	|
 d
| dt |  d| S )Npod_with_kv_cache_dtype_q_r   r    
_head_dim_
_posenc_p__use_swa_p__use_logits_cap_p_
_posenc_d__use_swa_d__use_logits_cap_d_r,   rx   r&   )r   r   r   r`   rz   r{   r|   rv   r*   r}   r~   r   r'   r'   r(   get_pod_uriU  s0   	
r   c                 C   sb   dt |  dt |  dt |  dt |  d| d| d| d| d	|	 d
|
 | dkr.d S d S )N$batch_prefill_with_kv_cache_dtype_q_r   r    r,   r!   r"   r#   r$   r%   rx   r7   r8   r9   r&   )r/   r   r   r   r*   r   r   r   r   r   rv   r'   r'   r(   get_batch_prefill_uris  s2   	
	
r   c	           	      C   sR   dt |  dt |  dt |  dt |  d| d| d| d| d	kr&d
 S d S )N3batch_prefill_with_attention_sink_kv_cache_dtype_q_r   r    r,   r!   r"   r$   _r7   r8   r9   r&   )	r/   r   r   r   r*   r   r   r   r   r'   r'   r(   $get_batch_prefill_attention_sink_uri  s&   
r   c	           	      C   sZ   dt |   dt |  dt |  dt |  d| d| d| dt|  d	t|  S )
N&batch_attention_with_kv_cache_dtype_q_r   r    r,   r!   r"   r#   _use_logits_soft_cap__use_profiler_)r   rq   rr   )	r   r   r   r*   r   r   r   r   r2   r'   r'   r(   get_batch_attention_uri  s$   

r   c           	      C   sx   t | |||||||}t|| ||||dgdgg dg ddt|  dt|  dt|dk  dd	|||d
S Nmaybe_alibi_slopesfloatlogits_soft_capsm_scalerope_rcp_scalerope_rcp_thetadoubler   r   r   zDefaultAttention<false, , r   >+#include<flashinfer/attention/variants.cuh>)r   r   r   )r)   "gen_customize_single_decode_modulerq   rr   )	r   r   r   r   r   r   r   r   rP   r'   r'   r(   gen_single_decode_module  s6   

2r   c
                 C   s&  t | |||||||||	
}
|tjtjfv }| dkrL|rJ dddg}ddg}g d}g d}d	t|  d
t|  d
t|dk  d}d}n1|sidg}dg}g d}g d}dt|  d}d}ng d}g d}g d}g d}d}d}t| |
|||||||||||||||	|dS )Nr?   /fp8 tensor core is not supported in fa2 backendmaybe_custom_maskr   uint8_tr   r   r   "DefaultAttention<use_custom_mask, r   r   r   r   maybe_scale_v)r   r   scale_v_scalar)r   r   r   DefaultAttention<2#include<flashinfer/attention/hopper/variants.cuh>maybe_scale_qmaybe_scale_kr   r   r   r   r   scale_q_scalarscale_k_scalarr   DefaultFP8Attentionr   r   r   rv   fp8_enabled)ry   rk   float8_e4m3fnfloat8_e5m2rq   rr   #gen_customize_single_prefill_module)r/   r   r   r   r   r   r   r   r   rv   rP   r   additional_tensor_namesadditional_tensor_dtypesadditional_scalar_namesadditional_scalar_dtypesvariant_namevariant_declr'   r'   r(   gen_single_prefill_module  sl   4r   c                 C   s   t | |||||||||	|
|}ddg}ddg}g d}g d}dt|  dt|  dt|d	k  d
}dt|
  dt|  dt|	d	k  d
}d}t|| |||||||||||||||	|
||dS )Nr   r   r   r   r   r   r   r   r   $DefaultAttention<use_custom_mask_p, r   r   r   $DefaultAttention<use_custom_mask_d, r   rz   r{   r|   r}   r~   r   rv   )r   rq   rr   gen_customize_pod_moduler   r   r   r`   rz   r{   r|   rv   r*   r}   r~   r   rP   r   r   r   r   variant_name_pvariant_name_dr   r'   r'   r(   gen_pod_moduleB  sV   44r   c                 C   s   dt | |||||||||	|
| }ddg}ddg}g d}g d}dt|  d	t|  d	t|d
k  d}dt|
  d	t|  d	t|	d
k  d}d}t|| |||||||||||||||	|
||dS )Nbatch_r   r   r   r   r   r   r   r   r   r   r   r   r   )r   rq   rr   gen_customize_batch_pod_moduler   r'   r'   r(   gen_batch_pod_module  sV   44r   FrP   r   r   r   r   r   r   r   c           %   	   C   V  t j|  }t||||	\}}}tt jd }t| }W d    n1 s(w   Y  tt jd }t| }W d    n1 sFw   Y  i d|d|d|d|d|
d|d	t| d
t| dt| dt| d|d|dt	| dt	| dt
| dt
| dt
| t
| t
| d}|jd i |}tj|dd |d }t|| g }dD ]2}dD ]-} t| |d< t|  |d< d| d|  d}!||! }"||" |jd i |}#t|"|# qqdD ]/}!t j|! }$||! }"||" t|$d}| }#W d    n	1 sw   Y  t|"|# qt| |S )!Nzpod_customize_config.jinjazpod_kernel_inst.jinjaadditional_func_paramsadditional_params_decladditional_params_setterr   r   r   r   r   r   idtyper   r   rz   r}   r{   r|   r~   r   rv   Tr=   zpod_config.incr   r   r   r
   mask_mode_pmask_mode_dpod_kernel_mask_p_d.cu)zpod.cuzpod_jit_binding.cur@   r'   rE   rF   r   rI   rJ   rK   rL   rM   r   r   rq   rr   rN   rG   rH   r   r   rO   r   %rP   r   r   r   r*   r`   r   r   r   r   r   r   r   rz   r{   r|   r}   r~   r   rv   rQ   r   r   r   rR   rS   kernel_inst_templkwargsgenerated_inc_strrT   rU   r   r   rV   rX   rY   rW   r'   r'   r(   r        
	








r   c           %   	   C   r   )!Nz batch_pod_customize_config.jinjazbatch_pod_kernel_inst.jinjar   r   r   r   r   r   r   r   r   r   r   r   rz   r}   r{   r|   r~   r   Tr=   zbatch_pod_config.incr   r   r   batch_pod_kernel_mask_r   r   )zbatch_pod.cuzbatch_pod_jit_binding.cur@   r'   r   r   r'   r'   r(   r   (  r   r   c	           
      C   s|   t | ||||||||	}	t|	| |||||dgdgg dg ddt|  dt|  dt|dk  dd	|||d
S r   )r.   !gen_customize_batch_decode_modulerq   rr   )
r   r   r   r*   r   r   r   r   r   rP   r'   r'   r(   gen_batch_decode_module  s:   2r   c                 C   s\  t | |||||||||	|
}|tjtjfv }| dv s!J d|  |tjtjfvs-J d| dkrd|r7J dg d}g d}g d}g d	}d
t|  dt|	  dt|dk  d}d}n3|sg d}g d}g d}g d}dt|	  d}d}ng d}g d}g d}g d}d}d}t| ||||||||||||||||	|
|dS )N)r?   r7   z?backend must be fa2 or fa3 in gen_batch_prefill_module(), got: z3FP8 output is not supported in fa2/fa3 backends yetr?   r   )r   maybe_mask_indptrr   maybe_prefix_len_ptrmaybe_token_pos_in_items_ptrmaybe_max_item_len_ptr)r   int32_tr   uint32_tuint16_tr   )r   r   r   r   token_pos_in_items_len)r   r   r   r   int64_tr   r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   )r   rk   r   r   rq   rr   "gen_customize_batch_prefill_module)r/   r   r   r   r*   r   r   r   r   r   rv   rP   r   r   r   r   r   r   r   r'   r'   r(   gen_batch_prefill_module  s|   
4r   c	                 C   s^   ddl m}	 t| ||||||||	}
t| |
||||||dgdgdgdgd|	|  ||dddd	S )
Nr   )attention_sink_declsinkr   r   r   AttentionSinkFr   )!flashinfer.jit.attention.variantsr   r   r   )r/   r   r   r   r*   r   r   r   r   r   rP   r'   r'   r(   'gen_batch_prefill_attention_sink_module8  sB   r   c	                 C   sh   t | ||||||||	}	g }
g }g }g }dt|  d}d}t|	| ||||||
||||||||dS )NzStandardAttention<r   r   )r   r   r2   )r   rq   rr   $gen_customize_batch_attention_module)r   r   r   r*   r   r   r   r   r2   rP   r   r   r   r   r   r   r'   r'   r(   gen_batch_attention_moduleh  sF   r   r   c                 C   s  t j|  }t||||	\}}}tt jd }t| }W d    n1 s(w   Y  tt jd }t| }W d    n1 sFw   Y  |||||
t| t| t| ||t	| t
| t
| d}|jd
i |}tj|dd g }|d }|| |jd
i |}t|| dD ].}t j| }|| }|| t|d}| }W d    n1 sw   Y  t|| q|d	 }t|| t| |S )Nz$single_decode_customize_config.jinjazsingle_decode_kernel_inst.jinja)r   r   r   r   r   r   r   r   r   r   r   r   r   Tr=   zsingle_decode_kernel.cu)zsingle_decode.cuzsingle_decode_jit_binding.cur@   zsingle_decode_config.incr'   )rE   rF   r   rI   rJ   rK   rL   rM   r   r   rq   rr   rN   rG   rH   rO   r   r   )rP   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rQ   r   r   r   rR   rS   r   r   r   rU   rX   rY   rV   rW   rT   r'   r'   r(   r     sp   









r   r   c           %      C   s  ||t | t | t | ||t| t| t| t| d}| dkr*td| dkrtj| }t|||	|
\}}}ttj	d }t
| }W d    n1 sVw   Y  ttj	d }t
| }W d    n1 stw   Y  ||||dO }|jdi |}tj|dd	 g }d
D ]"}d| d}|| }|| |jddt| i|}t|| qdD ].}tj	| } || }|| t| d}| }W d    n1 sw   Y  t|| q|d }!t|!| t||S | dkrtj| }t|||	|
dd\}}}d}"|rd}#d}$nd}#d}$ttj	|" }t
| }W d    n	1 s4w   Y  ttj	|# }t
| }W d    n	1 sSw   Y  ||||dO }|jdi |}tj|dd	 g }d
D ]#}d| d}|| }|| |jddt| i|}t|| qs|$dfD ]0}tj	| } || }|| t| d}| }W d    n	1 sw   Y  t|| q|d }!t|!| t||tdS td|  )N)r   r   r   r   r   r   r   r   r   r   rv   r;   r<   r?   z%single_prefill_customize_config.jinjaz single_prefill_kernel_inst.jinja)r   r   r   Tr=   r   single_prefill_kernel_mask_.cu	mask_mode)zsingle_prefill.cuzsingle_prefill_jit_binding.cur@   zsingle_prefill_config.incr7   is_sm90_templatez*single_prefill_sm90_customize_config.jinjaz)single_prefill_fp8_sm90_kernel_inst.jinjazsingle_prefill_fp8_sm90.cuz%single_prefill_sm90_kernel_inst.jinjazsingle_prefill_sm90.cu single_prefill_sm90_kernel_mask_z"single_prefill_sm90_jit_binding.cuzsingle_prefill_sm90_config.incrB   Invalid backend: r'   r   r   rq   rr   rD   rE   rF   r   rI   rJ   rK   rL   rM   rN   rG   rH   rO   r   r   r   r   )%r/   rP   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rv   r   r   rQ   r   r   r   rR   rS   r   r   rU   r   rV   rX   rY   rW   rT   _file_config_file_kernel_inst
_file_csrcr'   r'   r(   r     s   



	













r   r   c                 C   s  t j|  }t|||	|
\}}}|||||t| t| t| t| ||t| t| t| d}tt jd }t	
| }W d    n1 sKw   Y  tt jd }t	
| }W d    n1 siw   Y  |jdi |}g }|d }|| |jdi |}t|| dD ].}t j| }|| }|| t|d}| }W d    n1 sw   Y  t|| q|d }t|| t| |S )	N)r   r   r   r   r   r   r   r   r   r   r   r   r   r   z#batch_decode_customize_config.jinjazbatch_decode_kernel_inst.jinjazbatch_decode_kernel.cu)zbatch_decode.cuzbatch_decode_jit_binding.cur@   zbatch_decode_config.incr'   )rE   rF   r   r   r   rq   rr   rI   rJ   rK   rL   rM   rN   rO   r   r   )rP   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rQ   r   r   r   r   rR   rS   r   r   rU   rX   rY   rV   rW   rT   r'   r'   r(   r     sd   










r   c           (      C   s  ||t | t | t | t | ||t| t| t| t| d}| dkr-td| dkr5tj| }t||	|
|\}}}ttj	d }t
| }W d    n1 sZw   Y  ttj	d }t
| }W d    n1 sxw   Y  ttj	d }t
| }W d    n1 sw   Y  ||||dO }|jd"i |}tj|d	d
 g }dD ]>}|d| d }|| |jd"dt| i|} t||  |d| d }|| |jd"dt| i|} t||  qdD ]/}!tj	|! }"||! }|| t|"d}| } W d    n	1 sw   Y  t||  q|d }#t|#| t||S | dkrWtj| }t||	|
|d	d\}}}d}$|rWd}%d}&d}'nd}%d}&d}'ttj	|$ }t
| }W d    n	1 sww   Y  ttj	|% }t
| }W d    n	1 sw   Y  ttj	|& }t
| }W d    n	1 sw   Y  ||||dO }|jd"i |}g }dD ]C}d| d}!||! }|| |jd"dt| i|} t||  d| d}!||! }|| |jd"dt| i|} t||  q|'dfD ]0}!tj	|! }"||! }|| t|"d}| } W d    n	1 s;w   Y  t||  q|d }#t|#| t||td S td!|  )#N)r   r   r   r   r   r   r   r   r   r   r   rv   r;   r<   r?   z$batch_prefill_customize_config.jinjaz%batch_prefill_paged_kernel_inst.jinjaz&batch_prefill_ragged_kernel_inst.jinjar   r   r   Tr=   r    batch_prefill_paged_kernel_mask_r   r   !batch_prefill_ragged_kernel_mask_)zbatch_prefill.cuzbatch_prefill_jit_binding.cur@   zbatch_prefill_config.incr7   r   z)batch_prefill_sm90_customize_config.jinjaz.batch_prefill_fp8_paged_sm90_kernel_inst.jinjaz/batch_prefill_fp8_ragged_sm90_kernel_inst.jinjazbatch_prefill_fp8_sm90.cuz*batch_prefill_paged_sm90_kernel_inst.jinjaz+batch_prefill_ragged_sm90_kernel_inst.jinjazbatch_prefill_sm90.cu%batch_prefill_paged_sm90_kernel_mask_&batch_prefill_ragged_sm90_kernel_mask_z!batch_prefill_sm90_jit_binding.cuzbatch_prefill_sm90_config.incrB   r   r'   r   )(r/   rP   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rv   r   r   rQ   r   r   r   rR   rS   paged_kernel_inst_templragged_kernel_inst_templr   rU   r   rX   rY   rV   rW   rT   r   _file_paged_kernel_inst_file_ragged_kernel_instr   r'   r'   r(   r     s2  




	

















r   c	           	      C   s   dS )Nfmha_cutlass_sm100ar'   r-   r'   r'   r(   get_fmha_cutlass_sm100a_uri  s   r  c	              
   C   sR   t | ||||||||	}	tjd tjd tjd g}
tjg dd}t|	|
|dS )Nzfmha_cutlass_sm100.cuzfmha_cutlass_sm100_binding.cuzblackwell_fmha_plan.cu)
         supported_major_versionsrB   )r  rE   rJ   r	   get_nvcc_flags_listr   )r   r   r   r*   r   r   r   r   r   rP   rU   
nvcc_flagsr'   r'   r(   gen_fmha_cutlass_sm100a_module  s.   r  c                  C   s   ddl m} m} | j d}d}| j d}t||j}|s%J d| t|}t| d| d|}|s=J | d	td
tjd tjd gtj	| gd| j dd| dgdS )Nr
   )ArtifactPathCheckSumHashz/includeflashInferMetaInfoz/checksums.txtz!Failed to get checksums.txt from /z.hz.h not foundfmha_genztrtllm_fmha_kernel_launcher.cuzfmhaReduction.cuz-DTLLM_GEN_FMHA_CUBIN_PATH=\"\"z -DTLLM_GEN_FMHA_METAINFO_HASH=\")extra_include_pathsrC   )
	artifactsr  r  TRTLLM_GEN_FMHAr   r   r   rE   rJ   FLASHINFER_CUBIN_DIR)r  r  include_pathheader_namechecksum_pathchecksum	meta_hashmetainfor'   r'   r(   gen_trtllm_gen_fmha_module  s,   

r  c                  C   s  ||t | t | t | t | ||t| t| d
}tj|  }t|||	|
\}}}ttjd }t	
| }W d    n1 sCw   Y  ttjd }t	
| }W d    n1 saw   Y  ||||dO }|jdi |}tj|dd g }dD ] }|d| d	 }|| |jdd
t| i|}t|| qdD ].}tj| }|| }|| t|d}| }W d    n1 sw   Y  t|| q|d }t|| t| ||rdgdS g dS )N)
r   r   r   r   r   r   r   r   r   r   z&batch_attention_customize_config.jinjaz'batch_attention_paged_kernel_inst.jinjar   Tr=   r   "batch_attention_paged_kernel_mask_r   r   )zbatch_attention.cuzbatch_attention_jit_binding.cur@   zbatch_attention_config.incrA   rB   r'   )r   r   rq   rr   rE   rF   r   rI   rJ   rK   rL   rM   rN   rG   rH   rO   r   r   r   ) rP   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r2   r   rQ   r   r   r   rR   rS   r   r   rU   r   rX   rY   rV   rW   rT   r'   r'   r(   r     s   






r   c                  C   s.   ddl m}  tdtjd gd| j dgdS )Nr
   r  fmha_cudnn_genzcudnn_sdpa_kernel_launcher.cuz-DCUDNN_SDPA_CUBIN_PATH=\"r  rB   )r  r  r   rE   rJ   
CUDNN_SDPAr   r'   r'   r(   gen_cudnn_fmha_moduleb  s   
r#  c                  C   s   t   } | S )N)gen_trtllm_fmha_v2_modulebuild_and_load)moduler'   r'   r(   get_trtllm_fmha_v2_modulen  s   
r'  c                  C   s   d} t j|  }|jddd t jd }t|| g d}dd |D }t jd }||g }tjd	gd
}|dt jd   |d t| ||dS )Ntrtllm_fmha_v2T)parentsr>   fmha_v2)z<fmha_v2_flash_attention_bf16_64_128_S_q_k_v_192x128_sm120.cuzLfmha_v2_flash_attention_e4m3_fp32_64_64_S_q_k_v_192x128_output_bf16_sm120.cuz@fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_k_v_192x128_sm120.cuc                 S   s   g | ]}t jd  d | qS )r(  	generated)rE   FLASHINFER_JIT_DIR).0kernelr'   r'   r(   
<listcomp>  s    z-gen_trtllm_fmha_v2_module.<locals>.<listcomp>ztrtllm_fmha_v2_binding.cur  r	  z-Iz-Wno-deprecated-gpu-targetsrB   )	rE   r,  mkdirrJ   r   r	   r  rO   r   )rP   
cached_opsfmha_v2_src_dirkernelskernel_pathsbinding_source_pathrU   r  r'   r'   r(   r$  s  s*   





r$  )r   FFr   FFF)r   FF)r   FFFF)?__doc__rG   typingr   rK   rk   r9   r   rE   corer   r   r   r   r	   jit.cubin_loaderr   r   utilsr   r   r   r   r   r   fmha_v2.generate_kernelsr   dtypeintboolrq   r)   r.   r:   rZ   r_   ru   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r   r#  r'  r$  r'   r'   r'   r(   <module>   s   	
	

	
	
`	
	

X	

	

	

	

	

	
,	

Z	

A	

O	

r	

d	

/	

}	

0	
?	

d	

 3	

_	

 ?	

	

'2	

[