o
    پibq                     @   s8  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlZddlmZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZS ddlTmUZUmVZV ddlTmWZX ddlYmZZZ ddl[m\Z\ dej]d ej]d!e^d"e^d#e_d$e_d%eeU fd&d'Z`d(ej]d ej]d)ej]d!e^d"e^d#e_d$e_d%eeU fd*d+Zad,e	ej] d-e	ej] d.e	e
e^e^f  d/e	e
e^e^f  d0e	e_ d1e	e_ d2e_d3e_d4e_d5e_d%eeU fd6d7Zbd8e	ej] d9e	e_ d:e	e^ d;e	e^ d<e	e^ d0e	e_ d2e_d3e_d=e_d>e_d%eeU fd?d@Zcd,e	ej] d-e	ej] d.e	e
e^e^f  d/e	e
e^e^f  d0e	e_ d1e	e_ dAeddBe_d4e_d5e_dCe_dDe_dEe_dFe_d%e	eU fdGdHZedIe	eU dJed%dfdKdLZf		M	NdedJee dOedPedQeddRe_dSe_d%dfdTdUZgdVehd%e_fdWdXZidYehd%e
e^e^f fdZd[Zjd\d] Zkd^d_ Zld%e^fd`daZmdbdc Zneoddkren  dS dS )fa  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

AOT build script for FlashInfer.

NOTE (Zihao): The following modules are intentionally excluded from the AOT build:
- gen_pod_module
- gen_deepgemm_sm100_module (it doesn't involve host-side compilation)
    N)product)Path)ListTupleIteratorOptional)Version   )act_func_def_strgen_act_and_mul_module)gen_cascade_module) gen_fp4_quantization_sm90_module!gen_fp4_quantization_sm100_module!gen_fp4_quantization_sm103_module!gen_fp4_quantization_sm110_module!gen_fp4_quantization_sm120_module!gen_fp4_quantization_sm121_module)#gen_mxfp8_quantization_sm100_module)gen_gdn_prefill_sm90_module)"gen_cutlass_fused_moe_sm120_module"gen_cutlass_fused_moe_sm103_module"gen_cutlass_fused_moe_sm100_module!gen_cutlass_fused_moe_sm90_module%gen_trtllm_gen_fused_moe_sm100_module)
gen_gemm_modulegen_gemm_sm90_modulegen_gemm_sm100_module!gen_gemm_sm100_module_cutlass_fp4!gen_gemm_sm100_module_cutlass_fp8gen_tgv_gemm_sm10x_modulegen_gemm_sm120_module!gen_gemm_sm120_module_cutlass_fp4gen_trtllm_gen_gemm_module"gen_trtllm_low_latency_gemm_module)gen_spdlog_module)gen_mla_module)!gen_selective_state_update_module&gen_selective_state_update_sm90_module'gen_selective_state_update_sm100_module)gen_norm_module)gen_page_module)gen_quantization_module)gen_rope_module)gen_sampling_module)gen_topk_module)gen_trtllm_utils_module)gen_xqa_modulegen_xqa_module_mla)	gen_batch_attention_modulegen_batch_decode_modulegen_batch_mla_modulegen_batch_prefill_modulegen_cudnn_fmha_modulegen_fmha_cutlass_sm100a_modulegen_single_decode_modulegen_single_prefill_modulegen_trtllm_gen_fmha_module)JitSpecbuild_jit_specs)env)get_cuda_version)CompilationContextdtype_qodtype_kvhead_dim_qkhead_dim_vouse_sliding_windowuse_logits_soft_capreturnc                 c   s    | j |j kr| |krd S | j dkrd S td| || ||d||dd
V  td| || tj||d||ddV  t| || ||d||dV  t| || tj||d||d	V  d S )	Nr	   fa2r   F)
backenddtype_qrA   dtype_orB   rC   pos_encoding_moderD   rE   use_fp16_qk_reductionrH   rI   rA   rJ   	dtype_idxrB   rC   rK   rD   rE   rL   )rI   rA   rJ   rB   rC   rK   rD   rE   	rI   rA   rJ   rN   rB   rC   rK   rD   rE   )itemsizer9   r5   torchint32r8   r3   r@   rA   rB   rC   rD   rE    rT   B/home/ubuntu/.local/lib/python3.10/site-packages/flashinfer/aot.pygen_fa2`   sf   
rV   rI   rJ   c                 c   sh    | |krd S | j dkr| |krd S |j dkr!|dks|dkr!d S td| ||tj||d||ddV  d S )	N   r	      @   fa3r   FrM   )rP   r5   rQ   rR   rI   rA   rJ   rB   rC   rD   rE   rT   rT   rU   gen_fa3   s,   	

r\   
f16_dtype_	f8_dtype_fa2_head_dim_fa3_head_dim_use_sliding_window_use_logits_soft_cap_has_sm90	has_sm100	add_gemmaadd_oai_ossc
                 c   s
   d}
d}t || | | ||D ]%\\}}}}}}t||||||dE d H  t|||tj||d|dd	V  q|rYt || | | ||D ]\\}}}}}}t|||||||dE d H  qA|rt | | | dgD ]\}}\}}t||d	d	||dE d H  qd|rt | | | dgD ]\}}\}}t|||d	d	||dE d H  q|	rd
dlm} | D ]}dD ]}dD ]}|||||tjddd|d	V  qqq|rttj	tj	tj	tjdddddd	V  t
 V  dg|rdgng  }| D ]}|D ]}t||||tj|
|ddV  qq|rt V  d S d S )Ni   rY   rS   r   F)	rI   rA   rJ   rN   rB   rC   rK   rE   use_profilerr[   )TT   r	   )'gen_batch_prefill_attention_sink_module)rG   rZ   )TF)	rH   rI   rA   rJ   rN   rB   rC   rK   rD      rO   rG   rZ   )rH   rI   rA   rJ   rN   head_dim_ckvhead_dim_kperg   )r   rV   r2   rQ   rR   r\   jit.attentionri   r7   bfloat16r:   r4   r%   )r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rk   rl   rB   rC   r@   rA   rD   rE   	dtype_qkvrJ   ri   dtyperH   use_swamla_backend_rT   rT   rU   gen_attention   s  	


rs   input_type_fp8_kv_cache_token_per_page_
head_size_head_grp_size_	has_sm120	has_sm121c
              
   c   s    |s|s|s|	sdS t | |||||D ]0\}
}}}}}|d dks*|dks*|dk r+q|dvr0q|r6tj}n|
}t|
||||||
dV  q|sI|	r[|D ]}ttjtj|ddd	d
V  qKdS dS )z0Generate XQA modules for various configurations.N   r   rh   r{       rY   rj   )input_dtypekv_cache_dtype	page_sizehead_dimhead_group_ratiorD   output_dtypei@  rj   F)r~   r   r   r   r   rD   )r   rQ   float8_e4m3fnr0   r1   )rt   ru   rv   rw   rx   ra   rc   rd   ry   rz   
input_typefp8_kv_cachetoken_per_page	head_sizehead_grp_sizerD   r   rT   rT   rU   gen_xqan  s\   	



r   sm_capabilitiesadd_commadd_moeadd_actadd_miscadd_xqac           %      C   s  g }| t  |dd}|dd}|dd}|dd}|dd}|dd}|dd}|tt| |||||||||	
7 }|rQtD ]	}| t| qG|
r
| t  |rn| t  | t	  | t
  |r| t  | t  | t  | t  | t  | ttjdd	 | ttjdd	 | t  | t  | t  | t  |r| ttjd
d	 | ttjd
d	 |r| t  | t  |r| t  |r| t  | t  | t  | t  |r
| t  |rTddl m!}m"} ddl m#} ddl m$} ddl m%} ddl m&} | |  | |  |rN| |  | |  | |  | |  |r|t' t( t) t* t+ t, t- t. g7 }|r| t/  | t0  | t1  |r| t2  |rt3 t4dkrtjtjg}d
dg}g d}g d} g d}!|tt5|||| |!|||||
7 }| t6  t7 }"g }#|D ]}$|$j8|"vr|"9|$j8 |# |$ q|#S )Nsm90Fsm100sm100fsm103sm110sm120sm121)use_sm_100fTr	   )gen_trtllm_comm_modulegen_vllm_comm_module)gen_nvshmem_module)gen_comm_alltoall_module)gen_trtllm_mnnvl_comm_module)gen_moe_alltoall_module12.8r|   )rY   rj   rh   )r	   rW         ):appendr$   getlistrs   r
   r   r   r   r   r   r   r   r   r   r   r   rQ   rn   float16r   r"   r#   r   r   r   r   r   r   r    r!   r   jit.commr   r   r   r   r   r   r   r)   r*   r+   r,   r-   r.   r&   r'   r/   r   r(   r>   r   r   r6   setnameadd)%r]   r^   r_   r`   ra   rb   r   r   re   rf   r   r   r   r   	jit_specsrc   rd   
has_sm100f	has_sm103	has_sm110ry   rz   act_namer   r   r   r   r   r   xqa_input_type_xqa_fp8_kv_cache_xqa_token_per_page_xqa_head_size_xqa_head_grp_size_namesretjit_specrT   rT   rU   gen_all_modules  s   

r   r   out_dirc                 C   sx   |  r	t| |jddd | D ]'}tj|j |j d }||j |j d }|jjddd t|| qd S )NTFparentsexist_okz.so)r   r   )	existsshutilrmtreemkdirjit_envFLASHINFER_JIT_DIRr   parentcopy2)r   r   r   srcdstrT   rT   rU   copy_built_kernelsL  s   
r   FT	build_dirproject_rootconfigverboseskip_prebuiltc                 C   sH  t  }|dur|| |}dtjvrtdt }|d t_|d t_|d d d |d d d d	 d gt_	|d d
 d t_
|t_|d t_|d t_tjjddd tjjddd |rtd | durntd|  td| td|d  td|d  td|d  td|d  td|d  td|d  tdtjd  td | D ]\}}	|	rtd | d! qd"D ]}
td#|
 d$||
  q|rtd% t|d |d |d |d |d |d ||d& |d' |d( |d) |d* |d+ |d, }|rtd-t| t|||d. | durt||  |r"td/|  dS dS )0a  
    Compile and package modules based on the provided configuration.

    Args:
        out_dir: Output directory for packaged modules
        build_dir: Build directory for compilation
        project_root: Project root directory
        config: Configuration dictionary to override defaults (optional)
        verbose: Whether to print verbose build output
        skip_prebuilt: Whether to skip pre-built modules
    NFLASHINFER_CUDA_ARCH_LISTz8Please explicitly set env var FLASHINFER_CUDA_ARCH_LIST.csrcinclude3rdpartycutlasstoolsutilspdlog
cached_ops	generatedTr   zAOT build summary:z
  out_dir:z  build_dir:z  fa2_head_dim:fa2_head_dimz  fa3_head_dim:fa3_head_dimz  f16_dtype:	f16_dtypez  f8_dtype:f8_dtypez  use_sliding_window:rD   z  use_logits_soft_cap:rE   z  FLASHINFER_CUDA_ARCH_LIST:z  SM capabilities detected:z    z: Truer   re   rf   r   r   r   r   z  :zGenerating JIT specs...r   re   rf   r   r   r   r   z
Total ops:)r   r   zAOT kernels saved to:)get_default_configupdateosenvironRuntimeErrordetect_sm_capabilitiesr   FLASHINFER_CSRC_DIRFLASHINFER_INCLUDE_DIRCUTLASS_INCLUDE_DIRSSPDLOG_INCLUDE_DIRFLASHINFER_WORKSPACE_DIRr   FLASHINFER_GEN_SRC_DIRr   printitemsr   lenr<   r   )r   r   r   r   r   r   final_configr   sm_namehas_smkeyr   rT   rT   rU   compile_and_package_modulesZ  s~   







	

r   sc                 C   s.   |   dv rdS |   dv rdS td|  )N)true1T)false0FzInvalid boolean value: )lower
ValueError)r   rT   rT   rU   
parse_bool  s
   r   r   c                 C   s   t t| d\}}||fS )N,)mapintsplit)r   qokvrT   rT   rU   parse_head_dim  s   r   c                   C   s<   g dg dt jt jgt jgddgddgddddddddS )zGet default AOT configuration)rY   rY   rj   rj   rh   rh   ))rX   rj   r   r   r  FT)r   r   r   r   rD   rE   r   re   rf   r   r   r   r   )rQ   r   rn   r   rT   rT   rT   rU   r     s   
r   c               	      sj   t  } | jdd dtdtdtf fdd}|dd	|d
d|d
d|dd|dd|dd|dddS )zDetect SM capabilitiesN)supported_major_versionscomputeversionrF   c                    s(   t  fddD sdS t t|kS )Nc                 3   s    | ]} |v V  qd S )NrT   ).0flagr  rT   rU   	<genexpr>  s    z9detect_sm_capabilities.<locals>.has_sm.<locals>.<genexpr>F)anyr>   r   )r  r  gencode_flags_listr  rU   r     s   z&detect_sm_capabilities.<locals>.has_sm
compute_90z12.3compute_100r   z12.9compute_103compute_110z13.0compute_120compute_121)r   r   r   r   r   r   r   )r?   get_nvcc_flags_liststrbool)compilation_contextr   rT   r
  rU   r     s   r   c                  C   sj   t  } t }t| d | d | d | d | d | d || d | d | d	 | d
 | d | d | d }t|S )z#Register the default set of modulesr   r   r   r   rD   rE   r   re   rf   r   r   r   r   )r   r   r   r   )r   r   r   rT   rT   rU   register_default_modules  s&   r  c                  C   s.  t jdd} | jdtdd | jdtdd | jdd	d
d | jdd	dd | jdd	ddgdd | jdd	ddgdd | jdd	dd | jdd	dd | jdtdd | jdtdd | jdtd d | jd!td"d | jd#td$d | jd%td&d | jd'td(d |  }tt jd) }t	 }t
j}d }|jrt|j}|jrt|j}|jrd*d+ |jD |d,< |jrd-d+ |jD |d.< |jrd/d+ |jD |d0< |jrd1d+ |jD |d2< |jrd3d+ |jD |d4< |jrd5d+ |jD |d6< d7D ]}t||d }|d ur
|||< qt||||d8d9d: d S );Nz%Ahead-of-Time (AOT) build all modules)descriptionz	--out-dirzOutput directory)typehelpz--build-dirzBuild directoryz--fa2-head-dim*z2FA2 head dim pair of qk and vo, separated by comma)nargsr  z--fa3-head-dimz2FA3 head dim pair of qk and vo, separated by commaz--f16-dtyper   rn   z16-bit data type)r  choicesr  z
--f8-dtyper   float8_e5m2z8-bit data typez--use-sliding-windowzUse sliding window attentionz--use-logits-soft-capzUse logits soft capz
--add-commz2Add communication kernels (trtllm_comm, vllm_comm)z--add-gemmazSAdd kernels for Gemma Model (head_dim=256, use_sliding_window, use_logits_soft_cap)z--add-oai-ossz?Add kernels for OAI OSS Model (head_dim=64, use_sliding_window)z	--add-moezAdd MoE kernelsz	--add-actzAdd activation kernelsz
--add-misczAdd miscellaneous kernelsz	--add-xqaz'Add XQA (Cross-Query Attention) kernelsr	   c                 S      g | ]}t |qS rT   r   r  dimrT   rT   rU   
<listcomp>Y      zmain.<locals>.<listcomp>r   c                 S   r  rT   r  r   rT   rT   rU   r"  [  r#  r   c                 S      g | ]}t t|qS rT   getattrrQ   r  rp   rT   rT   rU   r"  ]      r   c                 S   r$  rT   r%  r'  rT   rT   rU   r"  _  r(  r   c                 S   r  rT   r   r  r   rT   rT   rU   r"  a  r#  rD   c                 S   r  rT   r)  r*  rT   rT   rU   r"  c  s    rE   r   TF)r   r   r   r   r   r   )argparseArgumentParseradd_argumentr   r   
parse_args__file__resolver   r   r   r   r   r   r   r   r   r   rD   rE   r&  r   )parserargsr   r   r   r   r   	arg_valuerT   rT   rU   main  s   


	

r4  __main__)NFT)p__doc__r+  r   r   	itertoolsr   pathlibr   typingr   r   r   r   rQ   packaging.versionr   jit.activationr
   r   jit.cascader   jit.fp4_quantizationr   r   r   r   r   r   jit.fp8_quantizationr   jit.gdnr   jit.fused_moer   r   r   r   r   jit.gemmr   r   r   r   r   r   r    r!   r"   r#   
jit.spdlogr$   jit.mlar%   	jit.mambar&   r'   r(   jit.normr)   jit.pager*   jit.quantizationr+   jit.roper,   jit.samplingr-   jit.topkr.   jit.tllm_utilsr/   jit.xqar0   r1   rm   r2   r3   r4   r5   r6   r7   r8   r9   r:   jitr;   r<   r=   r   jit.cpp_extr>   r  r?   rp   r   r  rV   r\   rs   r   dictr   r   r   r  r   r   r   r   r  r4  __name__rT   rT   rT   rU   <module>   sb    0,
@
"	

 -	

@	

 

j	
i
