o
    i5                     @   sj  d dl Z d dlZd dlmZmZ ddlmZmZmZm	Z	m
Z
mZmZmZmZmZmZmZmZ ddlmZ G dd de jZG d	d
 d
e jZejdejdejdejdiZejdejdejdejdiZejdejdiZ ddejdejdiZ!G dd de jZ"e"j#de"j$de"j%de"j&diZ'e"j#de"j$de"j%d e"j&d!iZ(G d"d# d#Z)e) Z*d$d% Z+ddd&d'Z,e*d(e	j-d)e	j.d*e	j/d+e	j0d,e	j*d-e	j1d.e	j2d/iZ3G d0d1 d1Z4d2d3 Z5d4d5 Z6d6d7 Z7d8d9 Z8d:d; Z9d<d= Z:d>d? Z;d@dA Z<dBdC Z=dDdE Z>dFdG Z?dHdI Z@dJdK ZAdLdM ZBdNdO ZCdPdQ ZDdRdS ZEdTdU ZFdVdW ZGdXdY ZHdZd[ ZIG d\d] d]ZJd^d_ ZKd`da ZLdbdc ZMdS )e    N)chainproduct   )	enum_autoDataTypeNamesDataTypeSizeDataTypeDataTypeTagGemmKindGemmKindNamesKernelScheduleTypeKernelScheduleTagKernelScheduleSuffixesEpilogueScheduleTypeEpilogueScheduleTagEpilogueScheduleSuffixes   )is_cuda_version_at_leastc                   @   $   e Zd Ze Ze Ze Ze ZdS )TrtLlm_EpilogueTagN)__name__
__module____qualname__r   epilogue_op_defaultepilogue_op_biasepilogue_op_siluepilogue_op_gelu r   r   b/home/ubuntu/vllm_env/lib/python3.10/site-packages/flashinfer/jit/gemm/cutlass/generate_kernels.pyr      
    
r   c                   @   s   e Zd Ze Ze ZdS )TrtLlm_EpilogueFusionN)r   r   r   r   epilogue_fusion_noneepilogue_fusion_finalizer   r   r   r   r        s    
r    lclc_biassilugeluz3tensorrt_llm::cutlass_extensions::EpilogueOpDefaultz0tensorrt_llm::cutlass_extensions::EpilogueOpBiasz7tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSiluz9tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGeluzFtensorrt_llm::TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONEzJtensorrt_llm::TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE EpilogueFusion_NONEEpilogueFusion_FINALIZEc                   @   r   )TrtLlm_QuantOpN)r   r   r   r   per_column_scale_onlyfinegrained_scale_onlyfinegrained_scale_and_zerosnoner   r   r   r   r*   A   r   r*   csfgsfgsznoquantz1cutlass::WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLYz2cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLYz7cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROSvoidc                   @   s   e Zd ZdS )	e2m1_typeN)r   r   r   r   r   r   r   r4   [   s    r4   c                 C   s   t | trdS t|  S )N   )
isinstancer4   r   )typer   r   r   GetDataTypeBitsb   s   
r8   c                 C   s6   d}|d ur|r
dnd}t | tr|d S |t|   S )Nr'   mx_nv_e2m1)r6   r4   r   )r7   	is_mx_fpxmxprefixr   r   r   GetDataTypeNamesh   s   
r>   SafeFP4__nv_fp8_e4m3__nv_bfloat16halffloat__nv_fp4_e2m1zcutlass::float_ue8m0_tzcutlass::uint4b_tc                   @   s&   e Zd Z				dddZdd ZdS )TrtLlm_GemmLauncherNFc                 C   sv   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _d S N)	gemm_kindarchact_typeweight_typescalezero_type	bias_typeoutput_typequant_opepi_tag	cta_shape
warp_shapestages	cga_shapedynamic_cgamainloop_scheduleepi_schedule
epi_fusionr<   swap_ab)selfrG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rU   rV   rW   r<   rT   rX   r   r   r   __init__   s&   
zTrtLlm_GemmLauncher.__init__c                 C   s
  d t| j | jt| j| jt| j| jt| jt| j	t| j
t| j t| j | jd | jd | jd | jd | jd | jd | j}d | jd | jd | jd t| j t| j t| j | jrcdnd| jridnd}| jd	kru|| S | jd
krtd| j d|S )Nz1{}_sm{}_{}_{}_{}_{}_{}_{}_{}_{}x{}x{}_{}x{}x{}_{}r   r      z_{}x{}x{}{}{}{}{}{}_mxfpx_r'   _swap_abZ   d   SMz not supported yet.)formatr   rG   rH   r>   rI   r<   rJ   rK   rL   rM   QuantOpNamesrN   EpiTagNamesrO   rP   rQ   rR   rS   r   rU   r   rV   EpiFusionSuffixesrW   rX   
ValueError)rY   kernel_prefixhopper_suffixr   r   r   __repr__   sB   

zTrtLlm_GemmLauncher.__repr__)NFFF)r   r   r   rZ   rh   r   r   r   r   rE      s    
*rE   c                 C   s$   d| d  d| d  d| d  dS )Nzcute::Shape<cute::Int<r   z>, cute::Int<r   r[   z>>r   )shaper   r   r   tuple_to_cute_shape   s   $rj   c                 C   s  t | j }t | j }t | j }t | j }t| j }t| j }t	| j
}t	| j}t| j }	d}
| jd ur9t| j }
| jtjkrt| j }dg d| d| d| d| d| d| d| d| d| d|	 d|
 d| d| d| d| d| d| d	}|S | jtjkr| j| jkr| jtjks| jtkrt | j }d
| d| d| d| d| d| d|	 d|
 d| d| d| d| d| d}|S | jtjtjfv sJ |	dd d| j }t | j }| jd usJ t | j }|!dd }|!dd }|
!dd }
|
dd}
tdtjdtj"di}|#| jd}|#| jd}t$| j%& }t$| j'& }t$d& }t$| j(& }dg d| d| d| d| d| d| d|
 d| d| d| j
d  d| j
d  d| j
d  d| jd  d| jd  d| jd  d| d| d| d| d}|S ) Nr3   r'   z6
template void sm90_generic_mixed_gemm_kernelLauncher<, z,
z
> (
const z	*, const z*, const float,
zv*, int, int, int, const int, tensorrt_llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
);z:
template void sm90_generic_mixed_moe_gemm_kernelLauncher<z> (
GroupedGemmInput<zc>inputs, TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size);
z::Kernelz::KernelGroupedSm:1Smzdefined(ENABLE_FP4)zdefined(ENABLE_FP8)zdefined(ENABLE_BF16)1Fz
#if z && z3
        INSTANTIATE_TMA_WARP_SPECIALIZED_MOE_GEMM(z
,
        r   r   r[   z	);
#endif))CudaTypeNamerI   rK   rL   rM   
QuantOpTagrN   EpiTagrO   rj   rP   rS   r   rU   rV   r   rG   r
   Gemmr	   rJ   joinGroupedr   e4m3r;   r   TmaWarpSpecializedCooperative)TmaWarpSpecializedCooperativeFP8FastAccumreplacerH   rW   	EpiFusionsplitbf16getstrr<   lowerrT   rX   )	operationact_tagscale_zero_tagbias_tagout_tagrN   rO   cute_cta_shapecute_cga_shapekernel_sched	epi_sched
weight_taginstantiationarch_tagrW   	guard_map	guard_actguard_weightr<   use_dynamic_cgause_biasrX   r   r   r   *instantiate_operation_tma_warp_specialized   s&  












B
4



r   c                 C   s~   t | j }t | j }t| j }d| d| d| jd  d| jd  d| jd  d| j d| d| d| d| d	| d
}|S )NzF
            template void sm80_generic_fused_moe_gemm_kernelLauncher<rk   r   r   r[   z>
                    (z const* A, z const* B, z( const* biases, bool bias_is_broadcast, z* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);)r	   dtypers   rO   rP   stage)r   r   r   rO   r   r   r   r   instantiate_operation_sm80-  s6   


r   c                 C   s(   | j dkr	t| S | j dkrt| S d S )NP   r^   )rH   r   r   )r   r   r   r   instantiate_operation8  s
   

r   c           	      C   sp   |sJ t  }| D ]}|d| d q	d|}t  }|D ]	}|t| qd|}| d| d}|S )Nz
#include ""
zO
namespace tensorrt_llm
{
namespace kernels
{
namespace cutlass_kernels_oss
{

zX

} // namespace cutlass_kernels_oss
} // namespace kernels
} // namespace tensorrt_llm
)listappendru   r   )	launcher_inl_files
operationsinclude_listfileincludes
insts_listopinstantiationsfile_contentr   r   r   get_file_content?  s   

r   c                 C   sD   t | D ]\}}}|D ]}t j||}||vrt | qqdS )zARemove leftover generated files that weren't created in this run.N)oswalkpathru   remove)
output_dirgenerated_filesroot_dirsfilesr   	file_pathr   r   r   clean_leftover_files\  s   
r   c                 C   s   t jt j|dd t| |}z(t|dd}| |kr(	 W d    W d S W d    n1 s2w   Y  W n	 tyA   Y nw t|dd}|| W d    d S 1 sYw   Y  d S )NT)exist_okr)modew)	r   makedirsr   dirnamer   openreadFileNotFoundErrorwrite)r   r   output_filecontentfr   r   r   
write_filee  s    
"r   c                 C   sL  | j \}}}| j\}}}| jtjkr| jtjkrdS |dks&|dks&|dkr(dS | jdkr?| j	t
ko>| jt
ko>|dko>|dv S |dvrEdS | j	t
ksO| jt
krf|dvsW|dkrYdS | jd	krf| jtjkrfdS | j	tjkr|d
kst|dkr|dkr|dkr|dkp|d
 dkS |d dks|dk s|dkrdS |d dkr|d dkrdS dS )NFr[   r   g      )r      )@   r   )r   r   r   r_         r       r   r   T)rP   rS   rW   r    r"   rV   r   PtrArrayTmaWarpSpecialized1SmrH   rI   r;   rJ    PtrArrayNoSmemWarpSpecialized1Smr   rw   )r   tile_mtile_n_cga_mcga_ncga_kr   r   r   is_gemm_op_valid_sm100s  s@   

	
r   c                 C   s   | j \}}}| j\}}}|dkr|dkrdS |dkr$|dkr$|dkr$dS |dkr2|dkr2|dkr2dS |dkrD|dkrD|dkrD|dkrDdS dS )Nr   Tr[   r   F)rP   rS   )r   r   r   r   r   r   r   r   r   is_gemm_op_valid  s    r   c                 C   sP   t | sdS | jtjkrdS | jd ur| jtjkrdS | jtj	tj
fvr&dS dS )NFT)r   rO   r   r   rV   r   NoSmemWarpSpecializedrU   r   rx   ry   r   r   r   r   is_grouped_gemm_op_valid  s   
r   c                 C   s>   | j dkr	t| S | jtjkrt| S | jtjkrt| S d S Nr_   )rH   r   rG   r
   rt   r   rv   r   r   r   r   r   is_op_valid  s   
r   c               
   C   s  d} t jt jt jt jt jft jt jt jt jt jft jt jt jt jt jft jt jt jt jt jft jt jt jt jt jft jt jt jt jt jfg}tjtjtj	g}t
jg}ddg}g d}t||}g d}d}tddgddgdg}	t|||||	}
t }|
D ]O\}}}}}d	}|t|d  }||f }|d dk}|rtjntj}|rtjntj}ttj| g|||||||||R  }t|r|| q}|S )
Nr^   r   r   r   r   r   r   r   )r5   r   r   r   r   r[      )r   rw   u4f16r}   u8r*   r+   r,   r-   r   r   r   r   r8   r   rx   TmaWarpSpecializedPingpongr   TmaWarpSpecializedrE   r
   rt   r   r   )rH   supported_dtypes	quant_opsepi_tagsM_TILESN_TILEScta_shapes_mnrQ   rR   
cga_shapespartial_argsr   dtype_comborN   rO   cta_shape_mnrS   
max_k_bitscta_shape_kcta_shape_mnkuse_cooprU   rV   fpA_intB_operationr   r   r   #generate_sm90_mixed_gemm_operations  s|   



	

r   c                 C   sV  | sg S d}t jt jt jt jg}tjg}tjg}dg}g d}t	t
||dg }g d}d}	tjtjg}
ddg}t
d	d
gd	d
gd	g}t
||||
|||}t	 }|D ]U\}}}}}}}d}|t| }||f }|t jkrqtjntj}d }|g}|t jkrt jt jg}|D ]!}ttj|||||||||||	|||||d}t|r|| qqS|S )Nr^   r   r   )r   r   r   r   r   r   TFr   r[   r   )rX   )r   r   r}   f32rw   r*   r.   r   r   r   r   r    r!   r"   r8   r   rx   ry   rE   r
   rv   r   r   )is_arch_enabledrH   r   r   r   r   r   r   rQ   rR   epi_fusionsrX   r   r   r   r   rN   rO   rW   r   rS   r   r   r   rU   rV   otypesotypemoe_gemm_operationr   r   r   %generate_sm90_grouped_gemm_operations,  s   
	



r   c                 C   s  | sg S d}t jt jt jt jt jft jt jt jt jt jfg}tdr;t jt jt jt jt jft jt jt jt jt jfg}ng }tj	g}t
jg}ddg}g d}g d}tt|||}	ddg}g d}ddg}tt|||}
|
d	 g d
}d}ttddgddgdg}t||||	|}t||||
|}t||}t }|D ]R\}}}}}|d dk}|rtjtjgntjg}tj}|D ]3}|d dkr|d dkr|tjkrqttj|g|||||||||R  }|| qq|S )Nr^   z12.8r   r   )r   r   r   r   )r   r   i   )r   r   r   r   r   r   r   r   r   r   r[   )r   rw   r   r   r}   r   r;   ue8m0r*   r,   r   r   r   r   r   r   r   rx   r   r   rE   r
   rv   )r   rH   supported_dtypes_int4supported_dtypes_fp4r   r   r   r   K_TILEScta_shapes_mnk_int4cta_shapes_mnk_fp4rQ   rR   r   partial_args_int4partial_args_fp4r   r   r   rN   rO   r   rS   r   mainloop_schedulesrV   rU   r   r   r   r   0generate_sm90_mixed_type_grouped_gemm_operations  s   



	
r   c                 C   s&   t  }|t|  |t|  |S rF   )r   extendr   r   r   r   r   r   r   generate_sm90_operations  s   r  c                 C   s4   d}|t | }|tjkr| d dkrd}| |f S )Nr   r   r   r   )r8   r   rw   )r   r   r   r   r   r   r   !calc_shape_mnk_sm100_grouped_gemm  s
   
r  c                 C   s^  | sg S d}t tjt fg}tjg}tjg}g dg dg dg dg}g d}d}tjtj	g}g dg}	d	d
g}
t
||||||	|
}t }|D ]e\}}}}}}}
tj}d }t|tr_|\}}n||}}|tjkrt|t krt|g dkrtqG|g}|tjt fv rtjtjg}|D ]%}ttj||||||||||||||||tjko|t k|
d}|| qqG|S )Nx   r   )r   r   r   )r   r   r   )r   r   r   r   r   r   r   r   TF)r<   rX   )r;   r   rw   r*   r.   r   r   r    r!   r"   r   r   r   rx   r6   tupler   r}   rE   r
   rv   r   )r   rH   r   r   r   cta_shapes_mnkrQ   rR   r   r   rX   r   r   r   rN   rO   rW   r   rS   rU   rV   rI   rJ   r   r   r   r   r   r   &generate_sm120_grouped_gemm_operations  s   

	


r  c                 C   s   t | }|S rF   )r  r   r   r   r   generate_sm120_operationsC  s   r  c                 C   sl  | sg S t jt jt jt jtt jtfg}tjg}tj	g}ddg}g d}t
||}g d}d}	tjtjg}
tjtjg}ddg}dd	g}dd	g}t
||||
|||||	}t }|D ]\\	}}}}}}}}}t|trl|\}}n|}t||}tj}|g}|t jtfv rt jt jg}|D ]*}ttj|||||||||||	|||||t jko|tk||d
}t|r|| qqW|S )Nr   r   )r   r   r   r   r      r   r   r   r  )r[   r   r   TF)r<   rT   rX   )r   r   r}   r   rw   r;   r*   r.   r   r   r   r    r!   r"   r   r   r   r   r6   r  r  r   rx   rE   r
   rv   r   r   )r   rH   r   r   r   cta_shapes_mcta_shapes_nr   rQ   rR   r   epi_schedulesr   rX   rT   r   r   r   rN   rO   rW   r   rS   rV   rJ   r   rU   r   r   r   r   r   r   &generate_sm100_grouped_gemm_operationsH  s   




r  c                 C      t | d}|S )Nr   r  r   r   r   r   generate_sm103_operations     
r  c                 C   r  r   r  r   r   r   r   generate_sm100_operations  r  r  c                   @   s   e Zd Zdd ZdS )GemmSm80LauncherConfigc                 C   s(   || _ || _|| _|| _|| _|| _d S rF   )rG   rH   r   rO   rP   r   )rY   rG   rH   r   rO   rP   r   r   r   r   rZ     s   
zGemmSm80LauncherConfig.__init__N)r   r   r   rZ   r   r   r   r   r    s    r  c                  C   st   d} t jt jg}tjtjg}g d}g d}t||||}t }|D ]\}}}	}
tt	j
| |||	|
}|| q"|S )Nr   ))r   r   r   )r   r   r   )r   r   r   )r   r   r   )r   r   r   )r[   r   r5   )r   r   r}   r   r   r   r   r   r  r
   rv   r   )rH   r   r   r  rR   r   r   r   rO   r   r   itemr   r   r   +generate_sm80_fused_grouped_gemm_operations  s   r  c                 C   s
   t  }|S rF   )r  r   r   r   r   generate_sm80_operations  s   r  c                    sb  | d tj| } d}d}d}d}tjdf|gtjdf|gtjdf|gtjdf|gtjd	f|gtjd
f|gi} fdd}g }|t|d	pJ|d7 }|t|d7 }|t	|dp^|d7 }|t
|d7 }|t|d
pr|d7 }dd }	dd }
d}t }|D ]F}|	|rq|j|j|jd |jdko|jtkp|j|
|f}||g }t|dkst|d |kr||g n|d | |||< qg }| D ]V\}}|\}}}}}t|D ]F\}}tj| t| t|dt|  d| d| |rdnd |rdnd d| d}|r|gn||d d  }t||| || qqt| t| d S )N;zWtensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inlzTtensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inlz`tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inlzXtensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inlr^   r_   r   r  r   c                    s4   |   v p|  d v p|  d v p|  d v S )Nz-realzf-realr   r   )smarchesr   r   has_arch  s   
z*generate_gemm_operations.<locals>.has_archy   Y   c                 S   s   dS NFr   r   r   r   r   should_skip  s   z-generate_gemm_operations.<locals>.should_skipc                 S   s<   t | trdS | j| jko| jtjko| jtjkp| jt	kS r  )
r6   r  rI   rJ   rG   r
   rv   r   rw   r;   r   r   r   r   is_mixed_dtype_grouped  s   

z8generate_gemm_operations.<locals>.is_mixed_dtype_groupedr   r   rn   cutlass_kernel_file__sm_M_BSr'   _Mixed_groupz.generated.cur[   )r|   r   r   abspathr
   rt   rv   r  r  r  r  r  dictrG   rH   rP   rJ   r;   r<   r~   lenr   items	enumerateru   r   r   r   r   set)r   architecturesfpA_intB_inlmoe_gemm_inlmoe_mixed_gemm_inlsm80_moe_gemm_inlinl_mapr  r   r  r   
GROUP_SIZE	op_groupsr   dict_keyop_group	file_listkeyvaluerG   rH   mblock_scaleis_mixediop_sub_groupout_fileinl_filer   r  r   generate_gemm_operations  sj   
	

:rA  rF   )Nenumr   	itertoolsr   r   cutlass_libraryr   r   r   r   r	   r
   r   r   r   r   r   r   r   cpp_extr   Enumr   r    r   r   r   r   rc   rs   r!   r"   r{   rd   r*   r+   r,   r-   r.   rb   rr   r4   r;   r8   r>   rw   r}   r   r   r   r   rq   rE   rj   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r  r  rA  r   r   r   r   <module>   s    <	

RX	;JTZZh
