o
    U۷ik                  %   @   s  d dl mZmZ d dlmZmZ ddlmZ ddl	Z	ddl
mZmZ ddlZddlmZ ddlmZmZ ddlmZ ddlm  mZ dd	lmZ dd
lmZmZmZ G dd deZddlZee Z!i Z"dde	j#e	j$e	j#dddddddddfde	j%de	j%de&de'de	j(de	j(de	j(dee)e)f deee)e)f  dee	j% dee	j% dee	j% de)d e*d!e)d"eej+ d#ee	j%d$f f"d%d&Z,dS )'   )PersistentDenseGemmKernel!PersistentDenseGemmKernelNoDlpack))Sm100BlockScaledPersistentDenseGemmKernel1Sm100BlockScaledPersistentDenseGemmKernelNoDlpack    )driverN)TupleOptional)from_dlpackmake_ptr)version)_convert_to_cutlass_data_type)APIBaseceil_divis_power_of_2c                !       sZ  e Zd Zdejddddddddddfdejdejd	ejd
ejdedejdee	e	f de
ee	e	f  de
ej de
ej de
ej de
ej de
ej de	dede	f  fddZdefddZd+de
ej ddfddZ								d,dejd ejd!ejd"ejd#e
ej d$e
ej d%e
ej d&e
ej d'e
ej dede
ej d(eddfd)d*Z  ZS )-GemmSwigluSm100      ?   r   N   F   sample_asample_bsample_ab12sample_calpha	acc_dtypemma_tiler_mncluster_shape_mn
sample_sfa
sample_sfbsample_amax
sample_sfcsample_norm_constsf_vec_size
vector_f32ab12_stagesc                    s  t    | jd | jd || _|| _|| _|| _|| _	|| _
|| _|d u r7| jd dks3dnd| _n|| _|	| _|
| _|| _| |dd| _| |dd	| _|| _|| _|| _| jd u r| jd u r| jd u r| jd u r| jd u r| jd
 t| _n	| jd t| _| jdg d|j d|j d|j d|j d| d| d| d| d|	d ur|	jnd  d|
d ur|
jnd  d|d ur|jnd  d|d ur|jnd  d|d ur|jnd  d| d| d|  d| _d S )Nz&GemmSwigluSm100 is an experimental APIzEntering __init__r      r   r   )   r)   r   amax
norm_constzDNo quantization arguments provided, using regular GEMM swiglu kernelzCQuantization arguments provided, using quantized GEMM swiglu kernel z'__init__ completed with args: sample_a z, sample_b z, sample_ab12 z, sample_c z, alpha z, acc_dtype z, mma_tiler_mn z, cluster_shape_mn z, sample_sfa z, sample_sfb z, sample_amax z, sample_sfc z, sample_norm_const z, sf_vec_size z, vector_f32 z, ab12_stages T)super__init___loggerwarningdebugr   r   r   r   r   r   r   r   r   r    r"   _unpad_tensor_to_ndimr!   r#   r$   r%   r&   r   _kernelr   joinshape_interpret_uint8_as_fp4x2)selfr   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   	__class__ K/home/ubuntu/vllm_env/lib/python3.10/site-packages/cudnn/gemm_swiglu/api.pyr.   5   s<   
2
zGemmSwigluSm100.__init__returnc              	      s	   j d  j d  j jdd\}}} j jdd\}}} j jdd\}}} j jdd\}}}  j|||fd   j|||fd	   j|||fd
   j||d |fd  jt	t
hv rtt| jd}  jddt|dd||fd   jddt|dd||fd   jdd tt|d  jd}  jddt|dd||fd   jdd  j jd||| f|d|| fgd\} _ j jd||| f|d|| fgd\} _ j jd||| f|d|| fgd\} _ j jd||| f|d|| fgd\} _  j jkd j d j   j d  jtthv r j jtjtjtjtjtj gdd _! j" tjkr|  j jtjtjtjtjtj gdd _# $ % j#d n,tjkr j jtjtjgdd _# j j!tjtjtj gdd n		 t&d  j"  j jtjtjgdd _'n+ jt	t
hv r  jd u p̈ jd u d!  j jtj(tj)tj tjgd"d _! j j"tjd#d _" j jtjtjtjtjtj gd$d _# j jtjtjtjtjtj gd%d _'  * j!o" % j'd&   % j'o8 jd u p8 jd u d'   * j!oO j'tjkoO jd u d(  $ j'tjko` j#tjkd)   jd*vd+ j   j jtj+tjgdd _, j j j,dd,d-  j j j,dd.d-  % j!r  j,tj+ko jdk d/ n * j!rĈ  j,tjko jdkd0  * j!r  jd1kp׈ jd1kd2   jd1kd3  j j j!d	d4d-  j d5   j-d6 d7vd8 j-d6    jtthv r#  j-d t.dd9dvd: j-d   nD jt	t
hv rg * j!rH  j-d t.d;d9d;vd< j-d   n % j!rg  % j'pd % j#pd j#tjkd=   j/d6  j-d6 d>krwdnd d6kd?   j/d6  j/d  d@ko j/d6 d6ko j/d d6kot0 j/d6 ot0 j/d  dA j/d6  dB j/d    jtthv r j-d6 d>k}	 |	 oԈ j/dCkdD  j/dCkr j-d6 dkr  j-dEkdF  j dG  fdHdI}
 |
 j! j|||fo|
 j! j|||fo|
 j# j|||f dJ  jt	t
hv rA | j-d6  d6kp>| j-d  d6kdK  j dL tj12 sQt3dMtj14 }tj15|\}}|dN | }|dOk rst3dP| dQ| |dRkr|t3dSdT _6 j dU dTS )VNzEntering check_supportz+Checking tensor shapes, strides, and dtypesr   namer   r   r   ABAB12r)   Cr       r   SFASFB)r   r*   SFCr+   r   )stridez0AB12 and C tensor stride orders must match, got z and zChecking data types)dtyper>   zAB12 (for float32 acc_dtype)zIab12_dtype {torch.float8_e5m2, torch.float8_e4m3fn} is currently disabledzAB12 (for float16 acc_dtype)zA/B (for float16 acc_dtype)zKUnsupported acc_dtype: expected one of {torch.float32, torch.float16}, got z=sfa and sfb must be provided for quantized GEMM swiglu kernelz$A (for quantized GEMM swiglu kernel)z.Accumulator (for quantized GEMM swiglu kernel)z'AB12 (for quantized GEMM swiglu kernel)z$C (for quantized GEMM swiglu kernel)z]Invalid dtype combination: fp4 ab_dtype is not compatible with fp8 c_dtype (recommended bf16)z7sfc and norm_const must be provided when c_dtype is fp8z>amax must be provided when ab_dtype is fp4 and c_dtype is bf16zKfloat32 c_dtype and float32 ab12_dtype currently disabled due to kernel bug>   r   rC   z\sf_vec_size must be 16 or 32 when ab_dtype is {torch.float8_e5m2, torch.float8_e4m3fn}, got z#SFB must have the same dtype as SFA)rH   r>   extra_error_msgz#SFC must have the same dtype as SFAzwInvalid ab_dtype and sf_dtype/sf_vec_size combination: fp8 ab_dtype requires float8_e8m0fnu sf_dtype and 32 sf_vec_sizezInvalid ab_dtype and sf_dtype/sf_vec_size combination: fp4 ab_dtype not supported with float8_e4m3fn sf_dtype and 32 sf_vec_size)r   r   r)   z?Invalid A or B tensor stride: fp4 dtype requires k-major layoutz=Invalid AB12 tensor stride: fp4 dtype requires n-major layoutz A and B must have the same dtypez)Checking MMA tile shape and cluster shaper   )r   r'   zDInvalid MMA tile shape: expected mma_tiler_mn[0] in {128, 256}, got i  zQInvalid MMA tile shape: expected mma_tiler_mn[1] in {32, 64, ..., 224, 256}, got @   zMInvalid MMA tile shape: expected mma_tiler_mn[1] in {64, 128, 192, 256}, got zFor MXFP8 inputs for blockscaled quantized GEMM swiglu kernel, ab12_dtype and c_dtype cannot be FP8. ab12_dtype also cannot be float32r'   z[Invalid cluster shape: cluster_shape_mn[0] must be divisible by 2 if mma_tiler_mn[0] == 256r   zrInvalid cluster shape: expected values to be powers of 2 and cluster_shape_mn[0] * cluster_shape_mn[1] <= 16, got ,r(   zNInvalid cluster shape: cluster_shape must be (1, 1) when use_2cta_instrs=Falser   ziInvalid MMA tile shape: for non-1x1 cluster shape and 128xmma tile shape, mma_tiler_mn must be (128, 128)zChecking tensor alignmentc                    s<   |dk}|rdnd}|| }dt |  jdj }|| dkS )N)r   r   r)   r   r   r   )interpret_uint8_as_fp4x2)r   r6   width)rH   stride_ordertensor_shapeis_mode0_majormajor_mode_idxnum_major_elementsnum_contiguous_elementsr7   r:   r;   check_contigous_16B_alignmentu  s
   zDGemmSwigluSm100.check_support.<locals>.check_contigous_16B_alignmentz5Invalid tensor alignment: tensors must be 16B alignedzAInvalid tensor alignment: m and n must be aligned to mma_tiler_mnzChecking environmentzCUDA is not available
   d   z;GemmSwiglu requires SM100+ compute capability, but found SMz on device g   z,cuteDSL GemmSwiglu is not supported on SM103Tz$check_support completed successfully)7r/   r1   _tensor_shaper   r   r   r   _check_tensor_shaper3   r   r   r   r$   r   r    r!   r"   r#   _check_tensor_stridea_stride_orderb_stride_orderab12_stride_orderc_stride_order_value_error_ifr   r   _check_dtypetorchfloat16bfloat16float32float8_e4m3fnfloat8_e5m2ab_dtyper   
ab12_dtype_not_implemented_error_if_is_fp8
ValueErrorc_dtypefloat4_e2m1fn_x2uint8	_is_fp4x2float8_e8m0fnusf_dtyper   ranger   r   cudais_availableRuntimeErrorcurrent_deviceget_device_capability_is_supported)r7   mklnn_2rest_krest_n2_use_2cta_instrsrU   devicemajorminorcompute_capabilityr:   rT   r;   check_supportr   s  """....



 
"$

&$	$


zGemmSwigluSm100.check_supportcurrent_streamc           '      C   s  | j d | |}|   ttj}| | j	}| | j
}| | j	}| | j
}t|jtdk}|pB|pB|s@|oB| }|re| j d | jtu rTt| _n| jtu r]t| _ntd| j d }	| jttfv r| jt| j| jd dk| j| jd}	n| jttfv r| j| j| j| j| j| jd}	ntd| j tj }
|
| jd | jd	  }| jtu r| j d
 tj|	t | j!t | j"t | j#t | j$| j%||d| _&ni| jtu r@| j d tj|	t | j!ddt | j"ddt | j'ddt | j(ddt | j$ddt | j#dd| j)d urt | j)ddnd | j*d ur)t | j*ddnd | j+d ur5t | j+nd | j%||d| _&n| jttfv r<| j,| j!dd\}}}| j,| j"dd\}}}| j,| j#dd\}}}| jtu rtj|	|||||||||t | j$| j%||d| _&n| jtu r4| j,| j$dd\}}}| j,| j'dd\}}}| j,| j(dd\}}}| j,| j)dd\}}} | j,| j*dd\}!}"}#| j,| j+dd\}$}%}&tj|	fi d|d|d|d|d |d!|d"|d#|d$|d%|d&|d'|d(|d)|d*|d+|d,|d-|d.|d/|d0| d1|!d2|"d3|#d4|$d5|%d6|&d7| j%d8|d9|| _&ntd| j td| j | j d: d S );NzEntering compilez2.10.0z\Running no_dlpack kernel wrapper due to fp4 dtype or fp8 dtype on incompatible torch version!Unreachable: invalid kernel type r   r'   r   r   r   r   r$   r   r   r%   r&   r   zCompiling gemm_swiglu (dlpack)abab12cr   max_active_clustersstreamz4Compiling gemm_swiglu_blockscaled_quantized (dlpack)r   assumed_align   a_tensorb_tensor
sfa_tensor
sfb_tensorc_tensorab12_tensoramax_tensor
sfc_tensornorm_const_tensorr   r   r   r?   r=   r@   rA   a_ptra_shapea_orderb_ptrb_shapeb_orderab12_ptr
ab12_shape
ab12_orderc_cuter   r   r   rB   rD   rE   AMAXrF   
NORM_CONSTr   r   r   r   r   r   sfa_ptr	sfa_shape	sfa_ordersfb_ptr	sfb_shape	sfb_orderc_ptrc_shapec_orderr   r   r   amax_ptr
amax_shape
amax_ordersfc_ptr	sfc_shape	sfc_ordernorm_const_ptrnorm_const_shapenorm_const_orderr   r   r   zKernel compiled successfully)-r/   r1   _get_default_stream_ensure_support_checkedr   parserb   __version__rp   rh   ri   rk   base_versionr3   r   r   r   r   NotImplementedErrorr   r   r   r   r$   r%   r&   cutlassutilsHardwareInfoget_max_active_clusterscutecompiler
   r   r   r   r   r   _compiled_kernelr   r    r!   r"   r#   _make_cute_tensor_descriptor)'r7   r   torch_version	is_ab_fp4is_ab12_fp4	is_ab_fp8is_ab12_fp8_fp8_dlpack_supporteduse_no_dlpack_kernelgemm_swigluhardware_infor   r   r   r\   r   r   r]   r   r   r^   r   r   r_   r   r   sfa_stride_orderr   r   sfb_stride_orderr   r   amax_stride_orderr   r   sfc_stride_orderr   r   norm_const_stride_orderr:   r:   r;   r     sP  






	
"zGemmSwigluSm100.compiler   r   r   r   r   r   r   r   r   skip_compilec           )      C   sr  | j d | |}|s'| | jd u d | j d | jtu r6| jt|t|t|t||
|d n| jtu r| 	|dd}| 	|	dd}	| jt|dd	t|dd	t|dd	t|dd	t|dd	t|d
d	|d urst|dd	nd |d ur~t|dd	nd |	d urt|	nd |
|d n| jt
tfv r| j|dd	}| j|dd	}| j|dd	}| jt
u r| j|||t||
|d na| jtu r| 	|dd}| 	|	dd}	| j|dd	}| j|dd	}| j|dd	}| j|dd	}| j|dd	}| |	}| j||||||||||
|d ntdt| j tdt| j | j d d S | j d | jtu ri| jt| j| jd dk| j| jd}|t|t|t|t||
tj | jd | jd  |d n| jt
u r| jt| j| jd dk| j| jd}| j|dd\}}}| j|dd\}}}| j|dd\}}}||||||||||t||
tj | jd | jd  |d nn| jtu r<| j| j| j| j| j| jd}| 	|dd}| 	|	dd}	|t|dd	t|dd	t|dd	t|dd	t|dd	t|d
d	|d urt|dd	nd |d urt|dd	nd |	d ur&t|	nd |
tj | jd | jd  |d n| jtu r'| j| j| j| j| j| jd}| 	|dd}| 	|	dd}	| j|dd\}}}| j|dd\}}}| j|dd\}}}| j|dd\}}}| j|dd\}}} | j|dd\}}!}"| j|dd\}}#}$| j|d d\}}%}&| j|	d!d\}}'}(|dAi d"|d#|d$|d%|d&|d'|d(|d)|d*| d+|d,|!d-|"d.|d/|d0|d1|d2|d3|d4|d5|#d6|$d7|d8|%d9|&d:|d;|'d<|(d=|
d>tj | jd | jd  d?| n
tdt| j | j d@ d S )BNzEntering executez[GemmSwigluSm100 kernel not compiled; call compile() first or use execute(skip_compile=True)zExecuting with compiled kernel)r   r   r   r   r   r   r   r*   r+   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   z*Executed with compiled kernel successfullyz'Executing without compiled kernel (JIT)r   r'   r   r   r?   r=   r@   rA   r   r   r   rB   rD   rE   r   rF   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   z3Executed without compiled kernel (JIT) successfullyr:   )r/   r1   r   _runtime_error_ifr   r3   r   r
   r   r2   r   r   _make_cute_pointerr   typer   r   r   r   r   r   r   r   r   r$   r%   r&   ))r7   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r\   r   r]   r   r^   r   r_   r   r   r   r   r   r   r   r   r   r   r:   r:   r;   execute6  s  











	






	
 !zGemmSwigluSm100.execute)N)NNNNNr   NF)__name__
__module____qualname__rb   re   TensorfloatrH   r   intr	   boolr.   r   rt   CUstreamr   r   __classcell__r:   r:   r8   r;   r   4   s    
	=  -  	
r   r   r}   r   r   Fr   r   r   r   c_majorri   rm   r   r   r   r   r   r   r$   r%   r&   r   r<   .c                 C   s  t d | j\}}}|j\}}}d\}}|dkrBtj|||fd||| f|| jd}tj||d |fd||| d f|| jd}n6|dkrqtj|||f|d|| f|| jd}tj||d |f|d d|| d f|| jd}ntd| d\}}|	d ur|
d urt d	 |tjtjhv rt d
 t	|d |}|t	|dt	|ddddf}d}tj
|tj| jd|}| jtjtjhv r|tjkrt d tjdtd | jtjd}| j|j| j|j|  | ||||||||	d ur|	jnd |
d ur|
jnd |	d ur	|	 nd |
d ur|
 nd |	d ur|	jnd |
d ur%|
jnd |d ur.|jnd |d ur8| nd |d urA|jnd |||f}|tv rft d t| }|j| ||||	|
|||||d n]t d td(i d| d|d|d|d|d|d|d|d|	d|
d |d!|d"|d#|d$|d%|}| sJ d&|j|d' |j| ||||	|
|||||d |t|< |	d ur|
d ur||||fS ||fS ))NzCgemm_swiglu_wrapper_sm100: Creating empty output tensors ab12 and c)NNrz   r   )rH   r   r)   r}   z'c_major must be either 'm' or 'n', got zdgemm_swiglu_wrapper_sm100: Detected sfa_tensor and sfb_tensor, constructing quantized output tensorszHgemm_swiglu_wrapper_sm100: Detected fp8 c_dtype, constructing sfc_tensorr   r   rC   )   r   r      r)   r   z[gemm_swiglu_wrapper_sm100: Detected fp4 ab_dtype and bf16 c_dtype, constructing amax_tensor)r   r   r   inf)r   rH   zIgemm_swiglu_wrapper_sm100: Using previously cached GemmSwigluSm100 object)r   r   r   r   r   r   r   r   r   r   r   zqgemm_swiglu_wrapper_sm100: No previously cached GemmSwigluSm100 object found, creating new GemmSwigluSm100 objectr   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   zUnsupported testcase)r   r:   )r/   r1   r5   rb   empty_stridedr   rl   rg   rf   r   emptyrq   permuterH   rn   ro   rd   fullr   re   rG    _cache_of_GemmSwigluSm100Objectsr   r   r   r   )r   r   r   r   ri   rm   r   r   r   r   r   r   r$   r%   r&   r   rz   r{   r|   r}   r   r   r   r   sf_k	mma_shapemma_permute_order	cache_keyr   r:   r:   r;   gemm_swiglu_wrapper_sm100  s  
$.$





	
r   )-dense_gemm_persistent_swiglur   r   :dense_blockscaled_gemm_persistent_swiglu_interleaved_quantr   r   cuda.bindingsr   rt   rb   typingr   r	   r   cutlass.cuter   cutlass.cute.runtimer
   r   	packagingr   cutlass.cute.mathmathcudnn.datatypesr   cudnn.api_baser   r   r   r   logging	getLoggerr   r/   r   re   rc   r   r   strrH   r   r   r   r   r:   r:   r:   r;   <module>   s        e

	