o
    U۷i]                     @   s&  d dl mZmZ ddlmZ ddlZddlmZm	Z	 ddl
mZ ddlZddlmZ ddlmZ ddlmZ dd	lmZmZmZ G d
d deZddlZeeZi Zdejejddddfdejdejdejdejdedej dej dee!e!f dee!e!f de!de	ej" deejejf fddZ#dS )   ))Sm100BlockScaledPersistentDenseGemmKernel1Sm100BlockScaledPersistentDenseGemmKernelNoDlpack    )driverN)TupleOptional)version)from_dlpack)_convert_to_cutlass_data_type)APIBaseis_power_of_2ceil_divc                       s   e Zd Zejdddfdejdejdejdejdejd	ejd
ejdeeef deeef def fddZ	de
fddZd!deej ddfddZ		d"dejdejdejdejdejdejdeej de
ddfdd Z  ZS )#GemmAmaxSm100   r   r   r       sample_asample_b
sample_sfa
sample_sfbsample_csample_amax	acc_dtypemma_tiler_mncluster_shape_mnsf_vec_sizec                    s   t    | jd | jd || _|| _|| _|| _|| _	| 
|dd| _|| _|| _|	| _|
| _d| _d| _d| _| jd|j d	|j d
|j d|j d|j d| jj d| d| d|	 d|
  d S )Nz$GemmAmaxSm100 is an experimental APIzEntering __init__   r   )r      r   Tz'__init__ completed with args: sample_a z, sample_b z, sample_sfa z, sample_sfb z, sample_c z, sample_amax z, acc_dtype z, mma_tiler_mn z, cluster_shape_mn z, sf_vec_size )super__init___loggerwarningdebugr   r   r   r   r   _pad_tensor_to_ndimr   r   r   r   r   atom_matom_k_interpret_uint8_as_fp4x2shape)selfr   r   r   r   r   r   r   r   r   r   	__class__ I/home/ubuntu/vllm_env/lib/python3.10/site-packages/cudnn/gemm_amax/api.pyr       s&   
LzGemmAmaxSm100.__init__returnc              	   C   s0  | j d | j d | j| jtjtjtjtjgdd}| j| j	|ddd |tjkr2| j 
d | | jd	vd
| j d | j| jtjtjtjgdd}| j| j|ddd |tjkrd| j 
d | |tjkoo| jdkd | |tjtjhv o| jdkd | j| jtjtjtjtjtjtjtjgdd}| | |o| | d|  | | |o| |d | j| jtjddd || _|| _| j d | j| jdd\}}}| j| j	dd\}}}| j| jdd\}}}| jj\}}}	}}
}| jj\}}}}}
}| | j|||fd | | j	|||fd | | j|||fd | | j| jd | jd |	| j|
|fd | | j| jd | jd || j|
|fd | | j d d! t!|| jd | jd  }t!|| jd | jd  }| |	|kd"| d#|	  | ||kd$| d#|  | j"| jd||| f|d|| fgdd%\}| _#| j"| j	d||| f|d|| fgdd%\}| _$| j"| jd||| f|d|| fgdd%\}| _%| j#d&krd'nd(| _&| j$d&krd)nd(| _'| j%d&krd'nd)| _(| | |o| j&d(ko
| j'd(k d*| j& d+| j'  | | |o"| j(d'kd,| j(  | j d- | | j)d d.vd/| j)d   | | j)d d.vd0| j)d   | | j)d d1kd2 | | | jop| j)d d1kop|d3kd4|  | | j*d | j)d d1krd5nd dk d6 | | j)d.ko| jdko|tjtjtjhv d7 | | j*d d8ko| j*d d8ko| j*d dko| j*d dkot+| j*d ot+| j*d  d9| j*  | j d: d;d< }| ||| j&d'k|||fo||| j'd)k|||fo||| j(d'k|||f d= | j d> | ,tj-.  d? tj-/ }tj-0|\}}|d@ | }| ,|dAk dB| dC|  | ,|dDkdE | | j}| | j}| | j}t12tj3}t12|j4t12dFk}|p||p||o|| }|r| j dG t5| _6nt7| _6dH| _8| j dI dHS )JNzEntering check_supportzChecking dtypes and sf_vec_sizeA)dtypenameBz A and B tensor dtypes must match)r0   r1   extra_error_msgzEUint8 ab_dtype will be interpreted as packed fp4, not as native uint8>      r   z"Unsupported sf_vec_size: received z, expected {16, 32}sfasfbz$sfa and sfb tensor dtypes must matchzGInt8 sf_dtype will be interpreted as float8_e8m0fnu, not as native int8r   zWUnsupported sf_dtype and sf_vec_size combination: float8_e4m3fn and 32 is not supportedr4   zfUnsupported ab_dtype and sf_vec_size combination: {float8_e5m2, float8_e4m3fn} and 16 is not supportedCzUUnsupported c_dtype and ab_dtype combination: fp4 c_dtype requires fp4 ab_dtype, got z\Unsupported c_dtype and ab_dtype combination: fp8 ab_dtype and fp8 c_dtype (fails to launch)AccumulatorzAccumulator must be float32zChecking tensor layoutr   )r1   r   r   r   r   r   r   r   amaxzHInput/Output shape mismatch: expected m_div_atom_m0_m1 (sfa.shape[2]) = z, got zHInput/Output shape mismatch: expected n_div_atom_m0_m1 (sfb.shape[2]) = )strider1   )r   r      mknzeUnsupported A or B tensor stride: Float4 tensors require k-major layout for hardware efficiency, got z and z`Unsupported C tensor stride: Float4 tensors require n-major layout for hardware efficiency, got z$Checking mma tiler and cluster shape)r      z6Unsupported mma_tiler_mn[0]: expected {128, 256}, got z6Unsupported mma_tiler_mn[1]: expected {128, 256}, got r@   z&mma_tiler_mn[0] == 256 currently hangsr   z8mma_tiler_mn (X, 256) requires k > 128 (packed x2), got r<   zIllegal cluster shapezomma_tiler_mn (128, 256), sf_vec_size 16, c_dtype {torch.float32, torch.float16, torch.bfloat16} fails to launchr   zJInvalid cluster shape: expected cluster_shape_mn values in {1, 2, 4}, got zChecking tensor alignmentc                 S   s.   |rdnd}|| }dt | j }|| dkS )Nr   r   r   )r
   width)r0   is_mode0_majortensor_shapemajor_mode_idxnum_major_elementsnum_contiguous_elementsr,   r,   r-   check_contigous_16B_alignment   s   zBGemmAmaxSm100.check_support.<locals>.check_contigous_16B_alignmentz9Unsupported tensor alignment: tensors must be 16B alignedzChecking environmentzCUDA is not available
   d   z9GemmAmax requires SM100+ compute capability, but found SMz on device g   z*cuteDSL GemmAmax is not supported on SM103z2.10.0z\Running no_dlpack kernel wrapper due to fp4 dtype or fp8 dtype on incompatible torch versionTz$check_support completed successfully)9r!   r#   _check_dtyper   torchfloat4_e2m1fn_x2uint8float8_e5m2float8_e4m3fnr   r"   _value_error_ifr   r   float8_e8m0fnuint8r   r   float32float16bfloat16	_is_fp4x2_not_implemented_error_if_is_fp8r   ab_dtypec_dtype_tensor_shaper(   _check_tensor_shaper%   r&   r   r   _check_tensor_stridea_stride_orderb_stride_orderc_stride_ordera_majorb_majorc_majorr   r   r   _runtime_error_ifcudais_availablecurrent_deviceget_device_capabilityr   parse__version__base_versionr   _kernelr   _is_supported)r)   rZ   sf_dtyper[   r=   r>   lr?   _m_div_atom_m0_m1sf_k_div_atom_kn_div_atom_m0_m1expected_m_div_atomexpected_n_div_atoma_strideb_stridec_striderG   devicemajorminorcompute_capability	is_ab_fp4is_c_fp4	is_ab_fp8torch_version_fp8_dlpack_supporteduse_no_dlpack_kernelr,   r,   r-   check_support:   s  

"
$&*
	
zGemmAmaxSm100.check_supportNcurrent_streamc                 C   s2  | j d | |}|   | j| j| j| jd}tj	
 }|| jd | jd  }| jtu re| j d tj|t| jddt| jddt| jddt| jddt| jddt| jdd||d	| _n| jtu r	| j d	 | | j}| | j}| j| j|rd
nddd\}}}	| j| j|rd
nddd\}
}}| j| j|rd
nddd\}}}| j| jddd\}}}| j| jddd\}}}tj|fi d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d t| jddd!|d"|| _ntd#| j | j d$ d S )%NzEntering compiler   r   r   r   r   zCompiling gemm_amaxr4   assumed_aligna_tensorb_tensor
sfa_tensor
sfb_tensorc_tensoramax_tensormax_active_clustersstreamzCompiling gemm_amax (no dlpack)r   r/   r   r1   r2   r7   r5   r6   a_ptra_shapea_orderb_ptrb_shapeb_ordersfa_ptr	sfa_shape	sfa_ordersfb_ptr	sfb_shape	sfb_orderc_ptrc_shapec_order	amax_cuter   r   !Unreachable: invalid kernel type zKernel compiled successfully)r!   r#   _get_default_stream_ensure_support_checkedrm   r   r   r   cutlassutilsHardwareInfoget_max_active_clustersr   cutecompiler	   r   r   r   r   r   r   _compiled_kernelr   rW   rZ   r[   _make_cute_tensor_descriptorNotImplementedError)r)   r   	gemm_amaxhardware_infor   r~   r   r   r   r_   r   r   r`   r   r   ra   r   r   sfa_stride_orderr   r   sfb_stride_orderr,   r,   r-   r     s   



   	
zGemmAmaxSm100.compileFr   r   r   r   r   r   skip_compilec	              
   C   s,  | j d | |}| |dd}| | j}	| | j}
|s| | jd u d | j d | j	t
u rY| jt|ddt|ddt|ddt|ddt|ddt|dd|d nM| j	tu r| j||	red	ndd}| j||	rpd	ndd}| j||
r{d	ndd}| j|dd}| j|dd}| j|||||t|dd|d
 ntd| j	 | j d n| j d | j	| j| j| jd}tj }|| jd | jd  }| j	t
u r|t|ddt|ddt|ddt|ddt|ddt|dd||d n| j	tu r| j||	rd	nddd\}}}| j||	rd	nddd\}}}| j||
r(d	nddd\}}}| j|ddd\}}}| j|ddd\}}}|d+i d|d|d|d|d|d|d|d|d |d!|d"|d#|d$|d%|d&|d't|ddd(|d)| ntd| j	 | j d* d S ),NzEntering executer   r   zYGemmAmaxSm100 kernel not compiled; call compile() first or use execute(skip_compile=True)zExecuting with compiled kernelr4   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   z*Executed with compiled kernel successfullyz'Executing without compiled kernel (JIT)r   r   r   r   r/   r   r2   r7   r5   r6   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   zExecuted successfullyr,   )r!   r#   r   r$   rW   rZ   r[   re   r   rm   r   r	   r   _make_cute_pointerr   r   r   r   r   r   r   r   r   )r)   r   r   r   r   r   r   r   r   r~   r   r   r   r   r   r   r   r   r   r   r_   r   r`   r   ra   r   r   r   r   r,   r,   r-   executeW  s   








	










   	
zGemmAmaxSm100.execute)N)NF)__name__
__module____qualname__rL   rT   Tensorr0   r   intr    boolr   r   rf   CUstreamr   r   __classcell__r,   r,   r*   r-   r      sd    	
	

& ^H	
r   r?   r   r   r   r   r   r   r   rd   r[   r   r   r   r   r   r.   c                 C   s  t d | j\}}}|j\}}}d }|dkr*tj|||fd||| f|| jd}n|dkrAtj|||f|d|| f|| jd}ntd| tjdtd | jtj	d	}| j|j|j|j| j
|j
|j
|j
|  | | | ||||||	f}|tv rt d
 t| }|j| ||||||
d ||fS t d t| |||||||||	d
}| sJ |j|
d |j| ||||||
d |t|< ||fS )NzAgemm_amax_wrapper_sm100: Creating empty output tensors c and amaxr=   r   )r0   rz   r?   z'c_major must be either 'm' or 'n', got r9   inf)rz   r0   zEgemm_amax_wrapper_sm100: Using previously cached GemmAmaxSm100 object)r   r   r   r   r   r   r   zkgemm_amax_wrapper_sm100: No previously cached GemmAmaxSm100 object found, creating new GemmAmaxSm100 object)
r   r   r   r   r   r   r   r   r   r   )r   )r!   r#   r(   rL   empty_stridedrz   
ValueErrorfullfloatrT   r0   r;   _cache_of_GemmAmaxSm100Objectsr   r   r   r   )r   r   r   r   rd   r[   r   r   r   r   r   r=   rq   rp   r?   r   r   	cache_keyr   r,   r,   r-   gemm_amax_wrapper_sm100  s   
&&
$
	r   )$&dense_blockscaled_gemm_persistent_amaxr   r   cuda.bindingsr   rf   rL   typingr   r   	packagingr   r   cutlass.cuter   cutlass.cute.runtimer	   cudnn.datatypesr
   cudnn.api_baser   r   r   r   logging	getLoggerr   r!   r   rT   r   strr0   r   r   r   r,   r,   r,   r-   <module>   sb       3


	
