o
    ԰i	d                  (   @   sp  d dl mZmZ d dlZd dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ dd	lmZ d
dlmZ d
dlmZ G dd deZd dlZeeZi Zddddej ej ddddddddfdej!dej!dej!deej! deej! de"deej# dej#dej#dee$e$f de"de%de%de%d e%d!ee% d"eej& d#eej!eej! f f$d$d%Z'dS )&    )TupleOptionalN)driver)from_dlpack)Int32)APIBase)_convert_to_cutlass_data_type   )make_tensor_strided_like   )'BlackwellFusedMultiHeadAttentionForward)fmha_helpersc                "       s^  e Zd Zdddejejdddddddfdejdejdejdejd	eej d
eej deej dejdejdee	e	f de
dededededee f  fddZde
fddZd'deej ddfddZ										d(dejdejdejd ejd!eej d"eej d#eej deej d$e
dee dee dee dee dee ddfd%d&Z  ZS ))CompressionAttentionN   r   F      ?sample_qsample_ksample_vsample_o
sample_lsesample_cum_seqlen_qsample_cum_seqlen_kqk_acc_dtypepv_acc_dtypemma_tiler_mnis_persistentscale_qscale_kscale_vinv_scale_oscale_softmaxc                    s  t    t| _| jd | jd || _|| _|| _	|| _
|| _|d u| _|| _|| _|| _|	| _|
| _|| _|| _|| _|| _|| _|| _d | _d | _d | _d | _d | _d | _d | _d | _d | _ | jd!g d|j" d|j" d|j" d|j" d|d ur|j"nd	 d
|d ur|j"nd	 d|d ur|j"nd	 d| d|	 d|
 d| d| d| d| d| d|  d S )Nz+CompressionAttention is an experimental APIzEntering __init__ z'__init__ completed with args: sample_q z, sample_k z, sample_v z, sample_o z, sample_lse Nonez, sample_cum_seqlen_q z, sample_cum_seqlen_k z, qk_acc_dtype z, pv_acc_dtype z, mma_tiler_mn z, is_persistent z
, scale_q z
, scale_k z
, scale_v z, inv_scale_o z, scale_softmax )#super__init__r   _kernel_loggerwarningdebugr   r   r   r   r   
enable_lser   r   qk_acc_dtype_torchpv_acc_dtype_torchr   r   r   r   r   r    r!   
batch_sizes_qs_kh_qh_kh_rhead_dimproblem_size_compiled_kerneljoinshape)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/cudnn/native_sparse_attention/compression/api.pyr%      sB   

zCompressionAttention.__init__returnc                 C   sl  | j d | j d | jjdkrd| _| jj\}}}}| jj\}}}}| jj\}}}}| jj\}}}}| jj||||fkrOt	d||||f d| jj | jj||||fkrit	d||||f d| jj | jj||||fkrt	d||||f d| jj | jj||||fkrt	d	||||f d| jj | j
r| | jd
d| _| jj|||fkrt	d|||f d| jj | j st	d| jd us| jd ur| j d || _|| _|| _|| _|| _|| | _|| _nZ| jjd
krDd| _| jj\}	}}| jj\}
}}| jj\}
}}| jj\}	}}| jj|	||fkr2t	d|	||f d| jj | jj|
||fkrKt	d|
||f d| jj | jj|
||fkrdt	d|
||f d| jj | jj|	||fkr}t	d	|	||f d| jj | j
r| | jdd| _| jj|	|fkrt	d|	|f d| jj | jd u s| jd u rt	d| j d| j | | jdd| _| | jdd| _| jjdks| jjdkrt	d| jj d| jj d| jjtjtjhvs| jjtjtjhvrt	d| jj d| jj t| jt| jkr't	dt| j dt| j t| jd | _d | _d | _|| _|| _|| | _|| _n	t	d| jj ||krVt	d|dvr_t	d|| dkrjt	d | j d! | jj}| jj}| jj|ks| jj|krt	d"| jj d#| jj d$| |tjtjtjhvrt	d%| |tjtjtjhvrt	d&| | j tj!hvrt	d'| j  | j"tj!hvrt	d(| j" | j#d u r| j d) d*t$%| j | _#| j d+ tj&' s t(d,tj&) }tj&*|\}}|d- | }|d.k r"t(d/| d0| |d1kr+t(d2d3| _+| j d4 d3S )5NzEntering check_supportz+Checking shape normalization and validation   B,H,S,Dz.Input shape mismatch: expected Q tensor shape z, got z.Input shape mismatch: expected K tensor shape z.Input shape mismatch: expected V tensor shape z/Output shape mismatch: expected O tensor shape    r   z1Output shape mismatch: expected LSE tensor shape zLSE tensor must be contiguouszJsample_cum_seqlen_q and sample_cum_seqlen_k are ignored for B,H,S,D layoutT,H,Dr	   zSsample_cum_seqlen_q and sample_cum_seqlen_k must be provided for T,H,D layout, got  and r   r   r   zDsample_cum_seqlen_q and sample_cum_seqlen_k must be 1D tensors, got zD and DzHsample_cum_seqlen_q and sample_cum_seqlen_k must be int32 or int64, got zKsample_cum_seqlen_q and sample_cum_seqlen_k must have the same length, got zOInvalid input layout: sample_q must be rank-3 (T,H,D) or rank-4 (B,H,S,D), got zD_qk must match D_v>       @   r   z*Head dimension D_qk must be 32, 64, or 128r   z1H_q must be divisible by H_k (GQA/MQA constraint)zChecking dtypesz'Inputs must have the same dtype, got K z, V z for Q z7Inputs must be Float16, BFloat16, or Float8E4M3FN, got z8Outputs must be Float16, BFloat16, or Float8E4M3FN, got z"qk_acc_dtype must be Float32, got z"pv_acc_dtype must be Float32, got z2No scale_softmax provided, using default 1/sqrt(d)r   zChecking environmentzCUDA is not available
   d   zECompressionAttention requires SM100+ compute capability, but found SMz on device g   z!cuteDSL is not supported on SM103Tz$check_support completed successfully),r'   r)   r   ndiminput_layoutr7   r   r   r   
ValueErrorr*   _unpad_tensor_to_ndimr   is_contiguousr   r   r(   r-   r.   s_kvr0   h_kvr2   r3   dtypetorchint32int64lenfloat16bfloat16float8_e4m3fnr+   float32r,   r!   mathsqrtcudais_availableRuntimeErrorcurrent_deviceget_device_capability_is_supported)r8   bh_qos_qod_qkrO   rN   d_vr0   tt_kvin_dtype	out_dtypedevicemajorminorcompute_capabilityr;   r;   r<   check_supportS   s       





"


z"CompressionAttention.check_supportcurrent_streamc           	      C   s>  | j d | |}|   | jt| jt| jg | j| j	R | j
tjjd}ttd}| j| j | j }|| }| j| j }| jdkrL| jnt| j }| jdkr[| jnt| j }| j|||| j| j| j	f| _ | j d t!j"|fi dt#| j$ddj%d	| jdkr| j$&d
d' n| j$' d g| j$' R dt#| j(ddj%d| jdkr| j(&d
d' n| j(' d g| j(' R dt#| j)ddj%d| jdkr| j)&d
d' n| j)' d g| j)' R dt#| j*ddj%d| jdkr| j*&d
d' n| j*' d g| j*' R d| j d| jdkr&t#| jddnd d| jdkr6t#| jddnd d| j+rEt#| j,ddj%nd d| jdkrW| j,&d
d' n(dg| j,' R d|d|d|dd dt-dd|| _.| j d d S d|d|d|dd dt-dd|| _.| j d d S ) NzEntering compile	mask_typer   r?   z7Compiling CompressionAttention kernel with cute.compileq_iter   assumed_alignq_strider   r	   r   k_iterk_stridev_iterv_strideo_itero_strider4   cum_seqlen_qrA   cum_seqlen_klse_iter
lse_stridescale_softmax_log2r!   scale_outputwindow_size_leftwindow_size_rightstreamzKernel compiled successfully)/r'   r)   _get_default_stream_ensure_support_checkedr&   r   r+   r,   r   r3   r   
fmha_utilsMaskTypeCOMPRESSED_CAUSAL_MASKrY   log2expr   r   r!   r   r    rJ   r.   maxr   itemrN   r   r-   r0   rO   r4   cutecompiler   r   iterator	transposestrider   r   r   r*   r   r   r5   )	r8   ro   fmha_kernellog2_er!   r   r   r.   rN   r;   r;   r<   r      s   
	
6668	
0zCompressionAttention.compileq_tensork_tensorv_tensoro_tensor
lse_tensorcum_seqlen_q_tensorcum_seqlen_k_tensorskip_compilec                 C   s  | j d | |}| jr |d u rtd| ||jd d}| jdkrE|d u s-|d u r7td| d| | |dd}| |dd	}|
d u rL| jn|
}
|d u rU| j	n|}|d u r^| j
n|}|d u rg| jn|}|d u rp| jn|}ttj}|
| | }|| }|| }|	s| jd u rtd
| j d | jt| jdkr|ddn|ddjt| jdkr|ddn|ddjt| jdkr|ddn|ddjt| jdkr|ddn|ddj| j| jdkrt|ddjnd | jdkrt|ddjnd | jrt|ddjnd |||d td|d | j d d S | j d | jt| jt| jg | j| jR | jtjjd}|d)i dt| jdkrJ|ddn|ddjd| jdkr_|dd  n|  d g|  R dt| jdkry|ddn|ddjd| jdkr|dd  n|  d g|  R dt| jdkr|ddn|ddjd| jdkr|dd  n|  d g|  R dt| jdkr|ddn|ddjd| jdkr|dd  n|  d g|  R d| jd| jdkr	t|ddnd d| jdkrt|ddnd d | jr&t|ddjnd d!| jdkr7|dd  ndg|  R d"|d#|d$|d%d d&tdd'| | j d( d S )*NzEntering executez\kernel was compiled with lse_tensor provided, but lse_tensor was not provided during executer   r   rA   zScum_seqlen_q_tensor and cum_seqlen_k_tensor must be provided for T,H,D layout, got rB   r   r   z(CompressionAttention kernel not compiledzExecuting with compiled kernelr?   r	   rs   rt   r   )rr   rw   ry   r{   r4   r}   r~   r   r   r!   r   r   r   r   z*Executed with compiled kernel successfullyz'Executing without compiled kernel (JIT)rp   rr   rv   rw   rx   ry   rz   r{   r|   r4   r}   r~   r   r   r   r!   r   r   r   r   zExecuted successfullyr;   )!r'   r)   r   r*   rK   rL   rI   rJ   r   r   r   r    r!   rY   r   er5   r   r   r   r4   r   r&   r   r+   r,   r   r3   r   r   r   r   r   )r8   r   r   r   r   r   r   r   ro   r   r   r   r   r    r!   r   scale_softmax_valscale_softmax_log2_valscale_output_valr   r;   r;   r<   execute  s  


22
22*!zCompressionAttention.execute)N)
NNNNFNNNNN)__name__
__module____qualname__rQ   rX   Tensorr   rP   r   intboolfloatr%   rn   r[   CUstreamr   r   __classcell__r;   r;   r9   r<   r      s    	

> >	
r   Fr   r   r   r   r   r   r   r*   o_dtyper   r   r   r   r   r   r   r    r!   r   r=   c                 C   s  t d d\}}|dur|n| j}| jdkrC| j\}}}}|j\}}}}t| ||||f|| jd}|rBtj|||tj	| jd
 }n;| jdkrv| j\}}}|j\}}}t| |||f|| jd}|rutjd||tj	| jd
 ddd	}ntd
| j | j|j|j|dur|jnd|dur|jnd| j|j|j|dur|jnd|dur|jnd|  | | |dur| nd|dur| nd|||||	|
|||||f}|tv rt d t| }|j| |||||||d ||fS t d tdi d| d|d|d|d|d|d|d|d|d|	d|
d|d|d|d|d|}| s1J |  |j| |||||||d |t|< ||fS )z
    Compression Attention Wrapper that returns output (and optionally LSE) tensors directly.

    Returns:
        tuple: (o_tensor, lse_tensor | None)
    zNcompression_attention_wrapper: Creating empty output tensor o and optional lse)NNNr>   )rP   rj   r@   r   r	   r   zOInvalid input layout: q_tensor must be rank-4 (B,H,S,D) or rank-3 (T,H,D), got zRcompression_attention_wrapper: Using previously cached CompressionAttention object)r   r   r   r   r   r   r   ro   z_compression_attention_wrapper: No cached object found, creating new CompressionAttention objectr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r;   )r'   r)   rP   rI   r7   r
   rj   rQ   emptyrX   
contiguouspermuterK   r   %_cache_of_CompressionAttentionObjectsr   r   rn   r   )r   r   r   r   r   r*   r   r   r   r   r   r   r   r   r    r!   r   r   r   ra   r0   r.   d_r1   r/   re   rf   	cache_key	comp_attnr;   r;   r<   compression_attention_wrapper  s   


&
,
	

r   )(typingr   r   rY   cuda.bindingsr   r[   rQ   cutlasscutlass.cuter   cutlass.cute.runtimer   cutlass.cute.typingr   cudnn.api_baser   cudnn.datatypesr   utilsr
   fmhar   r"   r   r   r   logging	getLoggerr   r'   r   rX   r   r   rP   r   r   r   r   r;   r;   r;   r<   <module>   s      w
	

