o
    ԰iV                  #   @   s:  d dl mZ ddlmZ ddlZddlmZmZ ddl	Z	ddl
Z
ddlmZ ddlmZ ddlmZ ddlmZ G d	d
 d
eZddlZeeZi Zddddejdddddddfdejdejdejdeej deej dee dee dejdededededeeef dee deej deejejf f d d!Z dS )"   )FineGrainedReductionQK    )driverN)TupleOptional)from_dlpack)_convert_to_cutlass_data_type)APIBasec                "       s(  e Zd ZdZddddejddddddfdejd	ejd
ejdejdejdeej deej dee dee dej	dededede
deeef dee f  fddZde
fddZd+deej ddfddZ			 	d,d!ejd"ejd#ejd$ejd%ejd&eej d'eej d(e
deej ddfd)d*Z  ZS )-TopKReductiona  
    Top-K Reduction for Native Sparse Attention.

    This class performs top-k reduction on attention scores to identify the most important
    key-value pairs for each query position.

    Note:
        The returned values calculated by the kernel exclude the first block and neighboring blocks from the reduction.
        As a result, it is expected to see rows of all -inf values and -1 values in the final topk_scores and topk_indices output tensors, respectively.
    N   @       T   r   sample_qsample_k
sample_lsesample_topk_scoressample_topk_indicessample_cum_seqlen_qsample_cum_seqlen_kmax_s_qmax_s_k	acc_dtypek_valueselection_block_sizecompress_stride	is_causalmma_tiler_mnscale_softmaxc                    s   t    t| _| jd | jd || _|| _|| _	|| _
|| _|| _|| _|| _|	| _|
| _|| _|| _|| _|| _|| _|| _d S )Nz$TopKReduction is an experimental APIzEntering __init__)super__init__r   _kernel_loggerwarningdebugr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/cudnn/native_sparse_attention/top_k/api.pyr!      s(   

zTopKReduction.__init__returnc                 C   s0  | j d | j d | jd u r| jd u rd| _nH| jd urW| jd urWd| _| jjdkrA| j d | jd	dd	| _| j
jdkrX| j d
 | j
d	dd	| _
| jjd	krp| j d | jd	dd	| _n| jjdkr| j d | | jd	dd	dd	| _| jjdkr| j d | jd	dd	| _| jjdkr| j d | jd	dd	| _| jjdkr| j d t| jjd D ]	}| jd| _q| jjdkrtd| jj d| jjdkr| j d t| jjd D ]	}| jd| _q| jjdkrtd| jj d| jd u r7| j d | jdd  | jd d    | _| jd u rV| j d | jdd  | jd d    | _ntd| j d| j | jj\}}}}| j
j\}}}}| jj||||fkrtd||||f d| jj | j
j||||fkrtd||||f d| j
j | jj|||dfkr| j d | jd| _| jj|||fkrtd|||f d| jj | jddkr| j d | j | _| jj|||| jfkrtd |||| jf d| jj | jj|||| jfkr)td!|||| jf d| jj | jdkr1|nt| jd | _|||| _| _| _| jdkrQ||| _| _| j d" | jj | j
j krntd#| jj  d| j
j  | jj | _ | jj | j!krtd$| jj  d| j! | jj | j!krtd%| jj  d| j! | jj t"j#krtd&| jj  | jdkr| jj t"j#ks| jj t"j#krtd'| jj  d| jj  | j d( t"j$% st&d)t"j$' }t"j$(|\}	}
|	d* |
 }|d+k rt&d,| d-| |d.krt&d/d0| _)| j d1 d0S )2NzEntering check_supportz+Checking shape normalization and validationzB,H,S,DT,H,D   (reshaping q_tensor from T,H,D to 1,H,T,Dr   r      (reshaping k_tensor from T,H,D to 1,H,T,Dz&reshaping lse_tensor from T,H to 1,T,H(reshaping lse_tensor from T,H,1 to 1,H,Tr   2reshaping topk_scores_tensor from T,H,D to 1,H,T,D3reshaping topk_indices_tensor from T,H,D to 1,H,T,DzGcum_seqlen_q must be 1D tensor. Attempting to squeeze last dimension(s)z$cum_seqlen_q must be 1D tensor, got DzGcum_seqlen_k must be 1D tensor. Attempting to squeeze last dimension(s)z$cum_seqlen_k must be 1D tensor, got z1max_s_q not provided, inferring from cum_seqlen_qz1max_s_k not provided, inferring from cum_seqlen_kzAcum_seqlen_q and cum_seqlen_k must be None or both not None, got  and z.Input shape mismatch: expected Q tensor shape z, got z.Input shape mismatch: expected K tensor shape z;reshaping lse_tensor from (b, h_q, s_q, 1) to (b, h_q, s_q)z0Input shape mismatch: expected LSE tensor shape xlse_tensor is expected to have leading stride in last dimension of shape (b, h_q, s_q), copying lse_tensor to contiguousz8Input shape mismatch: expected TopK Scores tensor shape z9Input shape mismatch: expected TopK Indices tensor shape zChecking dtypesz&Q and K must have the same dtype, got z2LSE and Accumulator must have the same dtype, got z:TopK Scores and Accumulator must have the same dtype, got z TopK Indices must be int32, got z9cum_seqlen_q and cum_seqlen_k tensors must be int32, got zChecking environmentzCUDA is not available
   d   z>TopKReduction requires SM100+ compute capability, but found SMz on device g   z/cuteDSL TopKReduction is not supported on SM103Tz$check_support completed successfully)*r#   r%   r   r   input_layoutr   ndiminfo	unsqueeze	transposer   r   _unpad_tensor_to_ndimr   r   rangesqueeze
ValueErrorr   r$   maxitemr   shapestride
contiguousr   len
batch_sizeh_qh_khead_dimdtyper   torchint32cudais_availableRuntimeErrorcurrent_deviceget_device_capability_is_supported)r&   _brK   s_qdrL   s_kdevicemajorminorcompute_capabilityr)   r)   r*   check_supportI   s   
 &&  "" 
 


zTopKReduction.check_supportcurrent_streamc                 C   s  | j d | |}|   g | j| jR }| jt| jt| j	| j
| j| j|| jd}| jd u r;dt| j n| j}ttj}|| }| j| j| j| j| j| jf}t| jddjdd}t| jddjdd}	t| jddjdd}
t| jddjdd}t| jddjdd}| jd	krt| j  nd }| jd	krt| j! nd }t"j#||||	|
||||||d
| _$| j d d S )NzEntering compileelement_dtyper   r   r   compress_block_sliding_stride	mma_tilerr         ?r   assumed_alignr-   leading_dimr/   r,   
problem_sizeQKLSETopk_scoresTopk_indicessoftmax_scale_log2_ecumulative_s_qcumulative_s_kstreamzKernel compiled successfully)%r#   r%   _get_default_stream_ensure_support_checkedr   rM   r"   r   rN   r   r   r   r   r   r   mathsqrtlog2erJ   r   r   rK   rL   r   r   mark_layout_dynamicr   r   r   r   r;   r   r   cutecompile_compiled_kernel)r&   ra   re   topk_reductionr   log2_err   rl   sample_q_cutesample_k_cutesample_lse_cutesample_topk_scores_cutesample_topk_indices_cutesample_cum_seqlen_q_cutesample_cum_seqlen_k_cuter)   r)   r*   r~      sX   
 
	zTopKReduction.compileFq_tensork_tensor
lse_tensortopk_scores_tensortopk_indices_tensorcumulative_s_q_tensorcumulative_s_k_tensorskip_compilec
                 C   s(  | j d | |	}	| jdkr|d u s|d u rtd|jdkr0| j d |ddd}|jdkrD| j d	 |ddd}|jdkrY| j d
 |ddd}n|jdkrr| j d | 	|ddddd}|jdkr| j d |ddd}|jdkr| j d |ddd}|jdkr| j d |
d}|ddkr| j d | }t|ddjdd}
t|ddjdd}t|ddjdd}t|ddjdd}t|ddjdd}| jdkrt| nd }| jdkrt| nd }| jd u rdt| j n| j}ttj}|| }| j| j| j| j| j| jf}|s[| jd u r>td| j d | j||
||||||||	d
 | j d d S | j d | jt| jt| j| j | j!| j"g | j#| jR | j$d}|||
||||||||	d
 | j d d S )NzEntering executer,   zTcumulative_s_q_tensor and cumulative_s_k_tensor are required when using T,H,D layoutr-   r.   r   r   r/   r0   z&reshaping lse_tensor from T,H to 1,H,Tr1   r   r2   r3      z1reshaping lse_tensor to remove trailing dimensionr4   r7   r   rg   ri   rf   z!TopKReduction kernel not compiledzExecuting with compiled kernelrk   z*Executed with compiled kernel successfullyz'Executing without compiled kernel (JIT)rb   zExecuted successfully)%r#   r%   rv   r;   rC   r<   r=   r>   r?   r@   rB   rG   r$   rH   r   r|   r   rx   ry   rM   rz   r{   rJ   r   r   rK   rL   r   r"   r   rN   r   r   r   r   r   r   )r&   r   r   r   r   r   r   r   r   ra   q_cutek_cutelse_cutetopk_scores_cutetopk_indices_cutecumulative_s_q_cutecumulative_s_k_cuter   r   rr   rl   r   r)   r)   r*   execute   s   









"		zTopKReduction.execute)N)NNFN)__name__
__module____qualname____doc__rO   float32Tensorr   intrN   boolr   floatr!   r`   rQ   CUstreamr~   r   __classcell__r)   r)   r'   r*   r
      s    	

-j;	
r
   r   r   r   Tr   r   r   r   cum_seqlen_q_tensorcum_seqlen_k_tensorr   r   r   r   r   r   r   r   r   ra   r+   c                 C   sz  t d d\}}|d ur4|d ur4|d  }|jd }tj||||| jd}tj|||tj| jd}nB|d u rl|d u rl| j\}}}}|j\}}}}tj|||||| jddd}tj||||tj| jddd}n
t	d| d| | j|j|j|d ur|jnd |d ur|jnd | j
|j
|j
|d ur|j
nd |d ur|j
nd |  | | |d ur| nd |d ur| nd |||||	|
|||f}|tv rt d	 t| }|j| |||||||d
 ||fS tdi d| d|d|d|d|d|d|d|d|d|d|d|	d|
d|d|d|}| s"J |j|d |j| |||||||d
 |t|< ||fS )Nz7topk_reduction_wrapper: Entering topk_reduction_wrapper)NNr4   r   )rN   r\   r/   zmcum_seqlen_q_tensor and cum_seqlen_k_tensor must either both be None (B,H,S,D) or both not None (T,H,D), got r6   zDtopk_reduction_wrapper: Using previously cached TopKReduction object)r   r   r   r   r   r   r   ra   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )ra   r)   )r#   r%   rE   rF   rO   emptyr\   rP   r?   rC   rN   rG   _cache_of_TopKReductionObjectsr   r
   r`   r~   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   ra   r   r   total_seq_len_qrL   rX   rW   rY   	cache_keyr   r)   r)   r*   topk_reduction_wrapperW  s   

 $

	

r   )!nsa_top_k_reduction_fwdr   cuda.bindingsr   rQ   rO   typingr   r   rx   cutlasscutlass.cuter}   cutlass.cute.runtimer   cudnn.datatypesr   cudnn.api_baser	   r
   logging	getLoggerr   r#   r   r   r   r   rN   r   r   r   r   r)   r)   r)   r*   <module>   sz      C
	

