o
    蹏iU                  "   @   s6  d dl mZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlZddlmZmZ ddlZG d	d
 d
eZddlZeeZi Zdddddejdddf	dejdejdejdejdejdeej deej dedee deej dejdee dee deej deejejejf fddZ dS )   )HopperSelectAttentionFwd    )_convert_to_cutlass_data_type)APIBaseN)from_dlpack)driver)TupleOptionalc                        sj  e Zd Zddddejddfdejdejdejdejdejd	ejd
ejdejdeej deej dee dee dejdedee	 f fddZ
defddZdejdejdejdejdejdejdeejdf fdd Zd2d!eej ddfd"d#Z					$d3d%ejd&ejd'ejd(ejd)ejd*ejd+ejd,ejd-eej d.eej dee	 d!eej d/efd0d1Z  ZS )4SelectionAttentionNi   @   sample_qsample_ksample_vsample_osample_lsample_msample_block_indicessample_block_countssample_cum_seqlen_qsample_cum_seqlen_kmax_s_qmax_s_k	acc_dtype
block_sizescale_softmaxc                     sB  t    t| _| jd | jd || _|| _|| _	|| _
|| _|| _|| _|| _|	| _|
| _|| _|| _|| _|| _d | _d | _d | _d | _d | _d | _d | _|| _| jd|j d|j d|j d|j d|j d|j d	|j d
|j d|	d ur|	jnd d|
d ur|
jnd d| d| d| d| d|  d S )Nz)SelectionAttention is an experimental APIzEntering __init__z'__init__ completed with args: sample_q z, sample_k z, sample_v z, sample_o z, sample_l z, sample_m z, sample_block_indices z, sample_block_counts z, sample_cum_seqlen_q Nonez, sample_cum_seqlen_k z, acc_dtype z
, max_s_q z
, max_s_k z, block_size z, scale_softmax )super__init__r   _kernel_loggerwarningdebugr   r   r   r   r   r   r   r   r   r   r   r   r   r   input_layoutdtypeh_qh_kvgqa_group_sizehead_dim	value_dimr   shape)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	__class__ h/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/cudnn/native_sparse_attention/selection/api.pyr      s:   
zSelectionAttention.__init__returnc           
      C   s  | j d | j d | jjdkrd| _td| jjdkrd| _| jj\}}}| jj\}}}| jj\}}}| j	j\}}}| jj|||fkrWt
d|||f d	| jj | jj|||fkrot
d
|||f d	| jj | jj|||fkrt
d|||f d	| jj | j	j|||fkrt
d|||f d	| j	j | | jdd| _| jj||fkrt
d||f d	| jj | | jdd| _| jj||fkrt
d||f d	| jj | jd u rt
d| j | jd urt| j| jstd| j d| j | jd u rt
d| j | jd ur,| j| jkr,td| j d| j t| jd | _| jdkrBt
d| j | jjtjtjfvrVt
d| jj | jjd d ||fkr{| jjdkr{t
d||df d	t| jj | jj||fkrt
d||f d	t| jj | jjtjks| jjtjkrt
d| jj d| jj n	t
d| jj || dkrt
d || _|| _|| | _|| _|| _ | j d! | jj| _| j| jj  kr| jj  kr| j	jkst
d" t
d"| jtj!tj"hvrt
d#| j#tj$hvrt
d$| j%d%vr(t
d&| j&d u r7d't'(| j | _&tj)* sG| j +d( t,d(| j d) tj)- }tj).|\}}|d* | }	|	d+k r{| j +d,|	 d-|  t,d,|	 d-| |	d.krt,d/d0| _/| j d1 d0S )2NzEntering check_supportz+Checking shape normalization and validation   B,H,S,Dz#B, H_q, S, D format not implemented   T,H,Dz.Input shape mismatch: expected Q tensor shape z, got z.Input shape mismatch: expected K tensor shape z.Input shape mismatch: expected V tensor shape z/Output shape mismatch: expected O tensor shape    r   z/Output shape mismatch: expected L tensor shape r   z/Output shape mismatch: expected M tensor shape z;sample_cum_seqlen_q must be provided for T,H,D format, got zaSelectionAttention requires sample_cum_seqlen_q and sample_cum_seqlen_k to be identical, but got z and z/max_s_q must be provided for T,H,D format, got zISelectionAttention requires max_s_q and max_s_k to be identical, but got r   r   zFbatch_size (len(sample_cum_seqlen_q) - 1) must be greater than 0, got z0sample_cum_seqlen_q must be int32 or int64, got z.sample_block_indices shape mismatch: expected Kz-sample_block_counts shape mismatch: expected z@sample_block_indices and sample_block_counts must be int32, got z9sample_q must be rank-3 (T,H,D) or rank-4 (B,H,S,D), got z3H_q must be a multiple of H_kv (GQA/MQA constraint)zChecking dtypes and configz1All input/output tensors must have the same dtypez!dtype must be Float16 or BFloat16zacc_dtype must be Float32>          r   z block_size must be 16, 32, or 64g      ?zCUDA is not availablezChecking environment
   Z   z/Requires SM90+ compute capability, but found SMz on device g   z4cuteDSL SelectionAttention is not supported on SM103Tz$check_support completed successfully)0r   r!   r   ndimr"   NotImplementedErrorr)   r   r   r   
ValueError_unpad_tensor_to_ndimr   r   r   r   torchequalr   r   len
batch_sizer#   int32int64r   tupler   r$   r%   r&   r'   r(   float16bfloat16r   float32r   r   mathsqrtcudais_availableerrorRuntimeErrorcurrent_deviceget_device_capability_is_supported)
r*   tr$   d_qkr%   d_vdevicemajorminorcompute_capabilityr-   r-   r.   check_supportH   s   
("  

0


z SelectionAttention.check_supportqkvolm.c                 C   sb  | j dkr	td| j dkrg|j\}}}	|j\}
}}
|j\}
}
}|||| j|	dddd}|ddd}|ddd}|||| j|dddd}|||| jddd}|||| jddd}ntd| j  d	d
 }|||s|td|||std|||std|||std|||std|||std||||||fS )a(  
        Reshape tensors from input format to kernel expected format:
        - Q: (gqa_group_size, d, T, h_kv)
        - K: (T, d, h_kv)
        - V: (T, d_v, h_kv)
        - O: (gqa_group_size, d_v, T, h_kv)
        - L: (gqa_group_size, T, h_kv)
        - M: (gqa_group_size, T, h_kv)
        r1   zB,H,S,D format not implementedr3   r4   r2   r   r   zInvalid input layout: c                 S   s   |   |  kS N)data_ptr)originalreshapedr-   r-   r.   shares_memory   s   z:SelectionAttention._reshape_tensors.<locals>.shares_memoryz@Q tensor memory changed during reshape - expected view operationz@K tensor memory changed during reshape - expected view operationz@V tensor memory changed during reshape - expected view operationz@O tensor memory changed during reshape - expected view operationz@L tensor memory changed during reshape - expected view operationz@M tensor memory changed during reshape - expected view operation)r"   r<   r)   viewr&   permuter=   )r*   rZ   r[   r\   r]   r^   r_   Tr$   d_r%   rT   
q_reshaped
k_reshaped
v_reshaped
o_reshaped
l_reshaped
m_reshapedrd   r-   r-   r.   _reshape_tensors   s6   







z#SelectionAttention._reshape_tensorscurrent_streamc                 C   s$  | j d | |}|   | j| j| j| j| jt	| j
t	| jd}| j d | | j| j| j| j| j| j\}}}}}}t|dd}	t|dd}
t|dd}t|dd}t|}t|}t| j}t| j}t| j}| j d tj||	|
||||||| j|| j|d| _| j d d S )	NzEntering compiler'   r(   GQA_group_sizer   r#   r   +Reshaping tensors to kernel expected format   assumed_alignzCompiling selection_attentionQr5   VOLMblock_indicesblock_counts
max_lengthseq_offsetssoftmax_scalestreamzKernel compiled successfully)r   r!   _get_default_stream_ensure_support_checkedr   r'   r(   r&   r   r   r#   r   rp   r   r   r   r   r   r   r   r   r   r   cutecompiler   r   _compiled_kernel)r*   rq   selection_attentionrj   rk   rl   rm   rn   ro   mQmKmVmOmLmMm_block_indicesm_block_countsm_cum_seqlen_qr-   r-   r.   r      s\   
		


zSelectionAttention.compileFq_tensork_tensorv_tensoro_tensorl_tensorm_tensorblock_indices_tensorblock_counts_tensorcum_seqlen_q_tensorcum_seqlen_k_tensorskip_compilec                 C   s  | j d | |}| j d | |dd}| |dd}| ||||||\}}}}}}t|dd}t|dd}t|dd}t|dd}t|}t|}t|}t|}t|	}|d u rb| jn|}|s| jd u rotd| j d	 | j||||||||| j	|||d
 | j d d S | j d | j
| j| j| j| jt| jt| jd}|||||||||| j	|||d
 | j d d S )NzEntering executert   r4   r   r   ru   rv   z&SelectionAttention kernel not compiledzExecuting with compiled kernelrx   z*Executed with compiled kernel successfullyz'Executing without compiled kernel (JIT)rr   zExecuted successfully)r   r!   r   r>   rp   r   r   r   rN   r   r   r'   r(   r&   r   r   r#   r   )r*   r   r   r   r   r   r   r   r   r   r   r   rq   r   rj   rk   rl   rm   rn   ro   r   r   r   r   r   r   r   r   r   r   r-   r-   r.   execute"  sz   

zSelectionAttention.executer`   )NNNNF)__name__
__module____qualname__r?   rH   Tensorr	   intr#   floatr   boolrY   r   rp   rK   CUstreamr   r   __classcell__r-   r-   r+   r.   r
      s    	
9i
<?	
r
   r   r   r   r   r   r   r   r   r   r   o_dtyper   r   r   r   r/   c                 C   s  t d |du rt|dd |dd   n|}|du r/t|dd |dd   n|}| j\}}}|j\}}}|	durC|	n| j}	tj|||f|	| jd}tj||dftj	| jd}tj||dftj	| jd}| j|j|j|j|j|j|j| j|j|j| 
 |
 |
 |
 |
 |
 |
 |||
||f}|tv rt d t| }|j| |||||||||||d n7t d t| ||||||||||
||||d	}| sJ |  |j| |||||||||||d |t|< |||fS )
z
    Selection Attention Wrapper that returns output tensors directly.

    Returns:
        tuple: (o_tensor, l_tensor, m_tensor) - Output, logsumexp, and max tensors
    zFselection_attention_wrapper: Creating empty output tensors o, l, and mNr   )r#   rU   zNselection_attention_wrapper: Using previously cached SelectionAttention object)r   r   r   r   r   r   r   r   r   r   r   rq   zyselection_attention_wrapper: No previously cached SelectionAttention object found, creating new SelectionAttention object)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r!   maxitemr)   r#   r?   emptyrU   rH   stride#_cache_of_SelectionAttentionObjectsr   r
   rY   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   rR   r$   rh   ri   r%   rT   r   r   r   	cache_keyr   r-   r-   r.   selection_attention_wrapper}  s   
,,


r   )!NSA_select_attn_fwd_hmmar   cudnn.datatypesr   cudnn.api_baser   cutlasscutlass.cuter   cutlass.cute.runtimer   cuda.bindingsr   rK   r?   typingr   r	   rI   r
   logging	getLoggerr   r   r   rH   r   r   r   r#   r   rE   r   r-   r-   r-   r.   <module>   sp      k
		
