o
    'i0                  !   @   s  d dl mZ d dlmZ d dlZddlmZ ddlmZ zd dl	Z	dZ
W n ey1   dZ	d	Z
Y nw dad
ejjfddZG dd deZddddddddejdejdejdededee deej deej deej deej deej fddZe
re	je	jjgde	jedddddddddejdejdejdededee deej deej deej deej deej fdd Zddddddddd!dejdejdejded"ejdedeej deej deej dee deej deej d#eej d$eej d%ejd&ejf d'd(Zeddd	dddddd)dejdejdejded"ejdedeej deej d*edeej deej d#eej d$eej d%eej d&ejfd+d,ZdS )-    )Enum)OptionalN   )flashinfer_api   )get_cudnn_fmha_gen_moduleTFstreamc                 C   s"   t d u rt a tt | j t S )N)_cudnn_handlecudnncreate_handle
set_streamcuda_stream)r    r   T/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/flashinfer/cudnn/decode.py_create_cudnn_handle   s   r   c                   @   sD   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdS )UIDsr   r   r      d   e            2   3   4   i  i  N)__name__
__module____qualname__RESERVED_INVALID_UIDQ_UIDK_UIDV_UIDACTUAL_SEQ_LENS_Q_UIDACTUAL_SEQ_LENS_KV_UIDBLOCK_TABLES_UIDBLOCK_TABLES_K_UIDBLOCK_TABLES_V_UIDRAGGED_Q_UIDRAGGED_O_UIDRAGGED_STATS_UIDO_UID	STATS_UIDr   r   r   r   r      s    r   )
block_sizeactual_seq_lens_qactual_seq_lens_kvblock_tablesbatch_offsets_qbatch_offsets_oqk_cachev_cachescalemax_sequence_kvr,   r-   r.   r/   r0   r1   c                C   s   d|t | jt |jfS )Ndecode)tupleshape)r2   r3   r4   r5   r6   r,   r-   r.   r/   r0   r1   r   r   r   _sdpa_decode_key_fn4   s
   r:   )
heur_modes)key_fnc                 C   s&  t tj }d }	d }
t|[\}}|  dkr.d}| jd | jd | jd }}}n&|  dkrK| jd | jd | jd | jd f\}}}}n	td|   |dks\J d| dksfJ d|jd }|j	d	||||f|| ||| dftj
jd
}|	d ur||	}|tjj || ||}||}|tjj |tjj |tjj |d ur||jd d|jd d}||}|tjj ||}|tjj |d ur||}|tjj |d ur||}|tjj |d |d u}|jd||||d ur|nd |d ur|nd |d||||tj
jd\}}|
d urD||
}|tjj || |tjjd ||||g!|| ||| dg"tj
j W d    n	1 spw   Y  ||||g}|d ur|#| |d ur|#| ||fS )Nr   r   r   r      z#q must have 3 or 4 dimensions, got z"q must have a sequence length of 1zk_cache must have 4 dimensionsr2   )namedimstride	data_typeFsdpaT)r>   r2   kv	seq_len_q
seq_len_kvuse_padding_maskis_inference
attn_scalepaged_attention_k_tablepaged_attention_v_tablepaged_attention_max_seq_len_kvcompute_data_type)$r   torchcudacurrent_streamr
   graphr?   r9   
ValueErrortensorrA   BFLOAT16tensor_likeset_uidr   r'   valueset_ragged_offsetr   r    r!   reshaper%   r&   r"   r#   set_is_pass_by_valuerB   FLOATr(   r*   
set_outputset_dim
set_strideset_data_typeappend) r2   r3   r4   r5   r6   r,   r-   r.   r/   r0   r1   handleg_s_qobh_qod_qkd_vocudnn_qragged_qcudnn_k_cachecudnn_v_cachend_block_tablescudnn_k_block_tablescudnn_v_block_tablescudnn_actual_seq_lens_qcudnn_actual_seq_lens_kvpadding_maskOragged_otensors_to_returnr   r   r   _build_decode_graphL   s   $
















Y



rv   )r-   r.   r/   r,   r0   r1   batch_offsets_kbatch_offsets_vworkspace_bufferrw   rx   outreturnc       
         C   s   t | ||||||||	|
d ur|
nd |
d ur|
nd d\}}ttj }tjj| tjj|tj	j|tj
j|i}|d ur@||tjj< |d urJ||tjj< |
d urT|
|tjj< |d ur^||tjj< |d urn||tjj< ||tjj< |j|||d |S )N)r2   r3   r4   r5   r6   r-   r.   r/   r,   r0   r1   )	workspacera   )rv   r   rN   rO   rP   r   r   rW   r    r!   r*   r"   r#   r'   r(   r%   r&   execute)r2   r3   r4   r5   ry   r6   r-   r.   r/   r,   r0   r1   rw   rx   rz   rQ   tensorshandle_var_mapr   r   r   _batch_decode_with_kv_cache   s@   
r   )r.   r/   is_cuda_graph_compatibler0   r1   rw   rx   rz   r   c       	         C   s   | j d }| j d }|j d }|du rtj|||| j| jd}ts?|j| jdd}t j}||| |||||||||	|
| |S tj	|dddf| jtj
d}|j d }t| |||||||||	|
||d	 |S )
a)  Performs batched decode attention with paged KV cache using cuDNN.

    Args:
        q: Query tensor of shape (batch_size, num_heads_qo, head_dim), seq_len_q is the maximum sequence length of queries in the batch
        k_cache: Key cache tensor of shape   (total_num_pages, num_heads_kv, page_size, head_dim)
        v_cache: Value cache tensor of shape (total_num_pages, num_heads_kv, page_size, head_dim)
        scale: Scaling factor for attention scores, typically 1/sqrt(head_dim)
        workspace_buffer: Workspace buffer for cuDNN operations. Scales with batch size. 128 MB should be sufficient for most cases
        max_sequence_kv: Maximum number of tokens per key/value sequence (s_kv_max)
        actual_seq_lens_kv: Actual sequence lengths for key/values per batch, shape (batch_size,) on CPU
        block_tables: Page table mapping for KV cache, shape (batch_size, num_pages_per_seq) on GPU
        is_cuda_graph_compatible: Whether the decode operation is compatible with CUDA graph
        batch_offsets: Optional batch offsets tensor of shape (batch_size,) on GPU
        out: Optional pre-allocated output tensor
        batch_offsets_q: Optional batch offsets for query tensor of shape (batch_size,) on GPU
        batch_offsets_o: Optional batch offsets for output tensor of shape (batch_size,) on GPU
        batch_offsets_k: Optional batch offsets for key tensor of shape (batch_size,) on GPU
        batch_offsets_v: Optional batch offsets for value tensor of shape (batch_size,) on GPU

    Returns:
        Output tensor of shape (batch_size, num_heads_qo, head_dim)

    Note:
        Currently only supports causal attention (causal must be True)
        All tensors must be contiguous and on the same CUDA device
        Query and KV heads can have different sizes (num_heads_qo >= num_heads_kv)
    r   r   r   N)devicedtypeT)non_blockingr   )r2   r3   r4   r5   ry   r6   r-   r.   r/   r0   r1   r,   rz   )r9   rN   emptyr   r   CUDNN_AVAILABLEtor   r7   onesint32r   )r2   r3   r4   r5   ry   r6   r.   r/   r   r0   r1   rw   rx   rz   bsrf   rh   actual_seq_lens_kv_gpurun_funcr-   r,   r   r   r    cudnn_batch_decode_with_kv_cache   sX   
.

%
r   )enumr   typingr   rN   api_loggingr   utilsr   r
   r   ImportErrorr	   rO   Streamr   r   Tensorfloatintr:   jit	heur_modeAgraph_cacherv   r   boolr   r   r   r   r   <module>   s:   		


	
	

;		
