o
    'i+k                  9   @   sd  U d dl mZ d dlmZ d dlZddlmZ ddlmZ zd dl	Z	dZ
W n ey2   dZ	d	Z
Y nw dai Zeejejf ed
< dejfddZdejjfddZG dd deZddddddd	ddddddddddejdejdejdedee dee deej dejdeej dee dee dee d eej d!eej d"eej d#eej d$eej d%eej d&eej d'eej f(d(d)Ze
r>e	je	jjgd*e	jed+ddddddd	ddddddddd,dejdejdejdedee dee deej deej deej dee dee d eej d!eej d"eej d#eej d$eej d%eej d&eej d'eej f&d-d.Z ddddddddddddd/dejdejdejded0ejd1ededejdejdeej d2eded3eej d4eej d5eej d eej d!eej d"eej d#eej d$eej d%eej d&eej d'eej d6e!ejejf f0d7d8Z"edddddddddddd	ddd9dejdejdejded0ejd1ededejdejdeej d2eded3eej d4eej d5eej d eej d!eej d"eej d#eej d$eej d%eej d&eej d:ed;ee# d'eej d6e!ejeej f f4d<d=Z$dS )>    )Enum)OptionalN   )flashinfer_api   )get_cudnn_fmha_gen_moduleTF_dummy_scale_tensorsdevicec                 C   s>   t | }|d u rtjdg| tjddddd}|t | < |S )Ng      ?r	   dtyper   )r   gettorchtensorfloat32reshape)r	   t r   U/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/flashinfer/cudnn/prefill.py_get_dummy_scale_tensor   s
   
 r   streamc                 C   s"   t d u rt a tt | j t S )N)_cudnn_handlecudnncreate_handle
set_streamcuda_stream)r   r   r   r   _create_cudnn_handle   s   r   c                   @   sl   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdS )UIDsr   r   r      d   e            2   3   4   5   6   i  i                          N)__name__
__module____qualname__RESERVED_INVALID_UIDQ_UIDK_UIDV_UIDACTUAL_SEQ_LENS_Q_UIDACTUAL_SEQ_LENS_KV_UIDBLOCK_TABLES_UIDBLOCK_TABLES_K_UIDBLOCK_TABLES_V_UIDRAGGED_Q_UIDRAGGED_O_UIDRAGGED_STATS_UIDRAGGED_K_UIDRAGGED_V_UIDO_UID	STATS_UIDQ_SCALE_UIDK_SCALE_UIDV_SCALE_UIDS_SCALE_UIDS_DESCALE_UIDO_SCALE_UID
S_AMAX_UID
O_AMAX_UIDr   r   r   r   r   )   s2    r   )max_token_seq_qmax_sequence_kvactual_seq_lens_qblock_tables	page_sizebottom_right_causal_mask
return_lsebatch_offsets_qbatch_offsets_obatch_offsets_kbatch_offsets_vbatch_offsets_statsoutlseo_data_typeqk_cachev_cachescalerK   rL   rM   actual_seq_lens_kvrN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   c                C   s   |j d }|  dkr| j d | j d }}n|  dkr(| j d | j d }}| dkr:|j d |j d }}n| dkrK|j d |j d }}|d urT|j d }	||  | j| |||||||d u||
|	f}|S )Nr   r   r   r      )shapedimr   )rZ   r[   r\   r]   rK   rL   rM   r^   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   graph_bh_qod_qkh_kvd_vokeyr   r   r   _sdpa_prefill_key_fnK   s6   

rh   )
heur_modes)key_fn)rK   rL   rM   r^   rN   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   c          :      C   s  t tj| j}|jd }|}|}tj st	dtj
| j}tj
|j}tj
|j}|d u r7| j}tj
|}|tjjksI|tjjkrXt dk rXt	dt  t|\}}|  dkrs| jd | jd }}n|  dkr| jd | jd }}ntd	| j |jd
||||f|| ||| df|d}|tjjks|tjjkr|jdddtjjd} |jdddtjjd}!|jdddtjjd}"|jdddtjjd}#|jdddtjjd}$|jdddtjjd}%| tjj |!tjj |"tjj |#tjj |$tjj |%tjj |d ur1||}&|&tjj | |& | dkrM|d u sAJ d|jd |jd }'}(n| dkr`|jd |jd }'}(ntd|j | dkr|jd||'||f|'| | |||' df|d})|d ur||}*|*tj!j |) |* |jd||'||(f|'|( | |(|(|' df|d}+|d ur||},|,tj"j |+ |, n| dkr|jd|j|# |d})|jd|j|# |d}+|tj$j |)tj%j |+tj&j |d ur)|'|jd d|jd d}-||-}.|.tj(j ||-}/|/tj)j |d ur?||}0|0*d |0tj+j |d urU||}1|1*d |1tj,j |d uo]|d u}2|tjj-ksl|tjj.kr|j/d||)|+|d ury|0nd |d ur|1nd |2||
|	|d ur|.nd |d ur|/nd |d ur|nd tjjd\}3}4n|tjjks|tjjkrB|j0d.i d
|d|)d|+d| d|!d|"d |#d!|$d"|%d#d$d%|d&|	d'|2d(|d ur|0nd d)|d ur|1nd d*|d ur|.nd d+|d ur|/nd d,|d ur|nd \}3}4}5}6|5tj1j2d-3d4d5tjj |6tj6j2d-3d4d5tjj |d urX||}7|7tj7j |3 |7 |d urn||}8|8tj8j |4 |8 |3tj9j2d$3||||(g4||( | |(|(| dg5| |
r|4tj:j2|
5tjj3|||dg4|| d|dg ||)|+|3g}9|
r|9;|4 |d ur|9;|0 |d ur|9;|1 ||9fW  d    S 1 sw   Y  d S )/Nr   ztorch is not availablei5f zKFP8 is not supported in cuDNN backend version < 9.17.1, current version is r   r   r   r_   zInvalid query tensor shape: rZ   )namera   stride	data_typeq_scale)r   r   r   r   k_scalev_scales_scale	s_descaleo_scalez+block_tables needs 4 dimensions of kv cachezInvalid kv cache tensor shape: r[   r\   rM   r^   sdpa)rk   rZ   kv	seq_len_q
seq_len_kvuse_padding_mask
attn_scalegenerate_statsuse_causal_mask_bottom_rightpaged_attention_k_tablepaged_attention_v_tablepaged_attention_max_seq_len_kvcompute_data_typeru   rv   	descale_q	descale_k	descale_vscale_s	descale_sscale_or{   Trz   r|   ry   rw   rx   r}   r~   r   Fr   )<r   r   cudacurrent_streamr	   r`   r   	datatypesis_torch_availableRuntimeError_torch_to_cudnn_data_typer   rm   FP8_E4M3FP8_E5M2backend_versiongraphra   
ValueErrorr   FLOATset_uidr   rC   valuerD   rE   rF   rG   rH   tensor_liker<   set_ragged_offsetr?   r@   rl   r4   r5   r6   r   r:   r;   set_namer7   r8   BFLOAT16HALFrt   sdpa_fp8rI   
set_outputset_dim
set_strideset_data_typerJ   r=   r>   rA   rB   append):rZ   r[   r\   r]   rK   rL   rM   r^   rN   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   handlerb   
graph_s_qo
graph_s_kvcudnn_q_data_typecudnn_k_data_typecudnn_v_data_typecudnn_o_data_typeg_rc   rd   cudnn_qcudnn_q_scalecudnn_k_scalecudnn_v_scalecudnn_s_scalecudnn_s_descalecudnn_o_scaleragged_qre   rf   cudnn_k_cacheragged_kcudnn_v_cacheragged_vnd_block_tablescudnn_k_block_tablescudnn_v_block_tablescudnn_actual_seq_lens_qcudnn_actual_seq_lens_kvpadding_maskOStatsamax_samax_oragged_oragged_statstensors_to_returnr   r   r   _build_prefill_graph   s   
























 
	


#












  &r   )rN   rn   ro   rp   rR   rS   rT   rU   rV   rW   rX   rY   workspace_buffermax_token_per_sequencecausalrn   ro   rp   returnc                C   s   t di d| d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|\}}tjj| tjj|tjj|tjj|i}|d ur\||tjj< |d urf||tjj< |d urp||tj	j< |d urz||tj
j< |d ur||tjj< |d ur||tjj< |	d ur|	|tjj< |	|tjj< |r||tjj< |d ur||tjj< |d urt| j}||tjj< ||tjj< ||tjj< ||tjj< |d ur||tjj< |d ur||tjj< ttj| j}|j|||d |r||fS |d fS )NrZ   r[   r\   r]   rK   rL   rM   r^   rN   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   )	workspacer   r   )r   r   r4   r   r5   r6   rA   r7   r8   r<   r=   r?   r@   r:   r;   rB   r>   r   r	   rC   rF   rG   rH   rD   rE   r   r   r   r   execute)rZ   r[   r\   r]   r   r   rL   rM   r^   rN   r   rQ   rn   ro   rp   rR   rS   rT   rU   rV   rW   rX   rY   r   tensorsvar_mapdummy_scale_tensorr   r   r   r   _batch_prefill_with_kv_cache  s   	

r   )rN   rn   ro   rp   rR   rS   rT   rU   rV   rW   rX   is_cuda_graph_compatiblebackendrY   r   r   c          "      C   s@  | j d }|j d }|  dkr| j d | j d }}n|  dkr-| j d | j d }}| dkr9|j d }n| dkrD|j d }|rV|du rVtj|||| jtjd}|durf|j |||fkrftd|du rm| j}|du r|||f}tj|| j|d}tr|d	krt	d'i d
| d|d|d|d|d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d |S |sJ d!|d"kr|	du s|d#kr|	dusJ d$|du r|}|j
| jd%d&}|j
| jd%d&} t j}!|!|||| |||||||| |	|
|||dddd| ||fS )(a3  Performs batched prefill attention with paged KV cache using cuDNN.

    Args:
        q: Query tensor of shape (Total number of tokens, num_heads_qo, head_dim)
        k_cache: Key cache tensor of shape   (total_num_pages, num_heads_kv, page_size, head_dim) if paged kv cache is enabled else (Total sequence length of kv, num_heads_kv, d_qk)
        v_cache: Value cache tensor of shape (total_num_pages, num_heads_kv, page_size, head_dim) if paged kv cache is enabled else (Total sequence length of kv, num_heads_kv, d_vo)
        scale: Scaling factor for attention scores, typically 1/sqrt(head_dim)
        workspace_buffer: Workspace buffer for cuDNN operations. Scales with batch size. 128 MB should be sufficient for most cases
        max_token_per_sequence: Maximum number of tokens per query sequence (s_qo_max)
        max_sequence_kv: Maximum number of tokens per key/value sequence (s_kv_max)
        actual_seq_lens_q:  Actual number of tokens per query sequence shape (batch_size,) on cpu or device (cpu if cuda_graph is False)
        actual_seq_lens_kv: Actual sequence lengths for key/values per batch, shape (batch_size,) on CPU or device (cpu if cuda_graph is False)
        block_tables: Page table mapping for KV cache, shape (batch_size, num_pages_per_seq) on GPU
        causal: Whether to apply causal masking
        return_lse: Whether to return log-sum-exp values (must be True)
        out: Optional pre-allocated output tensor
        lse: Optional pre-allocated tensor for log-sum-exp values if return_lse is True else returns None
        is_cuda_graph_compatible: Whether the prefill operation is compatible with CUDA graph
        q_scale: Optional scale tensor for query tensor of shape (1, 1, 1, 1) on GPU
        k_scale: Optional scale tensor for key tensor of shape (1, 1, 1, 1) on GPU
        v_scale: Optional scale tensor for value tensor of shape (1, 1, 1, 1) on GPU
        batch_offsets_q: Optional batch offsets for query tensor of shape (batch_size,) on GPU
        batch_offsets_o: Optional batch offsets for output tensor of shape (batch_size,) on GPU
        batch_offsets_k: Optional batch offsets for key tensor of shape (batch_size,) on GPU
        batch_offsets_v: Optional batch offsets for value tensor of shape (batch_size,) on GPU
        o_data_type: Optional data type for output tensor
    Returns:
        Output tensor of shape (batch_size * seq_len_q, num_heads_qo, head_dim)
        If return_lse is True, also returns log-sum-exp tensor of shape (batch_size, seq_len_q, num_heads_qo)

    Note:
        Query and KV heads can have different sizes (num_heads_qo >= num_heads_kv)
        When using cuda graph, actual_seq_lens_q and actual_seq_lens_kv must be on the same device as q
        Head dimension of query and key must be 128 or 192
        Head dimension of value and output must be 128
    r   r   r   r   r_   Nr
   zAlse must have shape (num_sequences, max_token_per_sequence, h_qo)cubinrZ   r[   r\   r]   r   r   rL   rM   r^   rN   r   rQ   rn   ro   rp   rR   rS   rT   rU   rV   rW   rX   rY   z)Currently only supports return_lse = True      ziCurrently only supports if d_qk = 192 and block_tables is None or d_qk = 128 and block_tables is not NoneT)non_blockingr   )r`   ra   r   emptyr	   r   r   r   CUDNN_AVAILABLEr   tor   prefill)"rZ   r[   r\   r]   r   r   rL   rM   r^   rN   r   rQ   rn   ro   rp   rR   rS   rT   rU   rV   rW   rX   r   r   rY   
num_tokensnum_sequencesrc   rd   rf   	out_shapeactual_seq_lens_q_gpuactual_seq_lens_kv_gpurun_funcr   r   r   !cudnn_batch_prefill_with_kv_cache*  s   
B


	
r   )%enumr   typingr   r   api_loggingr   utilsr   r   r   	Exceptionr   r   dictr	   Tensor__annotations__r   r   Streamr   r   floatintboolr   rh   jit	heur_modeAgraph_cacher   tupler   strr   r   r   r   r   <module>   s
   
(	

9
	
  L	

c	
