o
    پil                  9   @   sd  U d dl mZ d dlmZ d dlZddlmZ ddlmZ zd dl	Z	dZ
W n ey2   dZ	d	Z
Y nw dai Zeejejf ed
< dejfddZdejjfddZG dd deZddddddd	ddddddddddejdejdejdedee dee deej dejdeej dee dee dee d eej d!eej d"eej d#eej d$eej d%eej d&eej d'eej f(d(d)Ze
r>e	je	jjgd*e	jed+ddddddd	ddddddddd,dejdejdejdedee dee deej deej deej dee dee d eej d!eej d"eej d#eej d$eej d%eej d&eej d'eej f&d-d.Z ddddddddddddd/dejdejdejded0ejd1ededejdejdeej d2eded3eej d4eej d5eej d eej d!eej d"eej d#eej d$eej d%eej d&eej d'eej d6e!ejejf f0d7d8Z"edddddddddddd	ddd9dejdejdejded0ejd1ededejdejdeej d2eded3eej d4eej d5eej d eej d!eej d"eej d#eej d$eej d%eej d&eej d:ed;ee# d'eej d6e!ejeej f f4d<d=Z$dS )>    )Enum)OptionalN   )flashinfer_api   )get_cudnn_fmha_gen_moduleTF_dummy_scale_tensorsdevicec                 C   s>   t | }|d u rtjdg| tjddddd}|t | < |S )Ng      ?r	   dtyper   )r   gettorchtensorfloat32reshape)r	   t r   L/home/ubuntu/.local/lib/python3.10/site-packages/flashinfer/cudnn/prefill.py_get_dummy_scale_tensor   s
   
 r   streamc                 C   s"   t d u rt a tt | j t S )N)_cudnn_handlecudnncreate_handle
set_streamcuda_stream)r   r   r   r   _create_cudnn_handle   s   r   c                   @   sl   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdS )UIDsr   r   r      d   e            2   3   4   5   6   i  i                          N)__name__
__module____qualname__RESERVED_INVALID_UIDQ_UIDK_UIDV_UIDACTUAL_SEQ_LENS_Q_UIDACTUAL_SEQ_LENS_KV_UIDBLOCK_TABLES_UIDBLOCK_TABLES_K_UIDBLOCK_TABLES_V_UIDRAGGED_Q_UIDRAGGED_O_UIDRAGGED_STATS_UIDRAGGED_K_UIDRAGGED_V_UIDO_UID	STATS_UIDQ_SCALE_UIDK_SCALE_UIDV_SCALE_UIDS_SCALE_UIDS_DESCALE_UIDO_SCALE_UID
S_AMAX_UID
O_AMAX_UIDr   r   r   r   r   )   s2    r   )max_token_seq_qmax_sequence_kvactual_seq_lens_qblock_tables	page_sizebottom_right_causal_mask
return_lsebatch_offsets_qbatch_offsets_obatch_offsets_kbatch_offsets_vbatch_offsets_statsoutlseo_data_typeqk_cachev_cachescalerK   rL   rM   actual_seq_lens_kvrN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   c                C   s   |j d }|  dkr| j d | j d }}n|  dkr(| j d | j d }}| dkr:|j d |j d }}n| dkrK|j d |j d }}|d urT|j d }	||  | j| |||||||d u||
|	f}|S )Nr   r   r   r      )shapedimr   )rZ   r[   r\   r]   rK   rL   rM   r^   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   graph_bh_qod_qkh_kvd_vokeyr   r   r   _sdpa_prefill_key_fnK   s6   

rh   )
heur_modes)key_fn)rK   rL   rM   r^   rN   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   c          =      C   s  t tj| j}|jd }|}|}tj st	dtj
| j}tj
|j}tj
|j}|d u r7| j}tj
|}|tjjksI|tjjkrXt dk rXt	dt  t|\}}|  dkrz| jd | jd }}|  \}} }!n!|  dkr| jd | jd }}|  \}} }!ntd	| j |jd
||||f|| | ||!f|d}"|tjjks|tjjkr'|jdddtjjd}#|jdddtjjd}$|jdddtjjd}%|jdddtjjd}&|jdddtjjd}'|jdddtjjd}(|#tjj |$tjj |%tjj |&tjj |'tjj |(tjj |d ur=||})|)tj j |"!|) | dkrY|d u sMJ d|jd |jd }*}+n| dkrl|jd |jd }*}+ntd|j | dkr| \}} }!|jd||*||f|*| | | ||!f|d},|d ur||}-|-tj"j |,!|- | dksJ d| \}} }!|jd||*||+f|*|+ | | ||!f|d}.|d ur||}/|/tj#j |.!|/ n| dkr
|jd|j| |d},|jd|j| |d}.|"tj$j |,tj%j |.tj&j |d urJ|'|jd d|jd d}0||0}1|1tj(j ||0}2|2tj)j |d ur`||}3|3*d |3tj+j |d urv||}4|4*d |4tj,j |d uo~|d u}5|tjj-ks|tjj.kr|j/d|"|,|.|d ur|3nd |d ur|4nd |5||
|	|d ur|1nd |d ur|2nd |d ur|nd tjjd\}6}7n|tjjks|tjjkrc|j0d/i d
|"d|,d|.d|#d|$d |%d!|&d"|'d#|(d$d%d&|d'|	d(|5d)|d ur	|3nd d*|d ur|4nd d+|d ur|1nd d,|d ur'|2nd d-|d ur1|nd \}6}7}8}9|8tj1j2d.3d4d5tjj |9tj6j2d.3d4d5tjj |d ury||}:|:tj7j |6!|: |d ur||};|;tj8j |7!|; |6tj9j2d%3||||+g4||+ | |+|+| dg5| |
r|7tj:j2|
5tjj3|||dg4|| d|dg |"|,|.|6g}<|
r|<;|7 |d ur|<;|3 |d ur|<;|4 ||<fW  d    S 1 sw   Y  d S )0Nr   ztorch is not availablei5f zKFP8 is not supported in cuDNN backend version < 9.17.1, current version is r   r   r   r_   zInvalid query tensor shape: rZ   )namera   stride	data_typeq_scale)r   r   r   r   k_scalev_scales_scale	s_descaleo_scalez+block_tables needs 4 dimensions of kv cachezInvalid kv cache tensor shape: r[   z=v_cache must have 3 dimensions since k_cache has 3 dimensionsr\   rM   r^   sdpa)rk   rZ   kv	seq_len_q
seq_len_kvuse_padding_mask
attn_scalegenerate_statsuse_causal_mask_bottom_rightpaged_attention_k_tablepaged_attention_v_tablepaged_attention_max_seq_len_kvcompute_data_typeru   rv   	descale_q	descale_k	descale_vscale_s	descale_sscale_or{   Trz   r|   ry   rw   rx   r}   r~   r   Fr   )<r   r   cudacurrent_streamr	   r`   r   	datatypesis_torch_availableRuntimeError_torch_to_cudnn_data_typer   rm   FP8_E4M3FP8_E5M2backend_versiongraphra   rl   
ValueErrorr   FLOATset_uidr   rC   valuerD   rE   rF   rG   rH   tensor_liker<   set_ragged_offsetr?   r@   r4   r5   r6   r   r:   r;   set_namer7   r8   BFLOAT16HALFrt   sdpa_fp8rI   
set_outputset_dim
set_strideset_data_typerJ   r=   r>   rA   rB   append)=rZ   r[   r\   r]   rK   rL   rM   r^   rN   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   handlerb   
graph_s_qo
graph_s_kvcudnn_q_data_typecudnn_k_data_typecudnn_v_data_typecudnn_o_data_typeg_rc   rd   s_strideh_strided_stridecudnn_qcudnn_q_scalecudnn_k_scalecudnn_v_scalecudnn_s_scalecudnn_s_descalecudnn_o_scaleragged_qre   rf   cudnn_k_cacheragged_kcudnn_v_cacheragged_vnd_block_tablescudnn_k_block_tablescudnn_v_block_tablescudnn_actual_seq_lens_qcudnn_actual_seq_lens_kvpadding_maskOStatsamax_samax_oragged_oragged_statstensors_to_returnr   r   r   _build_prefill_graph   s  
























 
	


#












  &r   )rN   rn   ro   rp   rR   rS   rT   rU   rV   rW   rX   rY   workspace_buffermax_token_per_sequencecausalrn   ro   rp   returnc                C   s   t di d| d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|\}}tjj| tjj|tjj|tjj|i}|d ur\||tjj< |d urf||tjj< |d urp||tj	j< |d urz||tj
j< |d ur||tjj< |d ur||tjj< |	d ur|	|tjj< |	|tjj< |r||tjj< |d ur||tjj< |d urt| j}||tjj< ||tjj< ||tjj< ||tjj< |d ur||tjj< |d ur||tjj< ttj| j}|j|||d |r||fS |d fS )NrZ   r[   r\   r]   rK   rL   rM   r^   rN   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   )	workspacer   r   )r   r   r4   r   r5   r6   rA   r7   r8   r<   r=   r?   r@   r:   r;   rB   r>   r   r	   rC   rF   rG   rH   rD   rE   r   r   r   r   execute)rZ   r[   r\   r]   r   r   rL   rM   r^   rN   r   rQ   rn   ro   rp   rR   rS   rT   rU   rV   rW   rX   rY   r   tensorsvar_mapdummy_scale_tensorr   r   r   r   _batch_prefill_with_kv_cache  s   	

r   )rN   rn   ro   rp   rR   rS   rT   rU   rV   rW   rX   is_cuda_graph_compatiblebackendrY   r   r   c          "      C   s@  | j d }|j d }|  dkr| j d | j d }}n|  dkr-| j d | j d }}| dkr9|j d }n| dkrD|j d }|rV|du rVtj|||| jtjd}|durf|j |||fkrftd|du rm| j}|du r|||f}tj|| j|d}tr|d	krt	d'i d
| d|d|d|d|d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d |S |sJ d!|d"kr|	du s|d#kr|	dusJ d$|du r|}|j
| jd%d&}|j
| jd%d&} t j}!|!|||| |||||||| |	|
|||dddd| ||fS )(a3  Performs batched prefill attention with paged KV cache using cuDNN.

    Args:
        q: Query tensor of shape (Total number of tokens, num_heads_qo, head_dim)
        k_cache: Key cache tensor of shape   (total_num_pages, num_heads_kv, page_size, head_dim) if paged kv cache is enabled else (Total sequence length of kv, num_heads_kv, d_qk)
        v_cache: Value cache tensor of shape (total_num_pages, num_heads_kv, page_size, head_dim) if paged kv cache is enabled else (Total sequence length of kv, num_heads_kv, d_vo)
        scale: Scaling factor for attention scores, typically 1/sqrt(head_dim)
        workspace_buffer: Workspace buffer for cuDNN operations. Scales with batch size. 128 MB should be sufficient for most cases
        max_token_per_sequence: Maximum number of tokens per query sequence (s_qo_max)
        max_sequence_kv: Maximum number of tokens per key/value sequence (s_kv_max)
        actual_seq_lens_q:  Actual number of tokens per query sequence shape (batch_size,) on cpu or device (cpu if cuda_graph is False)
        actual_seq_lens_kv: Actual sequence lengths for key/values per batch, shape (batch_size,) on CPU or device (cpu if cuda_graph is False)
        block_tables: Page table mapping for KV cache, shape (batch_size, num_pages_per_seq) on GPU
        causal: Whether to apply causal masking
        return_lse: Whether to return log-sum-exp values (must be True)
        out: Optional pre-allocated output tensor
        lse: Optional pre-allocated tensor for log-sum-exp values if return_lse is True else returns None
        is_cuda_graph_compatible: Whether the prefill operation is compatible with CUDA graph
        q_scale: Optional scale tensor for query tensor of shape (1, 1, 1, 1) on GPU
        k_scale: Optional scale tensor for key tensor of shape (1, 1, 1, 1) on GPU
        v_scale: Optional scale tensor for value tensor of shape (1, 1, 1, 1) on GPU
        batch_offsets_q: Optional batch offsets for query tensor of shape (batch_size,) on GPU
        batch_offsets_o: Optional batch offsets for output tensor of shape (batch_size,) on GPU
        batch_offsets_k: Optional batch offsets for key tensor of shape (batch_size,) on GPU
        batch_offsets_v: Optional batch offsets for value tensor of shape (batch_size,) on GPU
        o_data_type: Optional data type for output tensor
    Returns:
        Output tensor of shape (batch_size * seq_len_q, num_heads_qo, head_dim)
        If return_lse is True, also returns log-sum-exp tensor of shape (batch_size, seq_len_q, num_heads_qo)

    Note:
        Query and KV heads can have different sizes (num_heads_qo >= num_heads_kv)
        When using cuda graph, actual_seq_lens_q and actual_seq_lens_kv must be on the same device as q
        Head dimension of query and key must be 128 or 192
        Head dimension of value and output must be 128
    r   r   r   r   r_   Nr
   zAlse must have shape (num_sequences, max_token_per_sequence, h_qo)cubinrZ   r[   r\   r]   r   r   rL   rM   r^   rN   r   rQ   rn   ro   rp   rR   rS   rT   rU   rV   rW   rX   rY   z)Currently only supports return_lse = True      ziCurrently only supports if d_qk = 192 and block_tables is None or d_qk = 128 and block_tables is not NoneT)non_blockingr   )r`   ra   r   emptyr	   r   r   r   CUDNN_AVAILABLEr   tor   prefill)"rZ   r[   r\   r]   r   r   rL   rM   r^   rN   r   rQ   rn   ro   rp   rR   rS   rT   rU   rV   rW   rX   r   r   rY   
num_tokensnum_sequencesrc   rd   rf   	out_shapeactual_seq_lens_q_gpuactual_seq_lens_kv_gpurun_funcr   r   r   !cudnn_batch_prefill_with_kv_cache1  s   
B


	
r   )%enumr   typingr   r   api_loggingr   utilsr   r   r   	Exceptionr   r   dictr	   Tensor__annotations__r   r   Streamr   r   floatintboolr   rh   jit	heur_modeAgraph_cacher   tupler   strr   r   r   r   r   <module>   s
   
(	

9
	
  S	

c	
