o
    پi:                     @   s  d dl mZ d dlmZmZ d dlZzd dlmZ W n   edzddl	m
Z W n ey5   dZY nw eddd#d	efd
dZdd Z																								 		 					d$deeeejf  deej deej deej deej deej dee deej deej deej deej dee fdd Z													 				 					d%d!d"Z
dS )&    )	lru_cache)OptionalUnionN)	flash_opszACan not import FA3 in sgl_kernel. Please check your installation.   )flash_attn_varlen_func)maxsizereturnc                 C   s4   t jjdkot j| d dkpt j| d dkS )Nz12.3r   	      )torchversioncudaget_device_capability)device r   I/home/ubuntu/.local/lib/python3.10/site-packages/sgl_kernel/flash_attn.pyis_fa3_supported   s   
r   c                 C   s"   | d ur|  ddkr|  S | S )Nr   )stride
contiguous)xr   r   r   maybe_contiguous#   s   "r   Fr   r           T   cache_seqlenscache_batch_idxcache_leftpad
page_tablecu_seqlens_qcu_seqlens_k_newmax_seqlen_qrotary_seqlens	q_descale	k_descale	v_descaleattention_chunkc"           %      C   s  |!dkrt dusJ d|du r|du sJ d|du r$|du r$|du s(J d|	du r0|
du s4J d|du r@|du r@|du sDJ d|dkrJd	}t d'i d
| d|d|d|d|d|d|d|d|d|d|d|d|d|d|d| S |ddksJ d|ddksJ d|du r| jd |dur|jd nd d }|durt|trtj|jd f|tj|jd }t	|}d!d" | |||fD \} }}}|ddkr|d#dkr|
 n|}d$d" ||fD \}}d%d" ||	|
fD \}}	}
d&d" ||fD \}}t	|}|du rdnt|}tjjjjg | |||||d|d|d||d||	|
|||||||||d |d ||||||||R  ^}"}#}$|r||"|#g|$R S |"S )(a  
    If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from
    k and v. This is useful for incremental decoding: you can pass in the cached keys/values from
    the previous step, and update them with the new keys/values from the current step, and do
    attention with the updated cache, all in 1 kernel.

    If you pass in k / v, you must make sure that the cache is large enough to hold the new values.
    For example, the KV cache could be pre-allocated with the max sequence length, and you can use
    cache_seqlens to keep track of the current sequence lengths of each sequence in the batch.

    Also apply rotary embedding if rotary_cos and rotary_sin are passed in. The key @k will be
    rotated by rotary_cos and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
    If causal or local (i.e., window_size != (-1, -1)), the query @q will be rotated by rotary_cos
    and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
    If not causal and not local, the query @q will be rotated by rotary_cos and rotary_sin at
    indices cache_seqlens only (i.e. we consider all tokens in @q to be at position cache_seqlens).

    See tests/test_flash_attn.py::test_flash_attn_kvcache for examples of how to use this function.

    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
    0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Note: Does not support backward pass.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no page_table,
            or (num_blocks, page_block_size, nheads_k, headdim) if there's a page_table (i.e. paged KV cache)
            page_block_size must be a multiple of 256.
        v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim_v) if there's no page_table,
            or (num_blocks, page_block_size, nheads_k, headdim_v) if there's a page_table (i.e. paged KV cache)
        k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
            k with k_cache, starting at the indices specified by cache_seqlens.
        v [optional]: (batch_size, seqlen_new, nheads_k, headdim_v). Similar to k.
        qv [optional]: (batch_size, seqlen, nheads, headdim_v)
        rotary_cos [optional]: (seqlen_ro, rotary_dim / 2). If not None, we apply rotary embedding
            to k and q. Only applicable if k and v are passed in. rotary_dim must be divisible by 16.
        rotary_sin [optional]: (seqlen_ro, rotary_dim / 2). Similar to rotary_cos.
        cache_seqlens: int, or (batch_size,), dtype torch.int32. The sequence lengths of the
            KV cache.
        cache_batch_idx: (batch_size,), dtype torch.int32. The indices used to index into the KV cache.
            If None, we assume that the batch indices are [0, 1, 2, ..., batch_size - 1].
            If the indices are not distinct, and k and v are provided, the values updated in the cache
                 might come from any of the duplicate indices.
        cache_leftpad: (batch_size,), dtype torch.int32. The index that the KV cache starts. If None, assume 0.
        page_table [optional]: (batch_size, max_num_blocks_per_seq), dtype torch.int32.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        attention_chunk: Optional[int]. If not None, splits the query into chunks of this size to save memory.
        softcap: float. Anything > 0 activates softcapping attention.
        rotary_interleaved: bool. Only applicable if rotary_cos and rotary_sin are passed in.
            If True, rotary embedding will combine dimensions 0 & 1, 2 & 3, etc. If False,
            rotary embedding will combine dimensions 0 & rotary_dim / 2, 1 & rotary_dim / 2 + 1
            (i.e. GPT-NeoX style).
        num_splits: int. If > 1, split the key/value into this many chunks along the sequence.
           If num_splits == 1, we don't split the key/value. If num_splits == 0, we use a heuristic
           to automatically determine the number of splits.
           Don't change this unless you know what you are doing.
        return_softmax_lse: bool. Whether to return the logsumexp of the attention scores.
        score_mod [optional]: A callable that takes the attention scores and applies a modification.
        aux_tensors [optional]: Some score_mods will want to read from global aux_tensors. This is how we thread them through to the inner kernel.

    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
       N5FA4 is not available, please check your installation.z0FA4 does not support updating KV cache in-place.z&FA4 does not support rotary embedding.zCFA4 does not support non-consecutive batch indices or left padding.zFA4 does not support descale.r   NNqkvr    	seqused_ksoftmax_scalecausalwindow_sizesoftcap
num_splitspack_gqareturn_softmax_lselearnable_sinkr   	score_modaux_tensorsr   r   z+k_cache must have contiguous last dimensionz+v_cache must have contiguous last dimensionr         )dtyper   c                 S      g | ]}t |qS r   r   .0r   r   r   r   
<listcomp>       z+flash_attn_with_kvcache.<locals>.<listcomp>c                 S   r;   r   r<   r=   r   r   r   r?          c                 S   r;   r   r<   r=   r   r   r   r?      rB   c                 S   r;   r   r<   r=   r   r   r   r?      r@   r   )flash_attn_varlen_func_v4r   shape
isinstanceintr   fullint32r   r   r   ops
sgl_kernelfwddefault)%r+   k_cachev_cacher,   r-   qv
rotary_cos
rotary_sinr   r   r   r   r    r!   r"   r#   r$   r%   r&   r/   r0   r1   r'   r2   rotary_interleavedscheduler_metadatar3   r4   	sm_marginr5   sinksr7   r8   veroutsoftmax_lserestr   r   r   flash_attn_with_kvcache'   s(  |
	

	
 !"#&rZ   c                 C   s6  |dkr)t d usJ d|dkrd}t | ||f|||||	|
||||||||dS t s0td|d u s8|d u r<td|
d u rR| jd |d urM|jd nd	 d
 }
|d u rXd	nt|}tjjj	j
| ||d d |d ||d ||||d d d d d d ||||
||d	 |d ||fdd ||||d^}}}|r||g|R S |S )Nr(   r)   r   r*   )r    cu_seqlens_k	seqused_qr.   r   r/   r0   r1   r2   r4   r6   r5   r7   r8   z<flash_attn at sgl-kernel is only supported on sm90 and abovez2max_seqlen_q and max_seqlen_k are required for FA3r   r   r9   r   F)is_rotary_interleavedrS   r3   r4   rT   rU   )rC   r   NotImplementedError
ValueErrorrD   rF   r   rI   rJ   rK   rL   )r+   r,   r-   r    r[   r"   max_seqlen_kr\   r.   r   r/   r0   rO   r$   r%   r&   r1   r'   r2   r3   r4   rT   r5   rU   r7   r8   rV   rW   rX   rY   r   r   r   r     s   

&r   )N)NNNNNNNNNNNNNNNNNFr   Nr   TNr   Nr   FNNNr   )NNNNNNFNNNNr   r   r   r   Nr   FNNNr   )	functoolsr   typingr   r   r   rJ   r   ImportError_fa4_interfacer   rC   boolr   r   rF   TensorrZ   r   r   r   r   <module>   s    	

 p