o
    ۾iG                     @   s~  d dl mZmZmZmZ d dlZd dlmZ zddlm	Z	 dZ
dZW n ey: Z zeeZ
dZW Y dZ[ndZ[ww zddlmZ dZdZW n ey_ Z zeeZdZW Y dZ[ndZ[ww zd dlmZ dZdZW n ey Z zeeZdZW Y dZ[ndZ[ww d	Zd*d
eeee f fddZd*d
eeee f fddZd*d
eeee f fddZd*ded
efddZd*ded
ee fddZdd Zejdddddd dddd dd fdej deej  deej  deej  dee f
ddZ!dddd dddd ddddddddddd eddd dfd!eee  d"edefd#d$Z"	 			 			d+ddd%d&d'Z#	 			 			d+ddd%d(d)Z$dS ),    )OptionalUnionTupleListN   )_vllm_fa2_CTF)_vllm_fa3_C)_flash_attn_fwd   returnc                 C   s.   t s	ddt fS tj| d dk rdS dS )NFzFA2 is unavaible due to: r      )Fz=FA2 is only supported on devices with compute capability >= 8TN)FA2_AVAILABLEFA2_UNAVAILABLE_REASONtorchcudaget_device_capabilitydevice r   ]/home/ubuntu/.local/lib/python3.10/site-packages/vllm/vllm_flash_attn/flash_attn_interface.py_is_fa2_supported(   
   r   c                 C   sB   t s	ddt fS tj| d dk stj| d dkrdS dS )NFzFA3 is unavaible due to: r   	   
   )Fz<FA3 is only supported on devices with compute capability 9.0r   )FA3_AVAILABLEFA3_UNAVAILABLE_REASONr   r   r   r   r   r   r   _is_fa3_supported0   s   r   c                 C   s.   t s	ddt fS tj| d dkrdS dS )NFzFA4 is unavaible due to: r   r   )Fz>FA4 is only supported on devices with compute capability == 10r   )FA4_AVAILABLEFA4_UNAVAILABLE_REASONr   r   r   r   r   r   r   _is_fa4_supported9   r   r    
fa_versionc                 C   V   | dv sJ d|  | dkrt |d S | dkrt|d S | dkr)t|d S d S )Nr
         Unsupported FA version: r
   r   r$   r%   r   r   r    r!   r   r   r   r   is_fa_version_supportedA   s   r)   c                 C   r"   )Nr#   r&   r
   r   r$   r%   r'   r(   r   r   r   fa_version_unsupported_reasonJ   s   r*   c                 C   s"   | d ur|  ddkr|  S | S )Nr   )stride
contiguous)xr   r   r   maybe_contiguousZ   s   "r/   r+   r+   cache_seqlenscu_seqlens_qcu_seqlens_k_newcache_leftpad	page_sizec                 C   sZ   t |}|d u r
|}tjj| |||||||||	d |
d |||||d |d ||||}|S )Nr   r   )r/   r   opsr   get_scheduler_metadata)
batch_sizemax_seqlen_qmax_seqlen_knum_heads_qnum_heads_kvheaddimr1   	qkv_dtype	headdim_vr2   r3   r4   r5   max_seqlen_k_newcausalwindow_sizehas_softcap
num_splitspack_gqa	sm_marginscheduler_metadatar   r   r   r7   ^   s,   r7           rB   rD   c           "      C   sJ  |dus|dusJ d|du s|du sJ d|du s$|dus$J d|
du r/| j d d }
|du r6d}nt|dks>J |d	 |d
 f}dd | ||fD \} }}t|}|dkr|durp|durp|durp|durptd|durxtd|d
krtdtjj| |||||du r|n||d|||||	|
d||d	 |d
 ||o|	d	k|d\}} no|dkr|du sJ dtjjj	g | ||dd||||dd||||ddddd||||
||d	 |d
 |d||dd	||||R  \}} }!}!nt
d| |r#|| fS |S )aU  dropout_p should be set to 0.0 during evaluation
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
    0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_softmax_lse=True]: (nheads, total_q_seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
    Nz*cu_seqlens_k or seqused_k must be providedz>cu_seqlens_k and seqused_k cannot be provided at the same timez5seqused_k must be provided if block_table is providedr+         r0   r
   r   r   c                 S      g | ]}t |qS r   r/   .0r.   r   r   r   
<listcomp>       z*flash_attn_varlen_func.<locals>.<listcomp>zHFA2 does not support scheduler_metadata, q_descale, k_descale, v_descalezFA2 does not support s_auxz#FA2 does not support num_splits > 1Fr$   zAlibi is not supported in FA3Tr&   )shapelenr   
empty_likeNotImplementedErrorr6   r   
varlen_fwdr   fwd
ValueError)"qkvr9   r2   r:   cu_seqlens_k	seqused_kq_v	dropout_psoftmax_scalerA   rB   softcapalibi_slopesdeterministicreturn_attn_probsblock_tablereturn_softmax_lseoutrG   	q_descale	k_descale	v_descalerD   r!   s_auxcp_world_sizecp_rankcp_tot_seqused_kreal_window_sizedummy_cu_seqlens_ksoftmax_lse_r   r   r   flash_attn_varlen_func   s   V




		
rq   )rd   re   c                C   sv   |du r| j d d }dd | ||fD \} }}tjj| |||||||||||	|
|o.|dkd\}}|r9||fS |S )a  Compute attention with vertical and slash sparsity patterns.
    Most Arguments are the same with the flash_attn_func interface, except for 4 extra args:
    block_count and block_offset for slash sparsity patterns, and
    column_count and column_index for vertical sparsity patterns.
    For more details please refer to Appendix C.4.2 of paper https://arxiv.org/abs/2407.02490.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        k: (batch_size, seqlen, nheads_k, headdim)
        v: (batch_size, seqlen, nheads_k, headdim)
        block_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        block_offset: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_S)
        column_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        column_index: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_V)
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
    Nr+   rI   c                 S   rJ   r   rK   rL   r   r   r   rN   o  rO   z$sparse_attn_func.<locals>.<listcomp>r   )rP   r   r6   r   
fwd_sparse)rW   rX   rY   block_countblock_offsetcolumn_countcolumn_indexr]   r^   rA   r_   r`   ra   rb   rd   re   ro   r   r   r   sparse_attn_func9  s*   3
rw   c                C   s   |du r| j d d }dd | ||fD \} }}tjj| |||||||||d||	|
||d|||o4|dkd\}}|r?||fS |S )al
  Compute attention with vertical and slash sparsity patterns.
    Most Arguments are the same with the flash_attn_varlen_func interface, except for 4 extra args:
    block_count and block_offset for slash sparsity patterns, and
    column_count and column_index for vertical sparsity patterns.
    For more details please refer to Appendix C.4.2 of paper https://arxiv.org/abs/2407.02490.
    
    Arguments:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        block_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        block_offset: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_S)
        column_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        column_index: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_V)
        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_softmax_lse=True]: (nheads, total_q_seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
    Nr+   rI   c                 S   rJ   r   rK   rL   r   r   r   rN     rO   z+sparse_attn_varlen_func.<locals>.<listcomp>Fr   )rP   r   r6   r   varlen_fwd_sparse)rW   rX   rY   rs   rt   ru   rv   r2   rZ   r9   r:   r]   r^   rA   r_   r`   ra   rb   rd   re   ro   r   r   r   sparse_attn_varlen_func  s6   >
ry   )N)rH   NFrH   NFF)%typingr   r   r   r   r   torch.nnnn r   r   r   ImportErrorestrr   r   r   flash_attn.cute.interfacer	   r   r   DEFAULT_FA_VERSIONboolr   r   r    intr)   r*   r/   bfloat16Tensorr7   rq   rw   ry   r   r   r   r   <module>   s   		

1

 :W