from typing import Optional, Sequence, Tuple, Union

import os

import torch
import torch.nn as nn

# isort: off
# We need to import the CUDA kernels after importing torch
USE_TRITON_ROCM = os.getenv("FLASH_ATTENTION_TRITON_AMD_ENABLE", "FALSE") == "TRUE"

if USE_TRITON_ROCM:
    from .flash_attn_triton_amd import interface_fa as flash_attn_gpu
else:
    import flash_attn_2_cuda as flash_attn_gpu

# isort: on


def maybe_contiguous(x):
    return x.contiguous() if x is not None and x.stride(-1) != 1 else x


def _get_block_size_n(device, head_dim, is_dropout, is_causal):
    # This should match the block sizes in the CUDA kernel.
    assert head_dim <= 256
    major, minor = torch.cuda.get_device_capability(device)
    is_sm8x = major == 8 and minor > 0  # Only include sm86 and sm89, exclude sm80 (A100)
    is_sm80 = major == 8 and minor == 0
    is_sm90 = major == 9 and minor == 0
    if head_dim <= 32:
        return 128
    if head_dim <= 64:
        return 128 if not is_dropout else 64
    elif head_dim <= 96:
        return 64
    elif head_dim <= 128:
        if is_sm8x:
            return 64 if (not is_dropout and is_causal) else 32
        else:
            return 64 if not is_dropout else 32
    elif head_dim <= 160:
        return 64 if is_sm8x else 32
    elif head_dim <= 192:
        return 64
    elif head_dim <= 224:
        return 64
    elif head_dim <= 256:
        return 64


def round_multiple(x, m):
    return (x + m - 1) // m * m


# torch.compile() support is only enabled for pytorch >= 2.4; older versions fall back
# to no-op wrappers around the custom_op / register_fake decorators.
if torch.__version__ >= "2.4.0":
    _torch_custom_op_wrapper = torch.library.custom_op
    _torch_register_fake_wrapper = torch.library.register_fake
else:
    def noop_custom_op_wrapper(name, fn=None, /, *, mutates_args, device_types=None, schema=None):
        def wrap(func):
            return func
        if fn is None:
            return wrap
        return fn

    def noop_register_fake_wrapper(op, fn=None, /, *, lib=None, _stacklevel=1):
        def wrap(func):
            return func
        if fn is None:
            return wrap
        return fn

    _torch_custom_op_wrapper = noop_custom_op_wrapper
    _torch_register_fake_wrapper = noop_register_fake_wrapper

d \}}}}||||fS )Nc                 S      g | ]}t |qS r   r   .0r   r   r   r   
<listcomp>Z       z'_flash_attn_forward.<locals>.<listcomp>)flash_attn_gpufwd)r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   outsoftmax_lseS_dmask	rng_stater   r   r   _flash_attn_forwardL   s"   rS   c                 C   s   dd | ||fD \} }}| j \}}}}|j d }t| }tj|||ftj| j| jd}tjd| j| j| jd}|
rQtj||t|dt|df| j| j| jd}tjdtj	| jd}||||fS )	Nc                 S   rG   r   rH   rI   r   r   r   rK   {   rL   z,_flash_attn_forward_fake.<locals>.<listcomp>r	   dtyper   layoutr   r      rU   r   )
shaper   
empty_likeemptyfloat32r   rV   rU   r)   int64)r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   
batch_sizeseqlen_q	num_heads	head_sizeseqlen_krO   rP   prR   r   r   r   _flash_attn_forward_fakem   s   

@_torch_custom_op_wrapper("flash_attn::_flash_attn_varlen_forward", mutates_args=(), device_types="cuda")
def _flash_attn_varlen_forward(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    cu_seqlens_q: torch.Tensor,
    cu_seqlens_k: torch.Tensor,
    max_seqlen_q: int,
    max_seqlen_k: int,
    dropout_p: float,
    softmax_scale: float,
    causal: bool,
    window_size_left: int,
    window_size_right: int,
    softcap: float,
    alibi_slopes: Optional[torch.Tensor],
    return_softmax: bool,
    block_table: Optional[torch.Tensor],
    leftpad_k: Optional[torch.Tensor] = None,
    seqused_k: Optional[torch.Tensor] = None,
    zero_tensors: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
    out, softmax_lse, S_dmask, rng_state = flash_attn_gpu.varlen_fwd(
        q, k, v, None,
        cu_seqlens_q, cu_seqlens_k,
        seqused_k, leftpad_k, block_table, alibi_slopes,
        max_seqlen_q, max_seqlen_k,
        dropout_p, softmax_scale, zero_tensors, causal,
        window_size_left, window_size_right,
        softcap, return_softmax, None,
    )
    return out, softmax_lse, S_dmask, rng_state


@_torch_register_fake_wrapper("flash_attn::_flash_attn_varlen_forward")
def _flash_attn_varlen_forward_fake(
    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
    cu_seqlens_q: torch.Tensor, cu_seqlens_k: torch.Tensor,
    max_seqlen_q: int, max_seqlen_k: int,
    dropout_p: float, softmax_scale: float, causal: bool,
    window_size_left: int, window_size_right: int, softcap: float,
    alibi_slopes: Optional[torch.Tensor], return_softmax: bool,
    block_table: Optional[torch.Tensor],
    leftpad_k: Optional[torch.Tensor] = None,
    seqused_k: Optional[torch.Tensor] = None,
    zero_tensors: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
    paged_kv = block_table is not None
    batch_size = cu_seqlens_q.numel() - 1
    total_q, num_heads, _ = q.shape
    out = torch.empty_like(q)
    softmax_lse = torch.empty((num_heads, total_q), dtype=torch.float32, device=q.device, layout=q.layout)
    p = torch.empty((0,), dtype=q.dtype, device=q.device, layout=q.layout)
    seqlen_q_rounded = round_multiple(max_seqlen_q, 128)
    seqlen_k_rounded = round_multiple(max_seqlen_k, 128)
    if return_softmax:
        p = torch.empty(
            (batch_size, num_heads, seqlen_q_rounded, seqlen_k_rounded),
            dtype=q.dtype, device=q.device, layout=q.layout,
        )
    rng_state = torch.empty((2,), dtype=torch.int64, device=q.device)
    return out, softmax_lse, p, rng_state


if torch.__version__ >= "2.4.0":
    _wrapped_flash_attn_varlen_forward = torch.ops.flash_attn._flash_attn_varlen_forward
else:
    _wrapped_flash_attn_varlen_forward = _flash_attn_varlen_forward

@_torch_custom_op_wrapper("flash_attn::_flash_attn_backward", mutates_args=("dq", "dk", "dv"), device_types="cuda")
def _flash_attn_backward(
    dout: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    out: torch.Tensor,
    softmax_lse: torch.Tensor,
    dq: Optional[torch.Tensor],
    dk: Optional[torch.Tensor],
    dv: Optional[torch.Tensor],
    dropout_p: float,
    softmax_scale: float,
    causal: bool,
    window_size_left: int,
    window_size_right: int,
    softcap: float,
    alibi_slopes: Optional[torch.Tensor],
    deterministic: bool,
    rng_state: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    # dq, dk, dv are allocated by us so they should already be contiguous
    dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)]
    dq, dk, dv, softmax_d = flash_attn_gpu.bwd(
        dout, q, k, v, out, softmax_lse,
        dq, dk, dv, alibi_slopes,
        dropout_p, softmax_scale, causal,
        window_size_left, window_size_right,
        softcap, deterministic, None, rng_state,
    )
    return softmax_d


@_torch_register_fake_wrapper("flash_attn::_flash_attn_backward")
def _flash_attn_backward_fake(
    dout: torch.Tensor, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
    out: torch.Tensor, softmax_lse: torch.Tensor,
    dq: Optional[torch.Tensor], dk: Optional[torch.Tensor], dv: Optional[torch.Tensor],
    dropout_p: float, softmax_scale: float, causal: bool,
    window_size_left: int, window_size_right: int, softcap: float,
    alibi_slopes: Optional[torch.Tensor], deterministic: bool,
    rng_state: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)]
    if dq is None:
        dq = torch.empty_like(q)
    if dk is None:
        dk = torch.empty_like(k)
    if dv is None:
        dv = torch.empty_like(v)
    batch_size, seqlen_q, num_heads, _ = q.shape
    softmax_d = torch.empty(
        (batch_size, num_heads, round_multiple(seqlen_q, 128)),
        device=q.device, dtype=torch.float32,
    )
    return softmax_d


if torch.__version__ >= "2.4.0":
    _wrapped_flash_attn_backward = torch.ops.flash_attn._flash_attn_backward
else:
    _wrapped_flash_attn_backward = _flash_attn_backward

|||||||||||d |\}}}}|S )Nc                 S   rG   r   rH   rI   r   r   r   rK   d  rL   z/_flash_attn_varlen_backward.<locals>.<listcomp>)rM   
varlen_bwd)r|   r;   r<   r=   rO   rP   ry   rz   r{   rh   ri   rj   rk   r>   r?   r@   rA   rB   rC   rD   r}   rR   ro   r   r   r   r   _flash_attn_varlen_backwardI  sB   "r   c                 C   s   dd | ||||fD \} }}}}|	  d }|j\}}}|d u r&t|}|d u r/t|}|d u r8t|}tj||d|  f|jtjd}|S )Nc                 S   rG   r   rH   rI   r   r   r   rK     rL   z4_flash_attn_varlen_backward_fake.<locals>.<listcomp>r	   r   r   )rr   r[   r   r\   r]   r   r^   )r|   r;   r<   r=   rO   rP   ry   rz   r{   rh   ri   rj   rk   r>   r?   r@   rA   rB   rC   rD   r}   rR   ro   r`   rt   rb   ru   r   r   r   r    _flash_attn_varlen_backward_fake  s   "


class FlashAttnQKVPackedFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx, qkv, dropout_p, softmax_scale, causal, window_size, softcap,
        alibi_slopes, deterministic, return_softmax, is_grad_enabled,
    ):
        is_grad = is_grad_enabled and qkv.requires_grad
        if softmax_scale is None:
            softmax_scale = qkv.shape[-1] ** (-0.5)
        q, k, v = qkv[:, :, 0].detach(), qkv[:, :, 1].detach(), qkv[:, :, 2].detach()
        head_size_og = q.size(3)
        if head_size_og % 8 != 0:
            q = torch.nn.functional.pad(q, [0, 8 - head_size_og % 8])
            k = torch.nn.functional.pad(k, [0, 8 - head_size_og % 8])
            v = torch.nn.functional.pad(v, [0, 8 - head_size_og % 8])
        out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_forward(
            q, k, v, dropout_p, softmax_scale, causal=causal,
            window_size_left=window_size[0], window_size_right=window_size[1],
            softcap=softcap, alibi_slopes=alibi_slopes,
            return_softmax=return_softmax and dropout_p > 0,
        )
        if is_grad:
            ctx.save_for_backward(q, k, v, out_padded, softmax_lse, rng_state)
            ctx.dropout_p = dropout_p
            ctx.softmax_scale = softmax_scale
            ctx.causal = causal
            ctx.window_size = window_size
            ctx.softcap = softcap
            ctx.alibi_slopes = alibi_slopes
            ctx.deterministic = deterministic
        out = out_padded[..., :head_size_og]
        return out if not return_softmax else (out, softmax_lse, S_dmask)

    @staticmethod
    def backward(ctx, dout, *args):
        q, k, v, out, softmax_lse, rng_state = ctx.saved_tensors
        qkv_shape = q.shape[:-2] + (3, *q.shape[-2:])
        dqkv = torch.empty(qkv_shape, dtype=q.dtype, device=q.device)
        head_size_og = dout.size(3)
        dout_padded = dout
        if head_size_og % 8 != 0:
            dout_padded = torch.nn.functional.pad(dout, [0, 8 - head_size_og % 8])
        _wrapped_flash_attn_backward(
            dout_padded, q, k, v, out, softmax_lse,
            dqkv[:, :, 0], dqkv[:, :, 1], dqkv[:, :, 2],
            ctx.dropout_p, ctx.softmax_scale, ctx.causal,
            ctx.window_size[0], ctx.window_size[1], ctx.softcap,
            ctx.alibi_slopes, ctx.deterministic, rng_state=rng_state,
        )
        dqkv = dqkv[..., : dout.shape[-1]]  # We could have padded the head dimension
        return dqkv, None, None, None, None, None, None, None, None, None

class FlashAttnVarlenQKVPackedFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx, qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale, causal,
        window_size, softcap, alibi_slopes, deterministic, return_softmax,
        is_grad_enabled,
    ):
        is_grad = is_grad_enabled and qkv.requires_grad
        if softmax_scale is None:
            softmax_scale = qkv.shape[-1] ** (-0.5)
        q, k, v = qkv[:, 0].detach(), qkv[:, 1].detach(), qkv[:, 2].detach()
        head_size_og = q.size(2)
        if head_size_og % 8 != 0:
            q = torch.nn.functional.pad(q, [0, 8 - head_size_og % 8])
            k = torch.nn.functional.pad(k, [0, 8 - head_size_og % 8])
            v = torch.nn.functional.pad(v, [0, 8 - head_size_og % 8])
        out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_varlen_forward(
            q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen,
            dropout_p, softmax_scale, causal=causal,
            window_size_left=window_size[0], window_size_right=window_size[1],
            softcap=softcap, alibi_slopes=alibi_slopes,
            return_softmax=return_softmax and dropout_p > 0,
            block_table=None,
        )
        if is_grad:
            ctx.save_for_backward(q, k, v, out_padded, softmax_lse, cu_seqlens, rng_state)
            ctx.dropout_p = dropout_p
            ctx.max_seqlen = max_seqlen
            ctx.softmax_scale = softmax_scale
            ctx.causal = causal
            ctx.window_size = window_size
            ctx.softcap = softcap
            ctx.alibi_slopes = alibi_slopes
            ctx.deterministic = deterministic
        out = out_padded[..., :head_size_og]
        return out if not return_softmax else (out, softmax_lse, S_dmask)

    @staticmethod
    def backward(ctx, dout, *args):
        q, k, v, out, softmax_lse, cu_seqlens, rng_state = ctx.saved_tensors
        qkv_shape = q.shape[:-2] + (3, *q.shape[-2:])
        dqkv = torch.empty(qkv_shape, dtype=q.dtype, device=q.device)
        head_size_og = dout.size(2)
        dout_padded = dout
        if head_size_og % 8 != 0:
            dout_padded = torch.nn.functional.pad(dout, [0, 8 - head_size_og % 8])
        _wrapped_flash_attn_varlen_backward(
            dout_padded, q, k, v, out, softmax_lse,
            dqkv[:, 0], dqkv[:, 1], dqkv[:, 2],
            cu_seqlens, cu_seqlens, ctx.max_seqlen, ctx.max_seqlen,
            ctx.dropout_p, ctx.softmax_scale, ctx.causal,
            ctx.window_size[0], ctx.window_size[1], ctx.softcap,
            ctx.alibi_slopes, ctx.deterministic, rng_state=rng_state,
        )
        dqkv = dqkv[..., : dout.shape[-1]]  # We could have padded the head dimension
        return dqkv, None, None, None, None, None, None, None, None, None, None, None

class FlashAttnKVPackedFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx, q, kv, dropout_p, softmax_scale, causal, window_size, softcap,
        alibi_slopes, deterministic, return_softmax, is_grad_enabled,
    ):
        is_grad = is_grad_enabled and any(x.requires_grad for x in [q, kv])
        if softmax_scale is None:
            softmax_scale = q.shape[-1] ** (-0.5)
        k, v = kv[:, :, 0].detach(), kv[:, :, 1].detach()
        head_size_og = q.size(3)
        if head_size_og % 8 != 0:
            q = torch.nn.functional.pad(q, [0, 8 - head_size_og % 8])
            k = torch.nn.functional.pad(k, [0, 8 - head_size_og % 8])
            v = torch.nn.functional.pad(v, [0, 8 - head_size_og % 8])
        out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_forward(
            q, k, v, dropout_p, softmax_scale, causal=causal,
            window_size_left=window_size[0], window_size_right=window_size[1],
            softcap=softcap, alibi_slopes=alibi_slopes,
            return_softmax=return_softmax and dropout_p > 0,
        )
        if is_grad:
            ctx.save_for_backward(q, k, v, out_padded, softmax_lse, rng_state)
            ctx.dropout_p = dropout_p
            ctx.softmax_scale = softmax_scale
            ctx.causal = causal
            ctx.window_size = window_size
            ctx.softcap = softcap
            ctx.alibi_slopes = alibi_slopes
            ctx.deterministic = deterministic
        out = out_padded[..., :head_size_og]
        return out if not return_softmax else (out, softmax_lse, S_dmask)

    @staticmethod
    def backward(ctx, dout, *args):
        q, k, v, out, softmax_lse, rng_state = ctx.saved_tensors
        dq = torch.empty_like(q)
        kv_shape = k.shape[:-2] + (2, *k.shape[-2:])
        dkv = torch.empty(kv_shape, dtype=k.dtype, device=k.device)
        head_size_og = dout.size(3)
        dout_padded = dout
        if head_size_og % 8 != 0:
            dout_padded = torch.nn.functional.pad(dout, [0, 8 - head_size_og % 8])
        _wrapped_flash_attn_backward(
            dout_padded, q, k, v, out, softmax_lse,
            dq, dkv[:, :, 0], dkv[:, :, 1],
            ctx.dropout_p, ctx.softmax_scale, ctx.causal,
            ctx.window_size[0], ctx.window_size[1], ctx.softcap,
            ctx.alibi_slopes, ctx.deterministic, rng_state=rng_state,
        )
        dq = dq[..., : dout.shape[-1]]  # We could have padded the head dimension
        dkv = dkv[..., : dout.shape[-1]]
        return dq, dkv, None, None, None, None, None, None, None, None, None

class FlashAttnVarlenKVPackedFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx, q, kv, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
        dropout_p, softmax_scale, causal, window_size, softcap, alibi_slopes,
        deterministic, return_softmax, is_grad_enabled,
    ):
        is_grad = is_grad_enabled and any(x.requires_grad for x in [q, kv])
        if softmax_scale is None:
            softmax_scale = q.shape[-1] ** (-0.5)
        k, v = kv[:, 0].detach(), kv[:, 1].detach()
        head_size_og = q.size(2)
        if head_size_og % 8 != 0:
            q = torch.nn.functional.pad(q, [0, 8 - head_size_og % 8])
            k = torch.nn.functional.pad(k, [0, 8 - head_size_og % 8])
            v = torch.nn.functional.pad(v, [0, 8 - head_size_og % 8])
        out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_varlen_forward(
            q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
            dropout_p, softmax_scale, causal=causal,
            window_size_left=window_size[0], window_size_right=window_size[1],
            softcap=softcap, alibi_slopes=alibi_slopes,
            return_softmax=return_softmax and dropout_p > 0,
            block_table=None,
        )
        if is_grad:
            ctx.save_for_backward(
                q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state
            )
            ctx.dropout_p = dropout_p
            ctx.max_seqlen_q = max_seqlen_q
            ctx.max_seqlen_k = max_seqlen_k
            ctx.softmax_scale = softmax_scale
            ctx.causal = causal
            ctx.window_size = window_size
            ctx.softcap = softcap
            ctx.alibi_slopes = alibi_slopes
            ctx.deterministic = deterministic
        out = out_padded[..., :head_size_og]
        return out if not return_softmax else (out, softmax_lse, S_dmask)

    @staticmethod
    def backward(ctx, dout, *args):
        q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state = ctx.saved_tensors
        dq = torch.empty_like(q)
        kv_shape = k.shape[:-2] + (2, *k.shape[-2:])
        dkv = torch.empty(kv_shape, dtype=k.dtype, device=k.device)
        head_size_og = dout.size(2)
        dout_padded = dout
        if head_size_og % 8 != 0:
            dout_padded = torch.nn.functional.pad(dout, [0, 8 - head_size_og % 8])
        _wrapped_flash_attn_varlen_backward(
            dout_padded, q, k, v, out, softmax_lse,
            dq, dkv[:, 0], dkv[:, 1],
            cu_seqlens_q, cu_seqlens_k, ctx.max_seqlen_q, ctx.max_seqlen_k,
            ctx.dropout_p, ctx.softmax_scale, ctx.causal,
            ctx.window_size[0], ctx.window_size[1], ctx.softcap,
            ctx.alibi_slopes, ctx.deterministic, rng_state=rng_state,
        )
        dq = dq[..., : dout.shape[-1]]  # We could have padded the head dimension
        dkv = dkv[..., : dout.shape[-1]]
        return (
            dq, dkv, None, None, None, None, None, None, None,
            None, None, None, None, None, None,
        )

class FlashAttnFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx, q, k, v, dropout_p, softmax_scale, causal, window_size, softcap,
        alibi_slopes, deterministic, return_softmax, is_grad_enabled,
    ):
        is_grad = is_grad_enabled and any(x.requires_grad for x in [q, k, v])
        if softmax_scale is None:
            softmax_scale = q.shape[-1] ** (-0.5)
        head_size_og = q.size(3)
        if head_size_og % 8 != 0:
            q = torch.nn.functional.pad(q, [0, 8 - head_size_og % 8])
            k = torch.nn.functional.pad(k, [0, 8 - head_size_og % 8])
            v = torch.nn.functional.pad(v, [0, 8 - head_size_og % 8])
        out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_forward(
            q, k, v, dropout_p, softmax_scale, causal=causal,
            window_size_left=window_size[0], window_size_right=window_size[1],
            softcap=softcap, alibi_slopes=alibi_slopes,
            return_softmax=return_softmax and dropout_p > 0,
        )
        if is_grad:
            ctx.save_for_backward(q, k, v, out_padded, softmax_lse, rng_state)
            ctx.dropout_p = dropout_p
            ctx.softmax_scale = softmax_scale
            ctx.causal = causal
            ctx.window_size = window_size
            ctx.softcap = softcap
            ctx.alibi_slopes = alibi_slopes
            ctx.deterministic = deterministic
        out = out_padded[..., :head_size_og]
        return out if not return_softmax else (out, softmax_lse, S_dmask)

    @staticmethod
    def backward(ctx, dout, *args):
        q, k, v, out, softmax_lse, rng_state = ctx.saved_tensors
        dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v)
        head_size_og = dout.size(3)
        dout_padded = dout
        if head_size_og % 8 != 0:
            dout_padded = torch.nn.functional.pad(dout, [0, 8 - head_size_og % 8])
        _wrapped_flash_attn_backward(
            dout_padded, q, k, v, out, softmax_lse, dq, dk, dv,
            ctx.dropout_p, ctx.softmax_scale, ctx.causal,
            ctx.window_size[0], ctx.window_size[1], ctx.softcap,
            ctx.alibi_slopes, ctx.deterministic, rng_state=rng_state,
        )
        dq = dq[..., : dout.shape[-1]]  # We could have padded the head dimension
        dk = dk[..., : dout.shape[-1]]
        dv = dv[..., : dout.shape[-1]]
        return dq, dk, dv, None, None, None, None, None, None, None, None, None

class FlashAttnVarlenFunc(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx, q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
        dropout_p, softmax_scale, causal, window_size, softcap, alibi_slopes,
        deterministic, return_softmax, block_table, is_grad_enabled,
    ):
        is_grad = is_grad_enabled and any(x.requires_grad for x in [q, k, v])
        if softmax_scale is None:
            softmax_scale = q.shape[-1] ** (-0.5)
        head_size_og = q.size(2)
        if head_size_og % 8 != 0:
            q = torch.nn.functional.pad(q, [0, 8 - head_size_og % 8])
            k = torch.nn.functional.pad(k, [0, 8 - head_size_og % 8])
            v = torch.nn.functional.pad(v, [0, 8 - head_size_og % 8])
        out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_varlen_forward(
            q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
            dropout_p, softmax_scale, causal=causal,
            window_size_left=window_size[0], window_size_right=window_size[1],
            softcap=softcap, alibi_slopes=alibi_slopes,
            return_softmax=return_softmax and dropout_p > 0,
            block_table=block_table,
        )
        if is_grad:
            ctx.save_for_backward(
                q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state
            )
            ctx.dropout_p = dropout_p
            ctx.max_seqlen_q = max_seqlen_q
            ctx.max_seqlen_k = max_seqlen_k
            ctx.softmax_scale = softmax_scale
            ctx.causal = causal
            ctx.window_size = window_size
            ctx.softcap = softcap
            ctx.alibi_slopes = alibi_slopes
            ctx.deterministic = deterministic
        out = out_padded[..., :head_size_og]
        return out if not return_softmax else (out, softmax_lse, S_dmask)

    @staticmethod
    def backward(ctx, dout, *args):
        q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state = ctx.saved_tensors
        dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v)
        head_size_og = dout.size(2)
        dout_padded = dout
        if head_size_og % 8 != 0:
            dout_padded = torch.nn.functional.pad(dout, [0, 8 - head_size_og % 8])
        _wrapped_flash_attn_varlen_backward(
            dout_padded, q, k, v, out, softmax_lse, dq, dk, dv,
            cu_seqlens_q, cu_seqlens_k, ctx.max_seqlen_q, ctx.max_seqlen_k,
            ctx.dropout_p, ctx.softmax_scale, ctx.causal,
            ctx.window_size[0], ctx.window_size[1], ctx.softcap,
            ctx.alibi_slopes, ctx.deterministic, rng_state=rng_state,
        )
        dq = dq[..., : dout.shape[-1]]  # We could have padded the head dimension
        dk = dk[..., : dout.shape[-1]]
        dv = dv[..., : dout.shape[-1]]
        return (
            dq, dk, dv, None, None, None, None, None, None, None,
            None, None, None, None, None, None, None,
        )


def flash_attn_qkvpacked_func(
    qkv,
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite context window
    softcap=0.0,  # 0.0 means deactivated
    alibi_slopes=None,
    deterministic=False,
    return_attn_probs=False,
):
    """dropout_p should be set to 0.0 during evaluation
    If Q, K, V are already stacked into 1 tensor, this function will be faster than
    calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
    of the gradients of Q, K, V.
    For multi-query and grouped-query attention (MQA/GQA), please see
    flash_attn_kvpacked_func and flash_attn_func.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between [i - window_size[0], i + window_size[1]] inclusive.

    Arguments:
        qkv: (batch_size, seqlen, 3, nheads, headdim)
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of (-alibi_slope * |i - j|) is added to
            the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    """
    return FlashAttnQKVPackedFunc.apply(
        qkv,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        softcap,
        alibi_slopes,
        deterministic,
        return_attn_probs,
        torch.is_grad_enabled(),
    )

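# Illustrative usage sketch for flash_attn_qkvpacked_func (defined above; the function
# below is not called at import time). The batch/seqlen/head sizes are arbitrary example
# values, and the inputs must be fp16 or bf16 CUDA tensors as required by the kernels.
def _example_flash_attn_qkvpacked_usage():
    batch, seqlen, nheads, headdim = 2, 1024, 8, 64
    qkv = torch.randn(
        batch, seqlen, 3, nheads, headdim,
        device="cuda", dtype=torch.float16, requires_grad=True,
    )
    out = flash_attn_qkvpacked_func(qkv, dropout_p=0.0, causal=True)
    # out: (batch, seqlen, nheads, headdim); gradients flow back into the packed qkv.
    out.sum().backward()
    return qkv.grad
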
def flash_attn_kvpacked_func(
    q,
    kv,
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite context window
    softcap=0.0,  # 0.0 means deactivated
    alibi_slopes=None,
    deterministic=False,
    return_attn_probs=False,
):
    """dropout_p should be set to 0.0 during evaluation
    If K, V are already stacked into 1 tensor, this function will be faster than
    calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
    of the gradients of K, V.
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attend to head
    0 of K, V, and head 3, 4, 5 of Q will attend to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        kv: (batch_size, seqlen, 2, nheads_k, headdim)
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    """
    return FlashAttnKVPackedFunc.apply(
        q,
        kv,
        dropout_p,
        softmax_scale,
        causal,
        window_size,
        softcap,
        alibi_slopes,
        deterministic,
        return_attn_probs,
        torch.is_grad_enabled(),
    )

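# Illustrative MQA/GQA sketch for flash_attn_kvpacked_func (example sizes only): q has
# 8 heads while the packed kv has 2 heads, so each kv head serves 4 query heads.
def _example_flash_attn_kvpacked_usage():
    batch, seqlen_q, seqlen_k, headdim = 2, 512, 2048, 64
    q = torch.randn(batch, seqlen_q, 8, headdim, device="cuda", dtype=torch.bfloat16)
    kv = torch.randn(batch, seqlen_k, 2, 2, headdim, device="cuda", dtype=torch.bfloat16)
    # causal=True uses the bottom-right aligned causal mask described in the docstring above.
    return flash_attn_kvpacked_func(q, kv, causal=True)
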
t S )a
  dropout_p should be set to 0.0 during evaluation
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attend to head
    0 of K, V, and head 3, 4, 5 of Q will attend to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        k: (batch_size, seqlen, nheads_k, headdim)
        v: (batch_size, seqlen, nheads_k, headdim)
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    """
    return FlashAttnFunc.apply(
        q, k, v, dropout_p, softmax_scale, causal, window_size, softcap,
        alibi_slopes, deterministic, return_attn_probs, torch.is_grad_enabled(),
    )


def flash_attn_varlen_qkvpacked_func(
    qkv,
    cu_seqlens,
    max_seqlen,
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
    window_size=(-1, -1),  # -1 means infinite context window
    softcap=0.0,  # 0.0 means deactivated
    alibi_slopes=None,
    deterministic=False,
    return_attn_probs=False,
):
    """dropout_p should be set to 0.0 during evaluation
    If Q, K, V are already stacked into 1 tensor, this function will be faster than
    calling flash_attn_varlen_func on Q, K, V since the backward pass avoids explicit concatenation
    of the gradients of Q, K, V.
    For multi-query and grouped-query attention (MQA/GQA), please see
    flash_attn_varlen_kvpacked_func and flash_attn_varlen_func.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between [i - window_size[0], i + window_size[1]] inclusive.

    Arguments:
        qkv: (total, 3, nheads, headdim), where total = total number of tokens in the batch.
        cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into qkv.
        max_seqlen: int. Maximum sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of (-alibi_slope * |i - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    """
    return FlashAttnVarlenQKVPackedFunc.apply(
        qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale, causal,
        window_size, softcap, alibi_slopes, deterministic, return_attn_probs,
        torch.is_grad_enabled(),
    )

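# Illustrative varlen sketch for flash_attn_varlen_qkvpacked_func (example sizes only):
# two sequences of lengths 100 and 156 are packed into one (total, 3, nheads, headdim)
# tensor and indexed through cu_seqlens, as described in the docstring above.
def _example_flash_attn_varlen_qkvpacked_usage():
    seqlens = [100, 156]
    total, nheads, headdim = sum(seqlens), 8, 64
    qkv = torch.randn(total, 3, nheads, headdim, device="cuda", dtype=torch.float16)
    cu_seqlens = torch.tensor([0, 100, 256], device="cuda", dtype=torch.int32)
    return flash_attn_varlen_qkvpacked_func(
        qkv, cu_seqlens, max_seqlen=max(seqlens), causal=True
    )
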
|||t S )a  dropout_p should be set to 0.0 during evaluation
    If K, V are already stacked into 1 tensor, this function will be faster than
    calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
    of the gradients of K, V.
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attend to head
    0 of K, V, and head 3, 4, 5 of Q will attend to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        kv: (total_k, 2, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    """
    return FlashAttnVarlenKVPackedFunc.apply(
        q, kv, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
        dropout_p, softmax_scale, causal, window_size, softcap, alibi_slopes,
        deterministic, return_attn_probs, torch.is_grad_enabled(),
    )

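# Illustrative sketch for flash_attn_varlen_kvpacked_func (example sizes only): queries
# and keys/values carry their own cu_seqlens, e.g. when per-sequence query and key
# lengths differ. Two sequences with q lengths (64, 96) and k lengths (128, 172).
def _example_flash_attn_varlen_kvpacked_usage():
    q = torch.randn(160, 8, 64, device="cuda", dtype=torch.float16)      # total_q = 160
    kv = torch.randn(300, 2, 2, 64, device="cuda", dtype=torch.float16)  # total_k = 300, 2 kv heads
    cu_seqlens_q = torch.tensor([0, 64, 160], device="cuda", dtype=torch.int32)
    cu_seqlens_k = torch.tensor([0, 128, 300], device="cuda", dtype=torch.int32)
    return flash_attn_varlen_kvpacked_func(
        q, kv, cu_seqlens_q, cu_seqlens_k, max_seqlen_q=96, max_seqlen_k=172
    )
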
|||||t S )aq  dropout_p should be set to 0.0 during evaluation
    Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attend to head
    0 of K, V, and head 3, 4, 5 of Q will attend to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Arguments:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
        S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen).
            The output of softmax (possibly with different scaling). It also encodes the dropout
            pattern (negative means that location was dropped, nonnegative means it was kept).
    """
    return FlashAttnVarlenFunc.apply(
        q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
        dropout_p, softmax_scale, causal, window_size, softcap, alibi_slopes,
        deterministic, return_attn_probs, block_table, torch.is_grad_enabled(),
    )

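# Illustrative sketch for flash_attn_varlen_func with unpacked q, k, v and GQA (example
# sizes only): two sequences of lengths 128 and 256, 32 query heads, 8 kv heads.
def _example_flash_attn_varlen_usage():
    nheads, nheads_k, headdim = 32, 8, 128
    q = torch.randn(384, nheads, headdim, device="cuda", dtype=torch.bfloat16)
    k = torch.randn(384, nheads_k, headdim, device="cuda", dtype=torch.bfloat16)
    v = torch.randn(384, nheads_k, headdim, device="cuda", dtype=torch.bfloat16)
    cu_seqlens = torch.tensor([0, 128, 384], device="cuda", dtype=torch.int32)
    return flash_attn_varlen_func(
        q, k, v, cu_seqlens, cu_seqlens,
        max_seqlen_q=256, max_seqlen_k=256, causal=True,
    )
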
}t|}t|}t|
}
t	
| |||||||||	|
|d|||d	 |d |||\}}|rv||fS |S )a<  
    If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from
    k and v. This is useful for incremental decoding: you can pass in the cached keys/values from
    the previous step, and update them with the new keys/values from the current step, and do
    attention with the updated cache, all in 1 kernel.

    If you pass in k / v, you must make sure that the cache is large enough to hold the new values.
    For example, the KV cache could be pre-allocated with the max sequence length, and you can use
    cache_seqlens to keep track of the current sequence lengths of each sequence in the batch.

    Also apply rotary embedding if rotary_cos and rotary_sin are passed in. The key @k will be
    rotated by rotary_cos and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
    If causal or local (i.e., window_size != (-1, -1)), the query @q will be rotated by rotary_cos
    and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
    If not causal and not local, the query @q will be rotated by rotary_cos and rotary_sin at
    indices cache_seqlens only (i.e. we consider all tokens in @q to be at position cache_seqlens).

    See tests/test_flash_attn.py::test_flash_attn_kvcache for examples of how to use this function.

    Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
    than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
    For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attend to head
    0 of K, V, and head 3, 4, 5 of Q will attend to head 1 of K, V.

    If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
    For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
        1 1 1 1 0
        1 1 1 1 1
    If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
        0 0
        0 0
        0 0
        1 0
        1 1
    If the row of the mask is all zero, the output will be zero.

    If window_size != (-1, -1), implements sliding window local attention. Query at position i
    will only attend to keys between
    [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.

    Note: Does not support backward pass.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table,
            or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache)
            page_block_size must be a multiple of 256.
        v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table,
            or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache)
        k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
            k with k_cache, starting at the indices specified by cache_seqlens.
        v [optional]: (batch_size, seqlen_new, nheads_k, headdim). Similar to k.
        rotary_cos [optional]: (seqlen_ro, rotary_dim / 2). If not None, we apply rotary embedding
            to k and q. Only applicable if k and v are passed in. rotary_dim must be divisible by 16.
        rotary_sin [optional]: (seqlen_ro, rotary_dim / 2). Similar to rotary_cos.
        cache_seqlens: int, or (batch_size,), dtype torch.int32. The sequence lengths of the
            KV cache.
        cache_batch_idx: (batch_size,), dtype torch.int32. The indices used to index into the KV cache.
            If None, we assume that the batch indices are [0, 1, 2, ..., batch_size - 1].
            If the indices are not distinct, and k and v are provided, the values updated in the cache
                 might come from any of the duplicate indices.
        cache_leftpad: (batch_size,), dtype torch.int32. The index that the KV cache starts. If None, assume 0.
        block_table [optional]: (batch_size, max_num_blocks_per_seq), dtype torch.int32.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
        rotary_interleaved: bool. Only applicable if rotary_cos and rotary_sin are passed in.
            If True, rotary embedding will combine dimensions 0 & 1, 2 & 3, etc. If False,
            rotary embedding will combine dimensions 0 & rotary_dim / 2, 1 & rotary_dim / 2 + 1
            (i.e. GPT-NeoX style).
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        num_splits: int. If > 1, split the key/value into this many chunks along the sequence.
           If num_splits == 1, we don't split the key/value. If num_splits == 0, we use a heuristic
           to automatically determine the number of splits.
           Don't change this unless you know what you are doing.
        return_softmax_lse: bool. Whether to return the logsumexp of the attention scores.

    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
    """
    assert k_cache.stride(-1) == 1, "k_cache must have contiguous last dimension"
    assert v_cache.stride(-1) == 1, "v_cache must have contiguous last dimension"
    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
    if softmax_scale is None:
        softmax_scale = q.shape[-1] ** (-0.5)
    if cache_seqlens is not None and isinstance(cache_seqlens, int):
        cache_seqlens = torch.full(
            (k_cache.shape[0],), cache_seqlens, dtype=torch.int32, device=k_cache.device
        )
        cache_seqlens = maybe_contiguous(cache_seqlens)
    cache_batch_idx = maybe_contiguous(cache_batch_idx)
    block_table = maybe_contiguous(block_table)
    out, softmax_lse = flash_attn_gpu.fwd_kvcache(
        q, k_cache, v_cache, k, v,
        cache_seqlens, rotary_cos, rotary_sin,
        cache_batch_idx, cache_leftpad, block_table, alibi_slopes,
        None, softmax_scale, causal,
        window_size[0], window_size[1], softcap,
        rotary_interleaved, num_splits,
    )
    return (out, softmax_lse) if return_softmax_lse else out

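# Illustrative single-token decode sketch for flash_attn_with_kvcache (example sizes
# only): the KV cache is pre-allocated to a maximum length, cache_seqlens tracks how
# much of it is already filled per sequence, and the new k/v for the current step are
# appended and attended over in the same kernel call.
def _example_flash_attn_with_kvcache_usage():
    batch, max_cache_len, nheads, nheads_k, headdim = 2, 4096, 32, 8, 128
    q = torch.randn(batch, 1, nheads, headdim, device="cuda", dtype=torch.bfloat16)
    k_cache = torch.zeros(
        batch, max_cache_len, nheads_k, headdim, device="cuda", dtype=torch.bfloat16
    )
    v_cache = torch.zeros_like(k_cache)
    k_new = torch.randn(batch, 1, nheads_k, headdim, device="cuda", dtype=torch.bfloat16)
    v_new = torch.randn_like(k_new)
    cache_seqlens = torch.tensor([100, 57], device="cuda", dtype=torch.int32)
    return flash_attn_with_kvcache(
        q, k_cache, v_cache, k_new, v_new, cache_seqlens=cache_seqlens, causal=True
    )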
   rM   flash_attn_2_cudar   r'   r)   __version__library	custom_op_torch_custom_op_wrapperregister_fake_torch_register_fake_wrapperr5   r:   Tensorfloatboolr   rS   rf   ops
flash_attnr   rq   rx   r   r   r   r   r   r   r   autogradFunctionr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s  

	
 	

	
2	
%	
3	
!	
?	
(R^WhVk
>
R
Q
I
d
b	
