o
    i                     @   s(  d Z ddlZddlZddlZddlmZ edd dd dd dejdej	d	ej	d
ej	dej	dej	dej	dej	dej	fddZ
ejdej	d
ej	fddZejdej	dej	dej	fddZejdej	dej	d	ej	d
ej	dej	dej	dej	dej	dej	fddZdd Zejejdddddded d!ejddd"ddded d!gg d#d$ed%d d&d d'd dejdej	d	ej	d
ej	d(ej	dej	dej	dej	dej	dej	fd)d*Zd5d+d,Z	d5d-d.ZG d/d0 d0ejjZejZG d1d2 d2ejjZejZG d3d4 d4ejjZejZdS )6a  
*Experimental* implementation of FlashAttention in Triton.
Tested with triton==2.0.0.dev20221202.
Triton 2.0 has a new backend (MLIR) but seems like it doesn't yet work for head dimensions
other than 64:
https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
We'll update this implementation with the new Triton backend once this is fixed.

We use the FlashAttention implementation from Phil Tillet a starting point.
https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py

Changes:
- Implement both causal and non-causal attention.
- Implement both self-attention and cross-attention.
- Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
- Support attention bias.
- Speed up the forward pass a bit, and only store the LSE instead of m and l.
- Make the backward for d=128 much faster by reducing register spilling.
- Optionally parallelize the backward pass across seqlen_k, to deal with the case of
small batch size * nheads.

Caution:
- This is an *experimental* implementation. The forward pass should be quite robust but
I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
- This implementation has only been tested on A100.
- If you plan to use headdim other than 64 and 128, you should test for race conditions
(due to the Triton compiler), as done in tests/test_flash_attn.py
"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
that there are none left for other head dimensions.

Differences between this Triton version and the CUDA version:
- Triton version doesn't support dropout.
- Triton forward is generally faster than CUDA forward, while Triton backward is
generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
than CUDA forward + backward.
- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
- Triton version supports attention bias, while CUDA version doesn't.
    Nc                 C      | d | d  dkS Nseqlen_qBLOCK_Mr    argsr   r   R/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/flash_attn_triton.py<lambda><       r
   c                 C   r   Nseqlen_kBLOCK_Nr   r   r   r   r   r	   r
   =   r   c                 C      | d | d kS NheaddimBLOCK_HEADDIMr   r   r   r   r	   r
   >       EVEN_MEVEN_NEVEN_HEADDIM	BIAS_TYPE	IS_CAUSALr   r   r   r   r   r   c&           D   	   C   s  t d}&t d}'|'| }(|'| })|&|$ t d|$ }*t d|%}+t d| },| |(|  |)|	  |*d d d f |
 |,d d d f   }-||(|  |)|  |+d d d f | |,d d d f   }.||(|  |)|  |+d d d f | |,d d d f   }/|dkr||(|  |)|  |+ }0n |dkr||(|  |)|  |*d d d f | |+d d d f   }0||'|  |* }1t j|$gt jdtd }2t j|$gt jdtd }3t j|$| gt jd}4|!|"@ r|#rt |-}5n?t j|-|,d d d f |k dd}5n.|#r
t j|-|*d d d f |k dd}5nt j|-|*d d d f |k |,d d d f |k @ dd}5|s)|n	t |&d |$ |}6td|6|%D ]}7t 	|7|%}7|"|!@ ri|#rTt |.|7|  }8nOt j|.|7|  |,d d d f |k dd}8n:|#rt j|.|7|  |7|+ d d d f |k dd}8n t j|.|7|  |7|+ d d d f |k |,d d d f |k @ dd}8t j|$|%gt jd}9|9t j
|5|8d	d
7 }9|"s|9t |7|+ d d d f |k dtd7 }9|r|9t |*d d d f |7|+ d d d f kdtd7 }9|dkrw|dkr!|"rt |0|7 t j}:nt j|0|7 |7|+ |k ddt j}:|:d d d f }:n8|dkrY|!|"@ r7t |0|7 t j}:n"t j|0|7 |*d d d f |k |7|+ d d d f |k @ ddt j}:|9| |: }9t t |9d|2};t |9|;d d d f  }<nt t |9d| |2};t |9| |;d d d f  }<t |<d}=t |3|; }>t |1|> t |1}>|4|>d d d f  }4|"|!@ r|#rt |/|7|  }?nOt j|/|7|  |,d d d f |k dd}?n:|#rt j|/|7|  |7|+ d d d f |k dd}?n t j|/|7|  |7|+ d d d f |k |,d d d f |k @ dd}?|<|?j}<|4t 
|<|?7 }4|;}3t |2|; |= }@|;t |@ }2q9t |3|2 }At |1|A t |1}A|4|Ad d d f  }4t d}&|&|$ t d|$ }*||'|  |* }Bt |B|2 t d| },||(|  |)|  |*d d d f | |,d d d f   }C|!r|#rt |C|4 d S t j|C|4|,d d d f |k d d S |#rt j|C|4|*d d d f |k d d S t j|C|4|*d d d f |k |,d d d f |k @ d d S )Nr      vectormatrixdtypeinf        maskotherTtrans_b-infnoner"   )tl
program_idarangezerosfloat32floatloadminimumrangemultiple_ofdotwheretomaximummaxexpsumstorer   log)DQKVBiasOutLseTMPsoftmax_scale	stride_qb	stride_qh	stride_qm	stride_kb	stride_kh	stride_kn	stride_vb	stride_vh	stride_vn	stride_bb	stride_bh	stride_bm	stride_ob	stride_oh	stride_omnheadsr   r   seqlen_q_roundedr   CACHE_KEY_SEQLEN_QCACHE_KEY_SEQLEN_Kr   r   r   r   r   r   r   r   start_moff_hboff_boff_hoffs_moffs_noffs_dq_ptrsk_ptrsv_ptrsb_ptrst_ptrslse_im_iacc_oqend_nstart_nkqkbiasm_ijpl_ijacc_o_scalevl_i_newo_scalelse_ptrsout_ptrsr   r   r	   _fwd_kernel:   s  
0
666"""*
*

**6



	

*

*

"$$*
ru   c                 C   sR  t d}t d}||	 }||	 }|| t d| }t d|}t j| ||  ||  |d d d f |  |d d d f  |d d d f |
k |d d d f |k @ ddt j}t j|||  ||  |d d d f |  |d d d f  |d d d f |
k |d d d f |k @ ddt j}t j|| dd}t |||  | | d S )Nr   r   r    r!   )axis)r)   r*   r+   r/   r5   r-   r9   r:   )r@   DODeltarP   rQ   rR   
stride_dob
stride_doh
stride_domrS   r   rT   r   r   r   rW   rX   rY   rZ   r[   r]   ododeltar   r   r	   _bwd_preprocess_do_o_dot  s<   

6&&	r   c                 C   s   |	|@ r6|
rt || t | | d S t j|||d d d f |k d t j| ||d d d f |k d d S |
rZt j|||d d d f |k d t j| ||d d d f |k d d S t j|||d d d f |k |d d d f |k @ d t j| ||d d d f |k |d d d f |k @ d d S )Nr(   )r)   r:   )dk_ptrsdv_ptrsdkdvr\   r]   r   r   r   r   r   r   r   r	   _bwd_store_dk_dvL  s    $ $48r   
ATOMIC_ADDc            >      C   s  |sdn| | | | } | t d| }!| | t d| }"t d|}#t d|}$||!d d d f | |$d d d f   }%||"d d d f | |$d d d f   }&||"d d d f | |$d d d f   }'||!d d d f | |$d d d f   }(||!d d d f | |$d d d f   })|dkr||" }*n|dkr||!d d d f | |"d d d f   }*t j||gt jd}+t j||gt jd},| |kr||"d d d f | |$d d d f   }-||"d d d f | |$d d d f   }.t|.|-|,|+|"|$|||||d d S ||@ r5|rt |&}/t |'}0nyt j|&|$d d d f |k dd}/t j|'|$d d d f |k dd}0nX|rYt j|&|"d d d f |k dd}/t j|'|"d d d f |k dd}0n4t j|&|"d d d f |k |$d d d f |k @ dd}/t j|'|"d d d f |k |$d d d f |k @ dd}0t ||}1t| |1| |D ]y}2t |2|}2|2|# }3||@ rt |%}4n.|rt j|%|3d d d f |k dd}4nt j|%|3d d d f |k |$d d d f |k @ dd}4t j	|4|/dd	}5|st 
|"d d d f |k |5td
}5|rt 
|3d d d f |"d d d f k|5td
}5|dkrt   |dkrI|r2t |*t j}6nt j|*|"|k ddt j}6|6d d d f }6n2|dkr{||@ r]t |*t j}6nt j|*|3d d d f |k |"d d d f |k @ ddt j}6|5| |6 }5||@ st   t |	|3 }7|dkrt |5| |7d d d f  }8nt |5|7d d d f  }8||@ rt |(}9nt j|(|3d d d f |k |$d d d f |k @ dd}9|+t j	|8|9j|9dd7 }+||@ st   t j	|9|0dd	}:|st   t |
|3 };|8|:|;d d d f   | |4j}<|,t j	|<|4dd7 },||@ s*t   |s||@ rJt j|)dd}=|=t 	|<|/7 }=t j|)|=dd n|rxt j|)|3d d d f |k ddd}=|=t 	|<|/7 }=t j|)|=|3d d d f |k dd nt j|)|3d d d f |k |$d d d f |k @ ddd}=|=t 	|<|/7 }=t j|)|=|3d d d f |k |$d d d f |k @ dd n@t 	|<|/}=||@ rt |)|= n.|rt j|)|=|3d d d f |k d nt j|)|=|3d d d f |k |$d d d f |k @ d |)|| 7 })|%|| 7 }%|(|| 7 }(|dkr|*|| 7 }*q||"d d d f | |$d d d f   }-||"d d d f | |$d d d f   }.t|.|-|,|+|"|$|||||d d S )Nr   r   r   r   r   r    r!   Tr$   r&   r'   )trans_a
evict_last)eviction_policy)r"   r#   r   )r"   r   r(   )r)   r+   r,   r-   r   r/   cdivr1   r2   r3   r4   r.   debug_barrierr5   r8   r   r:   
atomic_add)>rh   r<   r=   r>   r?   rw   DQDKDVLSEDrC   rF   rI   rL   rO   r{   
stride_dqm
stride_dkn
stride_dvnr   r   r   r   r   r   r   r   r   r   r   r   begin_moffs_qmr\   r[   r]   r^   r_   r`   do_ptrsdq_ptrsra   r   r   r   r   ri   rp   num_block_mrW   offs_m_currrf   rj   rk   rc   rm   r}   dpDidsdqr   r   r	   _bwd_kernel_one_col_blockl  sX  $(((((
(
((

 " "**
"&".



&

 
&
$
&&
"&
((
r   c                    s    fddS )Nc                    s   |     S )N)zero_)nargsnamer   r	   r
   z  s    zinit_to_zero.<locals>.<lambda>r   r   r   r   r	   init_to_zeroy  s   r      F)r   r   SEQUENCE_PARALLEL   r   r   )	num_warps
num_stagespre_hookT)rU   rV   r   r   r   )configskeyc                 C   r   r   r   r   r   r   r	   r
     r   c                 C   r   r   r   r   r   r   r	   r
     r   c                 C   r   r   r   r   r   r   r	   r
     r   r   c3           8      C   s  t d}3|3|# }4|3|# }5| |4| |5|  7 } ||4| |5|  7 }||4| |5|  7 }||4| |5|  7 }||4| |5|  7 }||4| |5|  7 }||4|  |5|!  7 }|*dkra||4| |5|  7 }|	|3|& 7 }	||3|& 7 }|-st |%|2}6td|6D ](}7t|7| |||||||||	|
||||||||"|$|%|'fd|*|+|,|.|/|0|1|2d	 qzd S t d}7t|7| |||||||||	|
||||||||"|$|%|'fd|*|+|,|.|/|0|1|2d	 d S )Nr   r'   r   F)	r   r   r   r   r   r   r   r   r   T)r)   r*   r   r1   r   )8r<   r=   r>   r?   rw   r   r   r   r   r   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   ry   rz   r{   
stride_dqb
stride_dqhr   
stride_dkb
stride_dkhr   
stride_dvb
stride_dvhr   rS   r   r   rT   r   rU   rV   r   r   r   r   r   r   r   r   r   rX   rY   rZ   num_block_nrh   r   r   r	   _bwd_kernel}  s   
T

$r   c                    s  | j \ }|j \}}}}|j  ||fksJ |j  ||fks$J |dks,J d| j|j  kr<|jksAJ d J d| jtjtjfv sNJ d| jrW|jrW|jsYJ |padt| }|d u}	d}
|	r|j| jtjfv suJ |jszJ |	 dksJ |
dd	kr| }|j d
d  d	|fkrd}
n|j d
d  |fkrd}
ntd| |}|	r|
d|
d	|
d
fnd}td d }tj |f| jtjd}tj |f| jtjd}t| }tt|d}d}|dkrdnd} fdd}t| g | |||||||| 
d| 
d
| 
d	|
d|
d
|
d	|
d|
d
|
d	||
d|
d
|
d	|||d |d |
||R |||d	d |||fS )Nr   z5FlashAttention only support head dimensions up to 128z#All tensors must have the same typezOnly support fp16 and bf16      ?r'      r      r   r   GLast 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)r   r   r   r   )devicer      @   r   c                       t | d   fS Nr   tritonr   METAbatchrS   r   r   r	   r
   S      z%_flash_attn_forward.<locals>.<lambda>    )r   r   r   r   )shaper   torchfloat16bfloat16is_cudamathsqrtr.   dimstride
contiguousRuntimeErrorexpandceilemptyr   r-   
empty_liker7   r   next_power_of_2ru   )rf   ri   rp   rk   causalrC   d_r   has_bias	bias_typebias_stridesrT   lsetmpr|   r   BLOCKr   gridr   r   r	   _flash_attn_forward,  s   *
$
	
 !"
'r   c                    s  |  ddkr|  } |j\ }|j\}}}|dksJ td d }|j |fks2J | d| d  krS| d  krS| d  krSdksVJ  J | d| d  kro| d  krodksrJ  J |pzdt| }tj|tjd}t|}t	t
|d} fdd}t| || || d	| d
| d|  d	|  d
|  d||d|d |	d u}d}|r|	j|jtjfv sJ |	jsJ |	 dksJ |	 ddksJ |	jd
d  dfkrd}n|	jd
d  fkrd}ntd|	 }	|r#|	 d	|	 d|	 d
fnd} fdd}t| g ||||	| ||||||| d	| d
| d| d	| d
| d| d	| d
| d||  d	|  d
|  d| d	| d
| d| d	| d
| d| d	| d
| d||d d ||
|R   || d S )Nr   r   r   r   r   r   c                    r   r   r   r   r   r   r	   r
     r   z&_flash_attn_backward.<locals>.<lambda>r   r   )r   r   r'   r   r   r   r   r   c                    s$   | d rt | d nd  fS )Nr   r   r   r   r   )r   rS   r   r   r	   r
     s   r   )r   r   r   r   r   r   r   r   r-   r7   r   r   r   r   r.   r   r   r   r   r   copy_)r}   rf   ri   rp   r|   r   r   r   r   rk   r   rC   r   r   rT   dq_accumr~   r   r   r   r   r   r   )r   rS   r   r   r	   _flash_attn_backward~  s  H8

&	
 !"#$%&'(+,-3r   c                   @   &   e Zd ZedddZedd ZdS )FlashAttnQKVPackedFuncNFc                 C   s   | ddkr| }t|dddddf |dddddf |dddddf |||d\}}| _| |||| || _|S )a5  
        qkv: (batch, seqlen, 3, nheads, headdim)
        bias: optional, shape broadcastible to (batch, nheads, seqlen, seqlen).
            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
            ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
        r   r   Nr   r   rk   r   rC   )r   r   r   rC   save_for_backwardr   )ctxqkvrk   r   rC   r|   r   r   r   r	   forward  s   	zFlashAttnQKVPackedFunc.forwardc                 C   s   | j \}}}}| jd rJ dt U t|}t||d d d d df |d d d d df |d d d d df |||d d d d df |d d d d df |d d d d df || j| jd W d    n1 slw   Y  |d d d fS )Nr   1FlashAttention does not support bias gradient yetr   r   r   saved_tensorsneeds_input_gradr   inference_moder   r   r   rC   )r   r}   r   r|   r   rk   dqkvr   r   r	   backward  s(   

zFlashAttnQKVPackedFunc.backwardNFN__name__
__module____qualname__staticmethodr   r   r   r   r   r	   r     s
    r   c                   @   r   )FlashAttnKVPackedFuncNFc                 C   sr   dd ||fD \}}t ||dddddf |dddddf |||d\}}| _| ||||| || _|S )an  
        q: (batch, seqlen_q, nheads, headdim)
        kv: (batch, seqlen_k, 2, nheads, headdim)
        bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
            ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
        c                 S   &   g | ]}| d dkr|n| qS r   r   r   r   .0xr   r   r	   
<listcomp>4     & z1FlashAttnKVPackedFunc.forward.<locals>.<listcomp>Nr   r   r   r   rC   r   r   )r   rf   kvrk   r   rC   r|   r   r   r   r	   r   *  s   
0zFlashAttnKVPackedFunc.forwardc           	      C   s   | j \}}}}}t| jdkr| jd rJ dt H t|}t|}t|||d d d d df |d d d d df ||||d d d d df |d d d d df || j| jd W d    n1 sgw   Y  ||d d d fS )N   r   r   r   r   r   )	r   lenr   r   r   r   r   r   rC   )	r   r}   rf   r  r|   r   rk   r   dkvr   r   r	   r   <  s,   


zFlashAttnKVPackedFunc.backwardr   r   r   r   r   r	   r   )  
    r   c                   @   r   )FlashAttnFuncNFc           	      C   sT   dd |||fD \}}}t ||||||d\}}| _| |||||| || _|S )aw  
        q: (batch_size, seqlen_q, nheads, headdim)
        k, v: (batch_size, seqlen_k, nheads, headdim)
        bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
            ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
        c                 S   r   r   r   r   r   r   r	   r   e  r  z)FlashAttnFunc.forward.<locals>.<listcomp>r   r  )	r   rf   ri   rp   rk   r   rC   r|   r   r   r   r	   r   [  s   
zFlashAttnFunc.forwardc                 C   s   | j \}}}}}}| jd rJ dt ) t|}t|}	t|}
t||||||||	|
|| j| jd W d    n1 sBw   Y  ||	|
d d d fS )Nr  r   r   r   )r   r}   rf   ri   rp   r|   r   rk   r   r   r   r   r   r	   r   m  s,   



zFlashAttnFunc.backwardr   r   r   r   r   r	   r  Z  r  r  r   )__doc__r   r   r   triton.languagelanguager)   
heuristicsjit	constexprru   r   r   r   r   autotuneConfigr   r   r   autogradFunctionr   applyflash_attn_qkvpacked_funcr   flash_attn_kvpacked_funcr  flash_attn_funcr   r   r   r	   <module>   s    ) !"#$%& ^,	
   

+,-./0123 
S
w1.
.