o
    ¸iÚ  ã                   @   sl  d dl Z d dlZd dlZd dlZd dlZd dlm  mZ d dl	m
Z
 d dlmZmZ zd dlmZ W n ey>   dZY nw d dlmZmZ d dlmZmZmZ d dlmZmZmZ d dlmZmZ e  d	d
¡dkZe  dd
¡dkZ e  dd
¡dkZ!e  dd
¡dkZ"e  dd
¡dkZ#e  dd
¡dkZ$e  dd
¡dkZ%e  dd
¡dkZ&e  dd
¡dkp´ej' (d¡d  dk Z)e  dd
¡dkZ*e  dd
¡dkZ+e  dd
¡dkZ,e  dd
¡dkZ-e  dd
¡dkZ.g e*sãdgng  e+sêdgng  e,sñdgng  e-sødgng  e.sÿdgng  Z/ej0 1d ej2ge&sej3gng  e)sej4gng  ¡ej0 1d!g d"¢¡ej0 1d#d$g¡ej0 1d%d$g¡ej0 1d&d'ge$s>d(gng  ¡ej0 1d)d$ge#sMd*gng  ¡ej0 1d+d$d*g¡ej0 1d,d$g¡ej0 1d-e/¡ej0 1d.g d/¢¡d0d1„ ƒƒƒƒƒƒƒƒƒƒZ5ej0 1d ej2ge&s‰ej3gng  e)s’ej4gng  ¡ej0 1d!g d"¢¡ej0 1d#d$g¡ej0 1d%d$g¡ej0 1d&d'ge$s·d(gng  ¡ej0 1d)d$ge#sÆd*gng  ¡ej0 1d+d$d*g¡ej0 1d2d$d*g¡ej0 1d-e/¡ej0 1d.g d3¢¡d4d5„ ƒƒƒƒƒƒƒƒƒƒZ6ej0 1d ej2ge)sej4gng  ¡ej0 1d!g d"¢¡ej0 1d6d$ge"sd*gng  ¡ej0 1d7d8d9ge#s*d:gng  ¡ej0 1d;e"s8d*d$gnd*g¡ej0 1d<d$d*g¡ej0 1d=e"sNd$d*gnd$g¡ej0 1d>e"saedurag d?¢nd'g¡ej0 1d@dge!sqg dA¢ng  ¡ej0 1dBd$d*g¡ej0 1dCd$d*g¡ej0 1dDd$d*g¡ej0 1d-dg¡ej0 1d.g dE¢¡dFdG„ ƒƒƒƒƒƒƒƒƒƒƒƒƒƒZ7dHdI„ Z8ej0 1d ej2g¡ej0 1d+d$d*g¡ej0 1d-dg¡ej0 1d.dJg¡dKdL„ ƒƒƒƒZ9ej0 1d ej2g¡ej0 1d+d$d*g¡ej0 1d-g dM¢¡ej0 1d.g dN¢¡dOdP„ ƒƒƒƒZ:dQdR„ Z;ej0 1d ej<ej3ej2g¡ej0 1d-g dS¢¡ej0 1dTg dU¢¡ej0 1dVg dW¢¡dXdY„ ƒƒƒƒZ=d]d[d\„Z>dS )^é    N)Úparse_schema)Ú	rearrangeÚrepeat)Úapply_rotary_emb)Ú	pad_inputÚunpad_input)Úattention_refÚgenerate_qkvÚgenerate_random_padding_mask)Úflash_attn_funcÚflash_attn_varlen_funcÚflash_attn_combine)Úflash_attn_with_kvcacheÚget_scheduler_metadataÚ FLASH_ATTENTION_DISABLE_BACKWARDÚFALSEÚTRUEÚFLASH_ATTENTION_DISABLE_SPLITÚFLASH_ATTENTION_DISABLE_PAGEDKVÚ FLASH_ATTENTION_DISABLE_APPENDKVÚFLASH_ATTENTION_DISABLE_LOCALÚFLASH_ATTENTION_DISABLE_SOFTCAPÚFLASH_ATTENTION_DISABLE_PACKGQAÚFLASH_ATTENTION_DISABLE_FP16ÚFLASH_ATTENTION_DISABLE_FP8Úcudaé	   ÚFLASH_ATTENTION_DISABLE_HDIM64ÚFLASH_ATTENTION_DISABLE_HDIM96ÚFLASH_ATTENTION_DISABLE_HDIM128ÚFLASH_ATTENTION_DISABLE_HDIM192ÚFLASH_ATTENTION_DISABLE_HDIM256é@   é`   é€   éÀ   é   ÚdtypeÚmha_type)ÚmhaÚmqaÚgqaÚhas_qvFÚdeterministicÚsoftcapç        g      .@ÚlocalTÚcausalÚ
V_colmajorÚdzseqlen_q,seqlen_k)©é   r5   ©r"   r$   )r$   r%   ©r&   r&   ©éï   r5   ©é  é   ©éq   éË   )r>   r$   ©r$   éÙ   ©r>   éÓ   ©él   r&   ©r&   é   )é€  r&   ©é€  r$   ©rG   r&   ©é   rM   ©éÿ  rM   ©rM   rO   )é   rQ   )é€  rR   c           5         sL  |r|d dksˆt jkrt d¡ d‰t j d¡ |dkr dnd‰ d}|	d	kr*|n|	d
kr0dnd‰ˆt jkr:t jnˆ}|dkrH|dkrHd|gn|dkrQdd|gn|g}ˆt jkr\|g}tskt  d|d d¡ 	¡ dgndg}t
 ||¡D ].\}}t jˆ | ||ˆ|d}|dkrŽ|| d }| ˆ¡ |¡ ¡ }t jˆ |ˆ|ˆ|d ˆ¡ |¡ ¡ }t jˆ |ˆ|ˆ|d ˆ¡ |¡ ¡ }|rÒt jˆ | ||ˆ|d ˆ¡ |¡}nd }|sØdnt  d|d¡ ¡ }ˆt jkr÷‡ ‡‡fdd„tdƒD ƒ\}}}nd\}}}‡fdd„|||fD ƒ\}}}|r| ¡  ˆ¡ ¡ nd }|r*tt| ¡ dƒ ¡ dƒ ¡ }t|||d d ||||||||d\}}t|||d d ||||||||dd ˆt jkrUˆnd d!\}} d|d" d" |  ¡  ¡  	¡  }!|dkrrdnd}"td#||  ¡  ¡  	¡ › ƒ td$||  ¡  ¡  	¡ › ƒ ts™dd gndg}#ts£ddgndg}$t
 |#|$¡D ]O\}%}&t||||||||||||%|&d%}'td&|'|  ¡  ¡  	¡ › ƒ td'|'|  ¡  ¡  	¡ › ƒ |'|  ¡  ¡  	¡ |"||  ¡  ¡  	¡  |! ksúJ ‚q¬ts£ˆt jkr£|s£|s£|dks£|dks£t  |'¡}(|( ¡ |' ¡   d(¡  dd¡})t j! "|'|||f|(¡\}*}+}t j! "||||f|(¡\},}-}.t j! "||||f|(¡\}/}0}1td)|*|,  ¡  ¡  	¡ › ƒ td*|+|-  ¡  ¡  	¡ › ƒ td+||.  ¡  ¡  	¡ › ƒ td,|*|,  ¡  ¡  	¡ › ƒ td-|+|-  ¡  ¡  	¡ › ƒ td.||.  ¡  ¡  	¡ › ƒ td/|/|,  ¡  ¡  	¡ › ƒ td0|0|-  ¡  ¡  	¡ › ƒ td1|1|.  ¡  ¡  	¡ › ƒ td2|/|,  ¡  ¡  	¡ › ƒ td3|0|-  ¡  ¡  	¡ › ƒ td4|1|.  ¡  ¡  	¡ › ƒ d|,d" d" |,  ¡  ¡  	¡  |dkrdnd5 }2|*|,  ¡  ¡  	¡ |"|/|,  ¡  ¡  	¡  |2 ks;J ‚d|-d" d" |-  ¡  ¡  	¡  |dkrQdnd5 }3|+|-  ¡  ¡  	¡ |"|0|-  ¡  ¡  	¡  |3 ksoJ ‚d|.d" d" |.  ¡  ¡  	¡  |dkr…dnd5 }4||.  ¡  ¡  	¡ |"|1|.  ¡  ¡  	¡  |4 ks£J ‚qtd S )6Né   r   zQV_colmajor requires seqlen_k to be a multiple of 16 and dtype to be float8_e4m3fnr   é   r   é   é   r)   r+   r5   r$   r%   r"   r&   rG   ©r5   ©Údevicer'   r/   é   ©éÿÿÿÿr\   ©rU   c                    ó$   g | ]}t jˆ ˆˆt jd d ‘qS ©rX   rU   ©ÚtorchÚrandÚfloat32©Ú.0Ú_©Ú
batch_sizerY   Ú	nheads_kv© úL/home/ubuntu/vllm_env/lib/python3.10/site-packages/hopper/test_flash_attn.pyÚ
<listcomp>   ó   $ z*test_flash_attn_output.<locals>.<listcomp>r<   ©NNNc                    ó   g | ]}|  ¡  ˆ ¡ ¡ ‘qS rj   ©ÚdetachÚtoÚrequires_grad_©re   Úx©r'   rj   rk   rl      ó    zb s h d -> b h d szb h d s -> b s h d©r1   ÚqvÚ	q_descaleÚ	k_descaleÚ	v_descaleÚwindow_sizeÚattention_chunkr.   FT©r1   ry   rz   r{   r|   r}   r~   r.   ÚupcastÚreorder_opsÚintermediate_dtypeç333333Ó?úPytorch max diff: úPytorch mean diff: )
r1   ry   rz   r{   r|   r}   r~   r.   Úpack_gqaÚ
num_splitsúOutput max diff: úOutput mean diff: r\   údQ max diff: údK max diff: údV max diff: údQ mean diff: údK mean diff: údV mean diff: údQ Pytorch max diff: údK Pytorch max diff: údV Pytorch max diff: údQ Pytorch mean diff: údK Pytorch mean diff: údV Pytorch mean diff: ça2U0*©3?)#ra   Úfloat8_e4m3fnÚpytestÚskipÚrandomÚmanual_seedÚbfloat16ÚDISABLE_LOCALÚrandintÚitemÚ	itertoolsÚproductÚrandnrr   rs   ÚtolistÚrangerq   r   Ú
contiguousr   ÚabsÚmaxÚprintÚmeanÚDISABLE_PACKGQAÚDISABLE_SPLITr   ÚDISABLE_BACKWARDÚ
randn_likeÚfloatÚsumÚ	transposeÚautogradÚgrad)5Úseqlen_qÚseqlen_kr3   r1   r0   r.   r2   r-   r,   r(   r'   ÚnheadsÚ	dtype_refÚdv_valsÚattention_chunk_valsÚdvr~   Úq_refÚk_refÚv_refÚqv_refr}   rz   r{   r|   ÚqÚkÚvry   Úout_refÚattn_refÚout_ptÚattn_ptÚfwd_atolÚrtolÚpack_gqa_valsÚnum_splits_valsr†   r‡   ÚoutÚgÚdo_oÚdqÚdkÚdq_refÚdk_refÚdv_refÚdq_ptÚdk_ptÚdv_ptÚdq_atolÚdk_atolÚdv_atolrj   ©rh   rY   r'   ri   rk   Útest_flash_attn_output5   sê   7
0
$&&$
"

õ
ò õ:ÿÿþ


262626€ ärØ   Úadd_unused_qkv)r4   )r5   r<   )rU   r5   )iÿ  r5   )r<   i  r6   ©r$   r$   r7   r=   r@   rB   rD   rF   )i3  r&   rI   rK   rL   rN   rP   ©rT   rT   c           N         sh  d‰t j | | | t|ƒd  t|ƒ ¡ | dkrdnd‰ d}|	dkr&|n|	dkr,dnd‰ˆt jkr6t jnˆ}|d	krD|d
krDd	|gn|dkrMdd|gn|g}ˆt jkrX|g}| |krktskt  d|d d¡ ¡ dgndg}t	 
||¡D ]¼\}}t jˆ | ||ˆ|d}|dkr’|| d  ¡  ¡ }| ˆ¡ |¡ ¡ }t jˆ |ˆ|ˆ|d ˆ¡ |¡ ¡ }t jˆ |ˆ|ˆ|d ˆ¡ |¡ ¡ }|rÖt jˆ | ||ˆ|d ˆ¡ |¡}nd }|sÜdnt  d|d¡}ˆt jkrù‡ ‡‡fdd„tdƒD ƒ\}}}nd\}}}dd„ |||fD ƒ\}}}|r| ¡ nd }t| ˆ ˆddd}t|ˆ ˆddd}dd„ }|||| ˆ |jƒ\}} ||||ˆ |jƒ\}}!t||||||d| |!d 	\}"}#}$}%}&}'}(})}*}+}}}}},}-}.‡fd!d„|"|#|$fD ƒ\}"}#}$t|||||||||||||d"\}/}0t|||||||||||||ddˆt jkr˜ˆnd d#\}1}2td$|1|/  ¡  ¡  ¡ › ƒ td%|1|/  ¡  ¡  ¡ › ƒ | d urÆt| d&ƒ}3d|/d' d' |/  ¡  ¡  ¡  }4|dkrÝdnd}5tsæddgndg}6tsðddgndg}7t	 
|6|7¡D ]b\}8}9t|"|#|$|&|'|*|+|(|)||%||||||d(}:|,|:ƒ};| d ur!|; |3d¡ td)|;|/  ¡  ¡  ¡ › ƒ td*|;|/  ¡  ¡  ¡ › ƒ |;|/  ¡  ¡  ¡ |5|1|/  ¡  ¡  ¡  |4 ksZJ ‚qùts1ˆt jkr1|s1|dks1|dks1t  |:¡}<|< ¡ |: ¡    d+¡ !d+d,¡}=t j" #|:|"|#|$f|<¡\}>}?}@|-|>ƒ}A|.|?ƒ}B|.|@ƒ}|!d ur¶t|!d&ƒ}C|B |Cd¡ | |Cd¡ | d urÁ|A |3d¡ |,|<ƒ}Dt j" #|/|||f|D¡\}E}F}Gt j" #|1|||f|D¡\}H}I}Jtd-|A|E  ¡  ¡  ¡ › ƒ td.|B|F  ¡  ¡  ¡ › ƒ td/||G  ¡  ¡  ¡ › ƒ td0|A|E  ¡  ¡  ¡ › ƒ td1|B|F  ¡  ¡  ¡ › ƒ td2||G  ¡  ¡  ¡ › ƒ td3|H|E  ¡  ¡  ¡ › ƒ td4|I|F  ¡  ¡  ¡ › ƒ td5|J|G  ¡  ¡  ¡ › ƒ td6|H|E  ¡  ¡  ¡ › ƒ td7|I|F  ¡  ¡  ¡ › ƒ td8|J|G  ¡  ¡  ¡ › ƒ d|Ed' d' |E  ¡  ¡  ¡  |dkr«dnd9 }K|A|E  ¡  ¡  ¡ |5|H|E  ¡  ¡  ¡  |K ksÉJ ‚d|Fd' d' |F  ¡  ¡  ¡  |dkrßdnd9 }L|B|F  ¡  ¡  ¡ |5|I|F  ¡  ¡  ¡  |L ksýJ ‚d|Gd' d' |G  ¡  ¡  ¡  |dkrdnd9 }M||G  ¡  ¡  ¡ |5|J|G  ¡  ¡  ¡  |M ks1J ‚qtd S ):Nr   rU   rT   r   rV   r)   r+   r5   r$   r%   r"   r&   rG   rW   r   rX   r/   rZ   r[   r]   c                    r^   r_   r`   rd   rg   rj   rk   rl   o  rm   z1test_flash_attn_varlen_output.<locals>.<listcomp>r<   rn   c                 S   s   g | ]}|  ¡  ¡ ‘qS rj   )rq   rs   rt   rj   rj   rk   rl   r  s    rš   F)ÚmodeÚzero_lengthsTc                 S   sH   |rt |||ƒ}t | |¡}t t | |¡|¡}||fS | }d }||fS ©N)r
   ra   Úlogical_andÚlogical_xorÚ
logical_or)Úpadding_maskÚ
add_unusedÚmax_seq_lenÚbsrY   Úanother_maskÚ	attn_maskÚunused_maskrj   rj   rk   Ú_gen_unused_masks{  s   ÿþz8test_flash_attn_varlen_output.<locals>._gen_unused_masks)ry   ÚkvpackedÚquery_unused_maskÚkey_unused_maskc                    ro   rj   rp   rt   rv   rj   rk   rl   ¢  rw   rx   r   r„   r…   zb s -> b s 1 1rƒ   )
Ú	seqused_qÚ	seqused_kr1   ry   rz   r{   r|   r}   r~   r.   rˆ   r‰   r\   éþÿÿÿrŠ   r‹   rŒ   r   rŽ   r   r   r‘   r’   r“   r”   r•   r–   )$ra   rš   r›   Úintr—   rœ   r   rž   rŸ   r    r¡   r¢   rq   rs   rr   r¤   r
   rY   r	   r   r¨   r¦   r§   r©   r   rª   r«   r   Úmasked_fill_r¬   r­   r®   r¯   r°   r±   r²   )Nr³   r´   r3   rÙ   r1   r0   r.   r-   r,   r(   r'   rµ   r¶   r·   r¸   r¹   r~   rº   r»   r¼   r½   r}   rz   r{   r|   r¾   r¿   rÀ   ry   Úquery_padding_maskÚkey_padding_maskré   rë   rì   Úq_unpadÚk_unpadÚv_unpadÚqv_unpadÚcu_seqlens_qÚcu_seqlens_krí   rî   Úmax_seqlen_qÚmax_seqlen_kÚoutput_pad_fnÚ	dq_pad_fnÚ	dk_pad_fnrÁ   rÂ   rÃ   rÄ   Úq_zero_maskingrÅ   rÆ   rÇ   rÈ   r†   r‡   Ú	out_unpadrÉ   Úg_unpadrË   Údq_unpadÚdk_unpadÚdv_unpadrÌ   rÍ   Úk_zero_maskingrÊ   rÎ   rÏ   rÐ   rÑ   rÒ   rÓ   rÔ   rÕ   rÖ   rj   r×   rk   Útest_flash_attn_varlen_output  sL  5(0
,&&$
"

ÿ
ÿÿÿÿî
õ
ò

 ð
:ÿÿ





262626€ ¢r  Únew_kvzcausal,local)FF)TF)FTÚseqlen_new_eq_seqlen_qÚhas_rotary_seqlensÚrotary_interleavedÚrotary_fraction)r/   g      à?g      ð?Ú	page_size)r5   rZ   r$   Úhas_leftpadÚhas_batch_idxÚvarlen_q))r5   r$   )r5   iS  )r<   rM   )r"   i   )r"   r&   ©r<   r;   )r"   rT   )rS   i N  rÚ   rF   )rT   iù  c           U         s  |d ur|| dkrt  ¡  ˆ|kr|rt  ¡  |s"|dkr"t  ¡  |dkr,|	r,t  ¡  d‰tj d¡ d‰ |s:ˆ nˆ d }d}t t|| ƒd ¡d }|dkrS|n|d	krYd
nd}|| dkscJ ‚|tjkrktj	n|}|dkry|dkryd|gn|dkr‚dd|gn|g}|tjkr|g}|s‘|r t
s t d
|d d¡ ¡ dgndg}t ||¡D ]]\}}|dkoµ|dk}tjˆ ˆ||ˆ|d |¡ |¡}|rÛtjˆ ˆ||ˆ|d |¡ |¡}nd }|rtˆˆ ˆdd}t||ƒ^}‰}}} ‡ ‡‡fdd„}!|rt|dƒˆ nd }"n
d }|}|}"d\}}|sdnt d|d¡}#|
r"ˆn
t d
ˆd
 d¡ ¡ }$d }%d }&|rxtjˆ |$||ˆ|d |¡ |¡}'tjˆ |$||ˆ|d |¡ |¡}(|rrt|$ˆ ˆdd}&t|'|&ƒ^})}*}%} t|(|&ƒ^}+} n|'|(})}+nd\}'}(})}+|d u r¨tj||||ˆ|d |¡ |¡},tj||||ˆ|d |¡ |¡}-d }.nt||||||ˆ||ƒ	\},}-}.}/}0}1tj|rÁdnd
|rØ||sÌ|rÓ|d
krÓˆn|$ d
 n|d
 ˆ ftjˆd‰|röt ‡‡fdd„tˆ ƒD ƒ¡}2nd }2|r	tj|tjˆdd ˆ … }3nd }3ttj|ˆdd ƒ}4tˆd!ƒ}5|s"|4|5k }6n|r,|&jd"d#d$n|$}7|4|5|7 k }6|rFt |6|4|2 d"¡ d"|¡k¡}6|	sKˆnˆd }8|dkrµtj|d u r]|n|1| |d ˆdd tj }9t  |9¡j|d% |¡ |¡}:t !|9¡j|d% |¡ |¡};|s|ršt"||:|;|8|d&}<ntt"t|d'ƒ|:|;|8|d&d(ˆd)}<t"|'|:|;|8|d&}=n	d\}:};||'}<}=|sÃ|,n|,|3  #¡ }>|sÎ|-n|-|3  #¡ }?|r t |5|4k|4|5|7 k ¡}@t|=dƒ}At|(dƒ}B|rø|A|* }A|B|* }B|A|>|@< |B|?|@< t$|>d*|| d+}Ct$|?d*|| d+}Dt%|<|C|D||6|||#||2d,
\}E}Ft%|<|C|D||6|||#|d-d#|2|tjkr7|nd d.\}G}F| |¡}|rJ| |¡nd }|, |¡},|- |¡}-|d ur`|/ |¡nd }/|d url|0 |¡nd }0|'d urx|' |¡nd }'|(d ur„|( |¡nd }(|)d ur|) |¡nd })|+d urœ|+ |¡nd }+|d ur¨| |¡nd }|r·|d ur·|" |¡nd }"|:d urÃ|: |¡nd }:|;d urÏ|; |¡nd };|d u rÚ|, #¡ n|/ #¡ }H|d u rç|- #¡ n|0 #¡ }It&sòd
dgnd
g}Jd-d#g}Kt |J|K¡D ]\}L}M|Mr$t'ˆ |r|nˆ||||ˆ|j(|||%|2|$|||#||Ld/}Nnd }Nt|Ms,d
ndƒD ]Ô}F|d u rB|, )|H¡ |- )|I¡ n
|/ )|H¡ |0 )|I¡ t*|sR|n||d u rZ|,n|/|d u rb|-n|0|ri|sk|'n|)|rr|st|(n|+fi d0|s}|n|"“d1|:“d2|;“d3ˆ“d4|3“d5|2“d6|.“d7|“d8|%“d9|“d:|8“d;|“d<|#“d=|“d>|“d?|N“d@|L“dAd#“Ž^}O}P} |r¾|!|Oƒ}Ot+dB|O|E  ,¡  -¡  ¡ › ƒ t+dC|O|E  ,¡  .¡  ¡ › ƒ t+dD|G|E  ,¡  -¡  ¡ › ƒ t+dE|G|E  ,¡  .¡  ¡ › ƒ |r»|d u r!|s
|, |¡n|, |¡|3 }Q|s|- |¡n|- |¡|3 }Rn@t|/ |¡|s+|.n|.|3  /¡  dFˆ dGd d …d |…f  |¡}Qt|0 |¡|sK|.n|.|3  /¡  dFˆ dGd d …d |…f  |¡}R|> |¡ |¡}>|? |¡ |¡}?|tjurt 0|R|?¡s€J ‚ntj1|R|?dHdHdIsJ ‚|dkrœt 0|Q|>¡s›J ‚n|tjur¯tj1|Q|>dHdHdIs®J ‚ntj1|Q|>dJdJdIs»J ‚|tjkrÃdKnd}S|O|E  ,¡  -¡  ¡ |S|G|E  ,¡  -¡  ¡  dL ksàJ ‚|tjkrèdndM}T|O|E  ,¡  .¡  ¡ |T|G|E  ,¡  .¡  ¡  ksJ ‚q/qÿq©d S )NNr   r/   r   é   rU   rV   rS   r)   r*   r5   r<   r$   r%   r"   r&   rG   rW   rX   rš   )rÜ   c                    s   t | ˆˆ ˆƒS rÞ   )r   )Úoutput_unpad)rh   Ú	indices_qr³   rj   rk   Ú<lambda>©  s    ÿz)test_flash_attn_kvcache.<locals>.<lambda>zb s ... -> (b s) ...)NNr[   r]   )NNNN©r'   rY   c              	      sL   g | ]"}ˆ |   ¡ d krtjd ˆ |   ¡ dtjˆdntjdtjˆd‘qS )r   rW   r  r5   )rŸ   ra   rž   Úint32Úzeros)re   Úi)Úcache_seqlensrY   rj   rk   rl   ß  s    ÿÿÿz+test_flash_attn_kvcache.<locals>.<listcomp>)rY   zs -> 1 szb -> b 1r\   T)Úkeepdimsrv   )Úseqlen_offsetsÚinterleavedzb s h d -> b 1 (s h) dzb 1 (s h) d -> b s h d)Úszb s h d -> b s (h g) d)rÊ   )r1   ry   r}   r~   Úkey_leftpadF)r1   ry   r}   r~   r€   r   r  r‚   )
Ú	headdim_vrø   Úcu_seqlens_k_newÚcache_leftpadÚmax_seqlen_k_newr  r1   r}   r~   r‡   ry   Ú
rotary_cosÚ
rotary_sinr  Úcache_batch_idxr!  Ú
page_tablerø   r   rú   Úrotary_seqlensr1   r}   r~   r
  Úscheduler_metadatar‡   Úreturn_softmax_lserˆ   r‰   r„   r…   ú8(b nblocks) block_size ... -> b (nblocks block_size) ...©Úbgü©ñÒMbP?)rÆ   Úatolgš™™™™™¹?rZ   çñhãˆµøä>g      ø?)2r˜   r™   ra   rš   r›   ÚmathÚfloorrð   r—   rœ   r   rž   rŸ   r    r¡   r¢   rr   r
   r   r   Ú_generate_block_kvcacher  Úcatr¤   ÚrandpermÚaranger¯   rß   Ú	unsqueezeÚexpandrb   ÚpiÚcosÚsinr   Úcloner   r   r«   r   r'   Úcopy_r   r¨   r¦   r§   r©   ÚflattenÚequalÚallclose)Ur³   r´   r3   r  r  r  r  r  r
  r	  r  r1   r0   r  r(   r'   Úbatch_size_cacherµ   Ú
rotary_dimÚnheads_kr¶   r·   r¸   r¹   r~   r,   r¾   ry   rò   rô   rø   rú   Úrestrü   r÷   r}   Ú
seqlen_newr   Úkey_new_padding_maskr¿   rÀ   rõ   Ú	indices_krö   Úk_cacheÚv_cacher&  Úk_cache_pagedÚv_cache_pagedÚ
num_blocksr!  r%  r4  Úcache_seqlens_expandedró   Úk_new_seqlensr'  Úangler8  r9  Úq_roÚk_roÚk_cache_refÚv_cache_refÚupdate_maskÚk_to_updateÚv_to_updateÚk_cache_repÚv_cache_reprÁ   rf   rÃ   Úk_cache_savedÚv_cache_savedrÈ   Úprecompute_metadata_valsr‡   Úprecompute_metadatar(  rÉ   ÚlseÚk_cache_selectÚv_cache_selectÚmultÚ	mult_meanrj   )rh   r  rY   r  r³   rk   Útest_flash_attn_kvcacheA  sN  E0
,"$ ""
""	ÿù*ÿö
þÿ

ÿ
ýûúÿ	
ÿû÷
ÿ
ÿ


ö
ó



ú	



û€úùø	÷
öõôóòñðïîíìëê
é
ÿÿýüüýüü
66¥ó Çr`  c	                 C   sÒ   t  | | ¡| d }	tj|	|||||d |¡ |¡}
tj|	|||||d |¡ |¡}ttj|	tj|dd|d}t|
| ¡  d|dd d …d | …f }t|| ¡  d|dd d …d | …f }||||
||	fS )Nr<   rX   r  z(b nblocks) -> b nblocksr+  r*  )	r/  Úceilra   r¢   rr   r   r3  r  r<  )r´   r  rh   rA  r3   r¹   rY   r'   r¶   rJ  rH  rI  r&  rF  rG  rj   rj   rk   r1  Â  s>   ÿþÿþý
ýü
ýür1  )r"   i    c                 C   s€   d}t j d¡ d}d}d}t j|| ||||d}	t j||||||d}
t j||||||d}tdƒD ]
}t|	|
||d q3d S )	Nr   r   rU   rS   rZ   rX   éd   ©r1   )ra   rš   r›   r¢   r¤   r   )r³   r´   r3   r1   r'   rY   rh   rµ   ri   r¾   r¿   rÀ   rf   rj   rj   rk   Útest_flash_attn_clusterÜ  s   ÿrd  )é    é(   é;   r"   éP   r#   éo   r$   é    r%   éà   r&   ))r5   r9   r8   r  r:   )rM   r$   )éa   rl  rÚ   )éÈ   rm  r7   )é  rn  )rH   rH   )rG   rG   )é   ro  rL   rÛ   c              	   C   s”  d}t j d¡ t jdt j|d}d}d}t j|| ||||dd}	t j||||||dd}
t j||||||dd}t j d	¡ t|	|
||d
}t  |¡}t j 	||	|
|f|¡\}}}d|d d |  
¡  ¡  ¡  }tdƒD ]Y}t j d	¡ t|	|
||d
}t  ||¡s†J ‚t j 	||	|
|f|¡\}}}t j|||d}|s³td|› d|›d||  
¡  ¡  ¡ › ƒ t  ||¡s»J ‚t  ||¡sÃJ ‚|sÇJ ‚qnd S )Nr   r   l       F r  é<   rZ   T)rY   r'   Úrequires_gradé*   rc  rU   rƒ   iè  )r-  zIter z, dq_atol = z, dQ max diff: )ra   rš   r›   ÚemptyÚuint8r¢   r   r­   r±   r²   r¦   r§   rŸ   r¤   r=  r>  r¨   )r³   r´   r3   r1   r'   rY   Údummyrh   rµ   r¾   r¿   rÀ   Úout0rÊ   Údq0Údk0Údv0rÔ   r  rÉ   rÌ   rÍ   r¹   Údq_equalrj   rj   rk   Útest_flash_attn_race_conditionõ  s2   
 *
ór{  c                 C   s\   t j|dd}t  || ¡}t  t  |¡t  |¡B t  |¡|¡}| d¡|   d¡}||fS )z|
    out_partial: (num_splits, batch_size, seqlen, nheads, d)
    lse_partial: (num_splits, batch_size, nheads, seqlen)
    r   )Údimr\   )	ra   Ú	logsumexpÚexpÚwhereÚisinfÚisnanÚ
zeros_liker5  r¯   )Úout_partialÚlse_partialr[  ÚscalerÉ   rj   rj   rk   Úattention_combine_ref3  s
   $r†  )r"   r#   r$   r%   r&   rG   Úseqlen)
r5   rU   r<   re  r"   r&   r>   rE   rJ   rM   r‡   )	r5   rU   r<   r  é   re  é7   rl  é…   c              	   C   sö  t rt ¡  d}tj d¡ d}d}tj| d |||||tjd dd¡d | … }tj| ||d ||tjd dd	¡d d …d d …d d …d |…f }t	d
ƒ || d d …d |d …f< t
|||d\}	}
t||ƒ\}}| |¡}td|
|  ¡  ¡  ¡ › ƒ td|
|  ¡  ¡  ¡ › ƒ td|	|  ¡  ¡  ¡ › ƒ td|	|  ¡  ¡  ¡ › ƒ td||  ¡  ¡  ¡ › ƒ td||  ¡  ¡  ¡ › ƒ tj|
|dddsÔJ ‚d}|	|  ¡  ¡  ¡ |||  ¡  ¡  ¡  ks÷tj|	|dddsùJ ‚d S d S )Nr   r5   r  rS   rU   rX   r<   r\   rï   Úinf)Ú	out_dtypezLSE max diff: zLSE mean diff: rˆ   r‰   r„   r…   r.  )r-  rÆ   )r«   r˜   r™   ra   rš   r›   r¢   rc   r°   r®   r   r†  rr   r¨   r¦   r§   rŸ   r©   r>  )r‡   r‡  r3   r'   rY   rh   rµ   rƒ  r„  rÉ   r[  rÁ   Úlse_refrÃ   Úmultiplerj   rj   rk   Útest_flash_attn_combine?  s*   .@"
Jr  Úreturnc                   C   st   t jjjjj tdƒ¡sJ ‚t jjjjj tdƒ¡sJ ‚t jjj	jj tdƒ¡s*J ‚t jjj
jj tdƒ¡s8J ‚d S )Na[  flash_attn_3::fwd(Tensor q, Tensor k, Tensor v, Tensor(k_new!)? k_new=None, Tensor(v_new!)? v_new=None, Tensor? q_v=None, Tensor(out!)? out=None, Tensor? cu_seqlens_q=None, Tensor? cu_seqlens_k=None, Tensor? cu_seqlens_k_new=None, Tensor? seqused_q=None, Tensor? seqused_k=None, int? max_seqlen_q=None, int? max_seqlen_k=None, Tensor? page_table=None, Tensor? kv_batch_idx=None, Tensor? leftpad_k=None, Tensor? rotary_cos=None, Tensor? rotary_sin=None, Tensor? seqlens_rotary=None, Tensor? q_descale=None, Tensor? k_descale=None, Tensor? v_descale=None, float? softmax_scale=None, bool is_causal=False, int window_size_left=-1, int window_size_right=-1, int attention_chunk=0, float softcap=0., bool is_rotary_interleaved=False, Tensor? scheduler_metadata=None, int num_splits=0, bool? pack_gqa=None, int sm_margin=0) -> (Tensor(out!), Tensor, Tensor, Tensor)a(  flash_attn_3::bwd(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor(dq!)? dq=None, Tensor(dk!)? dk=None, Tensor(dv!)? dv=None, Tensor? cu_seqlens_q=None, Tensor? cu_seqlens_k=None, Tensor? seqused_q=None, Tensor? seqused_k=None, int? max_seqlen_q=None, int? max_seqlen_k=None, float? softmax_scale=None, bool is_causal=False, int window_size_left=-1, int window_size_right=-1, float softcap=0., bool deterministic=False, int sm_margin=0) -> (Tensor(dq!), Tensor(dk!), Tensor(dv!), Tensor, Tensor, Tensor, Tensor, Tensor)zflash_attn_3::fwd_combine(Tensor out_partial, Tensor lse_partial, Tensor(out!)? out=None, ScalarType? out_dtype=None) -> (Tensor(out!), Tensor)a(  flash_attn_3::get_scheduler_metadata(int batch_size, int max_seqlen_q, int max_seqlen_k, int num_heads, int num_heads_k, int headdim, int headdim_v, ScalarType qkv_dtype, Tensor seqused_k, Tensor? cu_seqlens_q=None, Tensor? cu_seqlens_k=None, Tensor? cu_seqlens_k_new=None, Tensor? seqused_q=None, Tensor? leftpad_k=None, int? page_size=None, int max_seqlen_k_new=0, bool is_causal=False, int window_size_left=-1, int window_size_right=-1, int attention_chunk=0, bool has_softcap=False, int num_splits=0, bool? pack_gqa=None, int sm_margin=0) -> Tensor)ra   ÚopsÚflash_attn_3ÚfwdÚdefaultÚ_schemaÚis_backward_compatible_withr   ÚbwdÚfwd_combiner   rj   rj   rj   rk   Útest_flash3_bw_compatibilitym  s   
ÿ
ÿ
ÿÿr™  )r  N)?Úosr/  r    r˜   ra   Útorch.nn.functionalÚnnÚ
functionalÚFÚtorch._Cr   Úeinopsr   r   Úflash_attn.layers.rotaryr   ÚImportErrorÚpaddingr   r   Ú	test_utilr   r	   r
   Úflash_attn_interfacer   r   r   r   r   Úgetenvr¬   r«   ÚDISABLE_PAGEDKVÚDISABLE_APPENDKVr   ÚDISABLE_SOFTCAPrª   ÚDISABLE_FP16r   Úget_device_capabilityÚDISABLE_FP8ÚDISABLE_HDIM64ÚDISABLE_HDIM96ÚDISABLE_HDIM128ÚDISABLE_HDIM192ÚDISABLE_HDIM256ÚCOMPILED_HDIMSÚmarkÚparametrizerœ   Úfloat16r—   rØ   r  r`  r1  rd  r{  r†  rc   r  r™  rj   rj   rj   rk   Ú<module>   sì    ÿ$ÿþýüûÿ4þ 44þ "t & þ$  Pÿþþ"#