o
    Iiۯ                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlm  mZ d dl	m
Z
mZ d dlmZ d dlmZmZ d dlmZmZmZ d dlmZmZmZmZ e ddd	kZe d
dd	kZe ddd	kZe ddd	kZe ddd	kZe ddd	kZ e ddd	kZ!e ddd	kZ"e ddd	kpej#$dd  dk Z%e ddd	kZ&e ddd	kZ'e ddd	kZ(e ddd	kZ)e ddd	kZ*g e&sdgng  e'sdgng  e(sdgng  e)sdgng  e*sdgng  Z+ej,-dej.ge"sej/gng  e%sej0gng  ej,-dg d ej,-d!d"gej,-d#d$ge sd%gng  ej,-d&d"ges*d'gng  ej,-d(d"d'gej,-d)d"gej,-d*e+ej,-d+g d,d-d. Z1ej,-dej.ge"seej/gng  e%snej0gng  ej,-dg d ej,-d!d"gej,-d#d$ge sd%gng  ej,-d&d"gesd'gng  ej,-d(d"d'gej,-d/d"d'gej,-d*e+ej,-d+g d0d1d2 Z2ej,-dej.ge%sej0gng  ej,-d3d4gesd gng  ej,-dg d ej,-d5d"gesd'gng  ej,-d6d7d8gesd9gng  ej,-d:esd'd"gnd'gej,-d;es)d"d'gnd"gej,-d<es7g d=nd$gej,-d>dgesGg d?ng  ej,-d@d"d'gej,-dAd"d'gej,-dBd"d'gej,-d*dgej,-d+g dCdDdE Z3dFdG Z4ej,-dej.gej,-d(d"d'gej,-d*dgej,-d+dHgdIdJ Z5ej,-dej.gej,-d(d"d'gej,-d*g dKej,-d+g dLdMdN Z6dOdP Z7ej,-dej8ej/ej.gej,-d*g dQej,-dRg dSej,-d3g dTdUdV Z9dS )W    N)	rearrangerepeat)apply_rotary_emb)	pad_inputunpad_input)attention_refgenerate_qkvgenerate_random_padding_mask)flash_attn_funcflash_attn_varlen_funcflash_attn_combineflash_attn_with_kvcache FLASH_ATTENTION_DISABLE_BACKWARDFALSETRUEFLASH_ATTENTION_DISABLE_SPLITFLASH_ATTENTION_DISABLE_PAGEDKV FLASH_ATTENTION_DISABLE_APPENDKVFLASH_ATTENTION_DISABLE_LOCALFLASH_ATTENTION_DISABLE_SOFTCAPFLASH_ATTENTION_DISABLE_PACKGQAFLASH_ATTENTION_DISABLE_FP16FLASH_ATTENTION_DISABLE_FP8cuda	   FLASH_ATTENTION_DISABLE_HDIM64FLASH_ATTENTION_DISABLE_HDIM96FLASH_ATTENTION_DISABLE_HDIM128FLASH_ATTENTION_DISABLE_HDIM192FLASH_ATTENTION_DISABLE_HDIM256@   `            dtypemha_type)mhamqagqadeterministicFsoftcap        g      .@localTcausal
V_colmajordzseqlen_q,seqlen_k)   r2   r    r"   )r"   r#   r$   r$      r2        q      )r;   r"   r"      r;      l   r$   r$      )  r$     r"   rD   r$      rJ     rJ   rJ   rL   )   rN   )  rO   c
           1         s  |sdnd}
|r|d dkst jkrtd dt jd |dkr&dnd d}|d	kr0|n|d
kr6dndt jkr@t jn}t j | |||d}|dkrW|| d }||	 }t j |||d|	 }t j |||d|	 }|sdnt 
d|d}t jkr fddtdD \}}}nd\}}}fdd|||fD \}}}|rtt| d d	 }t|||d d ||||||
|d\}}t|||d d ||||||
|ddt jkrnd d\}}d|d d |     }|dkrdnd}td||      td||      ts8ddgndg}tsBddgndg}t||D ]P\}} t|||||||||
||| d\}!}"td |!|      td!|!|      |!|    |||     | ksJ qKtst jkr|st |!}#|# |!  d"dd}$t j |!|||f|#\}%}&}'t j ||||f|#\}(})}*t j ||||f|#\}+},}-td#|%|(      td$|&|)      td%|'|*      td&|%|(      td'|&|)      td(|'|*      td)|+|(      td*|,|)      td+|-|*      td,|+|(      td-|,|)      td.|-|*      tsBt jkrD|sFd|(d d |(     |dkrdnd/ }.|%|(    ||+|(     |. ksJ d|)d d |)     |dkrdnd/ }/|&|)    ||,|)     |/ ksJ d|*d d |*     |dkr$dnd/ }0|'|*    ||-|*     |0 ksHJ d S d S d S d S )0Nr      zQV_colmajor requires seqlen_k to be a multiple of 16 and dtype to be float8_e4m3fnr      r         r'   r)   r2   devicer%   r,      rX   rR   c                    $   g | ]}t j t jd d qS rT   rR   torchrandfloat32.0_
batch_sizerU   	nheads_kv J/home/ubuntu/.local/lib/python3.10/site-packages/hopper/test_flash_attn.py
<listcomp>      $ z*test_flash_attn_output.<locals>.<listcomp>r9   NNNc                       g | ]}|    qS rf   detachtorequires_grad_ra   xr%   rf   rg   rh          zb s h d -> b h d szb h d s -> b s h d)r.   	q_descale	k_descale	v_descalewindow_sizesink_token_lengthr+   FT)
r.   rt   ru   rv   rw   rx   r+   upcastreorder_opsintermediate_dtype333333?Pytorch max diff: Pytorch mean diff: )	r.   rt   ru   rv   rw   rx   r+   pack_gqa
num_splitsOutput max diff: Output mean diff: rX   dQ max diff: dK max diff: dV max diff: dQ mean diff: dK mean diff: dV mean diff: dQ Pytorch max diff: dK Pytorch max diff: dV Pytorch max diff: dQ Pytorch mean diff: dK Pytorch mean diff: dV Pytorch mean diff: a2U0*3?)!r]   float8_e4m3fnpytestskiprandommanual_seedbfloat16randnrn   ro   randintranger   rm   
contiguousr   absmaxitemprintmeanDISABLE_PACKGQADISABLE_SPLIT	itertoolsproductr
   DISABLE_BACKWARD
randn_likefloatsum	transposeautogradgrad)1seqlen_qseqlen_kr0   r.   r-   r+   r/   r*   r&   r%   rx   nheads	dtype_refq_refk_refv_refrw   rt   ru   rv   qkvout_refattn_refout_ptattn_ptfwd_atolrtolpack_gqa_valsnum_splits_valsr   r   outlsegdo_odqdkdvdq_refdk_refdv_refdq_ptdk_ptdv_ptdq_atoldk_atoldv_atolrf   rd   rU   r%   re   rg   test_flash_attn_output0   s   6
&&
"


 
:
262626r   add_unused_qkv)r1   )r2   r9   )rR   r2   )i  r2   )r9   i  r3   r"   r"   r4   r:   r=   r?   rA   rC   )i3  r$   rF   rH   rI   rK   rM   rQ   rQ   c
           H         s  dt j| | | t|d  t|  | dkrdnd d}
|dkr&|
n|dkr,dndt jkr6t jn}t j | |
||d	}|d
krQ|| d   }|		| }t j |||d			| }t j |||d			| }|sdnt 
d|d}t jkr fddtdD \}}}nd\}}}dd |||fD \}}}t|  ddd}t| ddd}dd }||||  |j\}}|||| |j\}}t|||||d||d\}}}}}} }!}"}#}}}}$}%}&fdd|||fD \}}}t|||||||||||d\}'}(t|||||||||||ddt jkr1nd d\})}*td|)|'      td|)|'      |d ur_t|d }+d|'d! d! |'     },|d
krvdnd}-tsddgndg}.tsddgndg}/t|.|/D ]b\}0}1t|||||| |!|"|#||||||d\}2}3|$|2}4|d ur|4|+d
 td"|4|'      td#|4|'      |4|'    |-|)|'     |, ksJ qts!t jkr!t |2}5|5 |2  d$ d$d%}6t j!"|2|||f|5\}7}8}9|%|7}:|&|8};|&|9}<|d urBt|d }=|;|=d
 |<|=d
 |d urM|:|+d
 |$|5}>t j!"|'|||f|>\}?}@}At j!"|)|||f|>\}B}C}Dtd&|:|?      td'|;|@      td(|<|A      td)|:|?      td*|;|@      td+|<|A      td,|B|?      td-|C|@      td.|D|A      td/|B|?      td0|C|@      td1|D|A      tsƈt jkrd|?d! d! |?     |dkr@dnd2 }E|:|?    |-|B|?     |E ks^J d|@d! d! |@     |dkrtdnd2 }F|;|@    |-|C|@     |F ksJ d|Ad! d! |A     |dkrdnd2 }G|<|A    |-|D|A     |G ksJ d S d S d S )3Nr   rR   rQ   r   rS   r'   r)   r2   rT   r,   rV   rW   r   rY   c                    rZ   r[   r\   r`   rc   rf   rg   rh   M  ri   z1test_flash_attn_varlen_output.<locals>.<listcomp>r9   rj   c                 S   s   g | ]}|   qS rf   )rm   ro   rp   rf   rf   rg   rh   P  s    r   F)modezero_lengthsTc                 S   sH   |rt |||}t| |}tt| ||}||fS | }d }||fS N)r	   r]   logical_andlogical_xor
logical_or)padding_mask
add_unusedmax_seq_lenbsrU   another_mask	attn_maskunused_maskrf   rf   rg   _gen_unused_masksX  s   z8test_flash_attn_varlen_output.<locals>._gen_unused_masks)kvpackedquery_unused_maskkey_unused_maskc                    rk   rf   rl   rp   rr   rf   rg   rh   }  rs   )r.   rt   ru   rv   rw   r+   )	r.   rt   ru   rv   rw   r+   ry   rz   r{   r}   r~   zb s -> b s 1 1r|   r   r   rX   r   r   r   r   r   r   r   r   r   r   r   r   r   )#r]   r   r   intr   r   r   rm   ro   rn   r   r   r	   rU   r   r   r   r   r   r   r   r   r   r   r   r   r   masked_fill_r   r   r   r   r   r   r   )Hr   r   r0   r   r.   r-   r+   r*   r&   r%   r   r   r   r   r   rw   rt   ru   rv   r   r   r   query_padding_maskkey_padding_maskr   r   r   q_unpadk_unpadv_unpadcu_seqlens_qcu_seqlens_k	seqused_q	seqused_kmax_seqlen_qmax_seqlen_koutput_pad_fn	dq_pad_fn	dk_pad_fnr   r   r   r   q_zero_maskingr   r   r   r   r   r   	out_unpadr   r   g_unpadr   dq_unpaddk_unpaddv_unpadr   r   r   k_zero_maskingr   r   r   r   r   r   r   r   r   r   rf   r   rg   test_flash_attn_varlen_output  s  3(&&
"






 

:



262626r   r   r2   new_kvzcausal,local)FF)TF)FTseqlen_new_eq_seqlen_qrotary_interleavedrotary_fraction)r,   g      ?g      ?	page_size)r2   rV   r"   has_leftpadhas_batch_idxvarlen_q))r2   r"   )r2   iS  )r9   rJ   )r    i   )r    r$   r9   r8   )r    rQ   )rP   i N  )r2      )rP   r  r   rC   )rQ   i  c           F         sp	  |d ur|| dkrt   |kr|rt   |s"|dkr"t   dtjd d |s0 n d }d}tt|| d d }|dkrI|n|d	krOd
nd}|| dksYJ |tjkratj	n|}tj
 |||d||}|rt dd}t||^}}}} fdd}nd }|}d\}}|sdntd|d}|	rn
td
d
 d }d }d }|rtj
 ||||d||} tj
 ||||d||}!|rt| dd}t| |^}"}#}}t|!|^}$}n| |!}"}$nd\} }!}"}$|d u r,tj
|||||d||}%tj
|||||d||}&d }'nt||||||\}%}&}'}(})}*tj|rCdnd
|rZ||
sN|rU|d
krUn| d
 n|d
  ftjd|rxtfddt D }+nd }+|rtj|tjdd   },nd },ttj|dd}-td}.|s|-|.k }/n|r|jdddn|}0|-|.|0 k }/|rt|/|-|+dd|k}/|dkr.tj|d u r|n|*| |d dd tj }1t|1j|d||}2t|1j|d||}3|
s	|rt||2|3|d }4nttt|d!|2|3|d d"d#}4t| |2|3|d }5n	d\}2}3|| }4}5|s<|%n|%|,   }6|sG|&n|&|,   }7|ryt|.|-k|-|.|0 k }8t|5d$}9t|!d$}:|rq|9|# }9|:|# }:|9|6|8< |:|7|8< t!|6d%|| d&};t!|7d%|| d&}<t"|4|;|<||/|
||+d'\}=}>t"|4|;|<||/|
|d(d|+|tjkr|nd d)\}?}>||}|r||nd }|%|}%|&|}&|d ur|(|nd }(|d ur|)|nd })| d ur| |nd } |!d ur|!|nd }!|"d ur|"|nd }"|$d ur|$|nd }$|2d ur|2|nd }2|3d ur)|3|nd }3t#|s1|n||d u r9|%n|(|d u rA|&n|)|rH|sJ| n|"|rQ|sS|!n|$f|2|3|,|+|'||||
|||dd*^}@}A}|rq||@}@t$d+|@|= % &    t$d,|@|= % '    t$d-|?|= % &    t$d.|?|= % '    |rn|d u r|s|%|n|%||, }B|s|&|n|&||, }Cn@t|(||s|'n|'|, (  d/ d0d d d |f |}Bt|)||s|'n|'|, (  d/ d0d d d |f |}C|6||}6|7||}7|tjur4t)|C|7s3J ntj*|C|7d1d1d2s@J |dkrOt)|B|6sNJ n|tjurbtj*|B|6d1d1d2saJ ntj*|B|6d3d3d2snJ |tjkrvd4nd}D|@|= % &  |D|?|= % &   d5 ksJ |tjkrdnd6}E|@|= % '  |E|?|= % '   ksJ d S )7Nr   r,   r      rR   rS   rP   r'   r(   r2   r9   rT   r   )r   c                    s   t |  S r   )r   )output_unpad)rd   	indices_qr   rf   rg   <lambda>m  s    z)test_flash_attn_kvcache.<locals>.<lambda>)NNrW   rY   r2   )NNNNr%   rU   c              	      sL   g | ]"} |   d krtjd  |   dtjdntjdtjdqS )r   r	  r
  r2   )r   r]   r   int32zeros)ra   i)cache_seqlensrU   rf   rg   rh     s    z+test_flash_attn_kvcache.<locals>.<listcomp>)rU   zs -> 1 szb -> b 1rX   T)keepdimsrr   )seqlen_offsetsinterleavedzb s h d -> b 1 (s h) dzb 1 (s h) d -> b s h d)szb s ... -> (b s) ...zb s h d -> b s (h g) d)r   )r.   rw   key_leftpadF)r.   rw   ry   rz   r  r{   )
rotary_cos
rotary_sinr  cache_batch_idxcache_leftpad
page_tabler   cu_seqlens_k_newr   r.   rw   r   r   return_softmax_lser   r   r}   r~   8(b nblocks) block_size ... -> b (nblocks block_size) ...bgMbP?)r   atolg?rV   h㈵>g      ?)+r   r   r]   r   r   mathfloorr   r   r   r   rn   r	   r   r   r   _generate_block_kvcacher  catr   randpermr   aranger   r   	unsqueezeexpandr^   picossinr   cloner   r   r   r   r   r   r   flattenequalallclose)Fr   r   r0   r  r  r   r   r   r   r   r.   r-   r   r&   r   r%   batch_size_cacher   
rotary_dimnheads_kr   r   r   r   r   r   restr   rw   
seqlen_newr  key_new_padding_maskr   r   r   	indices_kr   k_cachev_cacher  k_cache_pagedv_cache_paged
num_blocksr  r  r%  cache_seqlens_expandedr   k_new_seqlensangler)  r*  q_rok_rok_cache_refv_cache_refupdate_maskk_to_updatev_to_updatek_cache_repv_cache_repr   rb   r   r   r   k_cache_selectv_cache_selectmult	mult_meanrf   )rd   r  rU   r  r   rg   test_flash_attn_kvcache  s  E"""
""	*



	












66rK  c                 C   s   t | | | d }tj||||||d}tj||||||d}	ttj|tj|dd|d}
t||
  d|dd d d | f }t|	|
  d|dd d d | f }|||
||	|fS )Nr9   rT   r
  z(b nblocks) -> b nblocksr  r  )r   ceilr]   r   r   r$  r  r,  )r   r   rd   r1  r0   rU   r%   r:  r8  r9  r  r6  r7  rf   rf   rg   r"  a  s6   

r"  )r    i    c                 C   s   d}t jd d}d}d}t j|| ||||d}	t j||||||d}
t j||||||d}tdD ]
}t|	|
||d q3d S )	Nr   r   rR   rP   rV   rT   d   r.   )r]   r   r   r   r   r
   )r   r   r0   r.   r%   rU   rd   r   re   r   r   r   rb   rf   rf   rg   test_flash_attn_cluster{  s   rO  )    (   ;   r    P   r!   o   r"      r#      r$   ))r2   r6   r5   r  r7   )rJ   r"   )a   rW  r   )   rX  r4   )  rY  )rE   rE   )rD   rD   )   rZ  rI   r   c              	   C   s  d}t jd t jdt j|d}d}d}t j|| ||||dd}	t j||||||dd}
t j||||||dd}t jd	 t|	|
||d
\}}t |}t j	||	|
|f|\}}}d|d d | 
    }tdD ]c}t jd	 t|	|
||d
\}}t ||sJ t ||sJ t j	||	|
|f|\}}}t j|||d}|std| d|d|| 
     t ||sJ t ||sJ |sJ qpd S )Nr   r   l       F r
  <   rV   T)rU   r%   requires_grad*   rN  rR   r|   i  )r  zIter z, dq_atol = z, dQ max diff: )r]   r   r   emptyuint8r   r
   r   r   r   r   r   r   r   r-  r.  r   )r   r   r0   r.   r%   rU   dummyrd   r   r   r   r   out0lse0r   dq0dk0dv0r   r  r   r   r   r   r   dq_equalrf   rf   rg   test_flash_attn_race_condition  s4   
 *
rg  c                 C   s\   t j|dd}t || }t t |t |B t ||}|d|  d}||fS )z|
    out_partial: (num_splits, batch_size, seqlen, nheads, d)
    lse_partial: (num_splits, batch_size, nheads, seqlen)
    r   )dimrX   )	r]   	logsumexpexpwhereisinfisnan
zeros_liker&  r   )out_partiallse_partialr   scaler   rf   rf   rg   attention_combine_ref  s
   $rr  )r    r!   r"   r#   r$   seqlen)r2   rR   r9   rP  r    r$   r;   rB   rG   rJ   rQ   )	r2   rR   r9   r     rP  7   rW     c              	   C   s  t rt  d}tjd d}d}tj| d |||||tjdddd |  }tj| ||d ||tjddd	d d d d d d d |f }t	d
 || d d d |d f< t
|||d\}	}
t||\}}||}td|
|      td|
|      td|	|      td|	|      td||      td||      tj|
|dddsJ d}|	|    |||     kstj|	|dddsJ d S d S )Nr   r2   r  rP   rR   rT   r9   rX   r   inf)	out_dtypezLSE max diff: zLSE mean diff: r   r   r}   r~   r  )r  r   )r   r   r   r]   r   r   r   r_   r   r   r   rr  rn   r   r   r   r   r   r.  )r   rs  r0   r%   rU   rd   r   ro  rp  r   r   r   lse_refr   multiplerf   rf   rg   test_flash_attn_combine  s*   .@"
Jr{  ):osr   r   r   r]   torch.nn.functionalnn
functionalFeinopsr   r   flash_attn.layers.rotaryr   paddingr   r   	test_utilr   r   r	   flash_attn_interfacer
   r   r   r   getenvr   r   DISABLE_PAGEDKVDISABLE_APPENDKVDISABLE_LOCALDISABLE_SOFTCAPr   DISABLE_FP16r   get_device_capabilityDISABLE_FP8DISABLE_HDIM64DISABLE_HDIM96DISABLE_HDIM128DISABLE_HDIM192DISABLE_HDIM256COMPILED_HDIMSmarkparametrizer   float16r   r   r   rK  r"  rO  rg  rr  r_   r{  rf   rf   rf   rg   <module>   s    $0 4$ "]  $  "