o
    IiSL                     @   s  d dl Z d dlmZmZ d dlZd dlZd dlZd dlZd dlZd dl	Z						d(ddZ
											d)d	d
Ze jdddge jdddge jdg de jdg de jdg de jdddge jdg ddd Ze jdddge jdddge jdg de jdg de jdg de jdddge jdg ddd Ze jdejge jddge jdddge jdddge jdg d e jdg de jdg de jd!ddge jdddge jdg d"d#d$ Ze jdejge jddge jdddge jdddge jdg d e jdg de jdg de jd!ddge jdddge jdg d"d%d& Zed'kre  dS dS )*    N)	rearrangerepeatr   c                 C   s  t tj| |tjdd}tj||tjd}|d ur3t |d}t|d|jd d}t||k|| d}|d u r9|nt |dd}	|d u rG| nt |dd}
|d dk ra|||	 |
 |d	  kS |d u rkt||n|	}	t	|t
||	 |
 |d	  |	k|||	 |
 |d  k S )
Ndevicedtypezs -> s 1zb -> b 1 1 1zs -> b 1 1 sr   )bl        r      )r   torcharangelongr   shapewheresum	full_like
logical_orminimum)seqlen_qseqlen_kwindow_sizequery_padding_maskkey_padding_maskr   key_leftpadrow_idxcol_idxsksq r   L/home/ubuntu/.local/lib/python3.10/site-packages/hopper/test_attn_kvcache.pyconstruct_local_mask
   s*   	
r            FTc              	   C   s*  |r|	d df}	| j }|r|  | | } }}| jd |jd }}t|d| jd |jd  d}t|d| jd |jd  d}| jd }|sXtd| t| |}ntd| |t| }|
dkrt||
 }| }||
 }|dur|	t
| d	td
 |	d dks|	d dkrt|||	||| j|d}|	|td
 |dur|| }tj|dd|j }|	d dks|	d dkr|tj|dddd}|dur|t
| dd}dd|  }|dur|| d}n|}td||| }|dur	|	t
| dd |j|d|j|dfS )a  
    Arguments:
        q: (batch_size, seqlen_q, nheads, head_dim)
        k: (batch_size, seqlen_k, nheads_k, head_dim)
        v: (batch_size, seqlen_k, nheads_k, head_dim)
        query_padding_mask: (batch_size, seqlen_q)
        key_padding_mask: (batch_size, seqlen_k)
        attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k)
        dropout_p: float
        dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k)
        causal: whether to apply causal masking
        window_size: (int, int), left and right window size
        upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast
            output back to fp16/bf16.
        reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.)
            without changing the math. This is to estimate the numerical error from operation
            reordering.
    Output:
        output: (batch_size, seqlen_q, nheads, head_dim)
        attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout
    r   r
   zb s h d -> b s (h g) d   )gr   zbthd,bshd->bhtsNzb s -> b 1 1 sz-inf)r   )dimT)r$   keepdimr!   zb s -> b 1 s 1      ?zbhts,bshd->bthdzb s -> b s 1 1r   )r   floatr   r   r   einsummathsqrttanhmasked_fill_r   r    r   softmaxtomasked_fillall)qkvr   r   	attn_bias	dropout_pdropout_maskcausalr   softcapupcastreorder_opsr   dtype_ogr   r   dscores
local_mask	attentiondropout_scalingattention_dropoutputr   r   r   attention_ref-   sX   %
	
rD   r8   num_requestsr
      query_seqlen)r
      x   context_seqlen)i   i;  i  headdim)@         gqa_parallelznheads_kv, gqa_ratio)	r
   r
   r"         rT   r
       )rR      )rH   r
   r
      )   rF   )rH   r"   c              
   C   s   d}|}	|}
| | }t j|	|
| |fdt jd}t j|	|
| |fdt jd}t j||||fdt jd}t j|g| t jdd}t j  t||||d\}}tj	|||||dd|d\}}t j  || 
   dksnJ || 
   d	ks|J d S )
Ncudar   r   r   r8   r
   T)r2   k_cachev_cachecache_seqlensr8   
num_splitsreturn_softmax_lserO   gMbp?g-C6*?)r   randnbfloat16tensorint32r[   synchronizerD   flash_attn_interfaceflash_attn_with_kvcacheabsmaxitemmean)	nheads_kv	gqa_ratiorE   rG   rJ   rK   r8   rO   r   
num_cachescache_seqlennheads_qr^   r_   r2   r`   out_ref_out_fa3lse_fa3r   r   r   test_flash_attn_kvcache_nosplit   s@   



 rw   rT   )i@  i  i  c                 C   sb  d}|}	|}
| | }t j|	|
| |fdt jd}t j|	|
| |fdt jd}t j||||fdt jd}|t j}|t j}|t j}t j|g| t jdd}t j  t	||||d\}}t jdgt j
dd}t jdgt j
dd}t jdgt j
dd}tj|||||dd||||d\}}t j  ||    d	ksJ ||    d
ksJ d S )Nr[   r   r\   r]   r&   r
   T)r2   r^   r_   r`   r8   ra   rb   rO   	descale_q	descale_k	descale_vg{Gz?Mb`?)r   rc   rd   r/   float8_e4m3fnre   rf   r[   rg   rD   float32rh   ri   rj   rk   rl   rm   )rn   ro   rE   rG   rJ   rK   r8   rO   r   rp   rq   rr   r^   r_   r2   r`   rs   rt   rx   ry   rz   ru   rv   r   r   r   #test_flash_attn_kvcache_nosplit_fp8   sN   



 r~   r   use_heuristic_only)r
   rH      cache_seqlen_rand)	rP   )rF   r
   )r"   r"   rS   )rF   rF   rQ   )rT   	   rX   rU   c                  C   s$  d}d}|dkrd}n|}| | }|rd}nd}t j||| |fdt jd}t j||| |fdt jd}t j||||fdt jd}||
}||
}||
}t j|t jddd | }|rmt jd|d |ft jd|nt j|g| t jdd}t j	  t
j||||||dd	d
d	\}}td|d D ]}t
j|||||||d	|	|d
\}}t j	  td|| td|| td||||     td||||     td||||     td||||     |r||    dksJ ||    dksJ n||    dks&J ||    dks5J |   }|   }|   }|   }||    }||    }|tjkru|tjks||dks|J |tjkr|tjks|dksJ qd S )Nr[   rY      r
   rM   r   r\   r'   TF)	r2   r^   r_   r`   cache_batch_idxr8   ra   rb   rO   r   )
r2   r^   r_   r`   r   r8   ra   rb   rO   max_seqlen_k_hint
output-ref
output-fa3output-max-diffoutput-mean-difflse-max-difflse-mean-diff{Gz?MbP?r{   -C6?)r   rc   rd   r/   randpermrf   randintre   r[   rg   rh   ri   rangeprintrj   rk   rl   rm   r*   inf) rn   ro   rE   rG   rJ   rK   r8   r   r   rO   r   r   rp   rq   rr   
max_splitsr^   r_   r2   
cache_idxsr`   rs   lse_refiru   rv   lse_max_reflse_mean_reflse_max_fa3lse_mean_fa3lse_max_difflse_mean_diffr   r   r   test_flash_attn_kvcache_output	  s   


<



 &&r   c           #      C   sl  d}d}|dkrd}n|}| | }|rd}nd}t j||| |fdt jd}t j||| |fdt jd}t j||||fdt jd}||
}||
}||
}t j|t jddd | }|rmt jd|d |ft jd|nt j|g| t jdd}t j	  t jd	gt j
dd}t jd	gt j
dd}t jd	gt j
dd}tj||||||dd
d|||d\}}td|d D ]}tj|||||||d
|	||||d\}}t j	  td|| td|| td||||     td||||     td||||     td||||     |r;||    dks+J ||    dks:J n||    dksJJ ||    dksYJ |   }|   }|   }|   } ||    }!||    }"|tjkr|tjks|!dksJ |tjkr| tjks|"dksJ qd S )Nr[   rY   r   r
   rM   r   r\   r'   r&   TF)r2   r^   r_   r`   r   r8   ra   rb   rO   rx   ry   rz   r   )r2   r^   r_   r`   r   r8   ra   rb   rO   r   rx   ry   rz   r   r   r   r   r   r   g?r   g{Gz?r{   r   r   )r   rc   rd   r/   r   rf   r   re   r[   rg   r}   rh   ri   r   r   rj   rk   rl   rm   r*   r   )#rn   ro   rE   rG   rJ   rK   r8   r   r   rO   r   r   rp   rq   rr   r   r^   r_   r2   r   r`   rx   ry   rz   rs   r   r   ru   rv   r   r   r   r   r   r   r   r   r   "test_flash_attn_kvcache_output_fp8t  s   


<



 &&r   __main__)r   NNNN)NNNr!   NFr   r!   TFN)pytesteinopsr   r   r   
flash_attnrh   	itertoolsr*   timer    rD   markparametrizerw   r~   rd   r   r|   r   __name__mainr   r   r   r   <module>   s    
'
Z*0P
V
