o
    i?                     @   s  d dl Z d dlZd dlm  mZ d dlmZmZ d dl	m
Z
 d dlmZmZ z
d dlmZmZ W n ey>   d\ZZY nw zd dlmZ W n eyR   dZY nw dZejZd	Zd
ZdZdZg dZedd D ]^Zedkrsenedkr~eed d
nedkrd
ndZedv rdndZedkrdnedkrdneZ edkoe dkZ!edv rdndZ"edkoe"dkoeduZ#e$d  dZ%dZ&e'de(  de de de de  de"  d d! d"D D ]Zej)ege% eej*d#Z&d Z+ej,e%eeeeed$Z-zHej,e%eee eed$Z.ej,e%eeeeed$Z/e"durAee" d ks!J d%d! e/e.fD \Z/Z.e
ej0e%e e" eej1d#d&ee" d'Z2ndZ2W n ej3yO   Y qw e!r^ej,e%eee eed$ndZ4ee%eeeeee&e-je e"d(d)Z5d*d+ Z6e 7d
 esee6d
d,d-Z8n%ej9:  ej9;ej9<  ee6d,d.Z8W d   n	1 sw   Y  e#ree&ee e eZ=e!rej>e-e4gd/d0ne-Z?ej>e.e/gd/d0Z@d1d+ ZAe 7d
 eseeAd
d,d-ZBn%ej9:  ej9;ej9<  eeAd,d.ZBW d   n	1 sw   Y  e&du ree% ne&C D ZEeEe ee   d e-F d  e!r-e4F d nd  e-F e  e d  ZGeeE e ee e!rFdnd
   d ZHeGd2 d3 ZIeHd4 d3 ZJeeIeJZKe'd5e d6eshd7nd8 d9e8d: d;d<eGd= e8d>  d?d@eHdA e8d>  d?dB e#re'd5e dCesd7nd8 d9eBd: d;d<eGd= eBd>  d?d@eHdA eBd>  d?dB e'dDeHeG d; e'dEeKd?dF qqjdS )G    N)do_benchdo_bench_cudagraph)	rearrange)flash_attn_with_kvcacheget_scheduler_metadata)flash_mla_with_kvcacheget_mla_metadata)NN)pytorch_profilercudai          F)mhagqamqamlagla      r   r      r      )r   r   @   i   r      
z, nheads_q = z, nheads_kv = z, headdim = z, headdim_v = z, page_size = c                 C   s   g | ]}|d  qS )i    ).0sr   r   Q/home/ubuntu/vllm_env/lib/python3.10/site-packages/hopper/benchmark_mla_decode.py
<listcomp>>   s    r   )r   r      r          r   )devicedtype)r"   r!   c                 C   s   g | ]	}t |d tdqS )zb (n p) h d -> (b n) p h d)p)r   	page_size)r   xr   r   r   r   H       z(b s) -> b s)r   T)	headdim_vr$   causalc                   C   s   t tttttttdtd	S )NT)cache_seqlens
num_splitsqv
page_tabler(   scheduler_metadata)	r   qk_cachev_cacher)   r*   r+   r,   r-   r   r   r   r   <lambda>X   r&   r1   
   )warmuprep)r4   )dimc                   C   s   t tttttgtR ddiS )Nr(   T)r   q_concatkv_cache_concatr,   r)   r'   mla_metadatar   r   r   r   r1   g   s    g  z_Bg    .Ag vCz	Seqlen = z
, FA3 time z w CUDA Graphz: g     @@z.1fz us, g&.>gMbP?z.0fz GB/s, g-q=z	 TFLOPS/sz, FlashMLA timezArithmetic intensity: zIdeal time: z us)Ltimetorchtorch.nn.functionalnn
functionalFtriton.testingr   r   einopsr   flash_attn_interfacer   r   	flash_mlar   r   ImportErrorflash_attn.utils.benchmarkr	   r!   bfloat16r"   seqlenseqlen_qnheads_quse_bench_cudagraphattn_variantsattn_variantmax	nheads_kvheaddimr'   has_qvr$   should_run_flashmlamanual_seed
batch_sizer)   printuppertensorintr*   randnr.   r0   r/   arangeint32r,   OutOfMemoryErrorr+   r-   fn0sleept0r
   synchronizestreamStreamr9   concatr7   r8   fn1t1sumitemtotal_seqlennumelmem_ioflopsideal_h100_time_memideal_h100_time_flopideal_h100_timer   r   r   r   <module>   s   2
0
 



J&
TT