o
    Ii3                     @   sp   d dl Z d dlZd dlZd dlZd dlZd dlZd dlm  mZ dd Z	dd Z
dd Zedkr6e  dS dS )	    Nc                 C   s   | dkrdS d| d   > S )N   )
bit_length)x r   M/home/ubuntu/.local/lib/python3.10/site-packages/hopper/benchmark_split_kv.pyround_up_to_power_of_2
   s   r   c                 O   sP   t j  tdD ]	}| |i | q	tjd| ||dd}|d}|j}|S )N   zfn(*args, **kwargs))fnargskwargs)stmtglobals   )torchcudasynchronizerange	benchmarkTimertimeitmean)r	   r
   r   _tmeasurementavg_timer   r   r   r      s   


r   c            *      C   sN  t jt j j} d}d}d}t j}t d dg}g }|t	g dg dg d t
dd	 |D }t
d
d	 |D }|D ]b\}	}
}}t j||||fd|d}t j||||fd|d}td|	 d td|
 d| d|  |du rtdddddddddddddd |D ]\}}}|| | | d }|| |
 | d }t|
| }d}|| }|| t||  }t j|||
|fd|d}t j|t jddd | }t j|g| t jdd}ttj||||||d d! d! }ttj||||||dd"d#	d! d! }ttj||||||dd$|d%
d! d! }|rLd$}td&}td"|D ]o} ttj||||||d| d#	d! d! }!tj||||||d| d#}"tj||||||dd"d#}#|"|#  
  }$|"|#    }%t|$st|%s|$d'ks|%d(krtd)|  d*|$ d+|%  |!|k r|!}| }q-d$}&td&}'td"|D ]o} ttj||||||d| d#	d! d! }!tj||||||d| d#}"tj||||||dd"d#}#|"|#  
  }$|"|#    }%t|$s t|%s |$d'ks |%d(krtd,|  d*|$ d+|%  |!|'k r|!}'| }&q||& |  }(||' })|)d-krLttj||||||dd$|d%
d! d! }ttj||||||d|&d#	d! d! }'|du rtd.| d/| d0| d1|d2d3|'d2d4|d2d5|& d6||' d2d7|(d2d8|| d9 d2 |du rt|d|d|d|d:|d;|| d<|| d9 d: qqAd S )=N   FT*   )zLlama-3.1-70B@         )i   i @  i   )r      r      c                 s   s    | ]\}}}|V  qd S Nr   ).0r   reqsr   r   r   	<genexpr>N       zmain.<locals>.<genexpr>c                 s   s    | ]\}}}|V  qd S r"   r   )r#   seqlenr   r   r   r   r%   O   r&   r   )devicedtypez***zQHEADS:z
, KVHEADS:z
, HEADDIM:CONTEXTz<9BSZz<5QLENz<6FA2z<10FA3RATIOz<7zGB/sr    r   )r)   r(   )qk_cachev_cachecache_seqlenscache_batch_idxcausalg     @@r   )r0   r1   r2   r3   r4   r5   gqa_parallel
num_splitsr   )	r0   r1   r2   r3   r4   r5   r6   r7   max_seqlen_k_hintinfgMb`?g-C6?z"Numerical error too high: Splits: z, Max: z, Mean: z(Numerical error too high (gqa): Splits: g?zCONTEXT:z, BSZ:z, QLEN:z, FA2:z.2fz, FA3 SPLIT MANUAL:z, FA3:z, FA3 NUM SPLITS:z, RATIO:z, EFF:z, GB/s:gMbP?z<10.2fz<9.2fz<7.2f)r   r   get_device_propertiescurrent_devicemulti_processor_countbfloat16manual_seedextend	itertoolsproductmaxrandnprintr   mathceilrandpermint32tensorr   
flash_attnflash_attn_with_kvcacheflash_attn_interfacefloatr   absitemr   isnan)*num_sms
max_splitscheck_all_splitsr5   r)   model_configsall_batch_configs
num_cachescache_seqlen
model_namenheads_q	nheads_kvheaddimr1   r2   context_seqlennum_requestsquery_seqlenbytes_kvbytes_qblockHblockMblockM_div_Hnum_work_tilesr0   
cache_idxsr3   fa2_time_heuristicfa3_time_one_splitfa3_time_gqa_heuristicfa3_fastest_num_splitsfa3_fastest_splitk_timer7   r   out0out1max_diff	mean_difffa3_fastest_num_splits_gqafa3_fastest_splitk_time_gqa
efficiencyheuristic_ratior   r   r   main#   s  

2



,


,





 rs   __main__)r   rJ   rL   r@   timerE   torch.utils.benchmarkutilsr   r   r   rs   __name__r   r   r   r   <module>   s      #
