o
    i3                     @   sp   d dl Z d dlZd dlZd dlZd dlZd dlZd dlm  mZ dd Z	dd Z
dd Zedkr6e  dS dS )	    Nc                 C   s   | dkrdS d| d   > S )N   )
bit_length)x r   O/home/ubuntu/vllm_env/lib/python3.10/site-packages/hopper/benchmark_split_kv.pyround_up_to_power_of_2
   s   r   c                 O   sP   t j  tdD ]	}| |i | q	tjd| ||dd}|d}|j}|S )N   zfn(*args, **kwargs))fnargskwargs)stmtglobals   )torchcudasynchronizerange	benchmarkTimertimeitmean)r	   r
   r   _tmeasurementavg_timer   r   r   r      s   


r   c            +      C   sp  t jt j j} d}d}d}t j}d}t d dg}g }|t	dgg ddg t
dd	 |D }t
d
d	 |D }	|D ]s\}
}}}|| dksPJ td|
 d td| d| d| d|  || }|| }t j||	||fd|d}t j||	||fd|d}|du rtdddddddddddddd |D ]\}}}|| | | d  }|| | | d  }t|| }d!}|| }|| t||  }t j||||fd|d}t j|t jdd"d | }t j|g| t jdd"}ttj||||||d#d$ d$ }ttj||||||ddd%	d$ d$ }ttj||||||ddd%	d$ d$ }|r]d}td&} td|D ]o}!ttj||||||d|!d%	d$ d$ }"tj||||||d|!d%}#tj||||||ddd%}$|#|$  
  }%|#|$    }&t|%st|&s|%d'ks|&d(krtd)|! d*|% d+|&  |"| k r|"} |!}q?d}'td&}(td|D ]o}!ttj||||||d|!d%	d$ d$ }"tj||||||d|!d%}#tj||||||ddd%}$|#|$  
  }%|#|$    }&t|%st|&s|%d'ks|&d(krtd,|! d*|% d+|&  |"|(k r(|"}(|!}'q||' |  })||( }*|*d-kr]ttj||||||ddd%	d$ d$ }ttj||||||d|'d%	d$ d$ }(|du rtd.| d/| d0| d1|d2d3|(d2d4|d2d5|' d6||( d2d7|)d2d8|| d9 d2 |du rt|d|d|d|d:|d;|| d<|| d9 d: qqAd S )=N   Tr   *   )zLlama-3.1-70B@         i   )r      r      c                 s   s    | ]\}}}|V  qd S Nr   ).0r   reqsr   r   r   	<genexpr>P       zmain.<locals>.<genexpr>c                 s   s    | ]\}}}|V  qd S r"   r   )r#   seqlenr   r   r   r   r%   Q   r&   r   z***zQHEADS:z
, KVHEADS:z
, HEADDIM:z, TP:r   )devicedtypeFCONTEXTz<9BSZz<5QLENz<6FA2z<10FA3RATIOz<7zGB/sr    r   )r)   r(   )qk_cachev_cachecache_seqlenscache_batch_idxcausalg     @@)r0   r1   r2   r3   r4   r5   pack_gqa
num_splitsinfgMb`?g-C6?z"Numerical error too high: Splits: z, Max: z, Mean: z(Numerical error too high (gqa): Splits: g?zCONTEXT:z, BSZ:z, QLEN:z, FA2:z.2fz, FA3 SPLIT MANUAL:z, FA3:z, FA3 NUM SPLITS:z, RATIO:z, EFF:z, GB/s:gMbP?z<10.2fz<9.2fz<7.2f)r   r   get_device_propertiescurrent_devicemulti_processor_countbfloat16manual_seedextend	itertoolsproductmaxprintrandnr   mathceilrandpermint32tensorr   
flash_attnflash_attn_with_kvcacheflash_attn_interfacefloatr   absitemr   isnan)+num_sms
max_splitscheck_all_splitsr5   r)   	tp_degreemodel_configsall_batch_configs
num_cachescache_seqlen
model_namenheads_q	nheads_kvheaddimr1   r2   context_seqlennum_requestsquery_seqlenbytes_kvbytes_qblockHblockMblockM_div_Hnum_work_tilesr0   
cache_idxsr3   fa2_time_heuristicfa3_time_one_splitfa3_time_gqa_heuristicfa3_fastest_num_splitsfa3_fastest_splitk_timer7   r   out0out1max_diff	mean_difffa3_fastest_num_splits_gqafa3_fastest_splitk_time_gqa
efficiencyheuristic_ratior   r   r   main#   s  

 2



,


,





 rs   __main__)r   rI   rK   r?   timerD   torch.utils.benchmarkutilsr   r   r   rs   __name__r   r   r   r   <module>   s      )
