o
    Ii                     @   s   d dl Z d dlZd dlZd dlm  mZ d dlZd dl	Z	d dl
Z
e	jddZejddd ejdedd	 ejd
edd	 ejddd ejddd e ZdddZdddZdd Zedkrge  dS dS )    NzProcess some integers.)descriptionz--causal
store_true)actionz--splits   )typedefaultz	--repeats
   z
--validatez--gqa Tc                 K   sH   |rt |d tjd| |dt d}||}|r t || ||fS )zCUse Pytorch Benchmark on the forward pass of an arbitrary function.z- Forward passzfn(**kwinputs))fnkwinputs)stmtglobalsnum_threads)print	benchmarkTimertorchget_num_threadstimeit)r
   repeatsdescverboser   tm r   G/home/ubuntu/.local/lib/python3.10/site-packages/hopper/test_kvcache.pybenchmark_fa_kv_old   s   


r   c                 O   sl   t dD ]	}| |i | q|}tj  t }t |D ]	}| |i | qtj  t }|| | S )N   )ranger   cudasynchronizetime)r
   r   argskwargs_nitersstartendr   r   r   benchmark_fa_kv"   s   

r(   c                      s  d} d}d}t j}d}d d}d}dg d}t|}|| }	g d	}
t||ks+J tfd
d|dd  D s<J ||ksBJ t fdd|
D sOJ t d t j| ||fd|d}t j| ||fd|d}t jd|| |fd|d}t j|
d gt jdd}t jdgt jdd}t j|d | |fd|d}t j|
dd  dg|	  t jdd}t j	|t jddd |d  }t
jr{tj|||||tt
jt
jdd\}}tj|||||tt
jdtt
jdd	\}}tj|||||tt
jt
jtt
jdd	\}}tj|||||tt
jt
jd}td td||    | td||     tj|||||tt
jt
jdd\}}td td||||  |j td||     td||     td||     tdt
j ttjt
j|||||tt
jt
jd	}ttjt
j|||||tt
jt
jd	}td ttjt
j|||||tt
jt
jd	}ttjt
j|||||tt
jt
jd	}td t
j|d! |d! ||  td"t
j|d! |d! ||  d S )#N@         i @  i         )i     r   )i   i (  i0  c                 3       | ]}| k V  qd S Nr   .0s)small_request_ntokensr   r   	<genexpr>K       zmain.<locals>.<genexpr>r   c                 3   r/   r0   r   r1   )cache_seqlenr   r   r5   M   r6   i:  r   )devicedtyper   )r9   r8   T)qk_cachev_cachecache_seqlenscache_batch_idxcausal
num_splitsreturn_softmax_lse)	r:   r;   r<   r=   r>   r?   r@   gqa_decodingrA   )r:   r;   r<   r=   r>   r?   r@   bigzdiff-maxz	diff-meansmalllsezlse-dif-maxfa3)r   r:   r;   r<   r=   r>   r?   r@   zfa2 zbig (split, fa3, fa2, ratio):i@B zsmall (split, fa3, fa2, ratio):)r   float16lensumallmanual_seedrandntensorint32randpermr"   validaterF   flash_attn_with_kvcacheboolr?   splitsgqafa2r   absmaxitemmeanshaper   r(   ) nheads_q	nheads_kvheaddimr9   
num_cachesntokensmax_queries_per_batchquery_seqlensnum_queriesnum_padding_queriescontext_seqlensr;   r<   q_buf_largecache_seqlen_largecache_idx_largeq_buf_smallcache_seqlens_smallcache_idxs_smallout0lse0out1_split1lse1_split1out1lse1out2out3lse_fa2time_fa3_bigtime_fa3_smalltime_fa2_bigtime_fa2_smallr   )r7   r4   r   main/   s  "














"rx   __main__)r   r	   T)r   )r   flash_attn_interfacerF   
flash_attnrU   torch.utils.benchmarkutilsr   r!   argparsemathArgumentParserparseradd_argumentint
parse_argsr"   r   r(   rx   __name__r   r   r   r   <module>   s*    

 ;
