o
    پi                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
 d dlmZ d dlmZ ej Zdejdejdejd	ejd
df
ddZdejdejdejd	ejd
df
ddZdejdejdejd	ejd
df
ddZe 	d1dejdejdejd	ejded
dfddZe	dd ed dD dgdZe	ddgdgdZe	g ddgdZe	g ddgdZg d Zg d!Zg d"Z e!e "eeeeZ#ej$%ej$j&g d#e#d$eee d%d&i d'	d(e'd)e'd*e'd+e'd$e(f
d,d-Z)e*d.kre)j+d/d0 dS dS )2    Nrmsnorm)DEFAULT_DEVICEDEFAULT_DTYPEget_benchmark_rangerun_benchmarkfused_inplace_qknorm)get_current_device_stream_fastqkq_weightk_weightreturnc                 C   s   | j d }| d|} |d|}t }t| t| || d tjt t|||d W d    n1 s8w   Y  |t d S )Nout)	shapeviewr
   
alt_streamwait_streamr   torchcudastream)r   r   r   r   head_dimcurrent_stream r   \/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/benchmark/bench_qknorm.pysglang_aot_qknorm   s   

r   c                 C   s   t | ||| d S Nr   )r   r   r   r   r   r   r   sglang_jit_qknorm'   s   r    c                 C   s,   ddl m} || || d ||||d d S )Nr   r   r   )
flashinferr   )r   r   r   r   r   r   r   r   flashinfer_qknorm1   s   r"   ư>epsc           	      C   s   |   djddd}|  djddd}||  }||  }| |   | |    ||  | |    d S )N   r   T)dimkeepdim)floatpowmeanrsqrtcopy_)	r   r   r   r   r$   q_meank_meanq_normk_normr   r   r   torch_impl_qknorm=   s   r1   c                 C   s   g | ]}d | qS )r%   r   ).0nr   r   r   
<listcomp>N   s    r4         )
full_rangeci_range      )   r%   r9   r:   r;   )      i   i   r<   aotjitfir   )zSGL AOT KernelzSGL JIT Kernel
FlashInferPyTorch))orange-)bluez--)greenz-.)red:)r   GQAnum_kv_heads
batch_sizeprovideruszqknorm-performance)	x_namesx_valsline_arg	line_vals
line_namesstylesylabel	plot_nameargsr   rJ   rK   rL   c                    s   || }t j||| fttdt j||| fttdt j| ttdt j| ttdttttd  fdd}t|S )N)dtypedevicer>   c                      s     S r   r   r   FN_MAPr   r   rM   r   r   r   r   <lambda>   s    zbenchmark.<locals>.<lambda>)	r   randnr   r   r   r    r"   r1   r   )r   rJ   rK   rL   rM   num_qo_headsfnr   rZ   r   	benchmarke   s    r`   __main__T)
print_data)r#   ),	itertoolsr   tritontriton.testing
sgl_kernelr   !sglang.jit_kernel.benchmark.utilsr   r   r   r   sglang.jit_kernel.normr	   sglang.srt.utilsr
   r   Streamr   Tensorr   r    r"   compiler(   r1   rangeBS_RANGE	GQA_RANGEKV_HEAD_RANGEHEAD_DIM_RANGE	LINE_VALS
LINE_NAMESSTYLESlistproductconfigstestingperf_report	Benchmarkintstrr`   __name__runr   r   r   r   <module>   s    




