o
    پi.3                     @   s  d Z ddlZddlZddlmZ ddlmZ ddlZddlZddl	Zddl
mZmZ ddlmZmZ ddlmZmZmZ ejdd	d
kZdZdZdZdZdZeddG dd dZeddedededejfddZdejdejdejdejdejdejd eddfd!d"Z dejdejdejdejdejdejd#eddfd$d%Z!d&ejd'ejdejd(ejd)ejdejd ed*eddfd+d,Z"d&ejd'ejdejd(ejd)ejdejd-ed.eddfd/d0Z#dejdejd1ejdejdejd2ejddfd3d4Z$ed5d6 e%dd7D d7gd8Z&eg d9d:gd8Z'g d;Z(g d<Z)g d=Z*e+e,e'e&Z-ej./ej.j0d.d>ge-d?e(e)e*d@dAi dB	d.ed>ed?e1dee2e2e2f fdCdDZ3dTdFdGZ4ej./ej.j0d.d>ge-d?e(e)e*d@dHi dB	d.ed>ed?e1dee2e2e2f fdIdJZ5e6dKkre7e'Z8ee e8fZ9ee e8fZ:eej;e9ej<dEdLej;e9ej<dEdLej;e:ej<ddMej;e:ej<ddMdNa=e>dO e>dP e>dO e3j?ddQ e>dR e>dS e>dO e5j?ddQ dS dS )Ua8  Benchmark for HiCache JIT kernel performance.

This benchmark tests the performance of KV cache transfer operations
between GPU and CPU (host pinned memory), comparing:
- SGL AOT Kernel: Pre-compiled transfer_kv kernels from sgl_kernel
- SGL JIT Kernel: JIT-compiled hicache kernels
- PyTorch Indexing: Plain PyTorch index copy
- PyTorch 2 Stream: PyTorch implementation using 2 CUDA streams

Tests cover:
- One Layer: CPU->GPU
- All Layer: GPU->CPU

Note: Uses do_bench instead of do_bench_cudagraph since CUDA graph
capture doesn't support CPU-GPU memory transfers.
    N)	dataclass)Tuple)transfer_kv_all_layertransfer_kv_per_layer)DEFAULT_QUANTILESget_benchmark_range)can_use_hicache_jit_kerneltransfer_hicache_all_layertransfer_hicache_one_layerDISABLE_TORCH01   Ti   i      )frozenc                   @   sL   e Zd ZU ejed< ejed< ejed< ejed< dededd fdd	Zd
S )HiCacheCachek_cache_cudav_cache_cudak_cache_hostv_cache_host
num_layerselement_sizereturnc                    s`   dt jdt jf fdd}dt jdt jf fdd}t|| j|| j|| j|| jdS )Ntr   c                    (   t  }| d d | ddfS Nr   )GPU_CACHE_SIZEview	unflatten)r   needed_cudar   r    ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/benchmark/bench_hicache.py
slice_cuda6       z*HiCacheCache.get_slice.<locals>.slice_cudac                    r   r   )HOST_CACHE_SIZEr   r   )r   needed_hostr!   r"   r#   
slice_host:   r%   z*HiCacheCache.get_slice.<locals>.slice_hostr   r   r   r   )torchTensorr   r   r   r   r   )selfr   r   r$   r(   r"   r!   r#   	get_slice5   s   zHiCacheCache.get_sliceN)__name__
__module____qualname__r*   r+   __annotations__intr-   r"   r"   r"   r#   r   .   s   
 



r   	page_sizesizemax_sizer4   r   c                   sz   dt dt f fdd}| |kr|  dksJ t||d ||  }t }|d d d f   |   d |  S )Nxr   c                    s   |   d   S )Nr   r"   )r7   r3   r"   r#   alignI   s   zgen_indices.<locals>.alignr   )r2   r*   randpermarangeflattencuda)r5   r6   r4   r8   indicesoffsetsr"   r3   r#   gen_indicesF   s
   
(r?   k_cache_dstv_cache_dstindices_dstk_cache_srcv_cache_srcindices_src	item_sizec                 C   s   t || ||||| dS )z)SGL AOT Kernel for single layer transfer.N)r   )r@   rA   rB   rC   rD   rE   rF   r"   r"   r#   sglang_aot_transfer_oneR   s   
rG   element_dimc              	   C   s   t | ||||||d dS )z)SGL JIT Kernel for single layer transfer.)rH   N)r
   )r@   rA   rB   rC   rD   rE   rH   r"   r"   r#   sglang_jit_transfer_oneg   s   

rI   
k_ptrs_dst
v_ptrs_dst
k_ptrs_src
v_ptrs_srcr   c              	   C   s   t || |||||| dS )z&SGL AOT Kernel for all layer transfer.N)r   )rJ   rK   rB   rL   rM   rE   rF   r   r"   r"   r#   sglang_aot_transfer_all|   s   rN   stride_bytesr   c                 C   s   t | ||||||||d	 dS )z&SGL JIT Kernel for all layer transfer.)kv_cache_src_stride_byteskv_cache_dst_stride_bytesr   N)r	   )rJ   rK   rB   rL   rM   rE   rO   r   r"   r"   r#   sglang_jit_transfer_all   s   
rR   indices_dst_on_dstindices_src_on_srcc                 C   s.   | j }|| || |< || |||< dS )zPyTorch indexing baseline.N)deviceto)r@   rA   rS   rC   rD   rT   
dst_devicer"   r"   r#   pytorch_transfer   s   	rX   c                 C   s   g | ]}d | qS )   r"   ).0nr"   r"   r#   
<listcomp>       r\      )
full_rangeci_range)@         i      rd   aotjitpytorch)zSGL AOT KernelzSGL JIT KernelPyTorch))orange-)bluez--)red:
batch_sizeprovideruszhicache-one-layer-h2d)	x_namesx_valsline_arg	line_vals
line_namesstylesylabel	plot_nameargsc           	         sR  t jtd}|j|j|j|jt|d   t	|t
t	|ttr2 \}|  tj      fddfddfddd}|dkrxt d	sxtd
td
td
fS tr|dv rtd
td
td
fS tjj|| tddd\}}}d| t d| t d| t fS )z&One Layer: Host (CPU) -> Device (GPU).r   r      c                      "    fddt tD S )Nc              
      0   g | ]}t | | | |  qS r"   )rG   rZ   ielement_bytesindices_dst_gpuindices_src_gpur@   rC   rA   rD   r"   r#   r\          
=benchmark_one_layer_h2d.<locals>.<lambda>.<locals>.<listcomp>range
NUM_LAYERSr"   r   r"   r#   <lambda>       
z)benchmark_one_layer_h2d.<locals>.<lambda>c                      r}   )Nc              
      r~   r"   )rI   r   r   r   r   r@   rC   rA   rD   r"   r#   r\     r   r   r   r"   r   r"   r#   r     r   c                           fddt tD S )Nc              	      .   g | ]}t | |  | | qS r"   rX   r   r   indices_src_cpur@   rC   rA   rD   r"   r#   r\         	r   r   r"   r   r"   r#   r         	re   rg   r   nanrh         	quantileswarmuprep  )cacher-   r   r   r   r   r   r*   manual_seedr?   r&   r   ENABLE_SORTsortcpur<   synchronizer   r   floatr   tritontestingdo_benchr   	r   ro   rp   cache_localmappingFN_MAPmsmin_msmax_msr"   )	r   r   r   r   r   r@   rC   rA   rD   r#   benchmark_one_layer_h2d   s:   


&


r   r<   c                 C   s   t jdd | D t j|dS )z!Create a tensor of data pointers.c                 S   s   g | ]}|  qS r"   )data_ptr)rZ   r   r"   r"   r#   r\   3  r]   z&_create_ptr_tensor.<locals>.<listcomp>dtyperU   )r*   tensoruint64)tensorsrU   r"   r"   r#   _create_ptr_tensor0  s
   r   zhicache-all-layer-d2hc           	         s  t jt| d}|j|j	|j|jt|d |   t	|t
t	|ttr2 \}|  tj  |    tfddttD t	fddttD tfddttD tfddttD 
 
fdd	 
fd
d		fdd	d}|dkrt dstdtdtdfS tr|dv rtdtdtdfS tjj|| tddd\}}}d| t d| t d| t fS )z&All Layer: Device (GPU) -> Host (CPU).r{   r|   c                       g | ]} | qS r"   r"   r   )k_caches_srcr"   r#   r\   ^  r]   z+benchmark_all_layer_d2h.<locals>.<listcomp>c                    r   r"   r"   r   )v_caches_srcr"   r#   r\   _  r]   c                    r   r"   r"   r   )k_caches_dstr"   r#   r\   `  r]   c                    r   r"   r"   r   )v_caches_dstr"   r#   r\   a  r]   c                	      s   t  tS N)rN   r   r"   r   r   r   rJ   rL   rK   rM   r"   r#   r   d      z)benchmark_all_layer_d2h.<locals>.<lambda>c                	      s   t   S r   )rR   r"   r   r"   r#   r   n  r   c                      r   )Nc              	      r   r"   r   r   indices_dst_cpur   r   r   r   r   r"   r#   r\   x  r   z=benchmark_all_layer_d2h.<locals>.<lambda>.<locals>.<listcomp>r   r"   r   r"   r#   r   x  r   re   rg   r   r   r   r   r   r   r   )r   r-   r   r   r   r   r   r*   r   r?   r   r&   r   r   r   r<   r   r   r   r   r   r   r   r   r   r   r   r   r"   )r   r   r   r   r   r   rJ   rL   r   r   rK   rM   r#   benchmark_all_layer_d2h9  sB   




"


r   __main__r   )r   
pin_memoryr)   z<============================================================z&One Layer: Host -> Device (CPU -> GPU))
print_dataz=
============================================================z6All Layer: Device -> Host (GPU -> CPU) [per-layer avg])r<   )@__doc__	itertoolsosdataclassesr   typingr   r*   r   triton.testing
sgl_kernelr   r   !sglang.jit_kernel.benchmark.utilsr   r   sglang.jit_kernel.hicacher   r	   r
   environgetr   	PAGE_SIZEr   r   r&   r   r   r2   r+   r?   rG   rI   rN   rR   rX   r   BS_RANGEELEMENT_SIZE_RANGE	LINE_VALS
LINE_NAMESSTYLESlistproductCONFIGSr   perf_report	Benchmarkstrr   r   r   r   r.   maxMAX_SIZEDEVICE_SHAPE
HOST_SHAPEemptybfloat16r   printrunr"   r"   r"   r#   <module>   s   	


	
	


S	
O