o
    پi                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ e
 ZdZdZdZee ZdZdZejZdZd	d
 Zdd Zdd Zdd Zdd Zdd ZercddgZng dZg dZg dZ g dZ!dd Z"ej#$ej#j%dgedee e!dd i d!	de&de'fd"d#Z(ere)e *d$d%gd%gZ+ne)e *g d&g d'Z+g dZ,g dZ-g dZ.ej#$ej#j%d(d)ge+de,e-e.dd*i d!	d(e&d)e&de'fd+d,Z/e0d-kre(j1d.d/ e/j1d.d/ dS dS )0    N)concat_mla_absorb_q)concat_mla_k)is_in_ci   @      cudac                 C      t | || d S N)aot_kkk_nopek_rope r   `/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/benchmark/bench_concat_mla.pyaot_concat_mla_k      r   c                 C   r	   r
   )jit_kr   r   r   r   jit_concat_mla_k    r   r   c                 C   sR   |j d }|| d d d d d |f< |d| j d d| d d d d |d f< d S )N   )shapeexpand)r   r   r   nope_head_dimr   r   r   torch_concat_mla_k$   s   
.r   c                 C   
   t | |S r
   )aot_absorb_qabr   r   r   aot_concat_mla_absorb_q*      
r!   c                 C   r   r
   )jit_absorb_qr   r   r   r   jit_concat_mla_absorb_q.   r"   r$   c                 C   sB   | j d }| |d d d d d |f< ||d d d d |d f< d S )Nr   )r   )r   r    out
a_last_dimr   r   r   torch_concat_mla_absorb_q2   s   
r'         )r(   r   r)   i   i   i    i @  i   aotjittorch)zSGL AOT KernelzSGL JIT KernelPyTorch))orange-)bluez--)greenz-.c                 C   s   t j| ttd fttd}|dddddtf }t j| ddt fttd}|ddddt df }t j| ttfttd}|||fS )zJAllocate oversized containers and slice to produce non-contiguous tensors.r   dtypedeviceNr   )	r-   randnNUM_LOCAL_HEADSQK_NOPE_HEAD_DIMDTYPEDEVICEQK_ROPE_HEAD_DIMempty
K_HEAD_DIM)
num_tokensk_nope_containerr   k_rope_containerr   r   r   r   r   _create_concat_mla_k_dataB   s$   
rA   r>   provideruszconcat-mla-k-performance)	x_namesx_valsline_arg	line_vals
line_namesstylesylabel	plot_nameargsc                    sb   t | \tttd  fdd}g d}tjj||d\}}}d| d| d| fS )Nr*   c                      s     S r
   r   r   FN_MAPr   r   r   rB   r   r   <lambda>o   s    z$bench_concat_mla_k.<locals>.<lambda>g      ?g?g?	quantiles  )rA   r   r   r   tritontestingdo_bench_cudagraph)r>   rB   fnrR   msmin_msmax_msr   rM   r   bench_concat_mla_kZ   s   r[         )r   r\      r]       )r   r^   r_   r   dim_0dim_1zconcat-mla-absorb-q-performancec                    s   t j| |tttdt j| |tttddkr-t j| |tt ttdfdd}nttd  fdd}g d}t	j
j||d\}}}d	| d	| d	| fS )
Nr3   r-   c                      s   t  S r
   )r'   r   )r   r    r%   r   r   rO      s    z+bench_concat_mla_absorb_q.<locals>.<lambda>)r+   r,   c                      s     S r
   r   r   )rN   r   r    rB   r   r   rO      s    rP   rQ   rS   )r-   r6   
A_LAST_DIMr9   r:   
B_LAST_DIMr<   r!   r$   rT   rU   rV   )r`   ra   rB   rW   rR   rX   rY   rZ   r   )rN   r   r    r%   rB   r   bench_concat_mla_absorb_q   s   rd   __main__T)
print_data)2	itertoolsr-   rT   triton.testing
sgl_kernelr   r   r   r   !sglang.jit_kernel.benchmark.utilsr   sglang.jit_kernel.concat_mlar#   r   IS_CIr7   r8   r;   r=   rb   rc   bfloat16r9   r:   r   r   r   r!   r$   r'   NUM_TOKENS_VALSK_LINE_VALSK_LINE_NAMESK_STYLESrA   rU   perf_report	Benchmarkintstrr[   listproductABSORB_Q_VALSQ_LINE_VALSQ_LINE_NAMESQ_STYLESrd   __name__runr   r   r   r   <module>   s    
