o
    پi%                     @   sT  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ	 d dlm
Z d dlmZ d dlm
Z dZejdejfddZd)d	efd
dZdd Zejdddgejdddgejdddgejdddgejdddgejdddgejdddgejdddgejdddgejdejejejgdedededejd df
d!d"Zejddgejdg d#ejddgejddgejddgejddgejddgejddgejddgejdejgdedededejd df
d$d%Zd&d' Ze d(kr(e!e"g dS dS )*    N)FusedSetKVBufferArg)%apply_rope_with_cos_sin_cache_inplacecudaitersc                 C   s   t d}t jd|d t jd}t jddt jd}t jddt jd}t jddt jd}t|D ]}|| | }|||? A }q/|dkrJt | | d S d S )Nr       )dtypeif i_n<   )tl
program_idfulluint32rangestore)out_ptrr   pidxacsh_r   r   U/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/tests/test_rope.pyburn_kernel   s   
r      msc                 C   s2   t | d }tjddtjd}t| ||d |S )Ni N  r   r   devicer   )r   )inttorchemptyr   r   )r   gridr   outr   r   r   triton_burn%   s   r#   c           
   	   C   sh   d|t jd| dt jtd|    }t j|t jtd}t d||}| }| }t j||fdd}	|	S )Ng      ?r      r   r   z	i,j -> ij)dim)r   arangefloat32DEVICEeinsumcossincat)

rotary_dimmax_position_embeddingsbaser   inv_freqtfreqsr,   r-   cacher   r   r   create_cos_sin_cache,   s   r6   bsr      seq_len   num_qo_heads   num_kv_headshead_dim@   r/      
interleaveFT
enable_pdlsave_kv_cacher   returnc
           *      C   sd  ||k rt d|d| |s|rt d|d|d tj| | || t|	d}
tj| | || t|	d}tj| | || t|	d}| | d }tj|||t|	d}tj|||t|	d}tj|tjtdd | |   }tj	|td	
| }|}d
}t||||	}|
 }| }| }| }| }| }t|||jd d||jd dd d |d}|
 }| }| }| }| } | }!t|||jd d| | jd dd d |!d}"tj }#tj }$tddd t|
}%|% |% }&}'tj  tj|#$ tddd ||& }t|||||| |r'|nd |d W d    n	1 s7w   Y  tj|$$ tddd ||' }t|||||| |rZ|"nd |d W d    n	1 sjw   Y  tj  |	tjkr|dnd}(|	tjkrdnd})tjj|||(|)d tjj|||(|)d tjj|||(|)d tjj|| |(|)d d S )N	head_dim= < rotary_dim=(save_kv_cache=, enable_pdl=) is not allowedr   r$   r%   r   '  r   r&   )valuek_bufferv_bufferk_scalev_scale	cache_loc
   )   )r!   	positionsquerykey	head_sizecos_sin_cacheis_neoxfused_set_kv_buffer_argrB   gMbP?gư>)atolrtol)pytestskipr   randnr*   zerosrandpermint64cloner(   repeatr6   FusedSetKVBufferArgJitviewshapeFusedSetKVBufferArgKernelr   Streamr#   
randn_likesynchronizestream)apply_rope_with_cos_sin_cache_inplace_jit,apply_rope_with_cos_sin_cache_inplace_kernelr)   testingassert_close)*r7   r9   r;   r=   r>   r/   rA   rB   rC   r   qkvKV_POOL_SIZErM   rN   out_cache_locpos_idsmax_seq_lenr1   rY   q_jitk_jitv_jitk_buffer_jitv_buffer_jitout_cache_loc_jitfused_set_kv_buffer_arg_jitq_kernelk_kernelv_kernelk_buffer_kernelv_buffer_kernelout_cache_loc_kernelfused_set_kv_buffer_arg_kernel
stream_jitstream_kernelrr_jitr_kernelr\   r]   r   r   r   	test_rope=   s   


	
	



r   )r   r:   rS   c
           &   	   C   s$  ||k rt d|d| |s|rt d|d|d tj| | || t|	d}
tj| | || t|	d}tj| | || t|	d}| | d }tj|||t|	d}tj|||t|	d}tj|tjtdd | |   }tj	|td	
| }|}d
}t||||	}|
 }| }| }| }| }| }|
 }| }| }| }| }| } |||||| d |d}!tt|!}"|||||| d |d}#tt|#}$td|  d|  td|"d dd|$d dd |$dkr|"dkr|$|" ntd}%td|%dd d S d S )NrE   rF   rG   rH   rI   r   r$   r%   rJ   rK   rT   z
Performance Test - Batch=z	, SeqLen=zJIT: i  z.9fz	ms, SGL: r   r   infzSpeedup (SGL/JIT): z.2fr   )r^   r_   r   r`   r*   ra   rb   rc   rd   r(   re   r6   
bench_ropern   ro   printfloat)&r7   r9   r;   r=   r>   r/   rA   rB   rC   r   rr   rs   rt   ru   rM   rN   rv   rw   rx   r1   rY   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   jit_argsjit_timekernel_argskernel_timespeedupr   r   r   test_bench_rope   s   




"
r   c                 C   sl   d}d}t |D ]	}| di | qtj  t }t |D ]	}| di | qtj  t | | S )NrR   d   r   )r   r   r   rl   time)fnargswarmup	iterationr   
start_timer   r   r   r     s   

r   __main__)r   )#r   r^   r   tritontriton.languagelanguager
   
sgl_kernelr   ri   r   ro   sglang.jit_kernel.roperf   rn   r*   jit	constexprr   r   r#   r6   markparametrizebfloat16float16r)   boolr   r   r   r   __name__main__file__r   r   r   r   <module>   sx    	
w	
W
