o
    پi<                     @   sd  d dl Z d dlmZmZmZ d dlZd dlZd dlZd dlm	Z
 d dlmZ ejde
jfddZd<defd	d
Zdd Zdd Zdejdejdejdedejf
ddZG dd dejjZdd Zdd Zdd Zejdg dd  d!D d"d"d#d$d%ejd&d#d#d'd'fd(d)d*d+d%ejd&d,d-d.d,fd-d)d/d+d%ejd&d0d1d.d,fd)d)d2d+d3ejd&d,d-d#d4fd)d)d2d+d3ejd&d,d-d5d.fd-d)d/d+d3ejd&d0d1d.d,fd"d"d#d$d%ej d&d#d#d'd'fd(d)d*d+d%ej d&d,d-d.d,fd-d)d/d+d%ej d&d0d1d.d,fd)d)d2d+d3ej d&d,d-d#d4fd)d)d2d+d3ej d&d,d-d5d.fd-d)d/d+d3ej d&d0d1d.d,fd"d#d6d+d%ej!d&d5d"d4d.fd)d"d2d+d%ej!d&d4d)d5d4fd(d)d*d+d%ej!d&d.d(d4d.fejd7d%d3gd8d9 Z"ejdd"d"d*d$d%ejd&d'd'd4d4fd"d"d*d$d%ejd&d.d5d4d4fd"d"d*d$d%ejd&d4d"d4d4fd"d"d*d$d%ejd&d5d)d4d4fd"d"d*d$d%ejd&d#d-d4d4fd"d"d*d$d%ejd&d"d6d4d4fgd:d; Z#dS )=    N)OptionalTupleUnion)rotary_embeddingitersc                 C   s   t d}t jd|d t jd}t jddt jd}t jddt jd}t jddt jd}t|D ]}|| | }|||? A }q/|dkrJt | | d S d S )Nr       dtypeif i_n<   )tl
program_idfulluint32rangestore)out_ptrr   pidxacsh_r   r   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/tests/test_pos_enc.pyburn_kernel   s   
r      msc                 C   s2   t | d }tjddtjd}t| ||d |S )Ni N  r   cuda)devicer
   )r   )inttorchemptyr   r   )r   gridr   outr   r   r   triton_burn   s   r%   c                 C   s|   || }t j|||| ||d}t j|||| ||d}	t jdt|d d|ft j|d}
|||| }|	||| }	||	|
fS )zCreate test inputs.r
   r   r      d   )r!   randnrandintminlongview)	head_size
batch_sizeseq_lenr   r
   num_q_headsnum_kv_headstotal_tokensquerykeypos_idsr   r   r   create_test_inputs$   s   
r7   c              	   C   s   |}t |d}tj|| ||d}d|tjd| dtj|d|    }tj|tj|d}	t|	|}
t|
|}t|
|}||ddd| d f< ||dd| d df< |S )z*Create cos/sin cache for rotary embedding.r(   r&         ?r   r'   N)	maxr!   zerosarangefloat32outercostosin)
rotary_dimmax_position_embeddingsbaser
   r   max_posextended_max_poscos_sin_cacheinv_freqtfreqs	cos_cache	sin_cacher   r   r   create_cos_sin_cache;   s&   
rL   r   r>   r@   is_neox_stylereturnc                 C   s   | d| j}| d| j}|rtj| ddd\}}n| ddddf }| ddddf }|| ||  }|| ||  }|rLtj||fddS tj||fdddS )z
    Args:
        x: [num_tokens, num_heads, head_size]
        cos: [num_tokens, head_size // 2]
        sin: [num_tokens, head_size // 2]
        is_neox_style: Whether to use the Neox-style or GPT-J-style rotary
            positional embeddings.
    r'   dim.Nr   )	unsqueezer?   r
   r!   chunkcatstackflatten)r   r>   r@   rM   x1x2o1o2r   r   r   _apply_rotary_embV   s   r\   c                       s   e Zd Zdedededededejddf fd	d
Zdeee	f dej
fddZdej
fddZ		ddej
dej
deej
 deej
 deej
ej
f f
ddZ  ZS )RotaryEmbeddingr.   rA   rB   rC   rM   r
   rN   Nc                    sN   t    || _|| _|| _|| _|| _|| _|  }|  | j	d|dd d S )NrF   F)
persistent)
super__init__r.   rA   rB   rC   rM   r
   _compute_cos_sin_cacheregister_buffer)selfr.   rA   rB   rC   rM   r
   cache	__class__r   r   r`   u   s   
	zRotaryEmbedding.__init__c                 C   s(   d|t jd| jdt jd| j   }|S )Nr8   r   r'   r	   )r!   r;   rA   float)rc   rC   rG   r   r   r   _compute_inv_freq   s   z!RotaryEmbedding._compute_inv_freqc                 C   sR   |  | j}tj| jtjd}td||}| }| }tj	||fdd}|S )zCompute the cos and sin cache.r	   z	i,j -> ijrP   rQ   )
rh   rC   r!   r;   rB   rg   einsumr>   r@   rU   )rc   rG   rH   rI   r>   r@   rd   r   r   r   ra      s   z&RotaryEmbedding._compute_cos_sin_cache	positionsr4   r5   offsetsc                 C   s.  |dur|| }|  }|jd }| jd|}|jddd\}}|j}	||d| j}|dd| jf }
|d| jdf }t|
||| j	}
t
j|
|fdd|	}|| j}|dur|j}||d| j}|dd| jf }|d| jdf }t|||| j	}t
j||fdd|}|| j}||fS )z-A PyTorch-native implementation of forward().Nr   r'   rP   rQ   .)rW   shaperF   index_selectrT   r-   r.   rA   r\   rM   r!   rU   reshaper?   r
   )rc   rj   r4   r5   rk   
num_tokenscos_sinr>   r@   query_shape	query_rot
query_pass	key_shapekey_rotkey_passr   r   r   forward_native   s,   	
zRotaryEmbedding.forward_native)NN)__name__
__module____qualname__r    boolr!   r
   r`   r   rg   Tensorrh   ra   r   r   rw   __classcell__r   r   re   r   r]   s   s>    	r]   c                 C   s   t | |||||d|S )zEInitialize Torch Native RotaryEmbedding based on vLLM implementation.r.   rA   rB   rC   rM   r
   )r]   r?   )r.   rA   rB   rC   rM   r
   r   r   r   r   get_torch_rotary_embedding   s   r   c                 C   sH   zddl m} W n ty   td Y nw || |||||d|S )z$Initialize SglKernelRotaryEmbedding.r   )SglKernelRotaryEmbeddingzDSglKernelRotaryEmbedding is not available. Test case can be removed.r~   )#sgl_kernel.testing.rotary_embeddingr   ImportErrorpytestskipr?   )r.   rA   rB   rC   rM   r
   r   r   r   r   r   get_sgl_rotary_embedding   s"   r   c                 C   s   | du r|du s
J dS |dusJ t |  rJ dt | r(J d|t jkr/dnd}|t jkr8dnd}t jj| |||d dS )z4Compare results between JIT and SGL implementations.NzNaN in JIT resultszNaN in SGL resultsg{Gz?gh㈵>)atolrtol)r!   isnananyr<   testingassert_close)jit_outsgl_outr
   r   r   r   r   r   compare_results   s   r   zhead_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype, device, batch_size, seq_len, num_q_heads, num_kv_headsc                 C   s,   g | ]\}}d d dddt jd||ddfqS )@      @  Tr      )r!   bfloat16).0bsslr   r   r   
<listcomp>  s    r   ))r   r   )    r   )   r   )   r   )r'   r   )   r   r   r   r   Tr   r   r   r   r   i'  r'   r   r   i7     '   i   Fr         key_is_nonec              	   C   s  t | |||||	|
\}}}t|||||}t| ||||||}||_t|}| | }}| | }}td }td }|rMd}d}t	ddd | | }}tj
  tj
| t	ddd || }t|||| ||d\}}W d   n1 sw   Y  tj
| t	ddd || }|j|||d\}}W d   n1 sw   Y  tj
  t||| t||| dS )z8Test correctness of JIT rotary embedding implementation.r   Ng      Y@)r   )r#   rj   r4   r5   r.   rF   is_neoxrj   r4   r5   )r7   rL   r   rF   r!   
randn_likecloneget_device_moduleStreamr%   r   synchronizestreamr   rw   r   )r.   rA   rB   rC   rM   r
   r   r/   r0   r1   r2   r   r4   r5   r6   rF   torch_rotary_embr	query_jitkey_jitquery_torch	key_torch
stream_jitstream_kernelr_jitr_torchquery_jit_outkey_jit_outquery_torch_outkey_torch_outr   r   r   test_correctness   sb   /

	


r   c           (   	   C   s\  t | |||||	|
\}}}t|||||}t| ||||||}||_d}t|D ]&}| | }}t|||| ||d | | }}|j|||d q(d}tj	
  t }t|D ]}| | }}t|||| ||d q^tj	
  t | | }tj	
  t }t|D ]}| | }}|j|||d qtj	
  t | | }| | }}| | } }!t|||| ||d\}"}#|j|| |!d\}$}%t|"|$| t|#|%| || }&td| d| d|&  td|d	 d
d|d	 d
d |dkr |dkr|| ntd}'td|'dd |dkr*|dks,J dS )zPPerformance test comparing JIT and SGL implementations with accuracy validation.r   r   r   r(   z
Performance Test - Batch=z	, SeqLen=z	, Tokens=zJIT: i  z.9fz	ms, SGL: r   r   infzSpeedup (SGL/JIT): z.2fr   N)r7   rL   r   rF   r   r   r   forward_cudar!   r   r   timer   printrg   )(r.   rA   rB   rC   rM   r
   r   r/   r0   r1   r2   r4   r5   r6   rF   sgl_rotary_embwarmupr   
query_warmkey_warmquery_sgl_warmkey_sgl_warm	iteration
start_timer   r   jit_time	query_sglkey_sglsgl_timequery_jit_finalkey_jit_finalquery_sgl_finalkey_sgl_finalr   r   query_sgl_outkey_sgl_outr3   speedupr   r   r   test_performanceg  s   

		




	
"
r   )r   )$r   typingr   r   r   r   r!   tritontriton.languagelanguager   sglang.jit_kernel.pos_encr   jit	constexprr   rg   r%   r7   rL   r|   r{   r\   nnModuler]   r   r   r   markparametrizer   r<   float16r   r   r   r   r   r   <module>   s    
T	
K