o
    پi                     @   s   d dl Z d dlZd dlmZ d dlmZ ejejddiddejddiddejdd	iddejdd
iddgddgdej	dej
dej
fddZ	dde jde jde jdede jf
ddZe rnddlmZ eZdS dS )    N)current_platformBLOCK_HS_HALF       )	num_warps@               	head_sizeinterleaved)configskeyc           "      C   sn  t d}|| | }|||  }|||  }|||	  }| ||  }|d }td||D ]}|t d| }||k }t j|| |dd}t j|| |dd}d| }d| d }t j|| |dd}t j|| |dd}|t j}|t j}|t j}|t j}t | ||| } t |||| }!t j|| | |j	|d t j|| |!|j	|d q-d S )Nr   r   g        )maskother   )r   )
tl
program_idrangearangeloadtofloat32fmastoredtype)"
output_ptrx_ptrcos_ptrsin_ptr	num_headsr   
num_tokensstride_x_rowstride_cos_rowstride_sin_rowr   r   row_idx	token_idx	x_row_ptrcos_row_ptrsin_row_ptroutput_row_ptrhead_size_halfblock_startoffsets_halfr   cos_valssin_vals
offsets_x1
offsets_x2x1_valsx2_valsx1_fp32x2_fp32cos_fp32sin_fp32o1_valso2_vals r;   ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/diffusion/triton/rotary.py_rotary_embedding_kernel   s2   
r=   Fxcossinreturnc                 C   s   t | }|  dkr| j\}}}}n| j\}}}d}|d dks%J d| d|}	|d|}
|| | f}|rX|jd |krX|dd d df  }|dd d df  }n| }| }t| |
|	||||||	d|d|d| |S )N   r   r   r   z head_size must be divisible by 2.)torch
empty_likedimshapeview
contiguousr=   stride)r>   r?   r@   r   outputbszr"   r!   r   
x_reshapedoutput_reshapedgridr;   r;   r<   apply_rotary_embeddingC   s8   
rP   r   )apply_rotary_embedding_native)F)rD   tritontriton.languagelanguager   'sglang.multimodal_gen.runtime.platformsr   autotuneConfigjit	constexprr=   TensorboolrP   is_npunpu_fallbackrQ   r;   r;   r;   r<   <module>   sD    	2
+