o
    پiN                     @  s   d dl mZ d dlZd dlmZ d dlmZmZ d dlZd dl	Z	d dl
mZmZmZ d dlmZ er8d dlmZ ed3d
dZedg dd		d4d5ddZed g d!d		d4d6d"d ZeG d#d$ d$Zd%d& Z	'		d7d8d1d2ZdS )9    )annotationsN)	dataclass)TYPE_CHECKINGOptional)
cache_onceis_arch_support_pdlload_jit)register_custom_op)Modulereturnr
   c                  C  sf   t tjj } | d d  sJ dt| d d  | d d  }tddgdgt|gdS )Ndataincludezflashinfer headers are missing  apply_rope_pos_ids_cos_sin_cachezelementwise/rope.cuh)r   z%ApplyRopePosIdsCosSinCacheKernel::run)
cuda_filescuda_wrappersextra_include_paths)	pathlibPath
flashinfer__file__parentresolveexistsstrr   )flashinfer_dirflashinfer_include_path r   J/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/rope.py,_jit_apply_rope_pos_ids_cos_sin_cache_module   s   
r   .apply_rope_pos_ids_cos_sin_cache_with_kv_cache)qkq_ropek_ropek_bufferv_buffer)op_namemutates_argsFr    torch.Tensorr!   r"   r#   cos_sin_cachepos_idsvr$   r%   kv_cache_loc
interleavebool
enable_pdlNonec                 C  s*   t  }|| ||||||
|||||	 dS )a  
    Apply RoPE (Rotary Positional Embedding) with position IDs and cos/sin cache.

    Args:
        q: Input Q tensor of shape [nnz, num_qo_heads, head_dim]
        k: Input K tensor of shape [nnz, num_kv_heads, head_dim]
        q_rope: Output Q tensor with RoPE applied, same shape as q
        k_rope: Output K tensor with RoPE applied, same shape as k
        cos_sin_cache: Cos/sin cache of shape [max_seq_len, rotary_dim]
        pos_ids: Position IDs of shape [nnz]
        interleave: Whether to use interleaved RoPE
        enable_pdl: Enable PDL (Programmable Data Layout)
        v: Optional V tensor for KV caching
        k_buffer: Optional K buffer for KV caching
        v_buffer: Optional V buffer for KV caching
        kv_cache_loc: Optional KV cache location tensor
    Nr   r   )r    r!   r"   r#   r)   r*   r+   r$   r%   r,   r-   r/   moduler   r   r   r   )   s   #1apply_rope_pos_ids_cos_sin_cache_without_kv_cache)r    r!   r"   r#   c           	      C  s*   t  }|| |||||||d d d d  d S )Nr1   )	r    r!   r"   r#   r)   r*   r-   r/   r2   r   r   r   r3   ^   s   c                   @  sB   e Zd ZU dZded< ded< ded< ded< ded< ded	< d
S )FusedSetKVBufferArga  
    value : Optional[torch.Tensor]
        Value tensor, shape: ``(nnz, num_v_heads * head_size)``.
    k_buffer : Optional[torch.Tensor]
        Buffer for keys, shape: ``(nnz, num_k_heads * head_size)``.
    v_buffer : Optional[torch.Tensor]
        Buffer for values, shape: ``(nnz, num_v_heads * head_size)``.
    k_scale : Optional[float]
        Scale factor for keys.
    v_scale : Optional[float]
        Scale factor for values.
    cache_loc : Optional[torch.Tensor]
        Cache location tensor, used for indexing kv cache.
    r(   valuer$   r%   zOptional[float]k_scalev_scale	cache_locN)__name__
__module____qualname____doc____annotations__r   r   r   r   r4      s   
 r4   c                 C  s   |  | jd d|S )Nr   )viewshape)x	head_sizer   r   r   _view_3d   s   rC   T	positionsquerykeyrB   intis_neoxfused_set_kv_buffer_argOptional[FusedSetKVBufferArg]Optional[bool]c           
      C  s  |j tjkr
td|du rt o|du}| }dur=|jdu s$J d|jdu s-J d|jj tjks=J d|jj |du}	|	rpt	t
||t
||t
||t
||||  t
|j|t
|j|t
|j||j| | dS tt
||t
||t
||t
||||  | | dS )a`  
    Apply rotary embedding to keys and queries with precomputed cos/sin values.
    This is designed to be compatible with the SGL/vLLM implementation.
    The result is inplace applied to the input tensors.

    Parameters
    ----------
    positions : torch.Tensor
        Position indices, shape: ``(nnz)``.
    query : torch.Tensor
        Query tensor, shape: ``(nnz, num_q_heads * head_size)``.
    key : torch.Tensor
        Key tensor, shape: ``(nnz, num_k_heads * head_size)``.
    cos_sin_cache : torch.Tensor
        Cosine and Sine cache tensor, shape: ``(max_seq_len, rotary_dim)``.
        Cosine is the first half and Sine is the second half on rotary_dim.
    is_neox : bool
        Whether to use Neox style RoPE, default: ``True``.

        * If ``True``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

        * If ``False``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.
    fused_set_kv_buffer_arg : FusedSetKVBufferArg
        Fuse the set-kv-buffer operation into this kernel

    Note
    ----
    The rotary dimension is determined by the cosine cache and sine cache.
    zcos_sin_cache should be float32Nzk_scale is not yet supportedzv_scale is not yet supportedza.cache_loc.dtype=)dtypetorchfloat32
ValueErrorr   r6   r7   r8   int64r   rC   longr5   r$   r%   r3   )
rD   rE   rF   rB   r)   rH   rI   r/   asave_kv_cacher   r   r   %apply_rope_with_cos_sin_cache_inplace   sD   * 


rT   )r   r
   )FF)r    r(   r!   r(   r"   r(   r#   r(   r)   r(   r*   r(   r+   r(   r$   r(   r%   r(   r,   r(   r-   r.   r/   r.   r   r0   )r    r(   r!   r(   r"   r(   r#   r(   r)   r(   r*   r(   r-   r.   r/   r.   r   r0   )TNN)rD   r(   rE   r(   rF   r(   rB   rG   r)   r(   rH   r.   rI   rJ   r/   rK   r   r0   )
__future__r   r   dataclassesr   typingr   r   r   rM   sglang.jit_kernel.utilsr   r   r   sglang.srt.utils.custom_opr	   tvm_ffi.moduler
   r   r   r3   r4   rC   rT   r   r   r   r   <module>   sB    1
