o
    پi6                     @   s  d dl mZ d dlmZmZ d dlZd dlmZ 			dRdejdejde	d	eej d
ee
 dejfddZ		dSdejdejdejde	d
ee
 ddfddZ			dRdejdejde	d	eej d
ee
 dejfddZ		dSdejdejdejde	d
ee
 ddfddZdejdejddfddZdTdejd	ejdejfddZdTdejd	ejdejfddZdTdejd	ejdejfddZejjdurdTdejd	ejdejfddZeG d d! d!Zd"d# Z	$		dUd%ejd&ejd'ejd(ed)ejd*e
d+ee d
ee
 ddfd,d-Z	$dVd%ejd&ejd'ejd(ed)ejd*e
fd.d/Z	0	 dWd1ejd2ejd3ejd4ejd5ejd6ejd7ejd8ed9eddfd:d;Zdejdejfd<d=Zd1ejd>ejd?ejfd@dAZdBejdCejfdDdEZdFdGd0dHejfdIejdJedKe
dLe	dMe	dNedOej fdPdQZ!dS )X    )	dataclass)ListOptionalN)is_arch_support_pdlư>inputweightepsout
enable_pdlreturnc                 C   <   |du r	t | }|du rt }t jjj|| ||| |S )aG  Root mean square normalization.

    ``out[i] = (input[i] / RMS(input)) * weight[i]``

    Parameters
    ----------
    input: torch.Tensor
        Input tensor, shape (batch_size, hidden_size).
    weight: torch.Tensor
        Weight tensor, shape (hidden_size,).
    eps: float
        Epsilon for numerical stability.
    out: Optional[torch.Tensor]
        The output tensor, if specified, the kernel will update this tensor inplace.
    enable_pdl: Optional[bool]
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_
        If None, will be automatically enabled on Hopper architecture.

    Returns
    -------
    output: torch.Tensor
        Normalized tensor, shape (batch_size, hidden_size).
    N)torch
empty_liker   ops
sgl_kernelrmsnormdefaultr   r   r	   r
   r    r   J/home/ubuntu/.local/lib/python3.10/site-packages/sgl_kernel/elementwise.pyr   
      
r   residualc                 C   *   |du rt  }tjjj| |||| dS )a  Fused add root mean square normalization.

    Step 1:
    ``residual[i] += input[i]``

    Step 2:
    ``input[i] = (residual[i] / RMS(residual)) * weight[i]``

    Parameters
    ----------
    input: torch.Tensor
        Input tensor, shape (batch_size, hidden_size).
    residual: torch.Tensor
        Residual tensor, shape (batch_size, hidden_size).
    weight: torch.Tensor
        Weight tensor, shape (hidden_size,).
    eps: float
        Epsilon for numerical stability.
    enable_pdl: Optional[bool]
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_
        If None, will be automatically enabled on Hopper architecture.
    N)r   r   r   r   fused_add_rmsnormr   r   r   r   r	   r   r   r   r   r   1   
   

r   c                 C   r   )a_  Gemma-style root mean square normalization.

    ``out[i] = (input[i] / RMS(input)) * (weight[i] + 1)``

    Parameters
    ----------
    input: torch.Tensor
        Input tensor, shape (batch_size, hidden_size).
    weight: torch.Tensor
        Weight tensor, shape (hidden_size,).
    eps: float
        Epsilon for numerical stability.
    out: Optional[torch.Tensor]
        The output tensor, if specified, the kernel will update this tensor inplace.
    enable_pdl: Optional[bool]
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_
        If None, will be automatically enabled on Hopper architecture.

    Returns
    -------
    output: torch.Tensor
        Gemma Normalized tensor, shape (batch_size, hidden_size).
    N)r   r   r   r   r   gemma_rmsnormr   r   r   r   r   r   V   r   r   c                 C   r   )a  Gemma-style fused add root mean square normalization.

    Step 1:
    ``residual[i] += input[i]``

    Step 2:
    ``input[i] = (residual[i] / RMS(residual)) * (weight + 1)``

    Parameters
    ----------
    input: torch.Tensor
        Input tensor, shape (batch_size, hidden_size).
    residual: torch.Tensor
        Residual tensor, shape (batch_size, hidden_size).
    weight: torch.Tensor
        Weight tensor, shape (hidden_size,).
    eps: float
        Epsilon for numerical stability.
    enable_pdl: Optional[bool]
        Whether to enable `programmatic dependent launch
        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_
        If None, will be automatically enabled on Hopper architecture.
    N)r   r   r   r   gemma_fused_add_rmsnormr   r   r   r   r   r   }   r   r   outputc                 C   s   | j |j ksJ | j  d|j  | jd d |jd d ks2J | jd d  d|jd d  | jd d|jd  ksOJ | jd  dd|jd   d S )N !=    )ndimshaper   r   r   r   r   _check_shape   s   " r&   c                 C   z   | j d | jj d dkrtd|d urt| | ntj| j d d | j d d f | j| jd}tjj	j
||  |S Nr!      r   z*The pointers must be multiple of 16 bytes.r"   devicedtype)r$   r,   itemsize
ValueErrorr&   r   emptyr+   r   r   silu_and_mulr   r   r
   r   r   r   r0         r0   c                 C   r'   r(   )r$   r,   r-   r.   r&   r   r/   r+   r   r   gelu_tanh_and_mulr   r1   r   r   r   r3      r2   r3   c                 C   r'   r(   )r$   r,   r-   r.   r&   r   r/   r+   r   r   gelu_and_mulr   r1   r   r   r   r4      r2   r4   c                 C   s   | j d | jj d dkrtd| j d  d| jj d|dur3| j |j ks2J | j  d|j  nt| }tjj||  |S )	z
        Quick-GELU:  y = x * sigmoid(1.702 * x)

        The CUDA/HIP kernel uses 128-bit (16-byte) vector loads & stores,
        so the last-dimension byte length must be a multiple of 16 bytes.
        r!   r)   r   zThe last dimension (z) x itemsize (z!) must be a multiple of 16 bytes.Nr    )	r$   r,   r-   r.   r   r   r   r   
gelu_quickr1   r   r   r   r5      s   $
r5   c                   @   sR   e Zd ZU dZejed< ejed< ejed< ee ed< ee ed< ejed< dS )	FusedSetKVBufferArga  
    value : Optional[torch.Tensor]
        Value tensor, shape: ``(nnz, num_v_heads * head_size)``.
    k_buffer : Optional[torch.Tensor]
        Buffer for keys, shape: ``(nnz, num_k_heads * head_size)``.
    v_buffer : Optional[torch.Tensor]
        Buffer for values, shape: ``(nnz, num_v_heads * head_size)``.
    k_scale : Optional[float]
        Scale factor for keys.
    v_scale : Optional[float]
        Scale factor for values.
    cache_loc : Optional[torch.Tensor]
        Cache location tensor, used for indexing kv cache.
    valuek_bufferv_bufferk_scalev_scale	cache_locN)	__name__
__module____qualname____doc__r   Tensor__annotations__r   floatr   r   r   r   r6      s   
 


r6   c                 C   s   |  | jd d|S )Nr   r!   )viewr$   )x	head_sizer   r   r   _view_3d
     rG   T	positionsquerykeyrF   cos_sin_cacheis_neoxfused_set_kv_buffer_argc           	      C   s  |j tjkr
td|du rt o|du}| }dur=|jdu s$J d|jdu s-J d|jj tjks=J d|jj tj	j
jt||t||t||t||||  | ||durct|j|nd|durnt|j|nd|duryt|j|nd|dur|j dS d dS )a`  
    Apply rotary embedding to keys and queries with precomputed cos/sin values.
    This is designed to be compatible with the SGL/vLLM implementation.
    The result is inplace applied to the input tensors.

    Parameters
    ----------
    positions : torch.Tensor
        Position indices, shape: ``(nnz)``.
    query : torch.Tensor
        Query tensor, shape: ``(nnz, num_q_heads * head_size)``.
    key : torch.Tensor
        Key tensor, shape: ``(nnz, num_k_heads * head_size)``.
    cos_sin_cache : torch.Tensor
        Cosine and Sine cache tensor, shape: ``(max_seq_len, rotary_dim)``.
        Cosine is the first half and Sine is the second half on rotary_dim.
    is_neox : bool
        Whether to use Neox style RoPE, default: ``True``.

        * If ``True``, the last dimension of the query/key tensor is not interleaved, i.e.,
          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
          dimensions ``([..., head_dim//2:])``.

        * If ``False``, the last dimension of the query/key tensor is interleaved, i.e.,
          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.
    fused_set_kv_buffer_arg : FusedSetKVBufferArg
        Fuse the set-kv-buffer operation into this kernel

    Note
    ----
    The rotary dimension is determined by the cosine cache and sine cache.
    zcos_sin_cache should be float32Nzk_scale is not yet supportedzv_scale is not yet supportedza.cache_loc.dtype=)r,   r   float32r.   r   r:   r;   r<   int64r   r    apply_rope_pos_ids_cos_sin_cacher   rG   longr7   r8   r9   )	rI   rJ   rK   rF   rL   rM   rN   r   ar   r   r   %apply_rope_with_cos_sin_cache_inplace  s>   * 
rT   c                 C   s   t jjj| ||||| d S N)r   r   r   rotary_embeddingr   )rI   rJ   rK   rF   rL   rM   r   r   r   rV   d  s   
rV      kvk_outv_outr:   r;   locmultoffsetc	           	      C   s"   t jj| ||||||||	 d S rU   )r   r   r   downcast_fp8)	rX   rY   rZ   r[   r:   r;   r\   r]   r^   r   r   r   r_   q  s   r_   c                 C   s   t jj| | d S rU   )r   r   r   copy_to_gpu_no_cer%   r   r   r   r`     rH   r`   k_nopek_ropec                 C   s   t jj| || d S rU   )r   r   r   concat_mla_k)rX   ra   rb   r   r   r   rc     s   rc   rS   bc                 C   sP   | j ^ }}tjg || j d |j d  R | j| jd}tjj| || |S )Nr!   r*   )r$   r   r/   r+   r,   r   r   concat_mla_absorb_q)rS   rd   
batch_dims_r
   r   r   r   re     s   $re   Fg        i'  tdimflip_sin_to_cosdownscale_freq_shiftscale
max_periodr,   c           	   	   C   s@   t j}| jd }t j||f|| jd}t jj| ||||||S )a  
    Create sinusoidal timestep embeddings.

    # TODO: review, output dtype always be float32. According to python code:
    #  sglang/python/sglang/multimodal_gen/runtime/layers/visual_embedding.py

    Args:
        t: Tensor of shape [B] with timesteps
        dim: Embedding dimension
        max_period: Controls the minimum frequency of the embeddings

    Returns:
        Tensor of shape [B, dim] with embeddings
    r   )r,   r+   )r   rO   r$   r/   r+   r   r   timestep_embedding)	rh   ri   rj   rk   rl   rm   r,   
batch_sizer   r   r   r   rn     s   
rn   )r   NN)r   NrU   )TNN)T)rW   r   )"dataclassesr   typingr   r   r   sgl_kernel.utilsr   rA   rC   boolr   r   r   r   r&   r0   r3   r4   versionhipr5   r6   rG   intrT   rV   r_   r`   rc   re   rO   r,   rn   r   r   r   r   <module>   sN   
+
(
+
%

	
\
	



