o
    i"                  &   @   sF  d Z ddlZddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZmZmZ ejdd	 Zejd
d Zejdd ZdejfddZe										d/dejdejdejdejdejdejdejdeej deej dedeej dedeej d ed!eej d"eej d#ed$ejf$d%d&Zed'd(d)dejdejdejdejdejdejdejdeej deej dedeej ded*ejd ed!eej d"eej d#ed$df$d+d,Zed'dejdejdejdejdejdejdejdeej deej dedeej ded*ejd ed!eej d"eej d#ed$df$d-d.ZdS )0a3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)Optional   )flashinfer_api)!gen_selective_state_update_module&gen_selective_state_update_sm90_module'gen_selective_state_update_sm100_module)get_compute_capabilityregister_custom_opregister_fake_opc                   C   
   t   S )zEGet cached JIT-compiled selective_state_update module (base version).)r   build_and_load r   r   ]/home/ubuntu/vllm_env/lib/python3.10/site-packages/flashinfer/mamba/selective_state_update.py&get_selective_state_update_module_base      
r   c                   C   r   )zLGet cached JIT-compiled selective_state_update module (SM90/Hopper version).)r   r   r   r   r   r   &get_selective_state_update_module_sm90%   r   r   c                   C   r   )zQGet cached JIT-compiled selective_state_update module (SM100+/Blackwell version).)r   r   r   r   r   r   'get_selective_state_update_module_sm100+   r   r   devicec                 C   s.   t | \}}|dkrt S |dkrt S t S )N
   	   )r   r   r   r   )r   major_r   r   r   !get_selective_state_update_module1   s   r   FstatexdtABCDzdt_biasdt_softplusstate_batch_indicespad_slot_idoutdisable_state_updateintermediate_states_bufferintermediate_state_indicescache_stepsreturnc                 C   s  |dk}|   dkr| d} |  dkr|d}|  dkr%|d}|dur4|  dkr4|d}|  dkr?|d}|rL|  dkrL|d}|  dkrW|d}|rd|  dkrd|d}|  dkro|d}|r||  dkr||d}|  dkr|d}|r|  dkr|d}|dur|  dkr|d}|r|  dkr|d}|du rt|}n|}t| |||||||||	|
|||||| |S )a[
  Selective state update operation for Mamba layers (the generation phase).

    Parameters
    ----------
    state : torch.Tensor
        State tensor with shape (state_cache_size, dim, dstate) or (state_cache_size, nheads, dim, dstate)
    x : torch.Tensor
        Input tensor with shape (batch, dim) or (batch, nheads, dim) for single-token
        or (batch, T, nheads, dim) for multi-token
    dt : torch.Tensor
        Delta time tensor with shape (batch, dim) or (batch, nheads, dim) for single-token
        or (batch, T, nheads, dim) for multi-token
    A : torch.Tensor
        A matrix with shape (dim, dstate) or (nheads, dim, dstate)
    B : torch.Tensor
        B matrix with shape (batch, dstate) or (batch, ngroups, dstate) for single-token
        or (batch, T, ngroups, dstate) for multi-token
    C : torch.Tensor
        C matrix with shape (batch, dstate) or (batch, ngroups, dstate) for single-token
        or (batch, T, ngroups, dstate) for multi-token
    D : torch.Tensor
        D vector with shape (dim,) or (nheads, dim)
    z : Optional[torch.Tensor]
        Optional z tensor with shape (batch, dim) or (batch, nheads, dim) for single-token
        or (batch, T, nheads, dim) for multi-token
    dt_bias : Optional[torch.Tensor]
        Optional dt bias with shape (dim,) or (nheads, dim)
    dt_softplus : bool
        Whether to apply softplus to dt
    state_batch_indices : Optional[torch.Tensor]
        Optional batch indices for cache processing with shape (batch,)
    pad_slot_id : int
        If state_batch_indices is passed, lets the kernel identify padded entries
        that will not be processed. For example: state_batch_indices = [pad_slot_id, 1, 20, pad_slot_id]
        in this case, the kernel will not process entries at indices 0 and 3
    out : Optional[torch.Tensor]
        Optional output tensor (same shape as x)
    disable_state_update : bool
        If True, skip updating the state tensor (useful for speculative decoding verification)
    intermediate_states_buffer : Optional[torch.Tensor]
        Optional buffer for caching intermediate states during speculative decoding
        with shape (batch, cache_steps, nheads, dim, dstate)
    intermediate_state_indices : Optional[torch.Tensor]
        Optional indices mapping batch elements to intermediate state buffer positions
        with shape (batch,)
    cache_steps : int
        Number of steps/tokens to cache for speculative decoding

    Returns
    -------
    output : torch.Tensor
        Output tensor with shape (batch, dim) or (batch, nheads, dim) for single-token
        or (batch, T, nheads, dim) for multi-token
          r   r   N)dim	unsqueezetorch
empty_like_selective_state_update)r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   is_mtpoutputr   r   r   selective_state_update>   sj   K













r5   z"flashinfer::selective_state_update)r   r4   r(   )mutates_argsr4   c                 C   s4   t | j| |||||||||	|
|||||| dS )zLInternal function registered with torch.library for torch.compile() support.N)r   r   r5   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r4   r'   r(   r)   r*   r   r   r   r2      s&   
r2   c                 C   s   dS )z@Fake implementation for torch.compile() meta tensor propagation.Nr   r7   r   r   r   _selective_state_update_fake   s   r8   )
NNFNr   NFNNr   )__doc__	functoolstypingr   r0   api_loggingr   	jit.mambar   r   r   utilsr   r	   r
   cacher   r   r   r   r   Tensorboolintr5   r2   r8   r   r   r   r   <module>   s   


		
 	
)	
