o
    پi                     @   s   d dl mZ d dlZd dlmZ d dlmZ ddlmZ ddddddefdej	d	ej	d
eej	 deej	 deej	 deej	 deej	 dee
 defddZddddefdej	dej	d	ej	d
eej	 dee
 deej	 deej	 defddZdS )    )OptionalN)causal_conv1d_fwd)causal_conv1d_update   )PAD_SLOT_IDsiluxweightbiasquery_start_loccache_indiceshas_initial_stateconv_states
activationpad_slot_idc	           
   
   K   sZ   |dvrt d| ddkr|  } |dur| nd}t| |||||||dv |	 | S )a  
    x: (batch, dim, seqlen) or (dim,cu_seq_len) for varlen
        sequences are concatenated from left to right for varlen
    weight: (dim, width)
    bias: (dim,)
    query_start_loc: (batch + 1) int32
        The cumulative sequence lengths of the sequences in
        the batch, used to index into sequence. prepended by 0.
        for example: query_start_loc = torch.Tensor([0,10,16,17]),
        x.shape=(dim,17)
    cache_indices: (batch)  int32
        indicates the corresponding state index,
        like so: conv_state = conv_states[cache_indices[batch_id]]
    has_initial_state: (batch) bool
        indicates whether should the kernel take the current state as initial
        state for the calculations
    conv_states: (...,dim,width - 1) itype
        updated inplace if provided
    activation: either None or "silu" or "swish"
    pad_slot_id: int
            if cache_indices is passed, lets the kernel identify padded
            entries that will not be processed,
            for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
            in this case, the kernel will not process entries at
            indices 0 and 3


    out: (batch, dim, seqlen)
    Nr   swishz'activation must be None, silu, or swishr   Nr   r   )NotImplementedErrorstride
contiguousr   )
r   r	   r
   r   r   r   r   r   r   kwargs r   c/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/mamba/causal_conv1d.pycausal_conv1d_fn   s"   )r   
conv_statecache_seqlensconv_state_indicesc           
   	   C   s`   |dvrt d| |dv }|  dk}	|	r| d} t| ||||||| |	r.| d} | S )a  
    x: (batch, dim) or (batch, dim, seqlen)
    conv_state: (batch, dim, state_len), where state_len >= width - 1
    weight: (dim, width)
    bias: (dim,)
    cache_seqlens: (batch,), dtype int32.
        If not None, the conv_state is treated as a circular buffer.
        The conv_state will be updated by copying x to the conv_state
        starting at the index
        @cache_seqlens % state_len.
    conv_state_indices: (batch,), dtype int32
        If not None, the conv_state is a larger tensor along the batch dim,
        and we are selecting the batch coords specified by conv_state_indices.
        Useful for a continuous batching scenario.
    pad_slot_id: int
            if cache_indices is passed, lets the kernel identify padded
            entries that will not be processed,
            for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
            in this case, the kernel will not process entries at
            indices 0 and 3
    out: (batch, dim) or (batch, dim, seqlen)
    r   z1activation must be None, silu, or swish, actual: r      r   )r   dim	unsqueezecausal_conv1d_update_kernelsqueeze)
r   r   r	   r
   r   r   r   r   activation_valr!   r   r   r   r   M   s*    


r   )typingr   torch
sgl_kernelr   r   r"   causal_conv1d_tritonr   Tensorstrintr   r   r   r   r   <module>   sj   	
A