import torch
import triton
import triton.language as tl
from einops import rearrange


@triton.jit
def _fwd_diag_kernel(Q, K, V, Out, S, b: tl.constexpr, h: tl.constexpr,
                     n: tl.constexpr, d: tl.constexpr, e: tl.constexpr,
                     BLOCK: tl.constexpr, NUM_BLOCK: tl.constexpr,
                     CBLOCK: tl.constexpr):
    # Computes the block-diagonal part of lightning attention: each query
    # attends only to keys inside its own BLOCK, with an exponential decay
    # exp(-s * (i - j)) applied to every (query i, key j) pair.
    off = tl.program_id(0)
    off_bh = off // NUM_BLOCK  # batch-head index
    off_block = off % NUM_BLOCK  # sequence-block index
    off_cblock = tl.program_id(1)  # sub-block (CBLOCK) index inside the block

    off_h = off_bh % h  # head index

    # Base offsets of this (batch, head) slice.
    qk_offset = off_bh * n * d
    v_offset = off_bh * n * e
    o_offset = off_bh * n * e

    # Offsets of the current BLOCK within the sequence.
    block_offset = off_block * BLOCK
    qk_block_offset = block_offset * d
    v_block_offset = block_offset * e
    o_block_offset = block_offset * e

    # Offsets of the current CBLOCK within the BLOCK.
    cblock_offset = off_cblock * CBLOCK
    q_cblock_offset = cblock_offset * d
    o_cblock_offset = cblock_offset * e

    Q_block_ptr = (Q + qk_offset + qk_block_offset + q_cblock_offset +
                   tl.arange(0, CBLOCK)[:, None] * d +
                   tl.arange(0, d)[None, :])
    K_trans_block_ptr = (K + qk_offset + qk_block_offset +
                         tl.arange(0, CBLOCK)[None, :] * d +
                         tl.arange(0, d)[:, None])
    V_block_ptr = (V + v_offset + v_block_offset +
                   tl.arange(0, CBLOCK)[:, None] * e +
                   tl.arange(0, e)[None, :])
    O_block_ptr = (Out + o_offset + o_block_offset + o_cblock_offset +
                   tl.arange(0, CBLOCK)[:, None] * e +
                   tl.arange(0, e)[None, :])

    S_block_ptr = S + off_h
    s = tl.load(S_block_ptr)  # per-head decay rate

    i = off_cblock
    q_index = tl.arange(0, CBLOCK) + i * CBLOCK

    q = tl.load(Q_block_ptr,
                mask=block_offset + q_index[:, None] < n,
                other=0.0).to(tl.float32)

    qkv = tl.zeros([CBLOCK, e], dtype=tl.float32)

    # Accumulate over all sub-blocks up to and including the current one
    # (causal within the block).
    for j in range(i + 1):
        kv_index = tl.arange(0, CBLOCK) + j * CBLOCK
        diff = q_index[:, None] - kv_index[None, :]
        s_index = s * diff
        # Causal mask: positions after the query get -inf before exp().
        s_index = tl.where(diff >= 0, -s_index, float("-inf"))
        decay = tl.exp(s_index)

        k_trans = tl.load(K_trans_block_ptr,
                          mask=block_offset + kv_index[None, :] < n,
                          other=0.0).to(tl.float32)
        v = tl.load(V_block_ptr,
                    mask=block_offset + kv_index[:, None] < n,
                    other=0.0).to(tl.float32)

        qk = tl.dot(q, k_trans) * decay
        qkv += tl.dot(qk, v)

        K_trans_block_ptr += CBLOCK * d
        V_block_ptr += CBLOCK * e

    tl.store(O_block_ptr,
             qkv.to(Out.dtype.element_ty),
             mask=block_offset + q_index[:, None] < n)
@triton.jit
def _fwd_kv_parallel(K, V, K_decay, KV, b: tl.constexpr, h: tl.constexpr,
                     n: tl.constexpr, d: tl.constexpr, e: tl.constexpr,
                     BLOCK: tl.constexpr, NUM_BLOCK: tl.constexpr,
                     D_FBLOCK: tl.constexpr, E_FBLOCK: tl.constexpr,
                     NUM_FBLOCK: tl.constexpr, CBLOCK: tl.constexpr,
                     NUM_CBLOCK: tl.constexpr):
    # Builds one KV outer-product summary per sequence block, in parallel:
    # each key is weighted by its decay relative to the end of the block
    # (precomputed on the host in K_decay) before accumulating k v^T.
    off_bh = tl.program_id(0)  # batch-head index
    off_block = tl.program_id(1)  # sequence-block index

    off_h = off_bh % h

    block_offset = off_block * BLOCK

    k_block_offset = block_offset * d
    v_block_offset = block_offset * e
    kv_block_offset = off_block * d * e

    k_offset = off_bh * n * d
    v_offset = off_bh * n * e
    kv_offset = off_bh * NUM_BLOCK * d * e

    K_trans_block_ptr = (K + k_offset + k_block_offset +
                         tl.arange(0, CBLOCK)[None, :] * d +
                         tl.arange(0, D_FBLOCK)[:, None])
    V_block_ptr = (V + v_offset + v_block_offset +
                   tl.arange(0, CBLOCK)[:, None] * e +
                   tl.arange(0, E_FBLOCK)[None, :])
    KV_block_ptr = (KV + kv_offset + kv_block_offset +
                    tl.arange(0, D_FBLOCK)[:, None] * e +
                    tl.arange(0, E_FBLOCK)[None, :])

    k_decay_ptr = (K_decay + off_h * BLOCK +
                   tl.arange(0, CBLOCK)[None, :])

    kv_index = tl.arange(0, CBLOCK)
    kv = tl.zeros([D_FBLOCK, E_FBLOCK], dtype=tl.float32)

    # The last block may be shorter than BLOCK; shift the pointers left so
    # the tail can still be processed with CBLOCK-sized loads.
    if off_block == NUM_BLOCK - 1:
        split_n = n - (NUM_BLOCK - 1) * BLOCK
    else:
        split_n = BLOCK
    left_shift = tl.cdiv(split_n, CBLOCK) * CBLOCK - split_n
    num_blocks = min(tl.cdiv(split_n, CBLOCK), NUM_CBLOCK)
    k_decay_ptr += (NUM_CBLOCK - num_blocks) * CBLOCK

    for j in range(num_blocks):
        left_bound = (1 - j) * left_shift
        # Load k (transposed) and v, masking out the shifted-in prefix.
        k_trans = tl.load(K_trans_block_ptr - left_shift * d,
                          mask=kv_index[None, :] >= left_bound,
                          other=0.0)
        v = tl.load(V_block_ptr - left_shift * e,
                    mask=kv_index[:, None] >= left_bound,
                    other=0.0)

        # Weight keys by their decay to the block end and accumulate k v^T.
        k_decay = tl.load(k_decay_ptr)
        kv += tl.dot(k_trans * k_decay, v)

        K_trans_block_ptr += CBLOCK * d
        V_block_ptr += CBLOCK * e
        k_decay_ptr += CBLOCK

    tl.store(KV_block_ptr, kv.to(KV.dtype.element_ty))


@triton.jit
def _fwd_kv_reduce(S, KV, KV_HISTORY, b: tl.constexpr, h: tl.constexpr,
                   n: tl.constexpr, d: tl.constexpr, e: tl.constexpr,
                   BLOCK: tl.constexpr, NUM_BLOCK: tl.constexpr,
                   D_FBLOCK: tl.constexpr, E_FBLOCK: tl.constexpr):
    # Sequential scan over the per-block KV summaries: after this kernel,
    # KV[i] holds the decayed state accumulated over all blocks before i
    # (seeded with KV_HISTORY), and KV_HISTORY is advanced to the state
    # after the last block.
    off_bh = tl.program_id(0)
    off_h = off_bh % h

    kv_offset = off_bh * NUM_BLOCK * d * e

    KV_block_ptr = (KV + kv_offset +
                    tl.arange(0, D_FBLOCK)[:, None] * e +
                    tl.arange(0, E_FBLOCK)[None, :])

    s_ptrs = S + off_h
    s = tl.load(s_ptrs)

    kv_history_offset = off_bh * d * e
    KV_HISTORY_block_ptr = (KV_HISTORY + kv_history_offset +
                            tl.arange(0, D_FBLOCK)[:, None] * e +
                            tl.arange(0, E_FBLOCK)[None, :])

    kv_pre = tl.load(KV_HISTORY_block_ptr).to(tl.float32)

    for i in range(NUM_BLOCK):
        block_size = min(n - i * BLOCK, BLOCK)
        block_decay = tl.exp(-s.to(tl.float32) * block_size)

        kv_cur = tl.load(KV_block_ptr).to(tl.float32)
        # KV[i] becomes the state *before* this block's contribution.
        tl.store(KV_block_ptr, kv_pre.to(KV_block_ptr.dtype.element_ty))

        kv_pre = block_decay * kv_pre + kv_cur
        KV_block_ptr += d * e

    # Persist the final state for the next call.
    tl.store(KV_HISTORY_block_ptr, kv_pre)


@triton.jit
def _fwd_none_diag_kernel(Q, Out, S, KV, b: tl.constexpr, h: tl.constexpr,
                          n: tl.constexpr, d: tl.constexpr, e: tl.constexpr,
                          BLOCK: tl.constexpr, NUM_BLOCK: tl.constexpr,
                          E_FBLOCK: tl.constexpr, CBLOCK: tl.constexpr,
                          NUM_CBLOCK: tl.constexpr):
    # Adds the inter-block (non-diagonal) contribution: each query attends
    # to all earlier blocks through the prefix state KV[block], scaled by
    # the query's decay inside its block, and the result is summed into the
    # diagonal output already stored in Out.
    off_bh = tl.program_id(0)
    off_h = off_bh % h

    off_nc = tl.program_id(1)
    off_n = off_nc // NUM_CBLOCK  # block index
    off_c = off_nc % NUM_CBLOCK  # sub-block index
    off_e = tl.program_id(2)  # value-feature tile (a single tile by default)

    n_offset = off_n * BLOCK
    c_offset = off_c * CBLOCK
    e_offset = off_e * E_FBLOCK
    block_offset = n_offset + c_offset

    q_offset = off_bh * n * d + (n_offset + c_offset) * d
    o_offset = off_bh * n * e + (n_offset + c_offset) * e + e_offset
    kv_offset = off_bh * NUM_BLOCK * d * e + off_n * d * e + e_offset

    Q_block_ptr = (Q + q_offset +
                   tl.arange(0, CBLOCK)[:, None] * d +
                   tl.arange(0, d)[None, :])
    O_block_ptr = (Out + o_offset +
                   tl.arange(0, CBLOCK)[:, None] * e +
                   tl.arange(0, E_FBLOCK)[None, :])
    KV_block_ptr = (KV + kv_offset +
                    tl.arange(0, d)[:, None] * e +
                    tl.arange(0, E_FBLOCK)[None, :])

    S_block_ptr = S + off_h
    s = tl.load(S_block_ptr)

    c_array = tl.arange(0, CBLOCK)

    kv = tl.load(KV_block_ptr).to(tl.float32)
    q_index = block_offset + tl.arange(0, CBLOCK)
    q = tl.load(Q_block_ptr,
                mask=q_index[:, None] < n,
                other=0.0).to(tl.float32)

    # Decay of each query position relative to the start of its block.
    q_decay = tl.exp(-s.to(tl.float32) * (c_offset + c_array[:, None]))
    qkv_none_diag = tl.dot(q, kv) * q_decay

    # Combine with the intra-block result written by _fwd_diag_kernel.
    qkv_diag = tl.load(O_block_ptr,
                       mask=q_index[:, None] < n,
                       other=0.0).to(tl.float32)
    qkv = qkv_diag + qkv_none_diag

    tl.store(O_block_ptr,
             qkv.to(Out.dtype.element_ty),
             mask=q_index[:, None] < n)
class _attention(torch.autograd.Function):

    @staticmethod
    def forward(ctx, q, k, v, s, kv_history):
        q = q.contiguous()
        k = k.contiguous()
        v = v.contiguous()
        s = s.contiguous()

        capability = torch.cuda.get_device_capability()
        if capability[0] < 8:
            raise RuntimeError("Flash attention currently only supported "
                               "for compute capability >= 80")

        b, h, n, d = q.shape
        e = v.shape[-1]
        o = torch.empty((b, h, n, e), dtype=q.dtype, device=q.device)

        BLOCK = 256
        NUM_BLOCK = triton.cdiv(n, BLOCK)

        CBLOCK = 32
        NUM_CBLOCK = BLOCK // CBLOCK
        assert BLOCK % CBLOCK == 0, "BLOCK must be a multiple of CBLOCK"

        # Per-position decay of each key relative to the end of its block,
        # used when building the per-block KV summaries.
        array = torch.arange(0, BLOCK, device=q.device) + 1
        k_decay = torch.exp(-s * (BLOCK - array.reshape(1, -1)))

        # Step 1: intra-block (diagonal) attention.
        grid = (b * h * NUM_BLOCK, NUM_CBLOCK)
        _fwd_diag_kernel[grid](q, k, v, o, s, b, h, n, d, e,
                               BLOCK=BLOCK,
                               NUM_BLOCK=NUM_BLOCK,
                               CBLOCK=CBLOCK)

        NUM_FBLOCK = 1
        D_FBLOCK = d // NUM_FBLOCK
        assert d % NUM_FBLOCK == 0
        E_FBLOCK = e // NUM_FBLOCK
        assert e % NUM_FBLOCK == 0

        CBLOCK = 64
        NUM_CBLOCK = BLOCK // CBLOCK
        assert BLOCK % CBLOCK == 0, "BLOCK must be a multiple of CBLOCK"

        # Step 2: per-block KV outer-product summaries, computed in parallel.
        kv = torch.empty((b, h, NUM_BLOCK, d, e),
                         dtype=torch.float32, device=q.device)
        grid = (b * h, NUM_BLOCK)
        _fwd_kv_parallel[grid](k, v, k_decay, kv, b, h, n, d, e,
                               BLOCK=BLOCK,
                               NUM_BLOCK=NUM_BLOCK,
                               D_FBLOCK=D_FBLOCK,
                               E_FBLOCK=E_FBLOCK,
                               NUM_FBLOCK=NUM_FBLOCK,
                               CBLOCK=CBLOCK,
                               NUM_CBLOCK=NUM_CBLOCK)

        # Step 3: sequential scan over blocks, chaining the decayed KV state
        # through kv_history.
        grid = (b * h, NUM_FBLOCK)
        _fwd_kv_reduce[grid](s, kv, kv_history, b, h, n, d, e,
                             BLOCK=BLOCK,
                             NUM_BLOCK=NUM_BLOCK,
                             D_FBLOCK=D_FBLOCK,
                             E_FBLOCK=E_FBLOCK)

        # Step 4: inter-block (non-diagonal) attention via the KV states.
        grid = (b * h, NUM_BLOCK * NUM_CBLOCK)
        _fwd_none_diag_kernel[grid](q, o, s, kv, b, h, n, d, e,
                                    BLOCK=BLOCK,
                                    NUM_BLOCK=NUM_BLOCK,
                                    E_FBLOCK=E_FBLOCK,
                                    CBLOCK=CBLOCK,
                                    NUM_CBLOCK=NUM_CBLOCK)

        ctx.save_for_backward(q, k, v, s)
        ctx.BLOCK = BLOCK
        # The final entry along dim 2 is the updated KV history.
        return o, torch.cat([kv, kv_history.unsqueeze(2)], dim=2)
lightning_attention_ = _attention.apply


def lightning_attention(q, k, v, ed, block_size=256, kv_history=None):
    """
    Apply the lightning attention algorithm to compute attention efficiently.

    Args:
        q: Query tensor of shape [batch, heads, seq_len, dim]
        k: Key tensor of shape [batch, heads, seq_len, dim]
        v: Value tensor of shape [batch, heads, seq_len, dim_v]
        ed: Decay rate tensor of shape [heads]
        block_size: Size of blocks for block-sparse attention
        kv_history: Optional key-value history from previous computations

    Returns:
        output: Attention output
        kv: Updated key-value history
    """
    d = q.shape[-1]
    e = v.shape[-1]

    if ed.dim() == 1:
        ed = ed.view(1, -1, 1, 1)

    # Split the head dimension into chunks of at most m features so each
    # kernel launch works on a fixed-size feature block.
    m = 128 if d >= 128 else 64
    assert d % m == 0, f"Dimension d ({d}) must be divisible by m ({m})"
    arr = [m * i for i in range(d // m + 1)]
    if arr[-1] != d:
        arr.append(d)
    n = len(arr)

    # NOTE: block_size is accepted for API compatibility; the kernels use a
    # fixed internal block size.
    output = 0
    if kv_history is None:
        kv_history = torch.zeros((q.shape[0], q.shape[1], d, e),
                                 dtype=torch.float32, device=q.device)
    else:
        kv_history = kv_history.clone().contiguous()

    # Accumulate attention over each feature chunk of q and k.
    for i in range(n - 1):
        s = arr[i]
        e1 = arr[i + 1]
        q1 = q[..., s:e1]
        k1 = k[..., s:e1]
        o, kv = lightning_attention_(q1, k1, v, ed, kv_history)
        output = output + o
    return output, kv


@triton.jit
def _linear_attn_decode_kernel(
    q_ptr,
    k_ptr,
    v_ptr,
    kv_cache_ptr,
    slope_rate,
    slot_idx,
    output_ptr,
    D: tl.constexpr,
    qkv_b_stride,
    qkv_h_stride,
    cache_b_stride,
    cache_h_stride,
    cache_d0_stride,
    cache_d1_stride,
    BLOCK_SIZE: tl.constexpr,
):
    """
    Kernel for linear attention decoding with KV cache.

    This kernel computes attention for a single token using the KV cache.
    """
    pid_b = tl.program_id(0)  # batch
    pid_h = tl.program_id(1)  # head
    pid_d = tl.program_id(2)  # value-dimension tile

    # Cache slot for this sequence; -1 marks a padded entry.
    slot_id = tl.load(slot_idx + pid_b)
    if slot_id == -1:
        return

    batch_id = pid_b
    head_id = pid_h

    # Per-head decay rate.
    ratio = tl.load(slope_rate + pid_h)

    qk_d_offsets = tl.arange(0, D)
    v_d_offsets = tl.arange(0, BLOCK_SIZE) + pid_d * BLOCK_SIZE
    cache_d_offsets = (qk_d_offsets[:, None] * cache_d0_stride +
                       v_d_offsets[None, :] * cache_d1_stride)

    q_offset = batch_id * qkv_b_stride + head_id * qkv_h_stride
    k_offset = batch_id * qkv_b_stride + head_id * qkv_h_stride
    v_offset = batch_id * qkv_b_stride + head_id * qkv_h_stride
    cache_offset = slot_id * cache_b_stride + head_id * cache_h_stride

    qk_mask = qk_d_offsets < D
    v_mask = v_d_offsets < D

    q = tl.load(q_ptr + q_offset + qk_d_offsets, mask=qk_mask, other=0.0)
    k = tl.load(k_ptr + k_offset + qk_d_offsets, mask=qk_mask, other=0.0)
    v = tl.load(v_ptr + v_offset + v_d_offsets, mask=v_mask, other=0.0)

    # Rank-1 update for the current token: k v^T.
    kv_outer = k[:, None] * v[None, :]
    kv_mask = qk_mask[:, None] & v_mask[None, :]

    # Decay the cached state by exp(-ratio), then add the new outer product.
    ratio = tl.exp(-ratio)
    kv_ptr = kv_cache_ptr + cache_offset + cache_d_offsets
    kv_cache_old = tl.load(kv_ptr, mask=kv_mask, other=0.0)
    kv_outer = kv_outer + ratio * kv_cache_old

    # Output for this tile: q^T @ (decayed cache + k v^T).
    output = q[:, None].to(tl.float32) * kv_outer
    output = tl.sum(output, axis=0)

    # Write back the updated cache and the attention output.
    tl.store(kv_ptr, kv_outer, mask=kv_mask)
    tl.store(output_ptr + q_offset + v_d_offsets, output, mask=v_mask)


def linear_decode_forward_triton(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    kv_caches: torch.Tensor,
    slope_rate: torch.Tensor,
    slot_idx: torch.Tensor,
    BLOCK_SIZE: int = 32,
) -> torch.Tensor:
    """
    Perform linear attention decoding using Triton kernels.

    Args:
        q: Query tensor of shape [B, H, 1, D]
        k: Key tensor of shape [B, H, 1, D]
        v: Value tensor of shape [B, H, 1, D]
        kv_caches: Key-value cache tensor
        slope_rate: Decay rate tensor
        slot_idx: Slot indices for batches
        BLOCK_SIZE: Size of blocks for processing

    Returns:
        output: Attention output tensor
    """
    B, H, _, D = q.shape
    assert k.shape == (B, H, 1, D)
    assert v.shape == (B, H, 1, D)

    output = torch.empty_like(q)

    # One program per (batch, head, value tile).
    grid = (B, H, D // BLOCK_SIZE)

    qkv_b_stride = q.stride(0)
    qkv_h_stride = q.stride(1)

    cache_b_stride = kv_caches.stride(0)
    cache_h_stride = kv_caches.stride(1)
    cache_d0_stride = kv_caches.stride(2)
    cache_d1_stride = kv_caches.stride(3)

    _linear_attn_decode_kernel[grid](
        q, k, v, kv_caches, slope_rate, slot_idx, output,
        D, qkv_b_stride, qkv_h_stride,
        cache_b_stride, cache_h_stride,
        cache_d0_stride, cache_d1_stride,
        BLOCK_SIZE=BLOCK_SIZE,
    )

    output = rearrange(output, "b h n d -> b n (h d)")
    return output.squeeze(0)
ejfddZdS )BailingLinearKernela  
    Linear attention kernel implementation for Bailing models.

    This class is adapted from MiniMaxText01LinearKernel in vllm:
    https://github.com/vllm-project/vllm/blob/a9138e85b14047e06300685b48e3485b995425fb/vllm/model_executor/models/minimax_text_01.py#L289

    The implementation maintains the same functionality while being renamed to
    match our Bailing model naming convention.
    """

    @staticmethod
    def jit_linear_forward_prefix(
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        kv_caches: torch.Tensor,
        slope_rate: torch.Tensor,
        block_size: int,
        layer_idx: int = None,
        **kwargs,
    ) -> torch.Tensor:
        # layer_idx is accepted for interface parity and is not used here.
        slope_rate = slope_rate.to(torch.float32)

        # Accept [H, N, D] inputs by padding a batch dimension.
        should_pad_dim = q.dim() == 3
        if should_pad_dim:
            q = q.unsqueeze(0)
            k = k.unsqueeze(0)
            v = v.unsqueeze(0)

        b, h, n, d = q.shape
        e = d
        kv_history = kv_caches.reshape(1, h, d, e).contiguous()
        output, kv = lightning_attention(q, k, v, slope_rate,
                                         block_size=block_size,
                                         kv_history=kv_history)

        # The last entry along dim 2 is the final KV state; write it back
        # into the cache in place.
        kv_caches.copy_(kv[:, :, -1, :, :].reshape(h, d, e))

        assert output.shape[0] == 1, "batch size must be 1"
        return output.squeeze(0).transpose(0, 1).reshape(n, h * d)
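
# Minimal usage sketch (added illustration; shapes and values are assumed,
# and a CUDA device with compute capability >= 8.0 is required). Exercises
# the prefill path, the single-token decode path, and the Bailing prefix
# wrapper.
if __name__ == "__main__":
    B, H, N, D = 1, 4, 512, 128
    device = "cuda"
    q = torch.randn(B, H, N, D, device=device, dtype=torch.float16)
    k = torch.randn(B, H, N, D, device=device, dtype=torch.float16)
    v = torch.randn(B, H, N, D, device=device, dtype=torch.float16)
    slope = torch.rand(H, device=device, dtype=torch.float32)

    # Prefill: blocked attention plus the chain of KV states.
    out, kv = lightning_attention(q, k, v, slope)
    print(out.shape, kv.shape)  # (B, H, N, D) and (B, H, NUM_BLOCK + 1, D, D)

    # Decode: one token per sequence against a per-slot KV cache.
    kv_cache = torch.zeros(B, H, D, D, device=device, dtype=torch.float32)
    slot_idx = torch.arange(B, device=device, dtype=torch.int32)
    q1 = torch.randn(B, H, 1, D, device=device, dtype=torch.float32)
    k1 = torch.randn(B, H, 1, D, device=device, dtype=torch.float32)
    v1 = torch.randn(B, H, 1, D, device=device, dtype=torch.float32)
    step = linear_decode_forward_triton(q1, k1, v1, kv_cache, slope, slot_idx)
    print(step.shape)  # (1, H * D) once the B == 1 batch dim is squeezed

    # Bailing prefix wrapper: [H, N, D] in, [N, H * D] out, cache updated
    # in place.
    cache = torch.zeros(H, D, D, device=device, dtype=torch.float32)
    ctx_out = BailingLinearKernel.jit_linear_forward_prefix(
        q.squeeze(0), k.squeeze(0), v.squeeze(0), cache, slope,
        block_size=256)
    print(ctx_out.shape)  # (N, H * D)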