o
    
۾iQ                     @   s  d dl Z d dlmZ d dlmZmZ ejdejdejdejdejdejd	ejfd
dZejdejdejdejdejdejdejdejdejd	ejdejfddZ	ejdejdejdejdejdejdejdejfddZ
ejdejdejdejdejdejdejd	ejdejfddZG dd de jjZejZ		d,de jde jde jde jdede jdB dee je jf fd d!Zejd"ejd#ejfd$d%Z	&d-de jde jde jd'e jd(e jd)e jd#ede jfd*d+ZdS ).    N)	rearrange)tltritonbhdeBLOCKCBLOCKc           .      C   s.  t d}|| }|| }t d}|| }|| | }|| |	 }|| |	 }||
 }|| }||	 }||	 }|| }|| }||	 }| | | | t d|d d d f |  t d|d d d f  }|| | t d|d d d f |  t d|d d d f  }|| | t d|d d d f |	  t d|	d d d f  }|| | | t d|d d d f |	  t d|	d d d f  }|| } t | }!|}"t d||"|  }#t j|||#d d d f  |k ddt j}$t j||	gt jd}%t|"d D ]z}&t d||&|  }'|#d d d f |'d d d f  }(|!|( })t |(dk|) t	d})t 
|)}*t j|||'d d d f  |k ddt j}+t j|||'d d d f  |k ddt j},t |$|+|* }-|%t |-|,7 }%||| 7 }|||	 7 }qt j||%|jj||#d d d f  |k d d S )Nr              maskotherdtypez-infr   )r   
program_idarangeloadtofloat32zerosrangewherefloatexpdotstorer   
element_ty).QKVOutSr   r   nr   r   r	   	NUM_BLOCKr
   offoff_bh	off_block
off_cblockoff_h	qk_offsetv_offseto_offsetblock_offsetqk_block_offsetv_block_offseto_block_offsetcblock_offsetq_cblock_offseto_cblock_offsetQ_block_ptrK_trans_block_ptrV_block_ptrO_block_ptrS_block_ptrsiq_indexqqkvjkv_indexdiffs_indexdecayk_transvqk rH   ]/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/layers/lightning_attn.py_fwd_diag_kernel
   s   

	

$ 

rJ   D_FBLOCKE_FBLOCK
NUM_FBLOCK
NUM_CBLOCKc           (      C   s  t d}t d}|| }||	 }|| }|| }|| | }|| | }|| | }||
 | | }| | | t d|d d d f |  t d|d d d f  }|| | t d|d d d f |  t d|d d d f  }|| | t d|d d d f |  t d|d d d f  }|||	  t d| }t d|}t j||gt jd}||
d kr||
d |	  n|	} t | || |  }!tt | ||}"|||" | 7 }t|"D ]W}#d|# |! }$t j||!|  |d d d f |$kdd}%t j||!|  |d d d f |$kdd}&t |}'|'d d d f }'|t 	|%|' |&7 }||| 7 }||| 7 }||7 }qt 
|||jj d S )Nr   r   r   r   r   )r   r   r   r   r   cdivminr   r   r   r   r   r   r   )(r!   r"   K_decayKVr   r   r%   r   r   r	   r&   rK   rL   rM   r
   rN   r(   r)   r+   r/   k_block_offsetr1   kv_block_offsetk_offsetr-   	kv_offsetr7   r8   KV_block_ptrk_decay_ptrrA   kvsplit_n
left_shift
num_blocksr@   
left_boundrE   rF   k_decayrH   rH   rI   _fwd_kv_parallel   s   

	 



r_   c                 C   sR  t d}|| }||	 | | }|| t d|
d d d f |  t d|d d d f  }| | }t |}|| | }|| t d|
d d d f |  t d|d d d f  }t |t j}t|	D ]7}t|||  |}t |t j | }t |t j}t 	|||j
j || | }||| 7 }qit 	|| d S )Nr   )r   r   r   r   r   r   r   rP   r   r   r   r   )r$   rR   
KV_HISTORYr   r   r%   r   r   r	   r&   rK   rL   r(   r+   rV   rW   s_ptrsr;   kv_history_offsetKV_HISTORY_block_ptrkv_prer<   
block_sizeblock_decaykv_currH   rH   rI   _fwd_kv_reduce   s>   

rh   c           (      C   s`  t d}|| }t d}|| }|| }t d}||	 }|| }|| }|| }|| | || |  }|| | || |  | }||
 | | || |  | }| | t d|d d d f |  t d|d d d f  }|| t d|d d d f |  t d|d d d f  }|| t d|d d d f |  t d|d d d f  }|| }t |}t d|} t |t j}!|t d| }"t j||"d d d f |k ddt j}#t |t j || | d d d f   }$t |#|!|$ }%t j||"d d d f |k ddt j}&|&|% }'t j||'|j	j
|"d d d f |k d d S )Nr   r      r   r   r   )r   r   r   r   r   r   r   r   r   r   r   )(r    r#   r$   rR   r   r   r%   r   r   r	   r&   rL   r
   rN   r(   r+   off_ncoff_noff_coff_en_offsetc_offsete_offsetr/   q_offsetr.   rV   r6   r9   rW   r:   r;   c_arrayrY   r=   r>   q_decayqkv_none_diagqkv_diagr?   rH   rH   rI   _fwd_none_diag_kernel9  sL   


 ::
(,( 
rv   c                   @   s   e Zd Zedd ZdS )
_attentionc                 C   sR  |  }|  }|  }|  }tj }|d dk r tdd|j\}}}	}
|jd }tj|||	|f|j|jd}d}t	
|	|}d}|| }|| dksRJ d	tjd||jd
d }t| ||dd  }|| | |f}t| ||||||||	|
||||d d}|
| }|
| dksJ || }|| dksJ d}|| }|| dksJ d	tj||||
|ftj|jd}|| |f}t| |||||||	|
||||||||d || |f}t| ||||||	|
|||||d || || f}t| |||||||	|
||||||d | ||||| || _|tj||dgddfS )Nr      z(Flash attention currently only supportedzfor compute capability >= 80r   device       z"BLOCK must be a multiple of CBLOCK)r{   r   )r	   r&   r
   @   )r	   r&   rK   rL   rM   r
   rN   )r	   r&   rK   rL   )r	   r&   rL   r
   rN   ri   )dim)
contiguoustorchcudaget_device_capabilityRuntimeErrorshapeemptyr   r{   r   rO   r   r   reshaperJ   r   r_   rh   rv   save_for_backwardr	   cat	unsqueeze)ctxr>   krF   r;   
kv_history
capabilityr   r   r%   r   r   or	   r&   r
   rN   arrayr^   gridrM   rK   rL   rY   rH   rH   rI   forward  s   

z_attention.forwardN)__name__
__module____qualname__staticmethodr   rH   rH   rH   rI   rw     s    rw   r|   r>   r   rF   edre   r   returnc                    sJ  | j d }|j d }| dkr|dddd}|dkrdnd |  dks1J d| d  d fd	d
t|  d D }|d |krK|| t|}	d}
|du rjtj| j d | j d ||ftj| j	d}n|
  }t|	d D ]*}|| }||d  }| d||f }|d||f }t|||||\}}|
| }
qv|
|fS )a$  
    Apply lightning attention algorithm
    to compute attention efficiently.

    Args:
        q: Query tensor of shape [batch, heads, seq_len, dim]
        k: Key tensor of shape [batch, heads, seq_len, dim]
        v: Value tensor of shape [batch, heads, seq_len, dim_v]
        ed: Decay rate tensor of shape [heads]
        block_size: Size of blocks for block-sparse attention
        kv_history: Optional key-value history from previous computations

    Returns:
        output: Attention output
        kv: Updated key-value history
    ry   r      r~   r   zDimension d (z) must be divisible by m ()c                    s   g | ]} | qS rH   rH   ).0r<   mrH   rI   
<listcomp>4  s    z'lightning_attention.<locals>.<listcomp>Nrz   .)r   r   viewr   appendlenr   r   r   r{   cloner   lightning_attention_)r>   r   rF   r   re   r   r   r   arrr%   outputr<   r;   q1k1r   rY   rH   r   rI   lightning_attention  s0   

"

r   D
BLOCK_SIZEc           '      C   s   t d}t d}t d}t || t j}|dkr dS |}|}t || }t d|}t d|||  }|dddf | |dddf |  }|| ||	  }|| ||	  }|| ||	  }||
 ||  }||k }||k }t j| | | |dd}t j|| | |dd} t j|| | |dd}!| dddf |!dddf  }"|dddf |dddf @ }#t | }|| | }$t j|$|#dd}%|"||%  }"|dddf t j|" }&t j|&dd}&t j	|$|"|#d	 t j	|| | |&|d	 dS )
z
    Kernel for linear attention decoding with KV cache.

    This kernel computes attention for a single token using the KV cache.
    r   r   ri   ry   Nr   r   )axisr   )
r   r   r   r   int64r   r   r   sumr   )'q_ptrk_ptrv_ptrkv_cache_ptr
slope_rateslot_idx
output_ptrr   qkv_b_strideqkv_h_stridecache_b_stridecache_h_stridecache_d0_stridecache_d1_strider   pid_bpid_hpid_dslot_idbatch_idhead_idratioqk_d_offsetsv_d_offsetscache_d_offsetsrq   rU   r-   cache_offsetqk_maskv_maskr>   r   rF   kv_outerkv_maskkv_ptrkv_cache_oldr   rH   rH   rI   _linear_attn_decode_kernelM  s@   


&  r   r}   	kv_cachesr   r   c                 C   s   | j \}}}	}
|j ||d|
fksJ |j ||d|
fksJ t| }|||
| f}| d}| d}|d}|d}|d}|d}t| | |||||||
|||||||d t|d}|d S )a  
    Perform linear attention decoding using Triton kernels.

    Args:
        q: Query tensor of shape [B, H, 1, D]
        k: Key tensor of shape [B, H, 1, D]
        v: Value tensor of shape [B, H, 1, D]
        kv_caches: Key-value cache tensor
        slope_rate: Decay rate tensor
        slot_idx: Slot indices for batches
        BLOCK_SIZE: Size of blocks for processing

    Returns:
        output: Attention output tensor
    r   r   ri      )r   zb h n d -> b n (h d))r   r   
empty_likestrider   r   squeezer   )r>   r   rF   r   r   r   r   BH_r   r   r   r   r   r   r   r   r   rH   rH   rI   linear_decode_forward_triton  s<   







r   )r|   N)r}   )r   einopsr   vllm.triton_utilsr   r   jit	constexprrJ   r_   rh   rv   autogradFunctionrw   applyr   Tensorinttupler   r   r   rH   rH   rH   rI   <module>   s   	
 	
l	@	
R 
:X