o
    پi                      @   s  d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ e
 r)ddgnddgZejdgd	d
ejdejdejdejdejdejdejfddZejdd eD g ddejdgd	d
ejdejdejdejdejdejdejdejdejfddZddddejfdejdededed eej d!ed"eej d#ejfd$d%Zddddejfdejdededed eej d!ed"eej d#ejfd&d'Zeddddejfdejdededed eej d!ed"eej d#ejfd(d)ZdS )*    )OptionalN)prepare_chunk_indices)check_shared_meminput_guard    @      T)do_not_specializeBHBTREVERSE	HAS_SCALE	IS_VARLEN
HEAD_FIRSTc                 C   s  t dt d}}|| || }}|rQt ||d  t jt ||d  d t j}}t || t jt || d t j}}|| }n|| || | }}|rt | ||  ||  |fd|| f|fd}t |||  ||  |fd|| f|fd}n,t | ||  | |f|f|| f|fd}t |||  | |f|f|| f|fd}t j|ddt j}t j|dd}|	rt j|dd}| |d   | }|
r||9 }t j	|||j
jdd d S )Nr         )r   )r   boundary_check)axis)tl
program_idloadtoint32make_block_ptrfloat32cumsumsumstoredtype
element_ty)soscale
cu_seqlenschunk_indicesr	   r   r   r   r   r   r   r   i_ti_bhi_bi_hi_nboseosp_sp_ob_sb_ob_z r4   Z/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/fla/cumsum.py chunk_local_cumsum_scalar_kernel   s@   

&&,,r6   c                 C   s(   g | ]}d D ]}t jd|i|dqqS ))r         BS)	num_warps)tritonConfig).0r9   r:   r4   r4   r5   
<listcomp>H   s    r>   )r   r   Sr   r   r   r   )configskeyr?   r9   c                 C   s  t dt dt d}}}|| || }}|rWt ||d  t jt ||d  d t j}}t || t jt || d t j}}|| }n|| || | }}t d|	}|rt |d d d f |d d d f kdd}nt |d d d f |d d d f kdd}|rt | || ||  |  ||f|df||	 ||
 f|	|
fd}t ||| ||  |  ||f|df||	 ||
 f|	|
fd}n@t | || | |  ||f|| df||	 ||
 f|	|
fd}t ||| | |  ||f|| df||	 ||
 f|	|
fd}t j|ddt j}t j	||d	d
}|r2||9 }t j
|||jjdd d S )Nr   r   r   g      ?g        )r   r   )r   r   r   F)
allow_tf32)r   r   r   r   r   arangewherer   r   dotr    r!   r"   )r#   r$   r%   r&   r'   r	   r   r   r?   r   r9   r   r   r   r   i_sr(   r)   r*   r+   r,   r-   r.   o_im_sr/   r0   r1   r2   r4   r4   r5    chunk_local_cumsum_vector_kernelG   sr   "

,*	

	rI   Fg
chunk_sizereverser%   r&   
head_firstoutput_dtypereturnc                 C   s   |r	| j \}}}	n| j \}}	}|d| d  ksJ d|}
|d ur(t||
nd }|d u r4t|	|
nt|}| tj| |p@| jd}} ||| f}t	| || ||||	|||
|||d u|d uddd | S )Nr   r   chunk_size must be a power of 2r!   r8      )r#   r$   r%   r&   r'   r	   r   r   r   r   r   r   r   r:   
num_stages)
shape
bit_lengthr   r;   cdivlentorch
empty_liker!   r6   )rJ   rK   rL   r%   r&   rM   rN   r   r   r	   r   r'   NTg_orggridr4   r4   r5   chunk_local_cumsum_scalar   s@   	
r]   c                    s   |r
| j \ }}n| j \ }}|}	|d urt||nd }
|d u r(t||	nt|
|d| d  ks:J d| tj| |pB| jd}}  fdd}t	| || |||
| ||	|||d u|d ud | S )Nr   r   rP   rQ   c                    s   t | d | d   fS )Nr?   r9   )r;   rV   )metar   r   rZ   r4   r5   r\      s   z'chunk_local_cumsum_vector.<locals>.grid)r#   r$   r%   r&   r'   r	   r   r   r?   r   r   r   r   r   )
rT   r   r;   rV   rW   rU   rX   rY   r!   rI   )rJ   rK   rL   r%   r&   rM   rN   r	   r?   r   r'   r[   r\   r4   r_   r5   chunk_local_cumsum_vector   sB   	

r`   c              	   K   sx   |d ur| j d dksJ dt| j dkr!t| ||||||dS t| j dkr3t| ||||||dS td| j  d)	Nr   r   z;Only batch size 1 is supported when cu_seqlens are providedrR   )rJ   rK   rL   r%   r&   rM   rN   r7   zUnsupported input shape zN, which should be (B, T, H, D) if `head_first=False` or (B, H, T, D) otherwise)rT   rW   r]   r`   
ValueError)rJ   rK   rL   r%   r&   rM   rN   kwargsr4   r4   r5   chunk_local_cumsum   s6   	
rc   )typingr   rX   r;   triton.languagelanguager   %sglang.srt.layers.attention.fla.indexr   %sglang.srt.layers.attention.fla.utilsr   r   BS_LISTjit	constexprr6   autotunerI   floatTensorintboolr!   r]   r`   rc   r4   r4   r4   r5   <module>   s   	
1	
Q
.
3	