o
    پi                     @   s"  d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZ e r1ddgnddgZer;d	d
gng dZejdgddejdejdejdejdejdejdejdejdejfddZ				d$dejdejdejdejdeej dee deej d ed!ejfd"d#ZdS )%    )OptionalN)prepare_chunk_indices)expsafe_exp)check_shared_memis_nvidia_hopper@                )r   r      T)do_not_specializeHHgKVBTBKBVUSE_G	IS_VARLENc           -   	   C   s  t dt dt d}}}||
 ||
 }}|r_|}t ||d  t jt ||d  d t j}}t || t jt || d t j}}|| }	t |	|}nt |	|}|| | }||	 ||	 |	 }}| || ||
|   | 7 } ||| ||
|   | 7 }|||
 | | 7 }|||
 | | 7 }|||
 | t j| | 7 }t j||gt jd}t j||gt jd}t	t ||D ]m}t 
| |	|f|| df|| || f||fd} t 
|||	fd|| f|| || f||fd}!t 
|||f|df|| || f||fd}"t j| dd}#t j|!dd}$t j|"dd}%|t |#|%7 }|t |#|$7 }q|r|||
 | 7 }t 
||	f|
f|| f|fd}&t j|&dd}'|t|'d d d f  }|t|'d d d f |'d d d f   }t d|}(|(d d d f |(d d d f k})t |)|d}t 
||	|f|
| df|| || f||fd}*t 
||	|f|
| df|| || f||fd}+t j|*dd},|| t ||,j|,|  }t j|+||+jjdd d S )	Nr      r   )dtype)r   r   )r   r   )boundary_check)r   )tl
program_idloadtoint32cdivint64zerosfloat32rangemake_block_ptrdotr   r   arangewherer   store
element_ty)-qkvhgo
cu_seqlenschunk_indicesscaler   r   r   r   r   r   r   r   r   r   i_vi_ti_bhi_bi_hi_tgi_nboseosNTb_ob_Ai_kp_qp_kp_hb_qb_kb_hp_gb_go_im_Ap_vp_ob_v rO   [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/fla/chunk_o.pychunk_fwd_kernel_o   sv   "
 (($ ( (( rQ   r,   r-   r.   r/   r0   r4   r2   
chunk_sizereturnc                    s   g | j |j d R \ }}	}
|j d t|tdt|}|d ur)t||nd }|d u r5t||nt||d u rD|j d d }t	|} fdd}t
| | ||||||||f	||	|
|dd|d u|d ud	d
d |S )N   g      c                    s   t | d   fS )Nr   )tritonr!   )metaBr   r>   r   rO   rP   grid   s   zchunk_fwd_o.<locals>.gridr	   r   r   r   )r   r   r   r   r   r   r   r   r   r   	num_warps
num_stages)shapeminmaxrW   next_power_of_2r   r!   lentorch
zeros_likerQ   )r,   r-   r.   r/   r0   r4   r2   rR   r   r   r   r   r3   r1   r[   rO   rY   rP   chunk_fwd_o~   sF    



re   )NNNr   )typingr   rc   rW   triton.languagelanguager   %sglang.srt.layers.attention.fla.indexr   "sglang.srt.layers.attention.fla.opr   r   %sglang.srt.layers.attention.fla.utilsr   r   BKV_LIST	NUM_WARPSjit	constexprrQ   Tensorfloat
LongTensorintre   rO   rO   rO   rP   <module>   sh   e	