o
    پi7                  
   @   s   d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
 ejdgddejdejd	ejfd
dZejdgddejdejd	ejfddZejdgddejdejd	ejfddZe
dejfdejdeej dejdejfddZdS )    )OptionalN)prepare_chunk_indices)input_guardT)do_not_specializeHBT	IS_VARLENc                 C   sh  t dt d}}	|	| |	| }
}|rQt ||d  t jt ||d  d t j}}t || t jt || d t j}}|| }n|
| |
| | }}| || | |  } ||| | d  }|d | }t | ||f|| df|d |fdd}t ||df|d df|d dfdd}t j|ddt j}t t ddd d d f t ddd d d f k|d }t dd}t	dt
d||d  D ]7}t | |d | | |  | |  }|t |d d d f | d }||k}t |d d d f ||}q||d d d f |d d d f k7 }t j||j|jjd	d
dd d S )Nr            r   r   r
   r   r   r
   boundary_checkrtnefp_downcast_rounding)tl
program_idloadtoint32make_block_ptrfloat32wherearangerangeminsumstoredtype
element_ty)AAd
cu_seqlenschunk_indicesr   r   r   r	   i_ti_bhi_bi_hi_nboseosoffsetp_Ap_Aib_Ao_iib_amask r7   ^/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/fla/solve_tril.pysolve_tril_16x16_kernel   sH   

 (<( $
r9   c	                 C   s  t dt d}	}
|
| |
| }}|rQt ||	d  t jt ||	d  d t j}}	t || t jt || d t j}}|| }n|| || | }}| || | d 7 } ||| | d 7 }||| | d 7 }t | |df|d df|	d d dfdd}t ||df|d df|	d dfdd}t ||df|d df|	d d dfdd}t ||df|d df|	d dfdd}t ||df|d df|	d d dfdd}t ||df|d df|	d d dfdd}t j|dd	t j}t j|dd	t j}t j|dd	t j}t jt j||d
d|d
d }t j||j|j	j
dddd	 t j||j|j	j
dddd	 t j||j|j	j
dddd	 d S )Nr   r
   r       r   r   r   r   r   ieeeinput_precisionr   r   )r   r   r   r   r   r   r   dotr!   r"   r#   )r$   r%   Air&   r'   r   r   r   r	   r(   r)   r*   r+   r,   r-   r.   p_A_21p_Ad_11p_Ad_22p_Ai_11p_Ai_22p_Ai_21A_21Ai_11Ai_22Ai_21r7   r7   r8   #merge_16x16_to_32x32_inverse_kernelP   sr   

$ $ $$
rJ   c	           ;      C   s	  t dt d}	}
|
| |
| }}|rQt ||	d  t jt ||	d  d t j}}	t || t jt || d t j}}|| }n|| || | }}| || | d 7 } ||| | d 7 }||| | d 7 }t | |df|d df|	d d dfdd}t | |df|d df|	d d dfdd}t | |df|d df|	d d dfdd}t | |df|d df|	d d	 dfdd}t | |df|d df|	d d	 dfdd}t | |df|d df|	d d	 dfdd}t ||df|d df|	d dfdd}t ||df|d df|	d d dfdd}t ||df|d df|	d d dfdd}t ||df|d df|	d d	 dfdd}t j|d
dt j}t j|d
dt j}t j|d
dt j}t j|d
dt j}t j|d
dt j}t j|d
dt j}t j|d
dt j} t j|d
dt j}!t j|d
dt j}"t j|d
dt j}#t jt j|!|dd| dd }$t jt j|"|dd|!dd }%t jt j|#|dd|"dd }&t j|"t j|| ddt j||$dd dd }'t j|#t j||!ddt j||%dd dd }(t j|#t j|| ddt j||$dd t j||'dd dd })t ||df|d df|	d dfdd}*t ||df|d df|	d d dfdd}+t ||df|d df|	d d dfdd},t ||df|d df|	d d	 d	fdd}-t ||df|d df|	d d dfdd}.t ||df|d df|	d d dfdd}/t ||df|d df|	d d dfdd}0t ||df|d df|	d d	 dfdd}1t ||df|d df|	d d	 dfdd}2t ||df|d df|	d d	 dfdd}3t j|*| j|*j	j
ddd
d t j|+|!j|+j	j
ddd
d t j|,|"j|,j	j
ddd
d t j|-|#j|-j	j
ddd
d t j|.|$j|.j	j
ddd
d t j|/|'j|/j	j
ddd
d t j|0|%j|0j	j
ddd
d t j|1|)j|1j	j
ddd
d t j|2|(j|2j	j
ddd
d t j|3|&j|3j	j
ddd
d t jdt jd}4t ||df|d df|	d dfdd}5t ||df|d df|	d dfdd}6t ||df|d df|	d d	fdd}7t ||df|d df|	d d dfdd}8t ||df|d df|	d d d	fdd}9t ||df|d df|	d d d	fdd}:t j|5|4j|5j	j
ddd
d t j|6|4j|6j	j
ddd
d t j|7|4j|7j	j
ddd
d t j|8|4j|8j	j
ddd
d t j|9|4j|9j	j
ddd
d t j|:|4j|:j	j
ddd
d d S )Nr   r
   r   @   r   r   r   r:   0   r   r   r;   r<   r   r   )r"   )r   r   r   r   r   r   r   r>   r!   r"   r#   zeros);r$   r%   r?   r&   r'   r   r   r   r	   r(   r)   r*   r+   r,   r-   r.   r@   p_A_32p_A_31p_A_43p_A_42p_A_41rA   rB   p_Ad_33p_Ad_44rF   A_32A_31A_43A_42A_41rG   rH   Ai_33Ai_44rI   Ai_32Ai_43Ai_31Ai_42Ai_41rC   rD   p_Ai_33p_Ai_44rE   p_Ai_31p_Ai_32p_Ai_41p_Ai_42p_Ai_43
fill_zerosp_Ai_12p_Ai_13p_Ai_14p_Ai_23p_Ai_24p_Ai_34r7   r7   r8   #merge_16x16_to_64x64_inverse_kernel   s  

$$$$$$ $$$ $$$$$$$$$   $$$
ro   r$   r&   output_dtypereturnc                 C   s<  | j d dv s	J | j \}}}}tj|||d| j|dkrtjn|d}|dur,t|dnd}|dur6t|nt|d}	t	|	|| f | |||||||duddd
 |dkrX|S tj||||| j|d}
|d	krjt
nt}|durut||nd}|durt|nt||}	||	|| f | ||
||||||dudd
d |
S )a  
    Compute the inverse of the lower triangular matrix
    A should be strictly lower triangular, i.e., A.triu() == 0.

    Args:
        A (torch.Tensor):
            [B, T, H, K]
        cu_seqlens (torch.Tensor):
            The cumulative sequence lengths of the input tensor.
            Default: None.
        output_dtype (torch.dtype):
            The dtype of the output tensor. Default: `torch.float`

    Returns:
        (I + A)^-1 with the same shape as A
    )r   r:   rK   r   )devicer"   Nr
      )
r$   r%   r&   r'   r   r   r   r	   	num_warps
num_stagesr:      )r$   r%   r?   r&   r'   r   r   r   r	   ru   rv   )shapetorchemptyrs   floatr   lentritoncdivr9   rJ   ro   )r$   r&   rp   Br   r   r   r%   r'   NTr?   merge_fnr7   r7   r8   
solve_tril  sX   r   )typingr   ry   r}   triton.languagelanguager   %sglang.srt.layers.attention.fla.indexr   %sglang.srt.layers.attention.fla.utilsr   jit	constexprr9   rJ   ro   r{   Tensorr"   r   r7   r7   r7   r8   <module>   sV   8	N	 j