o
    پi                  
   @  s   d dl mZ d dlmZmZmZmZ d dlZzd dlm	Z
 W n ey4 Z z
dZ
eZW Y dZ[ndZ[ww dZdFdd	Z									
									
dGdHd-d.Z																			
			/	0		 		 				
dIdJdDdEZdS )K    )annotations)CallableOptionalTupleUnionN)flash_attn_varlen_funcxOptional[torch.Tensor]returnc                 C  s"   | d ur|  ddkr|  S | S )N   )stride
contiguous)r    r   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/flash_attention_v4.py_maybe_contiguous   s   "r   Fr   r   r   qtorch.Tensorkvcu_seqlens_qcu_seqlens_k	seqused_q	seqused_kmax_seqlen_qOptional[int]max_seqlen_k
page_tablesoftmax_scaleOptional[float]causalboolsoftcapwindow_size#Tuple[Optional[int], Optional[int]]learnable_sinksinks
num_splitsintpack_gqaOptional[bool]	score_modOptional[Callable]aux_tensorsOptional[list]return_softmax_lse_objectc                 K  s  t d u r	tdtdd | ||fD \} }}dd ||fD \}}dd ||fD \}}t|	}	|d u r:|d ur:|}|dkr@d}t di d| d	|d
|d|d|d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|}|r|S t|tr|d S |S )NzVendored FlashAttention CUTE is not available (cannot import sglang.jit_kernel.flash_attention.cute). Please check your source tree.c                 S     g | ]}t |qS r   r   .0tr   r   r   
<listcomp>4       z*flash_attn_varlen_func.<locals>.<listcomp>c                 S  r3   r   r4   r5   r   r   r   r8   5   s    c                 S  r3   r   r4   r5   r   r   r   r8   8   r9   r   )NNr   r   r   r   r   r   r   r   r   r   r   r!   r#   r$   r&   r(   r*   r,   r.   r   r   )_flash_attn_varlen_funcImportError_flash_attn_import_errorr   
isinstancetuple)r   r   r   r   r   r   r   r   r   r   r   r!   r#   r$   r&   r'   r(   r*   r,   r.   r0   r1   resultr   r   r   r      sx   
	

r           Tk_cachev_cacheqv
rotary_cos
rotary_sincache_seqlens"Optional[Union[int, torch.Tensor]]cache_batch_idxcache_leftpadcu_seqlens_k_newrotary_seqlens	q_descale	k_descale	v_descaleTuple[int, int]attention_chunkfloatrotary_interleaved	sm_marginc!           #      K  sF  |d us|d us|d urt d|d us|d us|d ur t d|	d us(|
d ur,t d|d us8|d us8|d ur<t dt|trPtj|jd f|tj|jd}tdi d| d|d	|d
|d|d|d|d|d|d|dkru|nd d|d|dkr|ndd|d|d|d|dd}"| r|"S t|"t	r|"d S |"S )Nz0FA4 does not support updating KV cache in-place.z+FA4 path does not support rotary embedding.zHFA4 path does not support non-consecutive batch indices or left padding.z"FA4 path does not support descale.r   )dtypedevicer   r   r   r   r   r   r   r   r!   r#   r@   r$   r(   r   r*   r&   r,   r.   r0   Tr   )
NotImplementedErrorr=   r)   torchfullshapeint32rU   r   r>   )#r   rA   rB   r   r   rC   rD   rE   rF   rH   rI   r   r   rJ   r   rK   rL   rM   rN   r   r!   r$   rP   r#   rR   scheduler_metadatar(   r*   rS   r'   r,   r.   r0   r1   r?   r   r   r   flash_attn_with_kvcache^   sp   $
	

r\   )r   r	   r
   r	   )NNNNNNNNFNr   NNr   NNNF),r   r   r   r   r   r   r   r	   r   r	   r   r	   r   r	   r   r   r   r   r   r	   r   r    r!   r"   r#   r    r$   r%   r&   r	   r'   r	   r(   r)   r*   r+   r,   r-   r.   r/   r0   r"   r1   r2   )NNNNNNNNNNNNNNNNNFr   Nr@   TNr   Nr   NNNF)Br   r   rA   r   rB   r   r   r	   r   r	   rC   r	   rD   r	   rE   r	   rF   rG   rH   r	   rI   r	   r   r	   r   r	   rJ   r	   r   r   rK   r	   rL   r	   rM   r	   rN   r	   r   r    r!   r"   r$   rO   rP   r   r#   rQ   rR   r"   r(   r)   r*   r+   rS   r)   r'   r	   r,   r-   r.   r/   r0   r"   r1   r2   )
__future__r   typingr   r   r   r   rW   &sglang.jit_kernel.flash_attention.cuter   r:   	Exception_er<   r   r\   r   r   r   r   <module>   s|    
L