o
    ٷi-                     @   sT   d dl Z d dlmZ d dlmZmZmZ eeZG dd deZ	G dd deZ
dS )    N)init_logger)AttentionBackendAttentionImplAttentionMetadatac                   @   sj   e Zd ZU dZeed< edefddZede	e
 fddZedefdd	Zeded
 fddZdS )FlashAttentionBackendTaccept_output_bufferreturnc                 C      dS )NT )clsr
   r
   e/home/ubuntu/.local/lib/python3.10/site-packages/vllm_omni/diffusion/attention/backends/flash_attn.pysupports_attention_mask      z-FlashAttentionBackend.supports_attention_maskc                   C   s   g dS )N)@   `            r
   r
   r
   r
   r   get_supported_head_sizes   s   z.FlashAttentionBackend.get_supported_head_sizesc                   C   r	   )N
FLASH_ATTNr
   r
   r
   r
   r   get_name   r   zFlashAttentionBackend.get_nameFlashAttentionImplc                   C   s   t S N)r   r
   r
   r
   r   get_impl_cls   r   z"FlashAttentionBackend.get_impl_clsN)__name__
__module____qualname__r   bool__annotations__classmethodr   staticmethodlistintr   strr   typer   r
   r
   r
   r   r      s   
 r   c                   @   s  e Zd Z			ddedededededB d	ed
dfddZede	j
ee	j
df B d
e	j
fddZde	j
de	j
de	j
de	j
d
e	j
f
ddZ	dde	j
de	j
de	j
ded
e	j
f
ddZ	dde	j
de	j
de	j
ded
e	j
f
ddZ	dde	j
de	j
de	j
ded
e	j
f
ddZdS ) r   FN 	num_heads	head_sizesoftmax_scalecausalnum_kv_headsprefixr   c                 K   s   || _ || _|| _d S r   )r&   r)   r(   )selfr&   r'   r(   r)   r*   r+   extra_impl_argsr
   r
   r   __init__%   s   

zFlashAttentionImpl.__init__out.c                 C   s   t | tr	| d S | S )Nr   )
isinstancetuple)r/   r
   r
   r   _unwrap_flash_output3   s   z'FlashAttentionImpl._unwrap_flash_outputquerykeyvalueattention_maskc                 C   s   ddl m}m}m}m} |jdksJ d|d}	||||||	|\}
}}}\}}\}}||
||f||||d| j| jd}| 	|}||||d|	S )Nr   )
_pad_input_unpad_input_upad_inputflash_attn_varlen_func   z0attention_mask must be 2D, (batch_size, seq_len)   )cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr)   r(   )
/vllm_omni.diffusion.attention.backends.utils.far7   r8   r9   r:   ndimsizer)   r(   r2   )r,   r3   r4   r5   r6   r7   r8   r9   r:   query_lengthqkv	indices_qcu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_k	out_unpadr
   r
   r   _forward_varlen_masked8   s,   
	
z)FlashAttentionImpl._forward_varlen_maskedattn_metadatac           	      C   sr   ddl m}m} |std|dur|jnd}|dur)t| r)| ||||S ||||| j| j	d}| 
|S )z)CUDA/ROCm flash attention implementation.r   )HAS_FLASH_ATTNflash_attn_funczFlashAttentionBackend requires Flash Attention. Please install one of: fa3-fwd, flash-attention, or flash-attn. Otherwise, use SDPA backend by setting DIFFUSION_ATTENTION_BACKEND=TORCH_SDPANrA   )rB   rQ   rR   ImportError	attn_masktorchanyrO   r)   r(   r2   )	r,   r3   r4   r5   rP   rQ   rR   r6   r/   r
   r
   r   forward_cuda\   s*   
zFlashAttentionImpl.forward_cudac                 C   s   ddl m}m} |std|dur|jnd}|dur)t| r)| ||||S | dd \}}	tj	d|d |	 |	tj
|jd}
|dd}|dd}|dd}|||||
|
|	|	| j| jd	}| |}|j||	g|jdd R  S )	z#XPU flash attention implementation.r   )rQ   r:   zFlashAttentionBackend requires Flash Attention. Please assure vllm-xpu-kernels properly installed. Otherwise, use SDPA backend by setting DIFFUSION_ATTENTION_BACKEND=TORCH_SDPANr;   r<   )stepdtypedevice)r=   r>   r?   r@   r)   r(   )rB   rQ   r:   rS   rT   rU   rV   rO   rD   arangeint32rZ   flattenr)   r(   r2   reshapeshape)r,   r3   r4   r5   rP   rQ   r:   r6   
batch_sizeq_len
cu_seqlensr/   r
   r
   r   forward_xpu   s>    
zFlashAttentionImpl.forward_xpuc              	   C   sP   zddl m} W n ty   tdw |r|jnd}|||||dddd}|S )	z,NPU attention implementation using mindiesd.r   )attention_forwarda  FlashAttentionBackend NPU implementation requires MindIE-SD. Please install MindIE-SD to enable NPU attention support. For installation details, see https://gitcode.com/Ascend/MindIE-SDOtherwise, use SDPA backend by setting DIFFUSION_ATTENTION_BACKEND=TORCH_SDPANmanualfused_attn_scoreBNSD)rT   opt_modeop_typelayout)mindiesdrd   rS   rT   )r,   r3   r4   r5   rP   rd   r6   outputr
   r
   r   forward_npu   s$   	zFlashAttentionImpl.forward_npu)FNr%   r   )r   r   r   r"   floatr   r#   r.   r    rU   Tensorr1   r2   rO   r   rW   rc   rm   r
   r
   r
   r   r   $   s    	
&
)
,
9r   )rU   vllm.loggerr   /vllm_omni.diffusion.attention.backends.abstractr   r   r   r   loggerr   r   r
   r
   r
   r   <module>   s   