o
    ٷi-                  
   @   s  d dl Z d dlm  mZ d dlmZ dZdZe	 r:zd dl
mZ e r-d dlmZmZ W n eefy9   Y n}w e rRzd dlmZ W no eefyQ   Y new z
d dlmZmZ W n eefyg   Y nw edu rz
d dlmZmZ W n eefy   Y nw edu rz
d dlmZmZ W n eefy   Y nw edu rz
d dlmZmZ W n eefy   Y nw edupeduZdd Zddd	Zd
d Zde jdee je jef fddZde jde jde jde jdef
ddZdd ZdS )    N)current_omni_platform)is_aiter_found_and_supported)flash_attn_funcflash_attn_varlen_func)r   c                 C   s$   | j dg| jdd R  }|| S )a  
    A local implementation of the PyTorch indexing operation `tensor[indices]` on the first axis,
    after flattening the first two dimensions of the tensor. This is functionally equivalent to
    FA2's `index_first_axis` and replaces the need to import it.
       N)reshapeshape)tensorindicesreshaped_tensor r   c/home/ubuntu/.local/lib/python3.10/site-packages/vllm_omni/diffusion/attention/backends/utils/fa.py_index_first_axisJ   s   r   c           	      C   s   |dur|| n|}|j dtjd}|j dtjd}tj| dd }|  }ttj	|dtjdd}t
| |||||fS )a4  
    unpad_input function for flash attention variants that do not have them within their pkg themselves, e.g. fa3.

    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.

    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
        indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
        seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
    Nr   dimdtypeFas_tupler      r   )sumtorchint32nonzeroflattenmaxitemFpadcumsumr   )	hidden_statesattention_maskunused_mask	all_masksseqlens_in_batchused_seqlens_in_batchr   max_seqlen_in_batch
cu_seqlensr   r   r   _unpad_inputV   s   r)   c                 C   sL   | j dd }tj|| g|R | j| jd}| ||< |j||g|R  S )a  
    pad_input function for flash attention variants that do not have them within their pkg themselves, e.g. fa3.

    Arguments:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
        batch: int, batch size for the padded sequence.
        seqlen: int, maximum sequence length for the padded sequence.

    Return:
        hidden_states: (batch, seqlen, ...)
    r   N)devicer   )r	   r   zerosr*   r   view)r!   r   batchseqlenr   outputr   r   r   
_pad_inputv   s   "r0   r"   returnc                 C   sV   | j dtjd}tj|  dd }|  }ttj	|dtjdd}|||fS )a~  
    Retrieves indexing data required to repad unpadded (ragged) tensors.

    Arguments:
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

    Return:
        indices (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input sequence.
        cu_seqlens (`torch.Tensor`):
            The cumulative sequence lengths, used to index into ragged (unpadded) tensors.
             `cu_seqlens` shape is (batch_size + 1,).
        max_seqlen_in_batch (`int`):
            Maximum sequence length in batch.
    r   r   Fr   r   r   )
r   r   r   r   r   r   r   r   r   r    )r"   r%   r   r'   r(   r   r   r   _get_unpad_data   s   r2   query_layer	key_layervalue_layerquery_lengthc                 C   s<  t j r
dt jj_t|\}}}|jd |jd  }	kr>|ddd|	ddddf |ddd|	ddddf }}|j\}
}}}t||}t||}||kr_t| |} |}|}|}n3|dkr}d}t j	|
d t j
| jd}|dd }| d} n|dd| df }|| |^} }}}}| |||||f||ffS )a  
    Unpads query, key, and values tensors, using a single dimension for all tokens even though they belong
    to different batches. This function is used instead of `flash_attn.bert_padding.unpad_input` in
    order to avoid the recomputation of the same intermediary tensors for query, key, value tensors.

    Arguments:
        query_layer (`torch.Tensor`):
            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
        query_length (`int`):
            Target length.
        unpad_input_func:
            The function to use for unpadding the input tensors.

    Return:
        query_layer (`torch.Tensor`):
            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        indices_q (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input target sequence.
        (cu_seqlens_q, cu_seqlens_k) (`tuple[int]`):
            The cumulative sequence lengths for the target (query) and source (key, value), used to index into
             ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`):
            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query,
            `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    Tr   r   N)r   r*   )r   compileris_compiling_dynamoconfigcapture_scalar_outputsr2   r	   r   aranger   r*   squeeze)r3   r4   r5   r"   r6   unpad_input_func	indices_kcu_seqlens_kmax_seqlen_in_batch_kseq_len
batch_size
kv_seq_lennum_key_value_headshead_dimcu_seqlens_qmax_seqlen_in_batch_q	indices_q_r   r   r   _upad_input   s:   
+
B


rK   c                 C   sF   | du rdS t j| jd | jd|   }|dko"||     S )a"  
    Check the position ids whether packed sequences are indicated or not
        1. Position ids exist
        2. Flattened sequences only are supported
        3. Compile-friendly `not (torch.diff(position_ids, dim=-1) >= 0).all()`, i.e.
        we have multiple increasing sequences
    NFr   )r*   )r   r<   r	   r*   minabsr   bool)position_idsrC   increasing_position_sequencesr   r   r   _is_packed_sequence   s   rQ   )N)r   torch.nn.functionalnn
functionalr   vllm_omni.platformsr   r   r   is_rocmvllm._aiter_opsr   aiterImportErrorModuleNotFoundErroris_xpu#vllm.v1.attention.backends.fa_utilsfa3_fwd_interfaceflash_attn_interface
flash_attnflash_attn.flash_attn_interfaceHAS_FLASH_ATTNr   r)   r0   Tensortupleintr2   rK   rQ   r   r   r   r   <module>   sx   
 "
U