o
    io                     @   s   d dl mZ d dlZddlmZ ddlmZ ze r#d dlmZ eZ	ne
dW n eyB Z zeeZdd	 Z	W Y dZ[ndZ[ww 							dd
ejjdejdejdejdeej dedejfddZdS )    )OptionalN   )PagedAttentionCache)is_flash_attn_2_available)flash_attn_varlen_funczFlash Attention 2 is not installed. Please refer to https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install itc                  O   s   t dt )Nz)flash_attn_varlen_func is not available: )	Exceptionmsg)argskwargs r   ^/home/ubuntu/LTX-2/.venv/lib/python3.10/site-packages/transformers/integrations/flash_paged.pyFLASH_ATTN_VARLEN_FUNC   s   r   moduleqkvattention_maskcachereturnc                 K   s  t | ddsdn| jd df}|dkrdnd}|dur)|j||| jfi |\}}t|tr6|| }|	| }	|
durCt|
d	rC|
j}nt}d
|v rPd
|	d
ini }||
ddd | | |tj|tj ||	f| jd|d|}t|tr|d }|dfS )a  Perform the forward pass of attention with paged key-value cache.

    This function handles the cache updates and performs the attention computation
    using the flash_attn_varlen_func for efficient processing.

    Args:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full k
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full v
        cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
    sliding_windowF)r      r   full_attentionsliding_attentionNr   s_auxr   T)softmax_scalecausalwindow_size)getattrr   update	layer_idx
isinstancedicthasattrr   r   get	transposesqueeze
contiguoustotorchint32clonescalingtuple)r   r   r   r   r   r   cu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kimplementationr
   r   
layer_typer   custom_kwargsattn_outputr   r   r   paged_attention_forward   s:   $


r6   )NNNNNNN)typingr   r)   generation.continuous_batchingr   utilsr   
flash_attnr   r   RuntimeErrorr   ereprr   nnModuleTensorr6   r   r   r   r   <module>   sN    