o
    i
                     @   sz   d dl Z ddlmZ ddlmZ e rd dlmZ 							dde jjde j	de j	d	e j	d
e j	dede j	fddZ
dS )    N   )PagedAttentionCache)is_flash_attn_2_available)flash_attn_varlen_funcmoduleqkvattention_maskcachereturnc                 K   s   |j ||| jfd|i|\}}t|ddd|ddd|ddd|tj|tj||	| jddd
}|dfS )	a  Perform the forward pass of attention with paged key-value cache.

    This function handles the cache updates and performs the attention computation
    using the flash_attn_varlen_func for efficient processing.

    Args:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full k
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full v
        cumulative_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cumulative_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
    cumulative_seqlens_k   r   r   T)r   )softmax_scalecausalwindow_sizeN)	update	layer_idxr   	transposesqueezetotorchint32scaling)r   r   r   r	   r
   r   cumulative_seqlens_qr   max_seqlen_qmax_seqlen_kblock_tableskwargsattn_output r!   h/home/ubuntu/maya3_transcribe/venv/lib/python3.10/site-packages/transformers/integrations/flash_paged.pypaged_attention_forward   s    $

r#   )NNNNNNN)r   generation.continuous_batchingr   utilsr   
flash_attnr   nnModuleTensorr#   r!   r!   r!   r"   <module>   s6    