o
    i0                     @   s|   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZmZ G dd dejZG dd dejZdS )	    N)	rearrange)index_first_axis	pad_inputunpad_input)convert_blockmaskflash_blocksparse_attn_funcc                       sD   e Zd ZdZ					d fdd	Z							dd	d
Z  ZS )FlashBlocksparseAttentiona{  Implement the scaled dot product attention with softmax.
    Arguments
    ---------
        softmax_temp: The temperature to use for the softmax attention.
                      (default: 1/sqrt(d_keys) where d_keys is computed at
                      runtime)
        attention_dropout: The dropout rate to apply to the attention
                           (default: 0.1)
    N           c           	         sn   t    tj|| _|| _|| _|d d d d }| j|}| 	d| t
| jdd}| 	d| d S )N      layoutF)causalblockmask_converted)super__init__hydrautilsinstantiatesparsity_configsoftmax_temp	dropout_pmake_layoutregister_bufferr   r   )	selfr   r   attention_dropoutmax_seq_lengthdevicedtyper   r   	__class__ \/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/flash_blocksparse_attention.pyr      s   
	z"FlashBlocksparseAttention.__init__FTc	              
   C   st  |rJ |du s
J |j tjksJ |jsJ |du r|jd }	|jd }
|
d d d d }|d | jjd ksEJ |d | jjd k| jd|d d|d f }|du rt|d}|
}tjd|	d |
 |
tj|j	d}t
|||| jry| jnd|| j|d	}t|d
|	d}|dfS |j}|jd }t|d}t||\}}}}}t|dd|d}t
|||| jr| jnd|| j|d	}ttt|d||	|
d|d}|dfS |dusJ |}
|
d d d d }|d | jjd ksJ |d | jjd k| jd|d d|d f }|r!t
|||| jr| jnd|| j|d	}|dfS t
||| j| jr-| jnd|| j|dd}|dfS )a  Implements the multihead softmax attention.
        Arguments
        ---------
            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
            attn_mask: An implementation of BaseMask that encodes where each
                       query can attend to
            key_padding_mask: An implementation of BaseMask that encodes how
                         many query each sequence in the batch consists of
        Nr   r   r      zb s ... -> (b s) ...)stepr   r   r	   )softmax_scaler   z(b s) ... -> b s ...)bz b s three h d -> b s (three h d)z nnz (three h d) -> nnz three h d   threehznnz h d -> nnz (h d)zb s (h d) -> b s h d)r+   F)r%   r   convert_mask)r   torchfloat16is_cudashaper   r   arangeint32r   r   trainingr   r   bool_matrixr   r   r   )r   qkv	attn_maskkey_padding_maskr   
cu_seqlensmax_sneed_weightsr,   
batch_sizeseqlenseqlen_rounded	blockmaskoutputkey_padding_mask_boolnheadsxx_unpadindices_output_unpadr!   r!   r"   forward0   s   



	6

	&z!FlashBlocksparseAttention.forward)Nr	   r
   NN)NNFNNFT)__name__
__module____qualname____doc__r   rG   __classcell__r!   r!   r   r"   r      s     r   c                       s<   e Zd Z							d	d fddZ	dd	d
Z  ZS )FlashBlocksparseMHATr	   Fr
   Nreturnc                    s   |sJ |	|
d}t    || _|| _|| _| j| dks"J d| j| | _| jdv s1J dtj|d| fd|i|| _t	|f||d|| _
tj||fd|i|| _d S )	N)r   r   r   z(self.kdim must be divisible by num_heads)r#       @   z&Only support head_dim == 16, 32, or 64r(   bias)r   r   )r   r   	embed_dimr   	num_headshead_dimnnLinearWqkvr   
inner_attnout_proj)r   rR   rS   r   rQ   batch_firstr   r   r   r   r   kwargsfactory_kwargsr   r!   r"   r      s&   

zFlashBlocksparseMHA.__init__c           
      C   sH   |  |}t|dd| jd}| j|||| jd\}}	| t|d|	fS )Nz b s (three h d) -> b s three h dr(   r)   )r7   r:   r   zb s h d -> b s (h d))rW   r   rS   rX   r   rY   )
r   rB   
x_ignored_x_ignored_1_r6   r7   r:   r5   contextattn_weightsr!   r!   r"   rG      s   


zFlashBlocksparseMHA.forward)TTr	   Fr
   NN)rN   N)NNF)rH   rI   rJ   r   rG   rL   r!   r!   r   r"   rM      s    #rM   )mathr   r-   torch.nnrU   einopsr   flash_attn.bert_paddingr   r   r   +flash_attn.flash_blocksparse_attn_interfacer   r   Moduler   rM   r!   r!   r!   r"   <module>   s     