import torch
import torch.nn as nn
from torch import distributed as dist
from deepspeed.ops.sparse_attention import SparsityConfig


class SparseSelfAttention(nn.Module):
    """Implements an efficient Sparse Self Attention of Transformer layer based on `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509

    For more information please see, TODO DeepSpeed Sparse Transformer.

    For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial.
       )	num_headsaddmuli   c                    s>   t    || _| j|}| d| d| _|| _|| _dS )at  Initialize the sparse self attention layer.
        Arguments:
            sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class.
            key_padding_mask_mode: optional: a string determining if key padding mask needs to be added, `add`, or be multiplied, `mul`.
            attn_mask_mode: optional: a string determining if attention mask needs to be added, `add`, or be multiplied, `mul`.
            max_seq_length: optional: the maximum sequence length this sparse attention module will be applied to; it controls the size of the master_layout.
        """
        super().__init__()

        # sparsity information
        self.sparsity_config = sparsity_config

        # initialize the sparse layout and register it as a buffer so it moves
        # and serializes with the module
        master_layout = self.sparsity_config.make_layout(max_seq_length)
        self.register_buffer("master_layout", master_layout)
        self._need_layout_synchronization = True
        self.key_padding_mask_mode = key_padding_mask_mode
        self.attn_mask_mode = attn_mask_mode

    # class-level cache of compiled sparse kernels, keyed by sequence length
    ops = dict()

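    # Layout note (editorial, illustrative): `make_layout` produces an integer
    # tensor of shape (num_heads, max_seq_length // block, max_seq_length // block)
    # in which entry [h, i, j] == 1 allows query block i to attend to key block j
    # for head h; `get_layout` below slices the top-left corner of this master
    # layout to match the actual sequence length.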
zSparseSelfAttention.__init__c                 C   sv   | j rt rtj| jdd d| _ || jj dkr'td| d| jj d|| jj }| jdd |d |f  S )Nr   )srcFzSequence Length, z&, needs to be dividable by Block size !.)	r   distis_initialized	broadcastr	   r   block
ValueErrorcpu)r   L
num_blocksr   r   r   
get_layout2   s   zSparseSelfAttention.get_layoutc           	      C   s   ddl m} ddlm} |tjvr;| |}||| jjdddd}||| jjdddd}||| jj}|||ftj|< tj| S )	Nr   )MatMul)SoftmaxsddFT)trans_atrans_bdsd)	%deepspeed.ops.sparse_attention.matmulr#   &deepspeed.ops.sparse_attention.softmaxr$   r   opsr"   r   r   )	r   Hr    r#   r$   sparsity_layoutsparse_dot_sdd_ntsparse_dot_dsd_nnsparse_softmaxr   r   r   get_ops@   s   


zSparseSelfAttention.get_opsc                 C   s,   |  \}}}}||kr|ddddS |S )Nr            )sizepermute)r   xr    bszr   seq_lenhead_dimr   r   r   transpose_key_for_scoresR   s   z,SparseSelfAttention.transpose_key_for_scoresFc                 C   sD   | |}|r| }t|d ddD ]}|j|d}q|S | S )Nr2   r   )dim)typer=   rangesqueeze)r   qtyper7   is_key_padding_maskxdimdr   r   r   transpose_mask_for_sparseX   s   
    def forward(self, query, key, value, rpe=None, key_padding_mask=None, attn_mask=None):
        """Applies forward phase of sparse self attention

        Arguments:
            query: required: query tensor
            key: required: key tensor
            value: required: value tensor
            rpe: optional: a tensor same dimension as x that is used as relative position embedding
            key_padding_mask: optional: a mask tensor of size (BatchSize X SequenceLength)
            attn_mask: optional: a mask tensor of size (SequenceLength X SequenceLength); currently only 2D is supported
        Note: key_padding_mask_mode and attn_mask_mode are not parameters of this
            method; they are the `add`/`mul` strings set at construction time that
            determine whether each mask is added to or multiplied into the scores.

        Return:
             attn_output: a dense tensor containing attention context
        """
        assert query.dtype == torch.half, 'sparse attention only supports training in fp16 currently, please file a github issue if you need fp32 support'

        bsz, num_heads, tgt_len, head_dim = query.size()

        # transpose back key if it is already transposed
        key = self.transpose_key_for_scores(key, tgt_len)

        # check that operation is supported
        if query.shape != key.shape or key.shape != value.shape:
            raise NotImplementedError('only self-attention is supported for now')

        # squeeze key_padding_mask if it is given
        if key_padding_mask is not None:
            key_padding_mask = self.transpose_mask_for_sparse(query.dtype, key_padding_mask, is_key_padding_mask=True)

        # squeeze attn_mask if it is given
        if attn_mask is not None:
            attn_mask = self.transpose_mask_for_sparse(query.dtype, attn_mask)

        # fetch (or build and cache) the block-sparse kernels for this sequence length
        sparse_dot_sdd_nt, sparse_dot_dsd_nn, sparse_softmax = self.get_ops(num_heads, tgt_len)

        scaling = float(head_dim)**-0.5

        # attention scores: block-sparse QK^T, then block-sparse softmax with
        # relative position embedding and masks applied per the configured modes
        attn_output_weights = sparse_dot_sdd_nt(query, key)
        attn_output_weights = sparse_softmax(attn_output_weights,
                                             scale=scaling,
                                             rpe=rpe,
                                             key_padding_mask=key_padding_mask,
                                             attn_mask=attn_mask,
                                             key_padding_mask_mode=self.key_padding_mask_mode,
                                             attn_mask_mode=self.attn_mask_mode)

        # context: block-sparse probabilities times dense V yields a dense output
        attn_output = sparse_dot_dsd_nn(attn_output_weights, value)
        return attn_output
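

# ---------------------------------------------------------------------------
# Usage sketch (editorial addition, not upstream code). It assumes a CUDA
# device with DeepSpeed's sparse-attention (Triton) kernels available, fp16
# inputs of shape (batch, heads, seq_len, head_dim), and a seq_len divisible
# by the config's block size. FixedSparsityConfig is one of the concrete
# SparsityConfig subclasses in deepspeed.ops.sparse_attention; its exact
# constructor arguments may vary across DeepSpeed versions.
if __name__ == '__main__':
    from deepspeed.ops.sparse_attention import FixedSparsityConfig

    config = FixedSparsityConfig(num_heads=4, block=16)
    attn = SparseSelfAttention(sparsity_config=config, max_seq_length=1024).cuda()

    # self-attention only: query, key, and value must share one shape
    q = torch.randn(2, 4, 256, 64, dtype=torch.half, device='cuda')
    out = attn(q, q, q)
    print(out.shape)  # expected: torch.Size([2, 4, 256, 64])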