o
    Ti:,                     @   s   d dl Z d dlZd dlmZ dd Zdd Zeddd iedd	d iejd
d Z	eddd ieddd iejdd Z
G dd de jjZG dd dZdS )    Nc                 C   sP   | d8 } | | d? O } | | d? O } | | d? O } | | d? O } | | d? O } | d7 } | S )N                nr   r   Z/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/sparse_attention/softmax.pynext_power_of_2   s   r   c                 C   s   | dk rdS | dk rdS dS )Ni   r   i   r   r   r   r   r   r   r
   	num_warps   s
   r   c                  O      t | d |d  S N   BLOCKr   argsmetar   r   r
   <lambda>"       r   TNc                  O   r   r   r   r   r   r   r
   r   #   r   c           '      K   s  |d }|d }t d}t d}|| }|| }t d|| }t d|| }||d  }t |d }t |d }||k }t |||d }t || |d  d }t || |d  d }t || |d  d }t || |d  d }| ||  || |  ||  | }t j||td d	} | t j} |d
 r| | } |d r|||  ||	  ||  || |
  ||
  | }!t j|!|dd	}"| |" } |d r|||  ||  | }#t j|#|td d	}$|d rt |$dktd d}$| |$ } |d r<|||  || |  ||  | }%t j|%|td d	}&|d r8t |&dktd d}&| |& } t | } t j	|| |d d S )Nr   r   r   r   r   r      infmaskotherAPPLY_SCALE	APPLY_RPEAPPLY_KP_MASKKP_MASK_MULg        APPLY_ATTN_MASKATTN_MASK_MULr   )
tl
program_idarangeloadwherefloattofloat32softmaxstore)'XscaleLUTRPEKP_MATTN_Msizemax	stride_zxstride_zrpestride_hrpestride_srpestride_zkpmstride_zattnmr   r   r   pidhmpidzrxmrbmrxnrbnheadersizeoffsetcheckrbmnblockidcolumnidrowidheadidpxxprperpepkp_mkp_mpattn_mattn_mr   r   r
   _forward"   sP   

$4


$

rS   c                  O   r   Nr   r   r   r   r   r   r
   r   Z   r   c                  O   s   t | d |d  S rT   r   r   r   r   r
   r   [   r   c                 K   s\  t d}t d}	|d }
|d }|| }|| }t d|
| }t d|
| }||d  }t |d }t |d }||k }t |||d }t || |d  }| |	|  || |  ||  | } ||	|  || |  ||  | }t j| |dd}t j||dd}|t j}|t j}||t || d  | }t j|||d d S )	Nr   r   r   r   r   r   r   r$   )	r%   r&   r'   r(   r)   r+   r,   sumr.   )r/   r0   DXr1   r5   r6   
stride_zdxr   r<   r=   r   r   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rL   dxyr   r   r
   	_backwardZ   s,   

$$rZ   c                   @   s6   e Zd Ze Zedd Zedd Zedd ZdS )_sparse_softmaxc              	   C   s<  t jg t j| jd}| }t| jd D ]}t || |d d d d f df}qt 	|}t j
|d d dd|dd < t |  }|  d d df }|  d d df }	|  d d df }
t j||
|	|fddd}|d d|   }t j||fddd}t ||ft j|}|t| fS )Ndtypedevicer   )dimr   r   r   )torchtensorint64r^   clonerangeshapecatrU   
zeros_likecumsumr'   nonzerostackviewnumeltypeint32r+   intmax)layoutblockr^   _emptysizeshoffsetsidxheadrowscolumnscorerB   lutr   r   r
   make_lut   s   (
z_sparse_softmax.make_lutc                    s  |dkrdnd}|d u rd}d\}}}t jd|j|jd}nd}|d|d|d}}}|d u rDd}d}t jd|j|jd}nd}|d}|d u r^d}d}t jd|j|jd}nd}|d}|jd  |||||d	k|d	kd
} fdd}t| |||
|||||d|||||fi | | | | ||
 | _	| _
|| _|| _|| _|| _|| _|| _|| _|| _|S )N      ?FT)r   r   r   r   r\   r   r   mul)r   r   r   r    r"   r!   r#   c                    s   d d    gS Nr   r   r   optMrs   spdimsr   r
   r      s    z)_sparse_softmax.forward.<locals>.<lambda>)ra   emptyr]   r^   striderf   rS   
mark_dirtysave_for_backwardr   rs   maxlutr0   apply_scale	apply_rpeapply_kp_maskapply_attn_maskkp_mask_modeattn_mask_mode)ctxrL   r0   rN   key_padding_mask	attn_maskr   r   r   rs   r}   
num_blocksr   benchtimer   r   r7   r8   r9   r   r:   r   r;   r   gridr   r   r
   forward   s^   
"


	

z_sparse_softmax.forwardc                    sr   j \}}|jd   fdd}t| |j||j|d|djd |d d d d d d d d d d d d d d fS )Nr   c                    s   j d j d  j  gS r   )r   rs   r   r   r   r   r
   r      s    z*_sparse_softmax.backward.<locals>.<lambda>)r   )saved_tensorsrf   rZ   r0   r   r   rs   )r   rX   rL   r}   r   r   r   r
   backward   s
   

."z_sparse_softmax.backwardN)	__name__
__module____qualname__dictbwd_kernelsstaticmethodr~   r   r   r   r   r   r
   r[   {   s    

>r[   c                   @   s@   e Zd ZdZdd Zdd ZdddZ			
	
	
		dddZd
S )Softmaxar  Block-Sparse Softmax class; this class computes softmax on a block sparse matrix. It is also able to apply either/all of the following masks:
       - relative position embedding
       - key padding mask
       - attention mask

    For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509
    c                  O   s   t j| i |S )N)r[   apply)r   kwargsr   r   r
   sparse_softmax   s   zSoftmax.sparse_softmaxc                 C   s2   |f}|| j vrt| j| j|| j |< | j | S )zCGenerates the sparsity layout used in block-sparse softmax
        )	lut_cacher[   r~   rr   rs   )selfr^   keyr   r   r
   r~      s   

zSoftmax.make_lutFc                 C   s4   |   | _|j| _|| _|| _|| _t | _	dS )a  Initialize the Block-Sparse Softmax class.

        Arguments:
             layout: required: sparsity layout tensor
             block: required: an integer determining the block size.
             bench: optional: set if you want to do benchmarking
        N)
rU   itemr   rf   r   rr   rs   r   r   r   )r   rr   rs   r   r   r   r
   __init__   s   	zSoftmax.__init__r   Naddc                 C   s   dg}|dur|j |j krtd|j  |dur%|j |j kr%td|j  |dur6|j |j kr6td|j  | |j\}	}
t|||||||| j| j|	| j|
| j	|}|d | _
|S )a  Applies softmax on a Block-Sparse input tensor.

        For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509

        Arguments:
             x: required: a block-sparse tensor that softmax is applied on it; computation will be in place and result will be returned in the same tensor
             scale: optional: a float value; x values will be multiplied by this value before normalization. Default value is 1.0.
             rpe: optional: a tensor same dimension as x that is used as relative position embedding
             key_padding_mask: optional: a mask tensor of size (BatchSize X SequenceLength)
             attn_mask: optional: a mask tensor of size (SequenceLength X SequenceLength); currently only 2D is supported
             key_padding_mask_mode: optional: a boolean determining if key_padding_mask needs to be added or multiplied
             attn_mask_mode: optional: a boolean determining if attn_mask needs to be added or multiplied

        Return:
             x: a block-sparse tensor contains normalized input x using softmax; and masks applied if given
        Nz&relative position embedding must be %szAttention mask must be %szKey padding mask must be %sr   )r]   
ValueErrorr~   r^   r   r   r   rs   r   r   time_y)r   rL   r0   rN   r   r   key_padding_mask_moder   r   r}   r   r   r   r
   __call__  s   
zSoftmax.__call__)F)r   NNNr   r   )r   r   r   __doc__r   r~   r   r   r   r   r   r
   r      s    
r   )ra   tritontriton.languagelanguager%   r   r   
heuristicsjitrS   rZ   autogradFunctionr[   r   r   r   r   r
   <module>   s   5e