o
    Tio                     @   sZ   d Z ddlZddlZddlmZ ejdejdejdejfddZG dd	 d	ej	j
ZdS )
z
Inspired by original Triton implementation:
https://github.com/openai/triton/blob/release/2.1.x/python/tutorials/06-fused-attention.py
    NBLOCK_MBLOCK_DMODELBLOCK_Nc           3      C   st  t d}t d}|| }t j| | ||f||f|| df||fdd}t j|| ||f||fd||fdd}t j|| ||f||fd||fdd} || t d| }!t d|}"t j|gt jdtd }#t j|gt jd}$t j||gt jd}%|d	 }&t |}'|'|& t j	}'d}(|})t
|(|)|D ]v}*t |}+t | },t j||gt jd}-|-t |'|+7 }-t |#t |-d}.t j|#|. }/t j|-|.d d d f  }0|$d |/ }1|%|1d d d f 9 }%|%t |0t j	|,7 }%|$|/ t |0d }$|.}#t |d|f}t | |df} q|%|$d d d f  }%t j|| ||f||f|| df||fdd}2t |2|%t j	 d S )
Nr      )r   r   )baseshapestridesoffsetsblock_shapeorder)r   r   )r   r   )dtypeinfg/ldG?)tl
program_idmake_block_ptrarangezerosfloat32floatloadtofloat16rangedotmaximummaxmathexp2sumadvancestore)3QKVsm_scaleOut	stride_qz	stride_qh	stride_qm	stride_qk	stride_kz	stride_kh	stride_kn	stride_kk	stride_vz	stride_vh	stride_vk	stride_vn	stride_oz	stride_oh	stride_om	stride_onZHN_CTXr   r   r   start_moff_hz
qvk_offsetQ_block_ptrK_block_ptrV_block_ptroffs_moffs_nm_il_iaccqk_scaleqlohistart_nkvqkm_i_newalphap	acc_scaleO_block_ptr rQ   b/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/transformer/inference/triton_ops.py_fwd_kernel   st   










rS   c                       s&   e Zd Z fddZdddZ  ZS )triton_flash_attnc                    s   t t|   d S )N)superrT   __init__)self	__class__rQ   rR   rV   |   s   ztriton_flash_attn.__init__Tc                 C   s.  |rdnd}|j d |j d |j d }}}	t|}
t|j d ||j d |j d  f}|dkr5dnd}t| |||||
|d|d|d|d	|d|d|d|d	|d|d|d|d	|
d|
d|
d|
d	|j d |j d |j d f||||dd
 |
S )N   @      r   r            )r   r   r   	num_warps
num_stages)r   torch
empty_liketritoncdivrS   stride)rW   rE   rI   rJ   r$   	block_128BLOCKLqLkLvogridra   rQ   rQ   rR   forward   sL   "
&ztriton_flash_attn.forward)T)__name__
__module____qualname__rV   ro   __classcell__rQ   rQ   rX   rR   rT   z   s    rT   )__doc__rc   re   triton.languagelanguager   jit	constexprrS   nnModulerT   rQ   rQ   rQ   rR   <module>   s   j