o
    i@,                     @   s   d Z ddlZddlZddlZddlmZ ejdejdejdejfddZ	ejdejdejfd	d
Z
ejdejdejdejfddZG dd dejjZejZdS )z
Fused Attention
===============
This is a Triton implementation of the Flash Attention algorithm
(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf)
    NBLOCK_MBLOCK_DMODELBLOCK_Nc           ?   
   C   s|  t d}t d}|| t d| } t d|}!t d|}"||	 | d d d f |
  |"d d d f |  }#||	 |!d d d f |  |"d d d f |  }$||	 |!d d d f |
  |"d d d f |  }%| |# }&||$ }'||% }(|||  |  })t j|gt jdtd }*t j|gt jd}+t j||gt jd},t |&}-td|d | |D ]}.t |.|}.t |'|.|  }/t j||gt jd}0|0t j	|-|/dd7 }0|0|9 }0|0t 
| d d d f |.|!d d d f  kdtd7 }0t |0d}1t |0|1d d d f  }2t |2d}3t |*|1}4t |*|4 }5t |1|4 }6|5|+ |6|3  }7|6|7 }8|2|8d d d f  }2|+|7 |5 }9t |)|9 t |)}9|,|9d d d f  },t |(|.|  }:|2|:j}2|,t 	|2|:7 },|7}+|4}*qt d}|| t d| } |||  |  };|||  |  }<t |;|+ t |<|* t d|}!|| | d d d f |  |!d d d f |  }=||= }>t |>|, d S )Nr      dtypeinfTtrans_b-inf)tl
program_idarangezerosfloat32floatloadrangemultiple_ofdotwheremaxexpsummaximumstoretor   )?QKVsm_scaleTMPLMOut	stride_qz	stride_qh	stride_qm	stride_qk	stride_kz	stride_kh	stride_kn	stride_kk	stride_vz	stride_vh	stride_vk	stride_vn	stride_oz	stride_oh	stride_om	stride_onZHN_CTXr   r   r   start_moff_hzoffs_moffs_noffs_doff_qoff_koff_vq_ptrsk_ptrsv_ptrst_ptrsm_il_iaccqstart_nkqkm_ijpl_ijm_i_newalphabetal_i_newp_scale	acc_scalevl_ptrsm_ptrsoff_oout_ptrs rY   U/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/flash_attn_triton_og.py_fwd_kernel   sf   
!
000
6

0r[   D_HEADc                 C   s  t d| t d| }t d|}t | |d d d f |  |d d d f  t j}	t ||d d d f |  |d d d f  t j}
t || t j}|
|d d d f  }
t j|	|
 dd}t ||d d d f |  |d d d f  |
 t || | d S )Nr   r   )axis)r   r   r   r   r   r   r   r   )r$   DOr"   NewDODeltar   r\   off_moff_nododenomdeltarY   rY   rZ   _bwd_preprocessx   s   
660rg   c           @   	   C   s"  t d}|| } || }!| | | |!|  7 } || | |!|  7 }|| | |!|  7 }|| | |!|  7 }|| | |!|  7 }|| | |!|  7 }|| | |!|  7 }td|D ]}"|"| }#|#t d| }$|"| t d| }%t d|}&t d|}'| |$d d d f | |'d d d f |   }(||%d d d f | |'d d d f |   })||%d d d f | |'d d d f |   }*||$d d d f | |'d d d f |   }+||$d d d f | |'d d d f |   },|||  }-|
||  }.t j||gt jd}/t j||gt jd}0t |)}1t |*}2t|#|| |D ]}3|3|& }4t |(}5t j|5|1dd}6t |4d d d f |%d d d f k|6t	d}6t |.|4 }7t 
|6| |7d d d f  }8t |+}9|/t j|8|9j|9dd7 }/t |-|4 }:t j||gt jd|:d d d f  };|;t j|9|2dd7 };|8|; | }<|0t j|<|5j|5dd7 }0t j|,dd}=|=t |<|1j|17 }=t j|,|=dd |,|| 7 },|(|| 7 }(|+|| 7 }+q||%d d d f | |'d d d f |   }>||%d d d f | |'d d d f |   }?t |>|/ t |?|0 qXd S )	Nr   r   Tr	   r   )trans_a
evict_last)eviction_policy)r   r   r   r   r   r   r   r   r   r   r   r   r   r   )@r   r   r   r    r$   r^   DQDKDVr"   r#   Dr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r5   r6   r7   	num_blockr   r   r   r9   off_zoff_hrH   looffs_qmr;   r:   offs_kr@   rA   rB   do_ptrsdq_ptrsD_ptrsrV   dvdkrI   rT   r8   offs_m_currrG   rJ   mrL   rd   Didpdsdqdv_ptrsdk_ptrsrY   rY   rZ   _bwd_kernel   sh   
",,,,,


.
$,,r   c                   @   s$   e Zd Zedd Zedd ZdS )
_attentionc                 C   s  d}|j d |j d |j d }}}||kr||ksJ |dv s#J t|}	t|j d ||j d |j d  f}
tj|j d |j d  |j d f|jtjd}tj|j d |j d  |j d f|jtjd}tj|j d |j d  |j d f|jtjd}|dkrd	nd
}t|
 ||||||||	|	d|	d|	d|	d|	d|	d|	d|	d|	d|	d|	d|	d|		d|		d|		d|		d|j d |j d |j d f||||dd | 
||||	|| || _|
| _|| _|| _|	S )N   >          @   r      r   r   )devicer   r            r   r   r   	num_warps
num_stages)shapetorch
empty_liketritoncdivemptyr   r   r[   stridesave_for_backwardBLOCKgridr    r   )ctxrG   rI   rT   r    r   LqLkLvrc   r   tmpr"   r{   r   rY   rY   rZ   forward   sj   "
&$.."z_attention.forwardc                 C   s\  | j \}}}}}}| }tj|tjd}t|}	t|}
t|}t|}t| jd | jd  f |||||| j| j	d d}t
| jd f |||| j||||	|
||||d|d|d|d|d|d|d|d|d|d|d|d|jd |jd |jd | jd f| j| j| j	|dd ||j|	|
d fS )	Nr   r   r   )r   r\   r   r   r   r   )saved_tensors
contiguousr   
zeros_liker   r   rg   r   r   r   r   r    r   r   r   r   )r   rd   rG   rI   rT   rc   lr{   r   ry   rx   	do_scaledrf   r   rY   rY   rZ   backward2  sl   



#z_attention.backwardN)__name__
__module____qualname__staticmethodr   r   rY   rY   rY   rZ   r      s
    
8r   )__doc__pytestr   r   triton.languagelanguager   jit	constexprr[   rg   r   autogradFunctionr   apply	attentionrY   rY   rY   rZ   <module>   s:   eg
u