o
    i/                     @   sd   d dl Z d dlZd dlmZmZ d dlm  mZ d dl	Z	d dl
mZ d dlmZ G dd dZdS )    N)TypeOptional)utilsc                   @   s   e Zd Z		ddeej dededefddZede	fd	d
Z
dd Zejdejdejdejdeej deej deej dejfddZejdejdejdejdeej deej deej dejdejdejfddZdS ) FlashAttentionBackwardPreprocess   dtypehead_dimm_block_sizenum_threadsc                 C   s>   || _ || _d}tt|| | | _|| jk| _|| _dS )ao  
        All contiguous dimensions must be at least 16 bytes aligned which indicates the head dimension
        should be a multiple of 8.

        :param head_dim: head dimension
        :type head_dim: int
        :param m_block_size: m block size
        :type m_block_size: int
        :param num_threads: number of threads
        :type num_threads: int
            N)r   r	   intmathceilhead_dim_paddedcheck_hdim_oobr
   )selfr   r   r	   r
   hdim_multiple_of r   Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/flash_bwd_preprocess.py__init__   s   
z)FlashAttentionBackwardPreprocess.__init__returnc                 C   sD   | t jt jfvr
dS |d dkrdS |d dkrdS ||k r dS dS )a  Check if the kernel can be implemented with the given parameters.

        :param dtype: data type
        :type dtype: cutlass.Numeric
        :param head_dim: head dimension
        :type head_dim: int
        :param m_block_size: m block size
        :type m_block_size: int
        :param num_threads: number of threads
        :type num_threads: int

        :return: True if the kernel can be implemented, False otherwise
        :rtype: bool
        F   r   r   T)cutlassFloat16BFloat16)r   r   r	   r
   r   r   r   can_implement+   s   z.FlashAttentionBackwardPreprocess.can_implementc           	      C   s.  | j d dkr	dn| j d dkrdn
| j d dkrdnd}d}|| jj }tjtj | j|d}|| | _| j| j dks@J tj	| j| j | jfdd}t
d	|f}t|||| _t|||| _|tjj }tjtj tj|d}| j| j  | | j dksJ t|t
| jt
|| _d S )
Nr   r   @   r      )num_bits_per_copy)   r   )orderr   )r   r   widthcutemake_copy_atomnvgpuCopyUniversalOpgmem_threads_per_rowr
   make_ordered_layoutmake_layoutmake_tiled_copy_tvgmem_tiled_copy_Ogmem_tiled_copy_dOr   Float32r	   gmem_tiled_copy_dQaccum)	r   gmem_k_block_sizeuniversal_copy_bitsasync_copy_elemsatom_universal_copytOdO_layoutvOdO_layoutasync_copy_elems_accumatom_universal_copy_accumr   r   r   _setup_attributesE   sT   		


z2FlashAttentionBackwardPreprocess._setup_attributesmOmdOmdPsummLSEmLSElog2mdQaccumstreamc           	      C   sJ  t |j|jk rtdt |jt jt jfvrtdt |jt jfvr,tdt |d urAt |jt jfvrAtdt |d url|d usPJ dt |jt jfvr^tdt |jt jfvrltd|   t	|j
d | jt|j
d	 t|j
d
 f}| ||||||| j| j| j	j|| jddg|d d S )Nz(All tensors must have the same data typez%Only Float16 or BFloat16 is supportedzdPsum tensor must be Float32zdQaccum tensor must be Float32z3If mLSE is provided, mLSElog2 must also be providedzLSE tensor must be Float32zLSElog2 tensor must be Float32r      r   )gridblockr=   )r   
const_exprelement_type	TypeErrorr   r   r,   r6   r"   ceil_divshaper	   sizekernelr*   r+   r-   launchr
   )	r   r7   r8   r9   r:   r;   r<   r=   grid_dimr   r   r   __call__|   sH   


z)FlashAttentionBackwardPreprocess.__call__r*   r+   r-   c
           /   	   C   s  t j \}
}}t j \}}}| j| jf}t ||d |d f ||df}t ||d |d f ||df}||
}||
}||}||}t 	| j| jf}||}|d|}t
j||jd d}||}|d|}t
j||jd d}|jd }t || j}t|d urt |||d f | jf|f}tjj} |
||| j  k r||
 } t |}!t |}"t j|dgdt j|dgdksJ t j|dgdt j|dgdksJ t j|dgdt j|dgdksJ tjt |!jd ddD ][}#|d|#df d ||| j  |d d  k rZt j||d |#d f |!d |#d f t| jr5|d |#d f nd d	 t j||d |#d f |"d |#d f t| jrV|d |#d f nd d	 q|! tj|" tj jt jjd
dd}$t
j|$tj| jd}$t  t j|!dgdtj}%|%!|$ t |||d f | jf|f}&|d d dkrtjt |%ddD ]!}#|d|#df d }'|'|jd || j  k r|%|# nd
|&|'< qt|d ur
| j| j f}(t |||d f |(|f})|	|
}*|*|)}+t |+},|,"d
 t |	|,|+ t|d urCt |||d f | jf|f}-t#$t#j%}.|
||| j  k rE| tjj kr=| |. nd
|-|
< d S d S d S )Nr      )limitr   )moder>   T)unroll_full)predg        )r   Nr   )init_valreduction_profile)r!   )r   r   r   )&r"   arch
thread_idx	block_idxr	   r   
local_tile	get_slicepartition_Smake_identity_tensorr   predicate_krE   round_upr   rA   r,   infmake_fragment_likerF   rangecopyr   loadtoreduceReductionOpADDwarp_reduceoperatoraddr&   make_fragmentstorefillr   log2e)/r   r7   r8   r9   r:   r;   r<   r*   r+   r-   tidx_m_blocknum_head
batch_sizeblkOdO_shapegOgdOgmem_thr_copy_Ogmem_thr_copy_dOtOgOtOgdOcOdOtOcOt0OcOtOpOtOcdOt0OcdOtOpdOseqlen_qseqlen_q_roundedgLSElsetOrOtOrdOmdpsumdP_sumgdPsumrowblkdQaccum_shapegdQaccumgmem_thr_copy_dQaccum	tQgQaccumzerogLSElog2LOG2_Er   r   r   rG      s   








$$$. 

.



$z'FlashAttentionBackwardPreprocess.kernelN)r   r   )__name__
__module____qualname__r   r   Numericr   r   staticmethodboolr   r6   r"   jitTensorr   cudaCUstreamrJ   rG   	TiledCopyr   r   r   r   r      sf    
73	
r   )r   re   typingr   r   cuda.bindings.driverbindingsdriverr   r   cutlass.cuter"   flash_attn.cuter   r   r   r   r   r   <module>   s   