import math
from typing import Type

import cuda.bindings.driver as cuda
import cutlass
import cutlass.cute as cute
from cutlass.cute.nvgpu import cpasync, warp

from flash_attn.cute import ampere_helpers as sm80_utils
from flash_attn.cute import utils


class FlashAttentionBackwardPostprocess:
    def __init__(
        self,
        dtype: Type[cutlass.Numeric],
        head_dim: int,
        # Default values below are assumed; the original constants are not recoverable.
        m_block_size: int = 128,
        num_threads: int = 256,
        AtomLayoutMdQ: int = 1,
        dQ_swapAB: bool = False,
    ):
        """Initializes the configuration for a flash attention v2 kernel.

        All contiguous dimensions must be at least 16-byte aligned, which means the head dimension
        should be a multiple of 8.

        :param head_dim: head dimension
        :type head_dim: int
        :param m_block_size: m block size
        :type m_block_size: int
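        :param num_threads: number of threads per thread block
        :type num_threads: int
        :param AtomLayoutMdQ: MMA atom layout along M used for the dQ tile
        :type AtomLayoutMdQ: int
        :param dQ_swapAB: whether the dQ MMA swaps its A and B operands
        :type dQ_swapAB: bool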
        """
        self.dtype = dtype
        self.m_block_size = m_block_size
        # Pad head_dim up to a multiple of 32 (value assumed) so smem/gmem tiles keep
        # their alignment; remember whether head-dim bounds checks are needed.
        hdim_multiple_of = 32
        self.head_dim_padded = int(math.ceil(head_dim / hdim_multiple_of)) * hdim_multiple_of
        self.check_hdim_oob = head_dim != self.head_dim_padded
        self.num_threads = num_threads
        self.AtomLayoutMdQ = AtomLayoutMdQ
        self.dQ_swapAB = dQ_swapAB

    @staticmethod
    def can_implement(dtype: Type[cutlass.Numeric], head_dim: int, m_block_size: int) -> bool:
        """Check if the kernel can be implemented with the given parameters.

        :param dtype: data type
        :type dtype: cutlass.Numeric
        :param head_dim: head dimension
        :type head_dim: int
        :param m_block_size: m block size
        :type m_block_size: int

        :return: True if the kernel can be implemented, False otherwise
        :rtype: bool
        """
        if dtype not in [cutlass.Float16, cutlass.BFloat16]:
            return False
        # Keep the 16-byte alignment of the contiguous dimension (see the class docstring).
        if head_dim % 8 != 0:
            return False
        if m_block_size % 8 != 0:
            return False
        return True

    def _setup_attributes(self):
        # Copy atoms and layouts reconstructed from the recoverable identifiers; exact
        # tile parameters are best-effort and may differ from the shipped kernel.
        universal_copy_bits = 128

        # Global -> shared copy of the fp32 dQ accumulator: 128-bit cp.async per thread,
        # with the GLOBAL cache hint.
        async_copy_elems_accum = universal_copy_bits // cutlass.Float32.width
        atom_async_copy_accum = cute.make_copy_atom(
            cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL),
            cutlass.Float32,
            num_bits_per_copy=universal_copy_bits,
        )
        assert (self.m_block_size * self.head_dim_padded) % (
            async_copy_elems_accum * self.tiled_mma.size
        ) == 0
        self.g2s_tiled_copy_dQaccum = cute.make_tiled_copy_tv(
            atom_async_copy_accum,
            cute.make_layout(self.tiled_mma.size),
            cute.make_layout(async_copy_elems_accum),
        )

        # Shared -> register copy of the accumulator, one fp32 element at a time.
        atom_universal_copy_accum = cute.make_copy_atom(
            cute.nvgpu.CopyUniversalOp(), cutlass.Float32, num_bits_per_copy=cutlass.Float32.width
        )
        self.s2r_tiled_copy_dQaccum = cute.make_tiled_copy_tv(
            atom_universal_copy_accum,
            cute.make_layout(self.tiled_mma.size),
            cute.make_layout(1),
        )

        # Shared -> global copy of the converted (fp16/bf16) dQ tile.
        async_copy_elems = universal_copy_bits // self.dtype.width
        atom_universal_copy = cute.make_copy_atom(
            cute.nvgpu.CopyUniversalOp(), self.dtype, num_bits_per_copy=universal_copy_bits
        )
        assert self.head_dim_padded % async_copy_elems == 0
        gmem_threads_per_row = math.gcd(self.head_dim_padded // async_copy_elems, self.tiled_mma.size)
        assert self.tiled_mma.size % gmem_threads_per_row == 0
        tdQ_layout = cute.make_ordered_layout(
            (self.tiled_mma.size // gmem_threads_per_row, gmem_threads_per_row), order=(1, 0)
        )
        vdQ_layout = cute.make_layout((1, async_copy_elems))
        self.gmem_tiled_copy_dQ = cute.make_tiled_copy_tv(atom_universal_copy, tdQ_layout, vdQ_layout)

        # Shared-memory layouts: a flat fp32 accumulator buffer and a swizzled dQ tile.
        self.sdQaccum_layout = cute.make_layout(self.m_block_size * self.head_dim_padded)
        mma_shape_n = self.tiled_mma.get_tile_size(1)
        sdQ_layout_atom = sm80_utils.get_smem_layout_atom(self.dtype, mma_shape_n)
        self.sdQ_layout = cute.tile_to_shape(
            sdQ_layout_atom, (self.m_block_size, self.head_dim_padded), (0, 1)
        )

    @cute.jit
    def __call__(
        self,
        mdQaccum: cute.Tensor,
        mdQ: cute.Tensor,
        scale: cutlass.Float32,
        stream: cuda.CUstream,
    ):
        if cutlass.const_expr(mdQ.element_type not in [cutlass.Float16, cutlass.BFloat16]):
            raise TypeError("Only Float16 or BFloat16 is supported")
        if cutlass.const_expr(mdQaccum is not None and mdQaccum.element_type not in [cutlass.Float32]):
            raise TypeError("dQaccum tensor must be Float32")
        # The tiled MMA is only used to shape the register fragment that holds dQ; the
        # M/N atom layout is swapped when the dQ GEMM swaps its A and B operands.
        num_mma_warps = self.num_threads // 32
        AtomLayoutdQ = (
            (self.AtomLayoutMdQ, num_mma_warps // self.AtomLayoutMdQ, 1)
            if cutlass.const_expr(not self.dQ_swapAB)
            else (num_mma_warps // self.AtomLayoutMdQ, self.AtomLayoutMdQ, 1)
        )
        tiled_mma = cute.make_tiled_mma(
            warp.MmaF16BF16Op(self.dtype, cutlass.Float32, (16, 8, 16)),
            AtomLayoutdQ,
            permutation_mnk=(AtomLayoutdQ[0] * 16, AtomLayoutdQ[1] * 16, 16),
        )
        self.tiled_mma = tiled_mma
        self._setup_attributes()
        smem_size = max(
            cute.size_in_bytes(cutlass.Float32, self.sdQaccum_layout),
            cute.size_in_bytes(self.dtype, self.sdQ_layout),
        )
        # One CTA per (m_block, head, batch); mdQ is assumed to be (seqlen_q, head_dim, num_head, batch).
        grid_dim = (
            cute.ceil_div(mdQ.shape[0], self.m_block_size),
            cute.size(mdQ.shape[2]),
            cute.size(mdQ.shape[3]),
        )
        self.kernel(
            mdQaccum,
            mdQ,
            scale,
            tiled_mma,
            self.sdQaccum_layout,
            self.sdQ_layout,
            self.g2s_tiled_copy_dQaccum,
            self.s2r_tiled_copy_dQaccum,
            self.gmem_tiled_copy_dQ,
        ).launch(
            grid=grid_dim,
            block=[self.tiled_mma.size, 1, 1],
            smem=smem_size,
            stream=stream,
        )

    @cute.kernel
    def kernel(
        self,
        mdQaccum: cute.Tensor,
        mdQ: cute.Tensor,
        scale: cutlass.Float32,
        tiled_mma: cute.TiledMma,
        sdQaccum_layout: cute.Layout,
        sdQ_layout: cute.ComposedLayout,
        g2s_tiled_copy_dQaccum: cute.TiledCopy,
        s2r_tiled_copy_dQaccum: cute.TiledCopy,
        gmem_tiled_copy_dQ: cute.TiledCopy,
    ):
        # Reads this block's fp32 dQ accumulator, scales and converts it to the output
        # dtype, and writes it back to dQ. The copy/partition sequence below follows the
        # recoverable structure of the original kernel.
        tidx, _, _ = cute.arch.thread_idx()
        m_block, num_head, batch_size = cute.arch.block_idx()

        # Tile the flat accumulator and the (seqlen, head_dim) dQ output for this CTA.
        blkdQaccum_shape = (self.m_block_size * self.head_dim_padded,)
        gdQaccum = cute.local_tile(mdQaccum[None, num_head, batch_size], blkdQaccum_shape, (m_block,))
        blkdQ_shape = (self.m_block_size, self.head_dim_padded)
        gdQ = cute.local_tile(mdQ[None, None, num_head, batch_size], blkdQ_shape, (m_block, 0))

        # Shared memory: the fp32 accumulator view aliases the same buffer as the dQ tile.
        smem = cutlass.utils.SmemAllocator()
        sdQ = smem.allocate_tensor(self.dtype, sdQ_layout, byte_alignment=1024)
        sdQaccum = cute.make_tensor(
            cute.recast_ptr(sdQ.iterator, dtype=cutlass.Float32), sdQaccum_layout
        )

        seqlen_q = mdQ.shape[0]
        seqlen_q_rounded = cute.round_up(seqlen_q, self.m_block_size)  # accumulator extent is the rounded length

        # gmem -> smem: bring the fp32 accumulator in with cp.async and wait for it.
        g2s_thr_copy_dQaccum = g2s_tiled_copy_dQaccum.get_slice(tidx)
        tdQgdQaccum = g2s_thr_copy_dQaccum.partition_S(gdQaccum)
        tdQsdQaccum = g2s_thr_copy_dQaccum.partition_D(sdQaccum)
        cute.copy(g2s_tiled_copy_dQaccum, tdQgdQaccum, tdQsdQaccum)
        cute.arch.cp_async_commit_group()
        cute.arch.cp_async_wait_group(0)
        cute.arch.barrier()

        # smem -> registers, laid out like the MMA accumulator.
        s2r_thr_copy_dQaccum = s2r_tiled_copy_dQaccum.get_slice(tidx)
        tdQsdQaccum = s2r_thr_copy_dQaccum.partition_S(sdQaccum)
        acc_shape = tiled_mma.partition_shape_C(
            (self.m_block_size, self.head_dim_padded)
            if cutlass.const_expr(not self.dQ_swapAB)
            else (self.head_dim_padded, self.m_block_size)
        )
        acc = cute.make_fragment(acc_shape, cutlass.Float32)
        assert cute.size(acc) == cute.size(tdQsdQaccum)
        tdQrdQaccum = s2r_thr_copy_dQaccum.retile(acc)
        for i in cutlass.range(cute.size(tdQsdQaccum), unroll_full=True):
            tdQrdQaccum[i] = tdQsdQaccum[i]

        # Scale and convert fp32 -> output dtype (the scale factor is assumed to be the
        # softmax scale folded into dQ here, as in the reference postprocess kernel).
        rdQ = cute.make_fragment_like(acc, self.dtype)
        rdQ.store((acc.load() * scale).to(self.dtype))

        # registers -> smem, reusing the (now consumed) accumulator buffer.
        cute.arch.barrier()
        smem_copy_atom_dQ = cute.make_copy_atom(
            cute.nvgpu.CopyUniversalOp(), self.dtype, num_bits_per_copy=self.dtype.width
        )
        smem_tiled_copy_dQ = cute.make_tiled_copy_C(smem_copy_atom_dQ, tiled_mma)
        smem_thr_copy_dQ = smem_tiled_copy_dQ.get_slice(tidx)
        taccdQrdQ = smem_thr_copy_dQ.retile(rdQ)
        taccdQsdQ = smem_thr_copy_dQ.partition_D(sdQ)
        cute.copy(smem_copy_atom_dQ, taccdQrdQ, taccdQsdQ)

        # smem -> gmem, predicated on the head dimension and the remaining sequence length.
        gmem_thr_copy_dQ = gmem_tiled_copy_dQ.get_slice(tidx)
        tdQsdQ = gmem_thr_copy_dQ.partition_S(sdQ)
        tdQgdQ = gmem_thr_copy_dQ.partition_D(gdQ)
        tdQrdQ = cute.make_fragment_like(tdQsdQ, self.dtype)
        cute.arch.barrier()
        cute.autovec_copy(tdQsdQ, tdQrdQ)
        cdQ = cute.make_identity_tensor((self.m_block_size, self.head_dim_padded))
        tdQcdQ = gmem_thr_copy_dQ.partition_D(cdQ)
        tdQpdQ = utils.predicate_k(tdQcdQ, limit=mdQ.shape[1])
        for rest_m in cutlass.range(cute.size(tdQrdQ.shape[1]), unroll_full=True):
            if tdQcdQ[0, rest_m, 0][0] < seqlen_q - m_block * self.m_block_size:
                cute.copy(
                    gmem_tiled_copy_dQ,
                    tdQrdQ[None, rest_m, None],
                    tdQgdQ[None, rest_m, None],
                    pred=tdQpdQ[None, rest_m, None],
                )
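

# Usage sketch (illustrative only): the tensor names, scale, and stream below are
# assumptions about the caller, not part of this module. A caller would typically wrap
# torch tensors for dQaccum (fp32) and dQ (fp16/bf16) as cute tensors (e.g. via the
# cute DSL's from_dlpack) and pass the current CUDA stream:
#
#     postprocess = FlashAttentionBackwardPostprocess(cutlass.BFloat16, head_dim=128)
#     if FlashAttentionBackwardPostprocess.can_implement(cutlass.BFloat16, 128, 128):
#         postprocess(mdQaccum, mdQ, softmax_scale, stream)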