o
    پi                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
  mZ d dlZd dlmZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z m!Z!m"Z"m#Z# G dd dZ$dS )    N)SimpleNamespace)TypeCallableOptional)partial)cpasyncwarp)Float32Int32)ampere_helpers)utils)AttentionMask)SeqlenInfoQK)
ParamsBaseSingleTileSchedulerSingleTileVarlenSchedulerTileSchedulerArgumentsc                F   @   s  e Zd Z																dd	eej d
edee dededededededededededededededef$ddZ	e
	ddefddZd eej d!eej d"eej d#eej d$eej d%eej d&eej d'eej d(eej d)eej dB d*eej dB d+eej dB d,eej dB fd-d.Zd/d0 Zd1d2 Zd3d4 Zej								dd5ejd6ejd7ejd8ejd9ejd:ejd;ejd<ejd=ejd>ejd?ejd@eej dAeej dBeej dCeej dDeeB dB dEeeB dB dFeeB dB dGeej f&dHdIZejd5ejd6ejd7ejd8ejd9ejd:ejd;ejd<ejd=ejd@eej dAeej dBeej dCeej d>ejdJejdKejdLejdMejdNejdOejdPejdQejdRejdSejdTejdUejdVejdWejdXejdYejdZejd[ejd\ed]eje  fDd^d_Zej	dd`ejdaejdbejdcejddejdee!dfe!dge!dhe die djejdJejdkee  fdldmZ"ejdnejdoejd<ejd=ejdpejdqejdTejdUejdrejdsejdtejduejdvejdwe#dxejdyejf dzd{Z$ejd|ejfd}d~Z%ejdejdejdejdejdwejdejfddZ&ejdejdejdejdejdwejdejfddZ'ejdejdVejdejdejdejdejdejdejdejdejdejdcejdwejfddZ(ejdejdejdejdejdejdejdejdejdejdejdejdcejdwejfddZ)dS )FlashAttentionBackwardSm80N   @            F   dtypehead_dim
head_dim_vqhead_per_kvheadm_block_sizen_block_sizenum_stages_Qnum_stages_dOnum_threadspack_gqa	is_causal
SdP_swapAB
dKV_swapAB	dQ_swapABAtomLayoutMSdPAtomLayoutNdKVAtomLayoutMdQ	V_in_regsc                 C   s   || _ d}tt|| | | _|dur|n|}||k| _tt|| | | _|| jk| _|| jk| _|| _	|| _
|| _|	| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _| jtjj }|dkos||kos|os| | _|| _|| _dS )a  Initializes the configuration for a flash attention v2 kernel.

        All contiguous dimensions must be at least 16 bytes aligned which indicates the head dimension
        should be a multiple of 8.

        :param head_dim: head dimension
        :type head_dim: int
        :param m_block_size: m block size
        :type m_block_size: int
        :param n_block_size: n block size
        :type n_block_size: int
        :param num_threads: number of threads
        :type num_threads: int
        :param is_causal: is causal
            Nr   )r   intmathceilhead_dim_paddedsame_hdim_kvhead_dim_v_paddedcheck_hdim_oobcheck_hdim_v_oobr   r   r   r"   r#   r$   r    r!   r%   r&   r'   r(   r)   r*   cutearch	WARP_SIZEMma_dKV_is_RSr+   share_QV_smem)selfr   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   hdim_multiple_ofnum_mma_warps r=   T/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/flash_bwd.py__init__   s4   $

z#FlashAttentionBackwardSm80.__init__returnc
                 C   s   | t jt jfvr
dS |d dkrdS |d dkrdS |d dkr"dS |d dkr*dS || | d }
|| | d }|| d }|| d }|	sL|
| nt|
|}|| | }td}||krbdS dS )	aU  Check if the kernel can be implemented with the given parameters.

        :param dtype: data type
        :type dtype: cutlass.Numeric
        :param head_dim: head dimension
        :type head_dim: int
        :param m_block_size: m block size
        :type m_block_size: int
        :param n_block_size: n block size
        :type n_block_size: int
        :param num_threads: number of threads
        :type num_threads: int
        :param is_causal: is causal
        :type is_causal: bool

        :return: True if the kernel can be implemented, False otherwise
        :rtype: bool
        Fr   r      r,   r   sm_80T)cutlassFloat16BFloat16maxutils_basicget_smem_capacity_in_bytes)r   r   r   r   r   r    r!   r"   r$   r+   smem_usage_Qsmem_usage_dOsmem_usage_Ksmem_usage_Vsmem_usage_QV
smem_usagesmem_capacityr=   r=   r>   can_implementZ   s(   
z(FlashAttentionBackwardSm80.can_implementmQ_typemK_typemV_typemdO_type	mLSE_typemdPsum_typemdQaccum_typemdK_typemdV_typemCuSeqlensQ_typemCuSeqlensK_typemSeqUsedQ_typemSeqUsedK_typec                 C   s  t ||  ko|  ko|kn   rtdt | jdkr6t ||	  ko,|kn   r5tdnt ||	  koBt jkn   rKtdt |t jt jfvrZtdt |t jfvrgtdt |t jfvrttdt |t jfvrtdt |
d t jfvrtd	t |d t jfvrtd
t |d t jfvrtdt |d t jfvrtd|| jksJ d S )Nz(All tensors must have the same data typer   z6mdK and mdV tensors must have the same data type as mQz=mdKaccum and mdVaccum tensors must have the data type Float32z%Only Float16 or BFloat16 is supportedzLSE tensor must be Float32zdPsum tensor must be Float32zdQaccum tensor must be Float32zcuSeqlensQ tensor must be Int32zcuSeqlensK tensor must be Int32zSeqUsedQ tensor must be Int32zSeqUsedK tensor must be Int32)	rC   
const_expr	TypeErrorr   r	   rD   rE   r
   r   )r:   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r=   r=   r>   _check_type   s2   * "z&FlashAttentionBackwardSm80._check_typec                 C   s  t | j| j}t|| j| j| jfd| _|}t|| j	| jfd| _
t | j| j}t|| j	| jfd| _|}t|| j| j| jfd| _t | j| j	}t|| j| j	fd| _tj| j| jfdt| jdfd| _tj| j| j	| jfddt| jdfd}tj| j	| j| jfddt| jdfd}| js|n|| _d}|| jj }	tjtjtjjd| j|d	}
tjtj | j|d	}|jjd |	 }| j| dksJ d
tj | j| |fdd}| j|jd  dk| _!| j	|jd  dk| _"|jjd |	 }| j| dksJ dtj | j| |fdd}| j	|jd  dk| _#| j|jd  dk| _$td|	f}t%|
||| _&t%|
||| _'t%|||| _(t%|||| _)|t*j+j }t*,| j- rz|t*j+j }tjtjtjjdt*j+|d	}nd}tjtj t*j+t*j+jd	}t%|t| jt|| _.t%tjtj t*j+t*j+jd	t| jtd| _/t*,| j0dkr| j/| _(| j/| _)d S d S )N)r   r   r   )r   r   r   r   strider   r   )
cache_modenum_bits_per_copyz0num_threads must be divisible by tQK_shape_dim_1)r   r   )orderz1num_threads must be divisible by tVdO_shape_dim_1)1
sm80_utilsget_smem_layout_atomr   r0   r5   tile_to_shaper   r    	sQ_layoutr   	sK_layoutr2   	sV_layoutr!   
sdO_layoutsPdS_layoutmake_layoutround_upsLSE_layoutr%   sLSEMma_layoutwidthmake_copy_atomr   	CopyG2SOpLoadCacheModeGLOBALnvgpuCopyUniversalOpoutershaper"   make_ordered_layoutis_even_m_smem_qis_even_n_smem_kis_even_n_smem_vis_even_m_smem_domake_tiled_copy_tvgmem_tiled_copy_QKgmem_tiled_copy_VdOgmem_tiled_copy_dKgmem_tiled_copy_dVrC   r	   r^   varlen_qgmem_tiled_copy_LSEgmem_tiled_copy_dQaccumr   )r:   sQ_layout_atomsK_layout_atomsV_layout_atomsdO_layout_atomsPdS_layout_atomrr   sLSEMma_layout_transposeduniversal_copy_bitsasync_copy_elemsatom_async_copyatom_universal_copytQK_shape_dim_1
tQK_layouttVdO_shape_dim_1tVdO_layoutvQKVdO_layoutasync_copy_elems_accumatom_async_copy_accumr=   r=   r>   _setup_attributes   s   


z,FlashAttentionBackwardSm80._setup_attributesc                 C   sF  | j d }t| j r| j|| j dfn|| j | jdf}tjt| j	tj
d||d d |d d dfd}t| j rH| j|| j dfn|| j | jdf}tjt| j	tj
d||d d |d d dfd}t| j r{| j|| j dfn|| j | jdf}tjt| j	tj
d||d d |d d dfd}|||fS )Nr,   r   )rA   r   rA   r   rA   )permutation_mnk)r"   rC   r^   r%   r(   r5   make_tiled_mmar   MmaF16BF16Opr   r	   r&   r)   r'   r*   )r:   r<   AtomLayoutSdPtiled_mma_sdpAtomLayoutdKVtiled_mma_dkvAtomLayoutdQtiled_mma_dqr=   r=   r>   _get_tiled_mma(  s(   
222
z)FlashAttentionBackwardSm80._get_tiled_mmac              
      s   	fdd	j 	j	j	jfD \ tt	j t	j}tjjtjj		j
|f df dd 	j	jfD \	fdd	j	jfD \tjG  fddd}tjG  fdd	d	}t	j r||S |S )
Nc                    0   g | ]}t jjt jj jt |f d f qS )   r5   structAlignMemRanger   cosize.0layoutr:   r=   r>   
<listcomp>?      "zFFlashAttentionBackwardSm80._get_shared_storage_cls.<locals>.<listcomp>r   c                 S   s0   g | ]}t jjt jjtjt |f d f qS r   )r5   r   r   r   rC   r	   r   r   r=   r=   r>   r   E  r   c                    r   r   r   r   r   r=   r>   r   I  r   c                       N   e Zd ZU  ed< ed< ed< ed< ed< ed< ed< ed< d	S )
zSFlashAttentionBackwardSm80._get_shared_storage_cls.<locals>.SharedStorageSeparateQVsKsVsQsdOsLSEsdPsumsPsdSN__name__
__module____qualname____annotations__r=   )	sK_structsLSE_struct	sP_struct	sQ_struct	sV_struct
sdO_structsdPsum_struct
sdS_structr=   r>   SharedStorageSeparateQVN     
 r   c                       r   )
zQFlashAttentionBackwardSm80._get_shared_storage_cls.<locals>.SharedStorageSharedQVr   r   r   r   r   r   r   r   Nr   r=   )r   r   r   
sQV_structr   r   r   r   r=   r>   SharedStorageSharedQVZ  r   r   )rj   rk   rl   rm   rF   r5   r   r   r   r   r   rq   rn   rC   r^   r9   )r:   
cosize_sQVr   r   r=   )
r   r   r   r   r   r   r   r   r   r:   r>   _get_shared_storage_cls>  s    





""
z2FlashAttentionBackwardSm80._get_shared_storage_clsmQmKmVmdOmLSEmdPsummdQaccummdKmdVsoftmax_scalestreammCuSeqlensQmCuSeqlensK	mSeqUsedQ	mSeqUsedKsoftcapwindow_size_leftwindow_size_rightmdQ_semaphorec                    sB  |d u sJ d| j dd |||||||||	||||fD   dd   fdd|||||||||	f	D \	}}}}}}}}}	|d u| _|   |  }|  \}}}t|d ur^|jd n|jd	 }t|d urtt}|jd
 d }nt	}|jd
 }t
t|jd | j||dd
|jd	 |jd	 |jd
 | j| jft| jr| jnd||d}||}||}|
ttj }| jg |||||||||	|||||
|| j| j| j| j| j| j| j| j| j| j | j!| j"| j#||||||R  j$|| j%ddg|& |d d S )Nzsemaphore not supported yetc                 s   s"    | ]}|d ur|j nd V  qd S N)element_typer   tr=   r=   r>   	<genexpr>  s    z6FlashAttentionBackwardSm80.__call__.<locals>.<genexpr>c                    s,   g  fdd j d d D  j d R S )Nc                 3   s&    | ]}t j|d  jj dV  qdS )r   )divbyN)r5   assumer   rs   )r   sr   r=   r>   r     s   $ zHFlashAttentionBackwardSm80.__call__.<locals>.<lambda>.<locals>.<genexpr>ra   r   r=   r   r>   <lambda>  s   , z5FlashAttentionBackwardSm80.__call__.<locals>.<lambda>c              	      s6   g | ]}|d urt |jt j|j |dnd qS )Nra   )r5   make_tensoriteratorro   r{   r   
new_strider=   r>   r     s   6 z7FlashAttentionBackwardSm80.__call__.<locals>.<listcomp>r   r   r   )	num_blocknum_head	num_batch
num_splitsseqlen_kheaddim	headdim_vtotal_qtile_shape_mnqhead_per_kvhead_packgqar   r   )gridblocksmemr   )'r`   r   r   r   r   rC   r^   r{   r   r   r   r5   ceil_divr   r   r#   r   to_underlying_argumentsget_grid_shaper.   log2ekernelrj   rk   rl   rm   rn   rq   rr   r   r   r   r   r   r   launchr"   size_in_bytes)r:   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   SharedStorager   r   r   r   TileSchedulerr   tile_sched_argstile_sched_paramsgrid_dimsoftmax_scale_log2r=   r   r>   __call__g  s   
6
"



	
 !"#

z#FlashAttentionBackwardSm80.__call__r  rj   rk   rl   rm   rn   rq   rr   r   r   r   r   r   r   r   r   r   r  r	  r  c#                    s  t j \}#}$}$|"|!}%|% }&|&j\}'}( }$|&jrtj |jd |jd |
|||dt 	j
| j})d}*t| jrOt|'| j j
 j | j |*}*| j| jf}+| j| jf},| j| jf}-| j| jf}.tj r| d |(d f }/| |(d f }0| d |(d f }1| |(d f }2| |(d f }3nHj | j  }4t jdf|d |(d f }/t |4f||(d f }0t jdf|d |(d f }1t |4f||(d f }2t |4| j f||(d f }3t| j r|(| j n|(tj r fdd||fD \}5}6nfdd||fD \}5}6t |/|+d}7t |5|,|'df}8t |6|-|'df}9t |1|.d}:t |0| jfd};t |2| jfd}<t |3| j| j fd}=tj }>|>| }?|?j|}@|?j |}At| j! rr|?j"|}Bnt #t j$|@j%| j&d	|}B|?j'|}C|?j(|}D|?j)|}E|?j*|}F|?j+|}G|?j*|}H|?j+|}Id
d |@|C|A|D|EfD \}J}K}L}M}N|,|#}O|,|#}P|,|#}Q|,|#}R|O-|7}S|O.|@}T|O-|8}U|O.|A}V|P-|9}W|P.|B}X|P-|:}Y|P.|C}Z|Q-|;}[|Q.|F}\|Q-|<}]|Q.|G}^|R-|=}_|,|#}`|,|#}a|,|#}b|a/| j| jf}c|a/| j| jf}dt 0|ctj1}et 0|dtj1}f|e2d |f2d tj3|@d |`| j4d}gtj5|A|`| j4d}htj3|Cd |`| j4d}itj5|B|`| j4d}jtj3|M|a| j6d}ktj5|Kd |a| j6d}ltj3|N|a| j6d}mtj5|Jd |a| j6d}ntj3|E|b| j7d}otj5|L|b| j7d}pt| j4 rdnd}qt8|`9|H|q }rt8|`9|I|q }st :t;j<ddd| j&}tt :t;j<ddd| j&}utj=|t|| j4d,|#}vtj>|t|| j4d,|#}wtj=|u|| j6d,|#}xtj>|u|| j6d,|#}ytj=|t|| j7d,|#}ztj>|u|| j7d,|#}{t ?t j:t j@A | j&d| j&jB d|,|#}||v-|@}}|v-|C}~|w-|A}|w-|B}|x-|M}|x-|N}|y-|K}|y-|J}|z-|E}|{-|L}||.|D}||.|E}t C| j| jf}|O-|}|O,d-|}t| j| jkr|}|}nt C| j| jf}|P-|}|P,d-|}t C| jf}|Q-|}|jt D|d  }|jt D|d  }tjE||d}t| jFr|}ntjE||d}tG|`|a|b|g|h|i|j|k|l|m|n|o|p|e|fd}tGd5i d|vd|wd|xd|yd|zd|{d||d|}d |d!|~d"|d#|rd$|sd%|d&|d'|d(|d)|d*|d+|d,|}tG|R|_d-}tH| jI|||S|T||||[|\|j
d.}tH| jJ|||Y|Z||||]|^|j
d.}tH| jK||||||)|d/}| jL|P|W|X|'j|d0 t| jMrt jN  | jO|O|U|V|'j|d0 t jN  t| jMrt jPd t jQ  |wR|j}t S|w|| t jQ  |*}| jT| jUksJ tV| jTD ]O}t| jTdkp|| jTd k r|dks|| |)k r||| |d1 t jN  t|| jUk r)|dks|| |)k r$||| |d1 t jN  qtW| j| jj
j}tH|jX|'|`d| jd2}tYd}tYd}tY| jTd }tYd}tjZ|*|)dd3D ])}|||||||d4 | [|| jT}| [|| jU}| [|| jT}| [|| jU}q`t| jdkr|e\|e] |  t #|Aj%|}t #|Bj%|}| ^|e|f||	||||||#|'|( || d S d S )6Nr   )r   r   r   r   r   c                       g | ]
}| d d f qS r   r=   r   	batch_idxhead_idx_kvr=   r>   r   #      z5FlashAttentionBackwardSm80.kernel.<locals>.<listcomp>c              	      (   g | ]}t jd f|d df qS r   Nr5   domain_offsetoffset_kr   r  seqlenr=   r>   r   %     ( Nr   r   )r   c                 S   s   g | ]}t |qS r=   )r   transpose_viewr   r=   r=   r>   r   G  s            )NNr   )swapABNr   N)r   NNF   )	transposenum_matricesTr   rd   limit)thr_mma_sdpthr_mma_dkv
thr_mma_dqtSrQtSrKtdPrdOtdPrVtdVrPtdVrdOtdKrdStdKrQtdQrdStdQrKacc_dKacc_dVsmem_thr_copy_QdOsmem_thr_copy_KVsmem_thr_copy_PdStsmem_thr_copy_QdOtsmem_thr_copy_dSsmem_thr_copy_Ktr2s_thr_copy_PdStSsQtSsKtdPsdOtdPsV	tSsLSEMmatSsdPsumMmatPsPtdSsdStdVsPttdVsdOttdKsdSttdKsQttdQsdStdQsKt)gmem_thr_copy_dQaccumtdQgdQaccum)r  )
mma_paramssmem_copy_paramsgmem_copy_params
load_Q_LSEload_dO_dPsumm_block_maxr  )r  r   )smem_pipe_write_q)n_blockthr_mmamask_seqlenmask_causal)unroll)mask_fnr=   )_r5   r6   
thread_idxcreateinitial_work_tile_infotile_idxis_valid_tiler   r{   r   seqlen_qr   rC   r^   r$   rF   r   r   r0   r2   has_cu_seqlens_qoffset_qr  r#   r   has_cu_seqlens_k
local_tiler   SmemAllocatorallocater   
get_tensorr   r9   r   r   
recast_ptrr   r   r   r   r   r   r   	get_slicepartition_Spartition_Dpartition_shape_Cmake_fragmentr	   fillmma_make_fragment_Ar%   mma_make_fragment_Br&   r'   make_acc_tensor_mn_viewpartition_Crt   r   LdMatrix8x8x16bOpmake_tiled_copy_Amake_tiled_copy_Bmake_tiled_copy_Crx   ry   rs   make_identity_tensorrankpredicate_kr1   r   r   rM  rN  compute_one_m_blockload_Vr+   cp_async_commit_groupload_Kcp_async_wait_groupbarrierretilecopyr    r!   range_constexprr   
apply_maskr
   rangeadvance_pipelinestoreloadepilogue)r:   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  rj   rk   rl   rm   rn   rq   rr   r   r   r   r   r   r   r   r   r   r  r	  r  tidx_tile_scheduler	work_tilerQ  head_idxrO  m_block_min
blkQ_shape
blkK_shape
blkV_shapeblkdO_shapemQ_curmLSE_curmdO_cur
mdPsum_curmdQaccum_curpadded_offset_qmK_curmV_curgQgKgVgdOgLSEgdPsumgdQaccumr   storager   r   r   r   r   r   r   r   sLSEMma	sdPsumMmasQtsdOtsKtsPtsdStgmem_thr_copy_QKgmem_thr_copy_VdOgmem_thr_copy_lserH  tQgQtQsQtKgKtKsKtVgVtVsVtdOgdOtdOsdOtLSEgLSEtLSEsLSE
tLSEgdPsum
tLSEsdPsumrI  r$  r%  r&  acc_shape_dKacc_shape_dVr1  r2  r'  r(  r)  r*  r+  r,  r-  r.  r/  r0  LSEslicer>  r?  smem_copy_atomsmem_copy_atom_transposedr3  r4  r5  r6  r7  r8  r9  r:  r<  r;  r=  rB  rD  rC  rE  rF  rG  r@  rA  cQtQcQt0QcQtdOcdOt0dOcdOcdOcLSEtLSEcLSEd_headd_head_vtQpQtdOpdOrJ  rK  rL  rM  rN  rv  tdPrV_copy_viewm_blockstagemaskrV  smem_pipe_read_qsmem_pipe_read_dorP  smem_pipe_write_dom_tilesdKsdVr=   r  r  r  r>   r    sL  '
$	

"




































		










"





  z!FlashAttentionBackwardSm80.kernelr  r  r  rP  r  rJ  rK  rL  rM  rN  rO  rV  c                     sP  	fdd}fdd}j tj r$jjfnjjf}t|tj	}|
d tjtjdkrDdnd tj  tjj |jjjd d d tjdkrd|ndf jjjjd	 tjd	 }tjd tjdkr|ndf | t|d ur||d
 t|}d}tj|dgdt|ksJ tjtj|dgdddD ]}||d f t||d f   | ||   qt|tj	}|
d tjtj!dkrdnd tj  tjj |j"j#j$d d d tj!dkr|ndf j%jjtjdkr(|nd jd
 tj&d	 }tj&d tj!dkrF|ndf | t|}tj|dgdt|ksaJ tjtj|dgdddD ]}||d f ||d f   ||d f   ||    qnt|j'}||  (j' tj) rj*+|}t,j*|j- t|j'}||  (j' tj) rtj  tj) rj*+|}t,j*|j. tj)rt/|j0t1|j2}nj3}tjj4j5|j6j7j8d d d tj!dkr|ndf j9j:j)j;d
 tj   fdd}tjdkrG|| tj)rZt/|j0t1|j2}nj<}tjj4j=|j>j?j@d d d tjdkrx|ndf j9j:j)j;tjdkr|nd d tjdkrtj  || d S d S )Nc                     sB   t jdkrjd nd } | k r |  tj  d S )Nr   )rC   r^   r    r5   r6   rx  )m_block_next)rM  r  rO  r:   rP  r=   r>   load_Q_nextM  s   "
zCFlashAttentionBackwardSm80.compute_one_m_block.<locals>.load_Q_nextc                      s,   j  k r j   tj  d S r   )r!   r5   r6   rx  r=   )rN  r  rO  r:   r  r=   r>   load_dO_nextS  s   zDFlashAttentionBackwardSm80.compute_one_m_block.<locals>.load_dO_nextr  r   r   )swap_ABr  )r  )modeTunroll_full)hook_fnr  )	A_in_regsr  c                    s   j tj rjjfnjjf}t|tj	}|
d tjj |jjjjjjj| d
  j|} jd d f }t|t|ksTJ tjt|ddD ]}t|| t|| q^d S )Nr  )r  r  Tr  )r&  rh  rC   r^   r'   r   r0   r5   ri  r	   rj  rg   gemmr/  r0  rF  rG  r7  r8  rH  r|  rI  sizer  r   atomic_add_fp32elem_pointer)r  acc_shape_dQacc_dQacc_dQ_atomictdQgdQaccum_atomici)rL  r  rJ  r:   rK  r=   r>   dQ_mma  s$   $
z>FlashAttentionBackwardSm80.compute_one_m_block.<locals>.dQ_mma)r  r  r  )Ar$  rh  rC   r^   r%   r   r   r5   ri  r	   rj  r6   rz  r    r{  rg   r  r'  r(  r:  r;  r3  r4  make_fragment_liker>  autovec_copyr   rm  r  r  r  exp2fr  r!   r)  r*  r<  r=  r?  r   tor8   r9  r|  r}  r@  rA  r   r   convert_layout_acc_frgAr   r+  r%  r2  r,  rB  rC  r5  r6  r&   r-  r1  r.  rD  rE  ) r:   r  r  r  rP  r  rJ  rK  rL  rM  rN  rO  r  rV  r  r  acc_shape_SdPacc_StLSErLSEacc_S_mnbidxracc_dP
tLSErdPsum	acc_dP_mnrPtPrPrdStdSrdSr+  r  r-  r=   )
rL  rM  rN  r  rO  rJ  r:   rK  r  rP  r>   rv  <  s   $
 
$"
2
 
&$
 :
&
	&

z.FlashAttentionBackwardSm80.compute_one_m_blockr1  r2  r  r  	tiled_mmar  rQ  r   
batch_sizer  r  r  c           6   	      sR  t || j}|| | j t || j}|| | j ||
}||
}| t| j	 r<|| j
 n|t| j
dkrt j  t jt j | jd| jj d}t ||	|
}||}||}||}||}t ||| t ||| tj r fdd||fD \}}nfdd||fD \}}| j| jf}| j| jf}t |||df}t |||df} ||}!||}"||}#|| }$t |"| j}%t |$| j}&t j  t |!|% t |#|& t | j| jf}'||'}(|d|'})t| j| jkr!|(}*|)}+nt | j| jf},||,}*|d|,}+tj|(|d}-t| jrH|-}.ntj|*|d}.t t !|%j"d D ]<}/|)d|/df d j#|| j  |(d d  k rt j||%d |/d f |"d |/d f t| j$r|-d |/d f nd d	 qZt t !|&j"d D ]<}/|+d|/df d j#|| j  |*d d  k rt j||&d |/d f |$d |/d f t| j%r|.d |/d f nd d	 qd S t| j	 r|| j
 n|tj r fd
d||fD \}}n$j& | j  }0t '|0| j f|d f }t '|0| j f|d f }t || j| j f|f} t || j| j f|f}|| }1||}2||}3||}4t !|3t !|1ksfJ t !|4t !|2kssJ tj(t !|3ddD ]}5t)|3|5 t*|1|5 q}tj(t !|4ddD ]}5t)|4|5 t*|2|5 qd S )Nr   r   rd   c                    r  r   r=   r   r  r=   r>   r   
  r  z7FlashAttentionBackwardSm80.epilogue.<locals>.<listcomp>c              	      r  r  r  r   r  r=   r>   r     r  r   r"  predc                    s   g | ]	}| d f qS r   r=   r   r  r=   r>   r   G  s    Tr  )+r5   r  r   r  r  r  re  rC   r^   r#   r   r6   r{  rt   rx   ry   rs   rr  r|  rg  r}  r_  r   r0   r2   r`  rf  r  rs  r   ru  r1   r~  r  r{   r   r3   r4   r  r  r  r  r  )6r:   r1  r2  r   r   r  r  r   r   r  r  rQ  r   r  r  r  r  rdVrdKgmem_thr_copy_dKgmem_thr_copy_dVsmem_copy_atom_dKVsmem_thr_copy_dKV	taccdVrdV	taccdKrdK	taccdVsdV	taccdKsdKmdK_curmdV_curblkdK_shapeblkdV_shapegdKgdVtdKsdKtdKgdKtdVsdVtdVgdVtdKrdKtdVrdVcdKtdKcdKt0dKcdKtdVcdVt0dVcdVcdVtdKpdKtdVpdVrest_mpadded_offset_ktdVgdVaccumtdKgdKaccumacc_dV_atomicacc_dK_atomicr  r=   r  r>   r    s   













00



z#FlashAttentionBackwardSm80.epilogue
num_stagesc                 C   s   ||d k r
|d S dS )Nr   r   r=   )r:   pipeline_indexr  r=   r=   r>   r  Z  s   z+FlashAttentionBackwardSm80.advance_pipelinegmem_thr_copyr  r  r   r   c                 C   R  t | j| jf}||}|d|}	tj||d}
t	t 
|jd D ]~}| jsE|t 
|jd d k sE|d|df d | jk r|	d|df d ||| j  |d d  k }t |
d }t	t 
|jd D ]%}t	t 
|jd D ]}t| jr|
|||f ndo||||f< qzqmt j||d |d f |d |d f |d q(d S Nr   r"  r   r  Tr  )r5   rs  r   r0   rf  re  r   ru  rC   r~  r  r{   r~   r  r^   r3   r}  )r:   r   r  r  r   r  r   cKtKcKt0KcKtKpKnpredicate_n	predicatekr  r=   r=   r>   ry  ^  "   

6,,z!FlashAttentionBackwardSm80.load_Kr  r  c                 C   r!  r"  )r5   rs  r   r2   rf  re  r   ru  rC   r~  r  r{   r   r  r^   r3   r}  )r:   r   r  r  r   r  r   cVtVcVt0VcVtVpVr'  r(  r)  r*  r  r=   r=   r>   rw  {  r+  z!FlashAttentionBackwardSm80.load_Vgmem_tiled_copy_Qr  r  r  r  r  r  r  r  c                 C   s  t t|jd D ]}| js(|t|jd d k s(|d|df d | jk r|d|df d ||| j  |d d  k }t|d }t t|jd D ]%}t t|jd D ]}t | j	rl||||f ndoo||||f< q]qPtj
||d |d |f |d |d t | jdkr|ndf |d qt t|	jd D ](}|
d|f d | jk rt
||d ||f |	d |t | jdkr|ndf  qd S Nr   r   r  Tr  )rC   r~  r5   r  r{   r}   r   r  r^   r3   r}  r    )r:   r0  r   r  r  r  r  r  r  r  r  r   rP  r  mpredicate_mr)  r*  r  r=   r=   r>   rM    0   6,,"	 z%FlashAttentionBackwardSm80.load_Q_LSEgmem_tiled_copy_dOgmem_tiled_copy_dPsumr  r  r  r  r  tdPsumgdPsumtdPsumsdPsumtdPsumcdPsumc                 C   s  t t|jd D ]}| js(|t|jd d k s(|d|df d | jk r|d|df d ||| j  |d d  k }t|d }t t|jd D ]%}t t|jd D ]}t | j	rl||||f ndoo||||f< q]qPtj
||d |d |f |d |d t | jdkr|ndf |d qt t|jd D ](}|
d|f d | jk rt
||d ||f |	d |t | jdkr|ndf  qd S r1  )rC   r~  r5   r  r{   r   r   r  r^   r3   r}  r!   )r:   r5  r6  r  r  r  r  r  r7  r8  r9  r   rP  r  r2  r3  r)  r*  r  r=   r=   r>   rN    r4  z(FlashAttentionBackwardSm80.load_dO_dPsum)Nr   r   r   r   r   r   FFFFFr   r   r   F)F)NNNNNNNNr   )*r   r   r   r   rC   Numericr-   r   boolr?   staticmethodrP   r`   r   r   r   r5   jitTensorr	   cudaCUstreamfloatr
   r  r  ComposedLayoutLayout	TiledCopyTiledMma	Constexprr   r   r   rv  r   r  r  ry  rw  rM  rN  r=   r=   r=   r>   r      s   	

A.	

*u)	



i	
 !"#  l	
 	
 	
+	
r   )%r.   typesr   typingr   r   r   	functoolsr   cuda.bindings.driverbindingsdriverr?  rC   cutlass.cuter5   cutlass.cute.nvgpur   r   r	   r
   cutlass.utilsr   rG   flash_attn_origin.cuter   rg   flash_attn_origin.cute.maskr   "flash_attn_origin.cute.seqlen_infor   %flash_attn_origin.cute.tile_schedulerr   r   r   r   r   r=   r=   r=   r>   <module>   s    