o
    i                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
  mZ d dlZd dlmZ d dlmZmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d d	lmZ G d
d dZdS )    N)SimpleNamespace)TypeCallableOptional)partial)cpasyncwarp)ampere_helpers)utils)AttentionMask)
SeqlenInfoc                9   @   s  e Zd Z															dd	eej d
edee dedededededededededededededef"ddZ	e
	ddefddZdeej d eej d!eej d"eej d#eej d$eej d%eej d&eej d'eej fd(d)Zd*d+ Zd,d- Zd.d/ Zejd0ejd1ejd2ejd3ejd4ejd5ejd6ejd7ejd8ejd9ejd:ejfd;d<Zejd0ejd1ejd2ejd3ejd4ejd5ejd=ejd7ejd8ejd9ejd>ejd?ejd@ejdAejdBejdCejdDejdEejdFejdGejdHejdIejdJejdKejdLejdMejdNejdOejf8dPdQZej	ddRejdSejdTejdUejdVejdWedXedYedZed[ed\ejd>ejd]ee fd^d_Z ejd`ejdaejd7ejd8ejdbejdcejdHejdIejddejdeejdfejdgejdhejfdidjZ!ejdkejfdldmZ"ejdnejdoejdpejdqejdrejdsejfdtduZ#ejdnejdvejdwejdqejdrejdsejfdxdyZ$ejdzejdJejd{ejd|ejd}ejd~ejdejdejdejdejdqejdUejdrejfddZ%ejdejdejdejdejdejdejdejdejdejdejdqejdUejdrejfddZ&dS )FlashAttentionBackwardSm80N   @            F   dtypehead_dim
head_dim_vqhead_per_kvheadm_block_sizen_block_sizenum_stages_Qnum_stages_dOnum_threads	is_causal
SdP_swapAB
dKV_swapAB	dQ_swapABAtomLayoutMSdPAtomLayoutNdKVAtomLayoutMdQ	V_in_regsc                 C   s   || _ d}tt|| | | _|dur|n|}||k| _tt|| | | _|| jk| _|| jk| _|| _	|| _
|| _|	| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _| jtjj }|dkop||kop|op| | _|| _|| _dS )a  Initializes the configuration for a flash attention v2 kernel.

        All contiguous dimensions must be at least 16 bytes aligned which indicates the head dimension
        should be a multiple of 8.

        :param head_dim: head dimension
        :type head_dim: int
        :param m_block_size: m block size
        :type m_block_size: int
        :param n_block_size: n block size
        :type n_block_size: int
        :param num_threads: number of threads
        :type num_threads: int
        :param is_causal: is causal
            Nr   )r   intmathceilhead_dim_paddedsame_hdim_kvhead_dim_v_paddedcheck_hdim_oobcheck_hdim_v_oobr   r   r   r   r   r   r   r   r   r    r!   r"   r#   cutearch	WARP_SIZEMma_dKV_is_RSr$   share_QV_smem)selfr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   hdim_multiple_ofnum_mma_warps r6   O/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/flash_bwd.py__init__   s2   #

z#FlashAttentionBackwardSm80.__init__returnc
                 C   s   | t jt jfvr
dS |d dkrdS |d dkrdS |d dkr"dS |d dkr*dS || | d }
|| | d }|| d }|| d }|	sL|
| nt|
|}|| | }tjd }||krbdS dS )	aU  Check if the kernel can be implemented with the given parameters.

        :param dtype: data type
        :type dtype: cutlass.Numeric
        :param head_dim: head dimension
        :type head_dim: int
        :param m_block_size: m block size
        :type m_block_size: int
        :param n_block_size: n block size
        :type n_block_size: int
        :param num_threads: number of threads
        :type num_threads: int
        :param is_causal: is causal
        :type is_causal: bool

        :return: True if the kernel can be implemented, False otherwise
        :rtype: bool
        Fr   r      r%   r   sm80T)cutlassFloat16BFloat16maxsm80_utils_basicSMEM_CAPACITY)r   r   r   r   r   r   r   r   r   r$   smem_usage_Qsmem_usage_dOsmem_usage_Ksmem_usage_Vsmem_usage_QV
smem_usagesmem_capacityr6   r6   r7   can_implementV   s(   
z(FlashAttentionBackwardSm80.can_implementmQ_typemK_typemV_typemdO_type	mLSE_typemdPsum_typemdQaccum_typemdK_typemdV_typec
           
      C   s  t ||  ko|  ko|kn   rtdt | jdkr6t ||	  ko,|kn   r5tdnt ||	  koBt jkn   rKtdt |t jt jfvrZtdt |t jfvrgtdt |t jfvrttdt |t jfvrtd|| jksJ d S )	Nz(All tensors must have the same data typer   z6mdK and mdV tensors must have the same data type as mQz=mdKaccum and mdVaccum tensors must have the data type Float32z%Only Float16 or BFloat16 is supportedzLSE tensor must be Float32zdPsum tensor must be Float32zdQaccum tensor must be Float32)r<   
const_expr	TypeErrorr   Float32r=   r>   r   )
r3   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   r6   r6   r7   _check_type   s"   * "z&FlashAttentionBackwardSm80._check_typec                 C   sT  t | j| j}t|| j| j| jfd| _|}t|| j	| jfd| _
t | j| j}t|| j	| jfd| _|}t|| j| j| jfd| _t | j| j	}t|| j| j	fd| _tj| j| jfdt| jdfd| _tj| j| j	| jfddt| jdfd}tj| j	| j| jfddt| jdfd}| js|n|| _d}|| jj }	tjtjtjjd| j|d	}
tjtj | j|d	}|jjd |	 }| j| dksJ d
tj | j| |fdd}| j|jd  dk| _!| j	|jd  dk| _"|jjd |	 }| j| dksJ dtj | j| |fdd}| j	|jd  dk| _#| j|jd  dk| _$td|	f}t%|
||| _&t%|
||| _'t%|||| _(t%|||| _)|t*j+j }tjtjtjjdt*j+|d	}t%|t| jt|| _,t%tjtj t*j+t*j+jd	t| jtd| _-t*.| j/dkr| j-| _(| j-| _)d S d S )N)r   r   r   )r   r   r   r   )strider   r   )
cache_modenum_bits_per_copyz0num_threads must be divisible by tQK_shape_dim_1)r   r   )orderz1num_threads must be divisible by tVdO_shape_dim_1)0
sm80_utilsget_smem_layout_atomr   r)   r.   tile_to_shaper   r   	sQ_layoutr   	sK_layoutr+   	sV_layoutr   
sdO_layoutsPdS_layoutmake_layoutround_upsLSE_layoutr   sLSEMma_layoutwidthmake_copy_atomr   	CopyG2SOpLoadCacheModeGLOBALnvgpuCopyUniversalOpoutershaper   make_ordered_layoutis_even_m_smem_qis_even_n_smem_kis_even_n_smem_vis_even_m_smem_domake_tiled_copy_tvgmem_tiled_copy_QKgmem_tiled_copy_VdOgmem_tiled_copy_dKgmem_tiled_copy_dVr<   rU   gmem_tiled_copy_LSEgmem_tiled_copy_dQaccumrS   r   )r3   sQ_layout_atomsK_layout_atomsV_layout_atomsdO_layout_atomsPdS_layout_atomrg   sLSEMma_layout_transposeduniversal_copy_bitsasync_copy_elemsatom_async_copyatom_universal_copytQK_shape_dim_1
tQK_layouttVdO_shape_dim_1tVdO_layoutvQKVdO_layoutasync_copy_elems_accumatom_async_copy_accumr6   r6   r7   _setup_attributes   s   


z,FlashAttentionBackwardSm80._setup_attributesc                 C   sF  | j d }t| j r| j|| j dfn|| j | jdf}tjt| j	tj
d||d d |d d dfd}t| j rH| j|| j dfn|| j | jdf}tjt| j	tj
d||d d |d d dfd}t| j r{| j|| j dfn|| j | jdf}tjt| j	tj
d||d d |d d dfd}|||fS )Nr%   r   )r:   r   r:   r   r:   )permutation_mnk)r   r<   rS   r   r!   r.   make_tiled_mmar   MmaF16BF16Opr   rU   r   r"   r    r#   )r3   r5   AtomLayoutSdPtiled_mma_sdpAtomLayoutdKVtiled_mma_dkvAtomLayoutdQtiled_mma_dqr6   r6   r7   _get_tiled_mma  s(   
222
z)FlashAttentionBackwardSm80._get_tiled_mmac              
      s   	fdd	j 	j	j	jfD \ tt	j t	j}tjjtjj		j
|f df dd 	j	jfD \	fdd	j	jfD \tjG  fddd}tjG  fdd	d	}t	j r||S |S )
Nc                    0   g | ]}t jjt jj jt |f d f qS )   r.   structAlignMemRanger   cosize.0layoutr3   r6   r7   
<listcomp>$      "zFFlashAttentionBackwardSm80._get_shared_storage_cls.<locals>.<listcomp>r   c                 S   s0   g | ]}t jjt jjtjt |f d f qS r   )r.   r   r   r   r<   rU   r   r   r6   r6   r7   r   *  r   c                    r   r   r   r   r   r6   r7   r   .  r   c                       N   e Zd ZU  ed< ed< ed< ed< ed< ed< ed< ed< d	S )
zSFlashAttentionBackwardSm80._get_shared_storage_cls.<locals>.SharedStorageSeparateQVsKsVsQsdOsLSEsdPsumsPsdSN__name__
__module____qualname____annotations__r6   )	sK_structsLSE_struct	sP_struct	sQ_struct	sV_struct
sdO_structsdPsum_struct
sdS_structr6   r7   SharedStorageSeparateQV3     
 r   c                       r   )
zQFlashAttentionBackwardSm80._get_shared_storage_cls.<locals>.SharedStorageSharedQVr   r   r   r   r   r   r   r   Nr   r6   )r   r   r   
sQV_structr   r   r   r   r6   r7   SharedStorageSharedQV?  r   r   )r_   r`   ra   rb   r?   r.   r   r   r   r   r   rf   rc   r<   rS   r2   )r3   
cosize_sQVr   r   r6   )
r   r   r   r   r   r   r   r   r   r3   r7   _get_shared_storage_cls#  s    





""
z2FlashAttentionBackwardSm80._get_shared_storage_clsmQmKmVmdOmLSEmdPsummdQaccummdKmdVsoftmax_scalestreamc                 C   s   | j dd |||||||||	f	D   |   |  }|  \}}}t|jd | jt|jd t|jd f}|
t	
t	j }| |||||||||	|
|| j| j| j| j| j| j| j| j| j| j| j| j| j||||j|| jddg| |d d S )Nc                 s   s"    | ]}|d ur|j nd V  qd S N)element_typer   tr6   r6   r7   	<genexpr>\  s    z6FlashAttentionBackwardSm80.__call__.<locals>.<genexpr>r   r   r   )gridblocksmemr   )rV   r   r   r   r.   ceil_divrp   r   sizer'   log2ekernelr_   r`   ra   rb   rc   rf   rg   rw   rx   ry   rz   r{   r|   launchr   size_in_bytes)r3   r   r   r   r   r   r   r   r   r   r   r   SharedStorager   r   r   grid_dimsoftmax_scale_log2r6   r6   r7   __call__L  s^   


z#FlashAttentionBackwardSm80.__call__mdQaccur   r_   r`   ra   rb   rc   rf   rg   rw   rx   ry   rz   r{   r|   r   r   r   r   c                 C   sX
  t j \}}}t j \}} }!t |jd | j}"d}#t| j	r7t
|| j |jd  |jd  | j |#}#| j| jf}$| j| jf}%| j| jf}&| j| jf}'t ||!d | d f |$d}(| | j })t ||!d |)d f |%|df}*t ||!d |)d f |&|df}+t ||!d | d f |'d},t ||!| d f | jfd}-t ||!| d f | jfd}.t ||!| d f | j| j fd}/tj }0|0|}1|1j|}2|1j|}3t| j r|1j|}4nt t j|2j| jd|}4|1j|}5|1j|}6|1j|}7|1j|}8|1j |}9|1j|}:|1j |};dd |2|5|3|6|7fD \}<}=}>}?}@|!|}A|!|}B|!|}C|!|}D|A"|(}E|A#|2}F|A"|*}G|A#|3}H|B"|+}I|B#|4}J|B"|,}K|B#|5}L|C"|-}M|C#|8}N|C"|.}O|C#|9}P|D"|/}Q|!|}R|!|}S|!|}T|S$| j| jf}U|S$| j| jf}Vt %|Utj&}Wt %|Vtj&}X|W'd |X'd tj(|2d	 |R| j)d
}Ytj*|3|R| j)d
}Ztj(|5d	 |R| j)d
}[tj*|4|R| j)d
}\tj(|?|S| j+d
}]tj*|=d	 |S| j+d
}^tj(|@|S| j+d
}_tj*|<d	 |S| j+d
}`tj(|7|T| j,d
}atj*|>|T| j,d
}bt| j) rdnd}ct-|R.|:|c }dt-|R.|;|c }et /t0j1ddd| j}ft /t0j1ddd| j}gtj2|f|| j)d
!|}htj3|f|| j)d
!|}itj2|g|| j+d
!|}jtj3|g|| j+d
!|}ktj2|f|| j,d
!|}ltj3|g|| j,d
!|}mt 4t j/t j56 | jd| jj7 d|!|}n|h"|2}o|h"|5}p|i"|3}q|i"|4}r|j"|?}s|j"|@}t|k"|=}u|k"|<}v|l"|7}w|m"|>}x|n#|6}y|n#|7}zt 8| j| jf}{|A"|{}||A!d"|{}}t| j| jkr||}~|}}nt 8| j| jf}|B"|}~|B!d"|}t 8| jf}|C"|}tj9|||jd d}t| j:rC|}n
tj9|~|jd d}t;|R|S|T|Y|Z|[|\|]|^|_|`|a|b|W|Xd}t;d3i d|hd|id|jd|kd|ld|md|nd|od|qd|pd |rd!|dd"|ed#|yd$|zd%|sd&|ud'|td(|vd)|wd*|x}t;|D|Qd+}t<|!|jd |jd }t=| j>|||E|F|||}||M|N||j?d,}t=| j@|||K|L|~|||O|P||j?d,}t=| jA||||||"|d-}| jB|B|I|J||jC|jd d. t| jDrt jE  | jF|A|G|H||jC|jd d. t jE  t| jDr9t jGd t jH  |iI|\}t J|i|r| t jH  |#}| jK| jLksDJ tM| jKD ]O}t| jKdkpZ|| jKd k rv|dksi|| |"k rq||| |d/ t jE  t|| jLk r|dks|| |"k r||| |d/ t jE  qJtN| j| j|j?|jC}t=|jO||Rd| j	d0}tPd}tPd}tP| jKd }tPd}tjQ|#|"dd1D ])}|||||||d2 | R|| jK}| R|| jL}| R|| jK}| R|| jL}qt| jdkr|WS|WT |
  t |3j|}t |4j|}| U|W|X||	|||||||| |! d S )4Nr   r   Nr   r   )r   c                 S   s   g | ]}t |qS r6   )r
   transpose_viewr   r6   r6   r7   r     s    z5FlashAttentionBackwardSm80.kernel.<locals>.<listcomp>        )NNr   )swapABNr   N)r   NNF   )	transposenum_matricesTr   rY      limit)thr_mma_sdpthr_mma_dkv
thr_mma_dqtSrQtSrKtdPrdOtdPrVtdVrPtdVrdOtdKrdStdKrQtdQrdStdQrKacc_dKacc_dVsmem_thr_copy_QdOsmem_thr_copy_KVsmem_thr_copy_PdStsmem_thr_copy_QdOtsmem_thr_copy_dSsmem_thr_copy_Ktr2s_thr_copy_PdStSsQtSsKtdPsdOtdPsV	tSsLSEMmatSsdPsumMmatPsPtdSsdStdVsPttdVsdOttdKsdSttdKsQttdQsdStdQsKt)gmem_thr_copy_dQaccumtdQgdQaccum)seqlen)
mma_paramssmem_copy_paramsgmem_copy_params
load_Q_LSEload_dO_dPsumm_block_maxr   )r  headdim)smem_pipe_write_q)n_blockthr_mmamask_seqlenmask_causal)unroll)mask_fnr6   )Vr.   r/   
thread_idx	block_idxr   rp   r   r<   rS   r   r?   r   r)   r+   
local_tiler   r
   SmemAllocatorallocater   
get_tensorr   r2   r   make_tensor
recast_ptriteratorr   r   r   r   r   r   	get_slicepartition_Spartition_Dpartition_shape_Cmake_fragmentrU   fillmma_make_fragment_Ar   mma_make_fragment_Br   r    make_acc_tensor_mn_viewpartition_Cri   r   LdMatrix8x8x16bOpmake_tiled_copy_Amake_tiled_copy_Bmake_tiled_copy_Crm   rn   rh   make_identity_tensorpredicate_kr*   r   r   r   r  seqlen_qr  compute_one_m_blockload_Vseqlen_kr$   cp_async_commit_groupload_Kcp_async_wait_groupbarrierretilecopyr   r   range_constexprr   
apply_maskInt32rangeadvance_pipelinestoreloadepilogue)r3   r   r   r   r   r   r   r   r   r   r   r   r_   r`   ra   rb   rc   rf   rg   rw   rx   ry   rz   r{   r|   r   r   r   r   tidx_r  head_idx	batch_idxr  m_block_min
blkQ_shape
blkK_shape
blkV_shapeblkdO_shapegQhead_idx_kvgKgVgdOgLSEgdPsumgdQaccumr   storager   r   r   r   r   r   r   r   sLSEMma	sdPsumMmasQtsdOtsKtsPtsdStgmem_thr_copy_QKgmem_thr_copy_VdOgmem_thr_copy_lser  tQgQtQsQtKgKtKsKtVgVtVsVtdOgdOtdOsdOtLSEgLSEtLSEsLSE
tLSEgdPsum
tLSEsdPsumr  r   r   r   acc_shape_dKacc_shape_dVr   r   r   r   r   r   r   r   r   r   r   r   LSEslicer  r  smem_copy_atomsmem_copy_atom_transposedr   r   r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r	  r
  cQtQcQt0QcQtdOcdOt0dOcdOcdOcLSEtLSEcLSEtQpQtdOpdOr  r  r  r  r  r  r<  tdPrV_copy_viewm_blockstagemaskr!  smem_pipe_read_qsmem_pipe_read_dor  smem_pipe_write_dom_tilesdKsdVr6   r6   r7   r     s  !"	
"

"




































		










"





z!FlashAttentionBackwardSm80.kernelr  r  r  r  r  r  r  r  r  r  r  r!  c                     sP  	fdd}fdd}j tj r$jjfnjjf}t|tj	}|
d tjtjdkrDdnd tj  tjj |jjjd d d tjdkrd|ndf jjjjd	 tjd	 }tjd tjdkr|ndf | t|d ur||d
 t|}d}tj|dgdt|ksJ tjtj|dgdddD ]}||d f t||d f   | ||   qt|tj	}|
d tjtj!dkrdnd tj  tjj |j"j#j$d d d tj!dkr|ndf j%jjtjdkr(|nd jd
 tj&d	 }tj&d tj!dkrF|ndf | t|}tj|dgdt|ksaJ tjtj|dgdddD ]}||d f ||d f   ||d f   ||    qnt|j'}||  (j' tj) rj*+|}t,j*|j- t|j'}||  (j' tj) rtj  tj) rj*+|}t,j*|j. tj)rt/|j0t1|j2}nj3}tjj4j5|j6j7j8d d d tj!dkr|ndf j9j:j)j;d
 tj   fdd}tjdkrG|| tj)rZt/|j0t1|j2}nj<}tjj4j=|j>j?j@d d d tjdkrx|ndf j9j:j)j;tjdkr|nd d tjdkrtj  || d S d S )Nc                     sB   t jdkrjd nd } | k r |  tj  d S )Nr   )r<   rS   r   r.   r/   r?  )m_block_next)r  r  r  r3   r  r6   r7   load_Q_next  s   "
zCFlashAttentionBackwardSm80.compute_one_m_block.<locals>.load_Q_nextc                      s,   j  k r j   tj  d S r   )r   r.   r/   r?  r6   )r  r  r  r3   r  r6   r7   load_dO_next  s   zDFlashAttentionBackwardSm80.compute_one_m_block.<locals>.load_dO_nextr   r   r   )swap_ABr   )r  )modeTunroll_full)hook_fnr  )	A_in_regsr  c                    s   j tj rjjfnjjf}t|tj	}|
d tjj |jjjjjjj| d
  j|} jd d f }t|t|ksTJ tjt|ddD ]}t|| t|| q^d S )Nr   )r  r  Tr  )r   r.  r<   rS   r    r   r)   r.   r/  rU   r0  r\   gemmr   r   r  r  r   r  r  rC  r  r   rH  r
   atomic_add_fp32elem_pointer)r  acc_shape_dQacc_dQacc_dQ_atomictdQgdQaccum_atomici)r  r  r  r3   r  r6   r7   dQ_mmaC  s$   $
z>FlashAttentionBackwardSm80.compute_one_m_block.<locals>.dQ_mma)r  r  r  )Ar   r.  r<   rS   r   r   r   r.   r/  rU   r0  r/   rA  r   rB  r\   r  r   r   r  r  r   r   make_fragment_liker  autovec_copyr
   r3  r   rH  rJ  exp2frK  r   r   r   r  r  r  r   tor1   r  rC  rD  r	  r
  r(  r*  convert_layout_acc_frgAr   r   r   r   r   r  r  r   r   r   r   r   r   r  r  ) r3   r  r  r  r  r  r  r  r  r  r  r  r   r!  r  r  acc_shape_SdPacc_StLSErLSEacc_S_mnbidxracc_dP
tLSErdPsum	acc_dP_mnrPtPrPrdStdSrdSr   r  r   r6   )
r  r  r  r  r  r  r3   r  r  r  r7   r<    s   $
 
$"
2
 
&$
 :
&
	&

z.FlashAttentionBackwardSm80.compute_one_m_blockr   r   r  r  	tiled_mmarM  r  num_head
batch_sizec           1   	   C   s  t || j}|| | j t || j}|| | j ||
}||
}t| j	dkrt j
  t jt j | jd| jj d}t ||	|
}||}||}||}||}t ||| t ||| | j| jf}| j| jf}t ||d |d f ||df}t ||d |d f ||df}||}||}||}||}t || j} t || j}!t j
  t ||  t ||! t | j| jf}"||"}#|d|"}$t| j| jkr|#}%|$}&nt | j| jf}'||'}%|d|'}&tj|#|jd d}(t| jr"|(})n
tj|%|jd d})tt  | jd D ]>}*|$d|*df d |jd || j  |#d d  k rtt j|| d |*d f |d |*d f t| j!rp|(d |*d f nd d q7tt  |!jd D ]>}*|&d|*df d |jd || j  |%d d  k rt j||!d |*d f |d |*d f t| j"r|)d |*d f nd d qd S || j	 }+t |||+d f | j| j f|f}t |||+d f | j| j f|f}||},||}-||}.||}/t  |.t  |,ksJ t  |/t  |-ksJ tj#t  |.dd	D ]}0t$|.|0 t%|,|0 q#tj#t  |/dd	D ]}0t$|/|0 t%|-|0 q=d S )
Nr   r   rY   r   r   r   predTr  )&r.   r  r   rJ  rK  r  r+  r<   rS   r   r/   rB  ri   rm   rn   rh   r8  rC  r-  rD  r   r)   r+   r$  r,  r  r9  r
   r:  rp   r*   rE  r   r,   r-   rH  r  r  )1r3   r   r   r   r   r  r  ry   rz   r  rM  r  r  r  rdVrdKgmem_thr_copy_dKgmem_thr_copy_dVsmem_copy_atom_dKVsmem_thr_copy_dKV	taccdVrdV	taccdKrdK	taccdVsdV	taccdKsdKblkdK_shapeblkdV_shapegdKgdVtdKsdKtdKgdKtdVsdVtdVgdVtdKrdKtdVrdVcdKtdKcdKt0dKcdKtdVcdVt0dVcdVcdVtdKpdKtdVpdVrest_mnum_head_kvtdVgdVaccumtdKgdKaccumacc_dV_atomicacc_dK_atomicr  r6   r6   r7   rL  q  s   













44
$$



z#FlashAttentionBackwardSm80.epilogue
num_stagesc                 C   s   ||d k r
|d S dS )Nr   r   r6   )r3   pipeline_indexr  r6   r6   r7   rI    s   z+FlashAttentionBackwardSm80.advance_pipelinegmem_thr_copyrk  rl  r   r  r  c                 C   R  t | j| jf}||}|d|}	tj||d}
t	t 
|jd D ]~}| jsE|t 
|jd d k sE|d|df d | jk r|	d|df d ||| j  |d d  k }t |
d }t	t 
|jd D ]%}t	t 
|jd D ]}t| jr|
|||f ndo||||f< qzqmt j||d |d f |d |d f |d q(d S Nr   r   r   r   Tr  )r.   r9  r   r)   r,  r+  r
   r:  r<   rE  r   rp   rs   r  rS   r,   rD  )r3   r  rk  rl  r   r  r  cKtKcKt0KcKtKpKnpredicate_n	predicatekr  r6   r6   r7   r@    "   

6,,z!FlashAttentionBackwardSm80.load_Krm  rn  c                 C   r  r  )r.   r9  r   r+   r,  r+  r
   r:  r<   rE  r   rp   rt   r  rS   r,   rD  )r3   r  rm  rn  r   r  r  cVtVcVt0VcVtVpVr  r  r  r  r  r6   r6   r7   r=    r  z!FlashAttentionBackwardSm80.load_Vgmem_tiled_copy_Qri  rj  r{  r|  r  rq  rr  r  c                 C   s  t t|jd D ]}| js(|t|jd d k s(|d|df d | jk r|d|df d ||| j  |d d  k }t|d }t t|jd D ]%}t t|jd D ]}t | j	rl||||f ndoo||||f< q]qPtj
||d |d |f |d |d t | jdkr|ndf |d qt t|	jd D ](}|
d|f d | jk rt
||d ||f |	d |t | jdkr|ndf  qd S Nr   r   r   Tr  )r<   rE  r.   r   rp   rr   r   r  rS   r,   rD  r   )r3   r  r{   ri  rj  r{  r|  r  rq  rr  r  r   r  r  mpredicate_mr  r  r  r6   r6   r7   r    0   6,,"	 z%FlashAttentionBackwardSm80.load_Q_LSEgmem_tiled_copy_dOgmem_tiled_copy_dPsumro  rp  r}  r~  r  tdPsumgdPsumtdPsumsdPsumtdPsumcdPsumc                 C   s  t t|jd D ]}| js(|t|jd d k s(|d|df d | jk r|d|df d ||| j  |d d  k }t|d }t t|jd D ]%}t t|jd D ]}t | j	rl||||f ndoo||||f< q]qPtj
||d |d |f |d |d t | jdkr|ndf |d qt t|jd D ](}|
d|f d | jk rt
||d ||f |	d |t | jdkr|ndf  qd S r  )r<   rE  r.   r   rp   ru   r   r  rS   r,   rD  r   )r3   r  r  ro  rp  r}  r~  r  r  r  r  r   r  r  r  r  r  r  r  r6   r6   r7   r  G  r  z(FlashAttentionBackwardSm80.load_dO_dPsum)Nr   r   r   r   r   r   FFFFr   r   r   F)Fr   )'r   r   r   r   r<   Numericr&   r   boolr8   staticmethodrI   rV   r   r   r   r.   jitTensorrU   cudaCUstreamr   r   ComposedLayoutLayout	TiledCopyTiledMma	ConstexprrG  r   r   r<  rL  rI  r@  r=  r  r  r6   r6   r6   r7   r      sd   	

?.	

j)	
?	
  H	
 	
l	
+	
r   )r'   typesr   typingr   r   r   	functoolsr   cuda.bindings.driverbindingsdriverr  r<   cutlass.cuter.   cutlass.cute.nvgpur   r   cutlass.utils.ampere_helpersr
   r	   r@   flash_attn.cuter\   flash_attn.cute.maskr   flash_attn.cute.seqlen_infor   r   r6   r6   r6   r7   <module>   s   