o
    پi                    @   s  d dl Z d dlmZ d dlmZmZmZmZ d dlm	Z	 d dl
m  mZ d dlZd dlmZ d dlmZmZmZmZmZ d dlmZmZmZ d dlmZmZ d dlmZ d dlm Z  d dl!m  m"Z# d d	l$m%Z& d d
l'm(Z) d dl'm"Z* d dl'mZ d d	l'm%Z% d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7m8Z8 d dl'm9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZAmBZBmCZC d dlmDZD G dd dZEG dd deEZFG dd deEZGdS )    N)SimpleNamespace)TypeCallableOptionalList)partial)	ConstexprFloat32Int32
const_exprBoolean)cpasyncwarp	warpgroup)	ProxyKindSharedSpace)
LayoutEnum)
copy_utils)ampere_helpers)hopper_helpers)utils)AttentionMask)Softmaxapply_score_mod_inner)SeqlenInfoQK)	BlockInfo)BlockSparseTensors)produce_block_sparse_loadsconsume_block_sparse_loads)pipeline)PackGQA)NamedBarrierFwd)TileSchedulerArgumentsSingleTileSchedulerSingleTileLPTSchedulerSingleTileVarlenScheduler
ParamsBase)FastDivmodDivisorc                   @   s  e Zd ZU dZeed< 													d_deej d	ed
e	e dede
de
de
dedededede
de	ej de	ej de
fddZe	d`de
fddZdeej deej deej deej d eej dB d!eej dB d"eej dB d#eej dB d$eej dB fd%d&Zd'd( Zd)d* Zd+d, Zd-d. Zejd/ejd0ejd1ejd2ejd3e	ej d4ed5ejfd6d7Zejd8ejd9ejd2ejd3e	ej d:ejd;ed<ejd=e	ej d>ejd?ed@edAedBefdCdDZ ejdEdF Z!ejdGejdHejdIejdJed;edKefdLdMZ"ejdNejdOejdPejdQejdRejdSejdJedTed;edUejfdVdWZ#ejdNejdXejdYejdZejd[ejd\ejdJedTed;edUejfd]d^Z$dS )aFlashAttentionForwardBaseP   archN   FT   dtypehead_dim
head_dim_vqhead_per_kvhead	is_causalis_localpack_gqatile_mtile_n
num_stagesnum_threads	Q_in_regs	score_modmask_modhas_aux_tensorsc                 C   s   || _ d}tt|| | | _|dur|n|}||k| _tt|| | | _|| jk| _|| jk| _|| _	|| _
|| _|| _|| _|	| _|| _|
| _|| _|| _|| _t| _t|rcd| _dS d| _dS )a  Initializes the configuration for a flash attention kernel.

        All contiguous dimensions must be at least 16 bytes aligned, which means that the head dimension
        should be a multiple of 8.

        :param head_dim: head dimension
        :type head_dim: int
        :param tile_m: m block size
        :type tile_m: int
        :param tile_n: n block size
        :type tile_n: int
        :param num_threads: number of threads
        :type num_threads: int
        :param is_causal: is causal
        :param score_mod: A callable that takes the attention scores and applies a modification.
            Callable signature: ``score_mod(scores, batch_idx, head_idx, q_idx, kv_idx, aux_tensors) -> Any``
        :param mask_mod: A callable that takes the attention scores and returns a boolean representing whether that score should be masked.
            Callable signature: ``mask_mod(batch_idx, head_idx, q_idx, kv_idx, aux_tensors) -> Boolean``
           Nr+      )r-   intmathceil	tile_hdimsame_hdim_kv
tile_hdimvcheck_hdim_oobcheck_hdim_v_oobr0   r1   r2   r3   r4   r5   r7   r6   r8   r9   r:   r	   qk_acc_dtyper   vec_size)selfr-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   hdim_multiple_of rJ   T/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/flash_fwd.py__init__7   s.   %


z"FlashAttentionForwardBase.__init__returnc	                 C   s   | t jt jfvr
dS |d dkrdS |d dkrdS |d dkr"dS |d dkr*dS || d }	|| | d }
|| | d }|sF|	| nt|	|}||
 }td}||krZdS |d | dkrddS dS )	a=  Check if the kernel can be implemented with the given parameters.

        :param dtype: data type
        :type dtype: cutlass.Numeric
        :param head_dim: head dimension
        :type head_dim: int
        :param tile_m: m block size
        :type tile_m: int
        :param tile_n: n block size
        :type tile_n: int
        :param num_threads: number of threads
        :type num_threads: int
        :param is_causal: is causal
        :type is_causal: bool

        :return: True if the kernel can be implemented, False otherwise
        :rtype: bool
        F   r   r<       r=   sm_80T)cutlassFloat16BFloat16maxutils_basicget_smem_capacity_in_bytes)r-   r.   r/   r4   r5   r6   r7   r1   r8   smem_usage_Qsmem_usage_Ksmem_usage_Vsmem_usage_QV
smem_usagesmem_capacityrJ   rJ   rK   can_implementw   s,   
z'FlashAttentionForwardBase.can_implementmQ_typemK_typemV_typemO_type	mLSE_typemCuSeqlensQ_typemCuSeqlensK_typemSeqUsedQ_typemSeqUsedK_typec
           
      C   s   t ||  ko|  ko|kn   rtdt |tjtjfvr&tdt |d tfvr2tdt |d tfvr>tdt |d tfvrJtdt |d tfvrVtdt |	d tfvrbtd|| jksiJ d S )Nz(All tensors must have the same data typez%Only Float16 or BFloat16 is supportedzLSE tensor must be Float32z!cu_seqlens_q tensor must be Int32z!cu_seqlens_k tensor must be Int32zseqused_q tensor must be Int32zseqused_k tensor must be Int32)r   	TypeErrorrQ   rR   rS   r	   r
   r-   )
rH   r^   r_   r`   ra   rb   rc   rd   re   rf   rJ   rJ   rK   _check_type   s   (z%FlashAttentionForwardBase._check_typec                 C   s  |   \}}}}}t|| j| jfd| _t|| j| j| jfd| _t|| j| j	| jfd| _
t|| j| j	fd| _t|d urPt|| j| jfd| _nd | _d}|| jj }tjtjtjjd| j|d}tjtj | j|d}	|jjd | }
| j|
 dksJ d| j|
 dksJ dtj| j|
 |
fd	d
}tj| j|
 |
fd	d
}| j|jd  dksJ |jjd | }tj| j| |fd	d
}tj| j| |fd	d
}| j|jd  dksJ td|f}|}t|||| _t|||| _ t|||| _!t|	||| _"d S )N)r   r+   )r   r+   r=   r,   )
cache_mode)num_bits_per_copyr+   r   z0num_threads must be divisible by tQK_shape_dim_1)r+   r   )order)#_get_smem_layout_atomcutetile_to_shaper4   rA   	sQ_layoutr5   r6   	sK_layoutrC   	sV_layout	sO_layoutr   	sP_layoutr-   widthmake_copy_atomr   	CopyG2SOpLoadCacheModeGLOBALnvgpuCopyUniversalOpoutershapenum_Q_load_threadsnum_producer_threadsmake_ordered_layoutnum_epilogue_threadsmake_layoutmake_tiled_copy_tvgmem_tiled_copy_Qgmem_tiled_copy_Kgmem_tiled_copy_Vgmem_tiled_copy_O)rH   sQ_layout_atomsK_layout_atomsV_layout_atomsO_layout_atomsP_layout_atomuniversal_copy_bitsasync_copy_elemsatom_async_copyatom_universal_copytQK_shape_dim_1	tQ_layout	tK_layouttV_shape_dim_1	tV_layout	tO_layoutvQKV_layout	vO_layoutrJ   rJ   rK   _setup_attributes   s   


z+FlashAttentionForwardBase._setup_attributesc                 C      t  NNotImplementedErrorrH   rJ   rJ   rK   rl   .     z/FlashAttentionForwardBase._get_smem_layout_atomc                 C   r   r   r   r   rJ   rJ   rK   _get_tiled_mma1  r   z(FlashAttentionForwardBase._get_tiled_mmac                 C   r   r   r   r   rJ   rJ   rK   _get_shared_storage_cls4  r   z1FlashAttentionForwardBase._get_shared_storage_clsmQmKmVmOmLSEsoftmax_scalestreamc                 C   r   )Configures and launches the flash attention kernel.

        mQ/mK/mV/mO has same data types(supports fp16 and bf16) and same layout:
        (batch_size, seqlen_q, num_head, head_dim):(_, _, _, 1)
        r   )rH   r   r   r   r   r   r   r   rJ   rJ   rK   __call__7  s   z"FlashAttentionForwardBase.__call__acc_OlsesOseqlenr   
tma_atom_O	tiled_mmatidxm_blockhead_idx	batch_idxc           ,   	   C   s  t || j}|| | j t jjtt	j
| jd t| j| j}t ||	|
}||}||}t ||| t | j| jf}t| j| j| j| j}t|d urt|j ri|d ||f }nt| j rr|jnd|jf}t |f|d |f }t| j rt || jf|f}t |j t j!| jfdd}t "|j#|}|	|
}t$|%|}t j&|dgdt &|ksJ t$|%|}t$|d%|}|d d dkrt'(t &|j)d D ]!}||df d |j*|| j  |d d  k r|| ||df< qn|+|||	|
||j* t|j r)|d d ||f }nt| j r3|jnd|jf}t |df|d d |f }t| j,rt jj-t.j/t0j1d t jj2tt	j
| jt jj3 d t || j| jf|df} t4j5|dt !d|| dd	\}!}"}"t j6t j7 }#|#d
krt jjtt	j
| jt jj3 d |!  t j8  t jj9ddd d S d S t jjtt	j
| jd ||
}$|$:|}%t |%| j}&t ;|%|& t| j rQt || j| jf|df} |$| }'|$:|}(|d:|})tj<|(|j)d d}*t'(t &|&j)d D ];}+|)d|+df d |j*|| j  |(d d  k rMt j||&d |+d f |'d |+d f t| jrI|*d |+d f nd d qd S |=||&||
||j* d S )N
barrier_idnumber_of_threadsr   )r   stridemoder+   spaceTsingle_stage   )readlimitpred)>rm   make_fragment_liker-   storeloadtor*   barrierr>   r!   Epiloguer   r   get_smem_store_atommake_tiled_copy_C	get_sliceretilepartition_Dcopymake_identity_tensorr4   rC   r    rE   r0   r   has_cu_seqlens_qr3   offset_qdomain_offset
local_tileappendlayoutr   make_tensoriteratormake_acc_tensor_mn_viewpartition_CsizerQ   range_constexprr|   seqlen_q	store_LSE	use_tma_Ofence_proxyr   async_sharedr   
shared_ctabarrier_arrive	WARP_SIZEr   tma_get_copy_fnmake_warp_uniformwarp_idxcp_async_bulk_commit_groupcp_async_bulk_wait_grouppartition_Sautovec_copypredicate_kstore_O),rH   r   r   r   r   r   r   r   r   r   r   r   r   r   rOsmem_copy_atom_Osmem_thr_copy_OtaccOrOtaccOsOcOr3   mLSE_curoffsetgLSEgLSE_expanded_layoutgLSE_expandedthr_mma	taccOgLSEtaccOcOt0accOcOmmO_curgOr   _r   gmem_thr_copy_OtOsOtOrOtOgOtOcOt0OcOtOpOrest_mrJ   rJ   rK   epilogueI  s   




	



z"FlashAttentionForwardBase.epiloguec                 C   s   || j d k r|d S dS )Nr+   r   )r6   )rH   pipeline_indexrJ   rJ   rK   advance_pipeline  s   z*FlashAttentionForwardBase.advance_pipelinegmem_thr_copygQsQblockheaddimc              	   C   s   | |||}}t| j| jf}	||	}
|d|	}tj|
|d}t	
t|jd D ]7}|d|df d ||| j  |
d d  k rjtj||d |d f |d |d f t| jrf|d |d f nd d q3d S )Nr   r   r+   r   )r   r   rm   r   r4   rA   r   r   r   rQ   r   r   r|   r   r   rD   )rH   r  r  r  r  r   r  tQsQtQgQcQtQcQt0QcQtQpQr   rJ   rJ   rK   load_Q  s   

,z FlashAttentionForwardBase.load_Qgmem_tiled_copytKgKtKsKtKcKt0KcKtKpKsmem_pipe_writeneed_predicatesc                 C   sN  | j |jd j dk}t|
p| rt|r|	|| j   }nt|
 r'| j }nt|	|| j   | j }||d d 8 }tt|jd D ]7}|d|df d |k r}tj	||d |d |f |d |d t| j
dkrj|ndf t| jry|d |d f nd d qFd S tj	||d d d |f |d d d t| j
dkr|ndf t| jr|nd d d S )Nr   r+   r   )r5   tiler_mnr|   r   rQ   minr   rm   r   r   r6   rD   )rH   r  r  r  r  r  r  r  r  r   r   is_even_n_smem_kseqlen_limitnrJ   rJ   rK   load_K  s4   
 
z FlashAttentionForwardBase.load_KtVgVtVsVtVcVt0VcVtVpVc                 C   s  | j |jd j dk}t|
p| rtt|jd D ]}|s9|t|jd d k s9|d|df d | j k rt| jrE|d |d f nd }t|
r|	|| j   |d d  }|d|df d |k }t	|d }tt|jd D ]$}tt|jd D ]}t| jr||||f ndo||||f< qqutj
||d |d |f |d |d t| jdkr|ndf |d qd S tj
||d d d |f |d d d t| jdkr|ndf t| jr|nd d d S )Nr   r+   )Nr   NTr   )r5   r!  r|   r   rQ   r   rm   r   rE   r   r   r6   )rH   r  r'  r(  r)  r*  r+  r  r  r   r   is_even_n_smem_vr%  	predicater$  predicate_nkirJ   rJ   rK   load_V  sF    
z FlashAttentionForwardBase.load_V)Nr+   FFTr,   r,   r+   r,   FNNF)F)%__name__
__module____qualname__r*   r>   __annotations__r   rQ   Numericr   boolr   rL   staticmethodr]   rh   r   rl   r   r   rm   jitTensorr	   cudaCUstreamr   r   	TiledCopyCopyAtomTiledMmar
   r
  r  r  r&  r1  rJ   rJ   rJ   rK   r(   4   sd  
 	

@

9	

`	

	
-	
r(   c                ,   @   s  e Zd Zdd Zdd Zdd Zej					d7dejd	ejd
ejdejde	ej de
jde	e de	e de	e de	ej fddZej		d8dejd	ejd
ejdejde	ej dede	e de	e de	e dejdejdejdejdejdB dejdejdejdejdejdejd ejf*d!d"Zej				#	$d9d%ed&ed'ed(ed)ed*ed+ed,ed-edB d.ejd/ejd0ejd1ed2e	e d3ejd4ejf d5d6ZdS ):FlashAttentionForwardSm80c                 C   s:   t | j| j}|}t | j| j}|}d }|||||fS r   )
sm80_utilsget_smem_layout_atomr-   rA   rC   rH   r   r   r   r   r   rJ   rJ   rK   rl   J  s   z/FlashAttentionForwardSm80._get_smem_layout_atomc                 C   sx   t jt| jtd| jd ddf| jd d ddfd}t jt| jtd| jd ddf| jd d ddfd}||fS )N)r<   rN   r<   rO   r+   r<   )permutation_mnk)rm   make_tiled_mmar   MmaF16BF16Opr-   r	   r7   )rH   tiled_mma_qktiled_mma_pvrJ   rJ   rK   r   R  s   z(FlashAttentionForwardSm80._get_tiled_mmac                    s   fddj jjfD \ ttj tj}tjjtjjj	|f df tjG  fddd}tjG  fddd}t
j rQ|S |S )Nc                    s0   g | ]}t jjt jj jt |f d f qS )   )rm   structAlignMemRanger-   cosize.0r   r   rJ   rK   
<listcomp>`  s    "zEFlashAttentionForwardSm80._get_shared_storage_cls.<locals>.<listcomp>rI  c                       s&   e Zd ZU ed< ed<  ed< dS )zKFlashAttentionForwardSm80._get_shared_storage_cls.<locals>.SharedStorageQKVsVr  sKNr2  r3  r4  r5  rJ   )	sK_struct	sQ_struct	sV_structrJ   rK   SharedStorageQKVg  s   
 rW  c                       s   e Zd ZU ed<  ed< dS )zPFlashAttentionForwardSm80._get_shared_storage_cls.<locals>.SharedStorageSharedQVr  rR  NrS  rJ   )rT  
sQV_structrJ   rK   SharedStorageSharedQVm  s   
 rY  )ro   rp   rq   rT   rm   rM  rJ  rK  rL  r-   r   r8   )rH   
cosize_sQVrW  rY  rJ   )rT  rX  rU  rV  rH   rK   r   _  s   
z1FlashAttentionForwardSm80._get_shared_storage_clsNr   r   r   r   r   r   r   window_size_leftwindow_size_rightlearnable_sinkc                    s  |
du sJ d| j dd |||||fD   |  \}}|j| _| j| _| j| _| j| _| jdk| _	| 
  |  }dd   fdd	||||fD \}}}}d
d	 ||||fD \}}}}t|jtj|jg dd}t|jd | jt|jd t|jd f}ttj}t| jdu rt|| }d}nt|}t|}d}t|durt|jd t| jr| jnd }t|jd }t|}t|}||f}| |||||||||	| j| j| j | j!| j"| j#| j$| j%| j&|||||j'|| jddg|( |d dS )r   Nz.Learnable sink is not supported in this kernelc                 s   "    | ]}|d ur|j nd V  qd S r   element_typerO  trJ   rJ   rK   	<genexpr>  s     z5FlashAttentionForwardSm80.__call__.<locals>.<genexpr>Z   c                    ,   g  fdd j d d D  j d R S )Nc                 3   &    | ]}t j|d  jj dV  qdS r,   )divbyNrm   assumer`  rt   rO  srb  rJ   rK   rc       $ zGFlashAttentionForwardSm80.__call__.<locals>.<lambda>.<locals>.<genexpr>r   rm  rJ   rm  rK   <lambda>  
    z4FlashAttentionForwardSm80.__call__.<locals>.<lambda>c              	      *   g | ]}t |jt j|j |d qS r   rm   r   r   r   r|   ra  
new_striderJ   rK   rP        z6FlashAttentionForwardSm80.__call__.<locals>.<listcomp>c              	   S   s*   g | ]}t |jt j|jg d dqS )r+      r=   r   r   )rm   r   r   selectr   ra  rJ   rJ   rK   rP    rw  r=   r+   r   r   r   r=   ry  r+   )gridr  smemr   ))rh   r   r   num_mma_threadsr7   r~   r}   r   r*   r   r   r   rm   r   r   rz  r   ceil_divr|   r4   r?   log2er   r9   r	   r3   r0   r'   kernelro   rp   rq   rr   rs   r   r   r   r   launchsize_in_bytes)rH   r   r   r   r   r   r   r   r[  r\  r]  aux_tensorsrG  rH  SharedStoragegrid_dimLOG2_Esoftmax_scale_log2fastdiv_modsr   seqlen_kseqlen_q_divmodseqlen_k_divmodrJ   ru  rK   r   t  s   




z"FlashAttentionForwardSm80.__call__r  ro   rp   rq   rr   rs   r   r   r   r   rG  rH  r  c           Z         s  t j \}}}t j \}}}t j j j jd||	t	 j
r$ jndd}tj|jd |jd d}|||\}} | d }! j jf}" j jf}# j jf}$t |d d ||f |"|df}%| j }&t |d d |&|f |#d}'t |d d |&|f |$d}(tj })|)|}*|*j|
}+|*j|},t	 j r|*j|}-nt t j|+j jd|}-t |-}.|!|}/|!|}0|/"|,|/#|'}1}2|0"|-|0#|(}3}4|!|}5|!|}6|5$|5%|+|5&|5'|,d }7|6&|6'|.d }8|6( j jf}9t )|9t*}:|:+d	 t ,t-j.dd
d j};t ,t-j.dd
d j}<t/|;|!|t0|;|!|}=t0|<|!|}>#|+|=#|,}?|>#|.}@t 1 j jf}A|/#|A}B|/!d#|A}Ct	 j jkrv|B}D|C}Ent 1 j jf}F|0#|F}D|0!d#|F}Etj2|B|jd d}Gt	 j3r|G}Hn
tj2|D|jd d}Ht4j||:jd d |:jd  |d}I|I5  t6|5|6|7|8|:d}Jt6|=|>|?|@d}Kt7 j8||2|1|B|C|G|j9d}Lt7 j:||4|3|D|E|H|j9d}Mt7 j;|J|K|I|L|M j<|||||d}N|!|}O j=|O|%|+||j>|jd d t j?   fdd}Pt	 jr=|L|!ddd t j?  |P  t j@  tA jBD ]Q}Qt	 j pN|Qdkrm|Qdks]|!|Q dkrh|L|!|Q |Q|Qdkd t j?  t	|Q jBd k r|Qdks|!|Q dkr|M|!|Q |Q|Qdkd t j?  qCt	 j r|P  tC j j|j>|j9||	t	 j
r jnd}Rt7|RjD||5 j jt	 jEd ur|nd d}StFd}TtF jBd }U|N|!|T|Uddt7|Sddd  G|T}T G|U}Ut	 jp jr-|H|||}VtjI| d |V ddD ] }W| d |W }!|N|!|T|Udt7|Sddd  G|T}T G|U}UqtjI|!ddD ]}W|N|!|W d |T|Udd  G|T}T G|U}Uq4|IJ }X|IK|:|X t |+j|}Y L|:|IjM|||Y||d ||||| d S )NFr+   qhead_per_kvhead_packgqar   )seqlen_q_staticseqlen_k_staticNr   )r-   )NNr           r   )	transposenum_matricesTr   num_rowsr   )
thr_mma_qk
thr_mma_pvtSrQtSrKtOrVtr   )smem_thr_copy_Qsmem_thr_copy_Ksmem_thr_copy_VtSsQtSsKtOsVt)r   )
mma_paramssmem_copy_paramssoftmaxr&  r1  r9   r   r   r   r  r  )r   r  c                     sJ   t j jd d  t jr#t j  } t |  d S d S )Nr=   r+   )	rm   r*   cp_async_wait_groupr6   r   r8   r   r   r   )tSrQ_copy_viewrH   r  r  r  rJ   rK   preprocess_Q  s   


z6FlashAttentionForwardSm80.kernel.<locals>.preprocess_Q)r  r   )r   r   mask_causal
mask_localr  )mask_seqlen)is_first_n_block	check_infmask_fnunrollr=   )r  r  r  )Nrm   r*   
thread_idx	block_idxr   r4   r5   r1   r2   r   r3   r0   r   creater|   get_n_block_min_maxrA   rC   r   rQ   r   SmemAllocatorallocater  
get_tensorrR  r8   rQ  r   
recast_ptrr   r-   transpose_viewr   r   r   make_fragment_Apartition_Amake_fragment_Bpartition_Bpartition_shape_Cmake_fragmentr	   fillru   r   LdMatrix8x8x16bOpmake_tiled_copy_Amake_tiled_copy_Br   r   rB   r   resetr   r   r&  r  r1  compute_one_n_blockr9   r  r   cp_async_commit_groupr   r   r6   r   
apply_maskr:   r
   r  !get_n_block_min_causal_local_maskrangefinalize	rescale_Or
  row_sum)ZrH   r   r   r   r   r   r  r   r[  r\  ro   rp   rq   rr   rs   r   r   r   r   rG  rH  r  r  r  r   r  r   num_head
batch_size
block_infor   n_block_minn_block_maxn_block
blkQ_shape
blkK_shape
blkV_shaper  num_head_kvgKgVr}  storager  rR  rQ  sVtgmem_thr_copy_Kgmem_thr_copy_Vr  r  r(  r'  r  r  r  r  acc_shape_Or   smem_copy_atom_QKsmem_copy_atom_Vr  r  r  r  cKr  r  r)  r*  cVr  r+  r  r  r  r&  r1  r  gmem_thr_copy_Qr  stagemaskr  smem_pipe_readr  n_block_min_causal_local_maskn_tile	row_scaler   rJ   r  rK   r    s  






















	






z FlashAttentionForwardSm80.kernelFTr  r  r  r  r  r  r&  r1  r9   r   r   r   r   r  r  r  c                    s  fdd}|j jjf}t|t}|d |   fdd}|  tj	|j ||j
|j|j|jdddtjdkrD|ndf |j|jjd		 t|	durgj|j |
|||||j||d

 fdd}tjdkr|  |  t|dur||d |j| |d}||j| t|j}|| j t|jt !|j"}tjdkr|  |  t#|j$|j||j%|j&dddtjdkr|ndf |j' dS )zCompute one n_block of S/O.

        This function provides different variants for processing the first n block versus
        subsequent blocks.
        c                      s$   t j jd d  t j  d S )Nr=   )rm   r*   r  r6   r   rJ   r   rJ   rK   sync-  s   z;FlashAttentionForwardSm80.compute_one_n_block.<locals>.syncr  c                      sL   j dksj  d dkrj  d  oj dkd tj  d S )Nr+   r   r   r6   rm   r*   r  rJ   )r  r1  r  rH   r  rJ   rK   load_V_next8  s   zBFlashAttentionForwardSm80.compute_one_n_block.<locals>.load_V_nextNr+   r   )	A_in_regsr   r  r  c                      s0   j  dkr j  dd tj  d S )Nr   Fr  r  rJ   )r&  r  rH   r  rJ   rK   load_K_next`  s   zBFlashAttentionForwardSm80.compute_one_n_block.<locals>.load_K_next)r  is_firstr  )(r  r  r4   r5   rm   r  r	   r  rA  gemmr  r  r  r  r   r6   r  r  r8   apply_score_modr   r  online_softmaxr  r   r   r-   r   r   r   r   r   r   convert_layout_acc_frgAr   gemm_rsr  r  r  r  )rH   r  r  r  r  r  r  r&  r1  r9   r   r   r   r   r  r  r  r  r  r  acc_shape_Sacc_Sr  r  r  rPtOrPrJ   )r  r&  r1  r  rH   r  rK   r    st   
	
z-FlashAttentionForwardSm80.compute_one_n_block)NNNNNNN)NNNFT)r2  r3  r4  rl   r   r   rm   r9  r:  r   r;  r<  r	   r
   r   r  ComposedLayoutr=  r?  rQ   r   r   r   r   r   r  rJ   rJ   rJ   rK   r@  I  s    		
h	
  5	
r@  c                G       s  e Zd ZdZddddedef fddZdd	 Zd
d Zdd Ze	j
										djde	jde	jde	jde	jdee	j dedejdee	j dee	j dee	j dee	j dee	j deeB dB deeB dB dee	j dee dee f"d d!Ze	jeee	j  dfde	jde	jde	jde	jdee	j dee	j dee	j dee	j dee	j d"ee	j d#ee	j d$ee	j d%ee	j d&edee dee dee dee	j dee d'e	jd(e	jd)e	jd*e	jd+e	jdB d,e	jd-e	jd.e	jd/e	jd0e	jd1e	jd2e	jd3ed4eje d5eje fDd6d7Ze	j
de	jde	jde	jd8e	jd9e	jd:e	jd"e	jd#e	jd$e	jd;ejjd<ejjd=ej dee d>e!d?ed@ef dAdBZ"e	j
	dkd0e	jd1e	jd2e	jde	jde	jdee	j d8e	jd9e	jdCe	jdDee	j dEe	jdee	j d;ejjd<ejjd=ej d,e	jd/e	jd%ee	j dFed&edee d>e!d?edGed@edee dee f6dHdIZ#e	j
			JdldKedLedMe	jdNe$dOe%dPe&dQedRee dSefdTdUZ'e	j
dVedWefdXdYZ(e	j
			J	dmdZejj)ej*B dKedLedVed2e	jd;ejjd<ejjd[e	jdMe	jdNe$dOe%dPe&dRee dQee d\ejd]ejf d^d_Z+e	j
			dndZejj)ej*B dKedLedVed2e	jd;ejjd<ejjd[e	jdMe	jdNe$dOe%dPe&dRee dQee d]ejfd`daZ,e	j
dbdc Z-e	j
		dodee fdddeZ.dfdg Z/dhdi Z0  Z1S )pFlashAttentionForwardSm90rd  T)intra_wg_overlapmma_pv_is_rsr   r  c                   s(   t  j|i | || _|| _d| _d S )NrI  )superrL   r   r  buffer_align_bytes)rH   r   r  argskwargs	__class__rJ   rK   rL     s   
z"FlashAttentionForwardSm90.__init__c                 C   s|   t ttj| j| j| j}|}t ttj| j| j| j}|}| j	s5t ttj| j| j
| j}nd }|||||fS r   )r   make_smem_layout_atomsm90_utils_basicrB  r   	ROW_MAJORr-   rA   rC   r  r5   rC  rJ   rJ   rK   rl     s*   z/FlashAttentionForwardSm90._get_smem_layout_atomc              
   C   s   t j| j| jtjjtjjt| jd ddfd| jfd}t j| j| jtjjtjj	t| jd ddfd| j
f| jr:tjjntjjd}t j| j| jtjjtjj	t| jd ddfd| j
ftjjd}|||fS )N@   r+   )atom_layout_mnkr!  )r  r!  a_source)r	  make_trivial_tiled_mmar-   r   OperandMajorModeKr	   r4   r5   MNrC   r  OperandSourceRMEMSMEM)rH   rG  rH  tiled_mma_pv_rsrJ   rJ   rK   r     s@   	


z(FlashAttentionForwardSm90._get_tiled_mmac              	      s4  fddj jjfD \ttj tj}tjjtjjj	|f df t
jd ur;tjnd}tjjtjjj	|f df tjjtjdf tjjtjjd f  tjjtjjd f tjG  fddd}tjG  fdd	d	}t
j r|S |S )
Nc                    s2   g | ]}t jjt jj jt |f  jf qS rJ   )rm   rJ  rK  rL  r-   rM  r  rN  r   rJ   rK   rP    s    $zEFlashAttentionForwardSm90._get_shared_storage_cls.<locals>.<listcomp>rI  r   r=   c                       sF   e Zd ZU ed<  ed< ed< ed< ed< ed< ed< dS )	zKFlashAttentionForwardSm90._get_shared_storage_cls.<locals>.SharedStorageQKVmbar_ptr
mbar_ptr_K
mbar_ptr_VrQ  r  rR  sPNrS  rJ   )mbar_ptr_K_structmbar_ptr_QO_structmbar_ptr_V_structrT  	sP_structrU  rV  rJ   rK   rW    s   
 rW  c                       s>   e Zd ZU ed<  ed< ed< ed< ed< ed< dS )zPFlashAttentionForwardSm90._get_shared_storage_cls.<locals>.SharedStorageSharedQVr  r  r  r  rR  r  NrS  rJ   )r  r  r  rT  r  rX  rJ   rK   rY    s   
 rY  )ro   rp   rq   rT   rm   rM  rJ  rK  rL  r-   r   rs   rQ   Int64r6   r8   )rH   rZ  	cosize_sPrW  rY  rJ   )	r  r  r  rT  r  rX  rU  rV  rH   rK   r     s   
 	z1FlashAttentionForwardSm90._get_shared_storage_clsNr   r   r   r   r   r   r   mCuSeqlensQmCuSeqlensK	mSeqUsedQ	mSeqUsedK
mPageTabler[  r\  r]  blocksparse_tensorsr  c           3         sb  | j dd |||||||	|
|f	D   dd fdd||||fD \}}}}t|du r2g dng d	fd
d||fD \}}t|	du rMg dng d	  fdd||fD \}}t|du rhg dnddg}t|durxt||nd}|  \}}}|j| _d| _| j| j | _| j| jd  | _	d| _
| j| _| j| _| jdkrdn| jdkrdnd| _| jdkrdn| jdkrdnd| _t|du| _t| jr| jdko| jdkn| jdk| _| jdko| jo| j| j dk | _| jdko|du o|
du o| j | _|   dd || j| jfdf|| j| jf| jf|| j| jf| jf|| j| jfdffD \| _| _| _ | _!d| _"t| j# rVt$%|j&t'j(| j| jf| _"| ) }t| jr(| j|j*d f|j*d |j*d g|j*dd R }|j+d |j+d f|j+d |j+d | j g|j+dd R }t,-|j.t,j/||d}| j|j*d f|j*d |j*d g|j*dd R }|j+d |j+d f|j+d |j+d | j g|j+dd R }t,-|j.t,j/||d}t|dur(| j|j*d f|j*d g|j*dd R }|j+d |j+d f|j+d | j g|j+dd R }t,-|j.t,j/||d}t01 }t01 }t02 }dd d|| jfd|| jfd|| j ffD | _3d \} }!t| jrdt04||| j| j| jf\} }!t04||t,j| jddgd!| j| jfd\}"}#t04||t,j| j ddgd!| j| jfd\}$}%d \}&}'t| jrt04||| j!| j| jf\}&}'t|dup|
durt5}(nt| j6 p| j7rt8nt9}(t:t,;t,|j*d | jt,|j*d t|du rt,|j*d n	t,|j*d d dt,|j*d |j*d |j*d t|durt,|j*d nt,|j*d t,|j*d  | j| jf||
t| jr3| jnd| j&j<d" d#| j6p@| j7d$})|(=|)}*|(>|*}+t?@t?jA},t| jBdu rb||, }-d}n|,}-|}t|durqtC|}t|dur|tC|}d}.t|durt,|j*d t| jr| jnd }/t|du rt,|j*d n	|j*d |j*d  }0tD|/}1tD|0}2|1|2f}.| jEg t| jr|!n||#|%t| jr|'n||||	|
|| |"|$|&|-|||||| j| j| j | j!| j"| jF| jG| jH| jI||||*|(|||.R  jJ|+| j	ddg|dd% dS )&r   c                 s   r^  r   r_  ra  rJ   rJ   rK   rc    s
    
z5FlashAttentionForwardSm90.__call__.<locals>.<genexpr>c                    re  )Nc                 3   rf  rg  ri  rk  rm  rJ   rK   rc    rn  zGFlashAttentionForwardSm90.__call__.<locals>.<lambda>.<locals>.<genexpr>ro  r   rm  rJ   rm  rK   rp    rq  z4FlashAttentionForwardSm90.__call__.<locals>.<lambda>c              	      rr  rs  rt  ra  ru  rJ   rK   rP    rw  z6FlashAttentionForwardSm90.__call__.<locals>.<listcomp>Nrx  )r   r=   r+   c                       g | ]}t | qS rJ   r   rz  ra  )QO_layout_transposerJ   rK   rP        c                    r&  rJ   r'  ra  )KV_layout_transposerJ   rK   rP    r)  r{  r+   r   r,   rO      r=         8      rd  c                 S   s&   g | ]\}}}t |jtj||qS rJ   )
sm90_utilsmake_smem_layoutr`  r   r
  )rO  mXr|   r  rJ   rJ   rK   rP  E  s    ry  r   c              
   S   s0   i | ]\}}}|t |jt j|d dgdqS )r   r+   r   )rm   r  r`  rz  )rO  namer2  r   rJ   rJ   rK   
<dictcomp>  s    z6FlashAttentionForwardSm90.__call__.<locals>.<dictcomp>Qr  Vr  r   rN   F)total_qtile_shape_mnr   r"  r  element_sizeis_persistentlpt)r|  r  r   min_blocks_per_mp)Krh   r   r   rz  r   r   r~  num_threads_per_warp_groupnum_mma_warp_groupsr7   r~   r}   r   num_mma_regsnum_producer_regsrQ   use_block_sparsityr   rA   use_scheduler_barrierr*   r3   r4   r0   	use_tma_Qr   r   r5   r6   rC   ro   rp   rq   rr   rs   r  r0  r1  r-   r   r
  r   r|   r   rm   r   r   r   r   CopyBulkTensorTileG2SOpCopyBulkTensorTileS2GOptma_copy_bytesmake_tiled_tma_atomr%   r1   r2   r#   r$   r"   r  rt   to_underlying_argumentsget_grid_shaper?   r  r  r9   r
   r'   r  r   r   r   r   r  )3rH   r   r   r   r   r   r   r   r   r!  r"  r#  r$  r[  r\  r]  r%  r  LSE_layout_transposerG  rH  r  r  shape_Q_packedstride_Q_packedshape_O_packedstride_O_packedshape_LSE_packedstride_LSE_packedr   gmem_tiled_copy_KVr   
tma_atom_Qtma_tensor_Q
tma_atom_Ktma_tensor_K
tma_atom_Vtma_tensor_Vr   tma_tensor_OTileSchedulertile_sched_argstile_sched_paramsr  r  r  r  r   r  r  r  rJ   )r*  r(  rv  rK   r     s  



&	













	
 !"#$%

z"FlashAttentionForwardSm90.__call__rR  rT  rV  r   r  ro   rp   rq   rr   rs   r   r   r   r   rG  rH  r  r[  rY  r  c%           :      C   s  t jt j }%|%dkr!|
|||fD ]}&t|&d ur t|& qtj	 }'|'
|"}(|(j })|%dkrBt| j rBt j|)| j tjtjjj}*tjtjjj| jt jj }+tjj|(j | j|*|+| jd dd},tjj|(j | j|*|+| jd dd}-|(jj|j|jd}.|(j j|j|jd}/t| j! r|(j"j|j|jd}0n|(jj|j|j|j#d	}0t$|0}1d }2t|d ur|(j%j|j|jd}2|(jj|j|j| j&d	}3t'| j(| j)| j*| j+d||t| j,r| j-ndd
}4t.t/jt| j, r|j0d n|j0d d |j0d ||||	d}5t.t1| j(| j)||t| j,r| j-ndd}6t.|!j| }7|%dk rHt j2| j3 | 4||||.|/|0|
|||,|-|)||4|5|7 d S t j5| j6 t j7 \}8}9}9|8d }8| 8|||||||.|/|1|2|3||,|-|)||||8|||4|5|6|7||#|$ d S )Nr   r+   r  T)barrier_storager6   producer_groupconsumer_grouptx_count
defer_syncr6  F)swizzle)ra  r-   r  )r  r  r   r!  r"  r#  )r[  r\  r  r   r,   )9rm   r*   r   r   r   r   prefetch_descriptorrQ   r   r  r  r  data_ptrrC  mbarrier_initr}   r   CooperativeGroupAgentThreadr~  r   PipelineTmaAsyncr  r  r6   rF  r  r  r  r{   innerrR  r8   rQ  r`  r  r  r-   r   r4   r5   r1   r2   r3   r0   r   r   r|   r   warpgroup_reg_deallocr@  r   warpgroup_reg_allocr?  r  mma):rH   r   r   r   r   r   r   r!  r"  r#  rR  rT  rV  r   r  r   r[  r\  r]  r%  ro   rp   rq   rr   rs   r   r   r   r   rG  rH  r  r[  rY  r  r  r  r   tma_atomr}  r  
mbar_ptr_Qpipeline_kv_producer_grouppipeline_kv_consumer_group
pipeline_k
pipeline_vr  rR  rQ  r  r  r   r  SeqlenInfoClsAttentionMaskClsTileSchedulerClsr   r  rJ   rJ   rK   r    s   (





"	
z FlashAttentionForwardSm90.kernelr  rR  rQ  rq  rr  rn  r  rs  ru  c           +      C   s  t jt j d }|dkrtd}ttjjj	| j
}| }| }|jr|j\}}}}||}|j||ddd d |f }t| j rL|| j n|}|j||ddd d |f }|j||ddd d |f }t || j| jfd}t || j| jfd} t| jrt || j| jf|df}!tj|dt d|!|dd\}"}}t|dt d||\}#}}t|#|
}#t|	dt d| |\}$}}t|$|}$t| j r|||\}%}&|&d }'|
j|t| jr| j d	 ndd
 t| jr|"|
!|d |#|'|d t| j" rJ|| |$|'|d |#  tj$|&d |% ddD ]%}(|&d |( d }'|
| |#|'|d || |$|'|d |#  q#nhtj$|&d |% ddD ]+}(|&|( d })|)d }'|% }*|#  |
| |#|'|d ||* |$|)|*d qU|%}'|| |$|'|d |#  nt&||||||"|#|$|
|| j| j d	 | j"t| jr| jnd}|'  |(  |) }|js)d S d S d S )Nr   r   r+   ry  dimr  Tr   r5  )extra_tx_count)tma_bar_ptr)src_idxproducer_stater  )*rm   r*   r   r   r
   r   make_pipeline_staterQ   PipelineUserTypeProducerr6   initial_work_tile_infois_valid_tiletile_idxoffset_batch_Qr   r3   r0   offset_batch_Kr   r5   rA   rC   rC  r4   r   r   r   tma_producer_copy_fnrA  r  producer_acquirerF  producer_get_barrierr   advancer  cloner   prefetch_next_workadvance_to_next_workget_current_work)+rH   r   r   r   r  rR  rQ  rR  rT  rV  rq  rr  rn  r%  r  rs  ru  warp_idx_in_wgq_producer_phasekv_producer_statetile_scheduler	work_tiler   r   r   r  r   mQ_curhead_idx_kvmK_curmV_curr  r  r  r  r&  r1  r  r  r  r0  n_block_prevkv_producer_state_prevrJ   rJ   rK   r     s   












zFlashAttentionForwardSm90.loadr  r  r   r   rt  c           T      C   s  t j|| j }t j| j| jd}||}|||} |||}!|| |}"|	| 
|}#t| jrQ|| j| jf}$t tt |$| j}%n||!|
}%|	|!
|	}&t| j| j}'t |'||}(t|
d ur}|(|
nd })|   || j| jf}*t |*t}+t|(|)d},ttj|| j| jf|"|#}-ttj||+|%|&}.tt| jr| j n| j!|-||||+|%|,dd	}/t"d}0t#$t%j#j&j'| j(}1| }2|2) }3t*j+||+j,d d |+j,d  |d}4t| j-|-||%|,|4d}5t| j.||.d	}6|3j/r||3j0\}7}8}9}:||9};t%|d uo|;j1p|;j2}<t%|d uo)|;j3p)|;j4}=t%|d urK|\}>}?|<s<|>nt5|;j6|=sE|?nt5|;j7f}||;}@t|@j8|9|8|7|| j9| j:||d
	}Ad }Bt| j;d urvt| j<||9|8|7|||d}Bt|/|;|4|Bd}Ct| j= rt>| j| j?| j@| jA}D|;jB||9ddd d |8f }E|DC|E||||7|;j6 t jD| |E|;|7\}F}Gt| j= rt jjF||0d |0dN }0d}Ht| jG rt| jr|5|Gd |;|1t|A| jHd|Bdd}1n| I  |C|1|Gd |;t|.dddt|A| jHddd}1d}H|Gd8 }Gt| j9p| j:rH|J|;|7|F}It%jK|G|I ddD ]}J|C|1|Gd |J |;t|.|H dt|A| jHddd}1d}Hq$t%L|G|I}G|M|;|7|F}Kt%jK|G|K ddD ]}J|C|1|Gd |J |;t|.|H dt|A| jHddd}1d}HqXt| j:o|jNd urt%L|G|K}Gt%jK|G|F ddD ]}J|C|1|Gd |J |;t|.|H dt|A| jHddd}1d}Hqt| jr|6|1|H d}1d}Hn7| O  n2tP||9|8|7|;|1|.|C|5|6|A|B|H| jH|| j| jI| jOt| jQr| jAnd\}1}H}L|Ls|4R  |+Sd d }Mt|d urPt| jQ rt||8 }MnCt T|4jUt}Mt V| j| jf}NtW|X|N}Ot%jKt Y|MddD ] }P|7| j |O|P d  }Q|Q| jA |8| jA  }Rt||R |M|P< q/|4jZ|Md}S|4[|+|S | \|+|4j]||||;|||||7|8|9 |2^  |2_ }3|3j/sd S d S )Nr   )smem_thr_copy_PtPsPT)	mma_qk_fnr  rq  rr  r   r  r  r  r   r+   r  )r  rq  r  r  r  )rr  	mma_pv_fn)r   r   r   r   r  r  r  r  r  )r   r  score_mod_fnry  rv  )phaseF)r:   )r  r   kv_consumer_stater  r  is_first_block)	zero_init)r:   r  )r  r   r  r  r  r  )r  r   r  r  )r  r  r  )unroll_full)sink_val)`rm   r*   r   r=  r   r>  r   r  r  r  r  r   r  r  r4   r5   r  r   r  r-   r   r   r   mma_initrC   r	   r   r   r0  gemm_zero_init
gemm_w_idxr   mma_one_n_block_intrawg_overlapmma_one_n_blockr
   r   r|  rQ   r}  Consumerr6   r  r   r  r|   first_half_block_overlaplast_half_block_overlapr  r  r   has_seqused_qhas_cu_seqlens_khas_seqused_kr'   r   r  r  r1   r2   r9   r  rC  r    rA   rD   r0   r  r  cp_async_mbarrier_arrive_noincr  mbarrier_waitrA  r:   warp_scheduler_barrier_syncr  r  r"  !get_n_block_min_before_local_maskr[  warp_scheduler_barrier_arriver   r3   r  r  r   row_maxr   r   r   r   r  r  r
  r  r  r  )TrH   rG  rH  r  r   r   r   r  rR  r  r  r   r]  rq  rr  rn  r   r   r   r   r  r   r  rs  rt  ru  r%  r  r  warp_group_idxwarp_group_thread_layoutr  	wg_mma_qk	wg_mma_pvr  r  acc_S_shaper  r  smem_copy_atom_Pr  r  r  r   r  r  r  mma_one_n_block_allq_consumer_phaser  r  r  r  process_first_half_blockprocess_last_half_blockr   r   r   r  r   recompute_fastdiv_mods_qrecompute_fastdiv_mods_kr  r  r  r  r  r  r3   r  r  r  O_should_accumulater  r  n_block_min_before_local_maskprocessed_anyr  cStScS_mnrrow
q_head_idxr  rJ   rJ   rK   rl  L  s  "

	








 zFlashAttentionForwardSm90.mmaFr  r  r  r  r  r   r  r  r  c                 C   s   | ||| ||jdd}|| t|
dur"|
|||d |	||dd |j||d t|jt	
|j}t| jrB|nt|| j}|| | j t| j r||j|}t|j||j tjjtjjjtjjjd tj  |S )	zAProcesses the first half block when using intra-warpgroup-overlapr   B_idxwg_waitNr  r   T)r  r  )r  r   )consumer_waitconsumer_try_waitindexconsumer_releaser   r  rm   r   r   r   r  r   r  r   r-   r   r   r   r  r   r   r  r*   r   r   r   r   r   	sync_warp)rH   r  r  r  rq  r  r  r  r   r  r  r  r  tOrP_acctOrP_curtPrPrJ   rJ   rK   r    s&   

z2FlashAttentionForwardSm90.first_half_block_overlapr  r  c                 C   s8   | ||| ||j|dd || |  |S )z>Processes the final PV GEMM when using intra-warpgroup-overlapr   )r  r  r  )r  r  r  r  r  )rH   r  rr  r  r  rJ   rJ   rK   r    s
   

z1FlashAttentionForwardSm90.last_half_block_overlapr  r   r  r  c                 C   sZ  | ||| ||jdd}|   td || t|d ur+||||d t|d ur7|||d |j|||d}t	
|jt|j}t| jrQ|	nt	|| j}t|| t| j rs|
j|}t	|
j||
j ||| t| j rt	jjtjtjd t	j  | ||| |   ||jdd || |   |S )Nro  r  r   r  r  r  r  r   )!r  r  r  r  r   
wait_groupr  r   r  rm   r   r   r   r  r   r  r   r-   cvt_f16r  r   r   r  r  r*   r   r   r   r   r   r  r  r  )rH   r  r  r  r  r  rq  rr  r   r  r  r  r   r  r  r  r  r  r  r  r  r  rJ   rJ   rK   r    s6   



z)FlashAttentionForwardSm90.mma_one_n_blockc                 C   sj  |  }|  |||| |   ||jdd}|||| ||jdd |   td |	| t
|d urG||||d t
|d urS|||d |j||d}td |	| t|jt|j}t
| jrv|	nt|| j}t|| t
| j r|
j|}t|
j||
j ||| t
| j rtjjtjtj d tj!  |S )	Nro  r  r+   r  r  r  r   r   )"r  r  r  r  r  r  r  r   r  r  r   r  rm   r   r   r   r  r   r  r   r-   r  r  r   r   r  r  r*   r   r   r   r   r   r  )rH   r  r  r  r  r  rq  rr  r   r  r  r  r   r  r  r  smem_pipe_read_vr  r  r  r  r  rJ   rJ   rK   r  )	  s:   




z9FlashAttentionForwardSm90.mma_one_n_block_intrawg_overlapc                 C   sF   t jdd}t| jr|dkr!tjjttj	d| j
 d d S d S d S )NFr  r+   r=   r   )r   canonical_warp_group_idxr   rB  rm   r*   r   r>   r!   WarpSchedulerWG1r=  )rH   r  rJ   rJ   rK   r  f	  s   

z"FlashAttentionForwardSm90.mma_initc                 C   st   t | j| jf}t || j || j f|}||}t||| j|||| j| j	|	|
|d t
| jr4| jndd d S )Nr+   )seqlen_infoconstant_q_idxr0   )rm   r   r4   r5   r   r   r   r9   rG   rF   r   r3   r0   )rH   r  r   r   r   r  r  r   r   r  r  r  tScSrJ   rJ   rK   r  p	  s$   

z)FlashAttentionForwardSm90.apply_score_modc                 C   s>   t | jrtjjttjd tj	dd d| j
 d d S d S )Nr+   Fr  r=   r   )r   rB  rm   r*   r   r>   r!   r  r   r  r=  r   rJ   rJ   rK   r  	  s   


z5FlashAttentionForwardSm90.warp_scheduler_barrier_syncc                 C   sz   t | jr;| jdv sJ tjddd }t | jdkr d| }n	|d }|| j }tjjtt	j
| d| j d d S d S )N)r=   ry  Fr  r+   r=   r   )r   rB  r>  r   r  rm   r*   r   r>   r!   r  r=  )rH   cur_wgnext_wgrb  rJ   rJ   rK   r  	  s   



z7FlashAttentionForwardSm90.warp_scheduler_barrier_arrive)
NNNNNNNNNNr   )NNF)NNFT)NNTr  )2r2  r3  r4  r*   r7  rL   rl   r   r   rm   r9  r:  r   r	   r;  r<  r
   r>   r   listr   r  r>  r  r=  r?  r&   rQ   r   r   r   PipelineAsyncPointerr   r   rl  r   r   r   r  r  PipelineStatePipelineStateSimpler  r  r  r  r  r  __classcell__rJ   rJ   r  rK   r    s   "%
	


  '%	
 !"# >	
w 	
  `	
1	
;	
<
	
"	r  )Hr?   typesr   typingr   r   r   r   	functoolsr   cuda.bindings.driverbindingsdriverr;  rQ   cutlass.cuterm   r   r	   r
   r   r   cutlass.cute.nvgpur   r   r   cutlass.cute.archr   r   cutlass.utilsr   rU   r   cutlass.utils.hopper_helpersr   r	  quackr   quack_copy_utilsflash_attn_origin.cuter   rA  r0  flash_attn_origin.cute.maskr   flash_attn_origin.cute.softmaxr   r   "flash_attn_origin.cute.seqlen_infor   !flash_attn_origin.cute.block_infor   %flash_attn_origin.cute.block_sparsityr   )flash_attn_origin.cute.block_sparse_utilsr   r   r   flash_attn_origin.cute.pack_gqar    $flash_attn_origin.cute.named_barrierr!   %flash_attn_origin.cute.tile_schedulerr"   r#   r$   r%   r&   r'   r(   r@  r  rJ   rJ   rJ   rK   <module>   sP           =