o
    ih                    @   sh  d dl Z d dlmZ d dlmZmZmZmZ d dlm	Z	 d dl
m  mZ d dlZd dlmZ d dlmZ d dlmZmZmZ d dlm  mZ d dlm  mZ d dlmZ d dlmZ d d	lmZ d d
l m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dlm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1m2Z2 G dd dZ3G dd de3Z4G dd de3Z5dS )    N)SimpleNamespace)TypeCallableOptionalTuple)partial)
const_expr)cpasyncwarp	warpgroup)ampere_helpers)hopper_helpers)utils)AttentionMask)Softmax)
SeqlenInfo)	BlockInfo)pipeline)PackGQA)NamedBarrierFwd)TileSchedulerArgumentsSingleTileSchedulerSingleTileLPTSchedulerSingleTileVarlenScheduler
ParamsBasec                   @   s  e Zd ZU dZeed< 										d]deej d	ed
e	e dede
de
de
dedededede
fddZe	d^de
fddZdeej deej deej deej deej dB deej dB deej dB d eej dB d!eej dB fd"d#Zd$d% Zd&d' Zd(d) Zd*d+ Zejd,ejd-ejd.ejd/ejd0e	ej d1ejd2ejd3ejfd4d5Zejd6ejd7ejd/ejd0e	ej d8ejd9ed:ejd;e	ej d<ejd=ejd>ejd?ejd@ejfdAdBZejdCdD Z ejdEejdFejdGejdHejd9ejdIejfdJdKZ!ejdLejdMejdNejdOejdPejdQejdHejdRejd9ejdSej"fdTdUZ#ejdLejdVejdWejdXejdYejdZejdHejdRejd9ejdSej"fd[d\Z$dS )_FlashAttentionForwardBaseP   archN   FT   dtypehead_dim
head_dim_vqhead_per_kvhead	is_causalis_localpack_gqam_block_sizen_block_size
num_stagesnum_threads	Q_in_regsc                 C   s   || _ d}tt|| | | _|dur|n|}||k| _tt|| | | _|| jk| _|| jk| _|| _	|| _
|| _|| _|| _|	| _|| _|
| _|| _dS )a  Initializes the configuration for a flash attention kernel.

        All contiguous dimensions must be at least 16 bytes aligned, which means that the head dimension
        should be a multiple of 8.

        :param head_dim: head dimension
        :type head_dim: int
        :param m_block_size: m block size
        :type m_block_size: int
        :param n_block_size: n block size
        :type n_block_size: int
        :param num_threads: number of threads
        :type num_threads: int
        :param is_causal: is causal
           N)r    intmathceilhead_dim_paddedsame_hdim_kvhead_dim_v_paddedcheck_hdim_oobcheck_hdim_v_oobr#   r$   r%   r&   r'   r(   r*   r)   r+   )selfr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   hdim_multiple_of r7   O/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/flash_fwd.py__init__'   s"   

z"FlashAttentionForwardBase.__init__returnc	                 C   s   | t jt jfvr
dS |d dkrdS |d dkrdS |d dkr"dS |d dkr*dS || d }	|| | d }
|| | d }|sF|	| nt|	|}||
 }tjd }||krZdS |d | dkrddS dS )	aU  Check if the kernel can be implemented with the given parameters.

        :param dtype: data type
        :type dtype: cutlass.Numeric
        :param head_dim: head dimension
        :type head_dim: int
        :param m_block_size: m block size
        :type m_block_size: int
        :param n_block_size: n block size
        :type n_block_size: int
        :param num_threads: number of threads
        :type num_threads: int
        :param is_causal: is causal
        :type is_causal: bool

        :return: True if the kernel can be implemented, False otherwise
        :rtype: bool
        F   r   r,          sm80T)cutlassFloat16BFloat16maxsm80_utils_basicSMEM_CAPACITY)r    r!   r"   r'   r(   r)   r*   r$   r+   smem_usage_Qsmem_usage_Ksmem_usage_Vsmem_usage_QV
smem_usagesmem_capacityr7   r7   r8   can_implementY   s*   
z'FlashAttentionForwardBase.can_implementmQ_typemK_typemV_typemO_type	mLSE_typemCuSeqlensQ_typemCuSeqlensK_typemSeqUsedQ_typemSeqUsedK_typec
           
      C   s   t ||  ko|  ko|kn   rtdt |tjtjfvr&tdt |d tjfvr3tdt |d tjfvr@tdt |d tjfvrMtdt |d tjfvrZtdt |	d tjfvrgtd|| jksnJ d S )Nz(All tensors must have the same data typez%Only Float16 or BFloat16 is supportedzLSE tensor must be Float32z!cu_seqlens_q tensor must be Int32z!cu_seqlens_k tensor must be Int32zseqused_q tensor must be Int32zseqused_k tensor must be Int32)r   	TypeErrorr?   r@   rA   Float32Int32r    )
r5   rL   rM   rN   rO   rP   rQ   rR   rS   rT   r7   r7   r8   _check_type   s   (z%FlashAttentionForwardBase._check_typec                 C   s  |   \}}}}}t|| j| jfd| _t|| j| j| jfd| _t|| j| j	| jfd| _
t|| j| j	fd| _t|d urPt|| j| jfd| _nd | _d}|| jj }tjtjtjjd| j|d}tjtj | j|d}	|jjd | }
| j|
 dksJ d| j|
 dksJ dtj| j|
 |
fd	d
}tj| j|
 |
fd	d
}| j|jd  dksJ |jjd | }tj| j| |fd	d
}tj| j| |fd	d
}| j|jd  dksJ td|f}|}t|||| _t|||| _ t|||| _!t|	||| _"d S )N)r   r   )r   r   r=   r   )
cache_mode)num_bits_per_copyr   r   z0num_threads must be divisible by tQK_shape_dim_1)r   r   )order)#_get_smem_layout_atomcutetile_to_shaper'   r0   	sQ_layoutr(   r)   	sK_layoutr2   	sV_layout	sO_layoutr   	sP_layoutr    widthmake_copy_atomr	   	CopyG2SOpLoadCacheModeGLOBALnvgpuCopyUniversalOpoutershapenum_Q_load_threadsnum_producer_threadsmake_ordered_layoutnum_epilogue_threadsmake_layoutmake_tiled_copy_tvgmem_tiled_copy_Qgmem_tiled_copy_Kgmem_tiled_copy_Vgmem_tiled_copy_O)r5   sQ_layout_atomsK_layout_atomsV_layout_atomsO_layout_atomsP_layout_atomuniversal_copy_bitsasync_copy_elemsatom_async_copyatom_universal_copytQK_shape_dim_1	tQ_layout	tK_layouttV_shape_dim_1	tV_layout	tO_layoutvQKV_layout	vO_layoutr7   r7   r8   _setup_attributes   sh   z+FlashAttentionForwardBase._setup_attributesc                 C      t  NNotImplementedErrorr5   r7   r7   r8   r\         z/FlashAttentionForwardBase._get_smem_layout_atomc                 C   r   r   r   r   r7   r7   r8   _get_tiled_mma   r   z(FlashAttentionForwardBase._get_tiled_mmac                 C   r   r   r   r   r7   r7   r8   _get_shared_storage_cls   r   z1FlashAttentionForwardBase._get_shared_storage_clsmQmKmVmOmLSEsoftmax_scalesoftcapstreamc	           	      C   r   )Configures and launches the flash attention kernel.

        mQ/mK/mV/mO has same data types(supports fp16 and bf16) and same layout:
        (batch_size, seqlen_q, num_head, head_dim):(_, _, _, 1)
        r   )	r5   r   r   r   r   r   r   r   r   r7   r7   r8   __call__   s   z"FlashAttentionForwardBase.__call__acc_OlsesOseqlenrv   
tma_atom_O	tiled_mmatidxm_blockhead_idx	batch_idxc           *      C   s  t || j}|| | j t jjtt	j
| jd t| j| j}t ||	|
}||}||}t ||| t | j| jf}t| j| j| j| j}t|d urt|j ri|d ||f }nt| j rr|jnd|jf}t |f|d |f }t| j rt || jf|f}t |j t j!| jfdd}t "|j#|}|	|
}t$|%|}t j&|dgdt &|ksJ t$|%|}t$|d%|}|d d dkrt'(t &|j)d D ]!}||df d |j*|| j  |d d  k r|| ||df< qn|+|||	|
||j* t|j r)|d d ||f }nt| j r3|jnd|jf}t |df|d d |f }t| j,rt jj-t jj.j/t jj0j1d t jj2tt	j
| jt jj3 d t || j| jf|df} t45|dt !dt 6|ddt 6| dd\}!}"t j7t j8 }#|#d	krt jjtt	j
| jt jj3 d t ||!|" t j9  t jj:dd
d d S d S t jjtt	j
| jd ||
}$|$;|}!t |!| j}%t <|!|% t| j r`t || j| jf|df} |$| }"|$;|}&|d;|}'tj=|&|j)d d}(t'(t &|%j)d D ];})|'d|)df d |j*|| j  |&d d  k r\t j||%d |)d f |"d |)d f t| jrX|(d |)d f nd d q"d S |>||%||
||j* d S )N
barrier_idnumber_of_threadsr   )r   stridemoder   spacer=      T)readlimitpred)?r]   make_fragment_liker    storeloadtor   barrierr-   r   Epiloguerp   r   get_smem_store_atommake_tiled_copy_C	get_sliceretilepartition_Dcopymake_identity_tensorr'   r2   r   r4   r#   r   has_cu_seqlens_qr&   offset_qdomain_offset
local_tileappendlayoutrq   make_tensoriteratormake_acc_tensor_mn_viewpartition_Csizer?   range_constexprrl   seqlen_q	store_LSE	use_tma_Ofence_proxy	ProxyKindasync_sharedSharedSpace
shared_ctabarrier_arrive	WARP_SIZEr	   tma_partitiongroup_modesmake_warp_uniformwarp_idxcp_async_bulk_commit_groupcp_async_bulk_wait_grouppartition_Sautovec_copypredicate_kstore_O)*r5   r   r   r   r   r   r   rv   r   r   r   r   r   r   rOsmem_copy_atom_Osmem_thr_copy_OtaccOrOtaccOsOcOr&   mLSE_curoffsetgLSEgLSE_expanded_layoutgLSE_expandedthr_mma	taccOgLSEtaccOcOt0accOcOmmO_curgOtOsOtOgOr   gmem_thr_copy_OtOrOtOcOt0OcOtOpOrest_mr7   r7   r8   epilogue  s   


. 
 




0	z"FlashAttentionForwardBase.epiloguec                 C   s   || j d k r|d S dS )Nr   r   )r)   )r5   pipeline_indexr7   r7   r8   advance_pipeline{  s   z*FlashAttentionForwardBase.advance_pipelinegmem_thr_copygQsQblockheaddimc              	   C   s   | |||}}t| j| jf}	||	}
|d|	}tj|
|d}t	
t|jd D ]7}|d|df d ||| j  |
d d  k rjtj||d |d f |d |d f t| jrf|d |d f nd d q3d S )Nr   r   r   r   )r   r   r]   r   r'   r0   r   r   r   r?   r   r   rl   r   r   r3   )r5   r   r   r   r  r   r  tQsQtQgQcQtQcQt0QcQtQpQr   r7   r7   r8   load_Q  s   

,z FlashAttentionForwardBase.load_Qgmem_tiled_copytKgKtKsKtKcKt0KcKtKpKsmem_pipe_writeneed_predicatesc                 C   sN  | j |jd j dk}t|
p| rt|r|	|| j   }nt|
 r'| j }nt|	|| j   | j }||d d 8 }tt|jd D ]7}|d|df d |k r}tj	||d |d |f |d |d t| j
dkrj|ndf t| jry|d |d f nd d qFd S tj	||d d d |f |d d d t| j
dkr|ndf t| jr|nd d d S )Nr   r   r   )r(   tiler_mnrl   r   r?   minrange_constepxrr]   r   r   r)   r3   )r5   r
  r  r  r  r  r  r  r  r   r  is_even_n_smem_kseqlen_limitnr7   r7   r8   load_K  s0   
 
 
z FlashAttentionForwardBase.load_KtVgVtVsVtVcVt0VcVtVpVc                 C   s  | j |jd j dk}t|
p| rtt|jd D ]}|s9|t|jd d k s9|d|df d | j k rt| jrE|d |d f nd }t|
r|	|| j   |d d  }|d|df d |k }t	|d }tt|jd D ]$}tt|jd D ]}t| jr||||f ndo||||f< qqutj
||d |d |f |d |d t| jdkr|ndf |d qd S tj
||d d d |f |d d d t| jdkr|ndf t| jr|nd d d S )Nr   r   )Nr   NTr   )r(   r  rl   r   r?   r  r]   r   r4   r   r   r)   )r5   r
  r  r  r  r  r  r  r  r   r  is_even_n_smem_vr  	predicater  predicate_nkir7   r7   r8   load_V  s6   4*  
z FlashAttentionForwardBase.load_V)
Nr   FFTr   r   r   r   F)F)%__name__
__module____qualname__r   r-   __annotations__r   r?   Numericr   boolr9   staticmethodrK   rX   r   r\   r   r   r]   jitTensorrV   cudaCUstreamr   r   	TiledCopyCopyAtomTiledMmarW   r   r   r	  	Constexprr  r#  r7   r7   r7   r8   r   #   sV  
 	

20	

J		
m
	
+	
r   c                +   @   s  e Zd Zdd Zdd Zdd Zej					d5dejd	ejd
ejdejde	ej de
jde	ej de	ej de	ej de	ej de	ej fddZejdejd	ejd
ejdejde	ej dejde	ej dejdejdejdejdejdejdejdB dejdejdejdejd ejd!ejd"ejf*d#d$Zej		%	&d6d'ejd(ejd)ejd*ed+ed,ed-ed.ed/ed0e	e d1ejd2ejfd3d4ZdS )7FlashAttentionForwardSm80c                 C   s:   t | j| j}|}t | j| j}|}d }|||||fS r   )
sm80_utilsget_smem_layout_atomr    r0   r2   r5   rw   rx   ry   rz   r{   r7   r7   r8   r\     s   z/FlashAttentionForwardSm80._get_smem_layout_atomc                 C   s|   t jt| jtjd| jd ddf| jd d ddfd}t jt| jtjd| jd ddf| jd d ddfd}||fS )N)r,   r;   r,   r<   r   r,   )permutation_mnk)r]   make_tiled_mmar
   MmaF16BF16Opr    r?   rV   r*   )r5   tiled_mma_qktiled_mma_pvr7   r7   r8   r     s   z(FlashAttentionForwardSm80._get_tiled_mmac                    s   fddj jjfD \ ttj tj}tjjtjjj	|f df tjG  fddd}tjG  fddd}t
j rQ|S |S )Nc                    s0   g | ]}t jjt jj jt |f d f qS )   r]   structAlignMemRanger    cosize).0r   r   r7   r8   
<listcomp>	  s    "zEFlashAttentionForwardSm80._get_shared_storage_cls.<locals>.<listcomp>r<  c                       s&   e Zd ZU ed< ed<  ed< dS )zKFlashAttentionForwardSm80._get_shared_storage_cls.<locals>.SharedStorageQKVsVr   sKNr$  r%  r&  r'  r7   )	sK_struct	sQ_struct	sV_structr7   r8   SharedStorageQKV  s   
 rJ  c                       s   e Zd ZU ed<  ed< dS )zPFlashAttentionForwardSm80._get_shared_storage_cls.<locals>.SharedStorageSharedQVr   rE  NrF  r7   )rG  
sQV_structr7   r8   SharedStorageSharedQV  s   
 rL  )r_   r`   ra   rB   r]   rA  r>  r?  r@  r    r   r+   )r5   
cosize_sQVrJ  rL  r7   )rG  rK  rH  rI  r5   r8   r     s   
z1FlashAttentionForwardSm80._get_shared_storage_clsNr   r   r   r   r   r   r   r   window_size_leftwindow_size_rightlearnable_sinkc                    s  |du sJ d| j dd |||||fD   |  \}}|j| _| j| _| j| _| j| _| jdk| _	| 
  |  }dd   fdd	||||fD \}}}}d
d	 ||||fD \}}}}t|jtj|jg dd}t|jd | jt|jd t|jd f}ttj}t|dur|| }d}n|| }t|| }| ||||||||	|
| j| j| j| j| j| j | j!| j"| j#|||j$|| jddg|% |d dS )r   N.Learnable sink is not supported in this kernelc                 s   "    | ]}|d ur|j nd V  qd S r   element_typerB  tr7   r7   r8   	<genexpr>2  s     z5FlashAttentionForwardSm80.__call__.<locals>.<genexpr>Z   c                    ,   g  fdd j d d D  j d R S )Nc                 3   &    | ]}t j|d  jj dV  qdS r   )divbyNr]   assumerT  rd   rB  srV  r7   r8   rW  =     $ zGFlashAttentionForwardSm80.__call__.<locals>.<lambda>.<locals>.<genexpr>r   ra  r7   ra  r8   <lambda>=     , z4FlashAttentionForwardSm80.__call__.<locals>.<lambda>c              	      *   g | ]}t |jt j|j |d qS r   r]   r   r   rq   rl   rU  
new_strider7   r8   rC  >     * z6FlashAttentionForwardSm80.__call__.<locals>.<listcomp>c              	   S   s*   g | ]}t |jt j|jg d dqS )r      r=   r   r   r]   r   r   selectr   rU  r7   r7   r8   rC  ?  rk  r=   r   r   r   r   r=   rm  r   )gridr  smemr   )&rX   r   r   num_mma_threadsr*   rn   rm   rp   r   r   r   r   r]   r   r   ro  r   ceil_divrl   r'   r.   log2er   r?   rV   kernelr_   r`   ra   rb   rc   rs   rt   ru   rv   launchsize_in_bytes)r5   r   r   r   r   r   r   r   r   rN  rO  rP  r:  r;  SharedStoragegrid_dimLOG2_Esoftmax_scale_log2softcap_valr7   ri  r8   r     sj   "


z"FlashAttentionForwardSm80.__call__r}  r~  r_   r`   ra   rb   rc   rs   rt   ru   rv   r:  r;  rz  c           Y         s  t j \}}}t j \}}}t j j j j||	t	 j
r# jndd}t|jd |jd d}|||\}}|d } j jf}  j jf}! j jf}"t |d d ||f | |df}#| j }$t |d d |$|f |!d}%t |d d |$|f |"d}&tj }'|'|}(|(j|
})|(j|}*t	 j r|(j|}+nt t j|)j jd|}+t|+},| |}-| |}.|-!|*|-"|%}/}0|.!|+|."|&}1}2| |}3| |}4|3#|3$|)|3%|3&|*d }5|4%|4&|,d }6|4' j jf}7t (|7tj)}8|8*d t +t,j-d	d
d j}9t +t,j-dd
d j}:t.|9| |t/|9| |};t/|:| |}<"|)|;"|*}=|<"|,}>t 0 j jf}?|-"|?}@|- d"|?}At	 j jkru|@}B|A}Cnt 0 j jf}D|."|D}B|. d"|D}Ctj1|@|jd d}Et	 j2r|E}Fn
tj1|B|jd d}Ft3||8jd d |8jd  d}G|G4  t5|3|4|5|6|8d}Ht5|;|<|=|>d}It6 j7||0|/|@|A|E|j8d}Jt6 j9||2|1|B|C|F|j8d}Kfdd}Lt6 j:|H|I|G|J|K|Ld}M| |}N j;|N|#|)||j<|jd d t j=   fdd}Ot	 jr:|J|ddd t j=  |O  t j>  t? j@D ]Q}Pt	 j pK|Pdkrj|PdksZ||P dkre|J||P |P|Pdkd t j=  t	|P j@d k r|Pdks||P dkr|K||P |P|Pdkd t j=  q@t	 j r|O  tA j j|j<|j8||	t	 j
r jnd}Qt6|QjB||3 j jd}RtCd}StC j@d }T|M||S|Tddt6|Rddd  D|S}S D|T}Tt	 jp jr!|E|||}UtjF|d |U ddD ] }V|d |V }|M||S|Tdt6|Rd	dd  D|S}S D|T}Tq tjF|ddD ]}V|M||V d |S|Tdd  D|S}S D|T}Tq(|GG }W|GH|8|W t |)j|}X I|8|GjJ|||X|d ||||| d S ) Nr   qhead_per_kvhead_packgqar   )r   seqlen_kNr   r    )NNr           Fr   )	transposenum_matricesTr   num_rows)
thr_mma_qk
thr_mma_pvtSrQtSrKtOrVtr   )smem_thr_copy_Qsmem_thr_copy_Ksmem_thr_copy_VtSsQtSsKtOsVt)r   c                    2   t  d ur| tjj|    dd d S d S NT)fastmathr   r   r]   r.   tanhr   acc_Sr~  r7   r8   scoremod_premask_fn     "z=FlashAttentionForwardSm80.kernel.<locals>.scoremod_premask_fn)
mma_paramssmem_copy_paramssoftmaxr  r#  r  )r   r  c                     sJ   t j jd d  t jr#t j  } t |  d S d S )Nr=   r   )	r]   r   cp_async_wait_groupr)   r   r+   r   r   r   )tSrQ_copy_view)r5   r  r  r  r7   r8   preprocess_Q  s   


z6FlashAttentionForwardSm80.kernel.<locals>.preprocess_Q)r  r  r   r   mask_causal
mask_localmask_seqlen)is_first_n_block	check_infmask_fnunrollr=   )r  r  r  )Kr]   r   
thread_idx	block_idxr   r'   r(   r$   r%   r   r&   r#   r   rl   get_n_block_min_maxr0   r2   r   r?   r   SmemAllocatorallocater   
get_tensorrE  r+   rD  r   
recast_ptrr   r    transpose_viewr   r   r   make_fragment_Apartition_Amake_fragment_Bpartition_Bpartition_shape_Cmake_fragmentrV   fillre   r
   LdMatrix8x8x16bOpmake_tiled_copy_Amake_tiled_copy_Br   r   r1   r   resetr   r   r  r  r#  compute_one_n_blockr	  r   cp_async_commit_groupr   r  r)   r   
apply_maskrW   r   !get_n_block_min_causal_local_maskrangefinalize	rescale_Or   row_sum)Yr5   r   r   r   r   r   r}  r~  rN  rO  r_   r`   ra   rb   rc   rs   rt   ru   rv   r:  r;  rz  r   _r   num_head
batch_size
block_infor   n_block_minn_block_maxn_block
blkQ_shape
blkK_shape
blkV_shaper   num_head_kvgKgVrr  storager   rE  rD  sVtgmem_thr_copy_Kgmem_thr_copy_Vr  r  r  r  r  r  r  r  acc_shape_Or   smem_copy_atom_QKsmem_copy_atom_Vr  r  r  r  cKr  r  r  r  cVr  r  r  r  r  r  r#  r  r  gmem_thr_copy_Qr  stagemaskr  smem_pipe_readr  n_block_min_causal_local_maskn_tile	row_scaler   r7   )r5   r  r~  r  r  r8   rw  p  s   













 
	














z FlashAttentionForwardSm80.kernelFTr  r  r  r  r  r  r  r#  r  r  r  r  c                    s  fdd}|j jjf}t|tj}|d |   fdd}|  t	j
|j ||j|j|j|jdddtjdkrE|ndf |j|jjd		 |	| fd
d}tjdkrp|  |  t|
dur||
|d |j| |d}||j| t|j}|| j t|jt |j!}tjdkr|  |  t	"|j#|j||j$|j%dddtjdkr|ndf |j& dS )zCompute one n_block of S/O.

        This function provides different variants for processing the first n block versus
        subsequent blocks.
        c                      s$   t j jd d  t j  d S )Nr=   )r]   r   r  r)   r   r7   r   r7   r8   sync  s   z;FlashAttentionForwardSm80.compute_one_n_block.<locals>.syncr  c                      sL   j dksj  d dkrj  d  oj dkd tj  d S )Nr   r   r  r)   r]   r   r  r7   )r  r#  r  r5   r  r7   r8   load_V_next  s
   zBFlashAttentionForwardSm80.compute_one_n_block.<locals>.load_V_nextNr   r   )	A_in_regsc                      s0   j  dkr j  dd tj  d S )Nr   Fr  r  r7   )r  r  r5   r  r7   r8   load_K_next  s   zBFlashAttentionForwardSm80.compute_one_n_block.<locals>.load_K_nextr  is_firstr  )'r  r  r'   r(   r]   r  r?   rV   r  r4  gemmr  r  r  r  r   r)   r  r  r+   r   online_softmaxr  r   r   r    r   r   r   r   r   r   convert_layout_acc_frgAr   gemm_rsr  r  r  r  )r5   r  r  r  r  r  r  r  r#  r  r  r  r  r  acc_shape_Sr  r  r  r  rPtOrPr7   )r  r  r#  r  r5   r  r8   r  j  sF   
"
"z-FlashAttentionForwardSm80.compute_one_n_block)NNNNN)NFT)r$  r%  r&  r\   r   r   r]   r+  r,  r   r-  r.  r?   rV   rW   r   rw  ComposedLayoutr/  r1  r2  r   r   r   r  r7   r7   r7   r8   r3    s    		
R	
 z	
r3  c                B       sX  e Zd ZdZdddef fddZdd Zd	d
 Zdd Ze	j
									dfde	jde	jde	jde	jdee	j dejdejdee	j dee	j dee	j dee	j dee	j dejeB dB dejeB dB dejeB dB dee	j f ddZe	jde	jde	jde	jde	jdee	j dee	j dee	j dee	j dee	j d ee	j d!ee	j d"ee	j d#ee	j d$ejd%eej deej deej d&e	jd'e	jd(e	jd)e	jd*e	jdB d+e	jd,e	jd-e	jd.e	jd/e	jd0e	jd1e	jd2ed3eje d4eje f@d5d6Ze	j
de	jde	jde	jd7e	jd8e	jd9e	jd e	jd!e	jd"e	jd:ejjd;ejjd<ejd=e d>ed?efd@dAZ!e	j
d/e	jd0e	jd1e	jde	jde	jdee	j d7e	jd8e	jdBe	jdCee	j dDe	jd:ejjd;ejjd<ejd+e	jd.e	jd#ee	j dEejd$ejd%ejd=e d>edFed?ef0dGdHZ"e	j
		I		dgdJejdKejj#ej$B d/e	jd0e	jd1e	jd:ejjd;ejjdLe%dMe%dNe&dOedPee dQejdRejdSej'fdTdUZ(e	j
			dhdJejdKejj#ej$B d/e	jd0e	jd1e	jd:ejjd;ejjdLe%dMe%dNe&dOedPee dRejdSej'fdVdWZ)e	j
dXdY Z*dZd[ Z+d\d] Z,d^e	jd_e	jd`e	jdaejjdbejdcejj#ej$B fdddeZ-  Z.S )iFlashAttentionForwardSm90rX  T)intra_wg_overlapr  c                   s"   t  j|i | || _d| _d S )NT)superr9   r  mma_pv_is_rs)r5   r  argskwargs	__class__r7   r8   r9     s   
z"FlashAttentionForwardSm90.__init__c                 C   s   t ttjjj| j| j	| j}|}t ttjjj| j| j
| j}|}| js;t ttjjj| j| j| j}nd }|||||fS r   )r   make_smem_layout_atomsm90_utils_basicr5  r?   r   
LayoutEnum	ROW_MAJORr    r0   r2   r  r(   r6  r7   r7   r8   r\     s.   z/FlashAttentionForwardSm90._get_smem_layout_atomc              
   C   s   t j| j| jtjjtjjtj| jd ddfd| j	fd}t j| j| jtjjtjj
tj| jd ddfd| jf| jr<tjjntjjd}t j| j| jtjjtjj
tj| jd ddfd| jftjjd}|||fS )N@   r   )atom_layout_mnkr  )r  r  a_source)r  make_trivial_tiled_mmar    r   OperandMajorModeKr?   rV   r'   r(   MNr2   r  OperandSourceRMEMSMEM)r5   r:  r;  tiled_mma_pv_rsr7   r7   r8   r     s<   	


z(FlashAttentionForwardSm90._get_tiled_mmac              	      s\  t j rdnd}d}d}fddtjjjf|||fD \ttjtj}tj	j
tj	jj|f df t jd urOtjnd}tj	j
tj	jj|f df tj	jtjdf tj	jtjjd f  tj	jtjjd f tj	G  fddd}tj	G  fd	d
d
}t j r|S |S )Nr   r<  c                    s4   g | ]\}}t jjt jj jt |f |f qS r7   r=  )rB  r   	alignmentr   r7   r8   rC    s    "zEFlashAttentionForwardSm90._get_shared_storage_cls.<locals>.<listcomp>r   r=   c                       sF   e Zd ZU ed<  ed< ed< ed< ed< ed< ed< dS )	zKFlashAttentionForwardSm90._get_shared_storage_cls.<locals>.SharedStorageQKVmbar_ptr
mbar_ptr_K
mbar_ptr_VrD  r   rE  sPNrF  r7   )mbar_ptr_K_structmbar_ptr_QO_structmbar_ptr_V_structrG  	sP_structrH  rI  r7   r8   rJ    s   
 rJ  c                       s>   e Zd ZU ed<  ed< ed< ed< ed< ed< dS )zPFlashAttentionForwardSm90._get_shared_storage_cls.<locals>.SharedStorageSharedQVr  r  r  r   rE  r  NrF  r7   )r  r  r  rG  r  rK  r7   r8   rL    s   
 rL  )r   r&   zipr_   r`   ra   rB   r]   rA  r>  r?  r@  r    rc   r?   Int64r)   r+   )r5   sQ_alignmentsK_alignmentsV_alignmentrM  	cosize_sPrJ  rL  r7   )	r  r  r  rG  r  rK  rH  rI  r5   r8   r     s*   
 	z1FlashAttentionForwardSm90._get_shared_storage_clsNr   r   r   r   r   r   r   mCuSeqlensQmCuSeqlensK	mSeqUsedQ	mSeqUsedK
mPageTabler   rN  rO  rP  c           -         s  |du sJ d| j dd |||||||	|
|f	D   dd fdd||||fD \}}}}t|du r:g d	ng d
fdd||fD \}}t|	du rUg d	ng d
  fdd||fD \}}t|du rpg dnddg}t|durt|jtj|j|dnd}|  \}}}|j| _	d| _
| j	| j
 | _d| _| j	| _| j	| _d| _d| _t| jr| jdko| jdkn| jdk| _| jdko|du o|
du o| j | _|   |  }t }t }t }t|jtj| jddgd| _t|jtj| j ddgd| _!t|jtj| j"ddgd| _#t$||| j| j%| jf\}}t$||tj| j ddgd| j&| jfd\}}t$||tj| j"ddgd| j&| j'fd\}}t| jrmt$||| j(| j%| j'f\}}nd}t| jr=| j)|j*d f|j*d |j*d g|j*dd R } |j+d |j+d f|j+d |j+d | j) g|j+dd R }!t|jtj,| |!d}| j)|j*d f|j*d |j*d g|j*dd R }"|j+d |j+d f|j+d |j+d | j) g|j+dd R }#t|jtj,|"|#d}t|dur=| j)|j*d f|j*d g|j*dd R }$|j+d |j+d f|j+d | j) g|j+dd R }%t|jtj,|$|%d}t|dupF|
durLt-}&nt| j. pT| j/rYt0nt1}&t2t3t|j*d | j%t|j*d t|du r~t|j*d n	t|j*d d t|j*d |j*d |j*d t|durt|j*d nt|j*d t|j*d  | j%| j&f||
t| jr| j)nd| j4j5d d| j.p| j/d}'|&6|'}(|&7|(})t89t8j:}*t|du r||* }+d},n||* }+t;<|| },t|dur	t;=|}t|durt;=|}| j>g t| j r!|n|||||||	|
||||||+|,||| j| j | j"| j(| j?| j@| jA| jB| jC||||(|&|R  jD|)| jEddg| |dd dS )r   NrQ  c                 s   rR  r   rS  rU  r7   r7   r8   rW  =  s    z5FlashAttentionForwardSm90.__call__.<locals>.<genexpr>c                    rY  )Nc                 3   rZ  r[  r]  r_  ra  r7   r8   rW  A  rb  zGFlashAttentionForwardSm90.__call__.<locals>.<lambda>.<locals>.<genexpr>rc  r   ra  r7   ra  r8   rd  A  re  z4FlashAttentionForwardSm90.__call__.<locals>.<lambda>c              	      rf  rg  rh  rU  ri  r7   r8   rC  B  rk  z6FlashAttentionForwardSm90.__call__.<locals>.<listcomp>rl  )r   r=   r   c              	      &   g | ]}t |jt j|j d qS r   rn  rU  )QO_layout_transposer7   r8   rC  D      c              	      r#  r$  rn  rU  )KV_layout_transposer7   r8   rC  I  r&  rp  r   r   r   r   r<         r=   rX  rm  r   r;   F)total_qtile_shape_mnr  r   r  element_sizeis_persistentlpt)rq  r  rr  r   min_blocks_per_mp)FrX   r   r]   r   r   ro  r   r   r   rs  num_threads_per_warp_groupnum_mma_warp_groupsrn   rm   rp   num_mma_regsnum_producer_regsr  r0   use_scheduler_barrierr   r&   r   r   r   r	   CopyBulkTensorTileG2SOpCopyBulkTensorTileS2GOpry  rT  r_   tma_copy_q_bytesr`   tma_copy_k_bytesra   tma_copy_v_bytesmake_tiled_tma_atomr'   r(   r2   rb   r#   rl   r   rq   r   r$   r%   r   r   r   rt  r    rd   to_underlying_argumentsget_grid_shaper.   ru  rv  r?   rV   rW   rw  rc   rs   rt   ru   rv   rx  r*   )-r5   r   r   r   r   r   r   r   r  r  r   r!  r"  r   rN  rO  rP  LSE_layout_transposer:  r;  r  rz  rs   gmem_tiled_copy_KVrv   
tma_atom_Qtma_tensor_Q
tma_atom_Ktma_tensor_K
tma_atom_Vtma_tensor_Vr   shape_Q_packedstride_Q_packedshape_O_packedstride_O_packedshape_LSE_packedstride_LSE_packedTileSchedulertile_sched_argstile_sched_paramsr{  r|  r}  r~  r7   )r'  r%  rj  r8   r   "  s^  "



* $   


2<2<*40<





	
 !

z"FlashAttentionForwardSm90.__call__r?  rA  rC  r   r}  r~  r_   r`   ra   rb   rc   rs   rt   ru   rv   r:  r;  r  rM  rK  rz  c!           7      C   s$  t jt j }!|!dkr,t| j rt|
 t| t| t| jr,t| t	j
 }"|"| }#|#j }$|!dkrOt j|$t| j rKdn| j t	jt	jjj}%t	jt	jjj| j| j }&tjj|#j | j|%|&| jdd}'tjj|#j | j|%|&| jd}(|#jj|j |j!d})|#j"j|j |j!d}*t| j# r|#j$j|j |j!d}+n|#jj|j |j!|j%d}+t
&|+},t|d ur|#j'|}-|#j'j|j |j!d}.nd\}.}-|#j|}/t (t j)|/j*|j!|/j%d	|j }0t+| j,| j-| j.| j/||t| jr| j0ndd
}1t1t2t| j r|j3d n|j3d d |j3d ||||	d}2t1t4| j,| j-||t| jr6| j0ndd}3t1|j|}4|!dk rat j5| j6 | 7||||)|*|+|
|||'|(|$|1|2|4 d S t j8| j9 t j: \}5}6}6|5d }5| ;|||||||)|*|,|.|0|'|(|$||||5|||1|2|3|4 d S )Nr   r   F)barrier_storager)   producer_groupconsumer_grouptx_count	init_wait)rN  r)   rO  rP  rQ  )swizzle)rS  r    )NNr  r  )seqlen_q_staticseqlen_k_staticr  r  r   r!  )rN  rO  r  r   r   )<r]   r   r   r   r   r&   r	   prefetch_descriptorr   r?   r   r  r  r  data_ptrmbarrier_initrm   r   CooperativeGroupAgentThreadrs  r0  PipelineTmaAsyncNoClustercreater  r)   r8  r  r9  r   r  rk   innerrE  r+   rD  rT  r  r  r   r  r   r   r'   r(   r$   r%   r#   r   r   rl   r   warpgroup_reg_deallocr3  r   warpgroup_reg_allocr2  r  mma)7r5   r   r   r   r   r   r  r  r   r!  r?  rA  rC  r   r}  r~  rN  rO  r_   r`   ra   rb   rc   rs   rt   ru   rv   r:  r;  r  rM  rK  rz  r   rr  r  
mbar_ptr_Qpipeline_kv_producer_grouppipeline_kv_consumer_group
pipeline_k
pipeline_vr   rE  rD  r  sP_pir  sO_pir   r  SeqlenInfoClsAttentionMaskClsTileSchedulerClsr   r  r7   r7   r8   rw    s   $







 
 &

z FlashAttentionForwardSm90.kernelr   rE  rD  re  rf  rb  r  ri  rk  c           *         s  t jt j d }|dkrtd}ttjjj	| j
}| }| }|jr|j\}} | tj rC|d d | f }nt| j rLjndjf}t |df|d d |f }t| j ri|| j n|tj r fdd||fD \}}nfdd||fD \}}t || j| jfd}t || j| jfd}t| j rt || j| jf|df}t|dt dt |ddt |dd\}}t|dt dt |ddt |dd\} }!t|	dt dt |ddt |dd\}"}#t| j||!| |
}$t| j|	|#|"|}%t| j rC|dN }t j   t j!|| j" W d    n	1 s5w   Y  t j#||||d	 |$|\}&}'tj%|'|& dd
D ]}(|'|( d })|$|)|d |%|)|d |&  qT|'  |(  |) }|js*d S d S d S )Nr   r   r   c                    s   g | ]
}|d d  f qS r   r7   rU  )r   head_idx_kvr7   r8   rC    s    z2FlashAttentionForwardSm90.load.<locals>.<listcomp>c              	      s(   g | ]}t jd f|dd f qS )r   N)r]   r   offset_krU  )rl  r   r7   r8   rC    s   ( r  r=   tma_bar_ptrr  )producer_state)*r]   r   r   r   r?   rW   r   make_pipeline_statePipelineUserTypeProducerr)   initial_work_tile_infois_valid_tiletile_idxr   r   r&   r   r   r#   has_cu_seqlens_kr   r(   r0   r2   r'   r	   r   rq   r   r   r  	elect_onembarrier_arrive_and_expect_txr7  r   r  r  advanceprefetch_next_workadvance_to_next_workget_current_work)*r5   r   r   r   r   rE  rD  r?  rA  rC  re  rf  rb  r  ri  rk  warp_idx_in_wgq_producer_phasekv_producer_statetile_scheduler	work_tiler   r   mQ_curr   mK_curmV_curr  r  r   r  r  r  r  r  r  r  r#  r  r  r"  r  r7   )r   rl  r   r8   r     s   


zFlashAttentionForwardSm90.loadr  r  r   r   rj  c           F         s  t j|| j }t j| j| jd}||}|||}|||}|||}|	|
|}t| jrQ|| j| jf} t tt | | j}!n|||
}!|	|
|	}"t| j| j}#t |#||}$t|
d ur}|$|
nd }%|   || j| jf}&t |&tj}'t|||!|"|'d}(t|$|%d})tt| jr| jn| j||||||(|)dd	}*t d}+t!"tj!j#j$| j%},| }-|-& }.|.j'r fdd}/t(||'j)d d |'j)d	  d
}0t|*|0|/d}1|.j*\}2}3}4||4}5||5j+|5j,}6t|6j-|2|| j.| j/d}7|00  t| j1r_t2| j| j3| j4| j5}8t|5j6 r1|d d |3|4f }9nt| j1 r;|5j7nd|5j7f}:t 8|:df|d d |3f }9|89|9||||2|5j+ tj:|dd |;|5|2\};}<t jj<||+d |+d	N }+d}=t| jrt || j| jftj}>|=|, t>j?||>||d d d |,j@f ddd |A|, |/|> |7|>|<d	 dd |0jB|>dd t C|>jDt|>jE}?t| jr|(jFnt G|?| j}!tH|?|! t| j r |$I|!}@t J|$|@|% t jjKt jjLjMt jjNjOd t jP  n| Q  |1|<d	 |,dt|7dddd},d}=|<d	8 }<t| j.p!| j/rR|R|5|2|;}AtjS|<|A d	dD ]}B|<d	 |B }C|1|C|,t|7dd|=d},d}=q4tT|<|A}<|U|5|2|;}DtjS|<|D d	dD ]}B|<d	 |B }C|1|C|,d|=d},d}=qbt| j/o|jVd urtT|<|D}<tjS|<|; d	dD ]}B|<d	 |B }C|1|C|,dt|7dd|=d},d}=qt| jr|=|,|W|, t>j?||(jX|(jF|(jYd d d |,j@f |= dd tZ[d |A|, |,\  n| ]  |0^ }E|0_|'|E | `|'|0ja||||5|||||2|3|4 |-b  |-c }.|.j'sd S d S )Nr   )r  r  r  r  r   )smem_thr_copy_PtPsPT)r:  r;  r  re  rf  r  r  r  r   c                    r  r  r  r  r  r7   r8   r  4  r  z:FlashAttentionForwardSm90.mma.<locals>.scoremod_premask_fnr   r  )r  r  r  )noinc)phaseF	zero_initwg_wait)r  r  )r  r   r  )r  r  O_should_accumulater  )r  r  )r  r  )r  r  r  rc  )dr]   r   r   r0  rq   r1  r   r  r  r  r  r   r  r  r'   r(   r  r   r  r    r   r   r   mma_initr2   r?   rV   r   r   r  mma_one_n_block_intrawg_overlapmma_one_n_blockrW   r   rq  rr  Consumerr)   rt  ru  r   rl   rv  r   r  r  r$   r%   r  r&   r   r0   r3   r#   r   r   r   r	  cp_async_mbarrier_arrive_sharedr  mbarrier_waitconsumer_wait
sm90_utilsr  indexconsumer_releaser  r   r   r   r  r   cvt_f16r   r   r   r   r   r   r   	sync_warpwarp_scheduler_barrier_syncr  r  r  !get_n_block_min_before_local_maskrN  consumer_try_waitr   r  r   
wait_grouprz  warp_scheduler_barrier_arriver  r  r   r  r|  r}  )Fr5   r:  r;  r  r   r   r   r   rE  r  r  r   re  rf  rb  rs   rv   r   r   r}  r~  r  ri  rj  rk  warp_group_idxwarp_group_thread_layoutr  	wg_mma_qk	wg_mma_pvr  r  acc_S_shaper  r  smem_copy_atom_Pr  r  r  r   r  r  mma_one_n_block_allq_consumer_phasekv_consumer_stater  r  r  r  r  r   r   r   r   r  r  r&   r  r   r  r  r  r  tOrP_acctPrPr  r  r  n_block_min_before_local_maskr  r7   r  r8   ra    s  


 

 





 zFlashAttentionForwardSm90.mmaFr  r  r  r  r  r  r  r  r  r  c              	   C   s  t || j| jftj}|||| t	j
|||j|jd d d |jf ddd |   td || || t|d urI|||d |
j|||d}t |jt|j}t| jrd|jnt || j}t|| t| j r|	j|j}t  |	j||	j! |
"|j#| t| j rt j$j%t j$j&j't j$j(j)d t j$*  |||| | +  t	j
||j#|j|j,d d d |jf | dd || |-  |S )NTrc  r  r   r  r  r   ).r]   r  r  r'   r(   r?   rV   r  r  r  r  r  r  r  r  r   r  r  r   r  r   r   r   r  r   r  r  r   r    r  r  r   r   r  r  r   r   r   r   r   r   r   r  r  r  rz  )r5   r  r  r:  r;  r  re  rf  r  r  r  r  r  r  r  r  r  r  r  r  r  r7   r7   r8   r    sH   




z)FlashAttentionForwardSm90.mma_one_n_blockc              	   C   s  |  }|  t|| j| jftj}|	||
| |   tj|||j|jd d d |jf ddd |	||
| tj||j|j|jd d d |jf | dd |   td || || t|d urt|||d |
j||d}td || t|jt|j}t| jr|jnt || j!}t"|| t| j r|	j#$|}t%|	j#||	j& |
'|j| t| j rtj(j)tj(j*j+tj(j,j-d tj(.  |S )	NTrc  r  r   r  r  r   r   )/clonerz  r]   r  r  r'   r(   r?   rV   r  r  r  r  r  r  r  r  r   r  r  r  r   r  r  r   r  r   r   r   r  r   r  r   r    r  r  r   r   r  r  r   r   r   r   r   r   r  )r5   r  r  r:  r;  r  re  rf  r  r  r  r  r  r  r  smem_pipe_read_vr  r  r  r  r  r7   r7   r8   r     sL   





z9FlashAttentionForwardSm90.mma_one_n_block_intrawg_overlapc                 C   sF   t jdd}t| jr|dkr!tjjttj	d| j
 d d S d S d S )NFr  r   r=   r   )r   canonical_warp_group_idxr   r4  r]   r   r   r-   r   WarpSchedulerWG1r0  )r5   r  r7   r7   r8   r  =  s   

z"FlashAttentionForwardSm90.mma_initc                 C   s>   t | jrtjjttjd tj	dd d| j
 d d S d S )Nr   Fr  r=   r   )r   r4  r]   r   r   r-   r   r  r   r  r0  r   r7   r7   r8   r  G  s   

z5FlashAttentionForwardSm90.warp_scheduler_barrier_syncc                 C   s   t | jr>| jdv sJ tjddd }t | jdkrd| n|| jd k r*|d nd}tjjtt	j
| d| j d d S d S )N)r=   rm  Fr  r   r=   r   r   )r   r4  r1  r   r  r]   r   r   r-   r   r  r0  )r5   cur_wgnext_wgr7   r7   r8   r  N  s   
0
z7FlashAttentionForwardSm90.warp_scheduler_barrier_arrivetma_atomr  r  r   r  rp  c                 C   s8   | | tj||d |f |d |jf ||d d S )Nrn  )producer_acquirer]   r   r  producer_get_barrier)r5   r  r  r  r   r  rp  r7   r7   r8   r  Y  s   


z FlashAttentionForwardSm90.load_K)	NNNNNNNNN)NFTT)NTT)/r$  r%  r&  r   r)  r9   r\   r   r   r]   r+  r,  r   r?   rV   r-  r.  floatrW   r-   r   rw  r0  r  r/  r1  r   r2  r   r   PipelineAsyncPointerr   r   ra  PipelineStatePipelineStateSimpler   r   Booleanr  r  r  r  r  r  __classcell__r7   r7   r  r8   r    s<    *
	
 6	
 ! ,	
X	
 h	
:	
<
	r  )6r.   typesr   typingr   r   r   r   	functoolsr   cuda.bindings.driverbindingsdriverr-  r?   cutlass.cuter]   r   cutlass.cute.nvgpur	   r
   r   cutlass.utils.ampere_helpersr   r   rC   cutlass.utils.hopper_helpersr   r  flash_attn.cuter4  r  flash_attn.cute.maskr   flash_attn.cute.softmaxr   flash_attn.cute.seqlen_infor   flash_attn.cute.block_infor   r   flash_attn.cute.pack_gqar   flash_attn.cute.named_barrierr   flash_attn.cute.tile_schedulerr   r   r   r   r   r   r3  r  r7   r7   r7   r8   <module>   s>      Q   G