o
    i                    @   sH  d dl Z d dlZd dlmZmZmZmZ d dlmZ d dl	m
  mZ d dlZd dlmZ d dlmZmZmZ d dlmZ d dlm  m  mZ d dlm  mZ d dlm  mZ d dlmZ d dlm Z  d dl!m"Z" d d	l#m$Z$ d d
l%m&Z& d dl'm(Z) d dl'mZ* d dl+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3 G dd dZ4dS )    N)TypeTupleCallableOptional)partial)Float32Int32
const_expr)cpasync)AttentionMask)SoftmaxSm100)
SeqlenInfo)	BlockInfo)PackGQA)mma_sm100_desc)blackwell_helpers)
FastDivmod)TileSchedulerArgumentsSingleTileSchedulerStaticPersistentTileSchedulerSingleTileLPTSchedulerSingleTileVarlenScheduler
ParamsBasec                9   @   sx  e Zd ZdZ								ddedee d	eje d
edededededefddZ	dd Z
ej									ddejdejdejdejdeej dedejdeej deej deej deej deej d eeB dB d!eeB dB d"eeB dB d#eej f d$d%Zejdejdejdejdejdeej deej deej deej deej deej d&ejd'ejd(ejd)eej d*ed+ee d!ee d"ee d#eej d,ejd-ejd.ejd/ejd0ejd1eej d2ejd3ejd4ef8d5d6Zejd7ejjd8ejjdejdejdejd9ejd:ejd;ejdeej d&ejd'ejd(ejd<ejjd=ejd>ed?e d@e f"dAdBZ!ejd2ejjd3ejjd9ejd:ejd;ejdCej"dDej"dEej"dFe#ejejf dGe$ej dHe#ejejf d<ejjd=ejd>ed?e d@e f dIdJZ%ejdKeeB d*ed7ejjdLejdMejdeej d#eej d=ejd>ed?e dNe d@e fdOdPZ&ej		ddQedRedSedTedUe'd=ejdVed7ejjdWejdXejdYejdZejd[ejd\ejdMejdKeeB d]ee  d^ed_e#ejejejf f&d`daZ(ejd7ejjd8ejjdbejdGe$ej dMejdejdejdcejd#eej d)ejd=ejd*ed>ed?e d@e fdddeZ)ejdfejjdgejdhediefdjdkZ*ejdfejjdgejdhediedcejf
dldmZ+ejdejdcejd1ejd)eej d=ejd?e d@e fdndoZ,dpejdqejdrejdsejdtejduedKedvefdwdxZ-ej	ddpejdyejdzejdsejdtejdued{ejj.d|e/d}ee fd~dZ0ejdejdKedvefddZ1dd Z2dS )FlashAttentionForwardSm100d   N   F   Thead_dim
head_dim_vqhead_per_kvhead	is_causalis_localpack_gqam_block_sizen_block_sizeis_persistentc
                    s  d}
t t||
 |
  _|d ur|n|}||k _t t||
 |
  _ j jk _| jk _| jk _| _	| _
d _ jdv sJJ  j| | jf _|| jf _| j|f _t _t _d _|	 _| _| _| _| _|r| j dksJ dd _ jdko jd	k _ jr j jksJ d _d
 _d _d _d _d _d _d _ d}| _!t"j#j$t%g  j j j j j j j R   _&d _'d j
g _( fddt) jD  _* j*d  j  _+ j+|ksJ  j
d  _, fddt)dD  _- j( _. jdk r)d _/d	 _0d _1n js1 jr3dnd _/d	 _0 jsA jrCd	nd _1d _2d _3d S )N      )r   r'   )r   r   r   z?For PackGQA, m_block_size must be divisible by qhead_per_kvheadF   @   )r   r   r'      )            )   	   
            )   )   i   r   c                    s&   g | ]} j d   j | j  qS ))tmem_s_offsetr$   head_dim_v_padded.0iself U/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/flash_fwd_sm100.py
<listcomp>   s   & z7FlashAttentionForwardSm100.__init__.<locals>.<listcomp>r7   c                    s   g | ]
} j |  j qS r?   )r8   tmem_s_to_p_offsetr:   r=   r?   r@   rA          `      0      P         )4intmathceilhead_dim_paddedsame_hdim_kvr9   same_hdim_kv_paddedcheck_hdim_oobcheck_hdim_v_oobr#   r$   q_stage	cta_tilermma_tiler_qkmma_tiler_pvr   qk_acc_dtypepv_acc_dtypecluster_shape_mnr%   r    r!   r   r"   s0_s1_barrieroverlap_sO_sQsoftmax0_warp_idssoftmax1_warp_idscorrection_warp_idsmma_warp_idload_warp_idepilogue_warp_idsempty_warp_idstmem_alloc_colscutearch	WARP_SIZElenthreads_per_ctatmem_alloc_sync_bar_idr8   rangetmem_o_offset
tmem_totalrB   tmem_p_offsettmem_vec_offsetnum_regs_softmaxnum_regs_correctionnum_regs_othernum_regs_emptybuffer_align_bytes)r>   r   r   r   r    r!   r"   r#   r$   r%   hdim_multiple_ofSM100_TMEM_CAPACITY_COLUMNSr?   r=   r@   __init__7   s   

z#FlashAttentionForwardSm100.__init__c                 C   sz   | j jdkrdnd| _d| _d| _| jdko| jdko| jdk| _| jr/| j| j| j  d nd| _	| j	d	 dks;J d
S )a  Set up configurations and parameters for the FMHA kernel operation.

        This method initializes and configures various attributes required for the
        execution of the fused multi-head attention kernel, mainly about the pipeline stages:

        - Sets up staging parameters for Q, K, V inputs and accumulator data
        - Configures pipeline stages for softmax, correction, and epilogue operations
        r/   r+   r*   r   r'   r(   r   r   rJ   N)
q_dtypewidthkv_stage	acc_stage	epi_stagerN   r9   uneven_kv_smemr#   uneven_kv_smem_offsetr=   r?   r?   r@   _setup_attributes   s   
 "z,FlashAttentionForwardSm100._setup_attributesmQmKmVmOmLSEsoftmax_scalestreammCuSeqlensQmCuSeqlensK	mSeqUsedQ	mSeqUsedK
mPageTablesoftcapwindow_size_leftwindow_size_rightlearnable_sinkc           =         sx  |j _|j _|j _|j _dd fdd||||fD \}}}}t|du r/g dng dfdd||fD \}}t|	du rJg dng d  fd	d||fD \}}t|du reg d
nddg}t|dur|t|jtj	|j
|dnd}t|	du rg dng d}t|jtj	|j
|d}tjj| _tjj| _tjj| _tjj|_tjtjjkrtdtjtjjkrtdtjtjjkrtdtjjkrtdj dj tjjkrtdj dj   jdko!|du o!|
du _d_tjdko9j o9j  o9j!rK|dusF|
durHdnd_tj"j#}tj$j%}tjj}t&'jjjj(|j)dd }t&'j|jj*|j+dd |}g j,dR _-t.t/j-|j0j1f_2j+dd _3t&4|j)jj5t&6|j)jj7t&4|j+jj8}t&6|j+jj7}t&9jjj3j:}tj; rJtt<j=j>d d}tt<|j=j>d d}tj?st<||n|| d }t@jAdtj/g j=j1dd j7R g j=j>dd |R dt@|jAdtj/g |j=j1dd j7R g |j=j>dd |R d}tj!rjB|j1d f|j1d |j1d g|j1dd R }|j>d |j>d f|j>d |j>d jB g|j>dd R }t|jtj/||d}jB|j1d f|j1d |j1d g|j1dd R } |j>d |j>d f|j>d |j>d jB g|j>dd R }!t|jtj/| |!d}t|durjB|j1d f|j1d g|j1dd R }"|j>d |j>d f|j>d jB g|j>dd R }#t|jtj/|"|#d}tCD|}$tCE }%tjFG|$|tj	g ddj)|j2j1\}&}'tjFH|$|tj	g ddj)|j2j1\}(})tjFH|$|tj	|g ddj+|j2j1\}*}+tItJ|j1j3},tj r~d_Kd _LtjjMtNjK _OtjrtCP|%|tj	|ddgd|,\}-}d}.nGd}-d!}/|/jjQ }0tjRtjFS j|/d"}1|j=j1d d |0 }2tjTjO|2 |2fd#d$}3jU|3j1d  dksJ t/d|0f}4tV|1|3|4}.tWjtj	g dd_XtWjtj	g dd_YtWjtj	|g dd_Zt|dup|
dur%t[}5ntjp,j r2t\}5ntj] r;t^nt_}5t`tatb|j1d jcd tb|j1d t|du rbtb|j1d n	tb|j1d d t|du rztb|j1d n	|j1d |j1d  |j1d |j1d t|durtb|j1d ntb|j1d tb|j1d  jcdd ||
tj!rjBndjjQd% j]jpȈj d&}6|5d|6}7|5_e|5f|7}8d_gjgj5 _hjhj5 _ijij7 _jjjj7 _kjkd _ljld _mjmd _njnd _ojoj: _pjpj: _qjqd _rjrd% _sjsd _tjtd _utjv rAtw|ndtjxG fd'd(d(}9|9_ytz{tzj|}:t|du ri||: };d}<n
||: };t}|| }<t|dur~t~|}t|durt~|}|'|)|+||||	|
|||&|(|*|-|;|<|||||||.|||7j|8jddgj-jyW |dd) dS )*a  Execute the Fused Multi-Head Attention operation on the provided tensors.

        This method prepares the input tensors for processing, validates their shapes and types,
        configures the computation parameters, and launches the CUDA kernel.

        The method handles:
        1. Tensor layout transformations for specific memory access patterns
        2. Validation of tensor shapes and data types
        3. Initialization of hardware-specific parameters and memory layouts
        4. Configuration of TMA (Tensor Memory Access) operations
        5. Grid and work scheduling computation
        6. Kernel launch with appropriate parameters
        c                    s,   g  fdd j d d D  j d R S )Nc                 3   s&    | ]}t j|d  jj dV  qdS )r   )divbyN)rd   assumeelement_typerx   )r;   str?   r@   	<genexpr>   s   $ zHFlashAttentionForwardSm100.__call__.<locals>.<lambda>.<locals>.<genexpr>r7   strider   r?   r   r@   <lambda>   s   , z5FlashAttentionForwardSm100.__call__.<locals>.<lambda>c              	      s*   g | ]}t |jt j|j |d qS )r   )rd   make_tensoriteratormake_layoutshaper;   r   )
new_strider?   r@   rA      s   * z7FlashAttentionForwardSm100.__call__.<locals>.<listcomp>N)r   r*   r'   r   )r   r'   r   c              	      &   g | ]}t |jt j|j d qS moderd   r   r   selectlayoutr   )QO_layout_transposer?   r@   rA          c              	      r   r   r   r   )KV_layout_transposer?   r@   rA      r   )r'   r   r   r   r   r   )r   r   r'   r*   )r   r   r'   z!The layout of mQ is not supportedz!The layout of mK is not supportedz!The layout of mV is not supportedzType mismatch: z != Z   r&   r)       r1   r'   r7   r   r*   )r   r   r'   )r5   r6   r?   r   )num_bits_per_copy)r   r   )orderr/   )total_qtile_shape_mnr   r   qhead_per_kvhead_packgqaelement_sizer%   lptc                       s   e Zd ZU ejjejjf e	d< e
e	d< ejjejj d f e	d< ejjejjjf jf e	d< ejjejjjef jf e	d< ejjejjje f jf e	d< dS )	z:FlashAttentionForwardSm100.__call__.<locals>.SharedStoragembar_ptrtmem_holding_bufr'   sScalesOsQsKN)__name__
__module____qualname__rd   structMemRangecutlassInt64
mbar_total__annotations__r   r   rS   r#   Aligno_dtypers   rw   cosizek_dtyper?   )	sK_layoutsO_size	sQ_layoutr>   r?   r@   SharedStorage  s&   
  r   )gridblockclustersmemr   min_blocks_per_mp)r   rw   r   v_dtyper   r	   rd   r   r   r   r   r   utils
LayoutEnumfrom_tensormma_major_modeq_major_modek_major_modev_major_modeo_layouttcgen05OperandMajorModeKRuntimeErrorMN	TypeErrorr~   re   	use_tma_Oe2e_freqrN   r    r!   r"   CtaGroupONEOperandSourceTMEMsm100_utils_basicmake_trivial_tiled_mmarW   rU   rX   rV   rY   cluster_shape_mnktiled_divider   thr_idr   cluster_layout_vmnkepi_tilemake_smem_layout_arS   make_smem_layout_bry   rz   make_smem_layout_epir{   rP   maxouterr   r|   make_composed_layoutinnerr   r
   CopyBulkTensorTileG2SOpCopyBulkTensorTileS2GOpnvgpumake_tiled_tma_atom_Amake_tiled_tma_atom_Bcompositionmake_identity_layoutra   rb   rf   rg   num_epilogue_threadsmake_tiled_tma_atomrx   make_copy_atomCopyUniversalOpmake_ordered_layoutr#   make_tiled_copy_tvsize_in_bytestma_copy_q_bytestma_copy_k_bytestma_copy_v_bytesr   r   r%   r   r   r   ceil_divsizerT   to_underlying_argumentstile_scheduler_clsget_grid_shapembar_load_q_full_offsetmbar_load_q_empty_offsetmbar_load_kv_full_offsetmbar_load_kv_empty_offsetmbar_P_full_O_rescaled_offsetmbar_S_full_offsetmbar_O_full_offsetmbar_softmax_corr_full_offsetmbar_softmax_corr_empty_offsetmbar_corr_epi_full_offsetmbar_corr_epi_empty_offsetmbar_s0_s1_sequence_offsetmbar_tmem_dealloc_offsetmbar_P_full_2_offsetr   r[   r   r   shared_storagerL   log2er   r   kernellaunchrh   )=r>   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   LSE_layout_transposeV_layout_transpose	cta_groupp_sourcep_major_modetiled_mma_qktiled_mma_pv	tP_layout	sV_layout	sO_layout	stride_sK	stride_sVstage_strideshape_Q_packedstride_Q_packedshape_O_packedstride_O_packedshape_LSE_packedstride_LSE_packedtma_load_optma_store_op
tma_atom_Qtma_tensor_Q
tma_atom_Ktma_tensor_K
tma_atom_Vtma_tensor_Vo_cta_v_layout
tma_atom_Ogmem_tiled_copy_Ouniversal_copy_bitsasync_copy_elemsatom_universal_copytO_shape_dim_1	tO_layout	vO_layoutTileSchedulertile_sched_argstile_sched_paramsgrid_dimr   LOG2_Esoftmax_scale_log2softcap_valr?   )r   r   r   r   r   r   r>   r@   __call__   s  #"



* ,

"HH2<2<*4

		00<




z#FlashAttentionForwardSm100.__call__r/  r1  r3  r6  rC  rD  r   r   r!  r"  r#  r7  r  r   r@  c           :         sp  t jt j }|dkr't| t| t| t|dur't| tj	 }|
 j}|j } |dkrdt jD ]"}!t j|  j |! t jg t j|  j |! t jg qA|dkrtdD ]"}!t j|  j |! t jjd  t j|  j |! t jjd  qm|dkrt jrtdD ]}!t j|  j |! t jj q|dkrt jD ](}!t j|  j |! t jjt j  t j|  j |! t jjt j  q|dkr'tdD ]:}!t j|  j |! t jjt j t j   t j|  j! |! t jg t j|  j" |! t jg q|d	krHtdD ]}!t j|  j# |! t jjt j   q1|d
krgt j|  j$ t jjtg  j  j% jR    &|  j' }"|j(j)|j*|j+d}#|j,j)|j*|j+d}$t -t .|$j/|j+|j*}%t j0 r|j1j)|j*|j+d}&nt -t .|#j/|j+|j*}&|j2)t 3 j j4 d }'|5d}(|5d})|(6 j7d  j7d f}*|(8|*}+t j9t:dt j;j<dd},t -|,|+j=|)6 j>d  j>d f}-|)8|-t? fddt@dD }.t? fddt@ jD }/t -j/|j*}0|)A|0d  fddt@dD }1tB jCd  jCd  jD jE||t jFrN jGndd}2tHtIt jF r`|jJd n|jJd d t|
du rr|jJd n	|jJd |
jJd  ||||	d}3tHtK j4 jL||t jFr jGndd}4tH jMjN|}5tt jOdkr| jOd krt jP jQ | jkrt jP jR  S|(|)||||#|$|%|
||||"| |2|3|5 | jkr:t jP jR tT jU}6| jkrt jV|6|jW t jX   Y|||#|$|%|j+|j+|j+|.|/|1|"| |2|3|5 t jZ  t j[|  j$ d tT jU}6t jj\t:d|jWd},t j]|,|6 | jd kr\| jd kr\t jP jR  ^||&||| |3|5 | jd k rt j_ j` tH ja||(|'||| |2|3|4|5d}7t j rtT| j%d k rdnd}8|7|8t -j/|8dkr jbd n jbd  j=d t jc|  j$  nP| j%d k rt -j/ jbd  j=}9|7d|9d t jc|  j$  | jd k r| j%d krt -j/ jbd  j=}9|7d|9d t jc|  j$  | jd kr6| jk r6t jP jd  e|(|)|/|'|||&||| ||2|3|5 t jc|  j$  dS )a?  The device kernel implementation of the Fused Multi-Head Attention.

        This kernel coordinates multiple specialized warps to perform different phases of the FMHA computation:
        1. Load warp: Loads Q, K, V data from global memory to shared memory using TMA
        2. MMA warp: Performs matrix multiplications (Q*K^T and P*V)
        3. Softmax warps: Compute softmax normalization on attention scores
        4. Correction warps: Apply adjustments to intermediate results
        5. Epilogue warp: Handles final output transformation and storage

        The kernel implements a complex pipeline with overlapping computation and memory operations,
        using tensor memory access (TMA) for efficient data loading, warp specialization for different
        computation phases, and optional attention masking.
        r   Nr   r'   r+   r*   r/   r,   r-   r.   )swizzler&   )	mem_spaceassumed_alignc                 3   *    | ]}t j j|  jV  qd S N)rd   r   r   r8   r   r;   stage)r>   tStSr?   r@   r         "z4FlashAttentionForwardSm100.kernel.<locals>.<genexpr>c                 3   rI  rJ  )rd   r   r   rk   r   rK  )r>   tOtOr?   r@   r     rN  NNNr   c                    s6   g | ]}t j jj jj  j|   jqS r?   )rd   r   r   rW   rx   rw   rm   r   rK  )r>   tOrPr?   r@   rA     s    z5FlashAttentionForwardSm100.kernel.<locals>.<listcomp>)r   )seqlen_q_staticseqlen_k_staticr   r   r   r   )r   r   r   )	alignmentptr_to_buffer_holding_addrr7   )
rC  
thr_mma_qkr   r   r   r   
block_infoSeqlenInfoClsAttentionMaskClsTileSchedulerCls)rL  tStSi)frd   re   make_warp_uniformwarp_idxr
   prefetch_descriptorr	   r   r   SmemAllocatorallocater  r   data_ptrrange_constexprrS   mbarrier_initr  rg   r`   r  r_   r  rf   r  rZ   r  r  r^   r  ra   r  r\   r  r  r  r  r]   make_and_init_load_kv_pipeliner	  r   
get_tensorr   r   r   r   
recast_ptrr   r[   r   r   r   r#   	get_slicepartition_shape_CrU   make_fragment_Cmake_ptrr   AddressSpacetmemr   rV   tuplerj   make_fragment_Ar   rT   r    r!   r"   r   r   r   r   r   r$   r  createrb   warpgroup_reg_deallocrr   rq   loadr   rc   
alloc_tmemr   	sync_warpmmarelinquish_tmem_alloc_permitmbarrier_waitretrieve_tmem_ptrdealloc_tmemepilogue_s2gwarpgroup_reg_allocro   softmax_loopr8   mbarrier_arriverp   correction_loop):r>   r   r   r   r   r   r   r   r   r   r   r/  r1  r3  r6  rC  rD  r   r   r   r   r   r!  r"  r#  r7  r  r   r@  r]  r   storager   r<   pipeline_kvr   r   sVr   r   rV  
thr_mma_pvqk_acc_shape	tStS_faketmem_ptrpv_acc_shapetStSstOtOstPtOrPsrW  rX  rY  rZ  rc   r{  rL  r[  r?   )r>   rQ  rO  rM  r@   r    s  .





 " "
&(
0 "
*




$*




 , z!FlashAttentionForwardSm100.kernelrV  r  r   r   r  r  r   rW  rX  rZ  c           1         s4  t d}tjtjjj| j}| }| }|jr|j	\}} | }t
|j r3|d d | f }nt
| j r<|jnd|jf}t|df|d d |f }t|tj| jddgdd}t
| j rh|| j n|t
|	d u rt
|j r fdd||fD \}}nt|jdf|d d f }td|jf|d d f }t|tj| jddgdd}t|tj| jddgdd}n+fd	d||fD \}}t|tj| jddgdd
}t|tj| jddgdd}||} ||}!||}"t|
dtdt|ddt| dd\}#}$t|dtdt|ddt|!dd\}%}&t|dtdt|ddt|"dd\}'}(t| j|
|$|#|| j || j  |d})t| j!||&|%|| j" || j# dd}*t| j!||(|'|| j" || j# dd}+|$||\},}-|)| j%| d dd t
|	d ur|	 |-d f nd }.|*|-d ||.d |&  t
| j%dkr|)| j%| d dd |dN }|+|-d ||.d |&  tj'|-d |, ddD ].}/|-d |/ }0t
|	d ur|	 |0f nd }.|*|0||.d |&  |+|0||.d |&  q|(  |)  |* }|jsd S d S )Nr   r   r'   r   Nr   c                    s   g | ]
}|d d  f qS rJ  r?   r   	batch_idxhead_idx_kvr?   r@   rA   k  rC   z3FlashAttentionForwardSm100.load.<locals>.<listcomp>)r   Nc                    s   g | ]
}|d d  d f qS rJ  r?   r   )r  r?   r@   rA   s  rC   )Nr   N)r   NNr*   )phaser   )K_or_VV)r   rL  )r   producer_statepage_idxunroll)+r   r   pipelinemake_pipeline_statePipelineUserTypeProducerry   initial_work_tile_infois_valid_tiletile_idxr	   has_cu_seqlens_qr"   offset_qrd   domain_offset
local_tiler   rU   r   has_cu_seqlens_koffset_krV   partition_Apartition_Br
   tma_partitionr   group_modesr   load_Qr  r  load_KVr	  r
  get_n_block_min_maxrS   advancerj   prefetch_next_workadvance_to_next_workget_current_work)1r>   rV  r  r   r   r   r   r   r  r   r/  r1  r3  r  r   rW  rX  rZ  q_producer_phasekv_producer_statetile_scheduler	work_tilem_blockhead_idxseqlenmQ_curoffsetgQmK_curmV_curgKgVtSgQtSgKtOgVtQsQtQgQtKsKtKgKtVsVtVgVr  load_Kload_Vn_block_minn_block_maxr  r<   n_blockr?   r  r@   rq  D  s    





"zFlashAttentionForwardSm100.load
sQ_swizzle
sK_swizzle
sV_swizzler  r  r  c           /   
      s  | d}| d}|}||}||}tjdkr)|d |d fn|d |d f|j|j fddtdD } fddtdD }td}tj	
tj	jjj}td}| }| }|jr|j\}}}||} || |\}!}"tjD ]`}#tj|j |# | t|#dkr|| |d d d |jf }$|d d d |jf }%tjrň|%|j|j}%||# |$|%d tj  t|j |#  W d    n1 sw   Y  q|d	N }| | |!  d
}&tj|"d	 |! d	dD ]}'|| |" }(|j|j})}*|d d d |)f }+tdD ]}#tj|j# |# | |d d d |)f },tjrI|,|)|*},||# |+|,|& |j$ |# |d t|#d	krj| |( |(!  t|#dkrz|!  || |j|j}-}.|d d d |-f }%tjr|%|-|.}%||# |d d d |-f |%d tj  t|j |#  W d    n	1 sw   Y  q&| | |!  |d	N }d}&qtj  tjD ]}#t|j% |#  qW d    n	1 sw   Y  || |j|j})}*|d d d |)f }+tdD ]U}#tj|j# |# | |d d d |)f },tjr=|,|)|*},||# |+|,|& |j$ |# |d tj  t|j& |#  W d    n	1 siw   Y  q|d	N }| | |!  |'  |( }|jstd S d S )Nr   r'   rP  )NNNr   c                    s<   g | ]}t tj j| | d d d |f ddqS )NT)sA
sA_swizzle
sB_swizzle	zero_init)r   sm100_utilsgemm_ptx_partialr8   rK  )	qk_mma_opr  r   r  r>   tSrQsr?   r@   rA         z2FlashAttentionForwardSm100.mma.<locals>.<listcomp>c                    s<   g | ]}t tj jjd kr|nd | dddqS )r'   r   N)r  r  r  )r   r  r  rk   rS   rK  )	pv_mma_opr  r>   r  r?   r@   rA     r  )tCrBsBr   Fr  )r  r  r  r   
mbar_phaseT))rg  rn  make_fragment_Br	   rS   oprj   r   r   r  r  r  Consumerry   r  r  r  r  rb  rd   re   rv  r  consumer_waitindexr|   offset_kv_smemr  	elect_oner   commitr  consumer_releaser  cloner  r  r  r  r  r  )/r>   r  r   r   r   r  r  r  r  r  r  r  r  r   rW  rX  rZ  rV  r  tSrQtSrKtOrVgemm_Sigemm_Pimma_q_consumer_phasemma_kv_consumer_stateP_full_O_rescaled_phaser  r  r  r  r  r  r  r  rL  tSrKisK_curO_should_accumulater<   mma_kv_release_stateVi_indexVi_phasetOrVisV_curKi_indexKi_phaser?   )	r  r  r  r   r  r  r>   r  r  r@   rt    s   




	



"	



"
 zFlashAttentionForwardSm100.mmarL  r[  r   rY  c           7      C   s  t j d t jjt| j  }t | jd | jd f}||}t 	|j
t | jdf}t |j|}t 	|j
t | jdf}t |j|}| jd d | jj }t 	|j
t | j|f}t |j| j |}t tjtjdt}t|||}||}t tjtjdt}t|||}||}||j}t tjtjdt}t||}||} | |}!td}"td}#t|dkrdnd}$t jt j  d }%| j!|% }&| }'|'" }(|(j#rB|(j$\})}*}+|
|+},|	%|,|)\}-}.||,j&|,j'}/t(|/j)|)d | ||| j*| j+d}0t,|t-| j.jdkr,dnd	d
}1|1/  t(| j0|1||&||| ||||!||d}2t j1|| j2 | |# |#dN }#|2|"|#|$|.d dt(|0ddd\}"}#}$|.d8 }.t-| j*pt| j+r|	3|,|)|-}3t4j5|.|3 ddD ]}4|.d |4 }5|2|"|#|$|5t(|0ddd\}"}#}$qt46|.|3}.|	7|,|)|-}6t4j5|.|6 ddD ]}4|.|4 d }5|2|"|#|$|5\}"}#}$qt-| j+o|	j8durt46|.|6}.t4j5d|.|- ddD ]}4|.d |4 }5|2|"|#|$|5t(|0ddd\}"}#}$q|1j9d |||| j  < t-|dup|dur*|1j:d |||| j  | jd  < t j;|| j< |  |'=  |'> }(|(j#sdS dS )a  Compute softmax on attention scores from QK matrix multiplication.

        This method handles the softmax computation for either the first or second half of the
        attention matrix, depending on the 'stage' parameter. It calculates row-wise maximum
        and sum values needed for stable softmax computation, applies optional masking, and
        transforms raw attention scores into probability distributions.

        The implementation uses specialized memory access patterns and efficient math operations
        for computing exp(x) using exp2 functions. It also coordinates pipeline
        synchronization between MMA, correction, and sequence processing stages.
        r   r   r   r&   r+   r'   )r  thr_mmathr_tmem_loadmask_causal
mask_localg       @        )rescale_threshold)softmaxr   r  rV  r  thr_tmem_storethr_tmem_store_scaletStS_t2rtStScale_r2ttStP_r2tr   rL  T)mask_seqlen)is_firstmask_fnr  F)r  N)?rd   re   
thread_idxrf   rg   r\   make_identity_tensorrU   partition_Cr   r   r   r#   r   r   r   rx   rB   r   r   copy
Ld32x32bOp
Repetitionr   make_tmem_copyrg  partition_S
St32x32bOppartition_Dr   r   r\  r]  r  r  r  r  r  seqlen_qseqlen_kr   apply_mask_sm100r    r!   r   r	   rw   resetsoftmax_steprv  r  !get_n_block_min_causal_local_maskr   rj   min!get_n_block_min_before_local_maskr   row_sumrow_maxr|  r  r  r  )7r>   rL  rC  rV  r[  r   r   r   r   rW  rX  rY  rZ  tidxcS_basetScStStS_scale_layouttStScaletScS_vec_layouttScS_vectilePlikeFP32tStP_layouttStPtmem_load_atomr  r  tmem_store_scale_atomr  r  tSrScale_r2t_shapetmem_store_atomtiled_tmem_storer  r   mma_si_consumer_phasesi_corr_producer_phases0_s1_sequence_phasewarp_idx_in_wgr  r  r  r  r  r  r  r  r  maskr  r  r  n_block_min_causal_local_maskn_tiler  n_block_min_before_local_maskr?   r?   r@   r{  x  s   





"&$$"z'FlashAttentionForwardSm100.softmax_loopr'  r(  r)  r  r  r  r  r  r  r  r  r   r  r  returnc           !   	   C   s  | j d tj | jj }|t| j d | j d f}t|jt	| j
df}t|j|}t|jt	| j
|f}t|j|}|	|j}tj|| j | | t|| j}t|	|| t|durp|||d || |\}}t| r|	j}||||| j
  < tj|| j |  ||| t| jrtj|| |d  | t|
|jt}ttj|j| jd|j}|j |||du o| j!dk| j"d t| jrtj|| d| d   t#$t%|jd	 d d
 D ]} t|
|dd| f |dd| f  qtj&  tj|| j' |  t#$t%|jd	 d d
 t%|jd	 D ]} t|
|dd| f |dd| f  q3tj&  tj|| j( |  tj|| j) | | |*| || |dA |dA |dA fS )aN  Perform a single step of the softmax computation on a block of attention scores.

        This method processes one block of the attention matrix, computing numerically stable
        softmax by first finding the row maximum, subtracting it from all elements, applying
        exponential function, and then normalizing by the sum of exponentials. It also handles
        optional masking of attention scores.

        The method involves several key operations:
        1. Loading attention scores from tensor memory
        2. Applying optional masking based on position
        3. Computing row-wise maximum values for numerical stability
        4. Transforming scores using exp2(x*scale - max*scale)
        5. Computing row sums for normalization
        6. Coordinating pipeline synchronization between different processing stages
        r   r   N)r  r+   )dtyper   )e2er   r'   r*   )+rU   r   rx   r   r  rd   r  r   r   r   r#   r   r   r  r   re   rv  r  make_fragmentrW   r  r	   update_row_maxrq  thr_idxr|  r  scale_subtract_rowmaxrZ   r  rf  rw   apply_exp2_convertrN   r   r   rb  r  fence_view_async_tmem_storer  r  r  update_row_sum)!r>   r'  r(  r)  r  r  r   r  rV  r  r  r  r  r  r   r   rL  r  r  r  r  r  r  tScP_layouttScPtScS_t2r_shapetSrS_t2rr  	acc_scaler  tSrP_r2t_f32tSrP_r2tr<   r?   r?   r@   r  "  sP   % 


"$
0&
z'FlashAttentionForwardSm100.softmax_steprM  r   c           6         s  | t jd  jd f}tjt jdft fddt	dD t|jt jdf}t
|j|}ttjtjd j}t|d }tj d tjjt j  }||fddt	dD }|j}tj| j d  tj| j d  td}td}td}| }| }|jr|j \}}}||} |!| |\}!}"tj"| j# d | tj| j$ d  tj"| j# d | |dN }t%|t&}#t'j	|"|! d ddD ]Z}$t'(dD ]N}%tj"| j# |% | |||% j   }&tj)|&d	k dk}'|'r2 *|| j+dkr,|%nd ||& tj| j |%  tj| j$ d|%   q|dN }qtj| j$ d  d g j+ }(d g j+ })t,|	d urt, j- rt&|	| }*|*g j+ })n&t'( j+D ]}% j+| |%  j |  j. | j.  }+t&|	|+ |)|%< qt'( j+D ]}%tj"| j# |% | |||% j   },t,|d up|	d ur|||% j   jd   }-nd }-tj| j$ |%  t,|	d ur	t/0t/j1}.|,t23|)|% |. |-|  7 },|,d
kp|,|,k}/|,|-|/f|(|%< tj4|/s!|,nd	}&tj"| j5 |% | tj"| j6 |% |  7|||% ||&|d d |%f  tj| j8 |%  tj| j |%  qt,|d urt,| j9 rz|d ||f }0nt, j- r| j:nd| j:f}1t;|1f|d |f }0t'( j+D ]S}%t<|0 jf j+| |% f}2|(|% \},}-}/t/=d}3|/s|-| t2>|, |3 nt&j? }4t, j- r| j@n| j@ j. }5||5 j+| |%  j  k r|4|2|< q|dN }|dN }|dN }|A  |B }|jsd S d S )Nr   r   c                 3   s(    | ]}t j j|  V  qd S rJ  )rd   r   r   rn   rK  )r>   rM  r  r?   r@   r     s     z=FlashAttentionForwardSm100.correction_loop.<locals>.<genexpr>r'   c                    s   g | ]	}  | qS r?   )r  rK  )	tStScalesthr_tmem_load_vecr?   r@   rA     s    z>FlashAttentionForwardSm100.correction_loop.<locals>.<listcomp>r  g      ?r  g       @)Cr  rd   r  rU   r   r   r   r#   rm  rj   r   r   r   r   r  r  r	  rW   r
  re   r  rf   rg   r^   rg  r  r   r|  r  r   r  r  r  r  rv  r  r  r2  r   r   rb  vote_ballot_synccorrection_rescalerS   r	   r"   r   rL   r  r  r   exp2f
rcp_approxr  r  correction_epiloguer  r  r  r  r  loglog2finfr  r  r  )6r>   rV  r  rM  r  r   r   r   r   r   r6  r   rC  rW  rX  rZ  r  r  r  tmem_load_v_atomtiled_tmem_load_vecr  tStScales_t2rtSrScale_t2r_shapesoftmax_corr_consumer_phaseo_corr_consumer_phasecorr_epi_producer_phaser  r  r  r  r  r  r  r  tSrScale_t2rr<   rL  scaleshould_rescalestatslearnable_sink_valsink_val
q_head_idxr  r  rB  acc_O_mn_row_is_zero_or_nanmLSE_curr  gLSELN2lser  r?   )r>   rM  r  r@  rA  r@   r}    s     

(
 z*FlashAttentionForwardSm100.correction_loopr  rO  r  rR  c                 C   s  t | jd | jd f}||}d}t tjtj|| j	}t tj
tj|| j	}	t |jt | j|f}
t |jt | j|f}t |j|
}t |j|}t||}t|	|}||}||}||}||j}||}| j| }t ||f| j	}t|D ]h}|d|f }t |jt |jd }t |j|}t |j||  |j}t ||| tdt |dD ]}t j|| ||d  f||f\||< ||d < qt |j||  |j}t ||| qt j  dS )a  Rescale intermediate attention results based on softmax normalization factor.

        This method performs a crucial correction step in the attention computation pipeline.
        When processing attention in blocks, the softmax normalization factors may change
        as new blocks are processed. This method rescales previously computed partial
        output values to account for updated normalization factors.

        The implementation uses efficient tensor memory operations to:
        1. Load existing partial attention output from tensor memory
        2. Apply the scaling factor to all elements
        3. Store the rescaled results back to tensor memory
        r   r   r&   Nr'   )rd   r  rV   r  r   r   r  r  r	  rX   r  r   r   r   r#   r   r   r
  rg  r  r  r   r9   r2  r   rb  r  re   mul_packed_f32x2r7  )r>   r  rO  r  rR  cOtOcOcorr_tile_sizer"  r%  tOtO_i_layouttOcO_i_layouttOtO_itOcO_itiled_tmem_loadr&  r  r  tOtO_t2rtOrO_t2r_shapetOtO_r2t	frg_counttOrO_frgr<   
tOrO_frg_itTMrO_i_layouttTMrO_i
tOtO_t2r_ij
tOtO_r2t_ir?   r?   r@   rC  3  sJ   





z-FlashAttentionForwardSm100.correction_rescalec                 C   s   t | jd | jd f}d| jj }||}||}	t |t | j|f}
t |	t | j|f}t |t | j|f}| j	d |f}t
j| j| j| j| j|dd}t||
d }||}t
| j| j| j|}t j||j|jd}||
d }||d }||d }t| j| D ]i}|d	dd|f }|d	dd|f }t |d	dd|f j| j}t ||| tdt |d
D ]}t j|| ||d  f||f\||< ||d < qt |j| j}|  }|!|"| j t ||| qt jj#t jj$j%t jj&j'd d	S )as  Apply final scaling and transformation to attention output before writing to global memory.

        This correction_epilogue function handles the final processing step for attention output values.
        It applies a scaling factor to the accumulated attention results and prepares the
        data for efficient transfer back to global memory.

        The method performs:
        1. Loading of accumulated attention results from tensor memory
        2. Application of the final output scaling factor
        3. Type conversion if necessary (typically from higher precision accumulator to output precision)
        4. Reorganization of data for optimal memory access patterns
        5. Preparation for efficient TMA store operations

        :param thr_mma: Thread MMA operation for the computation
        :type thr_mma: cute.core.ThrMma
        :param tOtO: Tensor containing accumulated attention output
        :type tOtO: cute.Tensor
        :param scale: Final scaling factor to apply to the output
        :type scale: Float32
        :param sO: Shared memory tensor for the final output
        :type sO: cute.Tensor
        r   r      F)use_2cta_instrs)NNr   )	layout_tvtiler_mn)rs  NNr'   )space)(rd   r  rV   r   rx   r  logical_divider   r#   r   r   get_tmem_load_opr   rX   r   r
  rg  get_smem_store_opmake_tiled_copylayout_dst_tv_tiledru  r  r  r   rb  r9   r2  r   r  r  re   r]  rq  storetofence_proxy	ProxyKindasync_sharedSharedSpace
shared_cta)r>   r  rO  r  rR  r   r^  r`  tOsOr_  rc  rd  tOsO_iepi_subtiletmem_copy_atomre  r  smem_copy_atomtiled_smem_storerf  tOsO_s2rtOcO_t2rr<   rn  
tOsO_r2s_irj  ro  tSMrOo_vecr?   r?   r@   rF  r  sZ    

	

z.FlashAttentionForwardSm100.correction_epiloguec                 C   sp  t d}| }	|	 }
|
jr|
j\}}}||}t|j r(|d d ||f }nt| j r1|jnd|jf}t	|df|d d |f }t
|| j| jfd}t| jrt|dtdt|ddt|dd\}}t| jD ]'}tj|| j | | t||d |f |d | j| | f  tj  qqt| jD ]}tjjd| dd tj|| j |  qntj d tjjt| j   }|!|}|"|}t#| j| jf}|$|}|"|}|!d"|}t%j&||j'd d}| jrJ t(| j| j| j)| j*}t| jD ]}tj|| j | | t+|d | j,}t-|d d d |f | t| j rtt.|j'd D ]D}|d|df d |j/| j| | | j  |d d  k rtj||d |d f |d |d | j| | f | j)r|d |d f nd d	 qBn|0||||| j| | |j/ tj|| j |  q|dN }|	1  |	2 }
|
jsd S d S )
Nr   r  r   r'   T)read)limitrP  )pred)3r   r  r  r  r	   r  r"   r  rd   r  r  r#   r9   r   r
   r  r   r  r   rb  rS   re   rv  r  r  cp_async_bulk_commit_groupcp_async_bulk_wait_groupr|  r  r  rf   rg   ra   rg  r  r  r  r   predicate_kr   r   rR   r   make_fragment_liker   autovec_copyr  r  store_Or  r  )r>   r   r   r7  r6  r   rX  rZ  epi_consumer_phaser  r  r  r  r  r  mO_curr  gOr  tOgOrL  r  gmem_thr_copy_Or^  r_  t0OcOtOpOr"   tOrOrest_mr?   r?   r@   ry    st   
( 




: 	z'FlashAttentionForwardSm100.epilogue_s2gtma_atomr  r  mbar_full_ptrmbar_empty_ptrr   r  c	           	      C   sz   t j|| | t j  t j|| | j W d    n1 s#w   Y  t j||d |f |d |f || d d S )Ntma_bar_ptr)rd   re   rv  r  mbarrier_arrive_and_expect_txr   r  )	r>   r  r  r  r  r  r   rL  r  r?   r?   r@   r    s   
z!FlashAttentionForwardSm100.load_QtXgXtXsXr  r  r  c
                 C   s  |dv sJ t |dkr| jn| j}
|j|j}}tj|| | t |dko)| jr8|dkr8tj|d | tj	  tj
|| |
 W d    n1 sQw   Y  |d |f }t | jrj| |||dA }t |	d u rv|d |f n|d d|	f }tj||||| d d S )N)r   r  r   r   r   r  )r	   r   r  r  r  rd   re   rv  r|   r  r  r  r  )r>   r  r  r  r  r  r   r  r  r  tma_copy_bytesrL  r  tXsX_curtXgX_curr?   r?   r@   r  +  s   
&z"FlashAttentionForwardSm100.load_KVsXc                 C   s@   t | jr|dkrdn| jdd|   }t|j| |jS |S )Nr   r   r'   )r	   r|   r}   rd   r   r   r   )r>   r  rL  r  r  r?   r?   r@   r  K  s   
z)FlashAttentionForwardSm100.offset_kv_smemc                 C   sT   t jt jjjt| jg}t jt jjjt| jg}t jjj	|| j
||| jdS )N)barrier_storage
num_stagesproducer_groupconsumer_grouptx_count)r   r  CooperativeGroupAgentThreadrg   r`   r_   PipelineTmaUmmaro  ry   r   )r>   load_kv_mbar_ptrload_kv_producer_groupload_kv_consumer_groupr?   r?   r@   rd  V  s   z9FlashAttentionForwardSm100.make_and_init_load_kv_pipeline)Nr   FFFr   r   T)	NNNNNNNNN)NFrJ  )3r   r   r   re   rK   r   r   	Constexprboolrv   r~   rd   jitTensorr   cudaCUstreamfloatr   rE  r  CopyAtomComposedLayout	TiledCopyTiledMmar   coreThrMmar  PipelineAsyncPointerr   r   rq  Swizzler   rm  rt  r{  r   r  r}  rC  rF  ry  r  PipelineStatestrr  r  rd  r?   r?   r?   r@   r   3   s   	

i
	



  V	
  9	
s	
 @	
 *	
a	
 />UP	
	

r   )5enumrL   typingr   r   r   r   	functoolsr   cuda.bindings.driverbindingsdriverr  r   cutlass.cuterd   r   r   r	   cutlass.cute.nvgpur
   cutlass.cute.nvgpu.tcgen05r   r   cutlass.utils.blackwell_helpersr   r   r   flash_attn.cute.utilsflash_attn.cute.maskr   flash_attn.cute.softmaxr   flash_attn.cute.seqlen_infor   flash_attn.cute.block_infor   flash_attn.cute.pack_gqar   flash_attn.cuter   
sm100_descr  flash_attn.cute.fast_mathr   flash_attn.cute.tile_schedulerr   r   r   r   r   r   r   r?   r?   r?   r@   <module>   s,    