o
    پi
~                     @   s   d dl mZmZ d dlmZ d dlZd dlmZ d dlmZm	Z	m
Z
 d dlm  mZ d dlmZ ejddejd	e	d
ededdf
ddZejdejde	deddfddZeddG dd dZdS )    )OptionalCallable)	dataclassN)Float32Int32
const_expr)SeqlenInfoQKZ   FX	col_limitarchrank1returnc                 C   s.  t |dkr|d d t|d d }n|}t |s&t| jt| d  nt| j}tt|dD ]^}t	||d  d}d|> d }ttd||d  D ]@}	t
|d|	> @ }
|d |	 }t |ru|
rn| | ntj | |< qStt| jd D ]}|
r| ||f ntj | ||f< qqSq6d S )Nr	               r   )r   mincutesizeshaperankcutlassrange_constexprceil_divmaxBooleanr   inf)r
   r   r   r   col_limit_transformedncolscol_limit_right_smaskiin_boundcr r'   O/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/mask.pymask_r2p   s"   ."r)   row_limit_topnum_repc                 C   s   d}|||  | t |||  | }t| j}tt|dD ];}t||d  d}d|> d }tt d||d  D ]}	t|d|	> @ }
|d |	 }|
rVt	j
 n| | | |< q?q"d S )Nr   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   )r
   r*   r+   num_wgrow_limit_top_transformedr   r    row_limit_top_sr"   r#   	out_boundr%   r'   r'   r(   mask_r2p_transposed,   s   r0   T)frozenc                    @   s  e Zd ZU eje ed< eje ed< eed< dZe	e
 ed< dZe	e
 ed< dZeje ed< d	Zeje ed
< ede
fddZede
fddZej					d)dejdej
dej
dej
dej
dejdeje deje deje deje	e  de	e ddfddZej									d*dejde
de
dejdejdeje deje deje deje	e  de
de
de	e deddfd d!Zej								"d+dejd#ejd$ejdej
dej
dejdejdejdeje	e  de
de
de	e d%ed&eddfd'd(ZdS ),AttentionMasktile_mtile_nseqlen_infoNwindow_size_leftwindow_size_rightr   qhead_per_kvhead_packgqaFswap_ABr   c                 C      | j jS N)r5   seqlen_qselfr'   r'   r(   r<   Q      zAttentionMask.seqlen_qc                 C   r:   r;   )r5   seqlen_kr=   r'   r'   r(   r@   U   r?   zAttentionMask.seqlen_kNNacc_S	batch_idxhead_idxm_blockn_blockthr_mmamask_seqlenmask_causal
mask_localmask_modaux_tensorsc           >   	   C   s  |r|	rJ dt j|| jd}| j| jf}t| js|n|d d d }t j||| jd}t j|d|| jd}t	| j rFdnd}t	| j rPdnd}|d | }|dk r^d}| j
|| j  | }t	| or|	 or|
d u rt	|rt	d}t	| rtjt|jd ddD ],}|d|f | |k}tjt|jd ddD ]}|rtj n|||f |||f< qqd S t||d	d
 d S d S t	| o|	 o|
d urt	t|jd }t	t|jd }t	|d uo|d d uo|d d u}t	|o|ot	|d u}t|D ]}||df | }||| j  }|} |}!t	| jdkr<|| j }"|| j |" }!|| j } | }#t	|rLt| |d \}$} t|D ]}%|d|%f | }&||& || j  }'|'}(t	|rtt|'|d \}$}(t |tj})t |!tj}*t | tj}+t |(tj},|
|)|*|+|,| j|}-tt |-}.t	|r|#| jkp|'| j
k}/|/rtjj |||%f< qQ|.r|||%f ntjj |||%f< qQ|.r|||%f ntjj |||%f< qQqd S t	| j r}|jjd d }0d }1t	| jdkr<| jrJ dtjj|0 dksJ dt|jd |0ks'J |j}2|| j ||2|0 df d  | j }1d| j
 || j  | j | }3t	|rt	| j }tjt|jd ddD ]p}t	| jdkr{||df d || j  }4n
t j|1||0 |0d}4|4|3 }5t	|rt |5|}5t	| rtjt|jd ddD ]}|d|f d |5krtj n|||f |||f< qqct||d f |5d	dd qcd S t	| j!d ur|3| j! nd }6t	| j"d ur|3d | j" nd }7tjt|jd ddD ]w}t	| jdkr||df d || j  }4n
t j|1||0 |0d}4t	| j!d ur2|4|6 }5n| j}5t	|r@t |5|}5t	| j"d urL|4|7 nd}8tjt|jd ddD ]}|d|f d }9|9|5kso|9|8k rwtj |||f< q[qd S | jdksJ |d | }:|| j || j  |: }3t	|rtjt|jd ddD ]D}|d|f | };|;|kr|r| jn|;|3 }<tjt|jd ddD ]}||df | |<k rtj n|||f |||f< qϐqd S tjt|jd ddD ]R}|d|f | };|;|kr| jn|;|3 | j! }<|;|3 | j" }=tjt|jd ddD ]#}||df | }4|4|<k s>|4|=krBtj n|||f |||f< q*qd S )N.mask_causal and mask_local cannot be both True)	transposer   r   FTunroll_fullr	   )r   z&swap_AB with PackGQA not supported yetz%threads_per_row must divide WARP_SIZE)widthr   r   )#utilsmake_acc_tensor_mn_viewr9   r3   r4   r   make_identity_tensorpartition_C	get_slicer   r@   r   ranger   r   r   r   r)   r   r8   divmodscalar_to_ssar   r5   r   ssa_to_scalarr<   tv_layout_Cr   	WARP_SIZEthr_idxshuffle_syncr   r7   r6   )>r>   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   fastdiv_modsacc_S_mn	acc_shapecStScS_mnt0ScS_mnROWCOLthr_col_offsetseqlenk_col_limitr2pr%   oobr&   nrowr   has_fastdivwrap_aux_indices	local_rowglobal_row_idxrow_for_modhead_idx_for_modhead_offsetrow_for_seqlen_colcol_idx_localglobal_col_idxcol_for_modbatch_idx_ssahead_idx_ssa	q_idx_ssa
kv_idx_ssa
mask_valuecondout_of_boundsthreads_per_row	mma_m_idxtidxcausal_row_offsetrow_idxcol_limit_rightlocal_row_offset_rightlocal_row_offset_leftcol_limit_leftcol_idxthr_row_offsetcol0r*   row_limit_botr'   r'   r(   
apply_maskY   sf  
"






((/










zAttentionMask.apply_maskthr_tmem_loadcheck_q_boundaryc           /      C   s  |r|rJ d| j | jf}t| js|n|d d d }||}||}|dk r-d}| j|| j  }d}t| oA| oA|	d u rzt|rxt| rnt	j
t|jddD ]}|| d |kretj n|| ||< qWd S t||ddd d S d S t| o| o|	d urt|d uo|d d uo|d d u}t|
t	j}tt|j}t	|D ]}| js|| d n|| d }| js|| d n|| d }||| j   }||| j  }t| jdkr|| j }|| j | }|| j }n|}|}|} t|o|d ur|rt||d \}!} |}"t|o|o|d ur(t||d \}!}"t|t	j}#t| t	j}$t|"t	j}%|	||#|$|%| j|}&t	t|&}'|'rV|| ntj ||< t|rq|| jkrktj n|| ||< |r|| jkr~tj n|| ||< qd S d| j || j  | j }(|d d || j   })t| jdkr|)| j })t|r|)|( }*t|rt	|*|}*tt|j}t| rt	j
|ddD ]}|| d |*krtj n|| ||< qd S t||*ddd d S t| jd ur|(| j nd }+t| jd ur|(d | j nd },t| jd ur%|)|+ }*n| j}*t|r3t	|*|}*t| jd ur?|)|, nd}-t	j
t|jddD ]}|| d }.|.|*ks^|.|-k rbtj n|| ||< qLd S )	NrM   rO   r   TrP   r   d   rS   )r3   r4   r   rV   r9   rW   partition_Dr@   r   r   rY   r   r   r   r   r)   rT   r[   r   r   r8   rZ   r5   r   r\   r<   r   r7   r6   )/r>   rB   rE   rF   rG   r   rH   rI   rJ   rK   rC   rD   rL   ra   r   rc   rd   tScStScS_t2rrj   rk   r#   rn   r{   r   	row_coord	col_coord
global_row
global_colrt   rs   mask_rowmask_row_for_modrv   global_col_for_modr|   mask_row_ssar~   r   r   r   r   r   r   r   r   r   r'   r'   r(   apply_mask_sm100<  s   


&




  )


*


zAttentionMask.apply_mask_sm100Tr   	t0ScS_t2ris_full_blockcheck_m_boundaryc           1      C   s  |r|rJ dt | j rdnd}t | j rdnd}|d | dks(J d|d | }| j|| j  | }t | oB| oB|	dury|rt |r|dkrftjt|jddD ]	}tj	j
 ||< qZdS |rt t|j}t|D ];}|| | }|| | }||| j  }||| j  }|| jk}|| jk}|p|}|rtj	j
 n|| ||< qudS dS dS t |duo|d duo|d du}t |o|ot |du}t|
tj}t|tj} t t|j}t|D ]}|| | }|| | }||| j  }||| j  }|}!|}"t |r#t||d \}#}!t||d \}#}"t|!tj}$t|"tj}%|	|| |$|%| j|}&tt|&}'|'rJ|| ntj	j
 ||< t |rv|o]|| jk}|| jk}|pg|}|rptj	j
 n|| ||< qdS t | o| rt |r|dkrtjt|jddD ]}tj	j
 ||< qdS dS dS |d | }(| j|| j  |( })|)| }*t |r|*}+t |r|dkr| j}+d},t |, rtjt|jddD ]}|| | |+k rtj	j
 n|| ||< qdS tj|dgd}-t||+|- dS t | jdur|*| j }+nd}+t | jdur+|*| j }.t |r8|dkr8| j}+tjt|jddD ])}|| | }/|/|+k }0t | jdur]|0|/|.kO }0|0retj	j
 n|| ||< qCdS )	a  
        Backward pass: mask S = K @ Q.T where n_block tiles seqlen_k and m_block tiles seqlen_q.

        Coordinate conventio:
        - ROW corresponds to Q (m_block)
        - COL corresponds to KV (n_block)

        is_full_block: If True, skip mask_mod (all elements valid). Only apply seqlen masking.
        check_m_boundary: If False, skip seqlen_q boundary check (optimization for non-boundary m_blocks).
                          When iterating m_blocks in forward order, only the last m_block may be partial.
        rM   r   r   z	col0 == 0NTrP   )mode)r   r9   r@   r4   r   rY   r   r   r   r   r   r   r3   r<   rT   r[   r   rZ   r5   r   r\   r0   r7   r6   )1r>   rB   r   r   rE   rF   rH   rI   rJ   rK   rC   rD   rL   ra   r   r   rg   rh   ri   rj   r#   r   r   r   global_q	global_kvq_out_of_boundskv_out_of_boundsr   rn   ro   r{   r|   q_idx_for_modkv_idx_for_modrv   r}   r~   r   r   r   seqlenq_row_limitcausal_offsetr*   rk   r+   r   r   
local_maskr'   r'   r(   apply_mask_sm100_transposed  s   	







!




"



 z)AttentionMask.apply_mask_sm100_transposed)FNNrA   )FNNNNrA   F)NNNNrA   FT)__name__
__module____qualname__r   	Constexprint__annotations__r   r6   r   r   r7   r8   r9   boolpropertyr<   r@   r   jitTensorTiledMmar   listr   	TiledCopyr   r   r'   r'   r'   r(   r2   G   s   
 	
 c
	
 	
r2   )r	   F)typingr   r   dataclassesr   r   cutlass.cuter   r   r   r   flash_attn_origin.cute.utilsrT   "flash_attn_origin.cute.seqlen_infor   r   r   r   r   r)   r0   r2   r'   r'   r'   r(   <module>   s   $