o
    پi                     @   s   d dl mZmZ d dlmZ d dlZd dlmZ d dlmZm	Z	m
Z
 d dlm  m  m  mZ ddlmZ ejdd	ejd
e	dededdf
ddZejd	ejde	deddfddZejd	ejde	de	ddfddZeddG dd dZdS )    )OptionalCallable)	dataclassN)Float32Int32
const_expr   )SeqlenInfoQKZ   FX	col_limitarchrank1returnc                 C   s.  t |dkr|d d t|d d }n|}t |s&t| jt| d  nt| j}tt|dD ]^}t	||d  d}d|> d }ttd||d  D ]@}	t
|d|	> @ }
|d |	 }t |ru|
rn| | ntj | |< qStt| jd D ]}|
r| ||f ntj | ||f< qqSq6d S )Nr
         r      r   )r   mincutesizeshaperankcutlassrange_constexprceil_divmaxBooleanr   inf)r   r   r   r   col_limit_transformedncolscol_limit_right_smaskiin_boundcr r'   _/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/flash_attention/cute/mask.pymask_r2p   s"   ."r)   row_limit_topnum_repc                 C   s   d}|||  | t |||  | }t| j}tt|dD ];}t||d  d}d|> d }tt d||d  D ]}	t|d|	> @ }
|d |	 }|
rVt	j
 n| | | |< q?q"d S )Nr   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   )r   r*   r+   num_wgrow_limit_top_transformedr   r    row_limit_top_sr"   r#   	out_boundr%   r'   r'   r(   mask_r2p_transposed,   s   r0   col_limit_leftcol_limit_rightc                 C   s   t t| j}tt|dD ]Y}t||d  d}t||d  d}t|d}t|d}d|> d }d|> d }|| @ }	ttd||d  D ]}
t	|	d|
> @ }|d |
 }|rc| | nt
j | |< qLqdS )a  
    Dual-bound masking using two bitmasks for SM100, following mask_r2p.
    Masks elements where: NOT (col_limit_left <= col < col_limit_right)

    Uses bit manipulation to create a range mask:
        mask_right = (1 << right) - 1  -> bits (right-1)..0 are 1
        mask_left  = (1 << left) - 1   -> bits (left-1)..0 are 1
        mask_range = mask_range = mask_right & ~ mask_left -> bits (right-1)..left are 1
    r   r   r   N)r   r   r   r   r   r   r   r   r   r   r   r   )r   r1   r2   r   r    right_sleft_s
mask_right	mask_left
mask_ranger#   r$   r%   r'   r'   r(   mask_r2p_dual_boundG   s   


r8   T)frozenc                    @   s
  e Zd ZU eje ed< eje ed< eed< dZe	e
 ed< dZe	e
 ed< dZeje ed< d	Zeje ed
< ede
fddZede
fddZej					d)dejdej
dej
dej
dej
dejdeje deje deje deje	e  de	e ddfddZej										d*dejde
de
dejdejdeje deje deje deje	e  de
de
de	e deddfd d!Zej								"d+dejd#ejd$ejdej
dej
dejdejdejdeje	e  de
de
de	e d%ed&eddfd'd(ZdS ),AttentionMasktile_mtile_nseqlen_infoNwindow_size_leftwindow_size_rightr   qhead_per_kvhead_packgqaFswap_ABr   c                 C      | j jS N)r=   seqlen_qselfr'   r'   r(   rD   v      zAttentionMask.seqlen_qc                 C   rB   rC   )r=   seqlen_krE   r'   r'   r(   rH   z   rG   zAttentionMask.seqlen_kNNacc_S	batch_idxhead_idxm_blockn_blockthr_mmamask_seqlenmask_causal
mask_localmask_modaux_tensorsc           >   	   C   s  |r|	rJ dt j|| jd}| j| jf}t| js|n|d d d }t j||| jd}t j|d|| jd}t	| j rFdnd}t	| j rPdnd}|d | }|dk r^d}| j
|| j  | }t	| or|	 or|
d u rt	|rt	d}t	| rtjt|jd ddD ],}|d|f | |k}tjt|jd ddD ]}|rtj n|||f |||f< qqd S t||d	d
 d S d S t	| o|	 o|
d urt	t|jd }t	t|jd }t	|d uo|d d uo|d d u}t	|o|ot	|d u}t|D ]}||df | }||| j  }|} |}!t	| jdkr<|| j }"|| j |" }!|| j } | }#t	|rLt| |d \}$} t|D ]}%|d|%f | }&||& || j  }'|'}(t	|rtt|'|d \}$}(t |tj})t |!tj}*t | tj}+t |(tj},|
|)|*|+|,| j|}-tt |-}.t	|r|#| jkp|'| j
k}/|/rtjj |||%f< qQ|.r|||%f ntjj |||%f< qQ|.r|||%f ntjj |||%f< qQqd S t	| j r}|jjd d }0d }1t	| jdkr<| jrJ dtjj|0 dksJ dt|jd |0ks'J |j}2|| j ||2|0 df d  | j }1d| j
 || j  | j | }3t	|rt	| j }tjt|jd ddD ]p}t	| jdkr{||df d || j  }4n
t j|1||0 |0d}4|4|3 }5t	|rt |5|}5t	| rtjt|jd ddD ]}|d|f d |5krtj n|||f |||f< qqct||d f |5d	dd qcd S t	| j!d ur|3| j! nd }6t	| j"d ur|3d | j" nd }7tjt|jd ddD ]w}t	| jdkr||df d || j  }4n
t j|1||0 |0d}4t	| j!d ur2|4|6 }5n| j}5t	|r@t |5|}5t	| j"d urL|4|7 nd}8tjt|jd ddD ]}|d|f d }9|9|5kso|9|8k rwtj |||f< q[qd S | jdksJ |d | }:|| j || j  |: }3t	|rtjt|jd ddD ]D}|d|f | };|;|kr|r| jn|;|3 }<tjt|jd ddD ]}||df | |<k rtj n|||f |||f< qϐqd S tjt|jd ddD ]R}|d|f | };|;|kr| jn|;|3 | j! }<|;|3 | j" }=tjt|jd ddD ]#}||df | }4|4|<k s>|4|=krBtj n|||f |||f< q*qd S )N.mask_causal and mask_local cannot be both True)	transposer   r   FTunroll_fullr
   )r   z&swap_AB with PackGQA not supported yetz%threads_per_row must divide WARP_SIZE)widthr   r   )#utilsmake_acc_tensor_mn_viewrA   r;   r<   r   make_identity_tensorpartition_C	get_slicer   rH   r   ranger   r   r   r   r)   r   r@   divmodscalar_to_ssar   r=   r   ssa_to_scalarrD   tv_layout_Cr   	WARP_SIZEthr_idxshuffle_syncr   r?   r>   )>rF   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   fastdiv_modsacc_S_mn	acc_shapecStScS_mnt0ScS_mnROWCOLthr_col_offsetseqlenk_col_limitr2pr%   oobr&   nrowr   has_fastdivwrap_aux_indices	local_rowglobal_row_idxrow_for_modhead_idx_for_modhead_offsetrow_for_seqlen_colcol_idx_localglobal_col_idxcol_for_modbatch_idx_ssahead_idx_ssa	q_idx_ssa
kv_idx_ssa
mask_valuecondout_of_boundsthreads_per_row	mma_m_idxtidxcausal_row_offsetrow_idxr2   local_row_offset_rightlocal_row_offset_leftr1   col_idxthr_row_offsetcol0r*   row_limit_botr'   r'   r(   
apply_mask~   sf  
"






((/










zAttentionMask.apply_maskthr_tmem_loadcheck_q_boundaryc           0      C   s  |r|rJ d| j | jf}t| js|n|d d d }||}||}|dk r-d}| j|| j  }d}t| oA| oA|	d u rzt|rxt| rnt	j
t|jddD ]}|| d |kretj n|| ||< qWd S t||ddd d S d S t| o| o|	d urt|d uo|d d uo|d d u}t|
t	j}tt|j}t	|D ]}| js|| d n|| d }| js|| d n|| d }||| j   }||| j  }t| jdkr|d usJ t||\}}|| j | } n|} |}|}!t|o|d ur|rt||d \}"}!|}#t|o|o|d ur+t||d \}"}#t| t	j}$t|!t	j}%t|#t	j}&|	||$|%|&| j|}'t	t|'}(|(rY|| ntj ||< t|rt|| jkrntj n|| ||< |r|| jkrtj n|| ||< qd S d| j || j  | j })|d d || j   }*t| jdkr|*| j }*t|r|*|) }+t|rt	|+|}+tt|j}t| rt	j
|ddD ]}|| d |+krtj n|| ||< qd S t||+ddd d S t| jd ur|)| j nd },t| jd ur|)d | j nd }-t| jd ur(|*|, }+n| j}+t|r6t	|+|}+t| jd urB|*|- nd}.t| rut	j
t|jddD ]}|| d }/|/|+ksg|/|.k rktj n|| ||< qUd S t||.|+ d S )	NrU   rW   r   TrX   r   d   r[   )r;   r<   r   r^   rA   r_   partition_DrH   r   r   ra   r   r   r   r   r)   r\   rc   r   r   r@   rb   r=   r   rd   rD   r   r?   r>   r8   )0rF   rJ   rM   rN   rO   r   rP   rQ   rR   rS   rK   rL   rT   ri   head_divmodr   rk   rl   tScStScS_t2rrr   rs   r#   rv   r   r   	row_coord	col_coord
global_row
global_colmask_rowr|   r{   mask_row_for_modr~   global_col_for_modr   mask_row_ssar   r   r   r   r   r2   r   r   r1   r   r'   r'   r(   apply_mask_sm100a  s   


&



  )


*


	zAttentionMask.apply_mask_sm100Tr   	t0ScS_t2ris_full_blockcheck_m_boundaryc           1      C   s  |r|rJ dt | j rdnd}t | j rdnd}|d | dks(J d|d | }| j|| j  | }t | oB| oB|	dury|rt |r|dkrftjt|jddD ]	}tj	j
 ||< qZdS |rt t|j}t|D ];}|| | }|| | }||| j  }||| j  }|| jk}|| jk}|p|}|rtj	j
 n|| ||< qudS dS dS t |duo|d duo|d du}t |o|ot |du}t|
tj}t|tj} t t|j}t|D ]}|| | }|| | }||| j  }||| j  }|}!|}"t |r#t||d \}#}!t||d \}#}"t|!tj}$t|"tj}%|	|| |$|%| j|}&tt|&}'|'rJ|| ntj	j
 ||< t |rv|o]|| jk}|| jk}|pg|}|rptj	j
 n|| ||< qdS t | o| rt |r|dkrtjt|jddD ]}tj	j
 ||< qdS dS dS |d | }(| j|| j  |( })|)| }*t |r|*}+t |r|dkr| j}+d},t |, rtjt|jddD ]}|| | |+k rtj	j
 n|| ||< qdS tj|dgd}-t||+|- dS t | jdur|*| j }+nd}+t | jdur+|*| j }.t |r8|dkr8| j}+tjt|jddD ])}|| | }/|/|+k }0t | jdur]|0|/|.kO }0|0retj	j
 n|| ||< qCdS )	a  
        Backward pass: mask S = K @ Q.T where n_block tiles seqlen_k and m_block tiles seqlen_q.

        Coordinate conventio:
        - ROW corresponds to Q (m_block)
        - COL corresponds to KV (n_block)

        is_full_block: If True, skip mask_mod (all elements valid). Only apply seqlen masking.
        check_m_boundary: If False, skip seqlen_q boundary check (optimization for non-boundary m_blocks).
                          When iterating m_blocks in forward order, only the last m_block may be partial.
        rU   r   r   z	col0 == 0NTrX   )mode)r   rA   rH   r<   r   ra   r   r   r   r   r   r   r;   rD   r\   rc   r   rb   r=   r   rd   r0   r?   r>   )1rF   rJ   r   r   rM   rN   rP   rQ   rR   rS   rK   rL   rT   ri   r   r   ro   rp   rq   rr   r#   r   r   r   global_q	global_kvq_out_of_boundskv_out_of_boundsr   rv   rw   r   r   q_idx_for_modkv_idx_for_modr~   r   r   r   r   r   seqlenq_row_limitcausal_offsetr*   rs   r+   r   r   
local_maskr'   r'   r(   apply_mask_sm100_transposed  s   	







!




"



 z)AttentionMask.apply_mask_sm100_transposed)FNNrI   )FNNNNrI   NF)NNNNrI   FT)__name__
__module____qualname__r   	Constexprint__annotations__r	   r>   r   r   r?   r@   rA   boolpropertyrD   rH   r   jitTensorTiledMmar   listr   	TiledCopyr   r   r'   r'   r'   r(   r:   l   s   
 	
 c
	
 	
r:   )r
   F)typingr   r   dataclassesr   r   cutlass.cuter   r   r   r   ,sglang.jit_kernel.flash_attention.cute.utils
jit_kernelflash_attentionr\   r=   r	   r   r   r   r   r)   r0   r8   r:   r'   r'   r'   r(   <module>   s.   $$