o
    i7                     @   sZ   d dl mZ d dlmZ d dlZd dlmZ d dlm  mZ eddG dd dZ	dS )    )Optional)	dataclassNT)frozenc                   @   s  e Zd ZU eje ed< eje ed< ejed< ejed< dZe	ej ed< dZ
e	ej ed< dZeje ed	< ej	
ddejdejdejdejdeje deje deje ddfddZejdejdejdejdejdejdejdejdejddfddZdS )AttentionMaskm_block_sizen_block_sizeseqlen_qseqlen_kNwindow_size_leftwindow_size_right   qhead_per_kvhead_packgqaFacc_Sm_blockn_blockthr_mmamask_seqlenmask_causal
mask_localreturnc                 C   s  |r|rJ dt |}t| j| jf}	t ||	}
t |d|	}|
d d }| j|| j  | }t	
| o@| rt	
|rt	jt|
jd ddD ]/}|d|f d |k}t	jt|
jd ddD ]}|rvt	jj n|||f |||f< qmqTd S d S |jjd d }t	
| jdkrtjj| dksJ dt|jd |ksJ |j}|| j |
|| df d  | j }d| j || j  | j | }t	
|rEt	jt|
jd ddD ]]}t	
| jdkr|
|df d || j  }n
t j||| |d}|| }t	
|rt	||}t	jt|
jd ddD ]}|d|f d |kr6t	jj n|||f |||f< q$qd S t	
| jd urS|| j nd }t	
| jd ure|d | j nd }t	jt|
jd ddD ]|}t	
| jdkr|
|df d || j  }n
t j||| |d}t	
| jd ur|| }t	
|rt	||}n| j}t	
| jd ur|| nd}t	jt|
jd ddD ]}|d|f d }||ks||k rt	jj |||f< qАqtd S )N.mask_causal and mask_local cannot be both Truer   r   Tunroll_fullz%threads_per_row must divide WARP_SIZE)width)utilsmake_acc_tensor_mn_viewcutemake_identity_tensorr   r   partition_C	get_slicer	   cutlass
const_exprrangesizeshapeFloat32inftv_layout_Cr   arch	WARP_SIZEthr_idxr   shuffle_syncminr   r
   )selfr   r   r   r   r   r   r   acc_S_mncStScS_mnt0ScS_mnthr_col_offsetseqlenk_col_limitcoobrthreads_per_rowtidx	mma_m_idxcausal_row_offsetrow_idxcol_limit_rightlocal_row_offset_rightlocal_row_offset_leftcol_limit_leftcol_idx rA   J/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/mask.py
apply_mask   s   

$
8
zAttentionMask.apply_maskthr_tmem_loadc	                 C   s  |r|rJ dt | j| jf}	||	}
||
}| j|| j  }t| o*| rt|rtt 	|j
}t|d dk rctj|ddD ]}|| d |krZtjj n|| ||< qKd S tj|d ddD ]A}||d  }tt|d}td|> d }tdD ] }t|d|> @ }|r||d |  ntjj ||d | < qqld S d S d| j || j  | j }|d d || j  }t| jdkr|| j }t|rm|| }t|rt||}tt 	|j
}t|d dk rtj|ddD ]}|| d |krtjj n|| ||< qd S tj|d ddD ]B}||d  }tt|d}td|> d }tdD ]"}t|d|> @ }|r\||d |  ntjj ||d | < qFq(d S t| jd ur{|| j nd }t| jd ur|d | j nd }t| jd ur|| }t|rt||}n| j}t| jd ur|| nd}tjt 	|j
ddD ]}|| d }||ks||k rtjj n|| ||< qd S )Nr      r   Tr   r   )r   r   r   r   r   partition_Dr	   r    r!   r#   r$   r"   r%   r&   Uint32maxrange_constexprBooleanr   r   r,   r   r
   )r-   r   r   r   r   rD   r   r   r   r/   tScStScS_t2rr3   ncoliscol_limit_right_scol_limit_right_curmask
mask_i_bitr:   r;   r<   r=   r>   r?   r@   rA   rA   rB   apply_mask_sm100w   s   


 ,

"
0

zAttentionMask.apply_mask_sm100)F)__name__
__module____qualname__r    	Constexprint__annotations__Int32r
   r   r   r   r   jitTensorTiledMmaboolrC   	TiledCopyrT   rA   rA   rA   rB   r      s^   
 

		`	
r   )
typingr   dataclassesr   r    cutlass.cuter   flash_attn.cute.utilsr   r   rA   rA   rA   rB   <module>   s   