o
    پi                     @   sD  d Z ddlmZmZmZ ddlmZ ddlZddlm	Z	 de
de
de
fdd	ZG d
d deZG dd deZdejdee
df dededB deeg ef B dB dejfddZdedejdB dejdB dee
e
e
f dee
e
e
e
f dedB deeg ef B dB deejdB ejdB f fddZde
de
de
d e
d!e
d"e
d#e
deee
e
e
f ee
e
e
e
f f fd$d%Zde
de
de
d e
d!e
d"e
d&e
deee
e
e
f ee
e
e
e
f f fd'd(Zddd)d*edee
e
e
f dee
e
e
e
f dedB deeg ef B dB defd+d,Zd*edefd-d.Z	/d5d*ed0ededB fd1d2Zd3d4 ZdS )6z,
Block-sparsity utilities for FlexAttention
    )Callable
NamedTupleTupleNto_cute_tensorabreturnc                 C   s   | | d | S N    )r   r   r   r   Y/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/block_sparsity.pyceildiv   s   r   c                   @   sF   e Zd ZU ejed< ejed< ejdB ed< ejdB ed< dd ZdS )BlockSparseTensorsmask_block_cntmask_block_idxNfull_block_cntfull_block_idxc                 C   s&   t |dkrg |d d R }t| S )N   )lenr   )selfvaluesr   r   r   __new_from_mlir_values__   s   z+BlockSparseTensors.__new_from_mlir_values__)__name__
__module____qualname__cuteTensor__annotations__r   r   r   r   r   r      s   
 

r   c                   @   sF   e Zd ZU ejed< ejed< dZejdB ed< dZejdB ed< dS )BlockSparseTensorsTorchr   r   Nr   r   )r   r   r   torchr   r   r   r   r   r   r   r   r      s
   
 

r   tensorexpected_shape.tensor_namecontexthintc           
   	   C   s   | j |k}|s	| S ttdd | j |}|sD|rd| dnd}t|r'| n|}|r0d| nd}	t| | d| j  d| d	|	 | j|  S )
zOCheck if we need to expand the tensor to expected shape, and do so if possible.c                 S   s   | |kp| dkS r
   r   )curtgtr   r   r   <lambda>/   s    z)_expand_sparsity_tensor.<locals>.<lambda>z () z Hint: z with shape z& cannot be expanded to expected shape .)shapeallmapcallable
ValueErrorexpand
contiguous)
r!   r"   r#   r$   r%   needs_expand
can_expandcontext_clauseresolved_hinthint_clauser   r   r   _expand_sparsity_tensor$   s   
r8   namecntidxexpected_count_shapeexpected_index_shapec           	      C   s   |d u |d u krt |  d|  d|d u s|d u rdS |jtjks(|jtjkr/t |  d|j|jkr?t |  d|  d|jrE|jsLt |  dt|||  d||}t|||  d||}||fS )	Nz_block_cnt and z0_block_idx must both be provided or both be None)NNz*_block tensors must have dtype torch.int32z%_block_idx must be on the same devicez _block tensors must live on CUDA
_block_cnt
_block_idx)r0   dtyper    int32deviceis_cudar8   )	r9   r:   r;   r<   r=   r$   r%   expanded_cntexpanded_idxr   r   r   _check_and_expand_block;   s&   	rF   
batch_sizenum_headseqlen_qseqlen_km_block_sizen_block_sizeq_stagec                 C   s:   || }t ||}t ||}	| ||f}
| |||	f}|
|fS )zSReturn (expected_count_shape, expected_index_shape) for block sparse normalization.r   )rG   rH   rI   rJ   rK   rL   rM   m_block_size_effectiveexpected_m_blocksexpected_n_blocksr<   r=   r   r   r    get_block_sparse_expected_shapesY   s   



rR   subtile_factorc                 C   s:   || }t ||}t ||}	| ||	f}
| ||	|f}|
|fS )a/  Return (expected_count_shape, expected_index_shape) for backward block sparse normalization.

    Backward uses Q-direction indexing (transposed from forward), where shapes are
    indexed by N-blocks first, then M-blocks. The sparse_block_size_q is determined
    by subtile_factor * m_block_size.
    rN   )rG   rH   rI   rJ   rK   rL   rS   sparse_block_size_qrP   rQ   r<   r=   r   r   r   $get_block_sparse_expected_shapes_bwdk   s   


rU   )r$   r%   tensorsc          	      C   s   | j d u s
| jd u rtdtd| j | j||||\}}|d u s$|d u r(tdtd| j| j||||\}}|d urD|j|jkrDtdt||||dS )NzFmask_block_cnt and mask_block_idx must be provided for block sparsity.maskfullz3All block sparse tensors must be on the same device)r   r   r   r   )r   r   r0   rF   r   r   rB   r   )	rV   r<   r=   r$   r%   mask_cntmask_idxfull_cntfull_idxr   r   r   normalize_block_sparse_tensors   s<   		r]   c                 C   s   t dd | j| jfD S )Nc                 s   s    | ]}|d uV  qd S )Nr   .0tr   r   r   	<genexpr>   s    z,is_block_sparsity_enabled.<locals>.<genexpr>)anyr   r   )rV   r   r   r   is_block_sparsity_enabled   s   rc   Tenable_tvm_ffic           
         sZ   t | sdS | \}}}} fdd||fD \}} fdd||fD \}}	t||||	S )zLConvert torch block sparsity tensors to CuTe tensors, optionally for tvm ffiNc                    s   g | ]
}t |d d dqS )   assumed_alignleading_dimrd   r   r^   rd   r   r   
<listcomp>   s    z0to_cute_block_sparse_tensors.<locals>.<listcomp>c                    s(   g | ]}|d urt |dd dnd qS )Nre   rf   rg   r   r^   rj   r   r   rk      s    )rc   r   )
rV   rd   r   r   r   r   mask_block_cnt_tensormask_block_idx_tensorfull_block_cnt_tensorfull_block_idx_tensorr   rj   r   to_cute_block_sparse_tensors   s4   

rp   c                 C   s
   d| _ | S )zHConvenience decorator to mark mask_mod as safe for 5-point fast samplingT)use_fast_sampling)mask_modr   r   r   fast_sampling   s   rs   )T)__doc__typingr   r   r   cutlass.cuter   r    %flash_attn_origin.cute.cute_dsl_utilsr   intr   r   r   r   strr8   rF   rR   rU   r]   boolrc   rp   rs   r   r   r   r   <module>   s    


 
 

+
&