import torch
import torch.nn as nn

import flash_attn_cuda


def convert_blockmask(blockmask, causal):
    """Convert from the 0-1 format to the format used by the CUDA code.
    0 means the block is skipped.
    nonzero means the block is not skipped.
    Argument:
        blockmask: (row, col): a 0-1 tensor
    Return:
        blockmask_converted: (col, row), dtype torch.int32: for each column, it contains the row
            indices of the nonzero blocks, padded with -1 to reach length @row.
            The indices are multiplied by 4, with the smallest bit used to encode whether
            it is the first nonzero in its row, and the 2nd smallest bit to encode whether it is
            the last nonzero in its row.
    """
    assert not causal
    nrow, ncol = blockmask.shape
    # Sort does not support bool on CUDA, so work in uint8
    blockmask = blockmask.to(dtype=torch.uint8)
    nonzero_val, nonzero_sorted_rowidx = blockmask.sort(dim=0, stable=True, descending=True)
    nonzero_unsorted_rowidx = nonzero_sorted_rowidx.argsort(dim=0)
    last_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True).indices[:, -1]
    last_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
        torch.arange(nrow, device=blockmask.device), last_nonzero_col_per_row
    ]
    first_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True, descending=True).indices[:, 0]
    first_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[
        torch.arange(nrow, device=blockmask.device), first_nonzero_col_per_row
    ]
    # Encode row index * 4, with bit 0 marking the first nonzero in a row and bit 1 the last
    nonzero_idx = nonzero_sorted_rowidx * 4
    nonzero_idx[last_nonzero_col_per_row_after_sort, last_nonzero_col_per_row] += 2
    nonzero_idx[first_nonzero_col_per_row_after_sort, first_nonzero_col_per_row] += 1
    nonzero_idx[nonzero_val == 0] = -1
    return nonzero_idx.T.contiguous().to(dtype=torch.int32)
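

# Worked example of the encoding above (illustrative; computed by hand, not taken from
# the original source). For
#     blockmask = [[1, 0, 1],
#                  [1, 1, 0]]
# convert_blockmask(blockmask, causal=False) returns one output row per input *column*,
# listing the kept row indices encoded as row * 4 + flags:
#     [[1,  5],   # col 0 keeps rows 0 and 1, each the first nonzero of its row: row * 4 + 1
#      [6, -1],   # col 1 keeps row 1, the last nonzero of its row: 1 * 4 + 2; -1 is padding
#      [2, -1]]   # col 2 keeps row 0, the last nonzero of its row: 0 * 4 + 2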


def _flash_blocksparse_attn_forward(
    qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax
):
    context, softmax_lse, *rest = flash_attn_cuda.fwd_block(
        qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax, None
    )
    # The kernel only materializes the attention probabilities when return_softmax is set
    S_dmask = rest[0] if return_softmax else None
    return context, softmax_lse, S_dmask


def _flash_blocksparse_attn_backward(
    dout, qkv, out, S_dmask, softmax_lse, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale,
    causal
):
    dqkv, dp, softmax_d = flash_attn_cuda.bwd_block(
        dout, qkv, out, S_dmask, softmax_lse, cu_seqlens, blockmask, dropout_p, softmax_scale,
        max_s, causal, None
    )
    return dqkv


class FlashBlocksparseAttnFun(torch.autograd.Function):
    @staticmethod
    def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
        # Save rng_state because the backward pass will regenerate the dropout mask
        rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
        if softmax_scale is None:
            softmax_scale = qkv.shape[-1] ** (-0.5)
        context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
            qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal=causal,
            return_softmax=False,
        )
        ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
        ctx.dropout_p = dropout_p
        ctx.max_s = max_s
        ctx.softmax_scale = softmax_scale
        ctx.causal = causal
        return context

    @staticmethod
    def backward(ctx, dout):
        qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
        if rng_state is not None:
            cur_rng_state = torch.cuda.get_rng_state()
            torch.cuda.set_rng_state(rng_state)
        dqkv = _flash_blocksparse_attn_backward(
            dout, qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, ctx.dropout_p,
            ctx.max_s, ctx.softmax_scale, ctx.causal
        )
        if rng_state is not None:
            torch.cuda.set_rng_state(cur_rng_state)
        return dqkv, None, None, None, None, None, None


# Same as FlashBlocksparseAttnFun, but also returns the attention probabilities (S_dmask)
# and the softmax log-sum-exp, which is useful for testing
class FlashBlocksparseAttnFunWithS(torch.autograd.Function):
    @staticmethod
    def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal):
        # Save rng_state because the backward pass will regenerate the dropout mask
        rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None
        if softmax_scale is None:
            softmax_scale = qkv.shape[-1] ** (-0.5)
        context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward(
            qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal=causal,
            return_softmax=True,
        )
        ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state)
        ctx.dropout_p = dropout_p
        ctx.max_s = max_s
        ctx.softmax_scale = softmax_scale
        ctx.causal = causal
        return context, S_dmask, softmax_lse

    @staticmethod
    def backward(ctx, dout, _dS_dmask_ignored, _dsoftmax_sum_ignored):
        qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors
        if rng_state is not None:
            cur_rng_state = torch.cuda.get_rng_state()
            torch.cuda.set_rng_state(rng_state)
        dqkv = _flash_blocksparse_attn_backward(
            dout, qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, ctx.dropout_p,
            ctx.max_s, ctx.softmax_scale, ctx.causal
        )
        if rng_state is not None:
            torch.cuda.set_rng_state(cur_rng_state)
        return dqkv, None, None, None, None, None, None


def flash_blocksparse_attn_func(qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale=None,
                                causal=False, return_attn_probs=False, convert_mask=True):
    """dropout_p should be set to 0.0 during evaluation"""
    func = FlashBlocksparseAttnFun if not return_attn_probs else FlashBlocksparseAttnFunWithS
    if convert_mask:
        blockmask = convert_blockmask(blockmask, causal=causal)
    return func.apply(qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal)
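

# A minimal smoke test (not part of the original module). The mask-conversion check runs
# on CPU; the attention call below needs a CUDA device with the flash_attn_cuda extension
# built, and the square 256x256 block granularity is an assumption -- this file does not
# specify what block size the kernel expects.
if __name__ == "__main__":
    blockmask = torch.tensor([[1, 0, 1], [1, 1, 0]])
    print(convert_blockmask(blockmask, causal=False))
    # tensor([[ 1,  5],
    #         [ 6, -1],
    #         [ 2, -1]], dtype=torch.int32)

    if torch.cuda.is_available():
        batch, seqlen, nheads, headdim = 2, 512, 8, 64
        blocksize = 256  # assumed block granularity of the mask
        # Packed qkv for an unpadded batch: (total_tokens, 3, nheads, headdim)
        qkv = torch.randn(batch * seqlen, 3, nheads, headdim, device="cuda",
                          dtype=torch.float16, requires_grad=True)
        # cu_seqlens holds the cumulative sequence boundaries of the packed batch
        cu_seqlens = torch.arange(0, (batch + 1) * seqlen, seqlen, device="cuda",
                                  dtype=torch.int32)
        nblocks = seqlen // blocksize
        blockmask = torch.ones(nblocks, nblocks, device="cuda")
        out = flash_blocksparse_attn_func(qkv, cu_seqlens, blockmask, dropout_p=0.0,
                                          max_s=seqlen)
        print(out.shape)  # expected: (batch * seqlen, nheads, headdim)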