o
    پi(                     @   s   d dl mZmZmZmZ d dlZd dlmZ dd Z	ddej	dej	dej	d	ej	d
e
de
de
dedeej	ej	ej	ej	f fddZ	ddej	dej	dej	d	ej	dej	dej	d
e
de
de
dedeej	ej	ej	ej	f fddZ							ddddddZ							ddddddZdS )    )ListOptionalTupleUnionNc                 C   s"   | d ur|  ddkr|  S | S )N   )stride
contiguous)x r   P/home/ubuntu/.local/lib/python3.10/site-packages/sgl_kernel/sparse_flash_attn.pymaybe_contiguous   s   "r   T	q_seqlens
kv_seqlensvertical_indexesslash_indexescontext_sizeblock_size_Mblock_size_Ncausalreturnc                 C   s   | d}| d}	| d}
| d}|| d | }tj||	|| j| jd}tj||	||
| j| jd}tj||	|| j| jd}tj||	||| j| jd}tjjj||||| ||||||| ||||fS Nr   r      )dtypedevice)	sizetorchzerosr   r   ops
sgl_kernelconvert_vertical_slash_indexesdefault)r   r   r   r   r   r   r   r   
batch_size	num_heads	nnz_slashnnz_verticalnum_rowsblock_countblock_offsetcolumn_countcolumn_indexr   r   r   r       sT   





	r    vertical_indices_countslash_indices_countc
                 C   s   | d}
| d}| d}| d}|| d | }tj|
||| j| jd}tj|
|||| j| jd}tj|
||| j| jd}tj|
|||| j| jd}tjjj||||| |||||||||	 ||||fS r   )	r   r   emptyr   r   r   r   (convert_vertical_slash_indexes_mergeheadr!   )r   r   r   r   r+   r,   r   r   r   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   r   r   r   r.   D   sX   




	r.           F)return_softmax_lseoutc                C   sx   |du r| j d d }dd | ||fD \} }}tjjj| |||||||||||	|
|o/|dkd\}}|r:||fS |S )a  Compute attention with vertical and slash sparsity patterns.
    Most Arguments are the same with the flash_attn_func interface, except for 4 extra args:
    block_count and block_offset for slash sparsity patterns, and
    column_count and column_index for vertical sparsity patterns.
    For more details please refer to Appendix C.4.2 of paper https://arxiv.org/abs/2407.02490.

    Arguments:
        q: (batch_size, seqlen, nheads, headdim)
        k: (batch_size, seqlen, nheads_k, headdim)
        v: (batch_size, seqlen, nheads_k, headdim)
        block_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        block_offset: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_S)
        column_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        column_index: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_V)
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (batch_size, seqlen, nheads, headdim).
        softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
    Nr         c                 S      g | ]}t |qS r   r   .0r
   r   r   r   
<listcomp>       z$sparse_attn_func.<locals>.<listcomp>r   )shaper   r   r   
fwd_sparser!   )qkvr'   r(   r)   r*   	dropout_psoftmax_scaler   softcapalibi_slopesdeterministicreturn_attn_probsr0   r1   softmax_lser   r   r   sparse_attn_func   s*   3

rE   c                C   s   |du r| j d d }dd | ||fD \} }}tjjj| |||||||||d||	|
||d|||o5|dkd\}}|r@||fS |S )ah
  Compute attention with vertical and slash sparsity patterns.
    Most Arguments are the same with the flash_attn_varlen_func interface, except for 4 extra args:
    block_count and block_offset for slash sparsity patterns, and
    column_count and column_index for vertical sparsity patterns.
    For more details please refer to Appendix C.4.2 of paper https://arxiv.org/abs/2407.02490.

    Arguments:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
        block_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        block_offset: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_S)
        column_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
        column_index: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_V)
        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        softcap: float. Anything > 0 activates softcapping attention.
        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
            is added to the attention score of query i and key j.
        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
            which is slightly slower and uses more memory. The forward pass is always deterministic.
        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
           testing only. The returned probabilities are not guaranteed to be correct
           (they might not have the right scaling).
    Return:
        out: (total, nheads, headdim).
        softmax_lse [optional, if return_softmax_lse=True]: (nheads, total_q_seqlen). The
            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
            normalization factor).
    Nr   r2   c                 S   r3   r   r4   r5   r   r   r   r7     r8   z+sparse_attn_varlen_func.<locals>.<listcomp>Fr   )r9   r   r   r   varlen_fwd_sparser!   )r;   r<   r=   r'   r(   r)   r*   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr>   r?   r   r@   rA   rB   rC   r0   r1   rD   r   r   r   sparse_attn_varlen_func   s6   >

rK   )T)r/   NFr/   NFF)typingr   r   r   r   r   torch.nnnnr   Tensorintbooltupler    r.   rE   rK   r   r   r   r   <module>   s    	
C	

EW