o
     i$o                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlmZ ddlmZmZmZmZmZmZ ddlmZmZmZ dd	lmZ dd
lmZmZ eG dd dZ	dLdejdejdeej de	ee  eej f fddZ!dee	e df  dee  fddZ"dee  deej dejfddZ#dee	e df  dee  deej$ deej dejf
ddZ%dee	e df  dee  deej dejfddZ&dee	e df  deej de	ejee  f fd d!Z'dee  d"e deej dejfd#d$Z(dee	e df  deej dejfd%d&Z)dee	e df  deej de	ejejf fd'd(Z*dee	e df  deej dejfd)d*Z+d+e d,e d-e de,fd.d/Z-d+e d,e d-e d0e,d1ed2ej$dee
e  fd3d4Z.			5	dMd6ejd7ejd8ejd9ejd:ejd;ejd<ed=ee
e  d>ee
e  d0e,d?ee	ejejf  dejfd@dAZ/G dBdC dCej0Z1edDe dEe dee	e df  fdFdGZ2dEee  dee	e df  fdHdIZ3dDe dEe de fdJdKZ4dS )N    N)	dataclass)	lru_cache)ListOptionalSequenceTupleType)fmha   )flashflash30memory_efficient_attention_forward_requires_grad"memory_efficient_attention_partialmerge_attentionstriton_splitk)AttentionBiasPagedBlockDiagonalGappyKeysMask PagedBlockDiagonalPaddedKeysMask)AttentionFwOpBase)_get_use_fa3fa3_availablec                   @   s:  e Zd ZU dZeeedf  ed< ej	ed< ej	ed< ej	ed< e
e ed< ej	ed< ej	ed	< ej	ed
< ej	ed< ej	ed< ej	ed< ej	ed< e
ej	 ed< e
ej	 ed< e
e ed< ee		ddeeedf  deej deej dd fddZe		ddeeedf  deej deej dd fddZdS )TreeAttnMetadataa  
    tree_choices: definition of the tree, tuples sorted by length, each corresponding
        to a node. See the docstring of TreeAttnMetadata.from_tree_choices.
    attention_bias: Medusa-style tree attention bias as an explicit tensor
        of shape (tree_size, tree_size), where tree_size is the total number
        of nodes in the tree. It can be used as a spec_attn_bias ("right"
        or "suffix" attention part) in tree_attention.
        See tree_attention_with_sync for a usage example.
    tree_indices: 1D tensor of size tree_size which maps tree nodes to draft tokens.
        Tree nodes are assumed to be in the same order as in tree_choices
        (see TreeAttnMetadata.from_tree_choices).
    retrieval_indices: a tensor of shape (number of leaves, depth + 1), where one
        row corresponds to one path, and contains indices of the tree nodes
        on that path. Paths are padded with -1 from the right.
        The paths (row dim) are unsorted.
    path_lengths: real lengths for each of the paths.
    tree_seq_position_ids: 1D tensor of size tree_size which indicates which head
        a node belongs to. Equivalently, it shows the sequence position of the
        node within the corresponding path.
    parent_node_indices: 1D tensor of size tree_size which for each node contains
        position of its parent + 1. For root node(s) it contains 0.
    child_node_indices: a tensor of shape (tree_size, max_num_children_per_node),
        in which each row contains indices of children of the corresponding node.
        Rows corresponding to nodes which have less than max_num_children_per_node
        children are padded by repeating the last child index.
        For leaf nodes the values are meaningless and filled with 0.
    num_children_per_node: 1D tensor of size tree_size which contains the number of
        children for each node.
    candidate_idx: 1D tensor of size tree_size, contains index of each node among its "siblings".
        Takes values from 0 to the number of children of the parent node minus 1.
    num_nodes_per_level: 1D tensor of the number of nodes at each level (including root).
    num_children_per_node_at_level: List of 1D tensors, each containing the number of children at the tree level.
    subtree_size: List of integers, each containing the number of nodes in the subtree at the tree level.
    Example:
        Tree choices
          `[(0,), (0, 0), (0, 1), (0, 2), (1,), (1, 0), (1, 1), (1, 2), (2,), (2, 0), (2, 1), (2, 2)]`
        represents a tree that looks like this:
            0
            |-- 1
            |   |-- 4
            |   |-- 5
            |   |-- 6
            |
            |-- 2
            |   |-- 7
            |   |-- 8
            |   |-- 9
            |
            |-- 3
                |-- 10
                |-- 11
                |-- 12

        with TreeAttnMetadata
            tree_indices=tensor([0, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6])
            retrieval_indices=tensor([[ 0,  1,  5],
                                      [ 0,  2,  9],
                                      [ 0,  3, 11],
                                      [ 0,  1,  4],
                                      [ 0,  2,  8],
                                      [ 0,  3, 10],
                                      [ 0,  1,  6],
                                      [ 0,  2,  7],
                                      [ 0,  3, 12]])
            path_lengths=[3, 3, 3, 3, 3, 3, 3, 3, 3]
            tree_seq_position_ids=tensor([0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])
            child_node_indices=tensor([[ 0,  1,  2],
                                       [ 3,  4,  5],
                                       [ 6,  7,  8],
                                       [ 9, 10, 11],
                                       [ 0,  0,  0],
                                       ...
                                       [ 0,  0,  0]])
            num_children_per_node=tensor([3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0])
            candidate_idx=tensor([0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2])
            num_nodes_per_level=tensor([1, 3, 3])
            num_children_per_node_at_level=[tensor([3]), tensor([3, 3, 3]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0])]
            subtree_sizes=[1, 4, 13]
    .tree_choicesattention_biastree_indicesretrieval_indicespath_lengthstree_seq_position_idsparent_node_indiceschild_node_indicesnum_children_per_nodecandidate_idxnum_nodes_per_levelnum_nodes_per_level_cpunum_children_per_node_at_level"num_children_per_node_at_level_cpusubtree_sizesNdtypedevicereturnc                 C   s   |  |||S N)from_tree_choices)clsr   r'   r(    r-   O/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/tree_attention.pyfrom_tree_choices_cached   s   z)TreeAttnMetadata.from_tree_choices_cachedc                 C   s   t |dd d}t|d }t|}t|||}t||\}}	t|||}
t||||}t||}t||\}}t	||}t
||}| }t|||\}}dd |D }t|||||	|
|||||||||S )a\  
        Args:
            tree_choices: tree description in the style of
                https://github.com/FasterDecoding/Medusa/blob/5e9805386/medusa/model/medusa_choices.py
                A typical tree description would look like:
                [(node0, node1, ...), (node0, node2), (node0, node3), (node1, node3), ..., (node0, node2, ..., nodeN)]
                Every tuple is corresponds to one node in the tree, encoded as a path from one of the root nodes to the
                node in question.
                For example, a node encoded as (1, 0, 3, ..., 2) is understood as:
                list all the root nodes and take node number 1
                list all children of that node and take node number 0
                list all children of that node and take node number 3
                ...
                list all children of that node and take node number 2 - that's the node encoded by this tuple.

            dtype: data type of the output mask tensor.
            device: device of the output tensors.
        Returns:
            TreeAttnMetadata object with all the fields.
        c                 S   s   t | | fS r*   lenxr-   r-   r.   <lambda>   s    z4TreeAttnMetadata.from_tree_choices.<locals>.<lambda>)keyr
   c                 S   s   g | ]}|  qS r-   )cpu).0rowr-   r-   r.   
<listcomp>   s    z6TreeAttnMetadata.from_tree_choices.<locals>.<listcomp>)sortedr1   _get_depth_counts_prepare_tree_indices_prepare_retrieval_indices_prepare_tree_position_ids_prepare_tree_attn_bias_prepare_parent_node_indices_prepare_child_node_indices_prepare_candidate_idx_get_num_nodes_per_levelr6   4_get_subtree_size_and_num_children_per_node_at_levelr   )r,   r   r'   r(   sorted_tree_choicestree_lendepth_countsr   r   r   r   tree_attn_maskr   r   r    r!   r"   r#   r&   r$   r%   r-   r-   r.   r+      s\   


z"TreeAttnMetadata.from_tree_choices)NN)__name__
__module____qualname____doc__r   r   int__annotations__torchTensorr   classmethodr   r   r'   r(   r/   r+   r-   r-   r-   r.   r   !   sT   
 P









r   r"   r    r(   r)   c              
   C   st   t | }dg}|d dg}td|D ]!}|tt| d |d   ||||d  ||   q||fS )Nr
   r   )r1   	unsqueezerangeappendrM   rO   sum)r"   r    r(   depthr&   r$   ir-   r-   r.   rD      s    rD   rE   .c                 C   sH   g }d}| D ]}t |}||kr|d ||d   d7  < |}q|S )Nr   r
   )r1   rT   )rE   rG   
prev_depthpathrV   r-   r-   r.   r;      s   
r;   rG   c                 C   s    t jdg|  |d}||dk S )Nr
   )r(   r   )rO   tensor)rG   r(   depth_counts_tensorr-   r-   r.   rC      s   rC   r'   c                 C   s   t | d }tj||ftj ||d}d}t|D ]}||||f< q||dddf< d}tt |D ]E}t|| D ]6}	| ||	  }
t |
dkrIq:g }tt |
d D ]}|| |
d|d  d  qS|||	| d |f< q:||| 7 }q2|S )a  
    Construct a Medusa-style tree attention bias as an explicit tensor.
    It can be used as a spec_attn_bias ("right" or "suffix" attention part)
    in tree_attention. See run_tree_attention_inner in test for a usage example.
    Args:
        sorted_tree_choices: tree description in the style of
            https://github.com/FasterDecoding/Medusa/blob/5e9805386/medusa/model/medusa_choices.py
            A typical tree description would look like:
            [(node0, node1, ...), (node0, node2), (node0, node3), (node1, node3), ..., (node0, node2, ..., nodeN)]
            Every tuple is corresponds to one node in the tree, encoded as a path from one of the root nodes to the
            node in question. Passed in sorted order.
            For example, a node encoded as (1, 0, 3, ..., 2) is understood as:
            list all the root nodes and take node number 1
            list all children of that node and take node number 0
            list all children of that node and take node number 3
            ...
            list all children of that node and take node number 2 - that's the node encoded by this tuple
        depth_counts: a list of integers, where the i-th element is the number of choices with depth i.
        dtype: data type of the output tensor.
        device: device of the output tensor.
    Returns:
        attention bias of shape (tree_size, tree_size),
        where tree_size is the total number of nodes in the tree.
    r
   r(   r'   r   N)r1   rO   fullinfrS   rT   index)rE   rG   r'   r(   rF   rH   mask_valrW   startjcur_tree_choiceancestor_idxcr-   r-   r.   r?     s,   r?   c                 C   s   t | d }tj||tjd}d|d< d\}}tt |D ].}|}t|| D ]}	| ||	  }
|
d | d }||||	 d < t||}q'||| 7 }q|S )a  
    Construct an index tensor for choices in the tree and their corresponding index in the draft tokens.
    Args:
        sorted_tree_choices: sorted from tree_choices input of prepare_tree_attn_metadata function
        depth_counts: a list of integers, where the i-th element is the number of choices with depth i.
        device: device of the output tensor.
    Returns:
        tree indices of shape (tree_len,). See docstring of TreeAttnMetadata for details.
    r
   r\   r   )r   r   )r1   rO   zeroslongrS   max)rE   rG   r(   rF   r   ra   max_idx_prev_levelrW   
cur_offsetrb   rc   tree_idxr-   r-   r.   r<   =  s   r<   r   c           	         s   rt dd D d nd}t}ddd D ]}|dd |v r-||dd  qg g }}|D ]+ dg fddtdt d D  }|t| ||dg|t|    q5tj|tj|d	}||fS )
a  
    Convert tree definition from the format used by Medusa and EAGLE (tree_choices, see docstring of
    TreeAttnMetadata.from_tree_choices) to a list of paths:
    [
        (node_index0_path0, node_index1_path0, ...),
        (node_index0_path1, node_index1_path1, ...),
        ...
    ]
    where each value is an index of a node inside the corresponding level of a tree.
    Returns:
        retrieval indices of shape (number of leaves, depth + 1)
        length of each path.
    c                 s       | ]}t |V  qd S r*   r0   )r7   noder-   r-   r.   	<genexpr>l      z-_prepare_retrieval_indices.<locals>.<genexpr>r
   Nrf   r   c                    s"   g | ]}  d | d qS )Nr
   )r_   )r7   levelleafr   r-   r.   r9   v  s    z._prepare_retrieval_indices.<locals>.<listcomp>r'   r(   )	ri   setremoverS   r1   rT   rO   rZ   rh   )	r   r(   
tree_depthleavesrn   pathsr   rY   paths_tensorr-   rr   r.   r=   \  s   
r=   rF   c                 C   sX   t j|t j|d}d}tt| D ]}|d ||d || |  d < || | 7 }q|S )a  
    Construct sequence position of each node within its path, can be used for positional embedding.
    Args:
        depth_counts: number of nodes at each of the levels of the tree.
        tree_len: total number of nodes in the tree including the root.
        device: device of the output tensor.
    Returns:
        tree position ids of shape (tree_len,). See docstring of TreeAttnMetadata for details.
    rt   r   r
   )rO   rg   int32rS   r1   )rG   rF   r(   tree_position_idsra   rW   r-   r-   r.   r>     s    r>   c              	   C   s\   g }| D ] }z| | |d d d  W q ty$   | d Y qw tj|tj|dS )Nrf   r
   r   rt   )rT   r_   
ValueErrorrO   rZ   rh   )rE   r(   rd   cur_medusa_choicer-   r-   r.   r@     s    r@   c                    s   g g }dg|  D ]"fddt | D }|t| |r%| q	dg q	tdd D   fddD tfddD sKJ tjtj|d	tj|tj|d	fS )
Nr-   c                    s8   g | ]\}}t  d  t |kr|dd  kr|qS )r
   Nrf   r0   )r7   rW   yr2   r-   r.   r9     s
    $z/_prepare_child_node_indices.<locals>.<listcomp>r   c                 s   rm   r*   r0   r7   r3   r-   r-   r.   ro     rp   z._prepare_child_node_indices.<locals>.<genexpr>c                    s(   g | ]}||d d  t |   qS )rf   Nr0   r   )max_num_childrenr-   r.   r9     s   ( c                 3   s$    | ]}t |t  d  kV  qdS )r   Nr0   r   )resr-   r.   ro     s   " rt   )	enumeraterT   r1   ri   allrO   rZ   rh   )r   r(   r    curr_childrenr-   )r   r   r3   r.   rA     s    
rA   c                    s(    fddt  D }tj|tj|dS )Nc                    s.   g | ]\} t  fd dd| D qS )c                 3   s(    | ]} d d |d d kV  qd S )Nrf   r-   )r7   another_node	curr_noder-   r.   ro     s
    
z4_prepare_candidate_idx.<locals>.<listcomp>.<genexpr>N)rU   )r7   curr_node_idxr   r   r.   r9     s    
z*_prepare_candidate_idx.<locals>.<listcomp>rt   )r   rO   rZ   rh   )r   r(   r!   r-   r   r.   rB     s   
rB   BG	tree_sizec                 C   s4   | | dkr
|dkp| | dk o|dk p| | dk S )n
    Heuristic to decide whether to use Triton Split-k or default (Flash Attention) for prefix attention.
       @      d      r-   )r   r   r   r-   r-   r.   use_triton_splitk_for_prefix  s
   
r   autotune	attn_biaskv_cache_dtypec                 C   s   |rt ntj}tjjr|S t|tjj	}t|tjj	}t|t
jj	}	|s*|s*|	s*|S tjjs0J t o5t }
|tjkr=|S | dkrK|
rK|rHtjS tjS t| ||rS|S dS )r   r   N)SplitKAutotuner   FwOprO   versionhip
isinstancer   FwOp_KVSplitSUPPORTED_ATTN_BIAS_TYPESr   cudar   r   uint8r   )r   r   r   r   r   r   triton_splitk_opfa3_splitkv_supportedfa3_supportedflash2_supporteduse_fa3r-   r-   r.   select_prefix_op  s*   
r   Fqspec_kspec_vcache_kcache_vspec_attn_biasprefix_attn_bias	prefix_op	suffix_opquantized_kv_scalesc           "      C   s  | j dk}|r"| d} |d|d}}|d|d}}| j\}}}}}|j\}}}}}|j\}}t|ttfrE|dksDJ n||ksKJ ||krW||krW||ksYJ |j|jksaJ ||  krk|ksyn J d|d|d|| jdd |jdd   kr|jdd ksn J d| jd	|jd
|j||||||}|	rtntj	}|du rt
||||	||j}|jtjkr(|
dusJ |tj	u sJ tj| d|| ||||d|| |||tj|d|| |||tj||
d d|| |||
d d|| ||dd}tj||d\}}||j}}n&t| d|| ||||d|| ||||d|| |||||d\}}||||||}|||||dddd}t| ||||pn|d\}}t||g||g| jd\} }!|r| d} | S )a  
    Compute Medusa/EAGLE/Hydra-style tree attention.
    Notice that this function takes as arguments biases for the left (prefix)
    and right (speculative suffix) parts of the attention.
    This way we avoid creating these biases on the fly, and
    allow this function to be used in performance-critical decoding
    jobs, including in CUDA graph mode. In the latter case one should
    construct the biases once, and update prefix_attn_bias with
    current seqlens before every graph replay; spec_attn_bias stays static,
    as it's determined by the tree structure.
    Args:
        q: query from speculative tokens, of shape (B, tree_size_q, (G), H, D)
        spec_k, spec_v: keys/values from speculative tokens, each of shape (B, tree_size_kv, (G), H, D).
            If tree_size_q < tree_size_kv, we assume the end of the query sequence aligns with end the k/v sequence,
            like in "from-bottom-right" attention masks. Such rectangular attention masks can be used when we are
            adding new nodes to the tree, and want to avoid recomputing attention for the existing nodes. For example,
            this can be used during draft token generation in EAGLE.
        cache_k/cache_v: queries/keys/values from the existing context, each of shape (B, Mk, (G), H, D)
        spec_attn_bias: attention bias of the "right" part of the attention (tree_size_q x spec tokens).
            This would typically be a an explicit tensor mask, precomputed once and not changing during decoding
        prefix_attn_bias: attention bias of the "left" part of the attention (tree_size_q x existing context).
            This bias would typically be block-diagonal padded non-causal (BlockDiagonalPaddedKeysMask), and it
            changes at every decoding step as K/V sequence lengths grow during decoding.
        prefix_op: attention backend which will be passed to memory_efficient_attention to compute prefix attention.
                   If None, will use Triton Split-K or Flash Attention depending on the heuristics.
        suffix_op: same as prefix_op, but for the suffix.
        autotune: If True, Triton Split-K will use autotuning when chosen
            as a default backend for prefix/suffix attention.
    Returns:
        attention output of shape (B, tree_size_q, (G), H, D)

    :Usage example:

        See also tree_attention_with_sync in tests/test_tree_attention.py

    .. code-block:: python

        # Create an attention bias for the prefix part of the attention
        prefix_attn_bias = BlockDiagonalPaddedKeysMask.from_seqlens(
            q_seqlen=[tree_size_q for _ in range(B)], kv_seqlen=kv_lens, kv_padding=Mk
        )
        # Create an explit attention bias for the speculative part of the attention
        spec_attn_bias = TreeAttnMetadata.from_tree_choices(tree_choices, q.dtype, q.device).attention_bias
        attn_output = tree_attention(
            q, spec_k, spec_v, cache_k, cache_v, spec_attn_bias, prefix_attn_bias
        )
    r   r   r
   ztree_size_q1=z tree_size_q=z tree_size_kv=Nzq.shape=z spec_k.shape=z spec_v.shape=r   T)queryr5   valuer   k_fp8_scale_shiftv_fp8_scale_shift
is_partial)op)r   r      )output_dtype)ndimrR   shaper   r   r   expandr   r   r   r   r'   rO   r   	InputsFp8viewr{   r	   1_memory_efficient_attention_forward_requires_gradlser   permuter   r   squeeze)"r   r   r   r   r   r   r   r   r   r   r   is_bmhkr   tree_size_qr   HDBkvMkG1H1D1tree_size_q1tree_size_kvr   fp8_inpoutctxattn_prefix
lse_prefixattn_suffix
lse_suffixattn_output_r-   r-   r.   tree_attention  s   
=

6	




r   c                   @   s   e Zd ZdZdS )r   TN)rI   rJ   rK   AUTOTUNEr-   r-   r-   r.   r     s    r   rw   	branchingc                 C   s   t |g|  dS )a  
    Construct a full tree of a given depth where each node (except for leaves) has a given number of children.
    The format is compatible with that used by Medusa and EAGLE:
    https://github.com/FasterDecoding/Medusa/blob/5e98053/medusa/model/medusa_choices.py
    For detailed description, see docstring of
    xformers.ops.tree_attention.TreeAttnMetadata.from_tree_choices .
    r   )construct_tree_choicesrw   r   r-   r-   r.   construct_full_tree_choices  s   r   c                    s@   g }t t D ]}|tj fddt |d D   q|S )zS
    Construct a tree based on given branching factor for each non-root level.
    c                    s   g | ]}t  | qS r-   )rS   )r7   kr   r-   r.   r9     s    z*construct_tree_choices.<locals>.<listcomp>r
   )rS   r1   extend	itertoolsproduct)r   choicesrW   r-   r   r.   r     s   (r   c                    s   t  fddt| D S )zi
    Number of nodes in a full tree of a given depth (including the root node) and branching factor.
    c                 3   s    | ]} | V  qd S r*   r-   )r7   rW   r   r-   r.   ro     rp   z%get_full_tree_size.<locals>.<genexpr>)rU   rS   r   r-   r   r.   get_full_tree_size  s   r   r*   )NNFN)5r   dataclassesr   	functoolsr   typingr   r   r   r   r   rO   xformers.opsr	   r   r   r   r   r   r   fmha.attn_biasr   r   r   fmha.commonr   fmha.dispatchr   r   r   rP   r(   rM   rD   r;   rC   r'   r?   r<   r=   r>   r@   rA   rB   boolr   r   r   r   r   r   r   r   r-   r-   r-   r.   <module>   s*    =
"

:

#





4

	

 
