o
     il                    @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
mZmZmZmZ ddlZdejdejfddZd	e
e d
ejdejfddZG dd dZdKde	ej dejfddZejdfddddeedf d
ejdeeejf de	e dedejfddZeG dd deZG dd deZeG dd  d eZeG d!d" d"ZeG d#d$ d$eZ eG d%d& d&eZ!eG d'd( d(eZ"eG d)d* d*e"Z#eG d+d, d,e"Z$eG d-d. d.eZ%eG d/d0 d0e%Z&eG d1d2 d2e%Z'eG d3d4 d4eZ(eG d5d6 d6e(Z)eG d7d8 d8eZ*eG d9d: d:e*Z+eG d;d< d<eZ,eG d=d> d>e,Z-eG d?d@ d@e#Z.eG dAdB dBe$Z/G dCdD dDejeZ0G dEdF dFej1j2Z3G dGdH dHe0Z4G dIdJ dJe4Z5ej67e4 ej67e5 e"e*e%e(e,fZ8dS )LaT  
This file contains biases that can be used as the `attn_bias` argument in
:attr:`xformers.ops.memory_efficient_attention`.
Essentially, a bias is a Tensor which will be added to the ``Q @ K.t`` before
computing the ``softmax``.


The goal of having custom made classes (instead of dense tensors) is that
we want to avoid having to load the biases from memory in the kernel, for
performance reasons. We also want to be able to know before-hand which
parts of the attention matrix we will need to compute (eg causal masks).


Some very common biases are LowerTriangularMask and BlockDiagonalMask.
    N)	dataclass)
AnyClassVarIterableListOptionalSequenceTupleTypeUnioncasttdevicec                 C   s4   | j |kr| S |t dkr| |S | j|ddS )NcpuTnon_blocking)r   torchto)r   r    r   O/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/fmha/attn_bias.py
_to_device'   s
   

r   seqdtypec                 C   s6   |t dkrt j| |dS t j| |ddj|ddS )Nr   )r   T)r   
pin_memoryr   )r   r   tensorr   )r   r   r   r   r   r   _to_device_tensor0   s   r   c                	   @   sL   e Zd ZdZdZejdfdeedf dej	de
eejf dejfd	d
ZdS )AttentionBiasaL  Base class for a custom bias that can be applied         as the attn_bias argument in
    :attr:`xformers.ops.memory_efficient_attention`.

    That function has the ability to add a tensor, the
    attention bias, to the QK^T matrix before it is used
    in the softmax part of the attention calculation.
    The attention bias tensor with shape
    (B or 1, n_queries, number of keys)
    can be given as the attn_bias input.
    The most common use case is for an attention bias is
    to contain only zeros and negative infinities, which forms
    a mask so that some queries only attend to some keys.

    Children of this class define alternative things which can
    be used as the attn_bias input to define an attention bias which
    forms such a mask, for some common cases.

    When using an :attr:`xformers.ops.AttentionBias`
    instead of a :attr:`torch.Tensor`, the mask matrix does
    not need to be materialized, and can be
    hardcoded into some kernels for better performance.

    See:

    - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMask`
    - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularFromBottomRightMask`
    - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias`
    - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`
    - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`

    Fr   shape.r   r   returnc                 C      t  z
        Materializes the bias as a `torch.Tensor`. This is very slow
        and we don't attempt to make it fast. Only use for debugging/testing.

        Shape should be like `[*, q_seqlen, k_seqlen]`
        NotImplementedErrorselfr   r   r   r   r   r   materialize[      zAttentionBias.materializeN)__name__
__module____qualname____doc__HOLDS_DENSE_TENSORr   float32r	   intr   r   strr   Tensorr%   r   r   r   r   r   7   s    !
r   r   c                 C   s*   | d u rt j rt dS t dS | S )Ncudar   )r   r0   is_availabler   r   r   r   r   _get_default_bias_devicej   s
   


r3   r   F)window_sizefrom_bottomrightr   .r4   r5   c                C   s   |t jur|nt j}t j| |d|d}| dd  \}}d}	|r#|| }	t j||	d|}
|d ur<t j|
|	| d d}
t |
}
|
|S )N   r   
fill_valuer   r   diagonal)r   bfloat16r,   fulltrilr   triulog)r   r   r   r4   r5   	create_asr   num_queriesnum_keysshiftmaskr   r   r   _materialize_causal_maskr   s    

rF   c                	   @   sn   e Zd ZU dZeed< eed< dddZddd	Zej	d
fde
edf dejdeeejf dejfddZdS )!LocalAttentionFromBottomRightMaska  
    A local attention mask

    The query at position :math:`q` can attend the key at position :math:`k` if
    :math:`q - window\_left <= k + s <= q + window\_right`

    With :math:`s = num\_queries - num\_keys`

    :Example:

    .. code-block:: python

        import torch
        from xformers.ops import fmha

        bias = fmha.attn_bias.LocalAttentionFromBottomRightMask(window_left=1, window_right=2)
        print(bias.materialize(shape=(4, 4)).exp())
        print(bias.materialize(shape=(4, 5)).exp())

    .. code-block:: text

        # 4x4
        tensor([[1., 1., 1., 0.],
                [1., 1., 1., 1.],
                [0., 1., 1., 1.],
                [0., 0., 1., 1.]])

        # 4x5
        tensor([[1., 1., 1., 1., 0.],
                [0., 1., 1., 1., 1.],
                [0., 0., 1., 1., 1.],
                [0., 0., 0., 1., 1.]])

    :Illustration:

    .. figure:: /_static/local_attn.png
        :width: 240px

        The total window size is :math:`window\_left + 1 + window\_right`
    window_leftwindow_rightr   c                 C   s   | S Nr   r$   r   r   r   r   r      s   z$LocalAttentionFromBottomRightMask.toNc                 C   s8   | j dk rtd| j  | jdk rtd| j d S )Nr   zrInvalid window value passed to `LocalAttentionFromBottomRightMask`: expected`window_left > 0` but got window_left=ztInvalid window value passed to `LocalAttentionFromBottomRightMask`: expected`window_right > 0` but got window_right=)rH   
ValueErrorrI   r$   r   r   r   __post_init__   s   

z/LocalAttentionFromBottomRightMask.__post_init__r   r   .r   r   c           	      C   sz   |t jur|nt j}t j||d|d}|dd  \}}|| }t j||| j d}t j||| j d}t |}|	|S )Nr6   r7   r9   r:   )
r   r<   r,   r=   r?   rH   r>   rI   r@   r   )	r$   r   r   r   rA   rE   rB   rC   rD   r   r   r   r%      s   

z-LocalAttentionFromBottomRightMask.materialize)r   rG   r   N)r'   r(   r)   r*   r-   __annotations__r   rN   r   r,   r	   r   r   r.   r   r/   r%   r   r   r   r   rG      s"   
 )


rG   c                	   @   sn   e Zd ZdZdejdd fddZejdfdee	df d	ej
deeejf dejfd
dZde	ddfddZdS )"LowerTriangularFromBottomRightMaska  
    A causal masking.

    This mask is exactly the same as :attr:`LowerTriangularMask` when there is
    the same number of queries and keys.
    When the number of queries is different from the number of keys,
    it is a triangular mask shifted so that the last query can attend to
    the last key.
    In other words, a query Q cannot attend to a key which is nearer the
    final key than Q is to the final query.


    .. figure:: /_static/causal_bottom_right.png

        The difference between :attr:`LowerTriangularMask` (left) and
        :attr:`LowerTriangularFromBottomRightMask` (right). They become
        equivalent if the number of queries equals the number of keys.
    r   r   c                 C      t | tu s
J d| S NPlease implement in subclass)typerQ   rK   r   r   r   r      s   z%LowerTriangularFromBottomRightMask.tor   r   .r   c                 C   s   t |||ddS )NT)r   r   r5   rF   r#   r   r   r   r%      s   z.LowerTriangularFromBottomRightMask.materializer4   0LowerTriangularFromBottomRightLocalAttentionMaskc                 C   s   t |S )z
        Create a new bias which combines local + causal attention.

        See :attr:`LowerTriangularFromBottomRightLocalAttentionMask`
        )rW   r$   r4   r   r   r   make_local_attention  s   z7LowerTriangularFromBottomRightMask.make_local_attentionN)r'   r(   r)   r*   r   r   r   r,   r	   r-   r   r   r.   r/   r%   rY   r   r   r   r   rQ      s&    	


rQ   c                	   @   sp   e Zd ZU dZeed< dejdd fddZddd	Z	ej
d
fdeedf dejdeeejf dejfddZdS )rW   aT  
    A mask that combines both :attr:`LowerTriangularFromBottomRightMask` and
    local attention.

    A query whose distance from the final query is X cannot attend to a key
    whose distance to the final key is either of:

    * less than X (i.e. "causal attention", same as :attr:`LowerTriangularFromBottomRightMask`)
    * greater than X + window_size (i.e. "local attention")


    .. figure:: /_static/causal_bottom_right_local.png

        The mask from :attr:`LowerTriangularFromBottomRightLocalAttentionMask`.
        The green area is calculated, and the grey area is masked out.
    _window_sizer   r   c                 C   rR   rS   )rU   rW   rK   r   r   r   r   *  s   z3LowerTriangularFromBottomRightLocalAttentionMask.toNc                 C   s   | j dkrtd| j  d S Nr   ,Expected `window_size > 0`, but window_size=)rZ   rL   rM   r   r   r   rN   2  s
   

z>LowerTriangularFromBottomRightLocalAttentionMask.__post_init__r   r   .r   c                 C      t |||| jddS NT)r   r   r4   r5   rF   rZ   r#   r   r   r   r%   8     z<LowerTriangularFromBottomRightLocalAttentionMask.materializerO   )r'   r(   r)   r*   r-   rP   r   r   r   rN   r,   r	   r   r   r.   r/   r%   r   r   r   r   rW     s(   
 

	
rW   c                   @   s   e Zd ZU dZejed< eed< eed< ee ed< dej	dd fdd	Z
deeeef  fd
dZedee dej	deeeee ejf fddZedddee deej	 dd fddZ	ddejdeee  deej fddZdS )_SeqLenInfoam  
    (Internal) Represents the division of a dimension into blocks.

    For example, to represents a dimension of length 7 divided into
    three blocks of lengths 2, 3 and 2, use `from_seqlength([2, 3, 2])`.
    The members will be:
        max_seqlen: 3
        min_seqlen: 2
        seqstart_py: [0, 2, 5, 7]
        seqstart: torch.IntTensor([0, 2, 5, 7])
    seqstart
max_seqlen
min_seqlenseqstart_pyr   r   c                 C   sB   t | tu s
J d| jj|kr| S tt| j|| j| j| jdS )NrT   )rb   rc   rd   re   )rU   ra   rb   r   r   rc   rd   re   rK   r   r   r   r   Z  s   
z_SeqLenInfo.toc                 c   s"    t | j| jdd  E d H  d S Nr6   )zipre   rM   r   r   r   	intervalse  s    z_SeqLenInfo.intervalsseqlensc                C   s~   t |tjrJ dg}d}d}|D ]}|dkrt||n|}t||}||t|d  |  qt|tj|d}||||fS )z
        Given sequence lengths, returns the min/max value and the sequence start
        positions (offsets), with first element being 0 (returned in list and Tensor).
        r   r6   r   r   )	
isinstancer   r/   minmaxappendlenr   int32)clsri   r   re   rc   rd   seqlenrb   r   r   r   _get_seqstarth  s   	
z_SeqLenInfo._get_seqstartNr2   c                C   s.   t |}| j||d\}}}}| ||||dS )zD
        Input tensors are assumed to be in shape [B, M, *]
        r2   )rc   rd   rb   re   )r3   rt   )rr   ri   r   rd   rc   re   rb   r   r   r   from_seqlens}  s   z_SeqLenInfo.from_seqlensxbatch_sizesc                 C   s   | j d |jd ks|jd dkr#td|j d| j d  d| j  |d u r1dgt| j d  }g }d}|D ]}|| j ||  | j |   ||7 }q7dd t||j|dd	D S )
Nrj   r6   r   z Invalid `torch.Tensor` of shape z+, expected format (B, M, *) with B=1 and M=z
 seqstart: c                 S   s,   g | ]\}}| |d g|jdd qS )rj      Nreshaper   ).0bsr   r   r   r   
<listcomp>  s    z%_SeqLenInfo.split.<locals>.<listcomp>dim)re   r   rL   rp   ro   rg   split)r$   rv   rw   split_chunksit
batch_sizer   r   r   r     s(   "

z_SeqLenInfo.splitrJ   )r'   r(   r)   r*   r   r/   rP   r-   r   r   r   r   r	   rh   classmethodrt   r   ru   r   r   r   r   r   r   ra   G  sB   
 

ra   c                
       s   e Zd ZU dZejed< ee ed< eed< dddZ	d	ej
dd fd
dZdeeeef  f fddZedddee d	eej
 ddfddZedddee ded	eej
 dd fddZ	ddejdeee  deej fddZ  ZS )_PaddedSeqLenInfouW  
    (Internal)  Represents the division of a dimension into blocks which are
    padded out to the same total length.

    For example, to represent a dimension of length 12 with space for
    three blocks of length 4, but where the occupied lengths are
    2, 3 and 2, use `from_seqlens_padded([2, 3, 2], 4)`.

    The layout along the dimension is

     0 ─►  block 0
           block 0
           <space>
           <space>
     4 ─►  block 1
           block 1
           block 1
           <space>
     8 ─►  block 2
           block 2
           <space>
           <space>
    12 ─►

    The members will be:
        max_seqlen: 3
        min_seqlen: 2
        seqstart_py: [0, 4, 8, 12]
        seqstart: torch.IntTensor([0, 4, 8, 12])
        seqlen_py: [2, 3, 2]
        seqlen: torch.IntTensor([2, 3, 2])
        padding: 4
    rs   	seqlen_pypaddingr   Nc                 C   s    t | jt | jd ksJ d S rf   )rp   re   r   rM   r   r   r   rN     s    z_PaddedSeqLenInfo.__post_init__r   c              	   C   sT   t | tu s
J d| jj|kr| S tt| j|| j| j| jt| j|| j	| j
dS )NrT   )rb   rc   rd   re   rs   r   r   )rU   r   rs   r   r   rb   rc   rd   re   r   r   rK   r   r   r   r     s   

z_PaddedSeqLenInfo.toc                 #   4    t t  | jD ]\\}}}||| fV  q
d S rJ   rg   superrh   r   r$   start_length	__class__r   r   rh        z_PaddedSeqLenInfo.intervalsr2   ri   ra   c                C      t d)NzPUse either `_SeqLenInfo.from_seqlens` or `_PaddedSeqLenInfo.from_seqlens_padded`)RuntimeErrorrr   ri   r   r   r   r   ru     s   z_PaddedSeqLenInfo.from_seqlensc             
      s   t |tjrJ t fdd|D sJ d| d  t|}ttdt|  d  }t|tj	|d}| |t |tr@|nt|t
|t|t|tj	|d| dS )	zz
        Input tensors are assumed to be in shape [B, M, *]
        seqstart = padding * torch.arange(batch_size)
        c                 3   s    | ]}| kV  qd S rJ   r   )r{   rs   r   r   r   	<genexpr>  s    
z8_PaddedSeqLenInfo.from_seqlens_padded.<locals>.<genexpr>zSeqlens z	 Padding r   r6   rk   )rs   r   rc   rd   rb   re   r   )rl   r   r/   allr3   listrangerp   r   rq   rn   rm   )rr   ri   r   r   re   rs   r   r   r   from_seqlens_padded  s$   
z%_PaddedSeqLenInfo.from_seqlens_paddedrv   rw   c                 C   r   )N_PaddedSeqLenInfo.splitr!   r$   rv   rw   r   r   r   r        r   rO   rJ   )r'   r(   r)   r*   r   r/   rP   r   r-   rN   r   r   r   r	   rh   r   r   ru   r   r   r   __classcell__r   r   r   r   r     sH   
 
"

r   c                       s   e Zd ZU dZejed< ee ed< dej	dd fddZ
deeeef  f fdd	Zed
ddee deej	 ddfddZedee dee dedej	dd f
ddZ	
ddejdeee  deej fddZ  ZS )_GappySeqInfou  
    (Internal) Flexible equivalent of _PaddedSeqLenInfo. There are two
    distinct semantics.

    (1) For non-paged masks:
    Represents the division of a dimension into blocks which are
    anywhere. Each just has a start and a length. The final start is the total
    length of the dimension.

    For example, to represent a dimension of length 14 like follows with
    three occupied lengths of
    6, 3 and 1, use `from_seqlens_padded([0, 7, 12, 14], [6, 3, 1])`.

    The layout along the dimension is

     0 ─►  block 0
           block 0
           block 0
           block 0
     4 ─►  block 0
           block 0
           <space>
           block 1
     8 ─►  block 1
           block 1
           <space>
           <space>
     12 ─► block 2
           <space>

    The members will be:
        max_seqlen: 6
        min_seqlen: 1
        seqstart_py: [0, 7, 12, 14]
        seqstart: torch.IntTensor([0, 7, 12, 14])
        seqlen_py: [6, 3, 1]
        seqlen: torch.IntTensor([6, 3, 1])

    (2) For paged masks:
    The notional space is divided into batch-size-many blocks.
    seqstart and seqstart_py is an offset in the block, not in
    the whole space, and doesn't have an extra last element.
    Otherwise as above.
    rs   r   r   r   c                 C   sP   t | tu s
J d| jj|kr| S tt| j|| j| j| jt| j|| j	dS )NrT   )rb   rc   rd   re   rs   r   )
rU   r   rs   r   r   rb   rc   rd   re   r   rK   r   r   r   r   J  s   

z_GappySeqInfo.toc                 #   r   rJ   r   r   r   r   r   rh   Y  r   z_GappySeqInfo.intervalsNr2   ri   ra   c                C   r   rJ   r!   r   r   r   r   ru   ]  s   z_GappySeqInfo.from_seqlens	seqstartspagedc             
   C   s   t |tjrJ t|}t|dkrtdt|t| |r!dndkr7|r(dnd}td| d| d| t|tj|d	}| ||t|t	|t|tj|d	|d
S )Nr   zNo elementsr6    z1 + zlen(seqstarts)=z should be zlen(seqlens)=rk   )rs   r   rc   rd   rb   re   )
rl   r   r/   r   rp   rL   r   rq   rn   rm   )rr   r   ri   r   r   re   extrars   r   r   r   from_seqlens_gappyc  s$   	z _GappySeqInfo.from_seqlens_gappyrv   rw   c                 C   r   )N_GappySeqInfo.splitr!   r   r   r   r   r     r   r   rJ   )r'   r(   r)   r*   r   r/   rP   r   r-   r   r   r   r	   rh   r   r   ru   boolr   r   r   r   r   r   r   r   r     sF   
 
-
r   c                   @   s  e Zd ZU dZeed< eed< dZeee	  ed< d3ddZ
ejd	fd
ee	df dejdeeejf dejfddZejd	fd
ee	df dejdeeejf dejfddZe	d4dddee	 deee	  deej dd fddZedeej ded ejf fddZe	d4deej deej deeej  ded ejejeej f fddZdejdeej fd d!Zdejdeej fd"d#Zdejdeej fd$d%Zd5d'd(Zd6d*d+Zd,e	dd-fd.d/Zd,e	dd0fd1d2ZdS )7BlockDiagonalMaska  
    A block-diagonal mask that can be passed as ``attn_bias``
    argument to :attr:`xformers.ops.memory_efficient_attention`.

    Queries and Keys are each divided into the same number of blocks.
    Queries in block i only attend to keys in block i.

    .. figure:: /_static/block_diag_bias.png

        This bias can be used to handle a batch of sequences of
        different lengths, via :attr:`BlockDiagonalMask.from_tensor_list`

    :Example:

    .. code-block:: python

        import torch
        from xformers.ops import fmha

        K = 16
        dtype = torch.float16
        device = "cuda"
        list_x = [
            torch.randn([1, 3, 1, K], dtype=dtype, device=device),
            torch.randn([1, 6, 1, K], dtype=dtype, device=device),
            torch.randn([1, 2, 1, K], dtype=dtype, device=device),
        ]
        attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x)
        linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype)

        q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2)
        out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        list_out = attn_bias.split(out)
        print(list_out[0].shape)  # [1, 3, 1, K]
        assert tuple(list_out[0].shape) == (1, 3, 1, K)

    	q_seqinfo	k_seqinfoN_batch_sizesr   c                 C   4   t | tu s
J dt| j|| j|| jdS NrT   r   r   r   )rU   r   r   r   r   r   rK   r   r   r   r        

zBlockDiagonalMask.tor   r   .r   r   c                 C   s   t j|||dS Nrk   )r   zerosr#   r   r   r   _create_block_mask  s
   z$BlockDiagonalMask._create_block_maskc                 C   s  |d | j jd ksJ |d | j jd f|d | jjd ks,J |d | jjd ftj|dd ||d}|tj  tt	| j
 | j 
 D ] \}\\}}\}}	| j|| |	| f||d|||||	f< qLtt|d D ]}
|d}qu||S )8Materialize the attention bias - for debugging & testingrj   r9   Nrk   rx   r   )r   re   r   r   emptyfill_mathinf	enumeraterg   rh   r   r   rp   	unsqueezeexpandr$   r   r   r   rE   iq_startq_endk_startk_endr   r   r   r   r%     s0   


zBlockDiagonalMask.materializer2   q_seqlen	kv_seqlenc                C   sb   t |}|du st|t|ksJ tj||d}|du s!||kr$|}ntj||d}| ||dS )a  Creates a :attr:`BlockDiagonalMask` from a list of tensors lengths for query and key/value.

        Args:
            q_seqlen (Union[Sequence[int], torch.Tensor]): List or tensor of sequence lengths for query tensors
            kv_seqlen (Union[Sequence[int], torch.Tensor], optional): List or tensor of sequence lengths for key/value.
                    (Defaults to ``q_seqlen``.)
        Returns:
            BlockDiagonalMask
        Nr2   r   r   )r3   rp   ra   ru   )rr   r   r   r   r   r   r   r   r   ru     s   zBlockDiagonalMask.from_seqlenstensorsc           	      C   sx   dd |D }g }|D ]}t |jd D ]
}||jd  qq| |}||_tdd |D }tj|dd}||fS )aR  Creates a :attr:`BlockDiagonalMask` from a list of tensors, and returns the tensors
        concatenated on the sequence length dimension

        .. figure:: /_static/block_diag_cat_split.png

            See also :attr:`BlockDiagonalMask.split` to split the returned
            :attr:`torch.Tensor` back to a list of tensors of varying sequence length

        Args:
            tensors (Sequence[torch.Tensor]): A list of tensors of shape ``[B, M_i, *]``.
                All tensors should have the same dimension and the same batch size ``B``, but
                they can have different sequence length ``M``.

        Returns:
            Tuple[BlockDiagonalMask, torch.Tensor]: The corresponding bias for the attention
            along with `tensors` concatenated on the sequence length dimension, with shape ``[1, sum_i{M_i}, *]``
        c                 S      g | ]}|j d  qS r   r   r{   r   r   r   r   r}         z6BlockDiagonalMask.from_tensor_list.<locals>.<listcomp>r   r6   c                 s   s,    | ]}| d dg|jdd V  qdS r6   rj   rx   Nry   r{   rv   r   r   r   r     s   * z5BlockDiagonalMask.from_tensor_list.<locals>.<genexpr>r~   )r   r   ro   ru   r   tupler   cat)	rr   r   rw   ri   rv   r   
block_diagtensors_bs1concat_tensorsr   r   r   from_tensor_list  s   
z"BlockDiagonalMask.from_tensor_list	tensors_q	tensors_k	tensors_vc                 C   sD  t |t |ks
J |d u st |t |ksJ dd |D }g g }}tt||D ]B\}\}}	|jd |	jd ks=J ||jd g|jd  7 }||	jd g|	jd  7 }|d u sm|| jd d |	jd d ksmJ q+| ||}
||
_|
tjdd |D ddtjdd |D dd|d urtjd	d |D ddfS d fS )
Nc                 S   r   r   r   r   r   r   r   r}   )  r   z;BlockDiagonalMask.from_tensor_lists_qkv.<locals>.<listcomp>r   r6   rx   c                 S   (   g | ]}| d dg|jdd qS r   ry   r   r   r   r   r}   4     ( r~   c                 S   r   r   ry   r   r   r   r   r}   5  r   c                 S   r   r   ry   r   r   r   r   r}   6  r   )rp   r   rg   r   ru   r   r   r   )rr   r   r   r   rw   	q_seqlens
kv_seqlensr   qkr   r   r   r   from_tensor_lists_qkv   s(   
,z'BlockDiagonalMask.from_tensor_lists_qkvr   c                 C      | j || jS rJ   )r   r   r   r$   r   r   r   r   split_queries;     zBlockDiagonalMask.split_queriesc                 C   r   rJ   )r   r   r   r   r   r   r   split_kv>  r   zBlockDiagonalMask.split_kvc                 C   s    | j | ju sJ | j || jS )a%  The inverse operation of :attr:`BlockDiagonalCausalMask.from_tensor_list`

        Args:
            tensor (torch.Tensor): Tensor of tokens of shape ``[1, sum_i{M_i}, *]``

        Returns:
            Sequence[torch.Tensor]: A list of tokens with possibly different sequence lengths
        )r   r   r   r   r   r   r   r   r   A  s   	zBlockDiagonalMask.splitBlockDiagonalCausalMaskc                 C      t | j| j| jdS )zMakes each block causalr   )r   r   r   r   rM   r   r   r   make_causalM  
   zBlockDiagonalMask.make_causal&BlockDiagonalCausalFromBottomRightMaskc                 C   r   )z9Makes each block causal with a possible non-causal prefixr   )r   r   r   r   rM   r   r   r   make_causal_from_bottomrightU  r   z.BlockDiagonalMask.make_causal_from_bottomrightr4   %BlockDiagonalCausalLocalAttentionMaskc                 C      t | j| j| j|dS )z:Experimental: Makes each block causal with local attentionr   r   r   rZ   )r   r   r   r   rX   r   r   r   rY   ]     z&BlockDiagonalMask.make_local_attention4BlockDiagonalCausalLocalAttentionFromBottomRightMaskc                 C   r   )zSExperimental: Makes each block causal with local attention, start from bottom rightr   )r   r   r   r   rX   r   r   r   %make_local_attention_from_bottomrighth  r   z7BlockDiagonalMask.make_local_attention_from_bottomright)r   r   rJ   r   r   r   r   ) r'   r(   r)   r*   ra   rP   r   r   r   r-   r   r   r,   r	   r   r   r.   r   r/   r   r%   r   ru   r   r   r   r   r   r   r   rY   r   r   r   r   r   r     s   
 &




 
 


r   c                	   @   sR   e Zd ZdZdddZejdfdeedf dej	d	e
eejf dejfd
dZdS )r   ah  
    Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal.

    Queries and Keys are each divided into the same number of blocks.
    A query Q in block i cannot attend to a key which is not in block i,
    nor one which is farther from the initial key in block i than Q
    is from the initial query in block i.
    r   c                 C   r   r   )rU   r   r   r   r   r   rK   r   r   r   r     r   zBlockDiagonalCausalMask.tor   r   .r   r   c                 C      t  j|||dS r   )LowerTriangularMaskr%   r#   r   r   r   r     s
   z*BlockDiagonalCausalMask._create_block_maskNr   )r'   r(   r)   r*   r   r   r,   r	   r-   r   r   r.   r   r/   r   r   r   r   r   r   t  s    
	
r   c                	   @   s\   e Zd ZdZdddZdddZejdfd	ee	d
f dej
deeejf dejfddZdS )r   a  
    Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal.
    This mask allows for a non-causal prefix
    NOTE: Each block should have `num_keys >= num_queries` otherwise the forward pass is not
    defined (softmax of vector of `-inf` in the attention)

    Queries and keys are each divided into the same number of blocks.
    A query Q in block i cannot attend to a key which is not in block i,
    nor one which nearer the final key in block i than Q is to the
    final query in block i.
    r   c                 C   r   r   )rU   r   r   r   r   r   rK   r   r   r   r        

z)BlockDiagonalCausalFromBottomRightMask.toNc              	   C   sh   t t| j | j D ]$\}\\}}\}}|| }|| }||k r1td| d| d| dqd S )NzBlock #z has num_keys=z and num_queries=z$. Expected `num_keys >= num_queries`)r   rg   r   rh   r   rL   )r$   r   r   r   r   r   rB   rC   r   r   r   rN     s   z4BlockDiagonalCausalFromBottomRightMask.__post_init__r   r   .r   r   c                 C   r   Nr   r   r   rQ   r%   r#   r   r   r   r        z9BlockDiagonalCausalFromBottomRightMask._create_block_maskr   rO   )r'   r(   r)   r*   r   rN   r   r,   r	   r-   r   r   r.   r   r/   r   r   r   r   r   r     s    



r   c                   @   s   e Zd ZU dZeed< eed< dddZej	dfde
ed	f d
ejdeeejf dejfddZej	dfde
ed	f d
ejdeeejf dejfddZe	ddddee dedee dedeej dd fddZdejdeded ddfddZdS ) BlockDiagonalPaddedKeysMaskax  
    Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`,
    except we support padding for k/v

    The keys and values are divided into blocks which are padded out to
    the same total length.
    For example, if there is space for 12 keys, for three blocks of
    max length 4, but we only want to use the first 2, 3 and 2
    of each block, use `kv_padding=4` and `kv_seqlens=[2, 3, 2]`.
    The queries are divided into blocks, without padding, of lengths given by
    q_seqlen.

    A query Q in block i cannot attend to a key which is not in block i,
    nor one which is not in use (i.e. in the padded area).
    r   r   r   c                 C   0   t | tu s
J dt| j|| j|dS NrT   r   )rU   r   r   r   r   rK   r   r   r   r     
   

zBlockDiagonalPaddedKeysMask.tor   r   .r   r   c                 C   s   t jd||dS )Ng        )r   r   )r   r   r#   r   r   r   r     s   z.BlockDiagonalPaddedKeysMask._create_block_maskc                 C   s   |d | j jd krtd|d | jjd krtdtj|dd ||d}|tj  t	t
| j | j  D ] \}\\}}\}}	| j|| |	| f||d|||||	f< q<tt|d D ]}
|d}qe||S )	r   rj   k shapes wrongr9   q shapes wrongNrk   rx   r   )r   re   rL   r   r   r   r   r   r   r   rg   rh   r   r   rp   r   r   r   r   r   r   r%     s(   
z'BlockDiagonalPaddedKeysMask.materializeNr2   r   
kv_paddingr   causal_diagonalc                C   sV   t |}|du st|t|ksJ ||ftj||d}tj|||d}| ||dS )a  Creates a :attr:`BlockDiagonalPaddedKeysMask` from a list of tensor
        lengths for query and key/value.

        Args:
            q_seqlen (Sequence[int]): List or tensor of sequence lengths for query tensors
            kv_padding (int): Padding for k/v - also an upperbound on each individual key length
            kv_seqlen (Sequence[int]): List or tensor of sequence lengths for key/value.
            causal_diagonal: unused, for BC only
        Returns:
            BlockDiagonalPaddedKeysMask
        Nr2   r   )r3   rp   ra   ru   r   r   rr   r   r   r   r   r   r   r   r   r   r   ru     s   z(BlockDiagonalPaddedKeysMask.from_seqlensblock_tables	page_size
paged_type PagedBlockDiagonalPaddedKeysMaskc                 C   s*   || j | j||d}|jd | |j_|S )Nr   r   r   r   r6   )r   r   r   r   )r$   r   r   r   
paged_biasr   r   r   
make_paged&  s   z&BlockDiagonalPaddedKeysMask.make_paged)r   r   rJ   )r'   r(   r)   r*   ra   rP   r   r   r   r,   r	   r-   r   r   r.   r   r/   r   r%   r   r   r   r   ru   r
   r  r   r   r   r   r     sj   
 





r   c                   @   s   e Zd ZU dZdZeed< dddZej	dfde
ed	f d
ejdeeejf dejfddZe	ddddee dedee dedeej dd fddZdS )+BlockDiagonalCausalWithOffsetPaddedKeysMaska  
    Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`,
    except an offset on causality is allowed for each block and we support padding for k/v

    The keys and values are divided into blocks which are padded out to
    the same total length.
    For example, if there is space for 12 keys, for three blocks of
    max length 4, but we only want to use the first 2, 3 and 2
    of each block, use `kv_padding=4` and `kv_seqlens=[2, 3, 2]`.
    The queries are divided into blocks, without padding, of lengths given by
    q_seqlen.

    A query Q in block i cannot attend to a key which is not in block i,
    nor one which is not in use (i.e. in the padded area),
    nor one which is nearer to the final key in block i
    than Q is to the final query in block i.
    Nr   r   c                 C   r   r   )rU   r  r   r   r   rK   r   r   r   r   L     

z.BlockDiagonalCausalWithOffsetPaddedKeysMask.tor   r   .r   r   c                 C   r   r   r   r#   r   r   r   r   U  r   z>BlockDiagonalCausalWithOffsetPaddedKeysMask._create_block_maskr2   r   r   r   c                C   sV   |du st |t |ksJ ||ft|}tj||d}tj|||d}| ||dS )a#  Creates a :attr:`BlockDiagonalCausalWithOffsetPaddedKeysMask` from a list of tensor
        lengths for query and key/value.

        Args:
            q_seqlen (Sequence[int]): List or tensor of sequence lengths for query tensors
            kv_padding (int): Padding for k/v - also an upperbound on each individual key length
            kv_seqlen (Sequence[int]): List or tensor of sequence lengths for key/value.
            causal_diagonal: unused, for BC only
        Returns:
            BlockDiagonalCausalWithOffsetPaddedKeysMask
        Nr2   r   )rp   r3   ra   ru   r   r   r   r   r   r   ru   _  s   z8BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens)r   r  rJ   )r'   r(   r)   r*   r   r   rP   r   r   r,   r	   r-   r   r   r.   r   r/   r   r   r   r   ru   r   r   r   r   r  6  s@   
 



r  c                   @   s   e Zd ZU dZeed< dddZejdfde	edf d	ej
d
eeejf dejfddZedee dedee dedd f
ddZdS )/BlockDiagonalCausalLocalAttentionPaddedKeysMaska  
    Like :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalWithOffsetPaddedKeysMask`,
    except with a window size.

    A query Q in block i cannot attend to a key which is not in block i,
    nor one which is not in use (i.e. in the padded area),
    nor one which is nearer to the final key in block i
    than Q is to the final query in block i, nor one that is more than
    window_size further from the final key in block i than Q is
    to the final query in block i.
    rZ   r   c                 C   r   )NrT   r   r   rZ   )rU   r  r   r   r   rZ   rK   r   r   r   r     r   z2BlockDiagonalCausalLocalAttentionPaddedKeysMask.tor   r   .r   r   c                 C   r]   )NT)r   r   r   r4   r5   r_   r#   r   r   r   r     r`   zBBlockDiagonalCausalLocalAttentionPaddedKeysMask._create_block_maskr   r   r   r4   c                 C   sH   |d u st |t |ksJ ||ft|}t||}| |||dS )Nr  )rp   ra   ru   r   r   )rr   r   r   r   r4   r   r   r   r   r   from_seqlens_local  s   
zBBlockDiagonalCausalLocalAttentionPaddedKeysMask.from_seqlens_localN)r   r  )r'   r(   r)   r*   r-   rP   r   r   r,   r	   r   r   r.   r   r/   r   r   r   r	  r   r   r   r   r    s6   
 


r  c                   @   s   e Zd ZU dZeed< eed< ejed< e	ed< e
Zeee
  ed< dejdd fd	d
Zejdfdee	df dejdeeejf dejfddZedddee	 dee	 dejde	deej dd fddZdS )r  a  
    Same as BlockDiagonalPaddedKeysMask, but for paged attention.
    block_tables has shape [batch_size, max_num_pages] and K/V have shape
    [1, max_num_pages * page_size, num_heads, head_dim]
    or [1, max_num_pages * page_size, num_groups, num_heads, head_dim]
    r   r   r   r   _UNPAGED_TYPEr   r   c                 C   >   t | tu s
J dt| j|| j|t| j|| jdS NrT   r  )rU   r  r   r   r   r   r   r   rK   r   r   r   r        


z#PagedBlockDiagonalPaddedKeysMask.tor   r   .r   c                 C   s2  | j jd | j }| j| jt| jj|d}|	|||}t
t| j   d }|| j }tj|jdd |f ||d}	|	tj  t| j D ]I\}
\}}t| j jd D ]:}t
t| j |
 |  }|
| || j  }|| j }|| j }|| j }|d||||f |	d||||f< q[qM|	S )r   r6   r   Nrj   rk   .)r   r   r   r
  r   r   r   r   r   r%   r   r-   rn   itemr   r   r   r   r   r   rh   r   )r$   r   r   r   max_row_lenbias_nonpagedmask_nonpagedn_used_blocksmax_physical_len
mask_pagedbr   r   logical_page_idxphysical_page_idxk_logical_startk_logical_endk_physical_startk_physical_endr   r   r   r%     s:   




z,PagedBlockDiagonalPaddedKeysMask.materializeNr2   r   r   c                C   s\   t |t |ksJ ||ft|}tj||d}tj||jd | |d}| ||||dS )a  Creates a :attr:`PagedBlockDiagonalPaddedKeysMask` from a list of tensor
        lengths for query and key/value.

        Args:
            q_seqlen (Sequence[int]): List or tensor of sequence lengths for query tensors
            kv_padding (int): Padding for k/v - also an upperbound on each individual key length
            kv_seqlen (Sequence[int]): List or tensor of sequence lengths for key/value.
            causal_diagonal: unused, for BC only
        Returns:
            PagedBlockDiagonalPaddedKeysMask
        r2   r6   )r   r   r  )rp   r3   ra   ru   r   r   r   )rr   r   r   r   r   r   r   r   r   r   r   ru     s   z-PagedBlockDiagonalPaddedKeysMask.from_seqlens)r'   r(   r)   r*   ra   rP   r   r   r/   r-   r   r
  r   r
   r   r   r,   r	   r   r   r.   r%   r   r   r   ru   r   r   r   r   r    sJ   
 


'r  c                   @   (   e Zd ZdZeZdejdd fddZdS )0PagedBlockDiagonalCausalWithOffsetPaddedKeysMaska   
    Same as BlockDiagonalCausalWithOffsetPaddedKeysMask, but for paged attention.
    block_tables has shape [batch_size, max_num_pages] and K/V have shape
    [1, max_num_pages * page_size, num_heads, head_dim]
    or [1, max_num_pages * page_size, num_groups, num_heads, head_dim]
    r   r   c                 C   r  r  )rU   r  r   r   r   r   r   r   rK   r   r   r   r   0     


z3PagedBlockDiagonalCausalWithOffsetPaddedKeysMask.toN)	r'   r(   r)   r*   r  r
  r   r   r   r   r   r   r   r  #  s    r  c                   @   s   e Zd ZU dZeed< eed< dejdd fddZ	ej
dfd	eed
f dejdeeejf dejfddZedddee dee dee deej dd f
ddZdejdededed def
ddZdS )BlockDiagonalGappyKeysMaskz
    Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`,
    except k/v is gappy.

    A query Q in block i only attends to a key which is in block i.
    r   r   r   r   c                 C   r   r   )rU   r  r   r   r   rK   r   r   r   r   J  r   zBlockDiagonalGappyKeysMask.tor   r   .r   c           
      C   s   |d | j jd krtd|| j f|d | jjd kr$td|| jftj|dd ||d}|tj  t	| j
 | j 
 D ]\\}}\}}d|||||f< qBtt|d D ]}	|d}q]||S )	r   rj   r   r9   r   Nrk   r   rx   )r   re   rL   r   r   r   r   r   r   rg   rh   r   rp   r   r   )
r$   r   r   r   rE   r   r   r   r   r   r   r   r   r%   Q  s   
z&BlockDiagonalGappyKeysMask.materializeNr2   r   kv_seqstartsr   c                C   sP   t |t |ksJ ||ft|}tj||d}tj||d|d}| ||dS )ztCreates a :attr:`BlockDiagonalGappyKeysMask` from a list of tensor
        lengths for query and key/value.
        r2   Fr   )rp   r3   ra   ru   r   r   )rr   r   r   r   r   r   r   r   r   r   ru   g  s   z'BlockDiagonalGappyKeysMask.from_seqlensr   r   notional_paddingr   PagedBlockDiagonalGappyKeysMaskc                    s   |j d |  fddt| jjdd D }t fdd|D s%J tj|| jjd|jd	}| jj	 ks9J || j
|||d
}|S )z
        Assuming our keys actually live in separate blocks of length
        notional_padding, convert to a Paged version.
        r6   c                    s   g | ]
\}}||   qS r   r   r{   r   r   )r!  r   r   r}         
z9BlockDiagonalGappyKeysMask.make_paged.<locals>.<listcomp>Nrj   c                 3   s(    | ]}d |  ko k n  V  qdS )r   Nr   )r{   r   r  r   r   r     s   & z8BlockDiagonalGappyKeysMask.make_paged.<locals>.<genexpr>Tr2   r  )r   r   r   re   r   r   r   r   r   rc   r   )r$   r   r   r!  r   new_seqstartsr   r  r   )r  r!  r   r  ~  s    
z%BlockDiagonalGappyKeysMask.make_paged)r'   r(   r)   r*   ra   rP   r   r   r   r   r,   r	   r-   r   r   r.   r/   r%   r   r   r   ru   r
   r   r  r   r   r   r   r  >  sP   
 


r  c                	   @   s\   e Zd ZdZdejdd fddZejdfdee	df d	ej
deeejf dejfd
dZdS )*BlockDiagonalCausalWithOffsetGappyKeysMaska  
    Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`,
    except k/v is gappy.

    A query Q in block i cannot attend to a key which is not in block i,
    nor one which is nearer to the final key in block i
    than Q is to the final query in block i.
    r   r   c                 C   r   r   )rU   r'  r   r   r   rK   r   r   r   r     r  z-BlockDiagonalCausalWithOffsetGappyKeysMask.tor   r   .r   c                 C   s   |d | j jd krtd|d | jjd krtdtj|dd ||d}|tj  t	t
| j | j  D ]!\}\\}}\}}	t j|| |	| f||d|||||	f< q<tt|d D ]}
|d	}qf||S )
r   rj   r   r9   r   Nrk   r   rx   r   )r   re   rL   r   r   r   r   r   r   r   rg   rh   rQ   r%   r   rp   r   r   r   r   r   r   r%     s*   
z6BlockDiagonalCausalWithOffsetGappyKeysMask.materializeN)r'   r(   r)   r*   r   r   r   r,   r	   r-   r   r   r.   r/   r%   r   r   r   r   r'    s    	
r'  c                   @   s   e Zd ZU dZeed< eed< ejed< e	ed< e
Zeee
  ed< dejdd fd	d
Zejdfdee	df dejdeeejf dejfddZedddee	 dee	 dee	 dejde	deej dd fddZdS )r"  a  
    Equivalent BlockDiagonalGappyKeysMask, but for paged attention.
    block_tables has shape [batch_size, max_num_pages] and K/V have shape
    [1, max_num_pages * page_size, num_heads, head_dim]
    or [1, max_num_pages * page_size, num_groups, num_heads, head_dim]
    r   r   r   r   r
  r   r   c                 C   r  r  )rU   r"  r   r   r   r   r   r   rK   r   r   r   r     r  z"PagedBlockDiagonalGappyKeysMask.tor   r   .r   c              	      sb  | j jd | j   fddt| jjD |d g }| j| jtj	|| jj
dt|dd}||||}tt| j   d }|| j }tj|jdd |f ||d	}	|	tj  t| j D ]I\}
\}}t| j jd D ]:}tt| j |
 |  }|
  || j  }|| j }|| j }|| j }|d
||||f |	d
||||f< qsqe|	S )r   r6   c                    s   g | ]
\}}||   qS r   r   r#  r%  r   r   r}     r$  z?PagedBlockDiagonalGappyKeysMask.materialize.<locals>.<listcomp>rj   Fr2   r   Nrk   .)r   r   r   r   r   re   r
  r   r   r   r   r   r   r%   r   r-   rn   r  r   r   r   r   rh   r   )r$   r   r   r   r&  r  r  r  r  r  r  r   r   r  r  r  r  r  r  r   r%  r   r%     sJ   


	



z+PagedBlockDiagonalGappyKeysMask.materializeNr2   r   r   r   c          	      C   sr   t |t |  krt |ksn J |||f|du r|jn|}tj||d}tj||d|d}| ||||dS )a  Creates a :attr:`PagedBlockDiagonalGappyKeysMask` from a list of tensor
        lengths for query and key/value.

        Note that unlike :attr:`BlockDiagonalGappyKeysMask`, kv_seqstarts is
        addressing in a different space for each batch element. For example
        if you were doing a BlockDiagonalPaddedKeysMask with two batch
        elements and padding=100, but wanted to change it so that the first
        key is ignored, then you would use BlockDiagonalGappyKeysMask with kv_seqstarts
        [1, 101, 200]. But if you were using PagedBlockDiagonalPaddedKeysMask
        but wanted to ignore the first key, you would provide this function with
        kv_seqstarts = [1, 1].
        Nr2   Tr  )rp   r   ra   ru   r   r   )	rr   r   r   r   r   r   r   r   r   r   r   r   ru     s    $z,PagedBlockDiagonalGappyKeysMask.from_seqlens)r'   r(   r)   r*   ra   rP   r   r   r/   r-   r  r
  r   r
   r   r   r,   r	   r   r   r.   r%   r   r   r   ru   r   r   r   r   r"    sN   
 


.		r"  c                   @   r  )/PagedBlockDiagonalCausalWithOffsetGappyKeysMaska  
    Same as BlockDiagonalCausalWithOffsetGappyKeysMask, but for paged attention.
    block_tables has shape [batch_size, max_num_pages] and K/V have shape
    [1, max_num_pages * page_size, num_heads, head_dim] or
    [1, max_num_pages * page_size, num_groups, num_heads, head_dim]
    r   r   c                 C   r  r  )rU   r(  r   r   r   r   r   r   rK   r   r   r   r   M  r  z2PagedBlockDiagonalCausalWithOffsetGappyKeysMask.toN)	r'   r(   r)   r*   r'  r
  r   r   r   r   r   r   r   r(  B  s    r(  c                	   @   sh   e Zd ZU dZdZeed< dddZdd Ze	j
d	fd
eedf de	jdeee	jf de	jfddZdS )r     
    (Experimental feature)
    Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`.
    This makes the mask "local" and the attention pattern banded.

    Query i only attends to keys in its block and cannot attend keys further than "window_size"
    from it.
    r   rZ   r   c                 C   8   t | tu s
J dt| j|| j|| j| jdS NrT   r   )rU   r   r   r   r   r   rZ   rK   r   r   r   r   h     

z(BlockDiagonalCausalLocalAttentionMask.toc                 C   s   | j dkrtd| j  dd t| jjd d | jjdd  D }dd t| jjd d | jjdd  D }t||D ]\}}|| j  |krWtd| d	| d
| j  q>d S )Nr   r\   c                 S      g | ]\}}|| qS r   r   r{   rv   yr   r   r   r}   x      zGBlockDiagonalCausalLocalAttentionMask.__post_init__.<locals>.<listcomp>rj   r6   c                 S   r-  r   r   r.  r   r   r   r}   ~  r0  z!No keys are attended in q_seqlen z
 k_seqlen z with sliding window )rZ   rL   rg   r   re   r   r   )r$   r   r   r   r   r   r   r   rN   s  s*   

z3BlockDiagonalCausalLocalAttentionMask.__post_init__r   r   .r   r   c                 C   s   t |||| jdS )N)r   r   r4   r_   r#   r   r   r   r     s   z8BlockDiagonalCausalLocalAttentionMask._create_block_maskN)r   r   )r'   r(   r)   r*   rZ   r-   rP   r   rN   r   r,   r	   r   r   r.   r   r/   r   r   r   r   r   r   [  s    
 	

r   c                	       sp   e Zd ZU dZdZeed< dddZ fddZe	j
d	fd
eedf de	jdeee	jf de	jfddZ  ZS )r   r)  r   rZ   r   c                 C   r*  r+  )rU   r   r   r   r   r   rZ   rK   r   r   r   r     r,  z7BlockDiagonalCausalLocalAttentionFromBottomRightMask.toc                    s(   t    | jdkrtd| j d S r[   )r   rN   rZ   rL   rM   r   r   r   rN     s   


zBBlockDiagonalCausalLocalAttentionFromBottomRightMask.__post_init__r   r   .r   r   c                 C   r]   r^   r_   r#   r   r   r   r     r`   zGBlockDiagonalCausalLocalAttentionFromBottomRightMask._create_block_mask)r   r   )r'   r(   r)   r*   rZ   r-   rP   r   rN   r   r,   r	   r   r   r.   r   r/   r   r   r   r   r   r   r     s    
 	


r   c                	       s   e Zd ZU dZejed< edddddZd fdd	Z	d
d Z
edddZdd Zedd Zejdfdeedf dejdeeejf dejfddZ  ZS )AttentionBiasSubTensorF
_subtensorNr2  r   c                K   r   rJ   r!   )rr   r2  r   kwargsr   r   r   __new__  s   zAttentionBiasSubTensor.__new__r   c                    s   t    d S rJ   )r   __init__)r$   argsr4  r   r   r   r6    s   zAttentionBiasSubTensor.__init__c                 C   s
   | j j S rJ   )r   r'   rM   r   r   r   __repr__     
zAttentionBiasSubTensor.__repr__r   c                 C   s`   |pi }|j tjjjtjjjtjjjtjjjfv r.| ||d jg|dd  R i |dS t	S )Nr   r6   r2  )
_overloadpacketr   opsatenclonedetach_to_copyr   r2  NotImplemented)rr   functypesr7  r4  r   r   r   __torch_dispatch__  s   *z)AttentionBiasSubTensor.__torch_dispatch__c                 C   s
   dgd fS )Nr2  r   rM   r   r   r   __tensor_flatten__  r9  z)AttentionBiasSubTensor.__tensor_flatten__c                 C   s   |d u sJ | |d dS )Nr2  r:  r   )rr   inner_tensorsmeta
outer_sizeouter_strider   r   r   __tensor_unflatten__  s   z+AttentionBiasSubTensor.__tensor_unflatten__r   r   .r   r   c                 C   r   r    r!   r#   r   r   r   r%     r&   z"AttentionBiasSubTensor.materializerO   r   N)r'   r(   r)   r+   r   r/   rP   staticmethodr5  r6  r8  r   rD  rE  rJ  r,   r	   r-   r   r   r.   r   r%   r   r   r   r   r   r1    s.   
 


r1  c                   @   s$   e Zd Zedd Zedd ZdS )_AddDenseBiasc                 C   s   t |tu sJ t|S rJ   )rU   r   !LowerTriangularMaskWithTensorBias)ctxcausal_biasr   r   r   r   forward  s   z_AddDenseBias.forwardc                 C   s   d |fS rJ   r   )rO  grad_outr   r   r   backward  s   z_AddDenseBias.backwardN)r'   r(   r)   rL  rQ  rS  r   r   r   r   rM    s
    
rM  c                	   @   st   e Zd ZdZdZedddddZejdfde	e
d	f d
ejdeeejf dejfddZdejddfddZdS )r   a!  
    A lower-triangular (aka causal) mask

    A query Q cannot attend to a key which is farther from the
    initial key than Q is from the initial query.

    See also :attr:`LowerTriangularFromBottomRightMask` if the number
    of queries is not equal to the number of keys/values.
    FNr   r3  c                K   s:   |du rt jd|d}t jj| g |j|jdd}||_|S )zg
        Note: create on CPU by default to avoid initializing CUDA context
        by mistake.
        Nr   r2   Fr   r   requires_grad)r   r   r/   _make_wrapper_subclassr   r   r2  )rr   r2  r   r4  r   r   r   r   r5    s   zLowerTriangularMask.__new__r   .r   r   r   c                 C   s   t |||dS r   rV   r#   r   r   r   r%   &  s   zLowerTriangularMask.materializebiasrN  c                 C   s   t | |S )zS
        Creates a new causal mask with an arbitrary ``torch.Tensor`` bias
        )rM  apply)r$   rW  r   r   r   add_bias.  s   zLowerTriangularMask.add_bias)r'   r(   r)   r*   r+   rL  r5  r   r,   r	   r-   r   r   r.   r   r/   r%   rY  r   r   r   r   r     s"    


r   c                	       sn   e Zd ZdZdZedd Zejdfde	e
df dejd	eeejf d
ejf fddZedddZ  ZS )rN  z:A lower-triangular (aka causal) mask with an additive biasTc                 C   s(   t jj| |j|j|j|jd}||_|S )NrT  )r   r/   rV  r   r   r   rU  r2  )rr   rW  r   r   r   r   r5  :  s   z)LowerTriangularMaskWithTensorBias.__new__r   r   .r   r   r   c                    s   t  j|||d| j S r   )r   r%   r2  r#   r   r   r   r%   F  s   z-LowerTriangularMaskWithTensorBias.materializer   Nc              	      sz   |pi }|j tjjjtjjjtjjjtjjjtjjjtjjj	tjjj
tjjjfv r;| fdd|D i |} |S tS )Nc                    s    g | ]}t | r|jn|qS r   )rl   r2  )r{   arr   r   r   r}   \  s     zHLowerTriangularMaskWithTensorBias.__torch_dispatch__.<locals>.<listcomp>)r;  r   r<  r=  r   selectslicer>  r?  r@  r   viewrA  )rr   rB  rC  r7  r4  outputr   r[  r   rD  N  s$   
z4LowerTriangularMaskWithTensorBias.__torch_dispatch__rK  )r'   r(   r)   r*   r+   rL  r5  r   r,   r	   r-   r   r   r.   r   r/   r%   r   rD  r   r   r   r   r   rN  5  s$    

rN  rJ   )9r*   r   dataclassesr   typingr   r   r   r   r   r   r	   r
   r   r   r   r/   r   r   r-   r   r   r   r3   r,   r.   r   rF   rG   rQ   rW   ra   r   r   r   r   r   r   r  r  r  r  r  r'  r"  r(  r   r   r1  autogradFunctionrM  r   rN  _dynamoallow_in_graphVARLEN_BIASESr   r   r   r   <module>   s   0	3


U/
3`lo o1oI8i
]1s>
/2..