o
     iw                     @   s  U d dl mZmZmZmZmZmZmZmZ d dl	Z	ddl
mZmZmZmZmZmZmZmZ ddlmZmZmZmZ ddlmZmZmZmZmZmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ej&ej'fZ(ej&ej'fZ)ej&ej'fZ*ej&ej'fZ+ej&ej'fZ,ej&ej'fZ-dee	j. d	efd
dZ/ej&j0ej&ej'j0ej'iZ1dd Z2dd Z3G dd de	j4j5Z6			dLdddde	j.de	j.de	j.deee	j.ef  de7dee7 dee dee	j8 d	e	j.fddZ9e	j:;dd e	j:<dd d!d" Z=e	j:<dd#			dLde	j.de	j.de	j.deee	j.ef  de7dee7 d	e	j.fd$d%Z>			dLdddde	j.de	j.de	j.deee	j.ef  de7dee7 deee  dee	j8 d	e	j.fd&d'Z?			dLdddde	j.de	j.de	j.deee	j.ef  de7dee7 deee  dee	j8 d	ee	j.e	j.f fd(d)Z@			dLdd*d+e	j.d,e	j.d-e	j.de	j.de	j.de	j.deee	j.ef  de7dee7 deee  d	ee	j.e	j.e	j.f fd.d/ZA	dMd0edee d	e	j.fd1d2ZBd0edeee  d	e	j.fd3d4ZCd0edeee  d	ee	j.ef fd5d6ZDd-e	j.d0ed	eeE fd7d8ZFd9d:d;ed0ed+e	j.deee  d<eEd	efd=d>ZG			dLdddde	j.de	j.de	j.deee	j.ef  de7dee7 deeeee f  dee	j8 d	ee	j.e	j.f fd?d@ZH	A	dNdBee	j.ee	j. f dCee	j.ee	j. f dDeEdee	j8 d	ee	j.ee	j. f f
dEdFZIG dGdH dHe	j4j5ZJe	jKjLrej&nej&ej&ej&ej&gZMeee  eNdI< e	jKjLrej'nej'ej'ej'gZOeee  eNdJ< g dKZPdS )O    )AnyListOptionalSequenceTupleTypeUnioncastN   )	attn_biasck
ck_decoder	ck_splitkcutlassflashflash3triton_splitk)VARLEN_BIASESAttentionBiasBlockDiagonalMaskLowerTriangularMask)AttentionBwOpBaseAttentionFwOpBaseAttentionOpAttentionOpBaseContext	GradientsInputsbmk2bmhk)_dispatch_bw_dispatch_fw_ensure_op_supports_or_raise_get_use_fa3_set_use_fa3attn_bias_tensorreturnc                 C   s   |d u r| S |S N )attn_bias_ctxr$   r'   r'   N/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/fmha/__init__.py_deserialize_bias4   s   r*   c                 C   s   | d ur| j tv r| j S | S r&   )NAME_OPS_LOOKUPopr'   r'   r)   _serialize_opD   s   r/   c                 C   s   t | tr	t|  S | S r&   )
isinstancestrr,   r-   r'   r'   r)   _unserialize_opJ   s   
r2   c                   @   s8   e Zd ZededefddZeejjj	dd Z
dS )_fMHAargsr%   c           
      G   s  t | }t|}t|}t||d\}}t|jtjr!|j}d }nd }|j}| |j|j	|j
|j|j |j| _|| _|jd urZ|d urW||jurWtd|j d|jj d|j}|d urt|jtr|jjjjd dkr|j|jkrtd|j d|j d|d u r|jjs|j	js|j
jrt|j|}	|	d ur|d ur|j|	ksJ |j d	t||	d
}|| _|| _|j| _|jjd |j	jd ko|jjd |j
jd ko|jd|j	jd |jjd  |j
jd  k| _|j| _|| _t || _!||jfS )N)inpr.   zSpecified op_bw=z), but forward op can only run with op_bw=z. Please set op_bw=None.r      z" is not compatible with the op_fw=zX, because they use different format of logsumexp. NOTE: This is new with xFormers 0.0.28z': wrong value for `VARLEN_LSE_PACKED` ?varlen_lse_packed)"r   r2   1_memory_efficient_attention_forward_requires_gradr0   r   torchTensorsave_for_backwardquerykeyvalueoutlse	rng_stater$   op_bw
ValueErrorr+   r   	q_seqinfoseqstartshapeVARLEN_LSE_PACKEDrequires_grad_detect_lse_packed_or_raiser   op_fwpstrideqkv_share_storagescaler(   lenn_args)
ctxrM   rE   r4   r5   rB   op_ctxr$   r(   r8   r'   r'   r)   forwardQ   s   






"

z_fMHA.forwardc                 C   s   | j \}}}}}| j}| j}	t|||t| j|| j| jd}
t|||	d}t	||
|| j
dd}d d |j|j|j|jfd| jd   S )Nr?   r@   rA   r   rN   rQ   )rC   rB   rD   T)rT   r5   gradr.   _skip_op_checksr&   r6   )saved_tensorsr$   rD   r   r*   r(   rN   rQ   r   $_memory_efficient_attention_backwardrE   dqdkdvdbrS   )rT   rX   grad_lser?   r@   rA   rB   rC   r$   rD   r5   rU   gradsr'   r'   r)   backward   s4   
z_fMHA.backwardN)__name__
__module____qualname__staticmethodr   rV   r<   autogradfunctiononce_differentiablerb   r'   r'   r'   r)   r3   P   s    Or3           )r.   output_dtyper?   r@   rA   r   rN   rQ   r.   rk   c             
   C      t t| ||||||d|dS )a_  Implements the memory-efficient attention mechanism following
    `"Self-Attention Does Not Need O(n^2) Memory" <http://arxiv.org/abs/2112.05682>`_.

    :Inputs shape:

    - Input tensors must be in format ``[B, M, H, K]``, where B is the batch size, M         the sequence length, H the number of heads, and K the embeding size per head

    - If inputs have dimension 3, it is assumed that the dimensions are ``[B, M, K]`` and ``H=1``

    - Inputs can also be of dimension 5 with GQA - see note below

    - Inputs can be non-contiguous - we only require the last dimension's stride to be 1


    :Equivalent pytorch code:

    .. code-block:: python

        scale = 1.0 / query.shape[-1] ** 0.5
        query = query * scale
        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)
        attn = query @ key.transpose(-2, -1)
        if attn_bias is not None:
            attn = attn + attn_bias
        attn = attn.softmax(-1)
        attn = F.dropout(attn, p)
        attn = attn @ value
        return attn.transpose(1, 2).contiguous()

    :Examples:

    .. code-block:: python

        import xformers.ops as xops

        # Compute regular attention
        y = xops.memory_efficient_attention(q, k, v)

        # With a dropout of 0.2
        y = xops.memory_efficient_attention(q, k, v, p=0.2)

        # Causal attention
        y = xops.memory_efficient_attention(
            q, k, v,
            attn_bias=xops.LowerTriangularMask()
        )

    :Supported hardware:

        NVIDIA GPUs with compute capability above 6.0 (P100+), datatype ``f16``, ``bf16`` and ``f32``.

    :EXPERIMENTAL: Using with Multi Query Attention (MQA) and Grouped Query Attention (GQA):

        MQA/GQA is an experimental feature supported only for the forward pass.
        If you have 16 heads in query, and 2 in key/value, you can provide 5-dim tensors
        in the ``[B, M, G, H, K]`` format, where ``G`` is the number of head groups (here 2), and
        ``H`` is the number of heads per group (8 in the example).

        Please note that xFormers will not automatically broadcast the inputs, so you will need
        to broadcast it manually before calling `memory_efficient_attention`.

    :GQA/MQA example:

    .. code-block:: python

        import torch
        import xformers.ops as xops

        B, M, K = 3, 32, 128
        kwargs = dict(device="cuda", dtype=torch.float16)
        q = torch.randn([B, M, 8, K], **kwargs)
        k = torch.randn([B, M, 2, K], **kwargs)
        v = torch.randn([B, M, 2, K], **kwargs)
        out_gqa = xops.memory_efficient_attention(
            q.reshape([B, M, 2, 4, K]),
            k.reshape([B, M, 2, 1, K]).expand([B, M, 2, 4, K]),
            v.reshape([B, M, 2, 1, K]).expand([B, M, 2, 4, K]),
        )

    Raises:
        NotImplementedError: if there is no operator available to compute the MHA
        ValueError: if inputs are invalid

    :parameter query: Tensor of shape ``[B, Mq, H, K]``
    :parameter key: Tensor of shape ``[B, Mkv, H, K]``
    :parameter value: Tensor of shape ``[B, Mkv, H, Kv]``
    :parameter attn_bias: Bias to apply to the attention matrix - defaults to no masking.         For common biases implemented efficiently in xFormers, see :attr:`xformers.ops.fmha.attn_bias.AttentionBias`.         This can also be a :attr:`torch.Tensor` for an arbitrary mask (slower).
    :parameter p: Dropout probability. Disabled if set to ``0.0``
    :parameter scale: Scaling factor for ``Q @ K.transpose()``. If set to ``None``, the default         scale (q.shape[-1]**-0.5) will be used.
    :parameter op: The operators to use - see :attr:`xformers.ops.AttentionOpBase`.         If set to ``None`` (recommended), xFormers         will dispatch to the best available operator, depending on the inputs         and options.
    :return: multi-head attention Tensor with shape ``[B, Mq, H, Kv]``
    r?   r@   rA   rN   r   rQ   rk   r-   )_memory_efficient_attentionr   r?   r@   rA   r   rN   rQ   r.   rk   r'   r'   r)   memory_efficient_attention   s   p	rp   z+xformer::memory_efficient_attention_forwardz_(Tensor q, Tensor k, Tensor v, Tensor? b = None, float? p = 0.0, float? scale = None) -> TensorMetac                 C   s   |  | jS r&   )	new_emptyrI   )qkvr'   r'   r)   'memory_efficient_attention_forward_metaF  s   rv   CUDAc                 C   s   t | |||||S )a  
    This provides a torch-compilable wrapper op to
    memory_efficient_attention_forward in certain special cases.

    Note that the following are not supported
        - `op` input (?)
        - certain attn_bias types (?)
        - output_dtype
        - K != Kv
    )"memory_efficient_attention_forwardrW   r'   r'   r)   0memory_efficient_attention_forward_torch_wrapperM  s   ry   c             
   C   rl   )zY
    Calculates the forward pass of :attr:`xformers.ops.memory_efficient_attention`.
    rm   r-   )#_memory_efficient_attention_forwardr   ro   r'   r'   r)   rx   j  s   	rx   c          
   
   C   s<   |dkrt dtt| ||||||d|d\}}	||	jfS )a  
    Returns a tuple (output, lse), where `lse` can be used to compute the backward pass later.
    See :attr:`xformers.ops.memory_efficient_attention` for an explanation of the arguments
    See :attr:`xformers.ops.memory_efficient_attention_backward` for running the backward pass
    rj   dropout is not supported on the non-autograd API. If you want to use dropout, please call `memory_efficient_attention` directlyrm   r-   )NotImplementedErrorr;   r   rC   )
r?   r@   rA   r   rN   rQ   r.   rk   rB   rT   r'   r'   r)   0memory_efficient_attention_forward_requires_grad  s"   	

r}   r-   rX   outputrC   c	             
   C   sH   |dkrt dtt||dt||||||d| |	d}
|
j|
j|
jfS )a  
    Computes the gradient of the attention.
    Returns a tuple (dq, dk, dv)
    See :attr:`xformers.ops.memory_efficient_attention` for an explanation of the arguments.
    `lse` is the tensor returned by
    :attr:`xformers.ops.memory_efficient_attention_forward_requires_grad`
    rj   r{   )rB   rC   )r?   r@   rA   rN   r   rQ   r-   )r|   r[   r   r   r\   r]   r^   )rX   r~   rC   r?   r@   rA   r   rN   rQ   r.   	gradientsr'   r'   r)   #memory_efficient_attention_backward  s   
r   r5   c              
   C   s   t dd | j| j| jfD rt| |d ur|d dS d dS |  }t|d ur,|d nd }t|d ur8|d nd }t||| j| j| j| j	| j
| jd |S )Nc                 s   s    | ]}|j d u V  qdS )FNrK   .0xr'   r'   r)   	<genexpr>  s    z._memory_efficient_attention.<locals>.<genexpr>r   r-   r
   )allr?   r@   rA   rz   normalize_bmhkr/   r3   applyr   rN   rQ   reshape)r5   r.   output_shaperM   rE   r'   r'   r)   rn     s    rn   c                 C   sN   |    |  }|d u rt| d}nttd||  |j| dd^}}||S )NFrp   needs_gradientvalidate_inputsr   r    r!   rF   r   r   )r5   r.   r   rB   _r'   r'   r)   rz     s   
rz   c                 C   sf   |    |  }|d u rt| d}nttd||  |j| dd}|d d us(J |d ||d fS )NTrp   r   r
   r   r   )r5   r.   r   rB   r'   r'   r)   r;     s   r;   c           	      C   s  d| j  d|jj  dt|j }| j|jjd ks)| j dd |jj dd kr-t|| j d | j d g}|jj d |jj d g}|d |d koN||k}t|jtr|jj}|j	j d d |j
g}|d |d koo||k}|rv|rvdS |rzd	S |r~d
S t||st|dS )z
    Detects the LSE format if we're in a varlen case.
    Returns `None` if the format is not relevant (eg not varlen)
    Raises an exception if the `lse` has the wrong shape
    z9Input tensors have incompatible shapes.
  lse.shape    : z
  query.shape  : z
  attn_bias    : r
   r9   r6   r   NTF)rI   r?   typer   ndimrF   r0   r   rG   rH   
max_seqlen)	rC   r5   shape_mismatch_errlse_bmlse_packed_shape
lse_packedsilse_padded_shape
lse_paddedr'   r'   r)   rL     s6   0rL   F)rY   rT   rY   c          
      C   s(  |   |j|jjks|j| jjkr$td|j d| jj d|jj tdd |j|j|jfD \}}}|	  t
| j|}t|d}t| jd| _|du rWt||d}n|svttd	|| |durv||jkrvtd
|j d|j d|| ||}	|	j||	_|	j||	_|	j||	_|	S )z2Warning: grad/ctx.out is potentially in BMK formatzTAll tensors should be either in BMK (ndim=3) or BMHK (ndim=4) format. 
grad.shape : z 
out.shape  : z 
query.shape: c                 s       | ]}|j V  qd S r&   )rI   r   r'   r'   r)   r   4      
z7_memory_efficient_attention_backward.<locals>.<genexpr>r
   Nr7   r   zWrong LSE format for z< in variable seqlen case. Double-check that the BW operator z5 is compatible with the operator used in the FW pass.)r   r   r?   rB   rF   rI   tupler@   rA   r   rL   rC   r   r   r!   rJ   r+   r   r\   r   r]   r^   )
rT   r5   rX   r.   rY   shape_dqshape_dkshape_dvr8   ra   r'   r'   r)   r[   #  sF   	

r[   c                C   s   |dkrt dt|tr|d n|}t| ||||||dd}	t o.tdd | ||fD }
|
s>t|	|d\}}||jfS | j	d	krGt
d
t|trYt|d }t|d }n|du rbd }}nt|}d}t|||	j|	j|	j|	j|	j|	j|	j|	j
S )at  
    Returns a tuple (output, lse), where `output` is the attention in the style of
    memory_efficient_attention, and  `lse` is extra data, a log-sum-exp.
    The outputs of calls to this with the same query and separate keys and values
    can be merged with merge_attentions to obtain the attention of the queries
    against the disjoint union of the keys and values.

    Warning: The backward pass of this function is quite restricted. In particular
    we assume that in the forward pass the outputs were only used in merge_attention
    calculations, and that LSEs weren't used anywhere except in merge attentions.
    rj   zdropout is not supported.r   T)r?   r@   rA   rN   r   rQ   rk   
is_partialc                 s   r   r&   r   r   r'   r'   r)   r   t  r   z5memory_efficient_attention_partial.<locals>.<genexpr>r-      z&gradients not supported for 5D tensorsr
   N)r|   r0   r   r   r<   is_grad_enabledanyr;   rC   r   rF   r/   r3   r   r?   r@   rA   r   rN   rQ   rk   r   )r?   r@   rA   r   rN   rQ   r.   rk   fwopr5   is_gradrB   rT   rM   rE   r'   r'   r)   "memory_efficient_attention_partialP  sV   




r   T
attn_split	lse_split	write_lsec                 C   s  t | tj}t |tj}|r| jntdd | D }|r|jntdd |D }t o/|p/|}|r8|s8td|o>|o>| }	|	rttj| } ttj|}| j|jd kratd| j	d|j	| jdk}
|
rr| 
d	} |
d
}| j	\}}}}}}|j	\}}}}}||ks||ks||ks||ks||krtd| j	d|j	d| d| d| d| d| d| d| d| d| d| | dd	ddd
d} |dd
d	dd}| j}| j}|j}n=|r| d} |r|d}t| }t||krtdt| dt|g }g }d}
t|D ]Q}| | j|| jd kr2td| d| | j	d|| j	| | jdk}
|
rR|| | 
d
 ||| 
d q|| |  |||  q||} }| d j	\}}}}}|d j	\}}}}||ks||ks||ks||krtd| d j	d|d j	d| d| d| d| d| d| d| d| t|D ]R}| | j	|||||fkrtd| d| | j	d|||||f|| j	||||fkrtd| d|| j	d||||f| | dd
d	dd| |< q| d j}| d j}|d j}tj|||||||p(||d}|r<tj|||||||d}nd}|	rJt||| | ntj||g| |R  \}}|
rs|dddddf }|durs|dddf }||fS ) a  
    Combine attention output computed on different parts of K/V for the same
    query to get attention on the whole K/V. See https://arxiv.org/abs/2402.05099
    The result is equal to
        Out_full = (Out1 * exp(LSE1) + Out2 * exp(LSE2) + ...) / (exp(LSE1) + exp(LSE2) + ...)
        LSE_full = log(exp(LSE1) + exp(LSE2) + ...)

    Args:
        attn_split: attention outputs for chunks,
            either as a list of tensors of shapes [B, M, G, H, Kq] or [B, M, H, Kq]
            or as a single tensor of shape [num_chunks, B, M, G, H, Kq]
            or [num_chunks, B, M, H, Kq]
        lse_split: LSE for chunks,
            either as a list of tensors of shapes [B, G, H, M] or [B, H, M]
            or as a single tensor of shape [num_chunks, B, G, H, M] or [num_chunks, B, H, M]
        write_lse: whether to output LSE
        output_dtype: dtype of attn_out

    Returns:
        attn_out: [B, M, G, H, Kq] or [B, M, H, Kq]
        lse_out: [B, G, H, M] or [B, H, M] if write_lse
                 or None otherwise
    c                 s   r   r&   r   r   r'   r'   r)   r         z#merge_attentions.<locals>.<genexpr>c                 s   r   r&   r   r   r'   r'   r)   r     r   z5write_lse should be true if inputs require gradients.r
   z,Incompatible input shapes: attn_split.shape=z, lse_split.shape=r      r6   z lse_split.shape= /z,    r   zAIncompatible number of LSE and attention chunks: len(attn_split)=z, len(lse_split)=Fz$Incompatible input shapes for chunk z: attn_split[i].shape=z, lse_split[i].shape=z/Incompatible input shapes: attn_split[0].shape=z, lse_split[0].shape=z.Incompatible input shapes for attention chunk z, (B, M, G, H, Kq)=z(Incompatible input shapes for LSE chunk z: lse_split[i].shape=z, (B, G, H, M)=)devicedtyperK   N)r0   r<   r=   rK   r   r   rF   r	   r   rI   	unsqueezepermuter   r   unbindrR   rangeappendemptyr   merge_attentions_MergeAttentionsr   )r   r   r   rk   attn_is_concatlse_is_concatattn_requires_gradlse_requires_gradrK   concat_pathis_bmhk
num_chunksBMGHKqnum_chunks1B1G1H1M1r   
attn_dtype	lse_dtypeattn_unsqueezedlse_unsqueezediattn_outlse_outr'   r'   r)   r     s(  


(


 
(





r   c                   @   sh   e Zd Zedejdejdejdeejejf fddZedejdejdeeej d	f fd
dZ	dS )r   r   r   inputsr%   c                 G   sR   t |d }|d | ||d  }}t|||| | j||g|R   ||fS Nr6   )rR   r   merge_attentions_varargsr>   )rT   r   r   r   r   r   r   r'   r'   r)   rV   :  s   z_MergeAttentions.forward	grad_attnr`   .c                 C   sb   | j ^}}}t|d }|d | ||d  }}t||||||\}	}
d d g|	 |
 }t|S r   )rZ   rR   r   !merge_attentions_varargs_backwardr   )rT   r   r`   rB   rC   r   r   r   r   dattndlseretr'   r'   r)   rb   K  s   z_MergeAttentions.backwardN)
rc   rd   re   rf   r<   r=   r   rV   r   rb   r'   r'   r'   r)   r   9  s&    r   
ALL_FW_OPS
ALL_BW_OPS)r   r   r   r   +MemoryEfficientAttentionCutlassFwdFlashBwOp!MemoryEfficientAttentionCutlassOp(MemoryEfficientAttentionFlashAttentionOprp   MemoryEfficientAttentionCkOp#MemoryEfficientAttentionCkDecoderOpr   r   r   r"   r#   r   )Nrj   Nr&   )TN)Qtypingr   r   r   r   r   r   r   r	   r<    r   r   r   r   r   r   r   r   r   r   r   r   commonr   r   r   r   r   r   r   r   dispatchr   r    r!   r"   r#   FwOpBwOpr   r   r   r   r   "MemoryEfficientAttentionSplitKCkOpr=   r*   r+   r,   r/   r2   rg   Functionr3   floatr   rp   librarydefineimplrv   ry   rx   r}   r   rn   rz   r;   boolrL   r[   r   r   r   versioncudar   __annotations__r   __all__r'   r'   r'   r)   <module>   s  *(
(


v	

~
 
	

 
	

+	

%




-

1	

J
 #'