o
     iU                  
   @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZmZmZ d dlZddlmZ ddlmZ d	d
lmZmZmZmZmZmZmZmZ dedefddZdeeej ef  deej gej f deeej ef  fddZ!G dd dej Z"dej deej e#f de"fddZ$eG dd dZ%eG dd dZ&eG dd dZ'G dd  d eZ(G d!d" d"e(Z)G d#d$ d$e(Z*eeee)  eee*  f Z+d%e,dej fd&d'Z-d(e	e. d)e.dej d*e,ddf
d+d,Z/dS )-    N)	dataclass)partial)
AnyCallableIterableListMappingOptionalSetTupleTypeUnion   )_built_with_cuda   )BaseOperator   )AttentionBiasAttentionBiasSubTensorBlockDiagonalGappyKeysMaskBlockDiagonalMaskBlockDiagonalPaddedKeysMaskLowerTriangularMaskPagedBlockDiagonalGappyKeysMask PagedBlockDiagonalPaddedKeysMaskattn_bias_typereturnc                 C   s$   t d | rdS | ttjfv rdS dS )NTF)
isinstancer   torchTensor)r    r    L/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/fmha/common.py_is_bias_type_supported_in_BMK&   s
   
r"   	attn_biasopc                 C   s"   t | tjr| jdkr|| S | S )Nr   )r   r   r   ndim)r#   r$   r    r    r!   _attn_bias_apply/   s   r&   c                   @   s   e Zd Zg dZejjZ	ddejdejde	ejejgejf dej
dedd fd	d
ZdejfddZdeejejf fddZdd ZdS )ScaledTensor)scaledequant_funcoriginal_dtypeFdatar(   r)   r*   require_gradr   c                 C   s(   t jj| ||d}||_||_||_|S )a  
        Creates a new ScaledTensor subclass instance.

        Parameters:
        - data: The underlying quantized tensor (e.g., int8, int4).
        - scale: The scale tensor or scalar to be used for dequantization.
        - dequant_func: A callable that applies dequantization, which takes both the data and scale as input.
        - original_dtype: The data type before quantization (e.g., float32, float16).
        - require_grad: Whether or not to track gradients (default: False for inference use).
        )r,   )r   r   _make_subclassr(   r)   r*   )clsr+   r(   r)   r*   r,   instancer    r    r!   __new__>   s
   zScaledTensor.__new__c                 C   s(   t |  }| || j}|| jS )z
        Applies the custom dequantization function provided at the tensor's creation.
        After dequantization, the data is cast back to its original data type.
        )r   r   floatr)   r(   tor*   )selfr+   dequantized_datar    r    r!   
dequantize]   s   zScaledTensor.dequantizec                 C   s   | j | jfS )z
        Unpacks the ScaledTensor by returning its data and scale as a tuple.
        Returns:
        - A tuple of (data, scale), both of which are torch.Tensor objects.
        )r+   r(   r3   r    r    r!   unpackk   s   zScaledTensor.unpackc                 C   s   d| j  d| j d| j dS )z@
        Custom string representation for ScaledTensor.
        zScaledTensor(data=z, scale=z, original_dtype=))r+   r(   r*   r6   r    r    r!   __repr__s   s   zScaledTensor.__repr__NF)__name__
__module____qualname__	__slots__r   _C_disabled_torch_function_impl__torch_function__r   r   dtypeboolr0   r5   r   r7   r9   r    r    r    r!   r'   8   s(    
r'   xr(   c                 C   s4   t |trtj|g| jd}dd }t| |||dS )z;
    Pack a tensor into a tensorwise fp8 ScaledTensor.
    )devicec                 S   s   | |d d d d d d f  S Nr    )rD   r(   r    r    r!   r)      s   z2pack_fp8_tensorwise_per_head.<locals>.dequant_func)r+   r(   r)   r*   )r   r1   r   tensorrE   r'   )rD   r(   r*   r)   r    r    r!   pack_fp8_tensorwise_per_headz   s   
rH   c                   @   s
  e Zd ZU dZejed< ejed< ejed< dZee	eje
f  ed< dZeed< dZee ed	< dZeej ed
< dZeed< edejfddZedefddZdeejejejf fddZdeedf fddZdddZdejfddZedefddZdS )InputszE
    Stores inputs to the `memory_efficient_attention` operators
    querykeyvalueNr#           pr(   output_dtypeF
is_partialr   c                 C   s   | j jS rF   )rJ   rE   r6   r    r    r!   rE      s   zInputs.devicec                 C   s    | j d u r| jjd d S | j S )Ng      )r(   rJ   shaper6   r    r    r!   scale_float   s    zInputs.scale_floatc                 C   s   | j jdkr| j | j| jfS | j jdkr%| j d| jd| jdfS | jjdkrQ| j d d d d d d f | jd d d d d d f | jd d d d d d f fS J )N      r   r   )rJ   r%   rK   rL   	unsqueezer6   r    r    r!   get_qkv_in_bmghk   s   


zInputs.get_qkv_in_bmghk.c                 C   s   | j jdvrtd| j j d| jjtjkrt| j j}n| j jd d | jjd f }| j jdkrT| j 	d| _ | j
	d| _
| j	d| _t| jttj	dd| _|S )	Nr   rU   rT   zInvalid shape for query: z|. Expected shape [batch, seqlen, head_groups, num_heads_per_group, K], [batch, seqlen, num_heads, K], or [batch, seqlen, K].rQ   r   r   r   )dim)rJ   r%   
ValueErrorrR   rL   rB   r   int32tuplerV   rK   r&   r#   r   )r3   output_shaper    r    r!   normalize_bmhk   s   zInputs.normalize_bmhkc                    s   j  j jf} j jdvst fdd|D r,td j j d jj d jj t fdd|D r;tdt jt	t
tttfr_ jjjj}| j jkr_td	 j j d
| d jj jj  komtjkn  }t fdd|D }|s|std j j d jj d jj  j jdkrtt jstdt jj dd }t jtr jjr jj}n
t jtjrƈ j} j jdkr|d ur j jd  j jd  j jd  jjd f}|j|krtd|j d| d j j d jj d jj 
t jt	r,tdd |D r,td j j d jj d jj  jdk s8 jdkr@td j  j jd d \}} j jd }	 jjd d \}}
 jjd } jjtjk}|rl|n|	}d } j jdkr j j|||	fko jj||
|	fko jj||
|fk} j jd! } j jdkr j j||||	fko jj||
||fko jj||
||fk} j jd } j jd"kr j j|||||	fko jj||
|||fko jj||
|||fk}|std# j j d jj d jj d$d S )%NrX   c                 3       | ]
}|j  jj kV  qd S rF   )r%   rJ   .0rD   r6   r    r!   	<genexpr>   s    
z)Inputs.validate_inputs.<locals>.<genexpr>zIQuery/Key/Value should all have BMGHK, BMHK or BMK shape.
  query.shape: z
  key.shape  : z
  value.shape: c                 3   r_   rF   )rE   rJ   r`   r6   r    r!   rb          z0Query/Key/Value should all be on the same devicezPAttention bias and Query/Key/Value should be on the same device
  query.device: z
  attn_bias   : 
c                 3   s    | ]
}|j  jj kV  qd S rF   )rB   rJ   r`   r6   r    r!   rb      rc   zQuery/Key/Value should either all have the same dtype, or (in the quantized case) Key/Value should have dtype torch.int32
  query.dtype: z
  key.dtype  : z
  value.dtype: r   zKPlease provide inputs in BMHK format rather than BMK when using bias type ``rU   r   r   r   z"Invalid shape for attention bias: z (expected z)
  query.shape: c                 s   s    | ]
}|j d  dkV  qdS )r   r   N)rR   r`   r    r    r!   rb     rc   zDExpected batch_size=1 when using block-diagonal bias
  query.shape: rM   g      ?zInvalid dropout probability: p=rQ   TrT   z9Incompatible shapes for attention inputs:
  query.shape: z}
HINT: We don't support broadcasting, please use `expand` yourself before calling `memory_efficient_attention` if you need to)rJ   rK   rL   r%   anyrZ   rR   r   r#   r   r   r   r   r   	q_seqinfoseqstartrE   rB   r   r[   allr"   typer;   r   HOLDS_DENSE_TENSOR
_subtensorr   rN   )r3   qkvbias_devicequantized_dtypesnon_quantized_dtypesattn_bias_texpected_shapeBMqKMkvKvquantized_kv_cachekey_embed_dimvalid_shapesHGr    r6   r!   validate_inputs   s   
"





zInputs.validate_inputsc                 C   s2   | j d u r| jr| jjtjurtjS | jjS | j S rF   )rO   rP   rJ   rB   r   float64float32r6   r    r    r!   get_output_dtypeF  s
   
zInputs.get_output_dtypec                 C   s   t dd | j| j| jfD S )zP
        Number of bytes in the input, not counting the attention bias.
        c                 s   s    | ]	}|   V  qd S rF   )untyped_storagenbytesr`   r    r    r!   rb   R  s    
z Inputs.nbytes.<locals>.<genexpr>)sumrJ   rK   rL   r6   r    r    r!   r   M  s   zInputs.nbytes)r   N)r;   r<   r=   __doc__r   r   __annotations__r#   r	   r   r   rN   r1   r(   rO   rB   rP   rC   propertyrE   rS   r   rW   intr^   r~   r   r   r    r    r    r!   rI      s(   
 



xrI   c                   @   sl   e Zd ZU ejed< ejed< dZeed  ed< dZ	ee
 ed< dZeed< dd	ed
edejfddZdS )ContextlseoutNAttentionBwOpBaseop_bw	rng_stateFqkv_share_storagepad_toforce_pad_infr   c                 C   s   || j jd |  | }| j }|dkr@|r1|d d d d d | jjd f }||jd |  | }tjjj|d|gtjd}|S |rb| jjd |jd krb|d d d d | jjd d f 	tj |S )Nr   r   r   )rL   )
r   rR   r   r   nn
functionalpadmathinffill_)r3   r   r   
pad_amountr   r    r    r!   get_padded_lsea  s   "*zContext.get_padded_lser:   )r;   r<   r=   r   r   r   r   r	   r   r   r   r   rC   r   r   r    r    r    r!   r   W  s   
 

r   c                   @   s>   e Zd ZU ejed< ejed< ejed< dZeej ed< dS )	GradientsdqdkdvNdb)r;   r<   r=   r   r   r   r   r	   r    r    r    r!   r   n  s
   
 


r   c                   @   sX  e Zd ZU dZeed< ee ed< dZe	e
e
f ed< eej ed< eed< dZe
ed	< ed
fZee ed< eed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< eed< dZdZeed< ddgZee
 ed< ddgZee
 ed< ededefd d!Z ed"e
d#e
d$e
d%e
dee f
d&d'Z!ededee fd(d)Z"d
S )*AttentionOpBaseaI  Base class for any attention operator in xFormers

    See:

    - :attr:`xformers.ops.fmha.cutlass.FwOp`
    - :attr:`xformers.ops.fmha.cutlass.BwOp`
    - :attr:`xformers.ops.fmha.flash.FwOp`
    - :attr:`xformers.ops.fmha.flash.BwOp`
    - :attr:`xformers.ops.fmha.triton.FwOp`
    - :attr:`xformers.ops.fmha.triton.BwOp`
    OPERATORSUPPORTED_DEVICES)rT   r   CUDA_MINIMUM_COMPUTE_CAPABILITYSUPPORTED_DTYPESSUPPORTED_MAX_Kr   SUPPORTED_MIN_KNSUPPORTED_ATTN_BIAS_TYPESSUPPORTS_DROPOUTFSUPPORTS_CUSTOM_SCALESUPPORTS_DIFFERENT_VALUE_EMBEDSUPPORTS_OUTPUT_DTYPESUPPORTS_PARTIALTIS_DETERMINISTICSUPPORTS_BMGHKNAMEmemory_efficient_attentionVARLEN_LSE_PACKEDr   i,  _TEST_BATCH_SIZES       _TEST_Kdr   c                 C   s   |  | S rF   )not_supported_reasons)r.   r   r    r    r!   supports  s   zAttentionOpBase.supportsru   rw   rv   rx   c                 C   sd   g }| j s||kr|d t||| jkr|d| j  t||| jk r0|d| j  |S )Nz"query.shape[-1] != value.shape[-1]z(max(query.shape[-1], value.shape[-1]) > z(min(query.shape[-1], value.shape[-1]) < )r   appendmaxr   minr   )r.   ru   rw   rv   rx   reasonsr    r    r!   shape_not_supported_reasons  s   


z+AttentionOpBase.shape_not_supported_reasonsc                 C   s"  |j j}| j|d |jjd |d |jjtjkr|d n|jjd d}|j jj	}|j j}|| j
vr?|d| d| j
 d |dkrPtsPtjjdu rP|d	 |dkrstjjdu rstj|j}|| jk rs|d
| j d| d || jvr|d| d| j d t	|j| jvr|dt	|j  | js|jdur|j|ur|d |jr| js|d |jdkr| js|d |jdur| js|d |tju r|drtj|j jd dk r|d |  s|d | j st! r|d | j"s|j j#dkr|d |S )z
        Returns a list of reasons why this is not supported.
        The kernel can run these inputs only if the returned list is empty
        r   rQ   )ru   rw   rv   rx   zdevice=z (supported: r8   cudaNz'xFormers wasn't build with CUDA supportz"requires device with capability > z but your GPU has capability z
 (too old)zdtype=zattn_bias type is z!Custom output dtype not supportedzPartial attention not supportedrM   zdropout > 0.0zhas custom scaler      z$bf16 is only supported on A100+ GPUszCoperator wasn't built - see `python -m xformers.info` for more infozNoperator is non-deterministic, but `torch.use_deterministic_algorithms` is setrT   z&operator does not support BMGHK format)$rJ   rR   r   rK   rL   rB   r   r[   rE   rk   r   r   r   versionhipr   get_device_capabilityr   r   r#   r   r   rO   rP   r   rN   r   r(   r   bfloat16
startswithis_availabler   $are_deterministic_algorithms_enabledr   r%   )r.   r   query_shaper   device_typerB   device_capabilityr    r    r!   r     sj   
 












z%AttentionOpBase.not_supported_reasons)#r;   r<   r=   r   r   r   r
   strr   r   r   r   rB   r1   r   rk   r   r   rC   r   r   r   r   r   r   OPERATOR_CATEGORYr   r   r   r   classmethodrI   r   r   r   r    r    r    r!   r   w  sJ   
 r   c                
   @   s   e Zd ZU ejdejdejdiZeej	ef e
d< ejdejdejdiZeej	ef e
d< ed	ed
edeejee f fddZdS )AttentionFwOpBasega2U0*3?gMbp?{Gz?
ERROR_ATOLgh㈵>g-C6:?g{Gzt?
ERROR_RTOLinpneeds_gradientr   c                 C      t  rF   NotImplementedError)r.   r   r   r    r    r!   apply  s   zAttentionFwOpBase.applyN)r;   r<   r=   r   r1   halfr   r   r   rB   r   r   r   rI   rC   r   r   r	   r   r   r    r    r    r!   r     s"   
 r   c                	       s   e Zd ZU ejdejdejdiZeej	ef e
d< ejdejdejdiZeej	ef e
d< d	Zd
Zededee f fddZedededejdefddZ  ZS )r   gH}M?g?g?r   g-C6?r   g?r   FTr   r   c                    s:   t t| |}t|jtjr|jjr| js|	d |S )NzMComputing the bias gradient is not supported (attn_bias.requires_grad = True))
superr   r   r   r#   r   r   requires_gradSUPPORTS_ATTN_BIAS_GRADr   )r.   r   r   	__class__r    r!   r     s   z'AttentionBwOpBase.not_supported_reasonsctxr   gradc                 C   r   rF   r   )r.   r   r   r   r    r    r!   r   &  s   zAttentionBwOpBase.apply)r;   r<   r=   r   r1   r   r   r   r   rB   r   r   r   r   r   rI   r   r   r   r   r   r   r   __classcell__r    r    r   r!   r     s   
 &r   	num_headsc                 C   s<   | j dkr| S | | jd | || jd | jd gdS )NrU   r   r   r   )r   r   r   r   )r%   reshaperR   permute)rG   r   r    r    r!   bmk2bmhk0  s   
 r   r   name	alignmentc              
   C   s   |j d | dkr| | d| d n|d| dkr1| | d| d| d|  d	 |dd
krJ| | d| d|  d d S d S )NrQ   r   z.shape[-1] % z != 0rf   z.stride(-2) % z != 0 (z.stride() = r8   r   z.stride(-1) > 1 (z0) - you should call `.contiguous()` on the input)rR   r   stride)r   r   rD   r   r    r    r!   check_lastdim_alignment_stride18  s   r   )0r   dataclassesr   	functoolsr   typingr   r   r   r   r   r	   r
   r   r   r   r   _cpp_libr   commonr   r#   r   r   r   r   r   r   r   r   rC   r"   r   r&   r'   r1   rH   rI   r   r   r   r   r   AttentionOpr   r   r   r   r    r    r    r!   <module>   sb   0(	
	B
 I{&