o
     iF                     @   s  d dl mZ d dlmZ d dlmZmZmZmZm	Z	m
Z
mZmZ d dlZddlmZmZ ddlmZ dd	lmZmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z# dd
lm$Z$m%Z%m&Z&m'Z'm(Z(m)Z) de(de*fddZ+de(dee	ej, e	ej, e*e*f fddZ-de	eej,ef  de	ej, fddZ.dee/ de	eej,ef  ddfddZ0G dd de*eZ1de	eej,ef  de*fddZ2eG dd de%Z3eG dd de$Z4dS )     )replace)Enum)AnyIterableListMappingOptionalSetTupleUnionN   )get_operatorregister_operator   	attn_bias)AttentionBiasAttentionBiasSubTensor4BlockDiagonalCausalLocalAttentionFromBottomRightMask%BlockDiagonalCausalLocalAttentionMaskBlockDiagonalCausalMask*BlockDiagonalCausalWithOffsetGappyKeysMask+BlockDiagonalCausalWithOffsetPaddedKeysMaskBlockDiagonalGappyKeysMaskBlockDiagonalMaskBlockDiagonalPaddedKeysMask0LowerTriangularFromBottomRightLocalAttentionMask"LowerTriangularFromBottomRightMaskLowerTriangularMask!LowerTriangularMaskWithTensorBias0PagedBlockDiagonalCausalWithOffsetPaddedKeysMaskPagedBlockDiagonalGappyKeysMask PagedBlockDiagonalPaddedKeysMask)AttentionBwOpBaseAttentionFwOpBaseContext	GradientsInputscheck_lastdim_alignment_stride1inpreturnc                 C   s   dS )Nr    )r)   r+   r+   H/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/fmha/ck.py_minimum_gemm_alignment,   s   r-   c                 C   sx   | j }t|tttttfr.|j| j	j
 |j| j	j
 |jj}|jj}|jj}|jj}nd }d }d}d}||||fS )N)r   
isinstancer   r   r   r"   r!   	k_seqinfotoquerydevice	q_seqinfoseqstart
max_seqlen)r)   r   
seqstart_k
seqstart_qmax_seqlen_qmax_seqlen_kr+   r+   r,   _get_seqlen_info0   s*   

r;   r   c                 C   s2   t | trt | tr| jS d S t | tjr| S d S N)r/   r   r   
_subtensortorchTensorr   r+   r+   r,   _get_tensor_biasM   s   

r@   reasonsc                 C   s   t |}|d urQdt|jj }d}t|jd D ]}||| dkr5| d| d|  d d}q|r=| d	 |d
dkrS| d|  d d S d S d S )N   Fr   r   zattn_bias.stride(-2) % z != 0 (attn_bias.stride() = )TzHINT: To use an `attn_bias` with a sequence length that is not a multiple of 8, you need to ensure memory is aligned by slicing a bigger tensor. Example: use `attn_bias = torch.zeros([1, 1, 5, 8])[:,:,:,:5]` instead of `torch.zeros([1, 1, 5, 5])`r.   z/attn_bias.stride(-1) > 1 (attn_bias.stride() = z/) - you should call `.contiguous()` on the bias)	r@   r>   finfodtypebitsrangendimstrideappend)rA   r   attn_bias_tensor	alignmentshow_padding_hintdr+   r+   r,   _check_bias_alignmentX   s*   rO   c                   @   s   e Zd ZdZdZdZdZdS )_CustomMaskTypez*
    (Matches CustomMaskType in C++.)
    r   r   r   N)__name__
__module____qualname____doc__NoCustomMaskCausalFromTopLeftCausalFromBottomRightr+   r+   r+   r,   rP   t   s
    rP   biasc                 C   sF   t | tttfrttjS t | ttt	j
tttfrttjS ttjS r<   )r/   r   r   r   intrP   rV   r   r   r   &BlockDiagonalCausalFromBottomRightMaskr   r   r    rW   rU   )rX   r+   r+   r,   _custom_mask_type~   s(   


r[   c                       sx  e Zd ZU dZeddZdhZee e	d< e
je
jhZee
j e	d< dZede
jeeeeeeeeeeejejee e!e"fZ#e$e% e	d	< d
Z&d
Z'd
Z(d
Z)d
Z*dZ+e
j,de
jde
jdiZ-e.e
je,f e	d< e
j,de
jde
jdiZ/e.e
je,f e	d< g dZ0e1e2 e	d< e3de4de5de6e
je7e8 f fddZ9e3de4de5de6e
je7e8 f fddZ:e3de4de1e f fddZ;  Z<S ) FwOpz0xFormers' MHA kernel based on Composable Kernel.xformersefficient_attention_forward_ckcudaSUPPORTED_DEVICESSUPPORTED_DTYPES   NSUPPORTED_ATTN_BIAS_TYPESTckFga2U0*3?g~jtx?gy&1?
ERROR_ATOLgh㈵>g~jth?g{Gz?
ERROR_RTOL)    `   rB      rb   _TEST_Kr)   needs_gradientr*   c                 C   s@  t |jtjvrtd|jjdv rtd|jjdv r#| j||dS |jjdks2J d|jj d }|j	
 d d	kr|j
 d d	ksJJ d
|j	 }|j	
 }|j	|d	 |d |d |d f|d	 |d |d |d f}|j }|j
 }|j|d	 |d |d |d f|d	 |d |d |d f}	n|j	dd}|jdd}	|jj\}
}
}}}
|j}t|jtrt|j}|d ur|jdkrt|dd}nt|jtjr|jjdkr|jdd}t||jdd||	|d}| j||d\}}|d||f}|d ur|jd||f}t|||d}||fS )NUnsupported attn_bias type)r   r      z Unsupported number of dimensions)   )rk      zquery has shape rm   r   z0key and value should be expanded in the same wayr   r   rn   )r2   keyvaluer   )lseout)typer   r\   rc   NotImplementedErrorr2   rH   
apply_bmhkshaperp   rI   rq   size
as_stridedflattenr/   r   r@   r>   r?   r   	unflattenrr   )clsr)   rk   ctxk_shapek_striderp   v_shapev_striderq   _GHqattn_bias_replacebias_tensorrs   rr   r+   r+   r,   apply   sd   






z
FwOp.applyc                 C   s$  t |jtjvrtdt|\}}}}| j|j|j|j	t
|j||||j|t|j|jt|jttttfr;|jjjnd t|jtttfrI|jjnd t|jttfrV|jjnd t|jttfrc|jjnd d\}}}	}
d }|rt|||jdkrytnd d}|jdkrtj|	|
gtjdd|_ ||fS )Nrl   )r2   rp   rq   r   r8   r7   r9   	dropout_pcompute_logsumexpcustom_mask_typescaleseqlen_kwindow_sizeblock_tables	page_sizer   )rs   rr   op_bwcpu)rE   r3   )!rt   r   r\   rc   ru   r;   OPERATORr2   rp   rq   r@   pr[   r   r/   r   r   r"   r!   r0   seqlenr   r   r   _window_sizer   r   r%   BwOpr>   tensorint64	rng_state)r|   r)   rk   r7   r8   r9   r   rs   rr   rng_seed
rng_offsetr}   r+   r+   r,   rv   	  s~   

	

=
zFwOp.apply_bmhkrN   c                    sH   t t| |}t|}t|d|j| t|d|j| t||j |S )Nr2   rq   )	superr\   not_supported_reasonsr-   r(   r2   rq   rO   r   )r|   rN   rA   matmul_alignment_mn	__class__r+   r,   r   ]  s   zFwOp.not_supported_reasons)=rQ   rR   rS   rT   r   r   r`   r	   str__annotations__r>   halfbfloat16ra   rE   SUPPORTED_MAX_Krt   r?   r   r   r   r   r   r   r   r   r   r   r   rZ   r   r   r"   r    r!   rc   r   r   SUPPORTS_DROPOUTSUPPORTS_CUSTOM_SCALESUPPORTS_DIFFERENT_VALUE_EMBEDSUPPORTS_PARTIALSUPPORTS_BMGHKNAMEfloatre   r   rf   rj   r   rY   classmethodr'   boolr
   r   r%   r   rv   r   __classcell__r+   r+   r   r,   r\      st   
 
9S$r\   c                	       s   e Zd ZU ejZeddZejZejZdZ	e
dejeeeeeejejf	Zee ed< dZejZejZejZdZdZg dZe e! ed	< e"d
e#de e$ f fddZ%e"de&de#dejde'fddZ(  Z)S )r   r]   efficient_attention_backward_ckri   Nrc   TckB)rg   @   rh   rB   ri   rj   rN   r*   c                    s  t t| |}t|}t|d|j| t|d|j| t|d|j| t||j	 t
|j	}|d ur|jr|jjdkrS|jdkrSg |jjd d |jjd R }n|jjd |jjdkrd|jjd nd|jjd |jjd f}t|j|kr|d	t|j d
| d |S )Nr2   rp   rq   rm   r   r   r   rn   z=Broadcasting the `attn_bias` tensor is not supported (shape: z/ expected: rC   )r   r   r   r-   r(   r2   rp   rq   rO   r   r@   requires_gradrH   rw   tuplerJ   )r|   rN   rA   r   rK   expected_bias_shaper   r+   r,   r     s0   
$


zBwOp.not_supported_reasonsr}   r)   gradc                 C   sP  t |jtjvrtdt|\}}}}|jj}d }	}
|jdkrI|j	d u s:|j	jt
jks:|j	jj dks:|j	jdkrBtd|j	 |j	 \}	}
| j|||j|j|jft|j||||t|jttfrk|jjjnd |j|j||j|	|
t|j|jt|jtttfr|jj nd d\}}}}t|jt
j!r|jj"sd }t#||||dS )	Nrl   r   g        r   )r   zInvalid rng_state: )r   r8   r7   r9   r:   r   	logsumexpoutputr   r   r   r   r   r   )dqdkdvdb)$rt   r   r   rc   ru   r;   r2   rE   r   r   r>   r   r3   rw   tolistr   r1   rp   rq   r@   r/   r   r   r0   r   rr   rs   r[   r   r   r   r   r   r?   r   r&   )r|   r}   r)   r   r7   r8   r9   r:   rE   r   r   grad_qgrad_kgrad_v	grad_biasr+   r+   r,   r     sl   



	1z
BwOp.apply)*rQ   rR   rS   r\   rT   r   r   r`   ra   r   rt   r>   r?   r   r   r   r   r   r   rZ   r   rc   r   r   r   SUPPORTS_ATTN_BIAS_GRADr   r   r   SUPPORTS_UNPADDED_LSEr   rj   r   rY   r   r'   r   r   r%   r&   r   r   r+   r+   r   r,   r   g  s6   
 
 &r   )5dataclassesr   enumr   typingr   r   r   r   r   r	   r
   r   r>   commonr   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   rY   r-   r?   r;   r@   r   rO   rP   r[   r\   r   r+   r+   r+   r,   <module>   sB   (L 



 
 P