o
     iB                     @   s  d dl mZ d dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZddlmZmZ dd	lmZ dd
lmZmZmZmZmZmZmZmZmZmZmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' de(de)de)fddZ*de#de(fddZ+de#dee
ej, e
ej, e(e(f fddZ-de
eej,ef  de
ej, fddZ.de	e/ de
eej,ef  ddfddZ0G dd de(eZ1de
eej,ef  de(fd d!Z2eG d"d# d#e Z3eG d$d% d%eZ4dS )&    )replace)Enum)partial)AnyIterableListOptionalSetTupleUnionN   )get_operatorregister_operator   	attn_bias)AttentionBiasAttentionBiasSubTensor4BlockDiagonalCausalLocalAttentionFromBottomRightMask%BlockDiagonalCausalLocalAttentionMaskBlockDiagonalCausalMask+BlockDiagonalCausalWithOffsetPaddedKeysMaskBlockDiagonalMask0LowerTriangularFromBottomRightLocalAttentionMask"LowerTriangularFromBottomRightMaskLowerTriangularMask!LowerTriangularMaskWithTensorBias)AttentionBwOpBaseAttentionFwOpBaseContext	GradientsInputs_attn_bias_applycheck_lastdim_alignment_stride1)is_pt_cutlass_compatiblesmis_halfreturnc                 C   s   | dkrdS | dkr|S dS )NP   TF   F )r%   r&   r*   r*   M/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/fmha/cutlass.py_uses_tensorcores)   s
   r,   inpc                 C   s   | j jdkrdS tj| j }|d d |d  }tjdtjdtjdi| jj	 }t
||dk}d}|dkr7d}|r@t|d	| }|S )
Ncudar   r   
          r(         )devicetypetorchr.   get_device_capabilityfloathalfbfloat16querydtyper,   max)r-   capr%   bits_per_scalaruses_tensorcoresmatmul_alignment_mnr*   r*   r+   _minimum_gemm_alignment1   s   rB   c                 C   sh   | j }t|ttfr&|jjj| jjksJ |jj}|jj}|jj	}|jj	}nd }d }d}d}||||fS )N)
r   
isinstancer   r   	k_seqinfoseqstartr4   r;   	q_seqinfo
max_seqlen)r-   r   
seqstart_k
seqstart_qmax_seqlen_qmax_seqlen_kr*   r*   r+   _get_seqlen_infoB   s   
rM   r   c                 C   s2   t | trt | tr| jS d S t | tjr| S d S N)rD   r   r   
_subtensorr6   Tensorr   r*   r*   r+   _get_tensor_biasW   s   

rQ   reasonsc                 C   s   t |}|d urQdt|jj }d}t|jd D ]}||| dkr5| d| d|  d d}q|r=| d	 |d
dkrS| d|  d d S d S d S )Nr3   Fr   r   zattn_bias.stride(-2) % z != 0 (attn_bias.stride() = )TzHINT: To use an `attn_bias` with a sequence length that is not a multiple of 8, you need to ensure memory is aligned by slicing a bigger tensor. Example: use `attn_bias = torch.zeros([1, 1, 5, 8])[:,:,:,:5]` instead of `torch.zeros([1, 1, 5, 5])`rC   z/attn_bias.stride(-1) > 1 (attn_bias.stride() = z/) - you should call `.contiguous()` on the bias)	rQ   r6   finfor<   bitsrangendimstrideappend)rR   r   attn_bias_tensor	alignmentshow_padding_hintdr*   r*   r+   _check_bias_alignmentb   s*   r^   c                   @   s   e Zd ZdZdZdZdZdS )_CustomMaskTypez*
    (Matches CustomMaskType in C++.)
    r   r   r   N)__name__
__module____qualname____doc__NoCustomMaskCausalFromTopLeftCausalFromBottomRightr*   r*   r*   r+   r_   ~   s
    r_   biasc                 C   sD   t | tttfrttjS t | ttt	j
ttfrttjS ttjS rN   )rD   r   r   r   intr_   re   r   r   r   &BlockDiagonalCausalFromBottomRightMaskr   r   rf   rd   )rg   r*   r*   r+   _custom_mask_type   s&   



rj   c                       s*  e Zd ZU dZe reddndZdhZee	 e
d< ejejejhZeej e
d< dZedejeeeeeeeejejefZe e! e
d	< d
Z"d
Z#d
Z$d
Z%dZ&dZ'g dZ(e)e* e
d< e+de,de-de.eje/e0 f fddZ1e+de,de-de.eje/e0 f fddZ2e+de,de)e	 f fddZ3  Z4S )FwOpzxFormers' MHA kernel based on CUTLASS.
    Supports a large number of settings (including without TensorCores, f32 ...)
    and GPUs as old as P100 (Sm60)
    aten_efficient_attention_forwardNr.   SUPPORTED_DEVICESSUPPORTED_DTYPESi   SUPPORTED_ATTN_BIAS_TYPESTFzcutlassF-ptr0   r3      _TEST_Kr-   needs_gradientr'   c                    sj  t  jtjvrtd jjdv r| j |dS  jjdks(J d jj d } jjdkr{ jjd dkr{t	t
jdd}t | j| j| jt jt	t
jd	dd
 | j |d\}}|d}|d urwt||jd|d}||fS  jjd	 }t
j }|g fddt|d D  }g }	t|D ]`\}
}|| t
j|I  jd d d d |
f } jd d d d |
f } jd d d d |
f }t jt	t
jd|
d}|	| jt ||||d
|d W d    n1 sw   Y  q|dd  D ]}|| qt
jdd |	D d	d}|r1t|t
jdd |	D dd|	d d jd}||fS )NUnsupported attn_bias type)   r2   )rt      zquery has shape rv   r   )dimr   )r;   keyvaluer   )lseoutc                    s   g | ]}t jj jjd qS ))r4   )r6   r.   Streamr;   r4   ).0_r-   r*   r+   
<listcomp>   s    zFwOp.apply.<locals>.<listcomp>)rx   indexc                 S   s   g | ]}|d  qS )r   r*   r~   or*   r*   r+   r     s    c                 S   s   g | ]}|d  j qS )r   )r{   r   r*   r*   r+   r     s    r   )r|   r{   op_bw)r5   r   rk   rp   NotImplementedErrorr;   rW   
apply_bmhkshaper   r6   squeezer   ry   rz   r"   	unsqueezer{   r.   current_streamrV   	enumeratewait_streamstreamselectrY   stackr   r   )clsr-   rt   ctxslice_opr|   n_groupsmain_streamstreamsoutsgroupr   r;   ry   rz   rg   sr*   r   r+   apply   sn   	



z
FwOp.applyc                 C   s   t |jtjvrtdt|\}}}}| j|j|j|j	t
|j|||||j|t|j|jt|jtr8|jjjnd t|jtttfrF|jjnd d\}}}	}
}}d }|rgt||d}|jdkrg|	|
f|_t|_||fS )Nru   )r;   ry   rz   rg   cu_seqlens_qcu_seqlens_krK   rL   	dropout_pcompute_log_sumexpcustom_mask_typescaleseqlen_kwindow_size)r|   r{   r   )r5   r   rk   rp   r   rM   OPERATORr;   ry   rz   rQ   prj   r   rD   r   rE   seqlenr   r   r   _window_sizer   	rng_stateBwOpr   )r   r-   rt   rI   rJ   rK   rL   r|   r{   rng_seed
rng_offsetr   r   r*   r*   r+   r   
  sL   
	!

zFwOp.apply_bmhkr]   c                    sH   t t| |}t|}t|d|j| t|d|j| t||j |S )Nr;   rz   )	superrk   not_supported_reasonsrB   r#   r;   rz   r^   r   )r   r]   rR   rA   	__class__r*   r+   r   =  s   zFwOp.not_supported_reasons)5r`   ra   rb   rc   r$   r   r   rn   r	   str__annotations__r6   r8   r9   r:   ro   r<   SUPPORTED_MAX_Kr5   rP   r   r   r   r   r   r   r   r   ri   r   r   rp   r   r   SUPPORTS_DROPOUTSUPPORTS_CUSTOM_SCALESUPPORTS_DIFFERENT_VALUE_EMBEDSUPPORTS_BMGHKVARLEN_LSE_PACKEDNAMErs   r   rh   classmethodr!   boolr
   r   r   r   r   r   __classcell__r*   r*   r   r+   rk      s^   
 ?2$rk   c                	       s   e Zd ZU ejZe reddndZejZej	Z	ej
Z
edejeeeeejejfZee ed< dZejZejZejZdZdZg dZe e! ed	< e"d
e#de e$ f fddZ%e"de&de#dejde'fddZ(  Z)S )r   rl   _efficient_attention_backwardNrp   TFzcutlassB-ptrq   rs   r]   r'   c                    s  t t| |}t|}t|d|j| t|d|j| t|d|j| t||j	 t
|j	}|d ur|jr|jjdkrS|jdkrSg |jjd d |jjd R }n|jjd |jjdkrd|jjd nd|jjd |jjd f}t|j|kr|d	t|j d
| d |S )Nr;   ry   rz   rv   r   r   r   r2   z=Broadcasting the `attn_bias` tensor is not supported (shape: z/ expected: rS   )r   r   r   rB   r#   r;   ry   rz   r^   r   rQ   requires_gradrW   r   tuplerY   )r   r]   rR   rA   rZ   expected_bias_shaper   r*   r+   r   p  s0   
$


zBwOp.not_supported_reasonsr   r-   gradc                 C   s2  t |jtjvrtdt|\}}}}|jj}t	  }	}
|j
dkr/|jd us*J |j\}	}
t|j}tj|jjdk}| j|||j|j|jf||d urS|jnd|||||jd|d|j||j
|	|
t|j|jd t|jtttfr{|jjnd d\}}}}t|jtj	r|jjsd }t||||dS )	Nru   g        )   rw   Fr0   )force_pad_inf)rg   bias_requires_gradr   r   rK   rL   	logsumexpr|   r   philox_seedphilox_offsetr   r   num_splits_keyr   )dqdkdvdb)r5   r   r   rp   r   rM   r;   r<   r6   rP   r   r   rQ   r.   r7   r4   r   tory   rz   r   get_padded_lser|   rj   r   rD   r   r   r   r   r    )r   r   r-   r   rI   rJ   rK   rL   r<   r   r   tensor_biasr   grad_qgrad_kgrad_v	grad_biasr*   r*   r+   r     s\   




	*z
BwOp.apply)*r`   ra   rb   rk   rc   r$   r   r   rn   ro   r   r5   r6   rP   r   r   r   r   r   ri   r   rp   r   r   r   SUPPORTS_ATTN_BIAS_GRADr   r   r   r   r   rs   r   rh   r   r!   r   r   r   r    r   r   r*   r*   r   r+   r   G  s:   
 &r   )5dataclassesr   enumr   	functoolsr   typingr   r   r   r   r	   r
   r   r6   commonr   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   torch_attention_compatr$   rh   r   r,   rB   rP   rM   rQ   r   r^   r_   rj   rk   r   r*   r*   r*   r+   <module>   sH   $4$	


 
 '