o
     iw                  *   @   s  d dl Zd dlZd dlmZmZmZmZmZm	Z	m
Z
 d dlZd dlmZmZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z# ddlm$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* dd	l+m,Z,m-Z-m.Z.m/Z/m0Z0 d
Z1ej2j3de4drddl5m6Z6 ddl7m8Z8 e8dure8j9:dZ1nej23drd dl;m<Z6 ndZ6dd Z=de	ej> fddZ?dee dee fddZ@deAdeAdeAdeAdeAf
dd ZBdeAdeAfd!d"ZCe6durd#d$gZDd dlEZEdeAfd%d&ZFeDD ]\ZGZHeFeIe6eGjJZKeKeHksJ d'eG d(eH d)eK d*qejLjMd+d,d-gd.					/d[d0ejNd1ejNd2ejNd3eejN d4eejN d5eejN d6eAd7eAd8eOd9eOd:ePdeAdeAd;eejN d<eejN d=eejN d>eejN d?ePde
ejNejNf f&d@dAZQejLRd+			d\d0ejNd1ejNd2ejNd3eejN d4eejN d5eejN d6eAd7eAd8eOd9eOd:ePdeAdeAd;eejN d<eejN d=eejN de
ejNejNf f"dBdCZSeejTjUjVdDdEd0ejNd1ejNd2ejNd3eejN d4eejN d5eejN d6eAd7eAd8eOd9eOd:ePdeAdeAfdFdGZWdHePde
ejNejNejNf fdIdJZXejLjMdKd,d-gd.dHePdLejNd0ejNd1ejNd2ejNdMejNdNejNd3ejNd4ejNd6eAd7eAd9eOd:ePdeAdeAde
ejNejNejNf f dOdPZYejLRdKdHePdLejNd0ejNd1ejNd2ejNdMejNdNejNd3ejNd4ejNd6eAd7eAd9eOd:ePdeAdeAde
ejNejNejNf f dQdRZZeejTjUj[dDdEdHePdLejNd0ejNd1ejNd2ejNdMejNdNejNd3ejNd4ejNd6eAd7eAd9eOd:ePdeAdeAfdSdTZ\eG dUdV dVe%Z]eG dWdX dXe$Z^eG dYdZ dZe]Z_dS )]    N)AnyIterableListOptionalSequenceSetTuple)%_unpack_flash_attention_nested_shapesregister_flop_formula   )get_operatorregister_operator   )VARLEN_BIASES&BlockDiagonalCausalFromBottomRightMask4BlockDiagonalCausalLocalAttentionFromBottomRightMask%BlockDiagonalCausalLocalAttentionMask/BlockDiagonalCausalLocalAttentionPaddedKeysMaskBlockDiagonalCausalMask*BlockDiagonalCausalWithOffsetGappyKeysMask+BlockDiagonalCausalWithOffsetPaddedKeysMaskBlockDiagonalGappyKeysMaskBlockDiagonalMaskBlockDiagonalPaddedKeysMask!LocalAttentionFromBottomRightMask0LowerTriangularFromBottomRightLocalAttentionMask"LowerTriangularFromBottomRightMaskLowerTriangularMask0PagedBlockDiagonalCausalWithOffsetPaddedKeysMask PagedBlockDiagonalPaddedKeysMask)AttentionBwOpBaseAttentionFwOpBaseContext	GradientsInputsScaledTensorcheck_lastdim_alignment_stride1)_check_needs_no_topleft_convert_input_format
_is_causal_post_process_lse_window_sizez0.0.0z..._C_flashattention3)package   )_C_flashattention3)_build_metadatavflash_attn_interface)flashattn_hopper_cudac                 C   s"   | d ur|  ddkr|  S | S )Nr   )stride
contiguousx r8   L/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/fmha/flash3.pymaybe_contiguousH   s   "r:   returnc                  C   s.   t jt jh} tjdddkr| t j | S )NXFORMERS_FLASH3_FP801)torchhalfbfloat16osenvirongetaddfloat8_e4m3fn)typesr8   r8   r9   supported_dtypesL   s   rH   attn_bias_typesc                 C   s$   t jdddkr| S dd | D S )NXFORMERS_FLASH3_PAGEDr=   r>   c                 S   s   g | ]	}t |ts|qS r8   )
issubclassr   .0r7   r8   r8   r9   
<listcomp>Y   s    z+_paged_attention_filter.<locals>.<listcomp>)rB   rC   rD   )rI   r8   r8   r9   _paged_attention_filterV   s
   rO   s_qs_kwindow_leftwindow_rightc                 C   s   |dk r|dk r| | S |dk r%|dkr%| | d  d | t d||    S |dk r+|}|dk r1|}t| |}t|| }||d | d 7 }|| | | 7 }t|d | }||d | d 7 }|| | | 7 }|S )Nr   r   r   )maxmin)rP   rQ   rR   rS   mask_nzlastq_ut	firstq_btr8   r8   r9   mask_non_zeros`   s    "

rY   c                 C   s   | \}}}}|\}	}
}}|\}}}}||	  kr|ks J  J |
|ks&J ||ks,J ||ks2J ||ks8J ||
 dks@J t ||||}d| | | | }|d| | | | 7 }|S )z^
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    r   r   )rY   )query_shape	key_shapevalue_shaperR   rS   bh_qrP   d_q_b2h_kvrQ   _d2_b3_h2_s3d_vrV   total_flopsr8   r8   r9   sdpa_flop_count   s   rh   )fwd   )bwd   c                 C   s0   t d| }|r|dd}t|S td)Nz	\((.*?)\)r   ,z.No valid argument list found in the docstring.)researchgroupsplitlen
ValueError)	docstringmatch	args_listr8   r8   r9   count_args_from_doc   s
   rw   z"Found func signature mismatch for z. Expected z	,actual: z/ Please update the version of Flash Attention3.zxformers_flash3::flash_fwdr8   cuda)mutates_argsdevice_typesFquerykeyvaluecu_seqlens_qcu_seqlens_k	seqused_kmax_seqlen_qmax_seqlen_kpsoftmax_scale	is_causal	descale_q	descale_k	descale_vblock_tableuse_kvsplitc                 C   s  dd | |fD \} }| ddkrt|}dd |||fD \}}}t|}|d u r|d u s1J |d u s7J |d u s?J dtjg | ||d d d d d d d d d d d d d d d ||||	|
||ddd	dd	dR  ^}}}||fS |d u r|d u r|d u sJ d
|rd	}| jd dkr| jd |jd krd}tjg | ||d d d d |d d ||||d d d d ||||	|
||ddd	d|dR  ^}}}| jd }|j}||d |d|d }|dddd|d}||fS tjg | ||d d d ||d u r5|nd d d ||||d d d d ||||	|
||ddddd dR  ^}}}||fS )Nc                 S      g | ]}t |qS r8   r:   rL   r8   r8   r9   rN      s    zmha_fwd.<locals>.<listcomp>r   c                 S   r   r8   r   rL   r8   r8   r9   rN      s    z7Block table is not supported for fixed-length query yetr           FzQFP8 attention does not yet support variable-length inputs during the forward pass@   r   Tr3   r-   )r4   r:   r.   ri   shapeviewpermutereshape)r{   r|   r}   r~   r   r   r   r   r   r   r   rR   rS   r   r   r   r   r   outsoftmax_lserestpack_gqanum_heads_qori_lse_shaper8   r8   r9   mha_fwd   s  
	
!"	

#	
"r   c                 C   sV   | j }| |}|d u r|d |d |d fn|d |d f}| j|tjd}||fS )Nr   r   r   )dtype)r   	new_emptyr?   float32)r{   r|   r}   r~   r   r   r   r   r   r   r   rR   rS   r   r   r   rZ   r   	lse_shapelser8   r8   r9   mha_fwd_fakel  s   
r   T)get_rawc              	      s@  d| j   krdksJ  J d|j   krdksJ  J d|j   kr*dks-J  J tjdddkrbd  } } }}| j dkrH| dn| } |j dkrT|dn|}|j dkr`|dn|}t| j dkrn| ddn| |j dkrz|ddn||j dkr|ddn|||||d	}|
rdt fd
d|D }|S )Nr-       XFORMERS_FLOP_FORMULA_WORST_CASEr=   r>   r   r   r   )r{   r|   r}   	cum_seq_q	cum_seq_kmax_qmax_kc                 3   s*    | ]\}}}}t ||| d V  qdS )rR   rS   N)rh   )rM   rZ   r[   r\   _r   r8   r9   	<genexpr>  s    

z mha_fwd_flops.<locals>.<genexpr>)ndimrB   rC   rD   	unsqueezer	   	transposesum)r{   r|   r}   r~   r   r   r   r   r   r   r   rR   rS   argskwargssizesresr8   r   r9   mha_fwd_flops  s.   	
r   grads_share_storagec                 C   s~   | r1t jg |jdd d|jd |jd R |j|jd}|dd|dd|ddfS t |t |t |fS )	Nr   r   r-   r3   )r   devicer   r   r   )r?   emptyr   r   r   select
empty_like)r   r{   r|   r}   chunkr8   r8   r9   _create_dq_dk_dv  s   *"r   zxformers_flash3::flash_bwddoutr   r   c                 C   s   d  }}t | |||\}}}d}|d u r;|d u sJ t|||||||||d d ||d d ||||dd|d^}}}}}n t||||||||||||||	|
||||dd|d^}}}}}|||fS )NFr   r   )r   r.   rk   )r   r   r{   r|   r}   r   r   r~   r   r   r   r   r   rR   rS   	seqused_qr   dqdkdvis_deterministic	softmax_dr   r8   r8   r9   mha_bwd  sp   
r   c                 C   s(   t |}t |}t |}|||fS N)r?   r   )r   r   r{   r|   r}   r   r   r~   r   r   r   r   r   rR   rS   r   r   r   r8   r8   r9   mha_bwd_fake$  s   



r   c                 O   s*   dt |||||d |	|
dd|||d d S )N   r   g      ?)
r~   r   r   r   r   r   r   r   rR   rS   r   )r   )r   r   r{   r|   r}   r   r   r~   r   r   r   r   r   rR   rS   r   r   r8   r8   r9   mha_bwd_flops;  s(   r   c                       s  e Zd ZU dZeddZdhZee e	d< dZ
e Zeej e	d< dZd	Zed
eeeeeeeeeeeeeee e!fZ"e#e$ e	d< e%e"Z"dZ&dZ'dZ(dZ)dZ*dZ+de, Z-e,Z.e/de0de1e f fddZ2e/	dde0de3de3de4ej5e6e7 f fddZ8  Z9S )FwOpzOperator that computes memory-efficient attention using         `Flash-Attention <https://github.com/HazyResearch/flash-attention>`_         implementation.
    xformers_flash3	flash_fwdrx   SUPPORTED_DEVICES)	   r   SUPPORTED_DTYPES   r   NSUPPORTED_ATTN_BIAS_TYPESFTzfa3F@dr;   c                    sH   t t| |}t|d|jd |jjd dvr|d t|| |S Nr{      r3   )r         r   z.only head-dim 64, 128, 192 or 256 is supported)superr   not_supported_reasonsr&   r{   r   appendr'   clsr   reasons	__class__r8   r9   r     s   

zFwOp.not_supported_reasonsinpneeds_gradientr   c                 C   s  |j j}g |j jd d |jjd }t|d|d\}}}}}	}
dd }||j \}}||j\}}||j\}}|j  dkr|j dkrt|j\}}t|jt	rY|jj
nd }| j|||f|||
||	|j|jt|j|||||||d\}}||}n(tj|j j|j j|j jd}tj|j jd |j jd	 |j jd
 g|j jtjd}t||d}|s|d fS t|t||t|ddd}||fS )Nr3   T)supports_mqar   c                 S   s   t | tr	|  S | d fS r   )
isinstancer%   unpackr6   r8   r8   r9   unpack_func  s   zFwOp.apply.<locals>.unpack_funcr   )r~   r   r   r   r   r   r   r   rR   rS   r   r   r   r   r   )r   r   r   r   )r   r   )varlen_lse_packed)r{   r   r}   r(   r|   numelr+   	attn_biasr   r   block_tablesOPERATORr   scale_floatr)   r   r?   zerosr   r   r   r   r"   r*   tuple)r   r   r   r   original_query_shape	out_shaper~   r   r   r   r   r   qr   kr   r0   r   win_left	win_rightr   r   r   ctxr8   r8   r9   apply  s   


 z
FwOp.apply)F):__name__
__module____qualname____doc__r   r   r   r   str__annotations__CUDA_MINIMUM_COMPUTE_CAPABILITYrH   r   r?   r   SUPPORTED_MAX_KSUPPORTED_MIN_Ktyper   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rO   SUPPORTS_DROPOUTSUPPORTS_CUSTOM_SCALESUPPORTS_DIFFERENT_VALUE_EMBEDSUPPORTS_BMGHKSUPPORTS_PARTIALUNPADDED_LSEFLASH_VERSIONNAMEVERSIONclassmethodr$   r   r   boolr   Tensorr   r"   r   __classcell__r8   r8   r   r9   r   e  s`   
 


r   c                
       s   e Zd ZU ejZeddZejZejZej	Z	ej
Z
ejZedeeeeeeeeef
ZejZejZejZdZdZddgZee ed< de  Z!e Z"e#d	e$d
e%e f fddZ&e#de'de$de(j)d
e*fddZ+  Z,S )BwOpr   	flash_bwdNF varlen_flatSUPPORTS_LSE_FORMATSzfa3B@r   r;   c                    sR   t t| |}t|d|jd t|| |jjd dvr"|d t|| |S r   )r   r
  r   r&   r{   r'   r   r   r   r   r8   r9   r     s   


zBwOp.not_supported_reasonsr   r   gradc                 C   s  |j j|jj|jj}}}t|dd\}}}}	}
}|j}t|jtr1|jd dks,J |d }n|jd |ks:J |d d d d d |f 	 }g |j jd d |jjd }|j
| jv saJ |j  r|j rt|j\}}| j|j||	 |j |j|j|j||j||	||
|||jt|jd\}}}t|||}ntt|j t|jt|jd}|j||_|j||_|j||_|S )	NF)r   r   r   r   r3   )rR   rS   r   r   )r   r   r   )r{   r   r|   r}   r(   r   r   r   r   r5   r   r   r   r+   r   qkv_share_storager   r   r   r)   r#   r?   
zeros_liker   r   r   )r   r   r   r  dq_shapedk_shapedv_shaper~   r   r   r   r   ctx_lsekernel_out_shaper   r   r   r   r   gradsr8   r8   r9   r     sd   






z
BwOp.apply)-r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   IS_DETERMINISTICr   r  r   r   r   r  r  r  r  r$   r   r   r"   r?   r  r#   r   r	  r8   r8   r   r9   r
    s>   
 


&r
  c                       sj   e Zd ZU dZdZeed< eefZ	e
e ed< e	ddedededeejee f f fd	d
Z  ZS )FwOp_KVSplitzOperator that computes memory-efficient attention using         `Flash-Attention3 <https://github.com/Dao-AILab/flash-attention/tree/main/hopper>`_         implementation with heuristic rules to dispatch decoding shapes to KVSplit Attention     Tenable_kvsplit_attnr   r   r   r   r;   c                    sR   |j }t|ts
J |jj|jjk}|jjdk}|o |o | jo |}t |||S )N
   )	r   r   r   	q_seqinfo
min_seqlen
max_seqlenr  r   r   )r   r   r   r   r   homogeneous_qshort_qr   r8   r9   r   e  s   zFwOp_KVSplit.apply)T)r   r   r   r   r  r  r   r   r   r   r   r   r  r$   r   r?   r  r   r"   r   r	  r8   r8   r   r9   r  W  s"   
 r  )NNNNF)NNN)`importlib.util	importlibrB   typingr   r   r   r   r   r   r   r?   torch.utils.flop_counterr	   r
   commonr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   flashr'   r(   r)   r*   r+   r  util	find_spec__package__r  r.   _cpp_libr/   flash_versionlstripr1   r2   r:   r   rH   rO   intrY   rh   EXPECTED_NUM_OF_ARGSrn   rw   namenum_of_argsgetattrr   num_of_args_from_doclibrary	custom_opr  floatr  r   register_faker   opsr   r   r   r   r   r   r  r   r   r
  r  r8   r8   r8   r9   <module>   s   $L$	

*


	
 
&	
	
;
	

K	
	
) 	h