import importlib.util
import os
from itertools import zip_longest
from typing import Any, Iterable, List, Optional, Set, Tuple, Union

import torch

from ..common import get_operator, register_operator
from .attn_bias import (
    VARLEN_BIASES,
    AttentionBias,
    BlockDiagonalCausalFromBottomRightMask,
    BlockDiagonalCausalLocalAttentionFromBottomRightMask,
    BlockDiagonalCausalLocalAttentionMask,
    BlockDiagonalCausalLocalAttentionPaddedKeysMask,
    BlockDiagonalCausalMask,
    BlockDiagonalCausalWithOffsetGappyKeysMask,
    BlockDiagonalCausalWithOffsetPaddedKeysMask,
    BlockDiagonalGappyKeysMask,
    BlockDiagonalMask,
    BlockDiagonalPaddedKeysMask,
    LocalAttentionFromBottomRightMask,
    LowerTriangularFromBottomRightLocalAttentionMask,
    LowerTriangularFromBottomRightMask,
    LowerTriangularMask,
    PagedBlockDiagonalCausalWithOffsetPaddedKeysMask,
    PagedBlockDiagonalPaddedKeysMask,
)
from .common import (
    AttentionBwOpBase,
    AttentionFwOpBase,
    Context,
    Gradients,
    Inputs,
    check_lastdim_alignment_stride1,
)
from .torch_attention_compat import is_pt_flash_old
Z,dZ-dZ.ej/j0du o{ej1j23 Z4dZ5ej6j7de8drddl9m:Z: ddl;m<Z< e<dure<j=>dZ,dZ-n~ej67drd dl?Z?d dl@Z?eAe?jBdre?jBjCZ:ne?jBjDZ:e?jEZ,dZFdZGeHdd e,Iddd D ZJeJeFk seJeGkrejKLdddkreMddNdd eFD  d dNd!d eGD  d"e, ddZ-ne4re+dd#du Z.ejOjPQ Z,e. Z-dZ5e,d
krejRjSd$d%d&gd'd(ejTd)ejTd*ejTd+e	ejT d,e	ejT d-e	ejT d.eUd/eUd0eVd1eVd2eWd3eUd4eUd5eWd6e	ejT d7eejTejTejTf f d8d9ZXejRYd$d:d; ZZejRjSd<d%d&gd'd=eWd>ejTd(ejTd)ejTd*ejTd?ejTd@ejTd+ejTd,ejTd.eUd/eUd0eVd1eVd2eWd3eUd4eUdAejTd7eejTejTejTf f$dBdCZ[ejRYd<dDdE Z\d=eWd7eejTejTejTf fdFdGZ]	dedHe(dIeWdJeWd7ee(e	ejT eUe	ejT eUe	ejT f fdKdLZ^dMe	eejTef  d7eWfdNdOZ_d7eWfdPdQZ`dMe	eejTef  d7eeUeUf fdRdSZadTe(dUeeb d7dfdVdWZcdXejTdYebdUeeb d7dfdZd[Zde-fd@ejTdHe(d\eeUd]f d^eWd7ejTf
d_d`ZeeG dadb dbe%ZfeG dcdd dde$ZgdS )f    N)zip_longest)AnyIterableListOptionalSetTupleUnion   )get_operatorregister_operator   )VARLEN_BIASESAttentionBias&BlockDiagonalCausalFromBottomRightMask4BlockDiagonalCausalLocalAttentionFromBottomRightMask%BlockDiagonalCausalLocalAttentionMask/BlockDiagonalCausalLocalAttentionPaddedKeysMaskBlockDiagonalCausalMask*BlockDiagonalCausalWithOffsetGappyKeysMask+BlockDiagonalCausalWithOffsetPaddedKeysMaskBlockDiagonalGappyKeysMaskBlockDiagonalMaskBlockDiagonalPaddedKeysMask!LocalAttentionFromBottomRightMask0LowerTriangularFromBottomRightLocalAttentionMask"LowerTriangularFromBottomRightMaskLowerTriangularMask0PagedBlockDiagonalCausalWithOffsetPaddedKeysMask PagedBlockDiagonalPaddedKeysMask)AttentionBwOpBaseAttentionFwOpBaseContext	GradientsInputscheck_lastdim_alignment_stride1)is_pt_flash_oldz0.0.0Fz..._C_flashattention)package   )_C_flashattention)_build_metadatavT
flash_attnflash_attn_cuda)r
      r   )r
   r.      c                 c   s    | ]}t |V  qd S N)int).0s r4   K/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/fmha/flash.py	<genexpr>J   s    r6   .#XFORMERS_IGNORE_FLASH_VERSION_CHECK01z#Requires Flash-Attention version >=c                 C      g | ]}t |qS r4   strr2   ir4   r4   r5   
<listcomp>O       r@   z,<=c                 C   r;   r4   r<   r>   r4   r4   r5   r@   P   rA   z	 but got )forcezxformers_flash::flash_fwdr4   cuda)mutates_argsdevice_typesquerykeyvaluecu_seqlens_qcu_seqlens_k	seqused_kmax_seqlen_qmax_seqlen_kpsoftmax_scale	is_causalwindow_leftwindow_rightreturn_softmaxblock_tablesreturnc                 C   s   d}t r7tjjj| ||||||||
d|	|||d d}tr+|\}}}}}t||g}n|\}}}}}|||fS |d u r]|d u sAJ |d u sGJ t| ||d d ||	|
||||d \}}}}nt	| ||d |||d |d ||||	d|
||||d \}}}}|||fS )N        F)return_debug_maskscalewindow_size_leftwindow_size_rightrK   alibi_slopes)
_USE_PT_FLASH_ATTNtorchopsaten_flash_attention_forwardpt_flash_is_oldstackr)   fwd
varlen_fwd)rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   softcapret	attention	logsumexpphilox_seedphilox_offset_	rng_stateoutsoftmax_lser4   r4   r5   
_flash_fwd^   s   

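# The fake kernel registered below mirrors the output shapes of the real
# kernel, which keeps the custom op traceable (e.g. under torch.compile /
# meta tensors). As a rough, hypothetical sketch of a direct call to the
# registered op (in practice only FwOp.apply below does this):
#
#   out, lse, rng_state = torch.ops.xformers_flash.flash_fwd(
#       q, k, v, None, None, None, q.shape[1], k.shape[1],
#       0.0, q.shape[-1] ** -0.5, False, -1, -1, False, None,
#   )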
@torch.library.register_fake("xformers_flash::flash_fwd")
def _flash_fwd_abstract(
    query,
    key,
    value,
    cu_seqlens_q,
    cu_seqlens_k,
    seqused_k,
    max_seqlen_q,
    max_seqlen_k,
    p,
    softmax_scale,
    is_causal,
    window_left,
    window_right,
    return_softmax,
    block_tables,
):
    out = torch.empty_like(query)
    if cu_seqlens_q is None:
        B, M, H, K = query.shape
        lse_shape = [B, H, M]
    else:
        M, H, K = query.shape
        B = cu_seqlens_q.shape[0] - 1
        if VARLEN_LSE_PACKED:
            lse_shape = [H, M]
        else:
            lse_shape = [B, H, M]
    softmax_lse = torch.empty(lse_shape, device=query.device, dtype=torch.float32)
    rng_state = torch.empty([2], device=query.device, dtype=torch.int64)
    return out, softmax_lse, rng_state


@torch.library.custom_op(
    "xformers_flash::flash_bwd", mutates_args=(), device_types=["cuda"]
)
def _flash_bwd(
    grads_share_storage: bool,
    grad: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    out: torch.Tensor,
    lse: torch.Tensor,
    cu_seqlens_q: torch.Tensor,
    cu_seqlens_k: torch.Tensor,
    max_seqlen_q: int,
    max_seqlen_k: int,
    p: float,
    softmax_scale: float,
    is_causal: bool,
    window_left: int,
    window_right: int,
    rng_state: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    softcap = 0.0
    if _USE_PT_FLASH_ATTN:
        if rng_state is not None and pt_flash_is_old:
            philox_seed = rng_state[0]
            philox_offset = rng_state[1]
        else:
            philox_seed = philox_offset = rng_state
        dq, dk, dv = torch.ops.aten._flash_attention_backward(
            grad,
            query,
            key,
            value,
            out,
            lse,
            cu_seqlens_q,
            cu_seqlens_k,
            max_seqlen_q,
            max_seqlen_k,
            p,
            is_causal,
            philox_seed,
            philox_offset,
            scale=softmax_scale,
            window_size_left=window_left,
            window_size_right=window_right,
        )
    else:
        dq, dk, dv = _create_dq_dk_dv(grads_share_storage, query, key, value)
        if cu_seqlens_q is None:
            assert cu_seqlens_k is None
            _C_flashattention.bwd(
                grad, query, key, value, out, lse, dq, dk, dv,
                None,  # alibi_slopes
                p, softmax_scale, is_causal,
                window_left, window_right, softcap,
                False,  # deterministic
                None,  # rng
                rng_state,
            )
        else:
            _C_flashattention.varlen_bwd(
                grad, query, key, value, out, lse, dq, dk, dv,
                cu_seqlens_q, cu_seqlens_k,
                None,  # alibi_slopes
                max_seqlen_q, max_seqlen_k,
                p, softmax_scale,
                False,  # zero_tensors
                is_causal, window_left, window_right, softcap,
                False,  # deterministic
                None,  # rng
                rng_state,
            )
    return dq, dk, dv
@torch.library.register_fake("xformers_flash::flash_bwd")
def _flash_bwd_abstract(
    grads_share_storage, grad, query, key, value, *args, **kwargs
):
    return _create_dq_dk_dv(grads_share_storage, query, key, value)


def _create_dq_dk_dv(
    grads_share_storage: bool, query, key, value
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    # Create dq, dk, dv. If Q/K/V came from a single packed QKV tensor, also
    # pack the gradients so that the kernel writes them without a concat
    if grads_share_storage:
        chunk = torch.empty(
            (*query.shape[0:-2], 3, query.shape[-2], query.shape[-1]),
            dtype=query.dtype,
            device=query.device,
        )
        return chunk.select(-3, 0), chunk.select(-3, 1), chunk.select(-3, 2)
    return torch.empty_like(query), torch.empty_like(key), torch.empty_like(value)


def _convert_input_format(
    inp: Inputs,
    supports_mqa: bool,
    use_kvsplit: bool = False,
) -> Tuple[
    Inputs,
    Optional[torch.Tensor],
    int,
    Optional[torch.Tensor],
    int,
    Optional[torch.Tensor],
]:
    assert inp.query.ndim in (4, 5)
    query, key, value = inp.query, inp.key, inp.value
    batch = query.shape[0]
    seqlen_q = query.shape[1]
    seqlen_kv = key.shape[1]
    head_dim_q = query.shape[-1]
    head_dim_v = value.shape[-1]

    attn_bias = inp.attn_bias
    if isinstance(attn_bias, BlockDiagonalMask):
        assert attn_bias.k_seqinfo.seqstart.device == inp.query.device
        cu_seqlen_k = attn_bias.k_seqinfo.seqstart
        cu_seqlen_q = attn_bias.q_seqinfo.seqstart
        max_seqlen_q = attn_bias.q_seqinfo.max_seqlen
        max_seqlen_k = attn_bias.k_seqinfo.max_seqlen
        seqused_k = None
    elif isinstance(
        attn_bias,
        (
            BlockDiagonalGappyKeysMask,
            BlockDiagonalPaddedKeysMask,
            PagedBlockDiagonalPaddedKeysMask,
        ),
    ):
        assert attn_bias.k_seqinfo.seqstart.device == inp.query.device
        cu_seqlen_k = attn_bias.k_seqinfo.seqstart
        cu_seqlen_q = attn_bias.q_seqinfo.seqstart
        max_seqlen_q = attn_bias.q_seqinfo.max_seqlen
        max_seqlen_k = attn_bias.k_seqinfo.max_seqlen
        seqused_k = attn_bias.k_seqinfo.seqlen
    else:
        cu_seqlen_k = None
        cu_seqlen_q = None
        seqused_k = None
        max_seqlen_q = inp.query.shape[1]
        max_seqlen_k = inp.key.shape[1]

    if query.ndim == 5:  # BMGHK
        assert supports_mqa

        def fold(x):
            # Merge the G/H dimensions together: either the heads within a
            # group are broadcast (stride 0), or they are contiguous
            if x.stride(3) == 0:
                return x[:, :, :, 0]
            return x.reshape([x.shape[0], x.shape[1], -1, x.shape[4]])

        query = fold(query)
        key = fold(key)
        value = fold(value)

    if use_kvsplit and key.ndim == 4 and key.stride(2) == 0 and value.stride(2) == 0:
        # Heads are broadcast: keep a single head for the split-KV path
        key = key[:, :, :1]
        value = value[:, :, :1]

    if cu_seqlen_q is not None:
        # Variable-length: flatten the batch/sequence dimensions
        query = query.reshape([batch * seqlen_q, -1, head_dim_q])
        key = key.reshape([batch * seqlen_kv, -1, head_dim_q])
        value = value.reshape([batch * seqlen_kv, -1, head_dim_v])
        if isinstance(attn_bias, PagedBlockDiagonalPaddedKeysMask):
            num_pages = key.shape[0] // attn_bias.page_size
            key = key.view(num_pages, attn_bias.page_size, *key.shape[1:])
            value = value.view(num_pages, attn_bias.page_size, *value.shape[1:])

    if use_kvsplit:
        assert query.ndim == 3 and key.ndim == 3 and value.ndim == 3
        batch_kv = len(attn_bias.k_seqinfo.seqstart_py) - 1
        key = key.view([batch_kv, -1, key.shape[1], key.shape[2]])
        value = value.view([batch_kv, -1, value.shape[1], value.shape[2]])

    new_inp = Inputs(
        query=query,
        key=key,
        value=value,
        attn_bias=attn_bias,
        p=inp.p,
        scale=inp.scale,
        output_dtype=inp.output_dtype,
        is_partial=inp.is_partial,
    )
    return new_inp, cu_seqlen_q, max_seqlen_q, cu_seqlen_k, max_seqlen_k, seqused_k


def _is_causal(attn_bias: Optional[Union[torch.Tensor, AttentionBias]]) -> bool:
    return isinstance(
        attn_bias,
        (
            LowerTriangularMask,
            LowerTriangularFromBottomRightMask,
            LowerTriangularFromBottomRightLocalAttentionMask,
            BlockDiagonalCausalMask,
            BlockDiagonalCausalLocalAttentionMask,
            BlockDiagonalCausalLocalAttentionFromBottomRightMask,
            BlockDiagonalCausalLocalAttentionPaddedKeysMask,
            BlockDiagonalCausalFromBottomRightMask,
            BlockDiagonalCausalWithOffsetGappyKeysMask,
            BlockDiagonalCausalWithOffsetPaddedKeysMask,
            PagedBlockDiagonalCausalWithOffsetPaddedKeysMask,
        ),
    )
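# Illustrative example (uses the public attn_bias API): the varlen layout that
# _convert_input_format above produces is driven by the cumulative seqstart
# offsets stored on the bias object, e.g.
#
#   bias = BlockDiagonalMask.from_seqlens([3, 5])  # two sequences, 8 tokens
#   # bias.q_seqinfo.seqstart -> tensor([0, 3, 8])
#
# These offsets are what the kernel ultimately receives as cu_seqlens_q/k.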
def _is_paged_attention_supported(attn_bias_type) -> bool:
    if issubclass(attn_bias_type, PagedBlockDiagonalPaddedKeysMask):
        return not _USE_PT_FLASH_ATTN
    return True


def _window_size(
    attn_bias: Optional[Union[torch.Tensor, AttentionBias]]
) -> Tuple[int, int]:
    win_left = -1
    win_right = -1
    if isinstance(
        attn_bias,
        (
            BlockDiagonalCausalLocalAttentionMask,
            BlockDiagonalCausalLocalAttentionFromBottomRightMask,
            BlockDiagonalCausalLocalAttentionPaddedKeysMask,
            LowerTriangularFromBottomRightLocalAttentionMask,
        ),
    ):
        win_left = attn_bias._window_size - 1
    if isinstance(attn_bias, LocalAttentionFromBottomRightMask):
        win_left = attn_bias.window_left
        win_right = attn_bias.window_right
    return (win_left, win_right)


def _check_needs_no_topleft(d: Inputs, reasons: List[str]) -> None:
    # Flash only supports BottomRight-aligned causal masks, so TopLeft-aligned
    # biases are accepted only when both alignments coincide
    if isinstance(d.attn_bias, BlockDiagonalCausalMask):
        for k_start, q_start in zip_longest(
            d.attn_bias.k_seqinfo.seqstart_py,
            d.attn_bias.q_seqinfo.seqstart_py,
        ):
            if k_start != q_start:
                reasons.append(
                    "Only support BlockDiagonalCausalMask if equal numbers of "
                    "keys and queries"
                )
                return
    elif isinstance(d.attn_bias, LowerTriangularMask):
        if d.query.shape[1] != d.key.shape[1]:
            reasons.append(
                "Only support LowerTriangularMask if equal number of "
                "keys and queries"
            )
def _check_strides_for_bmghk(
    x: torch.Tensor, name: str, reasons: List[str]
) -> None:
    """
    We want to be able to collapse the G/H dimensions together
    """
    if x.ndim == 5:
        stride_g, stride_h = x.stride(2), x.stride(3)
        if x.shape[2] == 1:
            return
        if x.shape[3] == 1 or stride_h == 0:
            return
        if stride_g != stride_h * x.shape[3]:
            reasons.append(
                "GQA is only supported when the G/H dimensions are contiguous\n"
                f"    {name}.stride:  {x.stride()}\n"
                f"    {name}.shape :  {list(x.shape)}"
            )


def _post_process_lse(
    lse: torch.Tensor,
    inp: Inputs,
    original_query_shape: Tuple[int, ...],
    varlen_lse_packed: bool = VARLEN_LSE_PACKED,
) -> torch.Tensor:
    if not isinstance(inp.attn_bias, VARLEN_BIASES):
        if len(original_query_shape) == 5:
            # [B, G*H, M] => [B, G, H, M]
            return lse.unflatten(1, original_query_shape[2:4])
        return lse

    if varlen_lse_packed:
        if len(original_query_shape) == 5:
            # [G*H, total_q] => [1, G, H, total_q]
            return lse.unflatten(0, original_query_shape[2:4]).unsqueeze(0)
        return lse.unsqueeze(0)

    if inp.is_partial:
        return lse

    # [B, H, M] => [1, H, B * M]
    lse_hkm = lse.permute(1, 0, 2).flatten(start_dim=1)[None]
    if len(original_query_shape) == 5:
        return lse_hkm.unflatten(1, original_query_shape[2:4])
    return lse_hkm
ejejhZeej e	d< dZed	eeeeeeeeeeeeeee e!fZ"e#e$ e	d
< dd e"D Z"dZ%dZ&dZ'dZ(dZ)e*Z*e+r\de, dnde, Z-e,Z.e/de0de1e f fddZ2e/de0de3de4ej5e6e7 f fddZ8  Z9S )FwOpzOperator that computes memory-efficient attention using         `Flash-Attention <https://github.com/HazyResearch/flash-attention>`_         implementation.
    xformers_flash	flash_fwdrC   SUPPORTED_DEVICES   r   SUPPORTED_DTYPES   NSUPPORTED_ATTN_BIAS_TYPESc                 C   s   g | ]}t |r|qS r4   )r   )r2   br4   r4   r5   r@   h  s
    zFwOp.<listcomp>TFzfa2F@-ptr   rU   c                    s   t t| |}t|d|jd t|| t|jd| t|jd| t|jd| |j	rDt
sDt|jtrD|jj}|j|jkrD|d |S )NrF   r   rG   rH   z,partial attention with heterogeneous queries)superr   not_supported_reasonsr%   rF   r   r   rG   rH   r   ru   r   r   r   r   
min_seqlenr   r   )clsr   r   r   	__class__r4   r5   r   u  s    


zFwOp.not_supported_reasonsr   needs_gradientc                 C   s  d}|j j}g |j jd d |jjd }t|dd\}}}}}	}
|j  dkrh|j dkrht|j\}}t|jt	rA|jj
nd }| j|j |j|j|||
||	|j|jt|j||||d\}}}||}nBtj||j j|j jd}d }tjtrt|jtr|j jd |j jd |j jd	  gn|j jd |j jd |j jd	 g|j jtjd}|s|d fS t|t|||d
}|jdkrt|_||_||fS )NFr   Tr   r   )rQ   rR   rS   rT   rp   r
   r   )rm   r   rV   )rF   rt   rH   r   numelrG   r   r   r   r   rT   OPERATORrN   scale_floatr   r   r]   zerosrq   rr   rv   ru   r   rw   r"   r   BwOpop_bwrl   )r   r   r   rS   r   	out_shaperI   rL   rJ   rM   rK   r   r   rT   rm   rn   rl   ctxr4   r4   r5   apply  sx   




& 	

z
FwOp.apply):__name__
__module____qualname____doc__r   r   r   r   r=   __annotations__CUDA_MINIMUM_COMPUTE_CAPABILITYr]   halfbfloat16r   rr   SUPPORTED_MAX_Ktyper   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   SUPPORTS_DROPOUTSUPPORTS_CUSTOM_SCALESUPPORTS_DIFFERENT_VALUE_EMBEDSUPPORTS_BMGHKSUPPORTS_PARTIALru   r\   FLASH_VERSIONNAMEVERSIONclassmethodr$   r   r   boolr   Tensorr   r"   r   __classcell__r4   r4   r   r5   r   H  s\   
 
r   c                
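# Illustrative usage (hypothetical user code, not part of this module): the
# dispatcher normally selects these operators automatically, but they can be
# forced explicitly through the public API:
#
#   from xformers.ops import memory_efficient_attention
#   from xformers.ops.fmha import flash
#
#   out = memory_efficient_attention(q, k, v, op=(flash.FwOp, flash.BwOp))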
       s   e Zd ZU ejZeddZejZejZej	Z	ej
Z
eeejeeeeeeehZee ed< ejZejZejZdZdZeZerGde  dnde  Z!e Z"dZ#e$de%d	e&e' f fd
dZ(e$de)de%de*j+d	e,fddZ-  Z.S )r   r   	flash_bwdr   Fzfa2B@r      r   rU   c                    s   t t| |}t|d|jd t|| |jjdkrGtj	
|j}|dv }t|jjd |jjd | jkrG|sG|jdkrG|d| j d |S )	NrF   r   rC   )r   )	   r   r   rV   zdrequires a GPU with compute capability 8.0 (A100) or 9.0 (H100) for dropout when 'query.shape[-1] > ')r   r   r   r%   rF   r   rq   r   r]   rC   get_device_capabilitymaxrG   rt   MAX_HEADDIM_DROPOUT_SM8xrN   r   )r   r   r   device_capabilityis_sm80_or_sm90r   r4   r5   r     s    
 
zBwOp.not_supported_reasonsr   r   r   c                 C   s  |j j|jj|jj}}}t|dd\}}}}	}
}|d u s J |j}t|jtr9t	r9|jd dks4J |d }n|jd |ksBJ |d d d d d |f 
 }g |j jd d |jjd }|j| jv siJ |j  r|j rt|j\}}t| j|j||
 |j |j|j|j||||	||
|j|jt|j|||jdkr|jnd d }ntt|j t|jt|jd	}|j dkr|j  |j  |j dkr|j  |j||_|j||_|j||_|S )
NFr   r   r   r
   r   rV   )rQ   rR   rl   )r   r   r   )rF   rt   rG   rH   r   r   r   r   r   ru   
contiguousrr   r   r   r   r#   r   qkv_share_storager   rm   rN   r   r   rl   r]   
zeros_liker   r   zero_r   )r   r   r   r   dq_shapedk_shapedv_shaperI   rL   rJ   rM   rK   ctx_lsekernel_out_shaper   r   gradsr4   r4   r5   r     sv   









z
BwOp.apply)/r   r   r   r   r   r   r   r   r   r   r   tuplesetr   
differencer   r   r   r   r   r   r   r   r   r   r   r   r   IS_DETERMINISTICr   ru   r\   r   r   r   r  r   r$   r   r=   r   r"   r]   r   r#   r   r   r4   r4   r   r5   r     s@   
 

&r   )F)himportlib.util	importlibos	itertoolsr   typingr   r   r   r   r   r   r	   r]   commonr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   torch_attention_compatr&   r   ru   ra   versionhipbackendsrC   is_flash_attention_available_TRY_PT_FLASH_ATTNr\   util	find_spec__package__ r)   _cpp_libr*   flash_versionlstripr,   flash_attn.flash_attn_interfacehasattrflash_attn_interfacer-   flash_attn_gpu__version__FLASH_VER_MINFLASH_VER_LASTr  splitflash_ver_parsedenvirongetImportErrorjoinnnrg   _get_flash_versionlibrary	custom_opr   r1   floatr   ro   register_faker~   r   r   r   r   r   r   r   r=   r   r   r   r   r   r4   r4   r4   r5   <module>   sh  $P 