o
    ۷iq                    @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZmZmZ d dlZd dlmZ d dlm  mZ ej rNd dlm  mZ ddlmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,m-Z- dd	l.m/Z/ d
dl0m1Z1 erd
dl0m2Z2 dZ3dZ4dZ5dZ6dZ7dZ8ee9Z:e oe de3Z;e Z<e oede4Z=e# oe$de5Z>e&de6Z?e% Z@e' oe(de7ZAe) oe*de8ZBe;rzd dlCmDZDmEZE d dlFmGZGmHZH W n1 eIeJeKfy ZL ze:MdeL d dZ;dZDdZEdZGdZHW Y dZL[LndZL[Lww dZDdZEdZGdZHe<rOzd dlNmDZO d dlNmEZP W n) eIeJeKfyN ZL ze:MdeL d dZ<dZOdZPW Y dZL[Ln	dZL[Lww dZOdZPe=rzd dlQmDZR W n% eIeJeKfy ZL ze:MdeL d dZ=dZRW Y dZL[LndZL[Lww dZRe>rzd dlSmTZTmUZUmVZVmWZWmXZXmYZY W n9 eIeJeKfy ZL ze:MdeL d dZ>dZTdZUdZVdZWdZXdZYW Y dZL[LndZL[Lww dZTdZWdZXdZUdZVdZYe?rzd dlZm  m[  m\Z\ W n% eIeJeKfy ZL ze:Md eL d dZ?dZ\W Y dZL[LndZL[Lww dZ\e@r9zd d!l]m^Z^ W n% eIeJeKfy8 ZL ze:Md"eL d dZ@dZ^W Y dZL[LndZL[Lww dZ^eArjzd d#l_m`Za W n% eIeJeKfyi ZL ze:Md$eL d dZAdZaW Y dZL[LndZL[Lww dZaeBrzd dlbmcZd W n% eIeJeKfy ZL ze:Md%eL d dZBdZdW Y dZL[LndZL[Lww dZdejed&krejfjgZhejfjiZjnd9ddd'd(d)Zkd9dd
d*d+d,ZlekZhelZjG d-d. d.eme	ZnG d/d0 d0ZoeG d1d2 d2Zpenjqepd3d4d5d6d
d7enjrepd3d8d
d9enjsepd:d4d;d<d
d7enjtepd:d8d
d9enjuepd=d>d
d9iZvd?ewd@< ejxenjyfd:dCdDZz		E				d;dddFd<dZd[Z{d=d]d^Z|d>d_d`Z}d>dadbZ~d?dgdhZd>didjZd>dkdlZ	d9d@dmdnZdAdodpZejdqdr	d9dBdxdyZ	d9dCdzd{Z		dDdEd|d}ZdFddZdd ZdGddZdAddZehdddd							 	E	
			 dHdIddZejd							 	E	
			 dHdIddZ		E						dJdKddZdLddZ		E						dJdKddZdLddZ		E						dJdKddZdLddZ		E						dJdKddZdLddZ		E						dJdKddZdLddZ		E						dJddEd
ddd ddMddZdLddĄZ		E						dJdKddƄZ		E						dJdKddȄZdLddʄZd9dNdd̄Z		E						dJdKdd΄ZdLddЄZdd҄ ZdOddՄZdPdQddلZG ddۄ dejjZdRddZdSddZdRddZdSddZdTddZe/dUddZe/dUddZG dd dejjZG dd dejjZG dd dejjZ			
dVdWddZ		E				dXdddYddZeojenje}eegdd		E				d;dZddZeojenjse}eegdd		E				d;dZdd Zeojenjte}eegdd		E				d[d\ddZeojenje}eegd		E				d[d\ddZeojenje}eegd					d]d^ddZeojenjqe}eegdd					E			d_d`d	d
Zeojenjre}eegdd					d]d^ddZeojenje}eegd					d]d^ddZeojenje~eegd		E				d;dZddZeojenje|e}egd						dadbddZ	dcddddZeojenjye}egdd		E					dedfddZeojenje}eegdd		E					dedfddZeojenje}egd		E					dedfddZeojenje}eegdd		E					dedfdd Zeojenje}egd		E					dedfd!d"Zeojenje}eegdd		E			dgdhd#d$Zeojenje}egd				didjd%d&Zeojenje~eegdd					dkdld'd(Zeojenjue~eegdd					dkdld)d*Zeojenje~eegd					dkdld+d,Zeojenjed-d egd					dkdld.d/Zeojenjed-d egd					dkdld0d1Zeojenjed2d egd					dkdld3d4Zeojenjed2d egd					dkdld5d6Zeojenje|e}egd		E					dedfd7d8ZdS (m      )annotationsN)	dataclass)Enum)TYPE_CHECKINGAnyCallable   )
get_loggeris_aiter_availableis_aiter_versionis_flash_attn_3_availableis_flash_attn_availableis_flash_attn_versionis_kernels_availableis_kernels_versionis_sageattention_availableis_sageattention_versionis_torch_npu_availableis_torch_versionis_torch_xla_availableis_torch_xla_versionis_xformers_availableis_xformers_version)DIFFUSERS_ATTN_BACKENDDIFFUSERS_ATTN_CHECKS)maybe_allow_in_graph   )gather_size_by_comm)ParallelConfigz2.6.3z0.1.5z2.1.12.5.0z2.2z0.0.29>=)flash_attn_funcflash_attn_varlen_func)_wrapped_flash_attn_backward_wrapped_flash_attn_forwardz.flash_attn is installed but failed to import: z+. Falling back to native PyTorch attention.F)r!   )r"   zflash_attn_3 failed to import: z#. Falling back to native attention.zaiter failed to import: )sageattnsageattn_qk_int8_pv_fp8_cuda!sageattn_qk_int8_pv_fp8_cuda_sm90sageattn_qk_int8_pv_fp16_cudasageattn_qk_int8_pv_fp16_tritonsageattn_varlenz sageattention failed to import: z!flex_attention failed to import: )npu_fusion_attentionztorch_npu failed to import: )flash_attentionztorch_xla failed to import: zxformers failed to import: z2.4.0)device_typesschemac               C  s   dd }|d u r
|S |S )Nc                 S     | S N funcr1   r1   Y/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/attention_dispatch.pywrap      zcustom_op_no_op.<locals>.wrapr1   )namefnmutates_argsr-   r.   r5   r1   r1   r4   custom_op_no_op      r:   )lib_stacklevelc               C  s   dd }|d u r
|S |S )Nc                 S  r/   r0   r1   r2   r1   r1   r4   r5      r6   z!register_fake_no_op.<locals>.wrapr1   )opr8   r<   r=   r5   r1   r1   r4   register_fake_no_op   r;   r?   c                   @  sp   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdS )AttentionBackendNameflash	flash_hubflash_varlenflash_varlen_hub_flash_3_flash_varlen_3_flash_3_hub_flash_3_varlen_hubaiterflexnative_native_cudnn_native_efficient_native_flash_native_math_native_npu_native_xlasagesage_hubsage_varlen_sage_qk_int8_pv_fp8_cuda_sage_qk_int8_pv_fp8_cuda_sm90_sage_qk_int8_pv_fp16_cuda_sage_qk_int8_pv_fp16_tritonxformersN)__name__
__module____qualname__FLASH	FLASH_HUBFLASH_VARLENFLASH_VARLEN_HUB_FLASH_3_FLASH_VARLEN_3_FLASH_3_HUB_FLASH_3_VARLEN_HUBAITERFLEXNATIVE_NATIVE_CUDNN_NATIVE_EFFICIENT_NATIVE_FLASH_NATIVE_MATH_NATIVE_NPU_NATIVE_XLASAGESAGE_HUBSAGE_VARLEN_SAGE_QK_INT8_PV_FP8_CUDA_SAGE_QK_INT8_PV_FP8_CUDA_SM90_SAGE_QK_INT8_PV_FP16_CUDA_SAGE_QK_INT8_PV_FP16_TRITONXFORMERSr1   r1   r1   r4   r@      s4    r@   c                   @  sr   e Zd Zi Zi Zi Ze Zee	Z
eZe		ddd	d
Zedd ZedddZedd ZedddZdS )_AttentionBackendRegistryNFbackendr@   constraintslist[Callable] | Nonesupports_context_parallelboolc                   s,   t d  d   fdd}|S )NzRegistering attention backend: z with constraints: c                   sH   | j  < pg j < tt| j j < r"j	 j
 | S r0   )	_backends_constraintssetinspect	signature
parameterskeys_supported_arg_names_supports_context_paralleladdvaluer2   rw   clsrx   rz   r1   r4   	decorator  s   
z5_AttentionBackendRegistry.register.<locals>.decorator)loggerdebug)r   rw   rx   rz   r   r1   r   r4   register  s   	z"_AttentionBackendRegistry.registerc                 C  s   | j | j| j  fS r0   )_active_backendr|   r   r1   r1   r4   get_active_backend&  s   z,_AttentionBackendRegistry.get_active_backendstrc                 C  s
   || _ d S r0   )r   )r   rw   r1   r1   r4   set_active_backend*  s   
z,_AttentionBackendRegistry.set_active_backendc                 C  s   t | j S r0   )listr|   r   r   r1   r1   r4   list_backends.  s   z'_AttentionBackendRegistry.list_backendsreturnc                 C  s   |j | jv }|S r0   )r   r   )r   rw   rz   r1   r1   r4   _is_context_parallel_available2  s   z8_AttentionBackendRegistry._is_context_parallel_available)NF)rw   r@   rx   ry   rz   r{   )rw   r   )rw   r@   r   r{   )rZ   r[   r\   r|   r}   r   r~   r   r@   r   r   r   _checks_enabledclassmethodr   r   r   r   r   r1   r1   r1   r4   rv   
  s&    

rv   c                   @  sv   e Zd ZU dZded< ded< dZded< dZded	< dZd
ed< dZded< dZ	ded< dZ
d
ed< dZd
ed< dS )_HubKernelConfigzEConfiguration for downloading and using a hub-based attention kernel.r   repo_idfunction_attrNz
str | Nonerevisionz
int | NoneversionzCallable | None	kernel_fnwrapped_forward_attrwrapped_backward_attrwrapped_forward_fnwrapped_backward_fn)rZ   r[   r\   __doc____annotations__r   r   r   r   r   r   r   r1   r1   r1   r4   r   ;  s   
 r   zkernels-community/flash-attn3r!   z(flash_attn_interface._flash_attn_forwardz)flash_attn_interface._flash_attn_backward)r   r   r   r   r   r"   )r   r   r   zkernels-community/flash-attn2z0flash_attn_interface._wrapped_flash_attn_forwardz1flash_attn_interface._wrapped_flash_attn_backwardz kernels-community/sage-attentionr%   z.dict['AttentionBackendName', _HubKernelConfig]_HUB_KERNELS_REGISTRYrw   str | AttentionBackendNamec              	   c  sh    | t jvrtd|  dt| } t|  t|  t j}t |  zdV  W t | dS t | w )z>
    Context manager to set the active attention backend.
    zBackend z is not registered.N)rv   r|   
ValueErrorr@   %_check_attention_backend_requirements"_maybe_download_kernel_for_backendr   r   )rw   old_backendr1   r1   r4   attention_backendl  s   

r           )rw   parallel_configquerytorch.Tensorkeyr   	attn_masktorch.Tensor | None	dropout_pfloat	is_causalr{   scalefloat | None
enable_gqaattention_kwargsdict[str, Any] | NoneAttentionBackendName | Noner   'ParallelConfig' | Noner   c	                  s   |pi }|	d u rt  \ }n
t|	 t j }| ||||||d|d|
i}tddr2||d< t jr^t|tt j   }|rNt	
d  d| d t j D ]	}|di | qT fd	d
| D }|di |S )N)r   r   r   r   r   r   r   _parallel_configr    r   r   z5Removing unsupported arguments for attention backend z: .c                   s$   i | ]\}}|t j  v r||qS r1   )rv   r   ).0kvbackend_namer1   r4   
<dictcomp>  s   $ z)dispatch_attention_fn.<locals>.<dictcomp>r1   )rv   r   r@   r|   getr   r   r~   r   r   warningr}   items)r   r   r   r   r   r   r   r   r   rw   r   
backend_fnkwargsremoved_kwargscheckr1   r   r4   dispatch_attention_fn  s6   	
r   Nonec                 K  s   | d ur
|rt dd S d S )Nz8`is_causal` cannot be True when `attn_mask` is not None.)r   )r   r   r   r1   r1   r4   _check_attn_mask_or_causal  s   r   c                 K  sD   | j |j ks| j |j krtd| j|jks| j|jkr tdd S )Nz1Query, key, and value must be on the same device.z/Query, key, and value must have the same dtype.)devicer   dtyper   r   r   r   r1   r1   r4   _check_device  s
   r   c                 K  s$   t | || | jjdkrtdd S )Ncudaz/Query, key, and value must be on a CUDA device.)r   r   typer   r   r1   r1   r4   _check_device_cuda  s   r   majorintminorr   c                   s   d	 fdd}|S )
Nr   r   r   r   r   r   c                   s<   t | || tj| j fk rtd  d dd S )NzJQuery, key, and value must be on a CUDA device with compute capability >= r   )r   torchr   get_device_capabilityr   r   r   r   r   r1   r4   check_device_cuda  s   z:_check_device_cuda_atleast_smXY.<locals>.check_device_cudar   r   r   r   r   r   r   r   r1   )r   r   r   r1   r   r4   _check_device_cuda_atleast_smXY  s   r   c                 K  s,   | j |j kr
td| j |j krtdd S )Nz'Query and key must have the same dtype.z)Query and value must have the same dtype.)r   r   r   r1   r1   r4   _check_qkv_dtype_match  s
   r   c                 K  s*   t | || | jtjtjfvrtdd S )Nz9Query, key, and value must be either bfloat16 or float16.)r   r   r   bfloat16float16r   r   r1   r1   r4   _check_qkv_dtype_bf16_or_fp16  s   r   c                 K  sd   | j d |j d krtd|j d |j d krtd|d ur.|j d |j d kr0tdd S d S )Nz0Query and key must have the same head dimension.z1Key and value must have the same sequence length.z4Attention mask must match the key's sequence length.)shaper   )r   r   r   r   r   r1   r1   r4   _check_shape  s   r   c                 C  s  | t jt jfv rtstd| j dt dd S | t jt jfv r-t	s+td| j dd S | t j
t jt jt jt jfv rWt sGtd| j dtdd	sUtd| j d
d S | t jkrltsjtd| j dt dd S | t jt jt jt jt jt jfv rtstd| j dt dd S | t jkrtstd| j dd S | t jkrtstd| j dd S | t j krt!std| j dt" dd S | t j#krt$std| j dt% dd S d S )NzFlash Attention backend 'zb' is not usable because of missing package or the version is too old. Please install `flash-attn>=z`.zFlash Attention 3 backend 'zp' is not usable because of missing package or the version is too old. Please build FA3 beta release from source.z	Backend 'zl' is not usable because the `kernels` package isn't available. Please install it with `pip install kernels`.r    z0.12zj' needs to be used with a `kernels` version of at least 0.12. Please update with `pip install -U kernels`.zAiter Attention backend 'z]' is not usable because of missing package or the version is too old. Please install `aiter>=zSage Attention backend 'ze' is not usable because of missing package or the version is too old. Please install `sageattention>=zFlex Attention backend 'zd' is not usable because of missing package or the version is too old. Please install `torch>=2.5.0`.zNPU Attention backend 'za' is not usable because of missing package or the version is too old. Please install `torch_npu`.zXLA Attention backend 'za' is not usable because of missing package or the version is too old. Please install `torch_xla>=zXformers Attention backend 'z`' is not usable because of missing package or the version is too old. Please install `xformers>=)&r@   r]   r_   _CAN_USE_FLASH_ATTNRuntimeErrorr   _REQUIRED_FLASH_VERSIONra   rb   _CAN_USE_FLASH_ATTN_3r^   r`   rc   rd   ro   r   r   re   _CAN_USE_AITER_ATTN_REQUIRED_AITER_VERSIONrn   rp   rq   rr   rs   rt   _CAN_USE_SAGE_ATTN_REQUIRED_SAGE_VERSIONrf   _CAN_USE_FLEX_ATTNrl   _CAN_USE_NPU_ATTNrm   _CAN_USE_XLA_ATTN_REQUIRED_XLA_VERSIONru   _CAN_USE_XFORMERS_ATTN_REQUIRED_XFORMERS_VERSION)rw   r1   r1   r4   r     s   





r      )maxsize
batch_size	seq_len_q
seq_len_kvr   torch.device | Nonec           
      C  s   t j| f|t j|d}t j| f|t j|d}t j| d t j|d}t j| d t j|d}t j|dd|dd < t j|dd|dd < |  }|  }	||f||f||	ffS )Nr   r   r   r   dim)r   fullint32zeroscumsummaxitem)
r   r   r   r   	seqlens_q	seqlens_kcu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr1   r1   r4   3_prepare_for_flash_attn_or_sage_varlen_without_maskB  s   r	  c           
      C  s   t j| f|t j|d}|jdt jd}t j| d t j|d}t j| d t j|d}t j|dd|dd < t j|dd|dd < |  }|  }	||f||f||	ffS )Nr   r   )r   r   r   r   )r   r   r   sumr   r   r  r  )
r   r   r   r   r  r  r  r  r  r  r1   r1   r4   0_prepare_for_flash_attn_or_sage_varlen_with_maskT  s   r  c                 C  s$   |d u rt | |||S t| |||S r0   )r	  r  )r   r   r   r   r   r1   r1   r4   &_prepare_for_flash_attn_or_sage_varlene  s   r  	seq_len_kc                 C  sj  | j tjkrtd| j  d| jdkr| d||} n| jdkrA| dd|fvr:td| jd  d| d| ||} n\| jd	krj| dd|fvr]td| jd  d| d
| j	dd} | ||} n3| jdkr| dd|fvrtd| jd  d| d| |dd|} | j	dd} ntd| j | j||fkrtd| j d| d| d| S )z
    Normalize an attention mask to shape [batch_size, seq_len_k] (bool) suitable for inferring seqlens_[q|k] in
    FlashAttention/Sage varlen.

    Supports 1D to 4D shapes and common broadcasting patterns.
    z)Attention mask must be of type bool, got r   r   r   r   zattn_mask.shape[0] (z) must be 1 or z for 2D attention mask.   z for 3D attention mask.r      z for 4D attention mask.r   )r   r   z"Unsupported attention mask shape: z.Normalized attention mask shape mismatch: got z, expected (z, ))
r   r   r{   r   ndim	unsqueezeexpandsizer   any)r   r   r  r1   r1   r4   _normalize_attn_maskq  s<   



r  c                 C  s   ||kS r0   r1   	batch_idxhead_idxq_idxkv_idxr1   r1   r4   _flex_attention_causal_mask_mod  s   r  	attr_pathr   c                 C  sD   | }| dD ]}t||std| j d| dt||}q|S )Nr   zKernel module 'z"' does not define attribute path 'z'.)splithasattrAttributeErrorrZ   getattr)moduler  targetattrr1   r1   r4   _resolve_kernel_attr  s   
r%  c              
   C  s   | t vrd S t |  }|jd u }|jd uo|jd u }|jd uo"|jd u }|s+|s+|s+d S z1ddlm} ||j|j	|j
d}|rEt||j|_|rNt||j|_|rZt||j|_W d S W d S  tyu } ztd|j d|   d }~ww )Nr   )
get_kernel)r   r   z)An error occurred while fetching kernel 'z' from the Hub: )r   r   r   r   r   r   kernelsr&  r   r   r   r%  r   	Exceptionr   error)rw   configneeds_kernelneeds_wrapped_forwardneeds_wrapped_backwardr&  kernel_moduleer1   r1   r4   r     s.   
r   z,_diffusers_flash_attn_3::_flash_attn_forwardr1   r   )r9   r-   qr   r   softmax_scalecausalqv	q_descale	k_descale	v_descaleattention_chunksoftcap
num_splitspack_gqabool | Nonedeterministic	sm_margin!tuple[torch.Tensor, torch.Tensor]c                 C  s   d}t di d| d|d|d|d|d|d|d	|d
|d|d|	d|
d|d|d|d|dd}|^}}}|ddd}||fS )Nr   r   r0  r   r   r1  r2  r3  r4  r5  r6  window_sizer7  r8  r9  r:  r<  r=  return_attn_probsTr   r   r   r1   )flash_attn_3_funcpermute)r0  r   r   r1  r2  r3  r4  r5  r6  r7  r8  r9  r:  r<  r=  r@  resultoutlse_r1   r1   r4   _wrapped_flash_attn_3  sN   	

rH  c                 C  s0   d}| j \}}}}|||f}t| | |fS )Nr?  )r   r   
empty_like	new_empty)r0  r   r   r1  r2  r3  r4  r5  r6  r7  r8  r9  r:  r<  r=  r@  r   seq_len	num_headshead_dim	lse_shaper1   r1   r4   rG    s   
rG  Tctx#torch.autograd.function.FunctionCtx
return_lse	_save_ctxr   c              
   C  s   |	rt d|
r| ||| || _|| _|| _|| _|| _dd |||fD \}}}tjj	j
||||||||d}|dddd}|S )	Nz1Native attention does not support return_lse=Truec                 s       | ]}| d dddV  qdS r   r   r   r  NrC  r   xr1   r1   r4   	<genexpr>9      z/_native_attention_forward_op.<locals>.<genexpr>r   r   r   r   r   r   r   r   r   r   r   r  )r   save_for_backwardr   r   r   r   r   r   nn
functionalscaled_dot_product_attentionrC  rO  r   r   r   r   r   r   r   r   rQ  rR  r   rE  r1   r1   r4   _native_attention_forward_op  s,   
r`  grad_outc              
   O  s   | j \}}}|d |d |d dd |||fD \}}}	tjjj|||	| j| j| j| j	| j
d}
|
dddd}
|dddd}tjj|
|||	g|d	d
\}}}|dddd}|dddd}|dddd}|||fS )NTc                 s  rS  rT  rU  rV  r1   r1   r4   rX  U  rY  z0_native_attention_backward_op.<locals>.<genexpr>rZ  r   r   r   r  F)outputsinputsgrad_outputsretain_graph)saved_tensorsrequires_grad_r   r\  r]  r^  r   r   r   r   r   rC  autogradgrad)rO  ra  argsr   r   r   r   query_tkey_tvalue_trE  
grad_out_tgrad_query_t
grad_key_tgrad_value_t
grad_querygrad_key
grad_valuer1   r1   r4   _native_attention_backward_opI  s0   




ru  c                 C  s   |rt dd}|dd }|dd }|dd }||||f7 }tjjj|||||	||d|d	\	}}}}}}}}}|||||||f7 }|
rc| j|  || _|| _	|| _
|| _|| _|| _|dd }|d urw|dd }|	r}||fS |S )Nz6`enable_gqa` is not yet supported for cuDNN attention.r1   r   r   F)	r   r   r   	attn_biascompute_log_sumexpr   r   return_debug_maskr   )r   	transpose
contiguousr   opsaten#_scaled_dot_product_cudnn_attentionr[  r   r   r   r   max_qmax_krO  r   r   r   r   r   r   r   r   rQ  rR  r   tensors_to_saverE  rF  	cum_seq_q	cum_seq_kr~  r  philox_seedphilox_offsetdebug_attn_maskr1   r1   r4   _cudnn_attention_forward_opq  s@   
r  c                 O  s   | j \	}}}}}}	}
}}|dd }|dd }|dd }tjjj||||||||| j|	|
| j| j	| j
| j| jd\}}}dd |||fD \}}}|||fS )Nr   r   )	logsumexpr  r  rv  r  r  r~  r  r   r   r   c                 s       | ]}| d d V  qdS r   r   Nry  rz  rV  r1   r1   r4   rX    rY  z/_cudnn_attention_backward_op.<locals>.<genexpr>)rf  ry  rz  r   r{  r|  ,_scaled_dot_product_cudnn_attention_backwardr   r~  r  r   r   r   rO  ra  rj  r   r   r   r   rE  rF  r  r  r  r  rr  rs  rt  r1   r1   r4   _cudnn_attention_backward_op  s0   
r  c              	   C  s   |rt dd}|dd }|dd }|dd }||||f7 }tjjj|||||d|d\	}}}}}}}}}|||||||f7 }|
r^| j|  || _|| _	|| _
|| _|| _|dd }|d urr|dd }|	rx||fS |S )Nz=`enable_gqa` is not yet supported for native flash attention.r1   r   r   F)r   r   r   r   r   rx  r   )r   ry  rz  r   r{  r|  #_scaled_dot_product_flash_attentionr[  r   r   r   r~  r  r  r1   r1   r4   "_native_flash_attention_forward_op  s:   
r  c                 O  s   | j \	}}}}}}	}
}}|dd }|dd }|dd }tjjj|||||||||	|
| j| j| j	| j
| jd\}}}dd |||fD \}}}|||fS )Nr   r   )
r  r  r  r  r  r~  r  r   r   r   c                 s  r  r  r  rV  r1   r1   r4   rX  %  rY  z6_native_flash_attention_backward_op.<locals>.<genexpr>)rf  ry  rz  r   r{  r|  ,_scaled_dot_product_flash_attention_backwardr~  r  r   r   r   r  r1   r1   r4   #_native_flash_attention_backward_op  s.   
r  c                 C  s8  |d urt d|rt dd}d}d }d}tdd |||fD }|d u r-|jd d	 }|s9|d urA|jjd
krA|dkr?|nd}t|% t|||||||d |d
 |||	\}}}}|ddd
}W d    n1 snw   Y  |
r| 	|||||| || _
|| _|| _|| _|| _|| _|| _|	r||fS |S )Nz2`attn_mask` is not yet supported for flash-attn 2.z3`enable_gqa` is not yet supported for flash-attn 2.r?  r   Fc                 s      | ]}|j V  qd S r0   requires_gradrV  r1   r1   r4   rX  C      z._flash_attention_forward_op.<locals>.<genexpr>r         r   r   KH9r   )r   r  r   context_parallel_config_world_sizer   set_grad_enabledr$   rC  r[  r   r   r   r@  r8  alibi_slopesr<  )rO  r   r   r   r   r   r   r   r   rQ  rR  r   r@  r8  r  r<  grad_enabledrE  rF  S_dmask	rng_stater1   r1   r4   _flash_attention_forward_op+  sN   r  c                 O  s   | j \}}}}}}	t|t|t|}
}}t|||||||
||| j| j| j| jd | jd | j| j	| j
|	}|
dd |jd f }
|dd |jd f }|dd |jd f }|
||fS )Nr   r   .r   )rf  r   rI  r#   r   r   r   r@  r8  r  r<  r   )rO  ra  rj  r   r   r   r   rE  rF  r  rr  rs  rt  lse_dr1   r1   r4   _flash_attention_backward_opi  s4   "
r  c                 C  sj  |d urt d|rt dttj }|j}|j}|d u s!|d u r%td|d u r0|jd d }d}d}d }d}td	d
 |||fD }|sP|d urX|j	j
dkrX|dkrV|nd}t|' ||||||||d |d |||	\}}}}|ddd }W d    n1 sw   Y  |
r| |||||| || _|| _|| _|| _|| _|| _|| _|	r||fS |S )Nz<`attn_mask` is not yet supported for flash-attn hub kernels.z=`enable_gqa` is not yet supported for flash-attn hub kernels.zFlash attention hub kernels must expose `_wrapped_flash_attn_forward` and `_wrapped_flash_attn_backward` for context parallel execution.r   r  r?  r   Fc                 s  r  r0   r  rV  r1   r1   r4   rX    r  z2_flash_attention_hub_forward_op.<locals>.<genexpr>r   r   r  r   )r   r   r@   r^   r   r   r   r   r  r  r  r   r  rC  rz  r[  r   r   r   r@  r8  r  r<  )rO  r   r   r   r   r   r   r   r   rQ  rR  r   r*  r   r   r@  r8  r  r<  r  rE  rF  r  r  r1   r1   r4   _flash_attention_hub_forward_op  s\   
r  c                 O  s   t tj }|j}|d u rtd| j\}}}}	}
}t|t|t|}}}||||||	|
|||| j| j	| j
| jd | jd | j| j| j|}|dd |jd f }|dd |jd f }|dd |jd f }|||fS )NzfFlash attention hub kernels must expose `_wrapped_flash_attn_backward` for context parallel execution.r   r   .r   )r   r@   r^   r   r   rf  r   rI  r   r   r   r@  r8  r  r<  r   )rO  ra  rj  r   r*  r   r   r   r   rE  rF  r  rr  rs  rt  rG  r1   r1   r4    _flash_attention_hub_backward_op  s@   
"
r  r?  r@  r8  r9  r:  r<  r=  r@  tuple[int, int]c                C  s   |d urt d|dkrt d|rt dttj }|j}|d u r&td|d u r1|jd d }||||d d d d d d d d d d d d d d d d d d d d |f||d |d	 d||||d
^}}}|	ri|ddd	 nd }|
r| 	||||| || _
|| _|| _|| _|| _|| _|	r||fS |S )Nz>`attn_mask` is not yet supported for flash-attn 3 hub kernels.r   z>`dropout_p` is not yet supported for flash-attn 3 hub kernels.z?`enable_gqa` is not yet supported for flash-attn 3 hub kernels.ztFlash attention 3 hub kernels must expose `flash_attn_interface._flash_attn_forward` for context parallel execution.r   r  r   r   )r2  window_size_leftwindow_size_rightr7  r8  r9  r:  r=  r   )r   r   r@   rc   r   r   r   rC  rz  r[  r   r   r@  r8  r<  r=  )rO  r   r   r   r   r   r   r   r   rQ  rR  r   r@  r8  r9  r:  r<  r=  r*  r   rE  softmax_lserG  rF  r1   r1   r4   !_flash_attention_3_hub_forward_op   sv   
#r  c                 O  s   t tj }|j}|d u rtd| j\}}}}	}
t|}t|}t|}||||||	|
d d d d d d |||| j| j	| j
d | j
d | j| j| j |dd |jd f }|dd |jd f }|dd |jd f }|||fS )NzuFlash attention 3 hub kernels must expose `flash_attn_interface._flash_attn_backward` for context parallel execution.r   r   .r   )r   r@   rc   r   r   rf  r   rI  r   r   r@  r8  r<  r=  r   )rO  ra  rj  r   r*  r   r   r   r   rE  r  rr  rs  rt  r1   r1   r4   "_flash_attention_3_hub_backward_opX  sL   




r  c              	   C  sr   |d urt d|dkrt d|rt dt|||d|||	d}d }|	r1|^}}}|ddd	}|	r7||fS |S 
Nz4`attn_mask` is not yet supported for Sage attention.r   z4`dropout_p` is not yet supported for Sage attention.z5`enable_gqa` is not yet supported for Sage attention.NHDr0  r   r   tensor_layoutr   sm_scalerQ  r   r   r   )r   r%   rC  )rO  r   r   r   r   r   r   r   r   rQ  rR  r   rE  rF  rG  r1   r1   r4   _sage_attention_forward_op  s(   	
r  c              	   C  s   |d urt d|dkrt d|rt dttj j}||||d|||	d}d }|	r9|^}}}|ddd	 }|	r?||fS |S r  )r   r   r@   ro   r   rC  rz  )rO  r   r   r   r   r   r   r   r   rQ  rR  r   r3   rE  rF  rG  r1   r1   r4   _sage_attention_hub_forward_op  s*   

r  c                 G     t d)Nz4Backward pass is not implemented for Sage attention.NotImplementedErrorrO  ra  rj  r1   r1   r4   _sage_attention_backward_op  s   r  c                 C  s   |d urt |dkrd }|d urQ|jdkrQ|jd | jd krQ|jd |jd krQ|jd | jd |jd }}}|t j }|d|||d }|S )Nr   r   r   )	r   allr  r   tor{   r  r  rz  )r   r   r   BSqSkvr1   r1   r4   _maybe_modify_attn_mask_npu  s   
"r  c                 C  sh   |	rt dt|||}t||||d|dd |d u r%dt|jd  n|ddd| ddd	d }|S )
NANPU attention backend does not support setting `return_lse=True`.r   BSND      ?r      Fr   	
atten_maskinput_layoutpser   pre_tockensnext_tockens	keep_probsyncinner_precise)r   r  r+   r  mathsqrtr   r_  r1   r1   r4   _npu_attention_forward_op  s*   r  c                 O  r  )Nz:Backward pass is not implemented for Npu Fusion Attention.r  )rO  ra  rj  r   r1   r1   r4   _npu_attention_backward_op     r  c                 C  s   t | tjr
|  } | S r0   )
isinstancefuncolAsyncCollectiveTensorwait)tensorr1   r1   r4   _wait_tensor,  s   r  rW  c                 C  s4   | j }|  } t| d d |} | |} t| } | S r0   )r   flattenr  all_to_all_singlereshaper  )rW  groupr   r1   r1   r4   _all_to_all_single2  s   
r  scatter_idx
gather_idxc                 C  sR  t j|}|dkrS|dkrS| j\}}}}|| }	|| }
| ||||
|dd }|dkr7t||d}n|}||	||
|dddd }|||	|
|}|S |dkr|dkr| j\}}	}
}|
| }|	| }| ||||
|ddddd||
|||}|dkrt||}n|}|||||dd }|||||}|S t	d)ua  
    Perform dimension sharding / reassembly across processes using _all_to_all_single.

    This utility reshapes and redistributes tensor `x` across the given process group, across sequence dimension or
    head dimension flexibly by accepting scatter_idx and gather_idx.

    Args:
        x (torch.Tensor):
            Input tensor. Expected shapes:
            - When scatter_idx=2, gather_idx=1: (batch_size, seq_len_local, num_heads, head_dim)
            - When scatter_idx=1, gather_idx=2: (batch_size, seq_len, num_heads_local, head_dim)
        scatter_idx (int) :
            Dimension along which the tensor is partitioned before all-to-all.
        gather_idx (int):
            Dimension along which the output is reassembled after all-to-all.
        group :
            Distributed process group for the Ulysses group.

    Returns:
        torch.Tensor: Tensor with globally exchanged dimensions.
            - For (scatter_idx=2 → gather_idx=1): (batch_size, seq_len, num_heads_local, head_dim)
            - For (scatter_idx=1 → gather_idx=2): (batch_size, seq_len_local, num_heads, head_dim)
    r   r   r   r  r  r  z<Invalid scatter/gather indices for _all_to_all_dim_exchange.)
r   distributedget_world_sizer   r  ry  rz  r  rC  r   )rW  r  r  r  group_world_sizer   seq_len_localrL  rM  rK  num_heads_localx_temprE  outputr1   r1   r4   _all_to_all_dim_exchange?  s<    r  c                   @  s*   e Zd ZdZed	ddZedd ZdS )
SeqAllToAllDimz
    all_to_all operation for unified sequence parallelism. uses _all_to_all_dim_exchange, see _all_to_all_dim_exchange
    for more info.
    r   r   c                 C  s    || _ || _|| _t||||S r0   )r  
scatter_id	gather_idr  )rO  r  inputr  r  r1   r1   r4   forward  s   zSeqAllToAllDim.forwardc                 C  s"   t | j|| j| j}d |d d fS r0   )r  applyr  r  r  )rO  rd  
grad_inputr1   r1   r4   backward  s   zSeqAllToAllDim.backwardN)r   r   )rZ   r[   r\   r   staticmethodr  r  r1   r1   r1   r4   r    s    r  Hr  dist.ProcessGrouptuple[torch.Tensor, int]c                 C  sp   t j|d}d}|| dkr4|||  }|| | }||k s(J d| d| t| ddd|f } | |fS )zMaybe pad the head dimension to be divisible by world_size.
    x: torch.Tensor, shape (B, S_LOCAL, H, D) H: int, original global head num return: tuple[torch.Tensor, int], padded
    tensor (B, S_LOCAL, H + H_PAD, D) and H_PAD
    r  r   Padding head num ( should be less than new local head num )distr  Fpadrz  )rW  r  r  
world_sizeH_PADNEW_H_LOCALr1   r1   r4   _maybe_pad_qkv_head  s   r  r  c                 C  sV   t j|d}t j|d}|dkr'||d kr'| ddddd| ddf } |  S )zMaybe unpad the head dimension.
    x: torch.Tensor, shape (B, S_GLOBAL, H_LOCAL + H_PAD, D) H_PAD: int, head padding num return: torch.Tensor,
    unpadded tensor (B, S_GLOBAL, H_LOCAL, D)
    r  r   r   N)r  get_rankr  rz  )rW  r  r  rankr  r1   r1   r4   _maybe_unpad_qkv_head  s
   "r  c                 C  s   |du r| dfS t j|d}t j|d}d}|| dkrH|||  }|| | }||k s6J d| d| ||d krHt| ddd|f } | |fS )zMaybe pad the head dimension to be divisible by world_size.
    x: torch.Tensor, shape (B, S_GLOBAL, H_LOCAL, D) H: int, original global head num return: tuple[torch.Tensor, int],
    padded tensor (B, S_GLOBAL, H_LOCAL + H_PAD, D) and H_PAD
    Nr   r  r  r  r   )r  r  r  r  r  rz  )rW  r  r  r  r  r  r  r1   r1   r4   _maybe_pad_o_head  s   r  c                 C  s2   |dkr| ddddd| ddf } |   S )zMaybe unpad the head dimension.
    x: torch.Tensor, shape (B, S_LOCAL, H_GLOBAL + H_PAD, D) H_PAD: int, head padding num return: torch.Tensor,
    unpadded tensor (B, S_LOCAL, H_GLOBAL, D)
    r   N)rz  )rW  r  r  r1   r1   r4   _maybe_unpad_o_head  s   "r  dictc                 K  s:   t | jdksJ di }| jd |d< | jd |d< |S )Nr  zEQuery tensor must be 4-dimensional of shape (B, S_LOCAL, H_GLOBAL, D)r   NUM_QO_HEADr   	Q_S_LOCAL)lenr   )r   r   extra_kwargsr1   r1   r4   ulysses_anything_metadata  s
   r  Callable[..., torch.Tensor]c                   s   t jd}j\}}}}t|\ |  | }|||||ddddd |g| }	t|}
ddt	
|
|	d fd	d
}|S )zk
    x: torch.Tensor, shape (B, S_LOCAL, H, D) return: Callable that returns (B, S_GLOBAL, H_LOCAL, D)
    r  r   r   r   r  r  r   r   c                     s,   t dddd t S )Nr   r   r   r  )r  rC  rz  r  r1   r  r  rW  r1   r4   r     s   z-all_to_all_single_any_qkv_async.<locals>.waitNr   r   )r  r  r   r  r  rC  rz  r   r  r  r  )rW  r  r   r  r  S_LOCALr  DH_LOCALinput_split_sizesoutput_split_sizesr  r1   r  r4   all_to_all_single_any_qkv_async  s   $

	r  c           	   	     s   | dd}tjdt|\j}|\ }| dt}dddd g }t	||d fdd}|S )zr
    x: torch.Tensor, shape (B, S_GLOBAL, H_LOCAL, D) return: Callable that returns (B, S_LOCAL, H_GLOBAL, D)
    r  Nr  r  r   r   r   r  r   r   c                     sT   t  ddddd   tS )Nr   r   r   r  r  )r  r  rC  rz  r  r1   r  r
  r  r  r	  r  r  rW  r1   r4   r  (  s   z+all_to_all_single_any_o_async.<locals>.waitr  )
r   r  r  r  r   r   rC  rz  r  r  )	rW  r  r   r  r   S_GLOBALr  r  r  r1   r  r4   all_to_all_single_any_o_async  s   



	r  c                   @  ,   e Zd Ze	ddddZedddZdS )TemplatedRingAttentionNrO  rP  r   r   r   r   r   r   r   r   r   r{   r   r   r   rQ  r   r   c                 C  s  |j j}|j j}|j j}|d | }d  }}|
| _|| _|j| _|j| _|| _	t
| | g }tj|d| d}||}t|D ]v}|dkrm|| }| }|d | |}||d  |}|d | }|
| ||||||||d|dk|d\}}|j jr|t
j}|t
j}tddr|d}|d ur|t
jj|| ||   }|t
jj||  }|}|}qG||j}|d}|	r||fS |S )	Nr   r   
gather_dimr  TrR  r   <2.9.0r   ) r  
_ring_mesh_ring_local_rankring_degree
forward_opbackward_opr   q_shapekv_shaper   r   catr  rz  r  all_gather_tensor	get_groupchunkrangenumel
reshape_asconvert_to_fp32r  float32r   r  r\  r]  sigmoid
logsigmoidr   squeeze)rO  r   r   r   r   r   r   r   r   rQ  r  r  r   	ring_meshr  r  	next_rankprev_outprev_lse	kv_bufferikv	key_numelrE  rF  r1   r1   r4   r  5  s^   




zTemplatedRingAttention.forwardra  c                   s  | j jj}| j jj}| j jj}|d | }ttd|dg }| j jjr'tj	n j
}tj| j| jd}	tj| j| jd}
tj| j| jd}d }| j^}}}}t| | g }tj|d| d}||}t|D ]w}|dkr|| }| }|d | |}||d  |}|d | }| |  ^}}}}|dkrt|}|
 }|d | |
}
||d  |}|	|7 }	|
|7 }
||7 }||d k rt|
 | g }tj||| d}qq fdd|	|
|fD \}	}
}|	|
|d d d d d d d d d fS )Nr   r   r   r  r  c                 3  s    | ]	}|  jV  qd S r0   )r  r   rV  ra  r1   r4   rX    s    z2TemplatedRingAttention.backward.<locals>.<genexpr>)r   r  r  r  r  r   r$  r'  r   r(  r   r   r  r   r  rf  r   r  rz  r  r!  r"  r#  r%  r&  r  r  permute_tensor)rO  ra  rj  r,  r  r  r-  
next_ranksaccum_dtyperr  rs  rt  next_grad_kvr   r   r   rG  r0  r1  r2  r3  grad_query_opgrad_key_opgrad_value_opgrad_kv_buffergrad_key_numelr1   r4  r4   r    sH   



zTemplatedRingAttention.backwardr0   rO  rP  r   r   r   r   r   r   r   r   r   r   r   r{   r   r   r   r{   rQ  r{   r   r   rO  rP  ra  r   rZ   r[   r\   r  r  r  r1   r1   r1   r4   r  4  s    Ir  c                   @  r  )TemplatedUlyssesAttentionNrO  rP  r   r   r   r   r   r   r   r   r   r{   r   r   r   rQ  r   r   c                   s  |j j}|j j}|  |
| _|| _|| _|j\}}}}|j\}}}}|| }||||||	ddddd
 }||||||	ddddd
 }||||||	ddddd
 } fdd|||fD \}}}dd |||fD \}}}|
| |||||||||	d	|d
}|	r|^}}}||||||	ddddd
 }t| }|dd	dddd
 }|	r|||||	dddd
 }t| }|dd	ddd
 }nd }|	r||fS |S )Nr   r   r   r  r  c                 3      | ]}t | V  qd S r0   r  rV  r  r1   r4   rX        z4TemplatedUlyssesAttention.forward.<locals>.<genexpr>c                 s  s,    | ]}| d ddd dd V  qdS r   r   r   r  Nr  rC  rz  rV  r1   r1   r4   rX    s   * Tr  )r  _ulysses_meshulysses_degreer"  r  r  r   r   r  rC  rz  r  r  )rO  r   r   r   r   r   r   r   r   rQ  r  r  r   ulysses_meshr  r  	S_Q_LOCALr  r
  rG  
S_KV_LOCALr  rE  rF  r1   r  r4   r    sN   $$$
$
 
z!TemplatedUlyssesAttention.forwardra  c                   s
  | j jj}| j jj| |j\ }| | ddddd }t	|}|
dddddd }| | |^}}}} fdd|||fD \}	}
}fdd|	|
|fD \}	}
}d	d |	|
|fD \}	}
}|	|
|d d d d d d d d d fS )
Nr   r   r   r  r  c                 3  s4    | ]}|  d dddd V  qdS )r   r  r   r   r  N)r  rC  rz  rV  )r  r
  r  r	  r  r1   r4   rX    s
    "
z5TemplatedUlyssesAttention.backward.<locals>.<genexpr>c                 3  rB  r0   rC  rV  r  r1   r4   rX  
  rD  c                 s  s,    | ]}| d dddd d V  qdS rE  rF  rV  r1   r1   r4   rX    s    
)r   r  rG  rH  r"  r   r  rC  rz  r  r  r  )rO  ra  rj  rI  r  r9  r:  r;  rG  rr  rs  rt  r1   )r  r
  r  r	  r  r  r4   r    s"   

$
z"TemplatedUlyssesAttention.backwardr0   r>  r?  r@  r1   r1   r1   r4   rA    s    >rA  c                   @  s,   e Zd Ze	ddddZedddZdS )!TemplatedUlyssesAnythingAttentionNrO  rP  r   r   r   r   r   r   r   r   r{   r   r   rQ  r   r   c                 K  s  |j j}| }|
| _|| _|| _t|}t||fi |}t||fi |}t||fi |}| }| }| }|
| |||||||||	d|d}|	rP|^}}}t||fi |}|	rw|	d}t||fi |}| }| }|
d }n| }d }|	r||fS |S )NFr  r   )r  rG  r"  r  r  r   r  r  r  r  r+  rz  )rO  r   r   r   r   r   r   r   r   rQ  r  r  r   r   rI  r  metadata
query_waitkey_wait
value_waitrE  rF  rG  out_waitlse_waitr1   r1   r4   r    sL   

z)TemplatedUlyssesAnythingAttention.forwardra  c                 G  r  )NzQBackward pass for Ulysses Anything Attention in diffusers is not implemented yet.r  r  r1   r1   r4   r  U  r  z*TemplatedUlyssesAnythingAttention.backwardr0   )rO  rP  r   r   r   r   r   r   r   r   r   r   r   r{   r   r   r   r{   rQ  r{   r   r   r?  r@  r1   r1   r1   r4   rL    s    ArL  c                 C  s   |j j}| }t|| ||} t||||}t||||}t| |||||||||	|
|}|r8|^}}}n|}t||||}|r_tddrN|d}t||||}|d}||fS |S )z|
    Unified Sequence Parallelism attention combining Ulysses and ring attention. See: https://arxiv.org/abs/2405.07719
    r  r  r   )	r  rG  r"  r  r  r  r   r  r+  )r   r   r   r   r   r   r   r   rQ  r  r  r   r  r  rI  ulysses_grouprE  context_layerrF  rG  r  r1   r1   r4   _templated_unified_attention^  sF   


rU  )r   c	                C  s   |rt d|rt d|jjdkr'|jjdkr't| |||||||||	|
|S |jjdkr=t| |||||||||	|
|S |jjdkrg|jjrWt| |||||||||	|
|S t	| |||||||||	|
|S t d)Nz>Causal attention is not yet supported for templated attention.z1GQA is not yet supported for templated attention.r   z@Reaching this branch of code is unexpected. Please report a bug.)
r   r  r  rH  rU  r  r  ulysses_anythingrL  rA  )r   r   r   r   r   r   r   r   rQ  r  r  r   r1   r1   r4   %_templated_context_parallel_attention  s   rW  )rx   rz   c	                 C  s~   d }	|d ur
t d|d u r!t| ||||||d}
|r |
^}
}	}nt| ||d |||d|tt|d}
|r7|
\}
}	|r=|
|	fS |
S Nz.`attn_mask` is not supported for flash-attn 2.)r0  r   r   r   r1  r2  rA  Fr  r  r   )r   r!   rW  r  r  )r   r   r   r   r   r   r   rQ  r   rF  rE  rG  r1   r1   r4   _flash_attention  sB   	
rZ  c	                 C  s   d }	|d ur
t dttj j}
|d u r'|
| ||||||d}|r&|^}}	}nt| ||d |||d|tt|d}|r=|\}}	|rC||	fS |S rX  )r   r   r@   r^   r   rW  r  r  )r   r   r   r   r   r   r   rQ  r   rF  r3   rE  rG  r1   r1   r4   _flash_attention_hub.	  sD   	
r[  c	                 C  s  | j \}	}
}}|j \}}}}|d urt||	|}t|	|
||| jd\\}}\}}\}}g g }}t|	D ]}|| }|||d |f  |||d |f  q4| dd}tj|dd}tj|dd}t	t
j j}||||||||||||d}|d|	df}|S Nr   r   r   r   r   )r0  r   r   r  r  r  r  r   r1  r2  rA  r   )r   r  r  r   r$  appendr  r   r   r   r@   r`   r   	unflatten)r   r   r   r   r   r   r   rQ  r   r   r   rG  r   r  r  r  r  r  	key_validvalue_validb	valid_lenquery_packed
key_packedvalue_packedr3   rE  r1   r1   r4   _flash_varlen_attention_hubd	  s@   
rg  )rx   c	                 C  s   | j \}	}
}}|j \}}}}|d urt||	|}t|	|
||| jd\\}}\}}\}}g g }}t|	D ]}|| }|||d |f  |||d |f  q4| dd}tj|dd}tj|dd}t	|||||||||||d}|
d|	df}|S r\  )r   r  r  r   r$  r^  r  r   r   r"   r_  )r   r   r   r   r   r   r   rQ  r   r   r   rG  r   r  r  r  r  r  r`  ra  rb  rc  rd  re  rf  rE  r1   r1   r4   _flash_varlen_attention	  s>   
rh  c           
      C  s6   |d urt dt| ||||d\}}	|r||	fS |S )N.`attn_mask` is not supported for flash-attn 3.)r0  r   r   r1  r2  )r   rH  )
r   r   r   r   r   r   rQ  r   rE  rF  r1   r1   r4   _flash_attention_3	  s   
rj  rA  c                 C  s  |d urt dttj j}|
d u rS|di d| d|d|d|d|dd dd d	d d
d d|d|dddd d|ddd|	}|	rQ|d |d fS |S tjt||dd |dd}tjt||dd |dd}t	| ||d d||d|	|||
d}|	r|\}}||fS |S )Nri  r0  r   r   r1  r2  r3  r4  r5  r6  r@  r8  r9  r   r:  r<  r=  r   rA  r  r   FrY  r1   )
r   r   r@   rc   r   	functoolspartialr  r  rW  )r   r   r   r   r   r   r@  r8  r<  rA  r   r3   rE  r  r  rF  r1   r1   r4   _flash_attention_3_hub	  s   	
		rm  c                 C  s  | j \}}	}
}
|j \}
}}
}
|d urt|||}t||	||| jd\\}
}\}}\}}g g }}t|D ]}|| }|||d |f  |||d |f  q4| dd}tj|dd}tj|dd}t	t
j j}||||||||||d	^}}}
|d|df}|r||fS |S )Nr]  r   r   r   )	r0  r   r   r  r  r  r  r1  r2  r   )r   r  r  r   r$  r^  r  r   r   r   r@   rd   r   r_  )r   r   r   r   r   r   rQ  r   r   r   rG  r   r  r  r  r  r  r`  ra  rb  rc  rd  re  rf  r3   rE  rF  r1   r1   r4   _flash_attention_3_varlen_hubA
  s<   
rn  c                 C  s$  | j \}}	}
}
|j \}
}}
}
|d urt|||}t||	||| jd\\}
}\}}\}}g g }}t|D ]}|| }|||d |f  |||d |f  q4| dd}tj|dd}tj|dd}t	||||||||||d
}t
|tr~|^}}}
n|}d }|d|df}|r||fS |S )Nr]  r   r   r   )
r0  r   r   r  r  r  r  r1  r2  rA  r   )r   r  r  r   r$  r^  r  r   r   flash_attn_3_varlen_funcr  tupler_  )r   r   r   r   r   r   rQ  r   r   r   rG  r   r  r  r  r  r  r`  ra  rb  rc  rd  re  rf  rD  rE  rF  r1   r1   r4   _flash_varlen_attention_3w
  sD   

rq  c	              	   C  sn   |d urt d|st rt| |||||dd^}	}
}nt| ||||||d}	|r/|	^}	}
}|r5|	|
fS |	S )N0`attn_mask` is not supported for aiter attentionT)r0  r   r   r   r1  r2  rQ  )r   r   is_grad_enabledaiter_flash_attn_func)r   r   r   r   r   r   r   rQ  r   rE  rF  rG  r1   r1   r4   _aiter_flash_attention
  s0   
	
ru  0torch.Tensor | 'flex_attention.BlockMask' | Nonec	              
     s(  d }	d }
| j \}}}}|j \}}}} d u st tjr }
nQ|r-tt||||| j}
nCt rl j	dkrE 
 dd dd  ||||  jtjkre fdd}t||d ||| j}
n fdd}	ntdd	d
 | ||fD \} }}tj| |||	|
|||d}|dddd}|S )Nr   r   r   c                   s    | |||f S r0   r1   r  r   r1   r4   mask_mod  s   z(_native_flex_attention.<locals>.mask_modc                   s   |  ||||f  S r0   r1   )scorer  r  r  r  rw  r1   r4   	score_mod	  s   z)_native_flex_attention.<locals>.score_modzCAttention mask must be either None, a BlockMask, or a 2D/4D tensor.c                 s  rS  rT  rU  rV  r1   r1   r4   rX    rY  z)_native_flex_attention.<locals>.<genexpr>)r   r   r   rz  
block_maskr   r   rQ  r  )r   r  flex_attention	BlockMaskcreate_block_maskr  r   r   	is_tensorr  viewr  r  r   r{   r   rC  )r   r   r   r   r   r   r   rQ  r   rz  r{  r   r   rL  rG  r   rx  rE  r1   rw  r4   _native_flex_attention
  sD   


r  target_dtypetorch.dtype
reshape_4dc                 C  sZ   | j tjkrt| dtd} | j|d} n| j|d} |r+| j\}}| |dd|} | S )a'  
    Convert a 2D attention mask to an additive mask, optionally reshaping to 4D for SDPA.

    This helper is used by both native SDPA and xformers backends to handle both boolean and additive masks.

    Args:
        attn_mask: 2D tensor [batch_size, seq_len_k]
                   - Boolean: True means attend, False means mask out
                   - Additive: 0.0 means attend, -inf means mask out
        target_dtype: The dtype to convert the mask to (usually query.dtype)
        reshape_4d: If True, reshape from [batch_size, seq_len_k] to [batch_size, 1, 1, seq_len_k] for broadcasting

    Returns:
        Additive mask tensor where 0.0 means attend and -inf means mask out. Shape is [batch_size, seq_len_k] if
        reshape_4d=False, or [batch_size, 1, 1, seq_len_k] if reshape_4d=True.
    r   -inf)r   r   )r   r   r{   wherer   r  r   r  )r   r  r  r   r  r1   r1   r4   _prepare_additive_attn_mask  s   
r  c
                 C  s   |rt d|d ur+|jdkr+|jd | jd kr+|jd |jd kr+|dd}|	d u rUdd | ||fD \} }}tjjj| |||||||d}
|
dddd}
|
S t	| ||||||||t
t|	d	}
|
S )
NzDNative attention backend does not support setting `return_lse=True`.r   r   r   c                 s  rS  rT  rU  rV  r1   r1   r4   rX  c  rY  z$_native_attention.<locals>.<genexpr>rZ  r  rY  )r   r  r   r  r   r\  r]  r^  rC  rW  r`  ru  r   r   r   r   r   r   r   r   rQ  r   rE  r1   r1   r4   _native_attentionB  sH   

r  c
                 C  s   d }
|	d u rH|sHdd | ||fD \} }}t jjt jjjj t jjj| |||||||d}W d    n1 s:w   Y  |dddd}nt	| ||||||||t
t|	d}|r^|\}}
|rd||
fS |S )	Nc                 s  s$    | ]}| d ddd V  qdS rT  )rC  rz  rV  r1   r1   r4   rX    s   " z*_native_cudnn_attention.<locals>.<genexpr>rZ  r   r   r   r  rY  )r   r\  	attentionsdpa_kernel
SDPBackendCUDNN_ATTENTIONr]  r^  rC  rW  r  r  r   r   r   r   r   r   r   r   rQ  r   rF  rE  r1   r1   r4   _native_cudnn_attention  sB   r  c
                 C     |rt ddd | ||fD \} }}tjjtjjjj tjjj| |||||||d}
W d    n1 s8w   Y  |
	dddd}
|
S )	NzNNative efficient attention backend does not support setting `return_lse=True`.c                 s  rS  rT  rU  rV  r1   r1   r4   rX    rY  z._native_efficient_attention.<locals>.<genexpr>rZ  r   r   r   r  )
r   r   r\  r  r  r  EFFICIENT_ATTENTIONr]  r^  rC  r  r1   r1   r4   _native_efficient_attention  "   r  c
                 C  s   |d urt dd }
|	d u rP|sPdd | ||fD \} }}tjjtjjjj tjjj| ||d ||||d}W d    n1 sBw   Y  |	dddd}nt
| ||d |||||tt|	d	}|rf|\}}
|rl||
fS |S )
Nrr  c                 s  rS  rT  rU  rV  r1   r1   r4   rX    rY  z*_native_flash_attention.<locals>.<genexpr>rZ  r   r   r   r  rY  )r   r   r\  r  r  r  FLASH_ATTENTIONr]  r^  rC  rW  r  r  r  r1   r1   r4   _native_flash_attention  sF   r  c
                 C  r  )	NzINative math attention backend does not support setting `return_lse=True`.c                 s  rS  rT  rU  rV  r1   r1   r4   rX  #  rY  z)_native_math_attention.<locals>.<genexpr>rZ  r   r   r   r  )
r   r   r\  r  r  r  MATHr]  r^  rC  r  r1   r1   r4   _native_math_attention  r  r  c           	      C  s   |rt d|d u r8t| ||}t| ||| d|dd |d u r)dt| jd  n|ddd| ddd	d }|S t| ||||d |d |tt	|d
}|S )Nr  r   r  r  r   r  Fr   r  rY  )
r   r  r+   r  r  r  r   rW  r  r  )	r   r   r   r   r   r   rQ  r   rE  r1   r1   r4   _native_npu_attention3  sJ   r  c                 C  sn   |d urt d|rt ddd | ||fD \} }}| t| jd  } t| |||d}|ddd	d
}|S )Nz.`attn_mask` is not supported for XLA attentionzAXLA attention backend does not support setting `return_lse=True`.c                 s  rS  rT  rU  rV  r1   r1   r4   rX  z  rY  z(_native_xla_attention.<locals>.<genexpr>r   )r0  r   r   r2  r   r   r   r  )r   r  r  r   xla_flash_attentionrC  )r   r   r   r   r   rQ  r   rE  r1   r1   r4   _native_xla_attentioni  s   r  c                 C  s~   |d urt dd }|d u r!t| ||d|||d}	|r |	^}	}}
nt| ||d d||d|tt|d}	|r7|	\}	}|r=|	|fS |	S N/`attn_mask` is not supported for sage attentionr  r  r   FrY  )r   r%   rW  r  r  )r   r   r   r   r   r   rQ  r   rF  rE  rG  r1   r1   r4   _sage_attention  sB   	
r  c                 C  s   |d urt dd }ttj j}	|d u r'|	| ||d|||d}
|r&|
^}
}}nt| ||d d||d|tt|d}
|r=|
\}
}|rC|
|fS |
S r  )r   r   r@   ro   r   rW  r  r  )r   r   r   r   r   r   rQ  r   rF  r3   rE  rG  r1   r1   r4   _sage_attention_hub  sD   	
r  c                 C  s  |rt d| j\}}	}
}
|j\}
}}
}
|d urt|||}t||	||| jd\\}
}\}}\}}g g }}t|D ]}|| }|||d |f  |||d |f  q:| dd}tj	|dd}tj	|dd}t
|||||||||d	}|d|df}|S )Nz?Sage varlen backend does not support setting `return_lse=True`.r]  r   r   r   )	r0  r   r   r  r  r  r  r   r  r   )r   r   r  r  r   r$  r^  r  r   r   r*   r_  )r   r   r   r   r   r   rQ  r   r   r   rG  r   r  r  r  r  r  r`  ra  rb  rc  rd  re  rf  rE  r1   r1   r4   _sage_varlen_attention  s>   
r  	   c              	   C  &   |d urt dt| ||d|||dS Nr  r  r  )r   r&   r   r   r   r   r   r   rQ  r   r1   r1   r4   #_sage_qk_int8_pv_fp8_cuda_attention$     r  c              	   C  r  r  )r   r'   r  r1   r1   r4   (_sage_qk_int8_pv_fp8_cuda_sm90_attention?  r  r     c              	   C  r  r  )r   r(   r  r1   r1   r4   $_sage_qk_int8_pv_fp16_cuda_attentionZ  r  r  c              	   C  r  r  )r   r)   r  r1   r1   r4   &_sage_qk_int8_pv_fp16_triton_attentionu  r  r  c
                 C  s  |rt d| j\}
}}}|j\}}}}|rt }nw|d ur|jdkry|d}|d d d }tj|
|||f| j| j	d}t
|| jd}||d d d d d d d |f< td|d d d d d d |d f< |d d d d d d d |f }n|jd	krt d
|jd	kr||
|||| }|r|| dkrt d|| }| d|df} |d|dfddd|d}|d|dfddd|d}t| |||||}|r|dd}|S )NzFxformers attention backend does not support setting `return_lse=True`.r   r      r  r   )r  r  r  zDOnly 2D and 4D attention masks are supported for xformers attention.r   zKNumber of heads in query must be divisible by number of heads in key/value.r   r  )r   r   xopsLowerTriangularMaskr  r  r   r   r   r   r  r   r  type_asr_  memory_efficient_attentionr  )r   r   r   r   r   r   r   r   rQ  r   r   r   num_heads_qrG  r   num_heads_kvoriginal_seq_lenaligned_seq_lenaligned_maskmask_additivenum_heads_per_grouprE  r1   r1   r4   _xformers_attention  sH   



 $"

r  r0   )rw   r   )Nr   FNFN)r   r   r   r   r   r   r   r   r   r   r   r{   r   r   r   r{   r   r   rw   r   r   r   r   r   )r   r   r   r{   r   r   r   )r   r   r   r   r   r   )
r   r   r   r   r   r   r   r   r   r   )rw   r@   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )NN)r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r  r   r   r   )r  r   )NFNNNNr   r   r   NFr   ) r0  r   r   r   r   r   r1  r   r2  r{   r3  r   r4  r   r5  r   r6  r   r7  r   r8  r   r9  r   r:  r;  r<  r{   r=  r   r   r>  )Nr   FNFFTN)rO  rP  r   r   r   r   r   r   r   r   r   r   r   r{   r   r   r   r{   rQ  r{   rR  r{   r   r   r?  )$rO  rP  r   r   r   r   r   r   r   r   r   r   r   r{   r   r   r   r{   rQ  r{   rR  r{   r   r   r@  r  r8  r   r9  r   r:  r;  r<  r{   r=  r   )r   r   r   r   r   r   )rW  r   r   r   )r   r   N)rW  r   r  r   r  r   r   r   )rW  r   r  r   r  r  r   r  )rW  r   r  r   r  r  r   r   )r   r   r   r   )rW  r   r  r  r   r  )Nr   r   )r   r   r   r   r   r   r   r   r   r   r   r{   r   r   r   r{   rQ  r{   r   r   r  r   r  r   )Nr   FNFF)r   r   r   r   r   r   r   r   r   r   r   r{   r   r   r   r{   rQ  r{   r   r   )r   r   r   r   r   r   r   r   r   r   r   r{   r   r   rQ  r{   r   r   r   r   )Nr   NFFN)r   r   r   r   r   r   r   r   r   r   r   r   r   r{   rQ  r{   r   r   r   r   )NNFFN)r   r   r   r   r   r   r   r   r   r   r   r{   rQ  r{   r   r   r   r   )NNFr?  r   FFN)r   r   r   r   r   r   r   r   r   r   r   r{   r@  r  r8  r   r<  r{   rA  r{   r   r   r   r   )NFNFFN)r   r   r   r   r   r   r   rv  r   r{   r   r   r   r{   rQ  r{   r   r   r   r   )T)r   r   r  r  r  r{   r   r   )Nr   FNFFN)r   r   r   r   r   r   r   r   r   r   r   r{   r   r   r   r{   rQ  r{   r   r   r   r   )Nr   NFN)r   r   r   r   r   r   r   r   r   r   r   r   rQ  r{   r   r   r   r   )NFFN)r   r   r   r   r   r   r   r   r   r{   rQ  r{   r   r   r   r   )NFNFN)r   r   r   r   r   r   r   r   r   r{   r   r   rQ  r{   r   r   r   r   )
__future__r   
contextlibrk  r   r  dataclassesr   enumr   typingr   r   r   r   torch.distributedr  r  torch.nn.functionalr\  r]  r  is_available)torch.distributed._functional_collectives_functional_collectivesr  utilsr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   utils.constantsr   r   utils.torch_utilsr   _modeling_parallelr   r   r   r   r   _REQUIRED_FLEX_VERSIONr   r   rZ   r   r   r   r   r   r   r   r   r   
flash_attnr!   r"   flash_attn.flash_attn_interfacer#   r$   ImportErrorOSErrorr   r/  r   flash_attn_interfacerB  ro  rI   rt  sageattentionr%   r&   r'   r(   r)   r*   !torch.nn.attention.flex_attentionr  r|  	torch_npur+   $torch_xla.experimental.custom_kernelr,   r  xformers.opsr{  r  __version__library	custom_op
_custom_opregister_fake_register_faker:   r?   r   r@   rv   r   rc   rd   r^   r`   ro   r   r   contextmanagerrg   r   r   r   r   r   r   r   r   r   r   	lru_cacher	  r  r  r  r  r%  r   rH  rG  r`  ru  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rh  Functionr  r  r  r  r  r  r  r  r  rA  rL  rU  rW  r   r]   rZ  r[  rg  r_   rh  ra   rj  rm  rn  rb   rq  re   ru  rf   r  r  r  rh   r  ri   r  rj   r  rk   r  rl   r  rm   r  rn   r  r  rp   r  rq   r  rr   r  rs   r  rt   r  ru   r  r1   r1   r1   r4   <module>   s
  
H

$


*1!6

I4	"+!+-:,6(>+E1X8+('I

!'_XB]	0	1	43	M	16)<%	;	0	3	1	.	/3