o
    ٷi{g                     @   s  d dl Z d dlZd dlmZ d dlmZ zHd dlmZmZm	Z	m
Z
mZmZ d dlmZmZmZ z
d dlmZ dZW n eyF   dZdZY nw ejd	krSd d
lmZ ndZd dlmZ W n eyg   edw d dlmZ d dlmZ ddlmZ ddl m!Z! ee"Z#dZ$g dZ%defddZ&de'de'fddZ(ej)r+g dZ*						dWddde j+de j+de j+d ee j+ d!e,d"e-d#ee, d$e-d%e-d&ed' fd(d)Z.								dXd*e j/j0j1de j+de j+de j+d ee j+ d!e,d"e-d#ee, d$e-d%e-d+e-d&ed' fd,d-Z2d*e j/j0j1d.e j+fd/d0Z3e&ej4 ej5ej4e	e
gdd1							dYde j+de j+de j+d ee j+ d!e,d"e-d#ee, d$e-d%e-d&ed' d2e j+fd3d4Z6e#7d5 								dXd*e j/j0j1de j+de j+de j+d ee j+ d!e,d"e-d#ee, d$e-d%e-d+e-d&ed' fd6d7Z8d*e j/j0j1d.e j+fd8d9Z9e(d:d; e:ed:sJ ej5ej;e	e
gdd1							dYde j+de j+de j+d ee j+ d!e,d"e-d#ee, d$e-d%e-d&ed' d2e j+fd<d=Z<e#7d> e&ej= ej5ej=eee
gdd1				dZde j+de j+de j+d"e-d#ee, d%e-d&ed' d2e j+fd?d@Z>e#7dA 								dXd*e j/j0j1de j+de j+de j+d ee j+ d!e,d"e-d#ee, d$e-d%e-d+e-d&ed' fdBdCZ?d*e j/j0j1d.e j+fdDdEZ@ere:edFrde&ejA ne#7dG e(dFdH e:edFsvJ ej5ejAe	ee
gdd1				d[de j+de j+de j+d#ee, d"e-d%e-d&ed' d2e j+fdIdJZBe#7dK ndZBe#7dL e&ejC ej5ejCe	ee
gdd1				d\de j+de j+de j+d!e,d#ee, d%e-d&ed' d2e j+fdMdNZDe#7dO 								dXd*e j/j0j1de j+de j+de j+d ee j+ d!e,d"e-d#ee, d$e-d%e-d+e-d&ed' fdPdQZEd*e j/j0j1d.e j+fdRdSZFdS d dTlm6Z6m>Z> zd dUlmBZB W n eyG   dZBY nw dZ<dZDe#7dV dS )]    N)Optional)current_platform)_AttentionBackendRegistryAttentionBackendName_check_device_check_shape_check_qkv_dtype_bf16_or_fp16_check_device_cuda)sageattn_sage_attention_forward_op_sage_attention_backward_op)flash_attn_funcTFnpu)npu_fusion_attention)ParallelConfigzContext parallelism requires the 'diffusers>=0.36.dev0'.Please install latest version of diffusers from source: 
pip3 install git+https://github.com/huggingface/diffusers.git)init_logger)ENV   )UnifiedTemplatedRingAttention) UnifiedTemplatedUlyssesAttentioni)_native_attention_sdpa_cudnn_attention_sage_attention_flash_attention_3_native_npu_attentionattn_backendc                 C   sh   t j| d  t j| d  t j| d  tt jtr$t j| d  d S | t jv r2t j| j	 d S d S N)
r   	_backendspop_constraints_supported_arg_names
isinstance_supports_context_paralleldictremovevalue)r    r&   g/home/ubuntu/.local/lib/python3.10/site-packages/cache_dit/parallelism/attention/_attention_dispatch.py_registry_pop_attn_backend?   s   
r(   memberr%   c                 C   sH   t t|}| |_||_tt| | |tj| < tj|  |tj	|< d S r   )
str__new__r   _name__value_setattr_member_map__member_names_append_value2member_map_)r)   r%   
new_memberr&   r&   r'   _set_new_attn_backendI   s   
r4   )_native_attention_forward_op _sdpa_cudnn_attention_forward_op_npu_attention_forward_op        )_parallel_configquerykey	attn_mask	dropout_p	is_causalscale
enable_gqa
return_lser9   r   c	                C   s   |d ur|	j }|tvrtd| d|rtd|rtd|jjdkr5t| |||||||||	|
|S |jjdkrKt| |||||||||	|
|S td)NzxTemplated context parallel attention with attn_mask is only supported for native attention backend, but got forward_op: .z>Causal attention is not yet supported for templated attention.z1GQA is not yet supported for templated attention.r   z@Reaching this branch of code is unexpected. Please report a bug.)	__name___ATTENTION_OPS_ALLOW_ATTN_MASK
ValueErrorcontext_parallel_configring_degreer   applyulysses_degreer   )r:   r;   r%   r<   r=   r>   r?   r@   rA   
forward_opbackward_opr9   forward_op_namer&   r&   r'   -_unified_templated_context_parallel_attention_   sV   rM   ctx	_save_ctxc              
   C   s   |
r|  ||| || _|| _|| _|| _|| _|	rQ|d ur"tdtjj	j
|dd|dd|dd|||dd d \}}|dd}|dd}||fS dd |||fD \}}}tjjj||||||||d}|dddd	}|S )
NzE`attn_mask` is not yet supported for native flash attention with lse.r      )r=   r>   r?   c                 s        | ]}| d dddV  qdS r   rP   r      Npermute.0xr&   r&   r'   	<genexpr>       z/_native_attention_forward_op.<locals>.<genexpr>r:   r;   r%   r<   r=   r>   r?   r@   r   rS   )save_for_backwardr<   r=   r>   r?   r@   rE   torchopsaten#_scaled_dot_product_flash_attention	transposenn
functionalscaled_dot_product_attentionrU   )rN   r:   r;   r%   r<   r=   r>   r?   r@   rA   rO   r9   outlser&   r&   r'   r5      sL   



r5   grad_outc              
   O   s   | j \}}}|d |d |d dd |||fD \}}}	tjjj|||	| j| j| j| j	| j
d}
|
dddd}
|dddd}tjj|
|||	g|d	d
\}}}|dddd}|dddd}|dddd}|||fS )NTc                 s   rQ   rR   rT   rV   r&   r&   r'   rY      rZ   z0_native_attention_backward_op.<locals>.<genexpr>r[   r   rP   r   rS   F)outputsinputsgrad_outputsretain_graph)saved_tensorsrequires_grad_r]   rb   rc   rd   r<   r=   r>   r?   r@   rU   autogradgrad)rN   rg   argskwargsr:   r;   r%   query_tkey_tvalue_tre   
grad_out_tgrad_query_t
grad_key_tgrad_value_t
grad_querygrad_key
grad_valuer&   r&   r'   _native_attention_backward_op   s6   




r|   )constraintssupports_context_parallelreturnc
                 C   s   |rt d|	d u r0dd | ||fD \} }}tjjj| |||||||d}
|
dddd}
|
S t| ||||||||tt|	d	}
|
S )
NzDNative attention backend does not support setting `return_lse=True`.c                 s   rQ   rR   rT   rV   r&   r&   r'   rY     rZ   z$_native_attention.<locals>.<genexpr>r[   r   rP   r   rS   rJ   rK   r9   )	rE   r]   rb   rc   rd   rU   rM   r5   r|   )r:   r;   r%   r<   r=   r>   r?   r@   rA   r9   re   r&   r&   r'   r     s>   
r   zRe-registered NATIVE attention backend to enable context parallelism with attn mask in cache-dit. You can disable this behavior by: export CACHE_DIT_ENABLE_CUSTOM_ATTN_DISPATCH=0.c                 C   s   |	rt d|
r| ||| || _|| _|| _|| _|| _dd |||fD \}}}tjj	
tjj	jj tjjj||||||||d}W d    n1 sPw   Y  |dddd}|S )	Nz:cudnn attention with sdpa does not support return_lse=Truec                 s   rQ   rR   rT   rV   r&   r&   r'   rY   W  rZ   z3_sdpa_cudnn_attention_forward_op.<locals>.<genexpr>r[   r   rP   r   rS   )rE   r\   r<   r=   r>   r?   r@   r]   rb   	attentionsdpa_kernel
SDPBackendCUDNN_ATTENTIONrc   rd   rU   rN   r:   r;   r%   r<   r=   r>   r?   r@   rA   rO   r9   re   r&   r&   r'   r6   <  s0   r6   c                 O      t d)Nz>Backward for cudnn attention with sdpa is not implemented yet.NotImplementedErrorrN   rg   rp   rq   r&   r&   r'   !_sdpa_cudnn_attention_backward_opg     r   _SDPA_CUDNN_sdpa_cudnnc
                 C   s   d }
|	d u rH|sHdd | ||fD \} }}t jjt jjjj t jjj| |||||||d}W d    n1 s:w   Y  |dddd}nt	| ||||||||t
t|	d}|r^|\}}
|rd||
fS |S )	Nc                 s   s$    | ]}| d ddd V  qdS rR   )rU   
contiguousrV   r&   r&   r'   rY     s   " z(_sdpa_cudnn_attention.<locals>.<genexpr>r[   r   rP   r   rS   r   )r]   rb   r   r   r   r   rc   rd   rU   rM   r6   r   )r:   r;   r%   r<   r=   r>   r?   r@   rA   r9   rf   re   r&   r&   r'   r   s  sB   r   zRegistered new attention backend: _SDPA_CUDNN to enable context parallelism with attn mask in cache-dit. You can disable it by: export CACHE_DIT_ENABLE_CUSTOM_ATTN_DISPATCH=0.c           
      C   sn   d }|d u rt | ||d|||d}|r|^}}}	nt| ||d d||d|tt|d}|r/|\}}|r5||fS |S )NNHD)qkvtensor_layoutr>   sm_scalerA   r8   Fr   )r
   rM   r   r   )
r:   r;   r%   r>   r?   rA   r9   rf   re   _r&   r&   r'   r     s>   	
r   zRe-registered SAGE attention backend to enable context parallelism with FP8 Attention in cache-dit. You can disable this behavior by: export CACHE_DIT_ENABLE_CUSTOM_ATTN_DISPATCH=0.c                 C   s   |durt d|rt d|dkrt d|du r!|jd d }|
r(td d	}d}d
}tdi d|d|d|d|d|ddddddddd|ddd|ddddd|ddd|	}|	rw|\}}|ddd}||fS |S ) zCFlash Attention 3 forward operation for cache-dit (inference only).Nz2`attn_mask` is not yet supported for flash-attn 3.z3`enable_gqa` is not yet supported for flash-attn 3.r8   z6`dropout_p` > 0 is not yet supported for flash-attn 3.g      zmFlash Attention 3 is configured for inference only, but _save_ctx=True was passed. Context will not be saved.r   r   Fr   r   r   softmax_scalecausalqv	q_descale	k_descale	v_descalewindow_sizeattention_chunkr   softcap
num_splitsr   pack_gqadeterministic	sm_marginreturn_attn_probsrP   r&   )rE   shapeloggerwarningflash_attn_3_funcrU   )rN   r:   r;   r%   r<   r=   r>   r?   r@   rA   rO   r9   r   r   r   re   rf   r&   r&   r'   _flash_attention_3_forward_op  sn   	
r   c                 O   r   )z3Flash Attention 3 backward operation for cache-dit.zaBackward pass for Flash Attention 3 with context parallelism is not implemented yet in cache-dit.r   r   r&   r&   r'   _flash_attention_3_backward_op#  s   r   _FLASH_3z>AttentionBackendName._FLASH_3 not found, creating new backend._flash_3c                 C   s   d }|d u rRd}d}	d}
t di d| d|d|d|d|d	d d
d dd dd d|ddd|	dddd d|
ddd|}|rQ|\}}|ddd}nt| ||d d||d|tt|d}|rh|\}}|rn||fS |S )Nr   r8   Fr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rP   r   r&   )r   rU   rM   r   r   )r:   r;   r%   r?   r>   rA   r9   rf   r   r   r   re   r&   r&   r'   r   7  sz   	
r   zRe-registered FLASH_3 attention backend to enable context parallelism with Ulysses Anything/Float8 in cache-dit. You can disable this behavior by: export CACHE_DIT_ENABLE_CUSTOM_ATTN_DISPATCH=0.zHFlash Attention 3 not available, skipping _FLASH_3 backend registration.c                 C   s   |rt d|d u r,t| ||d d|d u rdt| jd  n|tt| dd	d }|S t| ||d |d |d u rAdt| jd  n|d |tt	|d}|S )	NANPU attention backend does not support setting `return_lse=True`.BSNDg      ?r   rP   
atten_maskinput_layoutr?   pre_tockensnext_tockenshead_numr   r   )
rE   r   mathsqrtr   	MAX_TOKENsizerM   r7   _npu_attention_backward_op)r:   r;   r%   r=   r?   rA   r9   re   r&   r&   r'   r     s@   
r   zRe-registered _NATIVE_NPU attention backend to enable context parallelism You can disable this behavior by: export CACHE_DIT_ENABLE_CUSTOM_ATTN_DISPATCH=0.c                 C   sJ   |	rt d|d ur|tj }t||||d|tt|dd	d }|S )Nr   r   rP   r   r   )rE   tor]   boolr   r   r   r   r&   r&   r'   r7     s$   
r7   c                 O   r   )Nz:Backward pass is not implemented for Npu Fusion Attention.r   r   r&   r&   r'   r     r   r   )r   r   )r   z;Skipped custom attention backend registration in cache-dit.)Nr8   FNFF)Nr8   FNFFTN)Nr8   FNFFN)FNFN)NFFN)r8   NFN)Gr]   r   typingr   cache_dit.platformsr   #diffusers.models.attention_dispatchr   r   r   r   r   r	   r
   r   r   flash_attn_interfacer   r   _flash_attn_3_availableImportErrordevice_type	torch_npur   #diffusers.models._modeling_parallelr   cache_dit.loggerr   cache_dit.envsr   _templated_ringr   _templated_ulyssesr   rC   r   r   __all__r(   r*   r4   %CACHE_DIT_ENABLE_CUSTOM_ATTN_DISPATCHrD   Tensorfloatr   rM   rn   functionFunctionCtxr5   r|   NATIVEregisterr   infor6   r   hasattrr   r   SAGEr   r   r   r   r   _NATIVE_NPUr   r7   r   r&   r&   r&   r'   <module>   s     

	
	
I	

8

(		
,	

+

			
/
	*	

>


	:

	(	

!	