o
    ٷiIp                     @   s  d dl Z d dlZd dlmZmZmZ d dlZd dlmZ	 zd dl
mZ d dlmZ W n ey5   edw ddlmZmZmZmZmZmZmZmZmZmZmZmZmZ d dlmZ d d	lm Z  e e!Z"g d
Z#G dd dej$j%Z&G dd dej$j%Z'G dd dej$j%Z(G dd dej$j%Z)G dd dej$j%Z*G dd dej$j%Z+ej,dddee- dee- de-de-deee-  f
ddZ.ej/j0d ej1de-d!e	j2j3dej1fd"d#Z4G d$d% d%ej$j%Z5e6e7ej8d&ed ej1de-d'e	j2j3dej1f
d(d)Z9e6e7ej:d ej1de-d'ejj2j3dej1fd*d+Z;de<fd,d-Z=d.d/ Z>de<fd0d1Z?d2d3 Z@d4d5 ZAde<fd6d7ZBde<fd8d9ZCd:d; ZDde<fd<d=ZEde<fd>d?ZFdS )@    N)OptionalTupleList)ParallelConfig)EquipartitionSharderzContext parallelism requires the 'diffusers>=0.36.dev0'.Please install latest version of diffusers from source: 
pip3 install git+https://github.com/huggingface/diffusers.git   )_get_rank_world_size_gather_size_by_comm_all_to_all_single_o_async _all_to_all_single_qkv_fp8_async_all_to_all_single_o_fp8_async)_all_to_all_single_qkv_uneven_heads_async'_all_to_all_single_o_uneven_heads_async_all_to_all_single_any_o_async _all_to_all_single_any_qkv_async"_all_to_all_single_any_o_fp8_async$_all_to_all_single_any_qkv_fp8_async_all_to_all_single_qkv_async_prepare_ulysses_comm_metadata)ENV)init_logger)	 UnifiedTemplatedUlyssesAttentionr   enable_ulysses_anythingis_ulysses_anything_enableddisable_ulysses_anythingenable_ulysses_float8is_ulysses_float8_enableddisable_ulysses_float8is_ulysses_heads_no_paddingc                   @   sh   e Zd ZdZe	ddejjjdej	dej	dej	de
ej	 ded	ed
e
e dedede
d fddZdS )r   zBA unified wrapper for all Ulysses Attention variants in cache-dit.Nctxquerykeyvalue	attn_mask	dropout_p	is_causalscale
enable_gqa
return_lse_parallel_configr   c                 C   s   t  r&t rt|||||||||	|
||S t|||||||||	|
||S t r9t|||||||||	|
||S t rLt|||||||||	|
||S t|||||||||	|
||S N)	r   r   (_TemplatedUlyssesAnythingAttentionFloat8apply"_TemplatedUlyssesAnythingAttention _TemplatedUlyssesAttentionFloat8r   %_TemplatedUlyssesAttentionUnEvenHeads_TemplatedUlyssesAttention)r   r    r!   r"   r#   r$   r%   r&   r'   r(   
forward_opbackward_opr)    r3   f/home/ubuntu/.local/lib/python3.10/site-packages/cache_dit/parallelism/attention/_templated_ulysses.pyforward:   s   z(UnifiedTemplatedUlyssesAttention.forwardr*   )__name__
__module____qualname____doc__staticmethodtorchautogradfunctionFunctionCtxTensorr   floatboolr5   r3   r3   r3   r4   r   7   s6    	
r   c                   @      e Zd Ze	ddejjjdejdejdejde	ej de
ded	e	e
 d
edede	d fddZedejjjdejfddZdS )r0   Nr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r   c                 C     |j j}| }|
| _|| _|| _t|}t||fi |}t||fi |}t||fi |}| }| }| }|
| |||||||||	d|d}|	rP|^}}}t||fi |}|	rw|	d}t||fi |}| }| }|
d }n| }d }|	r||fS |S NF)	_save_ctxr)   )context_parallel_config_ulysses_mesh	get_groupr1   r2   r)   r   r   r
   	unsqueezesqueeze
contiguousr   r    r!   r"   r#   r$   r%   r&   r'   r(   r1   r2   r)   ulysses_meshgroupmetadata
query_waitkey_wait
value_waitoutlse_out_waitlse_waitr3   r3   r4   r5      sL   

z"_TemplatedUlyssesAttention.forwardgrad_outc                 G      t dNzHBackward pass for Ulysses Attention in cache-dit is not implemented yet.NotImplementedErrorr   rY   argsr3   r3   r4   backward      z#_TemplatedUlyssesAttention.backwardr*   r6   r7   r8   r:   r;   r<   r=   r>   r?   r   r@   rA   r5   r`   r3   r3   r3   r4   r0      s@    	
?r0   c                   @   rB   )r/   Nr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r   c                 C   rC   rD   )rG   rH   rI   r1   r2   r)   r   r   r   rJ   rK   rL   rM   r3   r3   r4   r5      sL   

z-_TemplatedUlyssesAttentionUnEvenHeads.forwardrY   c                 G   rZ   r[   r\   r^   r3   r3   r4   r`   .  ra   z._TemplatedUlyssesAttentionUnEvenHeads.backwardr*   rb   r3   r3   r3   r4   r/      s@    	
Br/   c                   @   rB   )r.   Nr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r   c                 C   s  |j j}| }|
| _|| _|| _t|}t||fi |}t||fi |}t||fi |}| }| }| }|
| |||||||||	d|d}|	rP|^}}}t	||fi |}|	rw|
d}t||fi |}| }| }|d }n| }d }|	r||fS |S rD   )rG   rH   rI   r1   r2   r)   r   r   r   r   rJ   r
   rK   rL   )r   r    r!   r"   r#   r$   r%   r&   r'   r(   r1   r2   r)   rN   rO   rP   rR   rQ   rS   rT   rU   rV   rW   rX   r3   r3   r4   r5   :  sL   	

z(_TemplatedUlyssesAttentionFloat8.forwardrY   c                 G   rZ   )NzOBackward pass for Ulysses Attention Float8 in cache-dit is not implemented yet.r\   r^   r3   r3   r4   r`     ra   z)_TemplatedUlyssesAttentionFloat8.backwardr*   rb   r3   r3   r3   r4   r.   9  s@    	
Hr.   c                   @   rB   )r-   Nr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r   c                 K   s  |j j}| }|
| _|| _|| _t|}t||fi |}t||fi |}t||fi |}| }| }| }|
| |||||||||	d|d}|	rP|^}}}t||fi |}|	rw|	d}t||fi |}| }| }|
d }n| }d }|	r||fS |S rD   )rG   rH   rI   r1   r2   r)   r   r   r   rJ   rK   rL   )r   r    r!   r"   r#   r$   r%   r&   r'   r(   r1   r2   r)   kwargsrN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   r3   r3   r4   r5     sL   

z*_TemplatedUlyssesAnythingAttention.forwardrY   c                 G   rZ   )NzQBackward pass for Ulysses Anything Attention in cache-dit is not implemented yet.r\   r^   r3   r3   r4   r`     ra   z+_TemplatedUlyssesAnythingAttention.backwardr*   rb   r3   r3   r3   r4   r-     s@    	
Cr-   c                   @   rB   )r+   Nr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r   c                 K   s  |j j}| }|
| _|| _|| _t|}t||fi |}t||fi |}t||fi |}| }| }| }|
| |||||||||	d|d}|	rP|^}}}t	||fi |}|	rw|
d}t||fi |}| }| }|d }n| }d }|	r||fS |S rD   )rG   rH   rI   r1   r2   r)   r   r   r   r   rJ   r   rK   rL   )r   r    r!   r"   r#   r$   r%   r&   r'   r(   r1   r2   r)   rc   rN   rO   rP   rR   rQ   rS   rT   rU   rV   rW   rX   r3   r3   r4   r5     sL   	

z0_TemplatedUlyssesAnythingAttentionFloat8.forwardrY   c                 G   rZ   )NzXBackward pass for Ulysses Anything Attention Float8 in cache-dit is not implemented yet.r\   r^   r3   r3   r4   r`   -  ra   z1_TemplatedUlyssesAnythingAttentionFloat8.backwardr*   rb   r3   r3   r3   r4   r+     s@    	
Kr+   @   )maxsizeshapegather_dimsdim
world_sizereturnc                 C   s:   g }t |D ]}tt| }|| ||< || q|S r*   )rangelistcopydeepcopyappend)rf   rg   rh   ri   gather_shapesi
rank_shaper3   r3   r4   _fill_gather_shapes8  s   rs   tensorrO   c                    sv   t |\}}    j}|| }t||}tt|t|||} fdd|D }	tj|	 |d tj	|	|d}
|
S )Nc                    s    g | ]}t j| j jd qS ))devicedtype)r;   emptyru   rv   ).0rf   rt   r3   r4   
<listcomp>[  s    z(_all_gather_anything.<locals>.<listcomp>)rO   rh   )
r   rL   rf   r	   rs   tupledist
all_gatherr;   cat)rt   rh   rO   rV   ri   rf   rank_dimrg   rp   gathered_tensorsgathered_tensorr3   ry   r4   _all_gather_anythingE  s"   

	r   c                   @   s8   e Zd Zedejdedejj	fddZ
edd ZdS )	AllGatherAnythingFunctionrt   rh   rO   c                 C   s4   || _ || _t|| _t|| _t|||}|S r*   )rh   rO   r}   get_world_sizeri   get_rankrankr   )r   rt   rh   rO   r   r3   r3   r4   r5   m  s   z!AllGatherAnythingFunction.forwardc                 C   s$   t j|| j| jd}|| j d d fS )Nr{   )r;   tensor_splitri   rh   r   )r   grad_outputgrad_splitsr3   r3   r4   r`   {  s   z"AllGatherAnythingFunction.backwardN)r6   r7   r8   r:   r;   r?   intr}   device_mesh
DeviceMeshr5   r`   r3   r3   r3   r4   r   k  s    r   clsmeshc                 K   sX   |  | |  ksJ d|   d| d|   d|j|  |dt|  S )NzCannot shard tensor of size z along dim z across mesh of size .r{   )sizer   r}   r   rI   r   rt   rh   r   rc   r3   r3   r4   shard_anything  s   	 r   c                 K   s   |  }t||| }|S r*   )rL   r   r,   rI   r   r3   r3   r4   unshard_anything  s   	r   c                   C      t jS r*   )r   "CACHE_DIT_UNEVEN_HEADS_COMM_NO_PADr3   r3   r3   r4   r        r   c               
   K      z4t jrtjtkrtt_tt_td W d S dt _td tjtkr2tt_tt_t	d W d S W d S  t
yR } zdt _td|  W Y d }~d S d }~ww )NzUlysses Anything Attention is already enabled in cache-dit. but EquipartitionSharder.shard/unshard is not set correctly, resetting it to the correct shard/unshard_anything function.TzUlysses Anything Attention is enabled in cache-dit. Please note that this is an experimental feature and may not be fully tested.zlEquipartitionSharder.shard/unshard is set to shard/unshard_anything function for Ulysses Anything Attention.FzGFailed to enable Ulysses Anything Attention in cache-dit due to error: )r   "CACHE_DIT_ENABELD_ULYSSES_ANYTHINGr   shardr   r   unshardloggerwarninginfo	Exceptionerrorrc   er3   r3   r4   r     s4   


r   c                  K   r   r*   )r   r   rc   r3   r3   r4   r     r   r   c                  K      dt _td d S )NFz=Ulysses Anything Attention is manually disabled in cache-dit.)r   r   r   r   r   r3   r3   r4   r        r   c               
   K   r   )NzUlysses Anything Attention Float8 is already enabled in cache-dit. but EquipartitionSharder.shard/unshard is not set correctly, resetting it to the correct shard/unshard_anything function.TzUlysses Anything Attention Float8 is enabled in cache-dit. Please note that this is an experimental feature and may not be fully tested.zsEquipartitionSharder.shard/unshard is set to shard/unshard_anything function for Ulysses Anything Attention Float8.FzNFailed to enable Ulysses Anything Attention Float8 in cache-dit due to error: )r   )CACHE_DIT_ENABELD_ULYSSES_ANYTHING_FLOAT8r   r   r   r   r   r   r   r   r   r   r   r3   r3   r4   _enable_ulysses_anything_float8  s8   


r   c                  K   r   r*   )r   r   r   r3   r3   r4   #_is_ulysses_anything_float8_enabled	  r   r   c                  K   r   )NFzDUlysses Anything Attention Float8 is manually disabled in cache-dit.)r   r   r   r   r   r3   r3   r4    _disable_ulysses_anything_float8  r   r   c                  K   s$   t  rt  d S dt_td d S )NTzUlysses Attention Float8 is enabled in cache-dit. Please note that this is an experimental feature and may not be fully tested.)r   r   r    CACHE_DIT_ENABELD_ULYSSES_FLOAT8r   r   r   r3   r3   r4   r     s   r   c                  K   s   t jpt S r*   )r   r   r   r   r3   r3   r4   r   !  s   r   c                  K   s$   dt _td t rt  d S d S )NFz;Ulysses Attention Float8 is manually disabled in cache-dit.)r   r   r   r   r   r   r   r3   r3   r4   r   %  s
   

r   )Grm   	functoolstypingr   r   r   r;   torch.distributeddistributedr}   #diffusers.models._modeling_parallelr    diffusers.hooks.context_parallelr   ImportError_distributed_primitivesr   r	   r
   r   r   r   r   r   r   r   r   r   r   cache_dit.envsr   cache_dit.loggerr   r6   r   __all__r<   Functionr   r0   r/   r.   r-   r+   	lru_cacher   rs   compilerallow_in_graphr?   r   r   r   r   classmethodwrapsr   r   r   r   rA   r   r   r   r   r   r   r   r   r   r   r3   r3   r3   r4   <module>   s    <gLOUQ
Y
%!

%'