o
    ۷iI                     @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ eeZG dd dejZdS )    N)init_logger)AttentionMetadata)SDPABackend)!build_parallel_attention_strategy)NoParallelAttention)RingParallelAttention)get_attn_backend)get_sp_group)get_forward_contextis_forward_context_availablec                       s   e Zd Z					ddededed	ed
edB dedededef fddZdd Z	dde	j
de	j
de	j
dede	j
f
ddZdd Zdd Z  ZS ) 	AttentionN       F	num_heads	head_sizecausalsoftmax_scalenum_kv_headsprefixscatter_idx
gather_idxuse_syncc
                    s"  t    td| _| j | _| j|||||d| _t |||||d| _d | _	|| _
|| _|| _|	| _|| _d| _d | _d | _z1t j}
|
j| _	|
jjdkrrd| _zt }|j| _t|| _W n tyq   d| _d | _Y nw W n ty   d| _d | _Y nw t|||	d| _t | _d S )N)r   r   r   r   r   Fr   T)r   r   r   )super__init__r   attn_backendget_impl_clsattn_impl_cls	attentionr   sdpa_fallbackbackend_prefr   r   r   r   r   use_ringring_pgring_runnerr
   omni_diffusion_configattention_backendparallel_configring_degreer	   
ring_groupr   	Exceptionr   parallel_strategyr   _no_parallel_strategy)selfr   r   r   r   r   r   r   r   r   configsp_group	__class__ Y/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/diffusion/attention/layer.pyr      sd   



zAttention.__init__c                 C   s   t  rt }|js| jS | jS )a8  Get the parallel strategy based on current SP active state.

        Returns NoParallelAttention if we're outside an SP sharded region
        (e.g., in noise_refiner/context_refiner before unified_prepare in Z-Image).
        This avoids unnecessary SP communication for layers not covered by _sp_plan.
        )r   r
   	sp_activer,   r+   )r-   ctxr2   r2   r3   _get_active_parallel_strategy^   s
   z'Attention._get_active_parallel_strategyquerykeyvalueattn_metadatareturnc                 C   sd   |   }|||||\}}}}}| jr"|| jur"| ||||}n| ||||}|||}|S N)r6   pre_attentionr"   r,   _run_ring_attention_run_local_attentionpost_attention)r-   r7   r8   r9   r:   strategyr5   outr2   r2   r3   forwardk   s   zAttention.forwardc              	   C   sX   |j tjkr#tdt| j d| j d|j  d | j	||||S | j	||||S )Nz3Only SDPA supports float32. Overriding user config z attention_backend='z' to 'sdpa' for dtype=.)
dtypetorchfloat32loggerwarning_oncetyper   r!   r    rC   r-   r7   r8   r9   r:   r2   r2   r3   r?      s   zAttention._run_local_attentionc                 C   s.   | j d ur| j j||||| j| jdS td)N)r   r   zCRing attention is enabled but strategy is not RingParallelAttention)r$   run_attentionr   r   RuntimeErrorrK   r2   r2   r3   r>      s
   
zAttention._run_ring_attention)Nr   r   r   Fr<   )__name__
__module____qualname__intboolfloatstrr   r6   rF   Tensorr   rC   r?   r>   __classcell__r2   r2   r0   r3   r      sP    	
C
r   )rF   torch.nnnnvllm.loggerr   /vllm_omni.diffusion.attention.backends.abstractr   +vllm_omni.diffusion.attention.backends.sdpar   &vllm_omni.diffusion.attention.parallelr   +vllm_omni.diffusion.attention.parallel.baser   +vllm_omni.diffusion.attention.parallel.ringr   &vllm_omni.diffusion.attention.selectorr   .vllm_omni.diffusion.distributed.parallel_stater	   #vllm_omni.diffusion.forward_contextr
   r   rN   rH   Moduler   r2   r2   r2   r3   <module>   s   	