o
    ۷i1	                     @  st   d dl mZ d dlmZ d dlmZ d dlZd dlmZ edddG dd	 d	Z	G d
d deZ
G dd dZdS )    )annotations)	dataclass)ProtocolN)AttentionMetadataT)frozenslotsc                   @  s   e Zd ZU dZded< dS )ParallelAttentionContextzOpaque per-forward context returned by a parallel strategy.

    Strategies may stash whatever they need here to finish post-processing after
    the attention kernel runs (e.g. reverse resharding, slicing metadata, etc.).
    strnameN)__name__
__module____qualname____doc____annotations__ r   r   a/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/diffusion/attention/parallel/base.pyr      s   
 r   c                   @  s@   e Zd ZdZedddZedddZdddZdddZdS )ParallelAttentionStrategyaO  Pluggable strategy for parallel attention communication/resharding.

    This is intentionally orthogonal to the attention *kernel* backend.
    The kernel backend implements `AttentionImpl.forward()` for a given device,
    while the parallel strategy implements how Q/K/V and outputs are sharded /
    communicated across ranks.
    returnboolc                 C     d S Nr   selfr   r   r   enabled"      z!ParallelAttentionStrategy.enabledr	   c                 C  r   r   r   r   r   r   r   r
   %   r   zParallelAttentionStrategy.namequerytorch.Tensorkeyvalueattn_metadataAttentionMetadata | Nonejtuple[torch.Tensor, torch.Tensor, torch.Tensor, AttentionMetadata | None, ParallelAttentionContext | None]c                 C     dS )zRuns before the attention kernel.

        Returns possibly transformed Q/K/V and metadata, and an optional context
        for `post_attention`.
        Nr   r   r   r   r   r   r   r   r   pre_attention(       z'ParallelAttentionStrategy.pre_attentionattn_outputctxParallelAttentionContext | Nonec                 C  r"   )z Runs after the attention kernel.Nr   r   r&   r'   r   r   r   post_attention5   r%   z(ParallelAttentionStrategy.post_attentionNr   r   r   r	   )
r   r   r   r   r   r   r   r    r   r!   r&   r   r'   r(   r   r   	r   r   r   r   propertyr   r
   r$   r*   r   r   r   r   r      s    
r   c                   @  s@   e Zd ZdZedddZedddZdddZdddZdS )NoParallelAttentionz5Default strategy: do nothing (single device / no SP).r   r   c                 C  r"   )NFr   r   r   r   r   r   @      zNoParallelAttention.enabledr	   c                 C  r"   )Nnoner   r   r   r   r   r
   D   r1   zNoParallelAttention.namer   r   r   r   r   r    c                 C  s   ||||d fS r   r   r#   r   r   r   r$   H   s   z!NoParallelAttention.pre_attentionr&   r'   r(   c                 C  s   |S r   r   r)   r   r   r   r*   Q   r   z"NoParallelAttention.post_attentionNr+   r,   )r   r   r   r   r   r   r   r    r-   r.   r   r   r   r   r0   =   s    
	r0   )
__future__r   dataclassesr   typingr   torch/vllm_omni.diffusion.attention.backends.abstractr   r   r   r0   r   r   r   r   <module>   s   

$