o
    ۷i3                     @   s
  d dl mZ d dlmZmZ d dlZd dlmZ ddl	m
Z
 er!	 e
eZeG dd dZeG dd	 d	Zed
dG dd dZed
dG dd dZeeeB eee B eedf B f Zeee B eedf B ZeeeeB f Zdedejdee fddZdS )    )	dataclass)TYPE_CHECKINGLiteralN   )
get_loggerc                   @   s>  e Zd ZU dZdZedB ed< dZedB ed< dZe	ed< dZ
ed ed	< d
Ze	ed< dZeed< dZeed< dZejed< dZejjjed< dZejjjed< dZejjjed< dZejjjed< dZeed< dZeed< dd Zedeeef fddZedeeef fddZ dededejdejjjfd d!Z!dS )"ContextParallelConfiga  
    Configuration for context parallelism.

    Args:
        ring_degree (`int`, *optional*, defaults to `1`):
            Number of devices to use for Ring Attention. Sequence is split across devices. Each device computes
            attention between its local Q and KV chunks passed sequentially around ring. Lower memory (only holds 1/N
            of KV at a time), overlaps compute with communication, but requires N iterations to see all tokens. Best
            for long sequences with limited memory/bandwidth. Number of devices to use for ring attention within a
            context parallel region. Must be a divisor of the total number of devices in the context parallel mesh.
        ulysses_degree (`int`, *optional*, defaults to `1`):
            Number of devices to use for Ulysses Attention. Sequence split is across devices. Each device computes
            local QKV, then all-gathers all KV chunks to compute full attention in one pass. Higher memory (stores all
            KV), requires high-bandwidth all-to-all communication, but lower latency. Best for moderate sequences with
            good interconnect bandwidth.
        convert_to_fp32 (`bool`, *optional*, defaults to `True`):
            Whether to convert output and LSE to float32 for ring attention numerical stability.
        rotate_method (`str`, *optional*, defaults to `"allgather"`):
            Method to use for rotating key/value states across devices in ring attention. Currently, only `"allgather"`
            is supported.

    Nring_degreeulysses_degreeTconvert_to_fp32	allgather)r   alltoallrotate_methodFulysses_anything_rank_world_size_device_mesh_flattened_mesh
_ring_mesh_ulysses_mesh_ring_local_rank_ulysses_local_rankc                 C   s   | j d u rd| _ | jd u rd| _| j dkr| jdkrtd| j dk s(| jdk r,td| jdkr:td| j d| jrO| jdkrFtd| j dkrQtdd S d S )	N   zfEither ring_degree or ulysses_degree must be greater than 1 in order to use context parallel inferencezF`ring_degree` and `ulysses_degree` must be greater than or equal to 1.r   z=Only rotate_method='allgather' is supported for now, but got .zIulysses_degree must be greater than 1 for ulysses_anything to be enabled.z8ulysses_anything cannot be enabled when ring_degree > 1.)r   r	   
ValueErrorr   NotImplementedErrorr   self r   Y/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/_modeling_parallel.py__post_init__U   s*   




z#ContextParallelConfig.__post_init__returnc                 C   s   | j | jfS N)r   r	   r   r   r   r   
mesh_shapek   s   z ContextParallelConfig.mesh_shapec                 C   s   dS )z$Dimension names for the device mesh.)ringulyssesr   r   r   r   r   mesh_dim_nameso   s   z$ContextParallelConfig.mesh_dim_namesrank
world_sizedevicemeshc                 C   s   || _ || _|| _|| _| j| j |kr$td| j d| j d| d| j | _| jd | _	| jd | _
| j	 | _| j
 | _d S )NzThe product of `ring_degree` (z) and `ulysses_degree` (z") must not exceed the world size (z).r$   r%   )r   r   r   r   r	   r   r   _flattenr   r   r   get_local_rankr   r   r   r'   r(   r)   r*   r   r   r   setupt   s   zContextParallelConfig.setup)"__name__
__module____qualname____doc__r   int__annotations__r	   r
   boolr   r   r   r   r   r   torchr)   r   distributeddevice_mesh
DeviceMeshr   r   r   r   r   r    propertytupler#   strr&   r.   r   r   r   r   r   )   s,   
 &r   c                
   @   s   e Zd ZU dZdZedB ed< dZeed< dZ	eed< dZ
ejed< dZejjjed< ddd	ed
edejdejjjdB fddZdS )ParallelConfigz
    Configuration for applying different parallelisms.

    Args:
        context_parallel_config (`ContextParallelConfig`, *optional*):
            Configuration for context parallelism.
    Ncontext_parallel_configr   r   r   r   )r*   r'   r(   r)   r*   c                C   s<   || _ || _|| _|| _| jd ur| j|||| d S d S r"   )r   r   r   r   r>   r.   r-   r   r   r   r.      s   
zParallelConfig.setup)r/   r0   r1   r2   r>   r   r4   r   r3   r   r   r6   r)   r   r7   r8   r9   r.   r   r   r   r   r=      s"   
 r=   T)frozenc                   @   s>   e Zd ZU dZeed< dZedB ed< dZeed< dd Z	dS )	ContextParallelInputa  
    Configuration for splitting an input tensor across context parallel region.

    Args:
        split_dim (`int`):
            The dimension along which to split the tensor.
        expected_dims (`int`, *optional*):
            The expected number of dimensions of the tensor. If provided, a check will be performed to ensure that the
            tensor has the expected number of dimensions before splitting.
        split_output (`bool`, *optional*, defaults to `False`):
            Whether to split the output tensor of the layer along the given `split_dim` instead of the input tensor.
            This is useful for layers whose outputs should be split after it does some preprocessing on the inputs (ex:
            RoPE).
    	split_dimNexpected_dimsFsplit_outputc                 C   s   d| j  d| j d| j dS )NzContextParallelInput(split_dim=, expected_dims=z, split_output=))rA   rB   rC   r   r   r   r   __repr__   s   zContextParallelInput.__repr__)
r/   r0   r1   r2   r3   r4   rB   rC   r5   rF   r   r   r   r   r@      s   
 r@   c                   @   s2   e Zd ZU dZeed< dZedB ed< dd ZdS )ContextParallelOutputa  
    Configuration for gathering an output tensor across context parallel region.

    Args:
        gather_dim (`int`):
            The dimension along which to gather the tensor.
        expected_dims (`int`, *optional*):
            The expected number of dimensions of the tensor. If provided, a check will be performed to ensure that the
            tensor has the expected number of dimensions before gathering.
    
gather_dimNrB   c                 C   s   d| j  d| j dS )Nz!ContextParallelOutput(gather_dim=rD   rE   )rH   rB   r   r   r   r   rF      s   zContextParallelOutput.__repr__)r/   r0   r1   r2   r3   r4   rB   rF   r   r   r   r   rG      s
   
 rG   .sizegroupr!   c                    sz   t j|d}tt j|d}d|v rdntj   fddt|D }t j|tj	| g tj
d|d dd |D }|S )zsGather the local size from all ranks.
    size: int, local size return: list[int], list of size from all ranks
    )rJ   cpuc                    s   g | ]}t jd  t jdqS ))r   r)   dtype)r6   emptyint64).0_gather_devicer   r   
<listcomp>$  s    z'gather_size_by_comm.<locals>.<listcomp>rL   c                 S   s   g | ]}|d    qS )r   )item)rP   sr   r   r   rT   +  s    )distget_world_sizer<   get_backendr6   acceleratorcurrent_acceleratorrange
all_gathertensorrO   )rI   rJ   r(   comm_backendsgathered_sizesr   rR   r   gather_size_by_comm  s   ra   )dataclassesr   typingr   r   r6   torch.distributedr7   rW   utilsr   r/   loggerr   r=   r@   rG   dictr<   r3   listr;   ContextParallelInputTypeContextParallelOutputTypeContextParallelModelPlanProcessGroupra   r   r   r   r   <module>   s,   
\  +