o
    پi"                     @   s  d dl Z d dlmZ d dlZd dlm  mZ d dlm	Z	 d dl
mZmZ d dlmZ de	_er6d dlmZ e eZdejd	ejfd
dZdejd	ejfddZddejded	ejfddZddejded	ejfddZ		d dejdejdejdddedefddZdS )!    N)TYPE_CHECKING)_cp_options)get_sp_groupget_ulysses_parallel_world_size)torch_releaseF)AttentionImpltensorreturnc                 C   s   t | tjr
|  S | S )zu
    When tracing the code, the result tensor is not an AsyncCollectiveTensor,
    so we cannot call ``wait()``.
    )
isinstanceft_cAsyncCollectiveTensorwait)r    r   \/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/layers/usp.py_maybe_wait   s   r   xc                 C   sN   t  j}|d usJ d| j}|  } tj| d d |d} t| } | |} | S )Nz)Ulysses process group is not initialized.)output_split_sizesinput_split_sizesgroup)r   ulysses_groupshapeflattenr   all_to_all_singler   reshape)r   
ulysses_pgx_shaper   r   r   _usp_all_to_all_single$   s   
r      head_dimc           
      C   s:  t  }|dkr	| S | jdksJ d| j |dv s!J d| |dkr'dnd}|dkr4|dkr4| }n
| d||d }|j\}}}}|| dksVJ d	| d
| d|dddd }t|}|||| |d|ddddd||| d|}|dkr|dkr|S g d}	d|	|< d|	|< |t|	 S )a  
    Perform Ulysses-style input all-to-all over the head dimension.

    Default layout expects heads at dim=1 and sequence at dim=2:
        [b, h, s_local, d] -> [b, h_local, s_global, d]

    If heads are at dim=2 (input is [b, s_local, h, d]), set head_dim=2, and the
    function returns [b, s_global, h_local, d], preserving the original
    head/sequence dim ordering.

    Args:
        x: A 4D tensor with layout [b, *, *, d] where '*' are sequence and heads
        head_dim: Which dimension index corresponds to heads (1 or 2)

    Returns:
        Tensor with the same dim order as input, with heads sharded and sequence gathered.
    r      x must have 4 dimensions, got r      head_dim must be 1 or 2, got r"   r      zh (#) must be divisible by world_size ()r   NNr$   r   ndimpermute
contiguousr   r   r   tuple
r   r   
world_sizeseq_dimx_cbhsd	new_orderr   r   r   _usp_input_all_to_all1   s2   r7   c           
      C   s:  t  }|dkr	| S | jdksJ d| j |dv s!J d| |dkr'dnd}|dkr4|dkr4| }n
| d||d }|j\}}}}|| dksVJ d	| d
| d|dddd }t|}|||| |d|ddddd|d|| |}|dkr|dkr|S g d}	d|	|< d|	|< |t|	 S )a  
    Perform Ulysses-style output all-to-all over the head dimension (inverse of input).

    Default layout expects heads at dim=1 and sequence at dim=2:
        [b, h_local, s, d] -> [b, h, s_local, d]

    If heads are at dim=2 (input is [b, s_global, h // world_size, d]), set head_dim=2,
    and the function returns [b, s_local, h, d], preserving the original head/sequence
    dim ordering.

    Args:
        x: A 4D tensor with layout [b, *, *, d] where '*' are sequence and heads
        head_dim: Which dimension index corresponds to heads (1 or 2)

    Returns:
        Tensor with the same dim order as input, with heads gathered and sequence sharded.
    r   r   r    r!   r#   r"   r   r$   zs (r%   r&   r'   r(   r)   r.   r   r   r   _usp_output_all_to_allk   s2   r8           querykeyvalue	attn_implr   	is_causal	dropout_pc              	      s   ddl m} t j}|dusJ dt| g d } t|g d }t|g d } fdd}tdk}	t|||| |||d	}
|	rS|dd
di|
^}}n	|di |
^}}t|g d}|S )a  
    Ring Attention implementation.

    This function implements Ring Attention, a strategy for distributed attention
    computation that reduces peak memory usage. It accepts a generic attention
    implementation (`attn_impl`) which is called by the underlying PyTorch
    distributed attention primitive.

    Args:
        query, key, value: The input tensors for attention.
        attn_impl: An instance of an attention implementation backend
                   (e.g., FlashAttentionImpl) whose `forward` method will be
                   used as the computational kernel.
        is_causal: Whether to apply causal masking.
        dropout_p: Dropout probability.
    r   )_templated_ring_attentionNz&Ring process group is not initialized.r   r"   r   r$   c                    sh   t | g d} t |g d}t |g d} j| ||d dd^}}}t |g d}||g|R S )NrA   T)attn_metadatareturn_softmax_lse)torchr+   forward)qkvargskwargsoutputsoftmax_lserestr=   r   r   attn_callable_adapter   s   z(ring_attn.<locals>.attn_callable_adapter)r"      )opr?   r>   r:   r;   r<   r   r0   r   r   )	0torch.distributed.tensor.experimental._attentionr@   r   
ring_grouprD   r+   r,   r   dict)r:   r;   r<   r=   r>   r?   r@   ring_pgrO   use_segment_idattn_kwargsout_rK   r   rN   r   	ring_attn   s8   

rZ   )r   )Fr9   )loggingtypingr   rD   )torch.distributed._functional_collectivesdistributed_functional_collectivesr   rR   r   8sglang.multimodal_gen.runtime.distributed.parallel_stater   r   sglang.srt.utils.commonr   enable_load_balanceIsglang.multimodal_gen.runtime.layers.attention.backends.attention_backendr   	getLogger__name__loggerTensorr   r   intr7   r8   boolfloatrZ   r   r   r   r   <module>   s<   

:>