from typing import Type

import torch
import torch.nn as nn

from sglang.multimodal_gen.runtime.distributed.communication_op import (
    sequence_model_parallel_all_gather, sequence_model_parallel_all_to_all_4D)
from sglang.multimodal_gen.runtime.distributed.parallel_state import (
    get_ring_parallel_world_size, get_sequence_parallel_world_size,
    get_sp_parallel_rank, get_sp_world_size, get_ulysses_parallel_world_size)
from sglang.multimodal_gen.runtime.layers.attention.backends.attention_backend import (
    AttentionImpl)
from sglang.multimodal_gen.runtime.layers.attention.selector import get_attn_backend
from sglang.multimodal_gen.runtime.layers.usp import (
    _usp_input_all_to_all, _usp_output_all_to_all, ring_attn)
from sglang.multimodal_gen.runtime.managers.forward_context import (
    ForwardContext, get_forward_context)
from sglang.multimodal_gen.runtime.platforms import AttentionBackendEnum
from sglang.multimodal_gen.utils import get_compute_dtype


class UlyssesAttention(nn.Module):
    """Ulysses-style SequenceParallelism attention layer."""

    def __init__(self,
                 num_heads: int,
                 head_size: int,
                 num_kv_heads: int | None = None,
                 softmax_scale: float | None = None,
                 causal: bool = False,
                 supported_attention_backends: set[AttentionBackendEnum] | None = None,
                 prefix: str = "",
                 **extra_impl_args) -> None:
        super().__init__()
        if softmax_scale is None:
            self.softmax_scale = head_size**-0.5
        else:
            self.softmax_scale = softmax_scale
        if num_kv_heads is None:
            num_kv_heads = num_heads

        dtype = get_compute_dtype()
        attn_backend = get_attn_backend(
            head_size, dtype,
            supported_attention_backends=supported_attention_backends)
        impl_cls: Type[AttentionImpl] = attn_backend.get_impl_cls()
        self.attn_impl = impl_cls(num_heads=num_heads,
                                  head_size=head_size,
                                  softmax_scale=self.softmax_scale,
                                  causal=causal,
                                  num_kv_heads=num_kv_heads,
                                  prefix=f"{prefix}.impl",
                                  **extra_impl_args)
        self.num_heads = num_heads
        self.head_size = head_size
        self.causal = causal
        self.backend = attn_backend.get_enum()
        self.dtype = dtype

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        replicated_q: torch.Tensor | None = None,
        replicated_k: torch.Tensor | None = None,
        replicated_v: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Forward pass for distributed attention.

        Args:
            q (torch.Tensor): Query tensor [batch_size, seq_len, num_heads, head_dim]
            k (torch.Tensor): Key tensor [batch_size, seq_len, num_heads, head_dim]
            v (torch.Tensor): Value tensor [batch_size, seq_len, num_heads, head_dim]
            replicated_q (Optional[torch.Tensor]): Replicated query tensor, typically for text tokens
            replicated_k (Optional[torch.Tensor]): Replicated key tensor
            replicated_v (Optional[torch.Tensor]): Replicated value tensor

        Returns:
            Tuple[torch.Tensor, Optional[torch.Tensor]]: A tuple containing:
                - o (torch.Tensor): Output tensor after attention for the main sequence
                - replicated_o (Optional[torch.Tensor]): Output tensor for replicated tokens, if provided
        """
        assert q.dim() == 4 and k.dim() == 4 and v.dim() == 4, "Expected 4D tensors"
        batch_size, seq_len, num_heads, head_dim = q.shape

        local_rank = get_sp_parallel_rank()
        world_size = get_sp_world_size()
        forward_context: ForwardContext = get_forward_context()
        ctx_attn_metadata = forward_context.attn_metadata

        # Stack Q, K, V along the batch dimension, then all-to-all:
        # scatter heads across ranks, gather the full sequence.
        qkv = torch.cat([q, k, v], dim=0)
        qkv = sequence_model_parallel_all_to_all_4D(qkv, scatter_dim=2, gather_dim=1)
        qkv = self.attn_impl.preprocess_qkv(qkv, ctx_attn_metadata)

        # Append the replicated (e.g. text-token) QKV, keeping only this
        # rank's share of the heads.
        if replicated_q is not None:
            assert replicated_k is not None and replicated_v is not None
            replicated_qkv = torch.cat(
                [replicated_q, replicated_k, replicated_v], dim=0)
            heads_per_rank = num_heads // world_size
            replicated_qkv = replicated_qkv[:, :, local_rank *
                                            heads_per_rank:(local_rank + 1) *
                                            heads_per_rank]
            qkv = torch.cat([qkv, replicated_qkv], dim=1)

        q, k, v = qkv.chunk(3, dim=0)
        output = self.attn_impl.forward(q, k, v, ctx_attn_metadata)

        # Split off the replicated part and all-gather its heads.
        replicated_output = None
        if replicated_q is not None:
            replicated_output = output[:, seq_len * world_size:]
            output = output[:, :seq_len * world_size]
            replicated_output = sequence_model_parallel_all_gather(
                replicated_output.contiguous(), dim=2)

        output = self.attn_impl.postprocess_output(output, ctx_attn_metadata)

        # Reverse all-to-all: scatter sequence, gather heads.
        output = sequence_model_parallel_all_to_all_4D(output, scatter_dim=1, gather_dim=2)
        return output, replicated_output


class UlyssesAttention_VSA(UlyssesAttention):
    """Distributed attention layer with VSA support."""

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        replicated_q: torch.Tensor | None = None,
        replicated_k: torch.Tensor | None = None,
        replicated_v: torch.Tensor | None = None,
        gate_compress: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Forward pass for distributed attention.

        Args:
            q (torch.Tensor): Query tensor [batch_size, seq_len, num_heads, head_dim]
            k (torch.Tensor): Key tensor [batch_size, seq_len, num_heads, head_dim]
            v (torch.Tensor): Value tensor [batch_size, seq_len, num_heads, head_dim]
            gate_compress (torch.Tensor): Gate compress tensor [batch_size, seq_len, num_heads, head_dim]
            replicated_q (Optional[torch.Tensor]): Replicated query tensor, typically for text tokens
            replicated_k (Optional[torch.Tensor]): Replicated key tensor
            replicated_v (Optional[torch.Tensor]): Replicated value tensor

        Returns:
            torch.Tensor: Output tensor after attention for the main sequence
        """
        assert (replicated_q is None and replicated_k is None
                and replicated_v is None), "Replicated QKV is not supported for VSA now"
        assert q.dim() == 4 and k.dim() == 4 and v.dim() == 4, "Expected 4D tensors"

        forward_context: ForwardContext = get_forward_context()
        ctx_attn_metadata = forward_context.attn_metadata

        # Redistribute Q, K, V and the gate tensor together, then split again.
        qkvg = torch.cat([q, k, v, gate_compress], dim=0)
        qkvg = sequence_model_parallel_all_to_all_4D(qkvg, scatter_dim=2, gather_dim=1)
        qkvg = self.attn_impl.preprocess_qkv(qkvg, ctx_attn_metadata)
        q, k, v, gate_compress = qkvg.chunk(4, dim=0)

        output = self.attn_impl.forward(
            q, k, v, gate_compress=gate_compress, attn_metadata=ctx_attn_metadata)
        output = self.attn_impl.postprocess_output(output, ctx_attn_metadata)
        output = sequence_model_parallel_all_to_all_4D(output, scatter_dim=1, gather_dim=2)
        return output


class LocalAttention(nn.Module):
    """Attention layer."""

    def __init__(self,
                 num_heads: int,
                 head_size: int,
                 num_kv_heads: int | None = None,
                 softmax_scale: float | None = None,
                 causal: bool = False,
                 supported_attention_backends: set[AttentionBackendEnum] | None = None,
                 **extra_impl_args) -> None:
        super().__init__()
        if softmax_scale is None:
            self.softmax_scale = head_size**-0.5
        else:
            self.softmax_scale = softmax_scale
        if num_kv_heads is None:
            num_kv_heads = num_heads

        dtype = get_compute_dtype()
        attn_backend = get_attn_backend(
            head_size, dtype,
            supported_attention_backends=supported_attention_backends)
        impl_cls: Type[AttentionImpl] = attn_backend.get_impl_cls()
        self.attn_impl = impl_cls(num_heads=num_heads,
                                  head_size=head_size,
                                  softmax_scale=self.softmax_scale,
                                  causal=causal,
                                  num_kv_heads=num_kv_heads,
                                  **extra_impl_args)
        self.num_heads = num_heads
        self.head_size = head_size
        self.causal = causal
        self.backend = attn_backend.get_enum()
        self.dtype = dtype

    def forward(self, q: torch.Tensor, k: torch.Tensor,
                v: torch.Tensor) -> torch.Tensor:
        """
        Apply local attention between query, key and value tensors.

        Args:
            q (torch.Tensor): Query tensor of shape [batch_size, seq_len, num_heads, head_dim]
            k (torch.Tensor): Key tensor of shape [batch_size, seq_len, num_heads, head_dim]
            v (torch.Tensor): Value tensor of shape [batch_size, seq_len, num_heads, head_dim]

        Returns:
            torch.Tensor: Output tensor after local attention
        """
        assert q.dim() == 4 and k.dim() == 4 and v.dim() == 4, "Expected 4D tensors"

        forward_context: ForwardContext = get_forward_context()
        ctx_attn_metadata = forward_context.attn_metadata
        output = self.attn_impl.forward(q, k, v, attn_metadata=ctx_attn_metadata)
        return output


class USPAttention(nn.Module):
    """
    Ulysses Sequence Parallelism with Ring Attention.

    This class implements the USP algorithm, which is a combination of
    Ulysses-style all-to-all communication for sequence-head dimension sharding
    and Ring Attention for fine-grained sequence parallelism within subgroups.
    """

    def __init__(self,
                 num_heads: int,
                 head_size: int,
                 num_kv_heads: int | None = None,
                 softmax_scale: float | None = None,
                 causal: bool = False,
                 supported_attention_backends: set[AttentionBackendEnum] | None = None,
                 prefix: str = "",
                 dropout_rate: float = 0.0,
                 **extra_impl_args) -> None:
        super().__init__()
        if softmax_scale is None:
            self.softmax_scale = head_size**-0.5
        else:
            self.softmax_scale = softmax_scale
        if num_kv_heads is None:
            num_kv_heads = num_heads

        dtype = get_compute_dtype()
        attn_backend = get_attn_backend(
            head_size, dtype,
            supported_attention_backends=supported_attention_backends)
        impl_cls: Type[AttentionImpl] = attn_backend.get_impl_cls()
        self.attn_impl = impl_cls(num_heads=num_heads,
                                  head_size=head_size,
                                  softmax_scale=self.softmax_scale,
                                  causal=causal,
                                  num_kv_heads=num_kv_heads,
                                  prefix=f"{prefix}.impl",
                                  **extra_impl_args)
        self.num_heads = num_heads
        self.head_size = head_size
        self.causal = causal
        self.backend = attn_backend.get_enum()
        self.dtype = dtype
        self.dropout_p = dropout_rate

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        replicated_q: torch.Tensor | None = None,
        replicated_k: torch.Tensor | None = None,
        replicated_v: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Forward pass for USPAttention.

            q, k, v: [B, S_local, H, D]

        Note: Replicated tensors are not supported in this implementation.
        """
        assert (replicated_q is None and replicated_k is None
                and replicated_v is None), "USPAttention does not support replicated_qkv."

        forward_context = get_forward_context()
        ctx_attn_metadata = forward_context.attn_metadata

        # No sequence parallelism: run the backend implementation directly.
        if get_sequence_parallel_world_size() == 1:
            out = self.attn_impl.forward(q, k, v, ctx_attn_metadata)
            return out

        # Ulysses stage: all-to-all that scatters heads and gathers sequence.
        # NOTE: the argument lists of the _usp_* helpers and ring_attn below
        # are assumptions; the authoritative signatures live in layers/usp.py.
        if get_ulysses_parallel_world_size() > 1:
            q = _usp_input_all_to_all(q)
            k = _usp_input_all_to_all(k)
            v = _usp_input_all_to_all(v)

        # Ring stage: ring attention across the ring subgroup, otherwise a
        # single local attention call.
        if get_ring_parallel_world_size() > 1:
            out = ring_attn(q, k, v,
                            attn_impl=self.attn_impl,
                            is_causal=self.causal,
                            dropout_p=self.dropout_p)
        else:
            out = self.attn_impl.forward(q, k, v, ctx_attn_metadata)

        # Reverse Ulysses all-to-all: scatter sequence, gather heads.
        if get_ulysses_parallel_world_size() > 1:
            out = _usp_output_all_to_all(out)

        return out
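

# Illustrative single-process sketch (not part of the runtime path): it mimics
# what UlyssesAttention's all-to-all exchange achieves. With sp_world_size
# ranks, the input all-to-all gives every rank the full sequence but only
# H / sp_world_size heads; because attention heads are independent, running
# attention on each head group and re-concatenating along the head dimension
# reproduces the full-attention result. The shapes, world size, and the use of
# plain scaled_dot_product_attention below are made-up assumptions for the
# demonstration; the real layers go through the selected attention backend and
# the distributed helpers above.
if __name__ == "__main__":
    import torch.nn.functional as F

    torch.manual_seed(0)
    sp_world_size = 4  # hypothetical number of sequence-parallel ranks
    B, S, H, D = 2, 32, 8, 64  # hypothetical sizes; H divisible by sp_world_size
    q, k, v = (torch.randn(B, S, H, D) for _ in range(3))

    per_rank_outputs = []
    for rank in range(sp_world_size):
        # After the input all-to-all (scatter_dim=2, gather_dim=1), rank `rank`
        # holds the whole sequence and only heads [h0:h1].
        h0 = rank * H // sp_world_size
        h1 = (rank + 1) * H // sp_world_size
        o = F.scaled_dot_product_attention(
            q[:, :, h0:h1].transpose(1, 2),  # SDPA expects [B, H, S, D]
            k[:, :, h0:h1].transpose(1, 2),
            v[:, :, h0:h1].transpose(1, 2),
        ).transpose(1, 2)  # back to [B, S, H/P, D]
        per_rank_outputs.append(o)

    # The output all-to-all (scatter_dim=1, gather_dim=2) would hand each rank
    # its sequence shard with all heads; concatenating the head groups here
    # plays the role of that gather on a single process.
    ulysses_out = torch.cat(per_rank_outputs, dim=2)

    reference = F.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
    ).transpose(1, 2)

    torch.testing.assert_close(ulysses_out, reference)
    print("Ulysses head-group attention matches full attention:",
          tuple(ulysses_out.shape))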