# NOTE: Recovered from a compiled (.pyc) artifact. Docstrings, signatures, and
# string constants are as found in the artifact; function bodies are
# best-effort reconstructions of the original control flow.
"""Rotary Positional Embeddings."""

import functools
from collections import OrderedDict
from typing import Any, Optional, Tuple

import torch

from sglang.jit_kernel.diffusion.triton.rotary import apply_rotary_embedding
from sglang.multimodal_gen.runtime.distributed.parallel_state import get_sp_group
from sglang.multimodal_gen.runtime.layers.custom_op import CustomOp
from sglang.multimodal_gen.runtime.utils.logging_utils import init_logger

logger = init_logger(__name__)
dejdejdejdee dedeej deejejf fddZdejdejfddZdejdejfddZ		dSdejdejdejdededejfddZedG d d! d!eZG d"d# d#eZG d$d% d%ejjZG d&d' d'ejjZ dTdee!ed)f B d*ede!ed)f fd+d,Z"d(dej#d-d.ee!ed)f B d/ee!ed)f B d*ed0ej$e%B dB d1ej&dejfd2d3Z'd4d5d5ej#dfd*ed6ej(eB d7e)d8e)d9e)d1ej&d0ej$e%B dB de!ejejf fd:d;Z*d4d5d5ddd<ej#ddd=	d8e)e+e) B d9e)e+e) B d>ed?ed@ed1ej&dAed0ej$e%B dB de!ejejf fdBdCZ,d5d5dej#ddfd>ed1ej&dAed0ej$e%B dB de!ejejf f
dDdEZ-i Z.e/e!ef e0dF< e a1dGe0dH< e Z2dIe0dJ< 	K			5dUdedLedMedNee)B dedOe/e%ef dB d1ej&dB dPe)defdQdRZ3dS )VzRotary Positional Embeddings.    N)OrderedDict)AnyOptionalTuple)apply_rotary_embedding)get_sp_group)CustomOp)init_loggerF)	head_sizeis_neox	positionsqkcos_sin_cacher
   r   r   returnc             
   C   s$  |   dks|  dkrtdt| j dt|j | j|jkr.td| j d|j t|tjr:|  dks>td| j\}}}}	|d u rK|	}||	krYtd|	 d	| zd
dlm}
 W n t	y   d
d l
}|jddd |jd d }|d u r|d |d |f | j}|d ||d f | j}|d
|dd|| d}|d
|dd|| d}n!||jd}||d |f | j}|||d f | j}| || ||	}||| ||	}t|||| d}t|||| d}|||||	|||||	f Y S w |d u r0tj|| jtjd}|dkr*|n||}n0t|tjrE|jtjkrE|  dksItd| || kr`td||  d|  | || ||	  }||| ||	  }|
||||	||d |||||	|||||	fS )N   z>Expected q/k to be 4D [bsz, seqlen, nheads, head_size], got q:z k:z&q and k must have the same shape, got z vs    z'cos_sin_cache must be a 2D torch.Tensorzhead_size mismatch: inferred z, but head_size=r   )%apply_rope_with_cos_sin_cache_inplacez8FlashInfer not available, using Triton fallback for RoPE)
stacklevel)interleaveddevicedtype   z(positions must be a 1D torch.long Tensorz$positions length must be bsz*seqlen=z, got )r   querykeyr
   r   r   )dim
ValueErrortupleshape
isinstancetorchTensorflashinfer.roper   ImportErrorwarningswarntor   	unsqueezeexpandreshaper   viewr   arangelongrepeatnumel
contiguous)r   r   r   r
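
# Example (a minimal, illustrative sketch; shapes and values are assumptions,
# not from the artifact):
#
#   cache = RotaryEmbedding(64, 64, 2048, 10000, True, torch.float32).cos_sin_cache
#   q = torch.randn(2, 16, 8, 64, device="cuda")
#   k = torch.randn(2, 16, 8, 64, device="cuda")
#   pos = torch.arange(32, device="cuda")  # one entry per token: bsz * seqlen
#   q, k = apply_flashinfer_rope_qk_inplace(pos, q, k, cache.to("cuda"))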

def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
    # GPT-NeoX style: rotate the two contiguous halves of the head dim.
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def _rotate_gptj(x: torch.Tensor) -> torch.Tensor:
    # GPT-J style: rotate interleaved even/odd channels.
    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)

def _apply_rotary_emb(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    is_neox_style: bool,
) -> torch.Tensor:
    """
    Args:
        x: [num_tokens, num_heads, head_size] or [num_tokens, head_size]
        cos: [num_tokens, head_size // 2]
        sin: [num_tokens, head_size // 2]
        is_neox_style: Whether to use the Neox-style or GPT-J-style rotary
            positional embeddings.
    """
    if is_neox_style:
        # Native PyTorch path for the Neox (two contiguous halves) layout.
        cos = cos.unsqueeze(-2)
        sin = sin.unsqueeze(-2)
        x1, x2 = torch.chunk(x, 2, dim=-1)
        o1 = (x1.float() * cos - x2.float() * sin).type_as(x)
        o2 = (x2.float() * cos + x1.float() * sin).type_as(x)
        return torch.cat((o1, o2), dim=-1)
    # Interleaved (GPT-J) layout: delegate to the fused Triton kernel, as
    # recovered from the artifact.
    return apply_rotary_embedding(x, cos, sin, is_neox_style)
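
# Example (illustrative shapes, assumed for the sketch): rotating one head of
# size 8 over ten token positions.
#
#   x = torch.randn(10, 1, 8)  # [num_tokens, num_heads, head_size]
#   freqs = torch.outer(torch.arange(10.0), torch.ones(4))
#   out = _apply_rotary_emb(x, freqs.cos(), freqs.sin(), is_neox_style=True)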

@CustomOp.register("rotary_embedding")
class RotaryEmbedding(CustomOp):
    """Original rotary positional embedding."""

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: int | float,
        is_neox_style: bool,
        dtype: torch.dtype,
    ) -> None:
        super().__init__()
        self.head_size = head_size
        self.rotary_dim = rotary_dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.is_neox_style = is_neox_style
        self.dtype = dtype

        cache = self._compute_cos_sin_cache()
        cache = cache.to(dtype)
        self.register_buffer("cos_sin_cache", cache, persistent=False)

    def _compute_inv_freq(self, base: int | float) -> torch.Tensor:
        """Compute the inverse frequency."""
        inv_freq = 1.0 / (
            base
            ** (torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)
        )
        return inv_freq

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        """Compute the cos and sin cache."""
        inv_freq = self._compute_inv_freq(self.base)
        t = torch.arange(self.max_position_embeddings, dtype=torch.float)
        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos()
        sin = freqs.sin()
        cache = torch.cat((cos, sin), dim=-1)
        return cache

    def forward_cuda(self, *args, **kwargs):
        return self.forward_native(*args, **kwargs)

    def forward_native(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor,
        offsets: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """A PyTorch-native implementation of forward()."""
        if offsets is not None:
            positions = positions + offsets
        positions = positions.flatten()
        num_tokens = positions.shape[0]
        cos_sin = self.cos_sin_cache.index_select(0, positions)
        cos, sin = cos_sin.chunk(2, dim=-1)

        query_shape = query.shape
        query = query.view(num_tokens, -1, self.head_size)
        query_rot = query[..., : self.rotary_dim]
        query_pass = query[..., self.rotary_dim :]
        query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

        key_shape = key.shape
        key = key.view(num_tokens, -1, self.head_size)
        key_rot = key[..., : self.rotary_dim]
        key_pass = key[..., self.rotary_dim :]
        key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
        return query, key

    def extra_repr(self) -> str:
        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
        s += f", max_position_embeddings={self.max_position_embeddings}"
        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
        return s
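
# Example (illustrative; head count and sizes are assumptions): rotating
# flattened q/k for 8 heads of size 64.
#
#   rope = RotaryEmbedding(head_size=64, rotary_dim=64,
#                          max_position_embeddings=2048, base=10000,
#                          is_neox_style=True, dtype=torch.float32)
#   positions = torch.arange(16)
#   q = torch.randn(16, 8 * 64)
#   k = torch.randn(16, 8 * 64)
#   q, k = rope.forward_native(positions, q, k)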

class LinearScalingRotaryEmbedding(RotaryEmbedding):
    # Linear position interpolation: positions are divided by scaling_factor
    # before the cos/sin cache is built.
    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: int | float,
        is_neox_style: bool,
        dtype: torch.dtype,
        scaling_factor: float,
    ) -> None:
        self.scaling_factor = float(scaling_factor)
        super().__init__(
            head_size=head_size,
            rotary_dim=rotary_dim,
            max_position_embeddings=max_position_embeddings,
            base=base,
            is_neox_style=is_neox_style,
            dtype=dtype,
        )

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        inv_freq = self._compute_inv_freq(self.base)
        t = torch.arange(self.max_position_embeddings, dtype=torch.float)
        t = t / self.scaling_factor
        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos()
        sin = freqs.sin()
        cache = torch.cat((cos, sin), dim=-1)
        return cache
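
# Example (illustrative): a 4x position-interpolated cache.
#
#   rope = LinearScalingRotaryEmbedding(64, 64, 8192, 10000, True,
#                                       torch.float32, scaling_factor=4.0)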

class OneDRotaryEmbedding(torch.nn.Module):
    """1D rotary positional embedding with caching."""

    def __init__(
        self,
        dim: int,
        theta: float = 10000.0,
        theta_rescale_factor: float = 1.0,
        interpolation_factor: float = 1.0,
        dtype: torch.dtype = torch.float32,
        use_real: bool = False,
        repeat_interleave_real: bool = False,
    ) -> None:
        super().__init__()
        assert dim % 2 == 0
        self.dim = dim
        self.theta = theta
        self.theta_rescale_factor = theta_rescale_factor
        self.interpolation_factor = interpolation_factor
        self.dtype = dtype
        self.use_real = use_real
        self.repeat_interleave_real = repeat_interleave_real

    def build_freqs(self, theta, device):
        # Inverse-frequency ladder over the even channels of `dim`.
        return 1.0 / (
            theta
            ** (
                torch.arange(0, self.dim, 2, dtype=self.dtype, device=device)[
                    : self.dim // 2
                ]
                / self.dim
            )
        )

    def build_freqs_outer(
        self, pos: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        theta = self.theta
        if self.theta_rescale_factor != 1.0:
            # NTK-style rescaling, spread across the frequency ladder.
            theta *= self.theta_rescale_factor ** (self.dim / (self.dim - 2))
        freqs = self.build_freqs(theta, pos.device)
        freqs = torch.outer(pos * self.interpolation_factor, freqs)
        freqs_cos = freqs.cos()
        freqs_sin = freqs.sin()
        if self.use_real and self.repeat_interleave_real:
            freqs_cos = freqs_cos.repeat_interleave(2, dim=-1)
            freqs_sin = freqs_sin.repeat_interleave(2, dim=-1)
        return freqs_cos.float(), freqs_sin.float()

    @functools.lru_cache(maxsize=None)
    def forward_from_grid(
        self, seq_len: int, start_pos: int, device_str: str
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        device = torch.device(device_str)
        pos = torch.arange(
            start_pos, start_pos + seq_len, dtype=self.dtype, device=device
        )
        freqs_cos, freqs_sin = self.build_freqs_outer(pos)
        return freqs_cos, freqs_sin

    def forward(self, pos: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Calculates 1D rotary embeddings for the given positions.

        This method converts the input tensor to a hashable representation
        and calls a cached helper method to perform the computation.
        """
        pos_tuple = tuple(pos.tolist())
        device_str = str(pos.device)
        return self._forward_cached(pos_tuple, device_str)

    @functools.lru_cache(maxsize=None)
    def _forward_cached(
        self, pos_tuple: Tuple[float, ...], device_str: str
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        The core implementation that computes 1D rotary embeddings.
        This method is wrapped by an LRU cache.
        """
        device = torch.device(device_str)
        pos = torch.as_tensor(pos_tuple, dtype=self.dtype, device=device)
        freqs_cos, freqs_sin = self.build_freqs_outer(pos)
        return freqs_cos, freqs_sin
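
# Example (illustrative): cos/sin for ten 1D positions with a 64-channel rope.
#
#   rope_1d = OneDRotaryEmbedding(dim=64)
#   cos, sin = rope_1d(torch.arange(10.0))  # each of shape [10, 32]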

class NDRotaryEmbedding(torch.nn.Module):
    """N-dimensional rotary positional embedding."""

    def __init__(
        self,
        rope_dim_list: list[int],
        rope_theta: float,
        theta_rescale_factor: float | list[float] = 1.0,
        interpolation_factor: float | list[float] = 1.0,
        use_real: bool = False,
        repeat_interleave_real: bool = False,
        dtype: torch.dtype = torch.float32,
    ) -> None:
        super().__init__()
        self.rope_dim_list = rope_dim_list
        self.ndim = len(rope_dim_list)
        self.rope_theta = rope_theta
        self.dtype = dtype

        # Broadcast scalar / singleton factors to one entry per axis.
        if isinstance(theta_rescale_factor, (int, float)):
            self.theta_rescale_factor = [theta_rescale_factor] * self.ndim
        elif isinstance(theta_rescale_factor, list) and len(theta_rescale_factor) == 1:
            self.theta_rescale_factor = [theta_rescale_factor[0]] * self.ndim
        else:
            self.theta_rescale_factor = theta_rescale_factor
        assert (
            len(self.theta_rescale_factor) == self.ndim
        ), "len(theta_rescale_factor) should equal to len(rope_dim_list)"

        if isinstance(interpolation_factor, (int, float)):
            self.interpolation_factor = [interpolation_factor] * self.ndim
        elif isinstance(interpolation_factor, list) and len(interpolation_factor) == 1:
            self.interpolation_factor = [interpolation_factor[0]] * self.ndim
        else:
            self.interpolation_factor = interpolation_factor
        assert (
            len(self.interpolation_factor) == self.ndim
        ), "len(interpolation_factor) should equal to len(rope_dim_list)"

        # Axes with identical (dim, rescale, interpolation) configurations
        # share one OneDRotaryEmbedding generator, and thus one LRU cache.
        self.rope_generators = torch.nn.ModuleList()
        _config_to_gen_idx: dict[tuple, int] = {}
        self.dim_idx_to_gen_idx: list[int] = []
        for i in range(self.ndim):
            dim_i = self.rope_dim_list[i]
            rescale = self.theta_rescale_factor[i]
            interp = self.interpolation_factor[i]
            config_key = (dim_i, rescale, interp, rope_theta)
            if config_key not in _config_to_gen_idx:
                generator = OneDRotaryEmbedding(
                    dim=dim_i,
                    theta=rope_theta,
                    theta_rescale_factor=rescale,
                    interpolation_factor=interp,
                    dtype=dtype,
                    use_real=use_real,
                    repeat_interleave_real=repeat_interleave_real,
                )
                _config_to_gen_idx[config_key] = len(self.rope_generators)
                self.rope_generators.append(generator)
            gen_idx = _config_to_gen_idx[config_key]
            self.dim_idx_to_gen_idx.append(gen_idx)

    def forward(self, positions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Calculates n-d rotary embeddings for given absolute positions.

        Args:
            positions (torch.Tensor): A tensor of shape `[num_tokens, ndim]`
                containing the integer coordinates for each token.

        Returns:
            A tuple of (cos, sin) tensors.
        """
        pos_tuple = tuple(map(tuple, positions.tolist()))
        device_str = str(positions.device)
        return self._forward_cached(pos_tuple, device_str)

    @functools.lru_cache(maxsize=None)
    def _forward_cached(
        self, pos_tuple: Tuple[Tuple[int, ...], ...], device_str: str
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        The core implementation that computes embeddings from a position tensor.
        This method is wrapped by an LRU cache.
        """
        device = torch.device(device_str)
        positions = torch.tensor(pos_tuple, dtype=torch.long, device=device)
        return self.forward_uncached(positions=positions)

    def forward_uncached(
        self, positions: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        device = positions.device
        num_tokens = positions.shape[0]
        first_generator = self.rope_generators[0]
        if first_generator.use_real and first_generator.repeat_interleave_real:
            head_dim = sum(self.rope_dim_list)
        else:
            head_dim = sum(self.rope_dim_list) // 2
        cos = torch.empty((num_tokens, head_dim), device=device, dtype=self.dtype)
        sin = torch.empty((num_tokens, head_dim), device=device, dtype=self.dtype)
        col_offset = 0
        for i in range(self.ndim):
            pos_i = positions[:, i].to(self.dtype)
            gen_idx = self.dim_idx_to_gen_idx[i]
            generator = self.rope_generators[gen_idx]
            cos_1d, sin_1d = generator(pos_i)
            slice_width = cos_1d.shape[-1]
            cos[:, col_offset : col_offset + slice_width] = cos_1d
            sin[:, col_offset : col_offset + slice_width] = sin_1d
            col_offset += slice_width
        return cos.float(), sin.float()

    def forward_from_grid(
        self,
        grid_size: Tuple[int, ...],
        shard_dim: int = 0,
        start_frame: int = 0,
        device: torch.device | str | None = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Handles sp internally
        """
        device_str = str(device) if device is not None else "cpu"
        return self._forward_cached_from_grid(
            grid_size, shard_dim, start_frame, device_str
        )

    @functools.lru_cache(maxsize=None)
    def _forward_cached_from_grid(
        self,
        grid_size: Tuple[int, ...],
        shard_dim: int,
        start_frame: int,
        device_str: str,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Computes embeddings for a structured grid, using a highly efficient
        implementation that avoids materializing the full position tensor.
        This method is wrapped by an LRU cache.
        """
        device = torch.device(device_str)
        sp_group = get_sp_group()
        sp_rank = sp_group.rank_in_group
        sp_world_size = sp_group.world_size

        sizes = list(_to_tuple(grid_size, dim=self.ndim))
        shard_offsets = [0] * self.ndim
        if sp_world_size > 1:
            assert sizes[shard_dim] % sp_world_size == 0, (
                f"Dimension {shard_dim} with size {sizes[shard_dim]} is not "
                f"divisible by sequence parallel world size {sp_world_size}"
            )
            shard_size = sizes[shard_dim] // sp_world_size
            shard_offsets[shard_dim] = sp_rank * shard_size
            sizes[shard_dim] = shard_size

        num_tokens = 1
        for s in sizes:
            num_tokens *= int(s)
        head_dim_half = sum(self.rope_dim_list) // 2
        cos = torch.empty((num_tokens, head_dim_half), device=device, dtype=self.dtype)
        sin = torch.empty((num_tokens, head_dim_half), device=device, dtype=self.dtype)

        col_offset = 0
        for i in range(self.ndim):
            dim_i = self.rope_dim_list[i]
            dim_i_half = dim_i // 2
            size_i = int(sizes[i])
            base_offset = 0
            if i == 0 and start_frame > 0:
                base_offset += start_frame
            if sp_world_size > 1 and i == shard_dim:
                base_offset += shard_offsets[i]
            gen_idx = self.dim_idx_to_gen_idx[i]
            generator = self.rope_generators[gen_idx]
            cos_1d, sin_1d = generator.forward_from_grid(size_i, base_offset, device_str)

            # Each axis value repeats over all later axes and tiles over all
            # earlier ones, reproducing a flattened meshgrid without ever
            # materializing the full [num_tokens, ndim] position tensor.
            repeats_per_entry = 1
            for j in range(i + 1, self.ndim):
                repeats_per_entry *= int(sizes[j])
            tile_count = 1
            for j in range(0, i):
                tile_count *= int(sizes[j])
            cos_expanded = cos_1d.repeat_interleave(repeats_per_entry, dim=0)
            sin_expanded = sin_1d.repeat_interleave(repeats_per_entry, dim=0)
            if tile_count > 1:
                cos_expanded = cos_expanded.repeat(tile_count, 1)
                sin_expanded = sin_expanded.repeat(tile_count, 1)
            cos[:, col_offset : col_offset + dim_i_half] = cos_expanded
            sin[:, col_offset : col_offset + dim_i_half] = sin_expanded
            col_offset += dim_i_half
        return cos.float(), sin.float()
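
# Example (illustrative; assumes a sequence-parallel group has been
# initialized so get_sp_group() works): a (T, H, W) = (4, 8, 8) grid with a
# 128-dim head split 16/56/56 across the three axes.
#
#   nd_rope = NDRotaryEmbedding([16, 56, 56], rope_theta=10000.0)
#   cos, sin = nd_rope.forward_from_grid((4, 8, 8))  # each [4*8*8, 64]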

def _to_tuple(x: int | Tuple[int, ...], dim: int = 2) -> Tuple[int, ...]:
    if isinstance(x, int):
        return (x,) * dim
    elif len(x) == dim:
        return x
    else:
        raise ValueError(f"Expected length {dim} or int, but got {x}")


def get_meshgrid_nd(
    start: int | Tuple[int, ...], *args, dim: int = 2
) -> torch.Tensor:
    """
    Get n-D meshgrid with start, stop and num.

    Args:
        start (int or tuple): If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop,
            step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num. For n-dim, start/stop/num
            should be int or n-tuple. If n-tuple is provided, the meshgrid will be stacked following the dim order in
            n-tuples.
        *args: See above.
        dim (int): Dimension of the meshgrid. Defaults to 2.

    Returns:
        grid (np.ndarray): [dim, ...]
    """
    if len(args) == 0:
        # start is the grid size
        num = _to_tuple(start, dim=dim)
        start = (0,) * dim
        stop = num
    elif len(args) == 1:
        # start is start, args[0] is stop, step is 1
        start = _to_tuple(start, dim=dim)
        stop = _to_tuple(args[0], dim=dim)
        num = tuple(stop[i] - start[i] for i in range(dim))
    elif len(args) == 2:
        # start is start, args[0] is stop, args[1] is num
        start = _to_tuple(start, dim=dim)
        stop = _to_tuple(args[0], dim=dim)
        num = _to_tuple(args[1], dim=dim)
    else:
        raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")

    axis_grid = []
    for i in range(dim):
        a, b, n = start[i], stop[i], num[i]
        # Endpoint-exclusive spacing, matching np.linspace(..., endpoint=False).
        g = torch.linspace(a, b, n + 1, dtype=torch.float32)[:n]
        axis_grid.append(g)
    grid = torch.meshgrid(*axis_grid, indexing="ij")  # dim x [W, H, D]
    grid = torch.stack(grid, dim=0)  # [dim, W, H, D]

    return grid
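
# Example (illustrative): a 2x3 grid of (row, col) coordinates.
#
#   grid = get_meshgrid_nd((2, 3), dim=2)  # shape [2, 2, 3]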

def get_1d_rotary_pos_embed(
    dim: int,
    pos: torch.FloatTensor | int,
    theta: float = 10000.0,
    theta_rescale_factor: float = 1.0,
    interpolation_factor: float = 1.0,
    dtype: torch.dtype = torch.float32,
    device: torch.device | str | None = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Precompute the frequency tensor for complex exponential (cis) with given dimensions.
    (Note: `cis` means `cos + i * sin`, where i is the imaginary unit.)

    This function calculates a frequency tensor with complex exponential using the given dimension 'dim'
    and the end index 'end'. The 'theta' parameter scales the frequencies.

    Args:
        dim (int): Dimension of the frequency tensor.
        pos (int or torch.FloatTensor): Position indices for the frequency tensor. [S] or scalar
        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
        theta_rescale_factor (float, optional): Rescale factor for theta. Defaults to 1.0.
        interpolation_factor (float, optional): Factor to scale positions. Defaults to 1.0.

    Returns:
        freqs_cos, freqs_sin: Precomputed frequency tensor with real and imaginary parts separately. [S, D]
    """
    if isinstance(pos, int):
        pos = torch.arange(pos, dtype=dtype, device=device)
    elif (
        isinstance(pos, torch.Tensor)
        and device is not None
        and pos.device != torch.device(device)
    ):
        pos = pos.to(device)

    if theta_rescale_factor != 1.0:
        # NTK-style theta rescale, spread across the frequency ladder.
        theta *= theta_rescale_factor ** (dim / (dim - 2))

    freqs = 1.0 / (
        theta ** (torch.arange(0, dim, 2, dtype=dtype)[: dim // 2] / dim)
    ).to(device)
    freqs = torch.outer(pos * interpolation_factor, freqs)  # [S, D/2]
    freqs_cos = freqs.cos()
    freqs_sin = freqs.sin()
    return freqs_cos, freqs_sin
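
# Example (illustrative): 16 positions, 32 channels.
#
#   cos, sin = get_1d_rotary_pos_embed(32, 16)  # each of shape [16, 16]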

def get_nd_rotary_pos_embed(
    rope_dim_list: list[int],
    start: int | Tuple[int, ...] | list[int],
    *args,
    theta: float = 10000.0,
    theta_rescale_factor: float | list[float] = 1.0,
    interpolation_factor: float | list[float] = 1.0,
    shard_dim: int = 0,
    sp_rank: int = 0,
    sp_world_size: int = 1,
    dtype: torch.dtype = torch.float32,
    start_frame: int = 0,
    device: torch.device | str | None = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    This is a n-d version of precompute_freqs_cis, which is a RoPE for tokens with n-d structure.
    Supports sequence parallelism by allowing sharding of a specific dimension.

    Args:
        rope_dim_list (list of int): Dimension of each rope. len(rope_dim_list) should equal to n.
            sum(rope_dim_list) should equal to head_dim of attention layer.
        start (int | tuple of int | list of int): If len(args) == 0, start is num; If len(args) == 1, start is start,
            args[0] is stop, step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num.
        *args: See above.
        theta (float): Scaling factor for frequency computation. Defaults to 10000.0.
        theta_rescale_factor (float): Rescale factor for theta. Defaults to 1.0.
        interpolation_factor (float): Factor to scale positions. Defaults to 1.0.
        shard_dim (int): Which dimension to shard for sequence parallelism. Defaults to 0.
        sp_rank (int): Rank in the sequence parallel group. Defaults to 0.
        sp_world_size (int): World size of the sequence parallel group. Defaults to 1.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: (cos, sin) tensors of shape [HW, D/2]
    """
    ndim = len(rope_dim_list)
    if len(args) == 0:
        sizes = _to_tuple(start, dim=ndim)
        starts = (0,) * ndim
    elif len(args) == 1:
        starts = _to_tuple(start, dim=ndim)
        stops = _to_tuple(args[0], dim=ndim)
        sizes = tuple(stops[i] - starts[i] for i in range(ndim))
    elif len(args) == 2:
        starts = _to_tuple(start, dim=ndim)
        stops = _to_tuple(args[0], dim=ndim)
        sizes = _to_tuple(args[1], dim=ndim)
    else:
        raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")

    assert (
        shard_dim < ndim
    ), f"shard_dim {shard_dim} must be less than number of dimensions {ndim}"

    sizes = list(sizes)
    starts = list(starts)
    if sp_world_size > 1:
        assert sizes[shard_dim] % sp_world_size == 0, (
            f"Dimension {shard_dim} with size {sizes[shard_dim]} is not "
            f"divisible by sequence parallel world size {sp_world_size}"
        )
        shard_size = sizes[shard_dim] // sp_world_size
        starts[shard_dim] += sp_rank * shard_size
        sizes[shard_dim] = shard_size

    if isinstance(theta_rescale_factor, (int, float)):
        theta_rescale_factor = [theta_rescale_factor] * ndim
    elif isinstance(theta_rescale_factor, list) and len(theta_rescale_factor) == 1:
        theta_rescale_factor = [theta_rescale_factor[0]] * ndim
    assert (
        len(theta_rescale_factor) == ndim
    ), "len(theta_rescale_factor) should equal to len(rope_dim_list)"

    if isinstance(interpolation_factor, (int, float)):
        interpolation_factor = [interpolation_factor] * ndim
    elif isinstance(interpolation_factor, list) and len(interpolation_factor) == 1:
        interpolation_factor = [interpolation_factor[0]] * ndim
    assert (
        len(interpolation_factor) == ndim
    ), "len(interpolation_factor) should equal to len(rope_dim_list)"

    num_tokens = 1
    for s in sizes:
        num_tokens *= int(s)
    head_dim_half = sum(rope_dim_list) // 2
    cos = torch.empty((num_tokens, head_dim_half), dtype=dtype, device=device)
    sin = torch.empty((num_tokens, head_dim_half), dtype=dtype, device=device)

    col_offset = 0
    for i in range(ndim):
        dim_i = int(rope_dim_list[i])
        dim_i_half = dim_i // 2
        size_i = int(sizes[i])
        base_offset = starts[i]
        if i == 0 and start_frame > 0:
            base_offset += start_frame

        pos = torch.arange(size_i, dtype=dtype, device=device) + base_offset
        cos_1d, sin_1d = get_1d_rotary_pos_embed(
            dim_i,
            pos,
            theta=theta,
            theta_rescale_factor=theta_rescale_factor[i],
            interpolation_factor=interpolation_factor[i],
            dtype=dtype,
        )

        # Flattened-meshgrid expansion: repeat over later axes, tile over
        # earlier ones.
        repeats_per_entry = 1
        for j in range(i + 1, ndim):
            repeats_per_entry *= int(sizes[j])
        tile_count = 1
        for j in range(0, i):
            tile_count *= int(sizes[j])
        cos_expanded = cos_1d.repeat_interleave(repeats_per_entry, dim=0)
        sin_expanded = sin_1d.repeat_interleave(repeats_per_entry, dim=0)
        if tile_count > 1:
            cos_expanded = cos_expanded.repeat(tile_count, 1)
            sin_expanded = sin_expanded.repeat(tile_count, 1)
        cos[:, col_offset : col_offset + dim_i_half] = cos_expanded
        sin[:, col_offset : col_offset + dim_i_half] = sin_expanded
        col_offset += dim_i_half
    return cos, sin
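
# Example (illustrative): the functional counterpart of the NDRotaryEmbedding
# call shown earlier, for a (4, 8, 8) grid with axis dims 16/56/56.
#
#   cos, sin = get_nd_rotary_pos_embed([16, 56, 56], (4, 8, 8))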

def get_rotary_pos_embed(
    rope_sizes,
    hidden_size: int,
    heads_num: int,
    rope_dim_list: list[int] | None,
    rope_theta: float,
    theta_rescale_factor: float | list[float] = 1.0,
    interpolation_factor: float | list[float] = 1.0,
    shard_dim: int = 0,
    dtype: torch.dtype = torch.float32,
    device: torch.device | str | None = None,
    start_frame: int = 0,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Generate rotary positional embeddings for the given sizes.

    Args:
        rope_sizes: Tuple of dimensions (t, h, w)
        hidden_size: Hidden dimension size
        heads_num: Number of attention heads
        rope_dim_list: List of dimensions for each axis, or None
        rope_theta: Base for frequency calculations
        theta_rescale_factor: Rescale factor for theta. Defaults to 1.0
        interpolation_factor: Factor to scale positions. Defaults to 1.0
        shard_dim: Which dimension to shard for sequence parallelism. Defaults to 0.

    Returns:
        Tuple of (cos, sin) tensors for rotary embeddings
    """
    target_ndim = 3
    head_dim = hidden_size // heads_num

    if rope_dim_list is None:
        rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
    assert (
        sum(rope_dim_list) == head_dim
    ), "sum(rope_dim_list) should equal to head_dim of attention layer"

    key = (
        tuple(rope_dim_list),
        rope_theta,
        tuple(theta_rescale_factor)
        if isinstance(theta_rescale_factor, list)
        else theta_rescale_factor,
        tuple(interpolation_factor)
        if isinstance(interpolation_factor, list)
        else interpolation_factor,
        dtype,
    )
    cache_hit = key in _ND_ROPE_CACHE
    if cache_hit:
        # Re-insert so the OrderedDict keeps least-recently-used order.
        rope_emb = _ND_ROPE_CACHE.pop(key)
        _ND_ROPE_CACHE[key] = rope_emb
    else:
        rope_emb = NDRotaryEmbedding(
            rope_dim_list=rope_dim_list,
            rope_theta=rope_theta,
            theta_rescale_factor=theta_rescale_factor,
            interpolation_factor=interpolation_factor,
            dtype=dtype,
        )
        _ND_ROPE_CACHE[key] = rope_emb
        # Keep the module cache bounded; evict the least-recently-used entry.
        if len(_ND_ROPE_CACHE) > 8:
            _ND_ROPE_CACHE.pop(next(iter(_ND_ROPE_CACHE)))
    cos, sin = rope_emb.forward_from_grid(
        grid_size=tuple(rope_sizes),
        shard_dim=shard_dim,
        start_frame=start_frame,
        device=device,
    )
    return cos, sin


_ROPE_DICT: dict[tuple, Any] = {}
_ND_ROPE_CACHE: "OrderedDict[tuple, NDRotaryEmbedding]" = OrderedDict()
_ROPE_3D_CACHE: "OrderedDict[tuple, tuple[torch.Tensor, torch.Tensor]]" = OrderedDict()


def get_rope(
    head_size: int,
    rotary_dim: int,
    max_position: int,
    base: int | float,
    is_neox_style: bool = True,
    rope_scaling: dict[str, Any] | None = None,
    dtype: torch.dtype | None = None,
    partial_rotary_factor: float = 1.0,
) -> RotaryEmbedding:
    if dtype is None:
        dtype = torch.get_default_dtype()
    if rope_scaling is not None:
        # Lists are unhashable; normalize them so the config can key a dict.
        rope_scaling_tuple = {
            k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items()
        }
        rope_scaling_args = tuple(rope_scaling_tuple.items())
    else:
        rope_scaling_args = None
    if partial_rotary_factor < 1.0:
        rotary_dim = int(rotary_dim * partial_rotary_factor)

    max_position_embeddings = max_position
    scaling_type = None
    if rope_scaling is not None:
        scaling_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
        if scaling_type in (None, "default"):
            scaling_type = None
        elif scaling_type == "linear":
            scaling_factor = float(rope_scaling.get("factor", 1.0))
            original_max = rope_scaling.get("original_max_position_embeddings")
            if original_max is not None:
                max_position_embeddings = max(
                    max_position_embeddings, int(float(original_max) * scaling_factor)
                )

    key = (
        head_size,
        rotary_dim,
        max_position_embeddings,
        base,
        is_neox_style,
        rope_scaling_args,
        dtype,
    )
    if key in _ROPE_DICT:
        return _ROPE_DICT[key]

    if scaling_type is None:
        rotary_emb = RotaryEmbedding(
            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
        )
    elif scaling_type == "linear":
        scaling_factor = float(rope_scaling.get("factor", 1.0))
        rotary_emb = LinearScalingRotaryEmbedding(
            head_size=head_size,
            rotary_dim=rotary_dim,
            max_position_embeddings=max_position_embeddings,
            base=base,
            is_neox_style=is_neox_style,
            dtype=dtype,
            scaling_factor=scaling_factor,
        )
    else:
        raise ValueError(f"Unknown RoPE scaling {scaling_type}")
    _ROPE_DICT[key] = rotary_emb
    return rotary_emb
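
# Example (illustrative; arguments are assumptions chosen to satisfy the
# asserts, and the second call assumes an initialized sequence-parallel group):
#
#   rope = get_rope(head_size=64, rotary_dim=64, max_position=2048, base=10000)
#   cos_sin = rope.cos_sin_cache  # [2048, 64]
#   cos, sin = get_rotary_pos_embed((4, 8, 8), 768, 8, None, 10000.0)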