o
    پi$                     @  s\  d dl mZ d dlmZmZ d dlmZmZ d dlm	Z	 d dl
mZmZmZmZ d dlZd dlZd dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z# erld dl$m%Z% e! Z&ee'ee' f Z(	 eG dd dZ)d5ddZ*d6ddZ+d7ddZ,d8d!d"Z-G d#d$ d$Z.		%d9d:d2d3Z/e#ed&d'gd4ZdS );    )annotations)IterableMapping)	dataclassfield)	lru_cache)TYPE_CHECKINGAnyOptionalTupleN)can_use_fused_inplace_qknormfused_inplace_qknorm)envs)RadixAttention)	SWAKVPool)get_is_capture_mode)ForwardBatch)get_current_device_stream_fastis_cuda)register_custom_op)RMSNormc                   @  sp   e Zd ZU dZeedZded< eedZded< eedZ	ded< dddZ
dddZdddZdddZdS )WeightsMapperzBMaps the name of each weight if they match the following patterns.)default_factoryWeightsMappingorig_to_new_substrorig_to_new_prefixorig_to_new_suffixkeystrreturnOptional[str]c                 C  s   t | j dd ddD ]\}}||v r$|d u r d S |||d} nqt | j dd ddD ]\}}||rJ|d u rA d S |||d} nq1t | j dd ddD ]\}}||rs|d u rg d S ||	|d} |S qW|S )Nc                 S     t | d S Nr   leni r'   K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/utils.py<lambda>4       z)WeightsMapper._map_name.<locals>.<lambda>T)r   reverse   c                 S  r!   r"   r#   r%   r'   r'   r(   r)   >   r*   c                 S  r!   r"   r#   r%   r'   r'   r(   r)   H   r*   )
sortedr   itemsreplacer   
startswithr   endswithjoinrsplit)selfr   substrnew_keyprefixsuffixr'   r'   r(   	_map_name2   s:   

zWeightsMapper._map_nameweights"Iterable[tuple[str, torch.Tensor]]c                       fdd|D S )Nc                 3  s.    | ]\}} |  d ur |fV  qd S Nr9   ).0namedataout_namer4   r'   r(   	<genexpr>V   s    z&WeightsMapper.apply.<locals>.<genexpr>r'   )r4   r:   r'   rB   r(   applyS   s   zWeightsMapper.applyvalues	list[str]c                   r<   )Nc                   s"   g | ]} |  d ur qS r=   r>   )r?   r@   rB   r'   r(   
<listcomp>]   s
    z,WeightsMapper.apply_list.<locals>.<listcomp>r'   r4   rF   r'   rB   r(   
apply_list\   s   zWeightsMapper.apply_listdict[str, Any]c                   s    fdd|  D S )Nc                   s(   i | ]\}} |  d ur |qS r=   r>   )r?   r@   valuerB   r'   r(   
<dictcomp>d   s
    z,WeightsMapper.apply_dict.<locals>.<dictcomp>)r.   rI   r'   rB   r(   
apply_dictc   s   zWeightsMapper.apply_dictN)r   r   r   r    )r:   r;   r   r;   )rF   rG   r   rG   )rF   rK   r   rK   )__name__
__module____qualname____doc__r   dictr   __annotations__r   r   r9   rE   rJ   rN   r'   r'   r'   r(   r   *   s   
 

!
	r   forward_batchr   c                 C  s,   t ot| jdo| jjtjkot| jt S )z?Enable fused set_kv_buffer only on CUDA with bfloat16 KV cache.dtype)_is_cudahasattrtoken_to_kv_poolrV   torchbfloat16
isinstancer   )rU   r'   r'   r(   enable_fused_set_kv_bufferk   s   
r]   rL   torch.Tensorlayerr   c                 C  sb   ddl m} |j}|j}||}||}|| ||jd d||jd d|j|j	|j
dS )Nr   )FusedSetKVBufferArg)rL   k_bufferv_bufferk_scalev_scale	cache_loc)
sgl_kernelr`   layer_idrY   get_key_bufferget_value_bufferviewshaperd   re   out_cache_loc)rL   r_   rU   r`   rh   rY   rb   rc   r'   r'   r(   create_fused_set_kv_buffer_argu   s   

rn   permr   c                 C  s*   t | }t j|  | j| jd|| < |S )N)devicerV   )rZ   
empty_likearangenumelrp   rV   )ro   inv_permr'   r'   r(   permute_inv   s   
ru   grid_thwc                 C  s   | j jdks
J d|  }t|dddf |dddf  |dddf jdtjd}ttjdtjd|g}t	
|}|S )	z
    Compute cu_seqlens from grid_thw using NumPy.

    grid_thw: [T, 3] int tensor on CPU.
              columns: [repeat_count, H, W]
    Returns:
        cu_seqlens: 1D int32 tensor on CPU, shape [N + 1]
    cpuz7compute_cu_seqlens_from_grid_numpy expects a CPU tensorNr,      r   )axisrV   )rV   )rp   typenumpynprepeatcumsumint32concatenatezerosrZ   
from_numpy)rv   arr
cu_seqlensr'   r'   r(   "compute_cu_seqlens_from_grid_numpy   s   
4
r   c                   @  s$   e Zd Zeedddd	d
ZdS )RotaryPosMixini   )maxsizehintwspatial_merge_sizer   r^   c                 C  s   t | tjrt|  } t |tjrt| }t |tjr$t| }tt| | d| |f}| | }|| }|||||}|	dddd}|
 }tt|d|| |f}|||||}|	dddd}|
 }ttj||gddS )Nr,   r   rx      ra   )ry   )r\   rZ   Tensorr   itemr|   broadcast_torr   reshape	transposeflattenr   stack)r   r   r   hpos_idsh_divw_divwpos_idsr'   r'   r(   rot_pos_ids   s6   zRotaryPosMixin.rot_pos_idsN)r   r   r   r   r   r   r   r^   )rO   rP   rQ   staticmethodr   r   r'   r'   r'   r(   r      s    r   Tqkq_normr   k_normhead_dimr   
alt_streamOptional[torch.cuda.Stream]allow_inplacebool!Tuple[torch.Tensor, torch.Tensor]c                 C  s6  |  d}|j}|j}	tr8|r8||	kr8tj s8t|| jr8t| 	|d||	|d||j
|j
||d | |fS |durwt rwt }
||
 | d|}||}tj| |d|}||}W d   n1 slw   Y  |
| n| d|}||}|d|}||}|	| j} |	|j}| |fS )a  
    Apply QK normalization for query and key tensors.
    If eligible, we will use JIT fused inplace QK normalization for better performance.

    Args:
        q: Query tensor of shape [batch_size, ...]
        k: Key tensor of shape [batch_size, ...]
        q_norm: RMSNorm layer for query normalization
        k_norm: RMSNorm layer for key normalization
        head_dim: Dimension of each attention head
        alt_stream: Optional alternative CUDA stream for overlapping computation
        allow_inplace: Whether to allow inplace normalization. (True for better performance)

    Returns:
        Tuple of normalized query and key tensors
    r   ra   )r   r   q_weightk_weightr   epsN)sizevariance_epsilonrW   r   %SGLANG_ENABLE_DETERMINISTIC_INFERENCEgetr   rV   r   rk   weightr   r   wait_streamr   rZ   cudastreamrl   )r   r   r   r   r   r   r   
batch_sizeq_epsk_epscurrent_stream	q_by_head	k_by_headr'   r'   r(   apply_qk_norm   sL   



r   )mutates_args)rU   r   )rL   r^   r_   r   rU   r   )ro   r^   r   r^   )rv   r^   r   r^   )NT)r   r^   r   r^   r   r   r   r   r   r   r   r   r   r   r   r   )0
__future__r   collections.abcr   r   dataclassesr   r   	functoolsr   typingr   r	   r
   r   r{   r|   rZ   sglang.jit_kernel.normr   r   sglang.srt.environr   !sglang.srt.layers.radix_attentionr   $sglang.srt.mem_cache.swa_memory_poolr   +sglang.srt.model_executor.cuda_graph_runnerr   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.utilsr   r   sglang.srt.utils.custom_opr   sglang.srt.layers.layernormr   rW   r   r   r   r]   rn   ru   r   r   r   r'   r'   r'   r(   <module>   s>   
@



*B