o
    پi!                    @  st  d Z ddlmZ 	 ddlZddlZddlZddlmZmZ ddlm	Z	 ddl
mZmZmZmZmZmZ ddlZddlZddlZddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7 e5eddgdZerddl8m9Z9 ddl:m;Z; e<e=Z>dZ?e0 Z@e2 ZAe/ ZBe. ZCe1 ZDdQddZE		dRdSd-d.ZFG d/d0 d0ZGG d1d2 d2ZHG d3d4 d4eGZIG d5d6 d6ejJZKG d7d8 d8eKZLG d9d: d:eLZMG d;d< d<eKZNG d=d> d>eKZOG d?d@ d@eOZPG dAdB dBeOZQG dCdD dDeKZRdTdJdKZSejTdUdOdPZUdS )Va-  
Copyright 2023-2024 SGLang Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    )annotationsN)contextmanagernullcontext)	dataclass)TYPE_CHECKINGAnyListOptionalTupleUnion)can_use_store_cachestore_cache)BaseLinearStateParams)GPU_MEMORY_TYPE_KV_CACHE)envs)index_buf_accessor)quantize_k_cachequantize_k_cache_separate)RadixAttention)get_mla_kv_buffer_tritonmaybe_init_custom_mem_poolset_mla_kv_buffer_tritonset_mla_kv_scale_buffer_triton)cpu_has_amx_supportis_cpuis_cudais_hipis_npunext_power_of_2)register_custom_op)TorchMemorySaverAdapterk_cachev_cache)mutates_args)LayerDoneCounter)Reqi   @t'Union[torch.Tensor, List[torch.Tensor]]c                 C  s0   t | trtdd | D S t| j| jj S )Nc                 s  s    | ]}t |V  qd S N)get_tensor_size_bytes.0x r-   T/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/mem_cache/memory_pool.py	<genexpr>U       z(get_tensor_size_bytes.<locals>.<genexpr>)
isinstancelistsumnpprodshapedtypeitemsize)r&   r-   r-   r.   r)   S   s   
r)   Tktorch.Tensorvindicesrow_dimintstore_dtypetorch.dtypedevice_moduler   
alt_streamOptional[torch.cuda.Stream]same_kv_dimboolreturnNonec
                 C  s   ||j  }
tr'|	r't|
r't| d||d||d||d|||
dS ddlm} | ra|d ura| }|| | ||< |	| |||< W d    n1 sUw   Y  || d S | ||< |||< d S )N)	row_bytesr   get_is_capture_mode)
r8   _is_cudar   r   view+sglang.srt.model_executor.cuda_graph_runnerrK   current_streamwait_streamstream)r9   r;   r!   r"   r<   r=   r?   rA   rB   rD   rI   rK   rO   r-   r-   r.   _set_kv_buffer_implY   s*   




	

rR   c                   @  sF   e Zd ZdZdd	d
Zdd Zdd ZdddZdddZdd Z	dS )ReqToTokenPool9A memory pool that maps a request to its token locations.sizer>   max_context_lendevicestrenable_memory_saverrE   c                 C  sr   t j|d}|| _|| _|| _|t tj||ftj	|d| _
W d    n1 s+w   Y  tt|| _d S )Nenabler7   rW   )r    createrU   rV   rW   regionr   torchzerosint32req_to_tokenr2   range
free_slots)selfrU   rV   rW   rY   memory_saver_adapterr-   r-   r.   __init__   s   
zReqToTokenPool.__init__c                 C  s   || j |< d S r(   )rb   )re   r<   valuesr-   r-   r.   write   s   zReqToTokenPool.writec                 C  
   t | jS r(   lenrd   re   r-   r-   r.   available_size      
zReqToTokenPool.available_sizereqs	list[Req]rF   Optional[List[int]]c                   s   dd t  D }tdd  D st|dksJ dt fdd|D s+J dt t| }|t| jkr<d S | jd | }| j|d  | _d	} D ]}|jd u r_|| |_|d7 }qOd
d  D S )Nc                 S  s   g | ]\}}|j d ur|qS r(   req_pool_idx)r+   irr-   r-   r.   
<listcomp>   s    z(ReqToTokenPool.alloc.<locals>.<listcomp>c                 s  s    | ]}|  V  qd S r(   )is_dllmr+   rv   r-   r-   r.   r/      r0   z'ReqToTokenPool.alloc.<locals>.<genexpr>   z:only one chunked request may reuse req_pool_idx in a batchc                 3  s,    | ]} | j d kp | jd kV  qdS )r   N)
is_chunkedkv_committed_lenr+   ru   rp   r-   r.   r/      s    
z+request has req_pool_idx but is not chunkedr   c                 S  s   g | ]}|j qS r-   rs   ry   r-   r-   r.   rw      s    )	enumerateanyrl   allrd   rt   )re   rp   chunked	need_sizeselect_indexoffsetrv   r-   r~   r.   alloc   s,   


zReqToTokenPool.allocreqr%   c                 C  s*   |j d us	J d| j|j  d |_ d S )Nzrequest must have req_pool_idx)rt   rd   append)re   r   r-   r-   r.   free   s   
zReqToTokenPool.freec                 C  s   t t| j| _d S r(   )r2   rc   rU   rd   rm   r-   r-   r.   clear      zReqToTokenPool.clearN)rU   r>   rV   r>   rW   rX   rY   rE   )rp   rq   rF   rr   )r   r%   )
__name__
__module____qualname____doc__rg   ri   rn   r   r   r   r-   r-   r-   r.   rS   ~   s    


rS   c                   @  s   e Zd ZedddG dd dZedddG dd deZddd	d3ddZd4ddZd5ddZdd Z	d6d!d"Z
d7d%d&Zd'd( Zd8d+d,Zd9d-d.Zd/d0 Zd1d2 ZdS ):	MambaPoolT)frozenkw_onlyc                   @  s0   e Zd ZU ded< ded< dddZd	d
 ZdS )zMambaPool.StateList[torch.Tensor]convr:   temporallayerr>   c                   s`   i }t |  D ]\}}|dks|dkr  fdd|D ||< q|  ||< qt| di |S )Nr   intermediate_conv_windowc                   s   g | ]}|  qS r-   r-   )r+   r   r   r-   r.   rw          z0MambaPool.State.at_layer_idx.<locals>.<listcomp>r-   )varsitemstype)re   r   kwargsr9   r;   r-   r   r.   at_layer_idx   s   zMambaPool.State.at_layer_idxc                   s   t  fddt D S )Nc                 3  s     | ]}t t |jV  qd S r(   )r)   getattrname)r+   frm   r-   r.   r/      s
    
z2MambaPool.State.mem_usage_bytes.<locals>.<genexpr>)r3   dataclassesfieldsrm   r-   rm   r.   mem_usage_bytes   s   zMambaPool.State.mem_usage_bytesN)r   r>   )r   r   r   __annotations__r   r   r-   r-   r-   r.   State   s
   
 
	r   c                   @  s   e Zd ZU ded< ded< dS )MambaPool.SpeculativeStater:   intermediate_ssmr   r   N)r   r   r   r   r-   r-   r-   r.   SpeculativeState   s   
 r   FN)rY   speculative_num_draft_tokensrU   r>   spec_state_sizecache_paramsr   rW   rX   rY   rE   r   Optional[int]c                  sT  |j j}|j j}|jj |jj}	tj|d| _t|j| _	| _
t| j
d\| _| _}
| jt | jr>tj| jnt   fdd|D }tr\tr\ddlm} ||}tjd f| |	d}d urtjd |d |d |d	 f|	d
d} fdd|D }| j||||d| _td dt|t ddt|t ddt|t ddt|t dd n!| j||d| _td dt|t ddt|t dd tj d| j	d tj!| j
d| _"| j# t | _$| _%W d    n1 s
w   Y  W d    d S W d    d S 1 s#w   Y  d S )NrZ   rW   c                   s(   g | ]}t jd  f|  dqS )rz   rU   r7   rW   r_   r`   r+   
conv_shape)
conv_dtyperW   num_mamba_layersrU   r-   r.   rw      s    z&MambaPool.__init__.<locals>.<listcomp>r   )_init_amx_conv_staterz   r      cudac              	     s2   g | ]}t jd  |d |d  f ddqS )rz   r   r   r   r   r   )r   r   r   r   r-   r.   rw     s    )r   r   r   r   z0Mamba Cache is allocated. max_mamba_cache_size: z, conv_state size: .2fzGB, ssm_state size: z&GB intermediate_ssm_state_cache size: z(GB intermediate_conv_window_cache size: zGB )r   r   r\   )&r6   r   r   r7   r    r]   rf   rl   layersrU   rW   r   enable_custom_mem_poolcustom_mem_poolr^   r   r_   r   use_mem_poolr   _is_cpu_cpu_has_amx_supportsglang.srt.layers.amx_utilsr   r`   r   mamba_cacheloggerinfor)   GBr   arangeint64rd   r   	mem_usager   )re   rU   r   r   rW   rY   r   conv_state_shapetemporal_state_shape	ssm_dtype_
conv_stater   temporal_stateintermediate_ssm_state_cacheintermediate_conv_window_cacher-   )r   rW   r   rU   r   r   r.   rg      s   


		TzMambaPool.__init__rF   c                 C  s   t | j| js	J | jS r(   )r1   r   r   rm   r-   r-   r.   (get_speculative_mamba2_params_all_layersD  s   z2MambaPool.get_speculative_mamba2_params_all_layerslayer_idc                 C  s   | j |S r(   )r   r   re   r   r-   r-   r.   mamba2_layer_cacheH  s   zMambaPool.mamba2_layer_cachec                 C  rj   r(   rk   rm   r-   r-   r.   rn   K  ro   zMambaPool.available_sizer   Optional[torch.Tensor]c                 C  sv   |t | jkr	d S | jd | }| j|d  | _tt | jjD ]}d| jj| d d |f< q d| jjd d |f< |S Nr   )rl   rd   rc   r   r   r   )re   r   r   ru   r-   r-   r.   r   N  s   zMambaPool.alloc
free_indexr:   c                 C  s&   |  dkrd S t| j|f| _d S r   )numelr_   catrd   )re   r   r-   r-   r.   r   [  s   zMambaPool.freec                 C  s"   t jd| jd t j| jd| _d S )Nrz   r\   )r_   r   rU   r   rW   rd   rm   r-   r-   r.   r   `  s   zMambaPool.clear	src_index	dst_indexc                 C  sj   t t| jjD ]}| jj| d d |f | jj| d d |f< q| jjd d |f | jjd d |f< d S r(   )rc   rl   r   r   r   )re   r   r   ru   r-   r-   r.   	copy_frome  s   


zMambaPool.copy_fromc                 C  s&   |  d}|d krd S | || |S )Nrz   )r   r   )re   r   r   r-   r-   r.   	fork_fromo  s
   
zMambaPool.fork_fromc                   s   g }t | jD ]}|dv rqt| j|}t|tr|| q|| qg g g }}}t|D ].\} | fddt| j	D 7 }| fddt| j	D 7 }| fddt| j	D 7 }q1|||fS )z
        Get buffer info for RDMA registration.
        Only returns conv and temporal state buffers, excluding intermediate buffers
        used for speculative decoding (intermediate_ssm, intermediate_conv_window).
        )r   r   c                   s   g | ]} |   qS r-   data_ptrr}   state_tensorr-   r.   rw         z6MambaPool.get_contiguous_buf_infos.<locals>.<listcomp>c                   s   g | ]} | j qS r-   nbytesr}   r   r-   r.   rw         c                   s   g | ]	} | d  j qS r   r   r}   r   r-   r.   rw         )
r   r   r   r1   r2   extendr   r   rc   r   )re   state_tensorsfieldvalue	data_ptrs	data_lens	item_lensr   r-   r   r.   get_contiguous_buf_infosv  s$   


z"MambaPool.get_contiguous_buf_infosc                 C  sl   g }t | jD ]}t| j|}t|tr|| q|| qg }|D ]}|jd }||g| j 7 }q$|S )a  Get the sliceable dimension size for each state tensor.

        For mamba state, the layout is:
        - conv_state: [num_layers, size+1, conv_dim/tp, conv_kernel-1]
        - temporal_state: [num_layers, size+1, num_heads/tp, head_dim, state_size]

        The 3rd dimension (index 2) is the one that gets sliced by TP.
        Returns the size of this dimension for each tensor (repeated for each layer).
        r   )	r   r   r   r1   r2   r   r   r6   r   )re   r   r   r   dim_per_tensorr   sliceable_dimr-   r-   r.   get_state_dim_per_tensor  s   


z"MambaPool.get_state_dim_per_tensor)rU   r>   r   r>   r   r   rW   rX   rY   rE   r   r   )rF   r   r   r>   )r   r>   rF   r   )r   r:   )r   r:   r   r:   )r   r:   rF   r   )r   r   r   r   r   r   rg   r   r   rn   r   r   r   r   r   r   r   r-   r-   r-   r.   r      s$    


p





r   c                      s   e Zd ZdZddd0 fddZ	d1d2ddZd3 fddZd4ddZd5d d!Zd6d#d$Z	d7d&d'Z
	d1d8d,d-Z fd.d/Z  ZS )9HybridReqToTokenPoolrT   N)r   rU   r>   
mamba_sizemamba_spec_state_sizerV   rW   rX   rY   rE   r   r   enable_mamba_extra_bufferr   c       	   
        sL   t  j||||d |	d u rdnd| _|| _|| _| j||||||	d d S )N)rU   rV   rW   rY   r   rz   )rU   r   r   rW   r   r   )superrg   !mamba_ping_pong_track_buffer_sizer   rY   _init_mamba_pool)
re   rU   r   r   rV   rW   rY   r   r   r   	__class__r-   r.   rg     s$   
zHybridReqToTokenPool.__init__c                 C  sr   t ||||| j|d| _dd t|jD | _|| _tj|tj	| jd| _
|r7tj|| jftj	| jd| _d S d S )N)rU   r   r   rW   rY   r   c                 S     i | ]\}}||qS r-   r-   )r+   ru   r   r-   r-   r.   
<dictcomp>  r   z9HybridReqToTokenPool._init_mamba_pool.<locals>.<dictcomp>r\   )r   rY   
mamba_poolr   r   	mamba_maprW   r_   r`   ra    req_index_to_mamba_index_mappingr   1req_index_to_mamba_ping_pong_track_buffer_mapping)re   rU   r   r   rW   r   r   r-   r-   r.   r     s*   	
z%HybridReqToTokenPool._init_mamba_poolrp   List['Req']rF   rr   c                   sV  t  |}|d u rd S g }g }|D ]^}d }|jd ur|j}n(| jd}|d us@J d|d| jjd| j dt||d }||_|| | jrp|j	d u rh| j| j
|_	|j	d useJ dd|_||j	  qt|t|ks}J d| jrt|t|ksJ dtj|tj| jd	| j|< | jrtj|tj| jd	| j|< |S )
Nrz   zkNot enough space for mamba cache, try to increase --mamba-full-memory-ratio or --max-mamba-cache-size. mid=z, self.mamba_pool.size=z#, self.mamba_pool.available_size()=z, len(reqs)=r   zTNot enough space for mamba ping pong idx, try to increase --mamba-full-memory-ratio.zfNot enough space for mamba cache, try to increase --mamba-full-memory-ratio or --max-mamba-cache-size.r\   )r   r   mamba_pool_idxr   rU   rn   rl   r   r   mamba_ping_pong_track_bufferr   mamba_next_track_idxtolistr_   tensorra   rW   r   r   )re   rp   r   mamba_index!mamba_ping_pong_track_buffer_listr   midr   r-   r.   r     sf   

(


zHybridReqToTokenPool.allocreq_indicesr:   c                 C  s
   | j | S r(   )r   )re   r
  r-   r-   r.   get_mamba_indices(  ro   z&HybridReqToTokenPool.get_mamba_indicesr   c                 C  s    || j v sJ | j| j | S r(   )r   r   r   r   r-   r-   r.   r   +  s   z'HybridReqToTokenPool.mamba2_layer_cacher   c                 C  
   | j  S r(   )r   r   rm   r-   r-   r.   r   /  ro   z=HybridReqToTokenPool.get_speculative_mamba2_params_all_layersr  c                 C  s   | j dkr	d| S |S )Nr   rz   )r   )re   r  r-   r-   r.   get_mamba_ping_pong_other_idx2  s   
z2HybridReqToTokenPool.get_mamba_ping_pong_other_idxr   'Req'$mamba_ping_pong_track_buffer_to_keepr   c                 C  s   |j }|d usJ d| j|d d |_ | jrG| j|j }|d ur?|dv s/J d|tt| j	}|
| || }| j| d S d S )Nz double free? mamba_index is Noner   )r   rz   zZmamba_ping_pong_track_buffer_to_keep must be 0 or 1, mamba_ping_pong_track_buffer_to_keep=)r  r   r   	unsqueezer   r   rt   r2   rc   r   remove)re   r   r  r  $mamba_ping_pong_track_buffer_to_freeidx_to_freer-   r-   r.   free_mamba_cache8  s"   


z%HybridReqToTokenPool.free_mamba_cachec                   s@   t d t   | j  | j  | jr| j  d S d S )NzReset HybridReqToTokenPool)	r   r   r   r   r   r   zero_r   r   rm   r   r-   r.   r   P  s   



zHybridReqToTokenPool.clear)rU   r>   r   r>   r   r>   rV   r>   rW   rX   rY   rE   r   r   r   rE   r   r>   r(   )rU   r>   r   r>   r   r   rW   rX   r   rE   r   r>   )rp   r  rF   rr   )r
  r:   rF   r:   r   )rF   r   )r  r>   rF   r>   )r   r  r  r   )r   r   r   r   rg   r   r   r  r   r   r  r  r   __classcell__r-   r-   r   r.   r     s    ("
3


r   c                   @  s   e Zd Zej		d0d1ddZd2ddZejd3ddZejd3ddZejd4ddZ	ejd5d$d%Z
d6d(d)Zd*d+ Zd,d- Zd.d/ ZdS )7KVCacheNrU   r>   	page_sizer7   r@   	layer_numrW   rX   rY   rE   start_layerr   	end_layerc	           
      C  s   || _ || _|| _|| _|tjtjfv rtj| _n|| _|| _	|p"d| _
|p)|d | _tj|d| _d| _d| _d | _t| jd\| _| _}	d S )Nr   rz   rZ       r   )rU   r  r7   rW   r_   float8_e5m2float8_e4m3fnuint8r?   r  r  r  r    r]   rf   r   cpu_offloading_chunk_sizelayer_transfer_counterr   r   r   )
re   rU   r  r7   r  rW   rY   r  r  r   r-   r-   r.   rg   Z  s$   


zKVCache.__init__
num_tokensc              	   C  s   |   }t|tr-|\}}|t }|t }td| d|dd|dd || | _dS |t }td| d|dd || _dS )zCommon logging and mem_usage computation for KV cache allocation.
        Supports both tuple (K, V) size returns and single KV size returns.
        z KV Cache is allocated. #tokens: z
, K size: r   z GB, V size: z GBz, KV size: N)get_kv_size_bytesr1   tupler   r   r   r   )re   r"  kv_size_bytesk_sizev_size	k_size_GB	v_size_GB
kv_size_GBr-   r-   r.   _finalize_allocation_log  s   

z KVCache._finalize_allocation_logr   rF   r:   c                 C     t  r(   NotImplementedErrorr   r-   r-   r.   get_key_buffer     zKVCache.get_key_bufferc                 C  r,  r(   r-  r   r-   r-   r.   get_value_buffer  r0  zKVCache.get_value_buffer!Tuple[torch.Tensor, torch.Tensor]c                 C  r,  r(   r-  r   r-   r-   r.   get_kv_buffer  r0  zKVCache.get_kv_bufferr   r   loccache_kcache_vrG   c                 C  r,  r(   r-  )re   r   r4  r5  r6  r-   r-   r.   set_kv_buffer  s   zKVCache.set_kv_bufferr!  r$   c                 C  s
   || _ d S r(   )r!  )re   r!  r-   r-   r.   register_layer_transfer_counter  ro   z'KVCache.register_layer_transfer_counterc                 C  r,  r(   r-  )re   r<   r-   r-   r.   get_cpu_copy     zKVCache.get_cpu_copyc                 C  r,  r(   r-  )re   kv_cache_cpur<   r-   r-   r.   load_cpu_copy  r:  zKVCache.load_cpu_copyc                 C  s   | j S r(   )r   rm   r-   r-   r.   maybe_get_custom_mem_pool  r:  z!KVCache.maybe_get_custom_mem_poolNN)rU   r>   r  r>   r7   r@   r  r>   rW   rX   rY   rE   r  r   r  r   )r"  r>   r   r>   rF   r:   )r   r>   rF   r2  )
r   r   r4  r:   r5  r:   r6  r:   rF   rG   )r!  r$   )r   r   r   abcabstractmethodrg   r+  r/  r1  r3  r7  r8  r9  r<  r=  r-   r-   r-   r.   r  Y  s$    	
'
	r  c                      s   e Zd Z								dDdE fddZdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Z	d'd( Z
dFd*d+ZdFd,d-ZdFd.d/ZdFd0d1ZdFd2d3Z			dGdHd>d?ZdIdBdCZ  ZS )JMHATokenToKVPoolNTFrU   r>   r  r7   r@   head_numhead_dimr  rW   rX   rY   rE   
v_head_dimr   swa_head_numswa_head_dimswa_v_head_dimr  r  enable_alt_streamenable_kv_cache_copyc              
     s   t  |||||||| |
d ur|
n|| _|d ur|n|| _|d ur%|n|	d ur+|	n|| _|   t| j| _	t
rB|rB| j	 nd | _|rL|   nd | _| | | j| j | _| j| jk| _d S r(   )r   rg   rC  rD  rE  _create_buffersr_   get_device_modulerW   rA   rL   StreamrB   _init_kv_copy_and_warmup_kv_copy_configr+  r=   rD   )re   rU   r  r7   rC  rD  r  rW   rY   rE  rF  rG  rH  r  r  rI  rJ  r   r-   r.   rg     s4   


zMHATokenToKVPool.__init__c                 C  s   d}d}d}d}d}d}d}t | jd  }||kr|}	n	||kr%|}	n|}	|	|kr-dnd}
|	||	 d	 |	 |	|kr=|n||
d
| _tj|
tj| jd}| j	 | jd f}t
| | j| j||d	|
| jd | jd dd	 d S )Nr  i   i               r   rz   )bytes_per_tile
byte_tiles	num_warpsnum_locs_upperr\   rU  rT  rV  r   BYTES_PER_TILErV  
num_stages)r>   data_stridesitemrO  r_   r`   r   rW   r   r   copy_all_layer_kv_cache_tiled)re   _KV_COPY_STRIDE_THRESHOLD_LARGE _KV_COPY_STRIDE_THRESHOLD_MEDIUM_KV_COPY_TILE_SIZE_LARGE_KV_COPY_TILE_SIZE_MEDIUM_KV_COPY_TILE_SIZE_SMALL_KV_COPY_NUM_WARPS_LARGE_TILE_KV_COPY_NUM_WARPS_SMALL_TILEstride_bytesrT  chunk_upper	dummy_locgridr-   r-   r.   rN    sD   
z)MHATokenToKVPool._init_kv_copy_and_warmupc              	     s   j t?  jrtj jnt "  fddt	 j
D  _ fddt	 j
D  _W d    n1 s9w   Y  W d    n1 sHw   Y  tjdd  jD tj jd _tjdd  jD tj jd _tj j jgdd _tjd	d  j j D  jd
 _d S )Nc                   2   g | ]}t j j j  j jf j jd qS r\   )r_   r`   rU   r  rC  rD  r?   rW   r+   r   rm   r-   r.   rw   *      z4MHATokenToKVPool._create_buffers.<locals>.<listcomp>c                   ri  rj  )r_   r`   rU   r  rC  rE  r?   rW   rk  rm   r-   r.   rw   2  rl  c                 S     g | ]}|  qS r-   r   r*   r-   r-   r.   rw   <  r   r\   c                 S  rm  r-   r   r*   r-   r-   r.   rw   A  r   r   dimc                 S  s(   g | ]}t |jd d |jj qS )rz   N)r4   r5   r6   r7   r8   r*   r-   r-   r.   rw   G  s    r   )rf   r^   r   r   r_   r   r   r   r   rc   r  k_bufferv_bufferr  uint64rW   k_data_ptrsv_data_ptrsr   r   r[  rm   r-   rm   r.   rK  !  s>   



z MHATokenToKVPool._create_buffersc                 C     | ` | `d S r(   )rp  rq  rm   r-   r-   r.   _clear_buffersN     zMHATokenToKVPool._clear_buffersc                 C  s\   t | dsJ t | dsJ d}| jD ]}|t|7 }qd}| jD ]}|t|7 }q!||fS )Nrp  rq  r   )hasattrrp  r)   rq  )re   k_size_bytesr!   v_size_bytesr"   r-   r-   r.   r#  R  s   

z"MHATokenToKVPool.get_kv_size_bytesc                   s    fddt  j j j D  fddt  j j j D  } fddt  j j j D  fddt  j j j D  } fddt  j j j D  fddt  j j j D  }|||fS )Nc                      g | ]	}  | qS r-   )_get_key_bufferr   r}   rm   r-   r.   rw   a      z=MHATokenToKVPool.get_contiguous_buf_infos.<locals>.<listcomp>c                   r{  r-   )_get_value_bufferr   r}   rm   r-   r.   rw   d  r}  c                      g | ]}  |jqS r-   )r|  r   r}   rm   r-   r.   rw   h      
c                   r  r-   )r~  r   r}   rm   r-   r.   rw   k  r  c                   "   g | ]}  |d  j j qS r   )r|  r   r  r}   rm   r-   r.   rw   o      c                   r  r   )r~  r   r  r}   rm   r-   r.   rw   r  r  )rc   r  r  re   kv_data_ptrskv_data_lenskv_item_lensr-   rm   r.   r   ^  s,   






z)MHATokenToKVPool.get_contiguous_buf_infosc           	      C  s   t j  g }| j}t| jD ];}|g  tdt||D ]+}||||  }| j| | j	ddd}| j
| | j	ddd}|d ||g qqt j  |S Nr   cpuTnon_blockingrH   )r_   r   synchronizer   rc   r  r   rl   rp  torq  )	re   r<   r;  
chunk_sizer   ru   chunk_indicesk_cpuv_cpur-   r-   r.   r9  x  s"   


	zMHATokenToKVPool.get_cpu_copyc                 C  s   t j  | j}t| jD ]d}tdt||D ]Y}||||  }|| ||  d || ||  d }}|jd |jd   krIt|ksLJ  J |j| j	d j
dd}	|j| jd j
dd}
|	| j	| |< |
| j| |< qqt j  d S )Nr   rz   Tr  )r_   r   r  r   rc   r  rl   r6   r  rp  rW   rq  )re   r;  r<   r  r   ru   r  r  r  k_chunkv_chunkr-   r-   r.   r<    s   
,zMHATokenToKVPool.load_cpu_copyr   c                 C  4   | j | jkr| j|| j  | jS | j|| j  S r(   )r?   r7   rp  r  rM   r   r-   r-   r.   r|       z MHATokenToKVPool._get_key_bufferc                 C  &   | j d ur| j || j  | |S r(   )r!  
wait_untilr  r|  r   r-   r-   r.   r/    s   

zMHATokenToKVPool.get_key_bufferc                 C  r  r(   )r?   r7   rq  r  rM   r   r-   r-   r.   r~    r  z"MHATokenToKVPool._get_value_bufferc                 C  r  r(   )r!  r  r  r~  r   r-   r-   r.   r1    s   

z!MHATokenToKVPool.get_value_bufferc                 C     |  || |fS r(   r/  r1  r   r-   r-   r.   r3    r   zMHATokenToKVPool.get_kv_bufferr   r   r4  r:   r5  r6  k_scaleOptional[float]v_scalelayer_id_overridec           	      C  s   |d ur|}n|j }|j| jkr.|d ur|| |d ur"|| || j}|| j}| j| jkr@|| j}|| j}t||| j|| j  | j	|| j  || j
| j| j| j| jd
 d S )N)r=   r?   rA   rB   rD   )r   r7   div_r  r?   rM   rR   rp  r  rq  r=   rA   rB   rD   )	re   r   r4  r5  r6  r  r  r  r   r-   r-   r.   r7    s2   



zMHATokenToKVPool.set_kv_buffertgt_locsrc_locc                 C  s  t j rt| j| j|| d S | }|dkrd S | jd us#J d| j}t|dd}| j	 |d f}||krVt
|}t| | j	| j|||||d |d dd		 d S td||D ].}t|| |}	|	| }
t
|
}t| | j	| j|||	 |||	 |
||d |d dd		 q\d S )
Nr   zBKV copy not initialized. Set enable_kv_cache_copy=True in __init__rW  rP  rU  rT  rV  r   rX  )r   SGLANG_NATIVE_MOVE_KV_CACHEgetmove_kv_cache_nativerp  rq  r   rO  r>   r   r   r]  r[  rc   min)re   r  r  Ncfgcaprh  upperstartend	chunk_lenr-   r-   r.   move_kv_cache  sT   


zMHATokenToKVPool.move_kv_cache)NNNNNNTF) rU   r>   r  r>   r7   r@   rC  r>   rD  r>   r  r>   rW   rX   rY   rE   rE  r   rF  r   rG  r   rH  r   r  r   r  r   rI  rE   rJ  rE   r   NNNr   r   r4  r:   r5  r:   r6  r:   r  r  r  r  r  r   r  r:   r  r:   )r   r   r   rg   rN  rK  rv  r#  r   r9  r<  r|  r/  r~  r1  r3  r7  r  r  r-   r-   r   r.   rB    s6    7/-




	'rB  c                   @  sB   e Zd Zdd Zdd ZdddZdd	d
Z			ddddZdS )MHATokenToKVPoolFP4c              	     s$  j t jrtjjnt \ j	j
 jj dtj_ fddtjD _ fddtjD _ fddtjD _ fddtjD _W d    n1 ssw   Y  W d    d S W d    d S 1 sw   Y  d S )N   c                   *   g | ]}t j d  fjjdqS r   r\   r_   r`   r?   rW   rk  r9   mnre   r-   r.   rw   !      z7MHATokenToKVPoolFP4._create_buffers.<locals>.<listcomp>c                   r  r  r  rk  r  r-   r.   rw   )  r  c                   ,   g | ]}t j   fjjd qS rj  r  rk  r9   r  r  scale_block_sizere   r-   r.   rw   2      c                   r  rj  r  rk  r  r-   r.   rw   :  r  )rf   r^   r   r   r_   r   r   r   r   rU   r  rC  rD  r  r?   rc   r  rp  rq  k_scale_bufferv_scale_bufferrm   r-   r  r.   rK    s4   	
"z#MHATokenToKVPoolFP4._create_buffersc                 C  s   | ` | `| `| `d S r(   )rp  rq  r  r  rm   r-   r-   r.   rv  C  s   z"MHATokenToKVPoolFP4._clear_buffersr   r>   c                 C  `   | j | jkr(| j|| j  tj}| j|| j  }ddlm	} |
||}|S | j|| j  S Nr   KVFP4QuantizeUtil)r?   r7   rp  r  rM   r_   r  r  +sglang.srt.layers.quantization.kvfp4_tensorr  batched_dequantizere   r   cache_k_nope_fp4cache_k_nope_fp4_sfr  cache_k_nope_fp4_dequantr-   r-   r.   r|  I     z#MHATokenToKVPoolFP4._get_key_bufferc                 C  r  r  )r?   r7   rq  r  rM   r_   r  r  r  r  r  )re   r   cache_v_nope_fp4cache_v_nope_fp4_sfr  cache_v_nope_fp4_dequantr-   r-   r.   r~  Y  r  z%MHATokenToKVPoolFP4._get_value_bufferNr   r   r4  r:   r5  r6  r  r  r  r  r   c                 C  s  ddl m} |d ur|}	n|j}	|j| jkr<|d ur|| |d ur(|| ddlm}
 |
|\}}|
|\}}| j| jkrZ|	| j}|	| j}|	| j}|	| j}| r| j
d ur| j }| j
| || j|	| j  |< || j|	| j  |< | j| j
 || j|	| j  |< || j|	| j  |< W d    n1 sw   Y  || j
 d S || j|	| j  |< || j|	| j  |< || j|	| j  |< || j|	| j  |< d S )Nr   rJ   r  )rN   rK   r   r7   r  r  r  batched_quantizer?   rM   rB   rA   rO   rP   rp  r  r  rQ   rq  r  )re   r   r4  r5  r6  r  r  r  rK   r   r  cache_k_fp4_sfcache_v_fp4_sfrO   r-   r-   r.   r7  i  s>   



z!MHATokenToKVPoolFP4.set_kv_bufferr   r  r  )r   r   r   rK  rv  r|  r~  r7  r-   r-   r-   r.   r    s    1

r  c                   @  s   e Zd ZdZ				dHdIddZdd Zdd Zdd Zdd  Zd!d" Z	dJd$d%Z
dJd&d'ZdJd(d)ZdJd*d+ZedKd.d/Z	0	0dLdMd8d9ZdNd<d=Zd>d? ZdOdBdCZ	dPdQdFdGZdS )RHybridLinearKVPoolzBKV cache with separate pools for full and linear attention layers.FNrU   r>   r7   r@   r  rC  rD  full_attention_layer_ids	List[int]enable_kvcache_transposerE   rW   rX   r   r   rY   use_mlakv_lora_rankqk_rope_head_dimc              
   C  s  || _ || _|| _t|| _|| _d| _|| _|| _|	| _	|r!J || _
|sBt}tr2ddlm} |}||| j|||| j||
d| _nt}trNddlm} |}||| j|| j||||
d| _dd t|D | _|rr|  t | _d S |  \}}|| t | _d S )Nr   )NPUMHATokenToKVPool)rU   r  r7   rC  rD  r  rW   rY   )NPUMLATokenToKVPool)rU   r  r7   r  rW   r  r  rY   c                 S  r   r-   r-   )r+   ru   idr-   r-   r.   r     s    
z/HybridLinearKVPool.__init__.<locals>.<dictcomp>)rU   r7   rW   rl   full_layer_numsr  r  rC  rD  r   r  rB  _is_npu/sglang.srt.hardware_backend.npu.memory_pool_npur  full_kv_poolMLATokenToKVPoolr  r   full_attention_layer_id_mappingr#  r   r   )re   rU   r7   r  rC  rD  r  r  rW   r   rY   r  r  r  TokenToKVPoolClassr  r  r&  r'  r-   r-   r.   rg     s^   


zHybridLinearKVPool.__init__c                 C  r  r(   )r  r#  rm   r-   r-   r.   r#    ro   z$HybridLinearKVPool.get_kv_size_bytesc                 C  r  r(   )r  r   rm   r-   r-   r.   r     ro   z+HybridLinearKVPool.get_contiguous_buf_infosc                 C  s   | j  \}}}|||fS r(   )r   r   )re   mamba_data_ptrsmamba_data_lensmamba_item_lensr-   r-   r.   get_state_buf_infos  s   
z&HybridLinearKVPool.get_state_buf_infosc                 C  r  )z=Get the sliceable dimension size for each mamba state tensor.)r   r   rm   r-   r-   r.   r     s   
z+HybridLinearKVPool.get_state_dim_per_tensorc                 C  r  r(   )r  r=  rm   r-   r-   r.   r=    ro   z,HybridLinearKVPool.maybe_get_custom_mem_poolr   c                 C  s.   || j vrtd|d| j   | j | S )Nz	layer_id=z not in full attention layers: )r  
ValueErrorkeysr   r-   r-   r.   _transfer_full_attention_id  s
   

z.HybridLinearKVPool._transfer_full_attention_idc                 C     |  |}| j|S r(   )r  r  r/  r   r-   r-   r.   r/       
z!HybridLinearKVPool.get_key_bufferc                 C  r  r(   )r  r  r1  r   r-   r-   r.   r1    r  z#HybridLinearKVPool.get_value_bufferc                 C  r  r(   )r  r  r3  r   r-   r-   r.   r3    r  z HybridLinearKVPool.get_kv_bufferr   r   c                 #  sF    t  fdd}|| d V  W d    d S 1 sw   Y  d S )Nc                 3  s2    | j } | j | _ z	d V  W || _ d S || _ w r(   )r   r  )r   original_layer_idrm   r-   r.   _patch_layer_id  s   z@HybridLinearKVPool._transfer_id_context.<locals>._patch_layer_id)r   )re   r   r  r-   rm   r.   _transfer_id_context  s   
"z'HybridLinearKVPool._transfer_id_context      ?r4  r:   r5  r6  r  floatr  c              	   C  sr   |  |j}| js| jjd ||||||d d S | | | j|||| W d    d S 1 s2w   Y  d S )N)r  )r  r   r  r  r7  r  )re   r   r4  r5  r6  r  r  r   r-   r-   r.   r7  (  s&   	

"z HybridLinearKVPool.set_kv_bufferr  r  c                 C  s   | j || d S r(   )r  r  )re   r  r  r-   r-   r.   r  E     z HybridLinearKVPool.move_kv_cachec                 C  s   | j djd S Nr   rH   )r  r1  r6   rm   r-   r-   r.   get_v_head_dimH  r  z!HybridLinearKVPool.get_v_head_dimcache_k_nopecache_k_ropec                 C  sP   | j sJ d| | | j|||| W d    d S 1 s!w   Y  d S )Nz.set_mla_kv_buffer called when use_mla is False)r  r  r  set_mla_kv_buffer)re   r   r4  r  r  r-   r-   r.   r  K  s   "z$HybridLinearKVPool.set_mla_kv_buffer	dst_dtypeOptional[torch.dtype]c                 C  sL   | j sJ d| | | j|||W  d    S 1 sw   Y  d S )Nz.get_mla_kv_buffer called when use_mla is False)r  r  r  get_mla_kv_buffer)re   r   r4  r  r-   r-   r.   r  V  s   $z$HybridLinearKVPool.get_mla_kv_buffer)FFNN)rU   r>   r7   r@   r  r>   rC  r>   rD  r>   r  r  r  rE   rW   rX   r   r   rY   rE   r  rE   r  r>   r  r>   r   )r   r   )r  r  )r   r   r4  r:   r5  r:   r6  r:   r  r  r  r  r  r   r   r4  r:   r  r:   r  r:   r(   r   r   r4  r:   r  r  )r   r   r   r   rg   r#  r   r  r   r=  r  r/  r1  r3  r   r  r7  r  r  r  r  r-   r-   r-   r.   r    s4    Q





r  c                      s   e Zd Z				d9d: fddZdd Zdd Zdd Zdd Zd;dd Zd;d!d"Z	d;d#d$Z
d<d+d,Zd=d/d0Z	d>d?d3d4Zd5d6 Zd7d8 Z  ZS )@r  NFrU   r>   r  r7   r@   r  r  r  rW   rX   rY   rE   r  r   r  use_nsaoverride_kv_cache_dimc              
     s   t  |||||||	|
 || _|| _|| _|o |tjko |d u| _| jr'|n|| | _| 	  tj
dd | jD tj| jd| _|sJ| | d S d S )Nc                 S  rm  r-   r   r*   r-   r-   r.   rw     r   z-MLATokenToKVPool.__init__.<locals>.<listcomp>r\   )r   rg   r  r  r  r_   r  nsa_kv_cache_store_fp8kv_cache_dimrK  r  	kv_bufferrr  rW   r   r+  )re   rU   r  r7   r  r  r  rW   rY   r  r  r  r  r   r-   r.   rg   b  s>   	zMLATokenToKVPool.__init__c              	     s    j t;  jrtj jnt   fddt j	D  _
W d    n1 s,w   Y  W d    d S W d    d S 1 sDw   Y  d S )Nc                   s0   g | ]}t j j j d  jf j jdqS rz   r\   )r_   r`   rU   r  r  r?   rW   rk  rm   r-   r.   rw     s    z4MLATokenToKVPool._create_buffers.<locals>.<listcomp>)rf   r^   r   r   r_   r   r   r   rc   r  r  rm   r-   rm   r.   rK    s   

"z MLATokenToKVPool._create_buffersc                 C  s   | ` d S r(   )r  rm   r-   r-   r.   rv    s   zMLATokenToKVPool._clear_buffersc                 C  s.   t | dsJ d}| jD ]}|t|7 }q|S )Nr  r   )rx  r  r)   )re   r%  kv_cacher-   r-   r.   r#    s
   
z"MLATokenToKVPool.get_kv_size_bytesc                   R    fddt  jD } fddt  jD } fddt  jD }|||fS )Nc                      g | ]	} j |  qS r-   )r  r   r}   rm   r-   r.   rw     s    z=MLATokenToKVPool.get_contiguous_buf_infos.<locals>.<listcomp>c                      g | ]} j | jqS r-   )r  r   r}   rm   r-   r.   rw     s    c                   s"   g | ]} j | d  j j qS r   )r  r   r  r}   rm   r-   r.   rw     s    rc   r  r  r-   rm   r.   r     s   

z)MLATokenToKVPool.get_contiguous_buf_infosr   c                 C  sP   | j d ur| j || j  | j| jkr | j|| j  | jS | j|| j  S r(   )r!  r  r  r?   r7   r  rM   r   r-   r-   r.   r/    s
   
zMLATokenToKVPool.get_key_bufferc                 C  sl   | j d ur| j || j  | j| jkr'| j|| j  dd | jf | jS | j|| j  dd | jf S )N.)r!  r  r  r?   r7   r  r  rM   r   r-   r-   r.   r1    s   
z!MLATokenToKVPool.get_value_bufferc                 C  r  r(   r  r   r-   r-   r.   r3    r   zMLATokenToKVPool.get_kv_bufferr   r   r4  r:   r5  r6  c                 C  sl   |j }| jrJ |j| jkr|| j}| j| jkr*|| j| j|| j  |< d S || j|| j  |< d S r(   )r   r  r7   r  r?   rM   r  r  )re   r   r4  r5  r6  r   r-   r-   r.   r7    s   
zMLATokenToKVPool.set_kv_bufferr  r  c                 C  s   |j }| jrt||\}}t| j|| j  ||| d S |j| jkr.|| j}|| j}| j| jkr@|	| j}|	| j}t| j|| j  ||| d S r(   )
r   r  r   r   r  r  r7   r  r?   rM   )re   r   r4  r  r  r   cache_k_nope_fp8cache_k_rope_fp8r-   r-   r.   r    s.   z"MLATokenToKVPool.set_mla_kv_bufferr  r  c                 C  sp   |j }| |}|p| j}tj|jd d| jf||jd}tj|jd d| jf||jd}t	|||| ||fS )Nr   rz   r\   )
r   r/  r7   r_   emptyr6   r  rW   r  r   )re   r   r4  r  r   r  r  r  r-   r-   r.   r    s   

z"MLATokenToKVPool.get_mla_kv_bufferc                 C  s   t j  g }| j}t| jD ]-}|g  tdt||D ]}||||  }| j| | j	ddd}|d | qqt j  |S r  )
r_   r   r  r   rc   r  r   rl   r  r  )re   r<   r;  r  r   ru   r  kv_cpur-   r-   r.   r9  %  s   


zMLATokenToKVPool.get_cpu_copyc           	      C  s   t j  | j}t| jD ]:}tdt||D ]/}||||  }|| ||  }|jd t|ks4J |j| j	d j
dd}|| j	| |< qqt j  d S )Nr   Tr  )r_   r   r  r   rc   r  rl   r6   r  r  rW   )	re   r;  r<   r  r   ru   r  r  kv_chunkr-   r-   r.   r<  4  s   
zMLATokenToKVPool.load_cpu_copy)NNFN)rU   r>   r  r>   r7   r@   r  r>   r  r>   r  r>   rW   rX   rY   rE   r  r   r  r   r  rE   r  r   r   r   r   r4  r:   r5  r:   r6  r:   r  r(   r  )r   r   r   rg   rK  rv  r#  r   r/  r1  r3  r7  r  r  r9  r<  r  r-   r-   r   r.   r  a  s&    5
	
	



-r  c                   @  s:   e Zd Zdd Zdd ZdddZdddZdddZdS )MLATokenToKVPoolFP4c              	     s   j t_ jrtjjnt 9 jj	 dj
 dtj_ fddtjD _ fddtjD _W d    n1 sPw   Y  W d    d S W d    d S 1 shw   Y  d S )Nrz   r  c                   r  r  r  rk  r  r-   r.   rw   R  r  z7MLATokenToKVPoolFP4._create_buffers.<locals>.<listcomp>c                   s(   g | ]}t j  fjjd qS rj  r  rk  )r9   r  r  re   r-   r.   rw   [  s    
)rf   r^   r   r   r_   r   r   r   rU   r  r  r  r?   rc   r  r  kv_scale_bufferrm   r-   r  r.   rK  C  s(   	
"z#MLATokenToKVPoolFP4._create_buffersc                 C  ru  r(   )r  r  rm   r-   r-   r.   rv  d  rw  z"MLATokenToKVPoolFP4._clear_buffersr   r>   c                 C  s|   | j d ur| j || j  | j| jkr6| j|| j  tj}| j	|| j  }ddl
m} |||}|S | j|| j  S r  )r!  r  r  r?   r7   r  rM   r_   r  r  r  r  r  r  r-   r-   r.   r/  h  s   
z"MLATokenToKVPoolFP4.get_key_bufferr   r   r4  r:   r5  r6  c           	      C  s   |j }| jrJ |j| jkrddlm} ||\}}| j| jkr?|| j| j|| j	  |< || j| j
|| j	  |< d S || j|| j	  |< d S r  )r   r  r7   r  r  r  r?   rM   r  r  r  )	re   r   r4  r5  r6  r   r  cache_k_fp4r  r-   r-   r.   r7  {  s   

z!MLATokenToKVPoolFP4.set_kv_bufferr  r  c                 C  s   |j }| jr+tj||gdd}t|dd}|| j}|| j	|| j
  |< d S |j| jkrEddlm} ||\}}	||\}
}| j| jkrW|| j}|| j}t| j	|| j
  |||
 t| j|| j
  ||	| d S )NrH   rn  rz   r   r  )r   r  r_   r   r   r  squeezerM   r?   r  r  r7   r  r  r  r   r   r  )re   r   r4  r  r  r   r5  r  r  r  cache_k_rope_fp4cache_k_rope_fp4_sfr-   r-   r.   r    s6   z%MLATokenToKVPoolFP4.set_mla_kv_bufferNr   r	  r  )r   r   r   rK  rv  r/  r7  r  r-   r-   r-   r.   r
  A  s    !

r
  c                      sz   e Zd ZdZejZejZ		d-d. fddZ	d/ddZ
d0ddZd0dd Zd0d!d"Zd1d'd(Zd)d* Z fd+d,Z  ZS )2NSATokenToKVPoolrQ  NrU   r>   r  r  r7   r@   r  r  rW   rX   index_head_dimrY   rE   r  r  r   r  c                   s   |
|| kr|
nd }t  j|||| |	||d|d _dks%J tr/jdks.J njdks6J jr@tjjnt	   fddt
|D _W d    n1 s^w   Y   d S )NT)r  r  rQ  rz   @   c                   sD   g | ]}t j d  j jj d   fj dqS )rz   rS  r\   )r_   r`   r  quant_block_sizeindex_k_with_scale_buffer_dtyperk  rW   r  r  re   rU   r-   r.   rw     s    z-NSATokenToKVPool.__init__.<locals>.<listcomp>)r   rg   r  _is_hipr  r   r_   r   r   r   rc   index_k_with_scale_bufferr+  )re   rU   r  r  r7   r  r  rW   r  rY   r  r  r  override_dimr   r  r.   rg     s<   
zNSATokenToKVPool.__init__r   rF   r:   c                 C  s,   | j d ur| j || j  | j|| j  S r(   )r!  r  r  r  r   r-   r-   r.   get_index_k_with_scale_buffer  s   
z.NSATokenToKVPool.get_index_k_with_scale_bufferseq_lenpage_indicesc                 C  $   | j || j  }tjj| |||dS Nr  r  )r  r  r   GetKexecutere   r   r  r  bufr-   r-   r.   get_index_k_continuous     z'NSATokenToKVPool.get_index_k_continuousc                 C  r  r  )r  r  r   GetSr   r!  r-   r-   r.   get_index_k_scale_continuous  r$  z-NSATokenToKVPool.get_index_k_scale_continuousc                 C  r  )a  
        Fused method to get both index K and scale data in a single call using Triton.
        More efficient than calling get_index_k_continuous and get_index_k_scale_continuous separately.

        :param layer_id: Layer index
        :param seq_len: Sequence length
        :param page_indices: Page indices tensor
        :return: tuple of (k_fp8, k_scale) where
                 k_fp8: (seq_len, index_head_dim), uint8
                 k_scale: (seq_len, 4), uint8
        r  )r  r  r   GetKAndSr   r!  r-   r-   r.   get_index_k_scale_buffer)  s   z)NSATokenToKVPool.get_index_k_scale_bufferr4  index_kindex_k_scalerG   c                 C  s*   | j || j  }tjj| ||||d d S )N)poolr"  r4  r)  r*  )r  r  r   SetKAndSr   )re   r   r4  r)  r*  r"  r-   r-   r.   set_index_k_scale_buffer?  s   

z)NSATokenToKVPool.set_index_k_scale_bufferc                   r   )Nc                   r  r-   )r  r   r}   rm   r-   r.   rw   L  r   z8NSATokenToKVPool.get_state_buf_infos.<locals>.<listcomp>c                   r  r-   r  r   r}   rm   r-   r.   rw   O  r   c                   s   g | ]
} j | d  jqS r   r.  r}   rm   r-   r.   rw   R  s    r  )re   r   r   r   r-   rm   r.   r  K  s   



z$NSATokenToKVPool.get_state_buf_infosc                   s&   t   }| jD ]}|t|7 }q|S r(   )r   r#  r  r)   )re   r%  index_k_cacher   r-   r.   r#  W  s   

z"NSATokenToKVPool.get_kv_size_bytesr>  )rU   r>   r  r>   r  r>   r7   r@   r  r>   r  r>   rW   rX   r  r>   rY   rE   r  r>   r  r   r  r   r?  )r   r>   r  r>   r  r:   )
r   r>   r4  r:   r)  r:   r*  r:   rF   rG   )r   r   r   r  r_   r  r  bfloat16rope_storage_dtyperg   r  r#  r&  r(  r-  r  r#  r  r-   r-   r   r.   r    s    
G



r  c                      sV   e Zd Z		d&d' fddZd(ddZd(ddZd(ddZd(ddZd)d$d%Z  Z	S )*DoubleSparseTokenToKVPoolNrU   r>   r  r7   r@   rC  rD  r  rW   rX   heavy_channel_numrY   rE   r  r   r  c              
     s   t  | |	|
| | jt` | jrtj| j	nt
 :  fddt|D | _ fddt|D | _ fddt|D | _W d    n1 s^w   Y  W d    d S W d    d S 1 svw   Y  d S )Nc                   &   g | ]}t j f d qS rj  r   rk  rW   r7   rD  rC  r  rU   r-   r.   rw         z6DoubleSparseTokenToKVPool.__init__.<locals>.<listcomp>c                   r4  rj  r   rk  r5  r-   r.   rw     r6  c                   s&   g | ]}t jd  f dqS r  r   rk  )rW   r7   rC  r3  rU   r-   r.   rw     r6  )r   rg   rf   r^   r   r   r_   r   r   r   r   rc   rp  rq  label_buffer)re   rU   r  r7   rC  rD  r  rW   r3  rY   r  r  r   )rW   r7   rD  rC  r3  r  rU   r.   rg   _  s8   

"z"DoubleSparseTokenToKVPool.__init__r   c                 C     | j || j  S r(   )rp  r  r   r-   r-   r.   r/       z(DoubleSparseTokenToKVPool.get_key_bufferc                 C  r8  r(   )rq  r  r   r-   r-   r.   r1    r9  z*DoubleSparseTokenToKVPool.get_value_bufferc                 C  r8  r(   )r7  r  r   r-   r-   r.   get_label_buffer  r9  z*DoubleSparseTokenToKVPool.get_label_bufferc                 C  s    | j || j  | j|| j  fS r(   )rp  r  rq  r   r-   r-   r.   r3    s   z'DoubleSparseTokenToKVPool.get_kv_bufferr   r   r4  r:   r5  r6  cache_labelc                 C  sF   |j }|| j|| j  |< || j|| j  |< || j|| j  |< d S r(   )r   rp  r  rq  r7  )re   r   r4  r5  r6  r;  r   r-   r-   r.   r7    s   	z'DoubleSparseTokenToKVPool.set_kv_bufferr>  )rU   r>   r  r>   r7   r@   rC  r>   rD  r>   r  r>   rW   rX   r3  r>   rY   rE   r  r   r  r   r   )
r   r   r4  r:   r5  r:   r6  r:   r;  r:   )
r   r   r   rg   r/  r1  r:  r3  r7  r  r-   r-   r   r.   r2  ^  s    
;


r2  rp  r   rq  r  r  c                 C  s\   |  dkrd S |d }|d }t| |D ]\}}|| ||< || ||< qd S r  )r   rM   longzip)rp  rq  r  r  tgt_loc_flatsrc_loc_flatr!   r"   r-   r-   r.   r    s   r  rW  tl.constexprrY  c                 C  s:  t d}t d}t || }	t | | }
t |
t t j}
|| t d| }||	k }t |d t d|}||k }t j|| |dd}t j|| |dd}|
|dddf |	  |dddf  }|
|dddf |	  |dddf  }|dddf |dddf @ }t j||d}t j|||d dS )z(2D tiled kernel. Safe for in-place copy.r   rz   r  )maskotherN)rA  )	tl
program_idloadcastpointer_typer  r   multiple_ofstore)r   stridestgt_loc_ptrsrc_loc_ptrnum_locsrW  rY  bidtidstridebase_ptrbyte_off	mask_byteloc_idxmask_locsrctgtsrc_ptrtgt_ptrrA  valsr-   r-   r.   r]    s"   

(( r]  )r&   r'   )NT)r9   r:   r;   r:   r!   r:   r"   r:   r<   r:   r=   r>   r?   r@   rA   r   rB   rC   rD   rE   rF   rG   )rp  r   rq  r   r  r:   r  r:   )rW  r@  rY  r@  )Vr   
__future__r   r@  r   logging
contextlibr   r   r   typingr   r   r   r	   r
   r   numpyr4   r_   tritontriton.languagelanguagerC  sglang.jit_kernel.kvcacher   r   sglang.srt.configs.mamba_utilsr   sglang.srt.constantsr   sglang.srt.environr   sglang.srt.layers.attention.nsar   -sglang.srt.layers.attention.nsa.quant_k_cacher   r   !sglang.srt.layers.radix_attentionr   sglang.srt.mem_cache.utilsr   r   r   r   sglang.srt.utilsr   r   r   r   r   r   sglang.srt.utils.custom_opr   +sglang.srt.utils.torch_memory_saver_adapterr    $sglang.srt.managers.cache_controllerr$   "sglang.srt.managers.schedule_batchr%   	getLoggerr   r   r   rL   r  r   r   r  r)   rR   rS   r   r   ABCr  rB  r  r  r  r
  r  r2  r  jitr]  r-   r-   r-   r.   <module>   s|    	  

%< v +`  Y  C a  
Z