o
    پiA                     @  sf  d dl mZ d dlZd dlmZ d dlZd dlZd dlmZ	 d dl
mZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ erPd d
lmZmZ dZdZeeZej dVddZ!dWdd Z"dXd#d$Z#dXd%d&Z$ej dYd(d)Z%dXd*d+Z&	,dZd[d3d4Z'd\d6d7Z(	,dZd]d<d=Z)d^dAdBZ*d_dFdGZ+	d`dadIdJZ,dbdKdLZ-dcdddQdRZ.dedTdUZ/dS )f    )annotationsN)TYPE_CHECKING)BasePrefixCacheEvictParams)HybridReqToTokenPoolReqToTokenPool)SWATokenToKVPoolAllocator)get_global_server_args)support_triton)
ceil_align)ReqScheduleBatch      req_to_token_ptr_stridetl.constexprc                 C  s^  d}t d}	t ||	 }
t ||	 }t ||	 }t ||	 t t j}t ||}t|D ]'}t d|||  }||k }t j|| |d}t j	| |
|  | ||d q4t 
dt j}t|	D ]}|t || 7 }qgt || |}t|D ]-}t d|||  }||| k }t j|| | |d}t j	| |
|  | | ||d qd S )Ni   r   mask)tl
program_idloadtopointer_typeint64cdivrangearangestorecast)req_to_token_ptrreq_pool_indicesprefix_tensorspre_lensseq_lensextend_lensout_cache_locr   
BLOCK_SIZEpidreq_pool_indexpre_lenseq_lenprefix_tensornum_loopioffsetr   valuecumsum_start r1   O/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/mem_cache/common.pywrite_req_to_token_pool_triton   sH   
r3   r%   torch.Tensorreq_pool_indices_tensorreq_pool_indices_cpuprefix_lens_tensorprefix_lens_cpuseq_lens_tensorseq_lens_cpuextend_lens_tensorextend_lens_cpur!   list[torch.Tensor]req_to_token_poolr   c              
   C  s   t t jr-tjdd |	D |
jtjd}t|jd f |
j	|||||| |
j	jd  d S d}t
|jd D ]<}||  }||  }||  }||  }|
|td|f|	|  |
|t||f| |||   ||7 }q6d S )Nc                 S  s   g | ]}|  qS r1   )data_ptr.0tr1   r1   r2   
<listcomp>]   s    z'write_cache_indices.<locals>.<listcomp>)devicedtyper   r   )r
   r	   attention_backendtorchtensorrD   uint64r3   shapereq_to_tokenr   itemwriteslice)r%   r5   r6   r7   r8   r9   r:   r;   r<   r!   r>   prefix_pointersptr-   req_idx
prefix_lenr*   
extend_lenr1   r1   r2   write_cache_indicesN   s@   

rT   rK   returnc                 C  s.   t  jdkrt  jdkrt}nt}|| ||S )Nascendtorch_native)r	   rF   get_last_loc_tritonget_last_loc_torch)rK   r5   r7   implr1   r1   r2   get_last_loc   s
   r[   c                 C  s&   t |dk| ||d f t |dS )Nr   r   )rG   where	full_like)rK   r5   r7   r1   r1   r2   rY      s
   
rY   r&   c                 C  s   t d}t d|||  }||k }	t j|| |	dd}
t j|| |	dd}|
dk}|| |
d  }t j| | |dd}t j|| ||	d d S )Nr   )r   otherr   r\   r   )r   r   r   r   r   )rK   r5   r7   result
num_tokensreq_to_token_strider&   r'   r.   r   prefix_lensr    
token_masktoken_indextokensr1   r1   r2   get_last_loc_kernel   s   

rg   c              	   C  sH   d}|j d }t|}t||f}t| | ||||| d| |S )N   r   )rJ   rG   
empty_liketritonr   rg   stride)rK   r5   r7   r&   ra   r`   gridr1   r1   r2   rX      s   

	rX   F
tree_cacher   ra   intbackup_stateboolc                 C  sx   | j }t| | d }|r| }||}|d u r4d| dt|  }t| | d ur0|   t||r:||fS |S )Nz=Out of memory. Try to lower your batch size.
Try to allocate 	 tokens.
)	token_to_kv_pool_allocatorevict_from_tree_cachero   allocavailable_and_evictable_strloggererrorpretty_printRuntimeError)rm   ra   ro   	allocatorstater%   	error_msgr1   r1   r2   alloc_token_slots   s$   


r}   BasePrefixCache | Nonec                 C  s   | d u rd S |   rd S | j}t|tr?| }| }||k s$||k r=td|| }td|| }| t||d d S d S |	 |k rO| t|d d S d S )Nr   )ra   swa_num_tokens)ra   )
is_chunk_cacherr   
isinstancer   full_available_sizeswa_available_sizemaxevictr   available_size)rm   ra   rz   r   r   full_num_tokensr   r1   r1   r2   rs      s$   

rs   rc   r#   last_locextend_num_tokensc                 C  s   | j }|t||j  }	t| |	 d }
|r| }
|||||||}|d u rBd| dt|  }t| | d ur>| 	  t
||rH||
fS |S )NzEPrefill out of memory. Try to lower your batch size.
Try to allocate rq   )rr   len	page_sizers   ro   alloc_extendru   rv   rw   rx   ry   )rm   rc   r8   r#   r:   r   r   ro   rz   ra   r{   r%   r|   r1   r1   r2   alloc_paged_token_slots_extend   s4   
	
r   reqs	list[Req]	list[int]c           	      C  s   t |}t| tr6| j }| rtnt}|| }||k r6|dur6| r6td|| }|	t
d|d | |}|du rLtd|  d|d|S )z%Allocate request slots from the pool.Nr   )ra   	mamba_numzalloc_req_slots runs out of memory. Please set a smaller number for `--max-running-requests`. req_to_token_pool.available_size()=z, num_reqs=z, )r   r   r   
mamba_poolr   supports_mamba MAMBA_STATE_PER_REQ_PREFIX_CACHEMAMBA_STATE_PER_REQ_NO_CACHEr   r   r   rt   ry   )	r>   r   rm   num_reqsmamba_available_sizefactormamba_state_neededr   r    r1   r1   r2   alloc_req_slots)  s,   


r   batchr   ,tuple[torch.Tensor, torch.Tensor, list[int]]c                   s      dd  jD }tj jtjd}tj jtjd}|j jdd}|j jdd}t	 j
 j j}tj|tjd}|j jdd} jjdkrUt j j}	n fdd|D }
t j|| j jt|
 jd}	t|	|||| j j||| j
 |	||fS )	a  
    Allocate KV cache for extend batch and write to req_to_token_pool.

    Returns:
        out_cache_loc: allocated cache locations
        req_pool_indices_device: request pool indices at a device tensor
        req_pool_indices: request pool indices as list
    c                 S  s   g | ]}|j qS r1   )prefix_indices)rA   rr1   r1   r2   rC   V  s    z$alloc_for_extend.<locals>.<listcomp>)rE   T)non_blockingr   c                   s6   g | ]}t |d kr|dd ntjdg jdqS )r   r\   N)rD   )r   rG   rH   rD   r@   r   r1   r2   rC   j  s    ()rm   rc   r8   r#   r:   r   r   )maybe_evict_swar   rG   rH   rc   r   r$   r   rD   r   r>   rm   r   r}   r   r   r#   r:   catrT   )r   r!   r8   r<   prefix_lens_deviceextend_lens_devicer    r6   req_pool_indices_devicer%   r   r1   r   r2   alloc_for_extendH  sN   

r   token_per_reqc           	      C  sv   | j }t||j }t| | ||||}|du r9dt||  dt|  }t| | dur5|   t	||S )z)Allocate paged KV cache for decode batch.NzDDecode out of memory. Try to lower your batch size.
Try to allocate rq   )
rr   r   r   rs   alloc_decoderu   rv   rw   rx   ry   )	rm   r#   r:   r   r   rz   ra   r%   r|   r1   r1   r2   alloc_paged_token_slots_decode  s    


r   c                 C  s   |    | jjd }| jjdkrt| j|| }n| jj| j| jd f }| j| }t	| j|| j
| ||d}| jjrB| j| j }n| j }| j| j|f|tj |S )z
    Allocate KV cache for decode batch and write to req_to_token_pool.

    Returns:
        out_cache_loc: allocated cache locations
    r   r   )rm   r#   r:   r   r   )r   r#   rJ   rm   r   r}   r>   rK   r    r   r:   model_configis_encoder_decoderencoder_lensclonerM   r   rG   int32)r   r   bsr%   r   seq_lens_nextlocsr1   r1   r2   alloc_for_decode  s,   
	
r   Treqr   	is_insertc           	      C  s  | j d u r"| sJ d| jd ur |jj| jd d | _d S |j| |d |  \}}t	 }|j
}|j}|d u rL||ksLJ d| jd| j|dkrUt||}||k rj|jj| j  || }|j| t|jtr| s| jd us}J d|j|  |j|  d S )Nz/Only MambaRadixCache allow freeing before allocr\   )r   z8Unexpected overallocated KV cache, req.kv_committed_len=z, req.kv_allocated_len=r   zFmamba state is freed while the tree cache does not manage mamba states)req_pool_idxr   mamba_pool_idxr>   r   free	unsqueezecache_finished_reqpop_overallocated_kv_cacher	   r   speculative_algorithmkv_committed_lenkv_allocated_lenr   rK   rr   r   r   free_mamba_cache)	r   rm   r   start_pend_pglobal_server_argsr   	spec_algoindices_to_freer1   r1   r2   release_kv_cache  sF   




r   strc                 C  s   |   S )N)ru   )rm   r1   r1   r2   ru     s   ru   )r   r   )r%   r4   r5   r4   r6   r4   r7   r4   r8   r4   r9   r4   r:   r4   r;   r4   r<   r4   r!   r=   r>   r   )rK   r4   r5   r4   r7   r4   rU   r4   )r&   r   )F)rm   r   ra   rn   ro   rp   )rm   r~   ra   rn   )rm   r   rc   r4   r8   r4   r#   r4   r:   r4   r   r4   r   rn   ro   rp   )r>   r   r   r   rm   r~   rU   r   )r   r   rU   r   )r   )rm   r   r#   r4   r:   r4   r   r4   r   rn   rU   r4   )r   r   r   rn   rU   r4   )T)r   r   rm   r   r   rp   )rm   r   rU   r   )0
__future__r   loggingtypingr   rG   rj   triton.languagelanguager   &sglang.srt.mem_cache.base_prefix_cacher   r    sglang.srt.mem_cache.memory_poolr   r   $sglang.srt.mem_cache.swa_memory_poolr   sglang.srt.server_argsr	   sglang.srt.utilsr
   sglang.srt.utils.commonr   "sglang.srt.managers.schedule_batchr   r   r   r   	getLogger__name__rv   jitr3   rT   r[   rY   rg   rX   r}   rs   r   r   r   r   r   r   ru   r1   r1   r1   r2   <module>   sJ    

2
1


"
*
G
*.