o
    پiQA                     @  s   d dl mZ 	 	 d dlZd dlmZ d dlZd dlZd dlmZ	 d dl
mZmZmZ er2d dlmZ G dd dejZG dd	 d	eZd
d ZejdddZejdddZG dd deZdS )    )annotationsN)TYPE_CHECKING)get_bool_env_varget_num_new_pagesnext_power_of_2)KVCachec                   @  s   e Zd Zejd1ddZd2ddZdd Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zejd'd( Zejd3d*d+Zejd4d.d/Zd0S )5BaseTokenToKVPoolAllocatorsizeint	page_sizedtypetorch.dtypedevicestrkvcacher   	need_sortboolc                 C  s@   || _ || _|| _|| _|| _|| _d | _d | _d| _g | _	d S NT)
r	   r   r   r   _kvcacher   
free_pagesrelease_pagesis_not_in_free_group
free_groupselfr	   r   r   r   r   r    r   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/mem_cache/allocator.py__init__$   s   

z#BaseTokenToKVPoolAllocator.__init__returnc                 C  s   dS )N r   r   r   r   r   debug_print:   s   z&BaseTokenToKVPoolAllocator.debug_printc                 C  s   t | jt | j | j S N)lenr   r   r   r    r   r   r   available_size=   s   z)BaseTokenToKVPoolAllocator.available_sizec                 C  s   | j S r"   )r   r    r   r   r   get_kvcache@   s   z&BaseTokenToKVPoolAllocator.get_kvcachec                 C  s   |\| _ | _d S r"   r   r   )r   stater   r   r   restore_stateC   s   z(BaseTokenToKVPoolAllocator.restore_statec                 C  s   | j | jfS r"   r&   r    r   r   r   backup_stateF      z'BaseTokenToKVPoolAllocator.backup_statec                 C  s   d| _ g | _d S )NF)r   r   r    r   r   r   free_group_beginI   s   
z+BaseTokenToKVPoolAllocator.free_group_beginc                 C  s&   d| _ | jr| t| j d S d S r   )r   r   freetorchcatr    r   r   r   free_group_endM   s   z)BaseTokenToKVPoolAllocator.free_group_endc                 C  sT   t | jdkr(t| j| jf| _t| j\| _}tjd| jj| jd| _d S d S )Nr   r   r   r   )	r#   r   r-   r.   r   sortemptyr   r   )r   _r   r   r   merge_and_sort_freeR   s   z.BaseTokenToKVPoolAllocator.merge_and_sort_freec                 O     t  r"   NotImplementedErrorr   argskwargsr   r   r   get_cpu_copyZ      z'BaseTokenToKVPoolAllocator.get_cpu_copyc                 O  r6   r"   r7   r9   r   r   r   load_cpu_copy^   r=   z(BaseTokenToKVPoolAllocator.load_cpu_copyc                 O     t d)Nz(alloc_extend is only for paged allocatorr7   r9   r   r   r   alloc_extendb      z'BaseTokenToKVPoolAllocator.alloc_extendc                 O  r?   )Nz(alloc_decode is only for paged allocatorr7   r9   r   r   r   alloc_decodee   rA   z'BaseTokenToKVPoolAllocator.alloc_decodec                 C  r6   r"   r7   r    r   r   r   clearh   r=   z BaseTokenToKVPoolAllocator.clear	need_sizec                 C  r6   r"   r7   )r   rD   r   r   r   allocl   r=   z BaseTokenToKVPoolAllocator.alloc
free_indextorch.Tensorc                 C  r6   r"   r7   r   rF   r   r   r   r,   p   r=   zBaseTokenToKVPoolAllocator.freeNr	   r
   r   r
   r   r   r   r   r   r   r   r   )r   r   rD   r
   rF   rG   )__name__
__module____qualname__abcabstractmethodr   r!   r$   r%   r(   r)   r+   r/   r5   r<   r>   r@   rB   rC   rE   r,   r   r   r   r   r   #   s*    

r   c                      sV   e Zd ZdZd fddZdd Zdd ZdddZdddZdd Z	dd Z
  ZS ) TokenToKVPoolAllocatorz3An allocator managing the indices to kv cache data.r	   r
   r   r   r   r   r   r   r   r   c                   s"   t  |d|||| |   d S )N   )superr   rC   )r   r	   r   r   r   r   	__class__r   r   r   x   s   zTokenToKVPoolAllocator.__init__c                 C  D   t jd| jd t j| jd| _d| _g | _t jdt j| jd| _	d S NrR   r1   Tr0   )
r-   aranger	   int64r   r   r   r   r3   r   r    r   r   r   rC         zTokenToKVPoolAllocator.clearc                 C  s   t | jt | j S r"   )r#   r   r   r    r   r   r   r$      s   z%TokenToKVPoolAllocator.available_sizerD   c                 C  sP   | j r|t| jkr|   |t| jkrd S | jd | }| j|d  | _|S r"   )r   r#   r   r5   )r   rD   select_indexr   r   r   rE      s   zTokenToKVPoolAllocator.allocrF   rG   c                 C  sX   |  dkrd S | jr$| jrt| j|f| _d S t| j|f| _d S | j| d S Nr   )	numelr   r   r-   r.   r   r   r   appendrH   r   r   r   r,      s   zTokenToKVPoolAllocator.freec                 C     | j |S r"   r   r<   r   indicesr   r   r   r<      r*   z#TokenToKVPoolAllocator.get_cpu_copyc                 C     | j ||S r"   r   r>   r   kv_cache_cpurb   r   r   r   r>         z$TokenToKVPoolAllocator.load_cpu_copy)
r	   r
   r   r   r   r   r   r   r   r   rJ   rK   )rL   rM   rN   __doc__r   rC   r$   rE   r,   r<   r>   __classcell__r   r   rT   r   rQ   u   s    	

rQ   c                 C  s   ||  }t |d}|| }	|| d | | | d |  }
|| | | d |  }|
| }t |
d}||
 }t j||t jd}tt| D ]}t|| | | | d | | | |  }|ry|| d |d | d ||	| |	| | < | | | || krqG|| | | | | d |  | }|r||| || ||   | }|dd|dd d||	| | |	| | | < | | | | || krqG|| || | |  }|r||| d  | |d |  d||| | || < qGd S )Nr   rR   )r   r   )r-   cumsumrX   int32ranger#   minview)prefix_lensseq_lenslast_locr   out_indicesr   r   extend_lensend_pos	start_posnum_new_pagesnum_full_new_pages	need_pageend_new_pagesstart_new_pagespos_in_pageinum1num2pagesnum3r   r   r   alloc_extend_naive   s`   	

r   bs_uppertl.constexprr   max_num_extend_tokensc                  C  s&  t d}t d|}	t j||	 |	|kd}
t j| |	 |	|kd}|
| }t || }t | | }|| }t |}|| }|
| d | }|| d | }|| }|| d | || d |  }t |}|| }t || }t||| d | | | }t d|}t j|| | |d | ||k d || |krd S || | || d | |  }t d|}t j|| ||  ||k d}t j|| | | || ||  ||k d || | |krd S ||| |  }t || | d }t j|| | | | || | ||k d d S Nr   )maskrR   )tl
program_idrX   loadsumrn   store) pre_lens_ptrseq_lens_ptrlast_loc_ptrfree_page_ptrrs   r   r   r   pidload_offsetrq   pre_lensrt   seq_lenpre_len
extend_lensum_extend_lensoutput_start_locnum_pages_afternum_pages_beforerw   num_page_start_loc_selfsum_num_new_pagesnew_page_start_locrr   	num_part1offset_one_page	num_part2offset_many_page
page_start	num_part3	start_locr   r   r   alloc_extend_kernel   sn   








r   c                 C  s  t d}t d|}t j| | ||kd}t ||k|d |}	t | | }
|
d }|| d | }|	| d | }|| }|
| d | || d |  }t |}|| }|dkrpt || }t || |d  d S t || }t || ||  d S r   )r   r   rX   r   wherer   r   )r   r   r   rs   r   r   r   r   rq   r   r   r   r   r   rw   r   r   r   rr   pager   r   r   alloc_decode_kernel:  s(   
	

r   c                      sb   e Zd ZdZd& fddZd'ddZd(ddZd)ddZd*ddZd d! Z	d"d# Z
d$d% Z  ZS )+PagedTokenToKVPoolAllocatorz
    An allocator managing the indices to kv cache data.

    This class has the same interface as `TokenToKVPoolAllocator` but the output
    of one request is always page-aligned.

    TODO: fuse last_loc into the kernel.
    r	   r
   r   r   r   r   r   r   r   r   r   c                   s<   t  |||||| || | _td| _d| _|   d S )NSGLANG_DEBUG_MEMORY_POOLrR   )rS   r   	num_pagesr   
debug_mode*seen_max_num_extend_tokens_next_power_of_2rC   r   rT   r   r   r   h  s
   	

z$PagedTokenToKVPoolAllocator.__init__rD   c                 C  s   | j r|| j dksJ d|| j }| jr!|t| jkr!|   |t| jkr*d S | jd | }| j|d  | _|d d d f | j tj| j| jd 	d}|S )Nr   z*The allocation size should be page-aligned)r   rj   )
r   r   r   r#   r   r5   r-   rX   r   reshape)r   rD   r   	out_pagesrs   r   r   r   rE   w  s"   
z!PagedTokenToKVPoolAllocator.allocrp   rG   prefix_lens_cpurq   seq_lens_cpurr   extend_num_tokensc           
   	   C  s0  | j rt|d | j || j ksJ t| jttjj	t
|| _t|}| jr;|| j | d t| jkr;|   tj|ftj| jd}|tjj	k r`t|f |||| j|t
|| j| j nt|||| j|| j| j | j r}tt|t|ks}J t|| j|d}	|	t| jkrd S | j|	d  | _|S )NrR   r1   )rq   r   rp   )r   r-   allr   maxr   rn   r   coreTRITON_MAX_TENSOR_NUMELr   r#   r   r   r5   r3   rY   r   r   r   uniquer   )
r   rp   r   rq   r   rr   r   bsrs   rw   r   r   r   r@     s^   	
z(PagedTokenToKVPoolAllocator.alloc_extendc                 C  s   | j rt|d | j || j ksJ t|}| jr&|t| jkr&|   tj|ftj	| j
d}t|f ||| j|t|| j | j rQtt|t|ksQJ t|| jdd}|t| jkrbd S | j|d  | _|S )N   r1   T)rq   r   decode)r   r-   r   r   r#   r   r   r5   r3   rY   r   r   r   r   r   )r   rq   r   rr   r   rs   rw   r   r   r   rB     s6   	z(PagedTokenToKVPoolAllocator.alloc_decoderF   c                 C  s   |  dkrd S | jr*t|| j }| jr t|| jf| _nt|| jf| _n| j	
| | jrBtt| jt| jksDJ d S d S r\   )r]   r   r-   r   r   r   r.   r   r   r   r^   r   r#   )r   rF   free_page_indicesr   r   r   r,     s   z PagedTokenToKVPoolAllocator.freec                 C  rV   rW   )
r-   rX   r   rY   r   r   r   r   r3   r   r    r   r   r   rC     rZ   z!PagedTokenToKVPoolAllocator.clearc                 C  r_   r"   r`   ra   r   r   r   r<     r*   z(PagedTokenToKVPoolAllocator.get_cpu_copyc                 C  rc   r"   rd   re   r   r   r   r>     rg   z)PagedTokenToKVPoolAllocator.load_cpu_copyrI   rJ   )rp   rG   r   rG   rq   rG   r   rG   rr   rG   r   r
   )rq   rG   r   rG   rr   rG   rK   )rL   rM   rN   rh   r   rE   r@   rB   r,   rC   r<   r>   ri   r   r   rT   r   r   ^  s    	


A
'	r   )r   r   r   r   r   r   )r   r   r   r   )
__future__r   rO   typingr   r-   tritontriton.languagelanguager   sglang.srt.utilsr   r   r    sglang.srt.mem_cache.memory_poolr   ABCr   rQ   r   jitr   r   r   r   r   r   r   <module>   s&    R9<O#