o
    پi                     @  s  d dl mZ d dlZd dlmZ d dlmZ edZee	Z
	 d dlZd dlZd dlmZmZ d dlmZ d dlmZmZ d d	lmZmZmZmZmZmZ d dlZd d
lmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ erd dl0m1Z1 e2ej34ddZ5e2ej34ddZ6e2ej34ddZ7dZ8G dd deZ9G dd deZ:G dd dZ;G dd deZ<G d d! d!Z=dS )"    )annotationsN) PrefillDelayerSinglePassExecutor)get_bool_env_var#SGLANG_ROUTING_KEY_POLICY_DEBUG_LOG)Counterdefaultdict)contextmanager)Enumauto)TYPE_CHECKINGDictListOptionalSetUnion)
DllmConfig)is_nsa_prefill_cp_in_seq_split)ReqScheduleBatch)BasePrefixCacheInsertParamsMatchPrefixParams)
RadixCacheRadixKeyTreeNode)SWATokenToKVPoolAllocator)
ServerArgs)BaseTokenToKVPoolAllocator%SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION4096'IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD32.IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD   c                   @  s   e Zd ZdZdZdZdS )CacheAwarePolicyz5Scheduling policies that are aware of the tree cache.lpmz
dfs-weightN)__name__
__module____qualname____doc__LPM
DFS_WEIGHT r,   r,   W/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/managers/schedule_policy.pyr$   M   s    r$   c                   @  s    e Zd ZdZdZdZdZdZdS )CacheAgnosticPolicyz9Scheduling policies that are not aware of the tree cache.fcfslofrandomzrouting-keyN)r&   r'   r(   r)   FCFSLOFRANDOMROUTING_KEYr,   r,   r,   r-   r.   T   s    r.   c                   @  s   e Zd Zeeef Zd:d	d
Z	d;d<ddZd=ddZ	d>ddZ
d?ddZed@ddZedAd d!ZedBd$d%ZedCd&d'ZedDd(d)ZedEd+d,ZedFd1d2ZedGd8d9ZdS )HSchedulePolicypolicystr
tree_cacher   enable_hierarchical_cacheboolenable_priority_scheduling"schedule_low_priority_values_firstc                 C  sB   |  ||| _|| _|| _|| _|| _|rdnd| _t | _	d S )Nr#   )
_validate_and_adjust_policyr7   r9   r:   r<   r=   priority_signr   create_simulatedwaiting_queue_radix_tree)selfr7   r9   r:   r<   r=   r,   r,   r-   __init__`   s   zSchedulePolicy.__init__Nwaiting_queue	List[Req]running_batchOptional[ScheduleBatch]returnc                 C  s  | j tjkr| jrt|| j dS | |}d}t|t	rHd}| 
||}|t	jkr3t|| |S |t	jkrAt|| j |S td||tjkrP	 |S |tjkr`t|| j| j |S |tjkrlt| |S |tjkr}|d ur{t|| |S td|)NFTz"Unknown CacheAware Policy: policy=z%Unknown CacheAgnostic Policy: policy=)r7   r.   r2   r<   r6   _sort_by_priority_and_fcfsr@   _determine_active_policy
isinstancer$   _compute_prefix_matchesr*   _sort_by_longest_prefixr+   _sort_by_dfs_weightr9   
ValueErrorr3   _sort_by_longest_outputr4   _sort_randomlyr5   _sort_by_routing_key)rC   rE   rG   r7   prefix_computedtemporary_deprioritizedr,   r,   r-   calc_priorityr   sP   








zSchedulePolicy.calc_priorityPolicyc                 C  s$   | j tjkrt|dkrtjS | j S )N   )r7   r$   r*   lenr.   r2   )rC   rE   r,   r,   r-   rK      s   z'SchedulePolicy._determine_active_policyc                 C  s^   zt |}t|ddrtjW S |W S  ty.   zt|W  Y S  ty-   td|w w )z`
        Validates the policy and adjusts it if necessary based on tree cache settings.
        disableTz Unknown schedule_policy: policy=)r$   getattrr.   r2   rP   )rC   r7   r9   policy_enumr,   r,   r-   r?      s   z*SchedulePolicy._validate_and_adjust_policyr$   Set[int]c           	   	   C  s   t  }| j  |D ]d}|j|j }|j}| jtt	||dd}|j
|j|j|jf\|_|_|_|_t|jtkrn| jtt	||dd}|j
}t|tkrX||j q
| jtt	||dtjt|tjdd q
|S )z
        Computes and caches the matching prefixes for requests in the waiting queue,
            and handles in-batch prefix caching logic.
        )	token_ids	extra_keykey)dtype)ra   value)setrB   resetorigin_input_ids
output_idsr_   r9   match_prefixr   r   device_indiceslast_device_nodelast_host_nodehost_hit_lengthprefix_indices	last_noderY   r    r"   addridinsertr   torchemptyr;   )	rC   rE   r7   rU   r
prefix_idsr_   match_resultin_batch_matching_prefixesr,   r,   r-   rM      sN   



z&SchedulePolicy._compute_prefix_matchesrU   Nonec                      | j  fddd dS )z:Sorts the waiting queue based on the longest prefix match.c                   s   | j  vrt| j S tdS )Ninf)rp   rY   rm   float)rt   rU   r,   r-   <lambda>   s   
z8SchedulePolicy._sort_by_longest_prefix.<locals>.<lambda>r`   Nsort)rE   rU   r,   r|   r-   rN         

z&SchedulePolicy._sort_by_longest_prefixc                 C  sp   t t}| D ]
}||j | qt t}|D ]
}t|| ||< qt|j| | 	  t
|j|||  dS )z@Sorts the waiting queue based on a depth-first search weighting.N)r   listrn   appendintrY   r6   _calc_weight	root_nodeclear_get_dfs_priority)rE   r9   last_node_to_reqsreqnode_to_weightnoder,   r,   r-   rO      s   z"SchedulePolicy._sort_by_dfs_weightr@   r   c                   s0   |r| j  fddd dS | j dd d dS )z{Sorts the waiting queue based on the longest output (max_new_tokens). If using priority scheduling, sort by priority first.c                   s   | j   | jj fS N)prioritysampling_paramsmax_new_tokensxr@   r,   r-   r}     s   z8SchedulePolicy._sort_by_longest_output.<locals>.<lambda>r`   c                 S  s
   | j j S r   )r   r   r   r,   r,   r-   r}   $     
 Nr~   )rE   r<   r@   r,   r   r-   rQ     s
   

z&SchedulePolicy._sort_by_longest_outputc                 C  s   t |  dS )z$Shuffles the waiting queue randomly.N)r1   shuffle)rE   r,   r,   r-   rR   &  s   zSchedulePolicy._sort_randomlyc                   ry   )zOSorts the waiting queue based on the request priority then received titmestamp.c                   s   | j   | jjfS r   r   
time_statswait_queue_entry_timer   r   r,   r-   r}   1  s   z;SchedulePolicy._sort_by_priority_and_fcfs.<locals>.<lambda>r`   Nr~   )rE   r@   r,   r   r-   rJ   +  r   z)SchedulePolicy._sort_by_priority_and_fcfsr   c                   s   t dd |jD  tr dd | D }tdt  d|   s$dS d fd
d}| j|d trDdd | D }td|  dS dS )z>Sorts waiting queue by routing key frequency in running batch.c                 s  s    | ]	}|j r|j V  qd S r   routing_key.0rt   r,   r,   r-   	<genexpr><  s    
z6SchedulePolicy._sort_by_routing_key.<locals>.<genexpr>c                 S     g | ]}|j qS r,   r   r   r,   r,   r-   
<listcomp>A      z7SchedulePolicy._sort_by_routing_key.<locals>.<listcomp>zrouting_key_counts=z, waiting_keys_before=Nr   r   c                   s4   | j }|r| v r | }d| |fS dd|pdfS )Nr   r#    r   )r   ra   countrouting_key_countsr,   r-   sort_keyJ  s
   z5SchedulePolicy._sort_by_routing_key.<locals>.sort_keyr`   c                 S  r   r,   r   r   r,   r,   r-   r   U  r   zwaiting_keys_after=r   r   )r   reqs_ROUTING_KEY_POLICY_DEBUG_LOGloggerinfodictr   )rE   rG   waiting_keys_beforer   waiting_keys_afterr,   r   r-   rS   7  s$   z#SchedulePolicy._sort_by_routing_keycur_noder   r   Dict[TreeNode, int]c                 C  s4   | j  D ]}t|| ||   || 7  < qd S r   )childrenvaluesr6   r   )r   r   childr,   r,   r-   r   X  s   zSchedulePolicy._calc_weightnode_to_priorityr   Dict[TreeNode, List[Req]]qr   c                   sT   dd | j  D }|j fddd |D ]
}t| || q|||   d S )Nc                 S  s   g | ]}|qS r,   r,   )r   r   r,   r,   r-   r   e  s    z4SchedulePolicy._get_dfs_priority.<locals>.<listcomp>c                   s
    |   S r   r,   r   r   r,   r-   r}   f  r   z2SchedulePolicy._get_dfs_priority.<locals>.<lambda>r`   )r   r   r   r6   r   extend)r   r   r   r   r   r   r,   r   r-   r   ^  s   z SchedulePolicy._get_dfs_priority)
r7   r8   r9   r   r:   r;   r<   r;   r=   r;   r   )rE   rF   rG   rH   rI   r;   )rE   rF   rI   rW   )r7   r8   r9   r   rI   rW   )rE   rF   r7   r$   rI   r]   )rE   rF   rU   r]   rI   rx   )rE   rF   r9   r   rI   rx   )rE   rF   r<   r;   r@   r   rI   rx   )rE   rF   rI   rx   )rE   rF   r@   r   rI   rx   )rE   rF   rG   r   rI   rx   )r   r   r   r   rI   rx   )
r   r   r   r   r   r   r   r   rI   rx   )r&   r'   r(   r   r$   r.   rW   rD   rV   rK   r?   rM   staticmethodrN   rO   rQ   rR   rJ   rS   r   r   r,   r,   r,   r-   r6   ]   s0    

,

< r6   c                   @  s   e Zd Ze Ze Ze ZdS )AddReqResultN)r&   r'   r(   r
   CONTINUENO_TOKENOTHERr,   r,   r,   r-   r   n  s    
r   c                   @  s   e Zd Z					dHdIddZdJddZdKdd Zed!d" Zed#d$ ZdLd&d'Z	d(d) Z
dMd-d.ZdNd/d0ZdOd1d2ZdPd3d4ZdPd5d6ZdPd7d8ZedQd;d<ZdPd=d>ZdRdBdCZdSdFdGZdS )TPrefillAdderr   N	page_sizer   r9   r   token_to_kv_pool_allocatorr   rG   r   new_token_ratior{   rem_input_tokensrem_chunk_tokensOptional[int]mixed_with_decode_tokens(priority_scheduling_preemption_thresholdprefill_max_requestsprefill_delayer_single_pass*Optional[PrefillDelayerSinglePassExecutor]dllm_configOptional[DllmConfig]c                   s   | _ | _| _| _| _||  _| _| _ jd ur$ |  jd ur0  j|8  _| _	| _
d  _g  _g  _d  _d _d _|d ur]  j	t fdd|jD 7  _	t jt _ j  _|	 _t  _|
 _| _d S )Nr   c                   s   g | ]}  |qS r,   )'_get_running_request_total_token_offsetr   rC   r,   r-   r     s    z)PrefillAdder.__init__.<locals>.<listcomp>)r   r9   r   rG   r   r   r   r   _init_dllm_metarem_total_token_offsetcur_rem_token_offset
req_statescan_run_listpreempt_listnew_chunked_reqlog_hit_tokenslog_input_tokenssumr   rL   r   is_hybrid_swasupports_mambais_hybrid_ssm_cacher   r   nsa_prefill_cp_in_seq_splitr   r   )rC   r   r9   r   rG   r   r   r   r   r   r   r   r   r,   r   r-   rD   u  sF   





zPrefillAdder.__init__r   c                 C  s   |j | _|j}|| j | _d S r   )
block_sizedllm_block_sizemax_running_requestsrem_dllm_tokens)rC   r   max_running_reqsr,   r,   r-   r     s   zPrefillAdder._init_dllm_metar   r   rI   c                 C  s   t |jjt|j t| j S r   )minr   r   rY   rg   CLIP_MAX_NEW_TOKENSr   )rC   r   r,   r,   r-   r     s   z4PrefillAdder._get_running_request_total_token_offsetc                 C  l   | j rt| j | j  | j | j  }n| jr'| j	 | j  }n
| j	 | j
  }|| j S r   )r   r   r   full_available_sizer9   full_evictable_sizeswa_available_sizeswa_evictable_sizer   available_sizeevictable_sizer   rC   available_and_evictabler,   r,   r-   rem_total_tokens  s&   
zPrefillAdder.rem_total_tokensc                 C  r   r   )r   r   r   r   r9   r   r   r   r   r   r   r   r   r,   r,   r-   cur_rem_tokens  s&   
zPrefillAdder.cur_rem_tokenstokensc                 C  s   | | j   | j  S r   )r   )rC   r   r,   r,   r-   ceil_paged_tokens  s   zPrefillAdder.ceil_paged_tokensc                 C  sj   | j dks
| jdkrtjS | jdkrtjS | jd ur%| jdkr"tjS tj	S | jd ur2| jdkr2tjS tj	S Nr   )
r   r   r   r   r   r   r   r   r   r   r   r,   r,   r-   budget_state  s   


zPrefillAdder.budget_state
prefix_lenextend_input_lenr   c                 C  s   |  |}|  j|| 7  _|  j|7  _|  j|8  _| jd ur)|  j|8  _n| jd ur5|  j|8  _|  j|7  _|  j|7  _d S r   )	r   r   r   r   r   r   r   r   r   )rC   r   r   r   r,   r,   r-   _update_prefill_budget  s   


z#PrefillAdder._update_prefill_budgetc                 C  s(   t | j| jt| j}|dkr| j}|S r   )r   r   r   r   r   )rC   _rem_tokensr,   r,   r-   _get_dllm_remain_tokens  s   z$PrefillAdder._get_dllm_remain_tokensc                 C  sR   t | j| j| j | j }||_|jd ||  |_| j| | ||d d S r   )	r   r   r   r   r   fill_idsr   r   r   )rC   r   r   	trunc_lenr,   r,   r-   _add_dllm_req  s   zPrefillAdder._add_dllm_reqc                 C  s0   | j r| j|j}||_d S | j|j d S r   )r   r9   inc_lock_refrn   swa_uuid_for_lock)rC   r   r   r,   r,   r-   _req_inc_lock_ref,  s   
zPrefillAdder._req_inc_lock_refc                 C  s   | j d usJ |  }|dkrtjS |j|k}t|j||_|jd t|j|j  |_| j	
| |s;t|jjtnd}| d|j| |  dkrNtjS tjS r   )r   r   r   r   r   r   r   rY   rm   r   r   r   r   r   r   r   )rC   r   r   	truncatedr   r,   r,   r-   add_dllm_staging_req3  s$   
z!PrefillAdder.add_dllm_staging_reqc                 C  s   | j d ur
|  }nt| jt| j}|dkr| j}|j|k}|t|j| |jd t	|j
|j  |_| j| | d|j|sJt|jjtnd |rQ|S d S r   )r   r   r   r   r   r   r   set_extend_input_lenr   rY   rm   r   r   r   r   r   r   )rC   r   r   r   r,   r,   r-   add_chunked_reqO  s"   


zPrefillAdder.add_chunked_reqrn   r   c              
   c  s    z6| j  r| j  r| j |}n| j | d V  W | j  r0| j  r0| j || d S | j | d S | j  rJ| j  rJ| j || w | j | w r   )r9   supports_swais_tree_cacher   dec_lock_ref)rC   rn   r   r,   r,   r-   
_lock_nodej  s   zPrefillAdder._lock_nodec                   s    |jt j jkrtjS d
 fdd	} jd u rGg  _||  jd ur3 jj	D ]}|| q, j
D ]}|| q6 jjdd d n||dd  js j  |j }d	}t jD ]$\}\}}t j| }	|| ||	  }
|
t|	 krtj  S ||7 }q` jd ur jd	krtjS  |d	   S  jd u s|j jkr j
|  d	|jt|jjt   S  jd	krtjS  j}|| |jd | |_ j
| | _ d	|d	   S )NFc                   s   | j jrdn j}| j j| t| j }t| jt| j }|dkr$d S |s0 j||f d S d}t	t jD ]}| j| d krF nq9 j
|||f d S )Ng      ?r   )r   
ignore_eosr   r   rY   rg   rf   r   r   rangerq   )rt   insert_sortr   tokens_lefttokens_occupiedir   r,   r-   add_req_state  s    z:PrefillAdder.add_one_req_ignore_eos.<locals>.add_req_statec                 S  s   | d S r   r,   r   r,   r,   r-   r}     s    z5PrefillAdder.add_one_req_ignore_eos.<locals>.<lambda>r`   T)r  r   )F)r   r   r   r   r   r   r   r   rG   r   r   r   r   	enumeraterY   IGNORE_EOS_RESERVE_TOKENSr   r   r   r   r   r   r   r   r   r   r   r   r   r   )rC   r   r	  rt   r   tokens_freedr  r  r  bsmin_free_tokensr   r,   r   r-   add_one_req_ignore_eosx  sb   











z#PrefillAdder.add_one_req_ignore_eoshas_chunked_reqr;   truncation_align_sizec                 C  s`  | j rt| jdkrtjS | j }d urt| j|krtjS |jjr.t| j	ddr.| 
|S |jtt|jjt|j dt }|j|j }| |}t|j}|| jkrXtjS || jkrgt| jdkrgtjS | |j5 || jkr~tjW  d    S |jdkr| j	|j|j\}|_t|j|g|_|t|jt|j  t|j}||_| |j}	|	| jkrt| jdkrtjW  d    S | j d ur| j j!ddstjW  d    S | j"d ur| j#dkrtjW  d    S |d u sJ d| $|| | %| n| j&d u s|	| j&kr,| j'| | %| | (||	t|jjt nm| j&| j) | j) }
|
dkrDtjW  d    S |d ur^|
|k rXtjW  d    S ||
|  }
||
 |jd t|j|
  |_| j'| || _*| %| | (||
d W d    | + S W d    | + S W d    | + S 1 sw   Y  | + S )Nr#   rZ   Tr   )local_prefillablez7truncation_align_size is not supported for dllm prefill),r   rY   r   r   r   r   r   r  r[   r9   r  r   r   maxr   rg   r   rl   r   rm   r   r   r   r  rn   init_load_backrk   rr   catr   r   cache_protected_lenr   negotiate_should_allow_prefillr   r   r   r   r   r   r   r   r   r   )rC   r   r  r  r   total_tokensreal_input_tokensr   new_indicesinput_tokensr   r,   r,   r-   add_one_req  s   














;
?


M
M
MMzPrefillAdder.add_one_reqserver_argsr   c                   sJ  |j rdnd fddjjD }t| fddd}g }|jt|jjt j	 }|D ]#}|j
|j
    }|jkrP|| ||8 }|dkrO nq- t|dks[|dkr]d	S t|}g }	d}
tjjD ]*\}}||v r j|8  _|
d7 }
j|tjj|
 | qk|	| qkjj|	d
 j| dS )z
        Preempt running requests to serve the new request if the priority threshold is met and token count sum is verified.
        Returns True if preemption was committed, and the new request can be scheduled.
        r#   r>   c                 3  s&    | ]}| j vr| s|V  qd S r   )r   finishedr   r   r,   r-   r   K  s    z3PrefillAdder.preempt_to_schedule.<locals>.<genexpr>c                   s   | j    | jj fS r   r   r   r   r,   r-   r}   S  s   
z2PrefillAdder.preempt_to_schedule.<locals>.<lambda>r`   r   F)keep_indicesT)r=   rG   r   sortedr   r   r   r   r   r   r   r   r   r   rY   rd   r
  r   release_reqfilter_batchr   r   )rC   r   r  valid_running_reqssorted_valid_running_reqspreemptible_reqsmin_tokens_to_removerunning_reqpriority_diffr  release_counterr  r,   )r@   rC   r-   preempt_to_schedule=  sX   



z PrefillAdder.preempt_to_schedule)r   r   NNN)r   r   r9   r   r   r   rG   r   r   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   )r   r   rI   r   )r   r   rI   r   )r   r   r   r   r   r   )rI   r   )r   r   r   r   r   )rn   r   )r   r   r  r;   r  r   )r   r   r  r   rI   r;   )r&   r'   r(   rD   r   r   propertyr   r   r   r   r   r   r   r   r   r   r   r  r  r  r*  r,   r,   r,   r-   r   t  s4    

<
	










Wnr   )>
__future__r   logging#sglang.srt.managers.prefill_delayerr   sglang.srt.utilsr   r   	getLoggerr&   r   osr1   collectionsr   r   
contextlibr   enumr	   r
   typingr   r   r   r   r   r   rr   sglang.srt.dllm.configr   %sglang.srt.layers.attention.nsa.utilsr   "sglang.srt.managers.schedule_batchr   r   &sglang.srt.mem_cache.base_prefix_cacher   r   r    sglang.srt.mem_cache.radix_cacher   r   r   $sglang.srt.mem_cache.swa_memory_poolr   sglang.srt.server_argsr   sglang.srt.mem_cache.allocatorr   r   environgetr   r    r"   r  r$   r.   r6   r   r   r,   r,   r,   r-   <module>   sP    
 	  