o
    پi7z                    @  s`  d dl mZ d dlZd dlmZ d dlmZ d dlmZ 	 d dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZmZmZmZ d dlZ d dl!Z!d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z2 d dl3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZDmEZEmFZF d dlmGZGmZmHZH d dlImJZJ d dlKmLZL d dlMmNZNmOZO d dlPmQZQ d dlRmSZS erd d lmZmZ d d!lTmUZU d d"lVmWZW d d#lXmYZY d d$lZm[Z[m\Z\ d%Z]d&Z^e_e`Zaed'd(dNd-d.ZbdOd0d1ZcG d2d3 d3ZdG d4d5 d5edZeG d6d7 d7edZfG d8d9 d9edZgG d:d; d;edZhG d<d= d=edZiG d>d? d?eZjG d@dA dAeZke
jlG dBdC dCZme
jlG dDdE dEZnG dFdG dGeoejZpG dHdI dIe-Zqe
jlG dJdK dKe'Zre
jlG dLdM dMZsdS )P    )annotationsN)
DllmConfig)ForwardBatch)
ceil_align)Enumauto)	lru_cache)
HTTPStatus)chain)TYPE_CHECKINGAnyDictListOptionalSetTupleUnion)BaseGrammarObject)BaseKVSender)&ScheduleBatchDisaggregationDecodeMixin)DisaggregationMode)get_tensor_model_parallel_rank)ReqDllmMixin)envs)
CHUNK_SIZE)BaseTokenToKVPoolAllocator)BasePrefixCacheMatchPrefixParams)alloc_for_decodealloc_for_extendevict_from_tree_cacherelease_kv_cache)ReqToTokenPool)RadixKey)SWATokenToKVPoolAllocator)DPCooperationInfoSchedulerMetricsCollector	TimeStats)CaptureHiddenModer   ForwardMode)SamplingBatchInfo)SamplingParams)
ServerArgsget_global_server_args)flatten_nested_list)CudaIpcTensorTransportProxy)r   r   )ModelConfig)PrefillStats)EagleDraftInput)	SpecInputSpeculativeAlgorithm   i@B    )maxsize
vocab_sizeintreturnNonec                 C  s"   | t krtd|  dt  dd S )NzModel vocab_size (z) exceeds MM_PAD_SHIFT_VALUE (zk). MM pad_values may overlap with valid token IDs. Please increase MM_PAD_SHIFT_VALUE in schedule_batch.py.)MM_PAD_SHIFT_VALUE
ValueError)r8    r>   V/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/managers/schedule_batch.pysanity_check_mm_pad_shift_valuel   s
   r@   hashc                 C  s   t | d  S )zCompute pad value from hash.i   @)r<   )rA   r>   r>   r?   _compute_pad_valuev   s   rB   c                   @  s    e Zd Zd	d
ddZdd ZdS )BaseFinishReasonFis_errorboolc                 C  s
   || _ d S NrD   )selfrD   r>   r>   r?   __init__|      
zBaseFinishReason.__init__c                 C  s   t  rF   )NotImplementedErrorrH   r>   r>   r?   to_json   s   zBaseFinishReason.to_jsonN)F)rD   rE   )__name__
__module____qualname__rI   rM   r>   r>   r>   r?   rC   {   s    rC   c                      &   e Zd Zd fddZdd Z  ZS )FINISH_MATCHED_TOKENmatchedUnion[int, List[int]]c                      t    || _d S rF   superrI   rS   rH   rS   	__class__r>   r?   rI         

zFINISH_MATCHED_TOKEN.__init__c                 C     d| j dS Nstop)typerS   rS   rL   r>   r>   r?   rM         zFINISH_MATCHED_TOKEN.to_json)rS   rT   rN   rO   rP   rI   rM   __classcell__r>   r>   rY   r?   rR          rR   c                      rQ   )FINISH_MATCHED_STRrS   strc                   rU   rF   rV   rX   rY   r>   r?   rI      r[   zFINISH_MATCHED_STR.__init__c                 C  r\   r]   r`   rL   r>   r>   r?   rM      ra   zFINISH_MATCHED_STR.to_jsonrS   rf   rb   r>   r>   rY   r?   re      rd   re   c                      rQ   )FINISHED_MATCHED_REGEXrS   rf   c                   rU   rF   rV   rX   rY   r>   r?   rI      r[   zFINISHED_MATCHED_REGEX.__init__c                 C  r\   r]   r`   rL   r>   r>   r?   rM      ra   zFINISHED_MATCHED_REGEX.to_jsonrg   rb   r>   r>   rY   r?   rh      rd   rh   c                      rQ   )FINISH_LENGTHlengthr9   c                   rU   rF   )rW   rI   rj   )rH   rj   rY   r>   r?   rI      r[   zFINISH_LENGTH.__init__c                 C  r\   )Nrj   )r_   rj   rj   rL   r>   r>   r?   rM      ra   zFINISH_LENGTH.to_json)rj   r9   rb   r>   r>   rY   r?   ri      rd   ri   c                      s&   e Zd Zd fdd	Zdd Z  ZS )FINISH_ABORTNc                   s(   t  jdd |p
d| _|| _|| _d S )NTrG   Aborted)rW   rI   messagestatus_codeerr_type)rH   rn   ro   rp   rY   r>   r?   rI      s   

zFINISH_ABORT.__init__c                 C  s   d| j | j| jdS )Nabort)r_   rn   ro   rp   )rn   ro   rp   rL   r>   r>   r?   rM      s
   zFINISH_ABORT.to_json)NNNrb   r>   r>   rY   r?   rl      s    rl   c                   @  s>   e Zd Ze Ze Ze Ze ZedddZ	edd Z
dS )	Modalitymodality_strrf   c              	   C  s<   zt |   W S  ty   td|  ddd t D  w )NzInvalid modality string: z. Valid modalities are: c                 S     g | ]}|j qS r>   )name).0mr>   r>   r?   
<listcomp>       z%Modality.from_str.<locals>.<listcomp>)rr   upperKeyErrorr=   )rs   r>   r>   r?   from_str   s   zModality.from_strc                   C  s   t jt jt jgS rF   )rr   IMAGEVIDEOAUDIOr>   r>   r>   r?   all   s   zModality.allN)rs   rf   )rN   rO   rP   r   r}   MULTI_IMAGESr~   r   staticmethodr|   r   r>   r>   r>   r?   rr      s    rr   c                   @  s   e Zd Ze Ze Ze ZdS )MultimodalInputFormatN)rN   rO   rP   r   NORMALPROCESSOR_OUTPUTPRECOMPUTED_EMBEDDINGr>   r>   r>   r?   r      s    
r   c                   @  s   e Zd ZU dZded< dZded< dZded< dZded	< ej	Z
d
ed< dZded< dZded< ejedZded< d8ddZd9ddZd9ddZedd Zd d! Zd:d$d%Zd&d' Zd(d) Zd*d+ Zd;d,d-Zd.d/ Zd0d1 Zed<d4d5Zd6d7 ZdS )=MultimodalDataItema%  
    One MultimodalDataItem contains all inputs for one modality.
    For example, if there are 3 images and 1 audio inputs, there will be 2 MultimodalDataItem.
    One for images and one for audio.

    We put the common fields first and the model-specific fields in model_specific_data.
    rr   modalityNr9   rA   	pad_valueOptional[list]offsetsr   formatzUnion[torch.Tensor, np.ndarray]featurez)Optional[Union[torch.Tensor, np.ndarray]]precomputed_embeddings)default_factoryzdict[str, Any]model_specific_dataru   rf   c                 C  s@   d| j v r|| j d v r| j d | S td| jj d| d)Nr   'z' object has no attribute ')__dict__AttributeErrorrZ   rN   )rH   ru   r>   r>   r?   __getattr__   s   
zMultimodalDataItem.__getattr__keyvaluer   c                 C  s&   || j v r|| j |< d S || j|< d S rF   )r   r   rH   r   r   r>   r>   r?   __setitem__  s   
zMultimodalDataItem.__setitem__c                 C  s   |  || d S rF   )r   r   r>   r>   r?   set  s   zMultimodalDataItem.setc                 C  s&   | d u rdS t dd t| D dkS )NTc                 S  s   g | ]}|d ur|qS rF   r>   rv   itemr>   r>   r?   rx         z4MultimodalDataItem.is_empty_list.<locals>.<listcomp>r   )lenr.   )lr>   r>   r?   is_empty_list  s   z MultimodalDataItem.is_empty_listc                 C  s   | j durdS ddlm} tj r$ddl}| j| _	t
| j	| _ dS | j	du r:| jdur2| j}n| j}||| _	| j	dusAJ t
| j	| _ dS )z@
        Set the pad value after first hashing the data
        Nr   )hash_feature)r   sglang.srt.managers.mm_utilsr   r   SGLANG_MM_SKIP_COMPUTE_HASHgetuuiduuid4r9   rA   rB   r   r   )rH   r   r   hashed_featurer>   r>   r?   set_pad_value  s   




z MultimodalDataItem.set_pad_valuer:   rE   c                 C  s
   | j |kS rF   )r   )rH   r   r>   r>   r?   is_modality)  rJ   zMultimodalDataItem.is_modalityc                 C     | j tjkS rF   )r   rr   r   rL   r>   r>   r?   is_audio,     zMultimodalDataItem.is_audioc                 C  s   | j tjtjfv S rF   )r   rr   r}   r   rL   r>   r>   r?   is_image/  s   zMultimodalDataItem.is_imagec                 C  r   rF   )r   rr   r~   rL   r>   r>   r?   is_video2  r   zMultimodalDataItem.is_videoc                 C  s   |   p|  p|  S rF   )r   r   r   rL   r>   r>   r?   is_valid5  s   zMultimodalDataItem.is_validc                 C  s   d S rF   r>   rL   r>   r>   r?   validate8  s   zMultimodalDataItem.validatec                 C  r   rF   )r   r   r   rL   r>   r>   r?   is_precomputed_embedding<  r   z+MultimodalDataItem.is_precomputed_embeddingobjdictc                 C  sB   t | }|d}t|trt| }tdd|i|}|  |S )Nr   r>   )r   pop
isinstancerf   rr   r   r   )r   kwargsr   retr>   r>   r?   	from_dict?  s   

zMultimodalDataItem.from_dictc                 C  s>   |  j |j 7  _ |  j|j7  _t| j|jf| _|   d S rF   )r   r   rA   r   rH   otherr>   r>   r?   mergeI  s   zMultimodalDataItem.merge)ru   rf   )r   rf   r   r   )r   rr   r:   rE   r:   rE   r   r   ) rN   rO   rP   __doc____annotations__rA   r   r   r   r   r   r   r   dataclassesfieldr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r>   r>   r>   r?   r      s4   
 





	r   c                   @  s   e Zd ZU dZded< dZded< dZded< dZded	< dZded
< dZ	ded< dZ
ded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< ed&ddZd'ddZd'ddZd'dd Zd'd!d"Zd(d$d%ZdS ))MultimodalInputsz#The multimodal data related inputs.zList[MultimodalDataItem]mm_itemsNr   image_pad_lenOptional[int]num_image_tokensim_token_idim_start_id	im_end_idslice_start_idslice_end_idvideo_token_idaudio_token_idaudio_start_idaudio_end_idOptional[torch.Tensor]mrope_positionsmrope_position_deltar   r   c                 C  sV  t j s
| d }nddlm} | d }||}t|d}t|jts%J dd |jD |_t j	 dkroddlm
}m}m}m} tj rKtj nd}	| sT||	 |  |jD ]}
|
jd urnt|
jtjrn||
j|
_qZ|jD ]}
|
  qrt j	 dkr|jD ]}
|
jd ur|
jjdd	d
|
_qg d}|D ]}|| v rt||| |  q|S )Nr   r   )get_new_expanded_mm_items)r   c                 S  s   g | ]}|  r|qS r>   r   r   r>   r>   r?   rx   }  r   z.MultimodalInputs.from_dict.<locals>.<listcomp>)init_feature_bufferis_feature_buffer_initializedreset_buffer_offsettry_add_to_buffercpuTnon_blocking)r   r   r   r   r   r   r   r   r   r   r   )r   SGLANG_ENABLE_MM_SPLITTINGr   r   r   r   r   r   listSGLANG_MM_BUFFER_SIZE_MBr   r   r   r   torchcudais_availablecurrent_devicer   Tensorr   tosetattr)r   r   r   original_mm_itemsr   r   r   r   r   devicer   optional_argsargr>   r>   r?   r   l  sD   







zMultimodalInputs.from_dictr:   rE   c                 C     t dd | jD S )Nc                 s      | ]}|  V  qd S rF   )r   r   r>   r>   r?   	<genexpr>      z9MultimodalInputs.contains_image_inputs.<locals>.<genexpr>anyr   rL   r>   r>   r?   contains_image_inputs     z&MultimodalInputs.contains_image_inputsc                 C  r   )Nc                 s  r   rF   )r   r   r>   r>   r?   r     r   z9MultimodalInputs.contains_video_inputs.<locals>.<genexpr>r   rL   r>   r>   r?   contains_video_inputs  r   z&MultimodalInputs.contains_video_inputsc                 C  r   )Nc                 s  r   rF   )r   r   r>   r>   r?   r     r   z9MultimodalInputs.contains_audio_inputs.<locals>.<genexpr>r   rL   r>   r>   r?   contains_audio_inputs  r   z&MultimodalInputs.contains_audio_inputsc                 C  r   )Nc                 s  s    | ]	}|  rd V  qdS )TNr   r   r>   r>   r?   r     s    z5MultimodalInputs.contains_mm_input.<locals>.<genexpr>r   rL   r>   r>   r?   contains_mm_input  r   z"MultimodalInputs.contains_mm_inputr   c           	      C  s   ddg}|D ]}t | |d}|durt| ||t ||  q| j}|dur:|jdu r.|| _ntj| j|jgdd| _| j}|durV|jdu rJ|| _ntj| j|jgdd| _|j D ]\}}d|v rut | |ddu rut| |t ||d q[dS )zC
        merge image inputs when requests are being merged
        r   r   Nr6   )dimr   _id)getattrr   r   r   catr   r   items)	rH   r   r   r   self_argr   r   r   valr>   r>   r?   r     s8   

zMultimodalInputs.merger   r   )r   r   )rN   rO   rP   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r>   r>   r>   r?   r   P  s.   
 
A


r   c                   @  sP   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdS )RequestStagetokenizedispatchdc_dispatchprefill_waitingrequest_processdecode_loopprefill_forwardchunked_prefillprefill_prepareprefill_bootstrapprefill_transfer_kv_cachedecode_preparedecode_bootstrapdecode_waitingdecode_transferredfake_outputquick_finishN)rN   rO   rP   TOKENIZETOKENIZER_DISPATCHDC_DISPATCHPREFILL_WAITINGREQUEST_PROCESSDECODE_LOOPPREFILL_FORWARDPREFILL_CHUNKED_FORWARDPREFILL_PREPAREPREFILL_BOOTSTRAPPREFILL_TRANSFER_KV_CACHEDECODE_PREPAREDECODE_BOOTSTRAPDECODE_WAITINGDECODE_TRANSFERREDDECODE_FAKE_OUTPUTDECODE_QUICK_FINISHr>   r>   r>   r?   r     s$    r   c                   @  sD  e Zd ZdZ																											dodpd1d2Zedqd4d5Zedrd6d7Zedsd8d9Zdqd:d;Z	dtd=d>Z
dudAdBZdvdDdEZdFdG ZdrdHdIZdwdxdLdMZdNdO ZdydPdQZdrdRdSZdzdUdVZdWdX Zdwd{dYdZZd|d}d]d^Zd_d` Zdadb Zdcdd Zdedf Zd~dhdiZddkdlZdmdn ZdS )Reqz)The input and output status of a request.Fr   Nridrf   origin_input_textorigin_input_ids	List[int]sampling_paramsr+   return_logprobrE   top_logprobs_numr9   dllm_configOptional[DllmConfig]token_ids_logprobstreamorigin_input_ids_unpaddedOptional[Tuple[int]]lora_idOptional[str]input_embedsOptional[List[List[float]]]token_type_ids
session_idcustom_logit_processorrequire_reasoningreturn_hidden_statesreturn_routed_expertseos_token_idsOptional[Set[int]]bootstrap_hostbootstrap_portr   bootstrap_roomdisagg_modeOptional[DisaggregationMode]data_parallel_rankr8   prioritymetrics_collector#Optional[SchedulerMetricsCollector]	extra_keyrouting_key
dimensionshttp_worker_ipcc                   C  sP  || _ || _|
r
|
n|| _|| _g | _g | _|| _|| _d| _d| _	d| _
d| _|| _d| _d| _d| _|| _|| _t|jtrMt|}|jd| iB |_|| _|| _|| _|d ur`|p]d| }|| _|| _|| _d | _d | _d | _d | _d | _ d | _!d | _"d | _#d | _$d | _%d | _&|	| _'|| _(|| _)|| _*d | _+d | _,d| _-d | _.t/j0dt/j1d| _2d| _3d| _4d | _5d | _6d| _7d| _8d | _9d| _:d| _;d| _<d| _=d| _>d| _?d| _@|| _Ad| _B|| _C|| _Dd| _Ed| _Fd| _Gd | _Hd | _Id | _Jd | _Kd | _Ld | _Md | _Nd | _Od | _Pd | _Qd | _R|r!g | _Sg | _Tg | _Ug | _Vg | _Wg | _Xnd  | _S | _T | _U | _V | _W| _Xg | _Yd | _Zd | _[d | _\|| _]d | _^d | __d | _`d | _ad | _bd| _cd| _dd| _ed| _fd| _gd| _hd| _id| _jd| _kg | _ld| _md | _n|| _otp|d| _qd| _rtst | _u|| _v|| _w|| _xd | _y|| _zd| _{d| _|d| _}|| _~| | d S )	Nr   F__req__ r   dtype)r;  )r  r   r*  r!  
output_idsfill_idsr1  r.  kv_committed_lenkv_allocated_lenkv_committed_freedkv_overallocated_freedr0  swa_evicted_seqlenextend_batch_idxdecode_batch_idxrD  r3  r   custom_paramsr   copyr#  r2  r4  rA  r,  rB  req_pool_idxmamba_pool_idxmamba_ping_pong_track_buffermamba_next_track_idxmamba_last_track_seqlenmamba_branching_seqlen	tokenizerfinished_reasonfinished_lenfinished_output	to_finishr)  r6  r8   r>  surr_offsetread_offsetdecoded_textmultimodal_inputsr   emptyint64prefix_indicesextend_input_lenextend_logprob_start_len	last_nodelast_host_nodehost_hit_lengthstorage_hit_lengthswa_uuid_for_lockcache_protected_len
is_chunkedis_retractedretracted_stainsend_token_offsetsend_decode_id_offset!send_output_token_logprobs_offsetr$  logprob_start_lenr%  r(  temp_scaled_logprobstop_p_normalized_logprobsinput_logprob_sentinput_token_logprobs_valinput_token_logprobs_idxinput_top_logprobs_valinput_top_logprobs_idxinput_token_ids_logprobs_valinput_token_ids_logprobs_idxinput_token_logprobstemp_input_top_logprobs_valtemp_input_top_logprobs_idx!temp_input_token_ids_logprobs_val!temp_input_token_ids_logprobs_idxoutput_token_logprobs_valoutput_token_logprobs_idxoutput_top_logprobs_valoutput_top_logprobs_idxoutput_token_ids_logprobs_valoutput_token_ids_logprobs_idxhidden_stateshidden_states_tensoroutput_topk_poutput_topk_indexr5  routed_expertscustomized_info	embeddinggrammar_keygrammargrammar_wait_ctcached_tokensalready_computedcached_tokens_devicecached_tokens_hostcached_tokens_storage_cache_breakdown_computedspec_verify_ctspec_accepted_tokensspec_acceptance_histogramretraction_countretraction_mb_idr?  r'   
time_statshas_log_time_statstime	monotoniclast_ticr8  r9  r:  disagg_kv_senderr=  start_send_idxtmp_end_idxmetadata_buffer_indexrC  init_diffusion_llm) rH   r  r   r!  r#  r$  r%  r&  r(  r)  r*  r,  r.  r0  r1  r2  r3  r4  r5  r6  r8  r9  r:  r;  r=  r8   r>  r?  rA  rB  rC  rD  r>   r>   r?   rI     s  #

zReq.__init__r:   c                 C  s   t | jt | j S )z/Get the current sequence length of the request.r   r!  rK  rL   r>   r>   r?   seqlen0  s   z
Req.seqlenc                 C  s   t  j}| jjdko|du S )zCCheck if this request is prefill-only (no token generation needed).r   N)r-   speculative_algorithmr#  max_new_tokens)rH   spec_algr>   r>   r?   is_prefill_only5  s   zReq.is_prefill_onlyc                 C  s    | j dur| jd| j  S | jS )zIGet the output ids through the stop condition. Stop position is included.N)r^  rK  rL   r>   r>   r?   output_ids_through_stop=  s   
zReq.output_ids_through_stopc                 C  s$   | j rJ d| jdd| _ | jS )z?Return the length of committed KV cache and mark them as freed.z8Committed KV cache already freed (self.kv_committed_len=)T)rO  rM  rL   r>   r>   r?   pop_committed_kv_cacheD  s   zReq.pop_committed_kv_cacheTuple[int, int]c                 C  s0   | j rJ d| jd| jd| _ | j| jfS )zCReturn the range of over-allocated KV cache and mark them as freed.z<Overallocated KV cache already freed, self.kv_committed_len=z, self.kv_allocated_len=T)rP  rM  rN  rL   r>   r>   r?   pop_overallocated_kv_cacheL  s   zReq.pop_overallocated_kv_cachestager   c                 C  s6   | j d u rd S t }| j |j|| j  || _d S rF   )r?  r  r  observe_per_stage_req_latencyr   r  )rH   r  nowr>   r>   r?   add_latencyX  s   

zReq.add_latencyaccepted_draft_tokensc                 C  sD   t | j|kr| jdg|t | j d   | j|  d7  < dS )zUpdate the speculative decoding acceptance histogram.

        Args:
            accepted_draft_tokens: Number of draft tokens accepted in this step.
        r   r6   N)r   r  extend)rH   r  r>   r>   r?    update_spec_acceptance_histogramb  s
   z$Req.update_spec_acceptance_histogramc                 C  s$   | j d u r
|| _ d S | j | d S rF   )rd  r   )rH   image_inputsr>   r>   r?   extend_image_inputsn  s   

zReq.extend_image_inputsc                 C  
   | j d uS rF   )r]  rL   r>   r>   r?   finishedt  s   
zReq.finished
tree_cacheOptional[BasePrefixCache]c                 C  s:  |   r|   |   n| j| j | _t| j}|d }| jr+| jdkr+t	|| j}t
|d}| jd | }|d urn|tt|| jd| rJ| nd | d}|j|j|j|j|jf\| _| _| _| _| _t| j| _| jr| jd ur| jjd urddlm} || jjt| j| j_| t| jt| j  d S )Nr6   r   )	token_idsrA  )r   req	cow_mamba),extend_mrope_positions_for_retracted_request)is_dllm_init_fill_ids_for_dllmdetermine_dllm_phaser!  rK  rL  r   r$  rv  minmaxmatch_prefixr   r#   rA  supports_mambadevice_indiceslast_device_noderk  rl  r[  rg  rj  ro  rq  rd  r   r   r  set_extend_input_len)rH   r  	input_lenmax_prefix_lenr  match_resultr  r>   r>   r?   init_next_round_inputx  sR   



zReq.init_next_round_inputc                 C  s   | j d u p	| jd u }| j}|r/t| j| _t| jt d| _ | j| j d  | | _t|| _n| j	|| jd   t|| _| j| j| j  fS Nr   )
ra  rb  r  r   r*  r  &INIT_INCREMENTAL_DETOKENIZATION_OFFSETsurr_and_decode_idscur_decode_ids_lenr  )rH   
first_iterrK  r>   r>   r?   init_incremental_detokenize  s   

zReq.init_incremental_detokenizec                 C  sf   t | jjdkst | jjdkrt| jjd | jjd }t|d t | j}| j	
| j| d  S Nr   r6   )r   r#  	stop_strsstop_regex_strsr  stop_str_max_lenstop_regex_max_lenr  rK  r\  decode)rH   max_len_tail_strtail_lenr>   r>   r?   tail_str  s   

zReq.tail_strc                 C  s   | j jsdS |  }|sdS | j jD ]0}|sq||v r dS tt|t|}td|d D ]}|| d |d| krA  dS q.qdS )zS
        Check if the suffix of tail_str overlaps with any stop_str prefix
        FTr6   N)r#  r  r  r  r   range)rH   r  stop_strmin_lenir>   r>   r?   check_match_stop_str_prefix  s"   zReq.check_match_stop_str_prefixnew_accepted_tokensc                 C  s   | j jrdS d}t|D ]N\}}| j jr||| j jv O }| jr&||| jv O }| jd ur?||| jjkO }| jjr?||| jjv O }|rZt|d| _	t
| jt
| | }|d | _ dS qdS )NFr`   r6   T)r#  
ignore_eos	enumeratestop_token_idsr6  r\  eos_token_idadditional_stop_token_idsrR   r]  r   rK  r^  )rH   r  matched_eosr  token_idmatched_posr>   r>   r?   _check_token_based_finish  s&   

zReq._check_token_based_finishc                 C  s   t | jjdkst | jjdkrS|  }t | jjdkr5| jjD ]}||v s+|| jv r4t|d| _ dS q t | jjdkrS| jjD ]}t	||rRt
|d| _ dS qAdS )Nr   r`   TF)r   r#  r  r  r  rc  re   r]  researchrh   )rH   r  r  stop_regex_strr>   r>   r?   _check_str_based_finish  s$   zReq._check_str_based_finishc                 C  s   t |D ]B\}}|| jks|dk rFt| jt| | }| jjr+tt| jj| j|< | jr8tt| j| j|< t	dd| _
|d | _ dS qdS )Nr   zNaN happenedr`   r6   TF)r  r8   r   rK  r#  r  nextiterr6  re   r]  r^  )rH   r  r  r  offsetr>   r>   r?   _check_vocab_boundary_finish   s   


z Req._check_vocab_boundary_finishr6   new_accepted_lenc                 C  s   |   rd S | jr| j| _d | _d S t| j| jjkr*t| jjd| _| jj| _d S | j	d ur?| j	
 r?t| jd d| _d S | j| d  }| |rNd S | |rUd S |  r[d S d S )Nrk   rJ  r`   )r  r`  r]  r   rK  r#  r  ri   r^  r  is_terminatedrR   r  r  r  )rH   r  r  r>   r>   r?   check_finished0  s0   




zReq.check_finishedc                 C  s   |  j d7  _ tjdtjd| _d | _d | _d | _d| _d| _	d| _
d | _d | _d | _d| _d| _d | _d | _d | _d | _d | _d| _d| _d| _d| _d| _d| _d| _d| _d S )Nr6   rG  rH  r   TF)r  r   re  rf  rg  r  rj  rn  rh  rq  rr  r  r  r  ri  rp  rW  rX  rY  rZ  r[  r  rN  rM  rO  rP  rQ  rR  rS  rL   r>   r>   r?   reset_for_retractP  s4   
zReq.reset_for_retractc                 C  s*   |j | jd | jd f }||| _d S Nr6   )req_to_tokenrV  r  get_cpu_copykv_cache_cpurH   req_to_token_pooltoken_to_kv_pool_allocatortoken_indicesr>   r>   r?   offload_kv_cacheo  s   zReq.offload_kv_cachec                 C  s0   |j | jd | jd f }|| j| | `d S r  )r  rV  r  load_cpu_copyr  r  r>   r>   r?   load_kv_cacheu  s
   zReq.load_kv_cachec              
   C  s~   | j rd S | jd urd| j nd}d| j | dt| j dt| j d| j  d
}t	| d| j
   d	| _ d S )
Nz, bootstrap_room=rF  zReq Time Stats(rid=z, input len=z, output len=z, type=r  z: T)r  r:  r  r   r!  rK  r  disagg_mode_strloggerinfoconvert_to_duration)rH   bootstrap_infoprefixr>   r>   r?   log_time_stats|  s   
6
zReq.log_time_statsrh  c                 C  sN   || _ | jdkrt| jd }n	t| jt| j}t|t| j | j | _d S )NrJ  r6   )rh  rv  r   rL  r  rg  r  ri  )rH   rh  rv  r>   r>   r?   r    s   

zReq.set_extend_input_len	error_msgc                 C  sT   t  dkrt| d| j d | _d | _dg| _d| _d| _t	|t
jd| _d S )Nr   z, self.rid=FrJ  BadRequestError)r   r
  errorr  rd  r  r!  r$  rv  rl   r	   BAD_REQUESTr`  )rH   r  r>   r>   r?   set_finish_with_abort  s   

zReq.set_finish_with_abortc                 C  s.   d| j  d| j d| j d| jd| jdS )NzReq(rid=z, input_ids=z, output_ids=z, self.grammar=z, self.sampling_params=r  )r  r!  rK  r  r#  rL   r>   r>   r?   __repr__  s   
zReq.__repr__)Fr   NNFNNNNNNFFFNNNNNNNNNNNNN)>r  rf   r   rf   r!  r"  r#  r+   r$  rE   r%  r9   r&  r'  r(  r"  r)  rE   r*  r+  r,  r-  r.  r/  r0  r"  r1  r-  r2  r-  r3  rE   r4  rE   r5  rE   r6  r7  r8  r-  r9  r   r:  r   r;  r<  r=  r   r8   r   r>  r   r?  r@  rA  r-  rB  r-  rC  r   rD  r-  )r:   r9   r   )r:   r"  )r:   r  )r  r   )r  r9   rF   )r  r  )r:   rf   )r  r"  r:   rE   )r  r"  )r6   )r  r9   )rh  r9   )r  rf   )rN   rO   rP   r   rI   propertyr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r>   r>   r>   r?   r     sv      /





8


 

r  c                   @  s$  e Zd ZU dZded< dZded< dZded< dZd	ed
< dZded< dZ	ded< dZ
ded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded < dZded!< dZded"< dZded#< dZd$ed%< dZd&ed'< dZded(< dZd)ed*< dZd+ed,< dZd+ed-< dZ ded.< dZ!ded/< dZ"d0ed1< dZ#d2ed3< dZ$ded4< dZ%d+ed5< dZ&d6ed7< dZ'ded8< dZ(ded9< dZ)d:ed;< dZ*d:ed<< dZ+d0ed=< dZ,ded>< dZ-d:ed?< dZ.d@edA< dZ/dBedC< dZ0d@edD< dZ1d+edE< dZ2d@edF< dZ3dGedH< dIZ4d&edJ< dZ5dedK< dLZ6d&edM< dZ7dNedO< dZ8dedP< dZ9dedQ< dZ:dedR< dSZ;dTedU< dZ<dVedW< dZ=dXedY< dZ>dedZ< dZ?ded[< dZ@ded\< d]ZAd&ed^< dZBd_ed`< dZCdaedb< dZDdcedd< eE		dddedfZFdgdh ZGdidj ZHdkdl ZIddmdnZJdodp ZKddwdxZLdydz ZMdd}d~ZN	ddddZOddddZPdddZQdddZRdddZSdd ZTdd ZUeVdd ZWdd ZXdd ZY			ddddZZdddZ[	ddddZ\dd Z]dd Z^dddZ_dd Z`dS )ScheduleBatchz2Store all information of a batch on the scheduler.	List[Req]reqsNr"   r  r   r  r   r  FrE   is_hybrid_swar0   model_configr)   forward_modeenable_overlapbatch_is_fullOptional[Req]chunked_reqr*   sampling_infotorch.Tensor	input_idsr.  r0  req_pool_indicesseq_lensseq_lens_cpuout_cache_locrK  mamba_track_indicesmamba_track_maskmamba_track_seqlenszOptional[List]rd  r9   seq_lens_sumorig_seq_lenszOptional[ScheduleBatch]inner_idle_batchOptional[List[int]]global_num_tokensglobal_num_tokens_for_logprobis_extend_in_batchcan_run_dp_cuda_graphr   tbo_split_seq_indexOptional[ForwardMode]global_forward_moder$  top_logprobs_numsOptional[List[List[int]]]token_ids_logprobsrw  rx  r"  prefix_lensextend_lensextend_num_tokensdecoding_reqsextend_logprob_start_lensr   extend_input_logprob_token_idsOptional[List[bool]]encoder_cachedencoder_lensencoder_lens_cpuencoder_out_cache_locOptional[list[int]]rC  r   split_indexsplit_prefill_finishedr6   split_forward_countr   split_forward_batchseq_lens_cpu_cache
has_streamhas_grammarr   rf   r   r4   spec_algorithmOptional[SpecInput]	spec_infor4  r5  r  rJ  hicache_consumer_indexr'  r&  zOptional[DPCooperationInfo]dp_cooperation_infozOptional[PrefillStats]prefill_statsc
                 C  s   t dd |D }
d}t|trd}| di d|d|d|d|d	|d
|d|d|
dt dd |D dt dd |D d|jd|dt dd |D dt dd |D dtdd |D d|d|	S )Nc                 s      | ]}|j V  qd S rF   r$  rv   r  r>   r>   r?   r   <      z)ScheduleBatch.init_new.<locals>.<genexpr>FTr  r  r  r  r  r  r  r$  rJ  c                 s  rR  rF   r)  rT  r>   r>   r?   r   K  rU  rK  c                 s  rR  rF   r  rT  r>   r>   r?   r   L  rU  r   rL  r4  c                 s  rR  rF   )r4  rT  r>   r>   r?   r   O  rU  r5  c                 s  rR  rF   )r5  rT  r>   r>   r?   r   P  rU  r  c                 s  rR  rF   )r  rT  r>   r>   r?   r   Q  rU  r   r&  r>   )r   r   r$   r   r   )clsr  r  r  r  r  r  rL  r   r&  r$  r  r>   r>   r?   init_new/  sN   
	
zScheduleBatch.init_newc                 C  s
   t | jS rF   r   r  rL   r>   r>   r?   
batch_sizeV  rJ   zScheduleBatch.batch_sizec                 C  s   t | jdkS r  rZ  rL   r>   r>   r?   is_emptyY  s   zScheduleBatch.is_emptyc                 C  r  rF   )r&  rL   r>   r>   r?   r  \  rJ   zScheduleBatch.is_dllmc           
      C  s  g | _ g | _| jD ]3}|j}|d u s|jd u r$| j d | jd q	| j |j | j| j p:t|j	|jk q	t
j| j t
jdj| jdd| _d}g }g }t| jD ]r\}}| j | }	||  |	8  < t|j	|	k rt|j	dksyJ || |	d  ||< || j|||	   || j||	 ||j   | j|  |	8  < |  j|	8  _n|| j|||j   | j|  |	8  < ||j7 }qXt
jt|g t
jdj| jdd| _t
j|t
jdj| jdd| _t
j|t
jd| _|st
jdt
jdj| jdd| _nt
|| _|s!t
jdt
jdj| jdd| _nt
|| _t| j| jks>J dt| j d| j d S )Nr   TrH  r   z	Expected z, got )rB  r@  r  rd  r   appendr  	is_decoder   rg  r   tensorrf  r   r   rA  r  r'  rh  r:  r;  r9  sumr#  r%  r&  zerosr   rC  )
rH   r#  r%  r  imptdecoder_out_cache_locrC  r  encoder_lenr>   r>   r?   prepare_encoder_info_extend_  sn   




z)ScheduleBatch.prepare_encoder_info_extendc           '        s  t j _  rt j _ j}dd |D }tdd |D }dd |D }dd |D }dd |D }dd |D } jjrRt	d	d |D rR fd
d|D  _
dd |D }tjtt|tjdj jdd}	tj|tjdj jdd}
tj|tjd}tj|tjdj jdd}d }t|dkrtjt|g tjdj jdd}| _| _|
 _| _| _t \}}}g }g }g }g }g }g }tt|||D ]\}\}}}|| |_|| |jksJ | jd7  _||_ ||_!|j"d ur|#|j" |$|j% |j&sA||j' }| j(|7  _(|j)s>|j*}t+||j,}|| }t-dt|j.| }||_/||_0||_1d|_)||_'d|_2t3 4 rR 5||||  j6rt|j.t|j7} }!|j8dkrot|j9d }"n|j8}"| |"k ry|"} |j9| d |!d  }#|#|# |#dg|j|j: t|#   qӈ j6rt|}|;d jj<d  nd }|	 _=| _>| _?| _@|rt|j jddnd  _"|D ]9}$|$d u rؐq|$jAD ]+}%tB|%dd }&tC|&tjDr|&j jdd|%_EqtC|&tFr|&GtjHI |%_E~&qېq| _%| _Jt| _K j6r(dd |D  _Ldd |D  _Mdd |D  _N| _Ot3 4 rZtj|tj jd _Ptj|tjQ jd _Rtj|tj jd _S jjTre U|| tVW  jj< _Xd S )Nc                 S  s    g | ]}|j t|jd  qS rF   )rL  r   rg  rv   rr>   r>   r?   rx     s     z4ScheduleBatch.prepare_for_extend.<locals>.<listcomp>c                 s  s    | ]}t |V  qd S rF   )r   )rv   idsr>   r>   r?   r     r   z3ScheduleBatch.prepare_for_extend.<locals>.<genexpr>c                 S     g | ]}t |jqS r>   )r   rL  rg  r>   r>   r?   rx         c                 S  s"   g | ]}t t|jt|jqS r>   )r  r   rL  r!  rg  r>   r>   r?   rx     s   " c                 S  rj  r>   )r   rg  rg  r>   r>   r?   rx     rk  c                 S  rt   r>   )rh  rg  r>   r>   r?   rx     ry   c                 s  s    | ]}|j d uV  qd S rF   )rC  rg  r>   r>   r?   r     s    

c                   s    g | ]}|j r
|j n jjqS r>   )rC  r  hidden_sizerg  rL   r>   r?   rx     s    c                 S  s   g | ]
}|j d ur|j qS rF   )r0  rg  r>   r>   r?   rx     s    rH  Tr   r   r6   FrJ  r   c                 S  rt   r>   )r%  rg  r>   r>   r?   rx   u  ry   c                 S  rt   r>   )r(  rg  r>   r>   r?   rx   v  ry   c                 S  rt   r>   )ri  rg  r>   r>   r?   rx   x  ry   rI  r   )Yr)   EXTENDr  r  DLLM_EXTENDr  r`  r  is_matryoshkar   rC  r   r_  r   r
   from_iterablerf  r   r   int32r   r9  r:  r%  r&  r;  r   r  ziprV  rh  rR  rM  rN  r.  r  r]  rd  rr  r  r  r  rl  r  rm  r  rg  r  r  r  rq  r-   enable_mamba_extra_buffer,_mamba_radix_cache_v2_req_prepare_for_extendr$  rL  rv  r!  ri  clamp_r8   r#  r$  r,  r'  r   r   r   r   r   r/   reconstruct_on_target_devicer   r   r0  r+  r6  r8  r=  r>  r(  rE   r)  r*  is_encoder_decoderrf  r*   from_schedule_batchr!  )'rH   r  r#  r;  r%  r,  r9  r:  r0  input_ids_tensorseq_lens_tensorr&  orig_seq_lens_tensortoken_type_ids_tensorr'  req_pool_indices_tensorr$  r.  r>  rd  mamba_track_mask_cpumamba_track_indices_cpumamba_track_seqlens_cpur  r  seq_lenpre_len
new_cached
host_totalstorage_portionhost_portiondevice_portionglobal_start_idxglobal_end_idxrv  logprob_token_idsmm_inputmm_itempixel_valuesr>   rL   r?   prepare_for_extend  sF  






	




z ScheduleBatch.prepare_for_extendr  r  r  
List[bool]r  r  c                 C  s  ddd}t  j}|j|k}|| ||j|j   d}|r}t|j|j }t|j|j| |  }	t|j|jt	 t	  }
|
|	krJ||	}| j
|j|_|jd urz|jt|j | dk}|jt|jkrz|j|k rz|rz||j}|j}	|	|_|| d S )	Nr  r9   r:   c                 S  s   | t  dksJ | d S r  )FLA_CHUNK_SIZEr  r>   r>   r?   _force_track_h  s   zRScheduleBatch._mamba_radix_cache_v2_req_prepare_for_extend.<locals>._force_track_hrJ  r   )r  r9   r:   r9   )r-   mamba_cache_chunk_sizerh  r]  rX  rY  r   r   rg  r  r  get_mamba_ping_pong_other_idxr[  rZ  )rH   r  r  r  r  r  r  maskmamba_track_seqlenmamba_track_seqlen_alignedmamba_track_fla_chunk_alignedbranching_seqlen_aligned_maskr>   r>   r?   ru    sP   





z:ScheduleBatch._mamba_radix_cache_v2_req_prepare_for_extendc                 C  s   |    tj| _d S rF   )r  r)   SPLIT_PREFILLr  rL   r>   r>   r?   prepare_for_split_prefill  s   z'ScheduleBatch.prepare_for_split_prefillrunning_batch'ScheduleBatch'c                   s   t j| _| }|jD ]}|j|j |_|d qt	
| j|jg}t	
| j|jg}| | || _|| _| jr<dnd | j fdd|jD  | jdg|  |  j|7  _| jdg|  d| _d S )Nr6   r   rJ  c                   s$   g | ]}t |jt |j   qS r>   r  rg  deltar>   r?   rx     s    z2ScheduleBatch.mix_with_running.<locals>.<listcomp>F)r)   MIXEDr  r[  r  r!  rK  rL  r  r   r   r#  r'  merge_batchr  r9  r  r:  r;  r=  r  )rH   r  
running_bsr  r#  r'  r>   r  r?   mix_with_running  s(   



zScheduleBatch.mix_with_runningselected_indicesc           	        s   j j |d u rjnfdd|D }j r(t fdd|D }|  S t }|jp/d}|jp4d}|j	} dkrM|dkrMt
|   }t
| }n dkr[t
| }t
| }t|| |t| }|dj  S )Nc                      g | ]} j | qS r>   r  rv   r  rL   r>   r?   rx     rk  zAScheduleBatch.new_tokens_required_next_decode.<locals>.<listcomp>c                 3  s"    | ]}|j   d krdV  qdS )r   r6   N)rM  rg  )	page_sizer>   r?   r     s     z@ScheduleBatch.new_tokens_required_next_decode.<locals>.<genexpr>r6   )r  r  r  rL  is_noner`  r-   speculative_num_stepsspeculative_eagle_topkspeculative_num_draft_tokensr   r  r   
is_spec_v2)	rH   r  requests	new_pagesserver_argslen_per_topk	spec_topkspec_tokens
num_tokensr>   )r  rH   r?   new_tokens_required_next_decode	  s(   




z-ScheduleBatch.new_tokens_required_next_decodec                 C  s$   |  |}t| j| | j |kS rF   )r  r    r  r  available_size)rH   r  r  r>   r>   r?   check_decode_mem*  s   
zScheduleBatch.check_decode_memr  r,   c                 C  s@   | j }tt| j D ]}| |t| j | | q
| | |S rF   )r  r  r   release_reqfilter_batch)rH   r  retracted_reqsidxr>   r>   r?   retract_all/  s
   
zScheduleBatch.retract_allr:   "Tuple[List[Req], float, List[Req]]c           
        s  t tt j}|js|j fdddd g }d}|s# j|dsKt|dkr*n!d}| } j| }||  	|t|| |s# j|dr#t|dkr[ j|ds[t
d j|d	 td
d  jD }tdd  jD }|tj t j  |d  }	td|	}	||	g fS )z>Retract the decoding requests when there is not enough memory.c                   s"   t  j|  jt  j|  j fS rF   )r   r  rK  r!  r  rL   r>   r?   <lambda>D  s   z.ScheduleBatch.retract_decode.<locals>.<lambda>T)r   reverse)r  r6   FzKOut of memory even after retracting all other requests in the decode batch.)keep_indicesc                 s  s    | ]}t |jV  qd S rF   )r   rK  rg  r>   r>   r?   r   f  s    z/ScheduleBatch.retract_decode.<locals>.<genexpr>c                 s  s    | ]}|j jV  qd S rF   )r#  r  rg  r>   r>   r?   r   g  r   g      ?)r   r  r   r  r  sortr  r   r]  r  r=   r  r`  r   SGLANG_RETRACT_DECODE_STEPSr   r  )
rH   r  sorted_indicesr  r  r  r  total_decoded_tokenstotal_max_new_tokensnew_estimate_ratior>   rL   r?   retract_decode7  sJ   






zScheduleBatch.retract_decoder  remaing_req_countc                 C  sZ   | j | }|jdkr|| j| j t|| jdd |tj	  }t
| j| |  d S )Nr  F)	is_insert)r  disaggregation_moder  r  r  r!   r  r   r  r   r    r  )rH   r  r  r  r  r  r>   r>   r?   r  s  s   

zScheduleBatch.release_reqc                 C  s   dgt | j | _d S )NT)r   r  r@  rL   r>   r>   r?   prepare_encoder_info_decode  s   z)ScheduleBatch.prepare_encoder_info_decodec                 C  s   t j| _tjdtj| jd| _tjdtj| jd| _tjdtjd| _	tjdtj
| jd| _tjdtj| jd| _tjdtj
| jd| _d| _d| _t| | jj| _d S )Nr   rm  rH  )r)   IDLEr  r   re  rf  r   r#  r%  r&  rr  r,  r'  r$  r+  r;  r*   ry  r  r8   r!  rL   r>   r>   r?   prepare_for_idle  s   
zScheduleBatch.prepare_for_idlec                 C  s(   | j o| j  }|r| j sJ |S rF   )r  rL  r  supports_spec_v2)rH   r   r>   r>   r?   r    s   zScheduleBatch.is_spec_v2c                 C  s  t j| _t| j}| jr| j}||  | j	 sd S | j
jjrG| jr;tjdd | jD tj| jd}| j
j| n| j
j| jtj | j| _d | _| jjrV|   t| dd| _| jD ]}| jd7  _| jd7  _| jd7  _q`| jr| jd | _| jd | _| j d | _ n| j!d | j!d | j !d |  j"|7  _"t# $ rtjdd | jD tj| jd| _%tjdd | jD tj&| jd| _'d S d S )Nc                 S  s*   g | ]}t |jr|jd  n|jd  qS )rJ  )r   rK  r!  rT  r>   r>   r?   rx     s    z4ScheduleBatch.prepare_for_decode.<locals>.<listcomp>rm  r6   )token_per_reqc                 S  s   g | ]}|j |j qS r>   )rX  rY  rT  r>   r>   r?   rx     s    
c                 S  s   g | ]
}|t  j d kqS rG  )r-   mamba_track_interval)rv   slr>   r>   r?   rx     s    )(r)   DECODEr  r   r  r  rN  prepare_for_decoderL  r  r!  penalizer_orchestratoris_requiredr  r   r_  rf  r   cumulate_output_tokensrK  r   r#  r  rx  r  r   r'  rS  rM  rN  r%  r&  r,  add_r+  r-   rt  r(  rE   r)  )rH   bsdraft_inputdelayed_output_idsr  r>   r>   r?   r    sn   





z ScheduleBatch.prepare_for_decodec                 C  s,   | j r| j}|jd ur|j  d S d S d S rF   )r  rN  verify_donesynchronize)rH   r  r>   r>   r?   maybe_wait_verify_done  s   
z$ScheduleBatch.maybe_wait_verify_donechunked_req_to_excludeOptional[Union[Req, List[Req]]]r  v1_spec_info_filteredOptional[bool]c                   s     |d u r&t tr g n d u rg   fddttjD }|d u s0t|dkr5g _d S t|tjkr@d S tj|tjdj	j
dd}jjrbj| _fdd|D _fdd|D _jd ur{fd	d|D _j| _j| _j| _j| _d _j  _jd urj| _d _d _d _td
d jD _jrՇfdd|D _fdd|D _nd _d _tdd jD _ tdd jD _!j"#|| |oj$ }j%rj%j#||d d S d S )Nc                   s,   g | ]}j |  sj |  vr|qS r>   )r  r  r  r  rH   r>   r?   rx     s    z.ScheduleBatch.filter_batch.<locals>.<listcomp>r   rH  Tr   c                   r  r>   )rB  r  rL   r>   r?   rx     rk  c                   r  r>   r  r  rL   r>   r?   rx     rk  c                   r  r>   )rd  r  rL   r>   r?   rx      rk  c                 s  rR  rF   rS  rT  r>   r>   r?   r   .  rU  z-ScheduleBatch.filter_batch.<locals>.<genexpr>c                   r  r>   )r6  r  rL   r>   r?   rx   0  rk  c                   r  r>   )r8  r  rL   r>   r?   rx   1  rk  c                 s  rR  rF   rV  rT  r>   r>   r?   r   6  rU  c                 s  rR  rF   rW  rT  r>   r>   r?   r   7  rU  )new_indiceshas_been_filtered)&r  r   r  r  r   r  r   r_  rf  r   r   r  rx  rA  rB  rd  r$  r%  r&  r,  r'  r`  r   r+  rK  r(  r)  r*  r   r$  r6  r8  rJ  rK  r!  r  r  rN  )rH   r  r  r  keep_indices_devicer  r>   r  r?   r    sd   	



zScheduleBatch.filter_batchr   c                 C  s  | j |j  | jjrt| j|jg| _| j|j t| j	|j	g| _	t| j
|j
g| _
t| j|jg| _t| j|jg| _d | _|  j|j7  _| jd ur^t| j|jg| _d | _d | _d | _| jr||jr|| j|j | j|j n7| jr| jdgt|j  | jd gt|j  n|jrdgt| j |j | _d gt| j |j | _| j|j | jd ur| j|j |  j|jO  _|  j|jO  _|  j|jO  _|  j|jO  _| jr| j|j d S d S r  )r!  r  r  rx  r   r   rA  rB  r  r$  r%  r&  r,  r'  r+  rK  r(  r)  r*  r$  r6  r8  r   r  rd  rJ  rK  r4  rN  r   r>   r>   r?   r  E  sH   


zScheduleBatch.merge_batchModelWorkerBatchc                 C  s  | j  rd  } }}n	| j}| j}| j}| jr*| jr&dd | jD | j_nd | j_|d ur0|n| j	}t
d1i d| j d| jd| jd| jd| jd| jd	|d
| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd|d|d|d| jd| jd| jd| jd| jddd | jD d| jd | j d!| j!d"| j"d#| j#d$| j$d%| j%rt&j'n=| j#rt(| j#d%t&j)nbt&j)d&| j*d'| j+d(| j,d)d*d | jD d+| j-d,| jd-| jd.| j.d/| j/d0| j0S d&| j*d'| j+d(| j,d)d*d | jD d+| j-d,| jd-| jd.| j.d/| j/d0| j0S d&| j*d'| j+d(| j,d)d*d | jD d+| j-d,| jd-| jd.| j.d/| j/d0| j0S )2Nc                 S  rt   r>   rW  rT  r>   r>   r?   rx     ry   z8ScheduleBatch.get_model_worker_batch.<locals>.<listcomp>r  r#  r$  r%  r,  r'  r&  r+  r$  r6  r8  r/  r0  r1  r2  r3  r5  r;  extend_seq_lensextend_prefix_lensr=  rd  r@  rA  rB  rC  lora_idsc                 S  rt   r>   )r,  rT  r>   r>   r?   rx     ry   r!  r.  r0  rL  rN  rO  capture_hidden_moder>  r  rC  dllm_block_offsetsc                 S  rt   r>   )dllm_block_offsetrT  r>   r>   r?   rx     ry   r&  r  rK  r(  r)  r*  r>   )1r  is_decode_or_idler:  r9  r=  r!  rK  r  grammarsr&  r  r#  r$  r%  r,  r'  r+  r$  r6  r8  r/  r0  r1  r2  r3  r5  r;  rd  r@  rA  rB  rC  r.  r0  rL  rN  rO  r4  r(   FULLr   NULLr>  r  rC  r&  r(  r)  r*  )rH   rI  r  r  r=  r&  r>   r>   r?   get_model_worker_batchu  s,  
	
 !$
-./0123456#-./0123456&-./0123456z$ScheduleBatch.get_model_worker_batchc                 C  s   t di d| jd| jd| jd| jd| jd| jd| jd| jd	| j	d
| j
d| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jS )Nr  r  r$  r  r  r'  r$  r<  rL  r/  r0  r2  r1  r  r&  r  r(  r)  r*  rP  rQ  r>   )r  r  r  r$  r  r  r'  r$  r<  rL  r/  r0  r2  r1  r  r&  r  r(  r)  r*  rP  rQ  rL   r>   r>   r?   rU    sV   	
zScheduleBatch.copyc                 C  s   | j  rl| j j}t }| j r|jr| j  sd S t| j	D ]M\}}| j r:|j
| dkr9| ||jd  q | j rk| j  rk| j| }| jre|jdk rRq |jdkr\||j n|}| || q | || q d S d S )Nr6      r   )r  supports_swasliding_window_sizer-   r  r^  enable_piecewise_cuda_graphis_chunk_cacher  r  rS  
_evict_swar  	is_extendr9  r  rR  chunked_prefill_size)rH   r  r  r  r  r  r>   r>   r?   maybe_evict_swa  s:   





zScheduleBatch.maybe_evict_swar  c                 C  s   | j  s	J d| j j}|j| j j dksJ dt|j|j|_t|j|| }| j jdkr:|| j j | j j }||jkrV| jj|j	|j|f }| j
| ||_d S d S )Nzprefix cache must support swar   z(cache_protected_len must be page alignedr6   )r  r  r  ro  r  r  rQ  r  r  rV  r  free_swa)rH   r  r  r  new_swa_evicted_seqlen
free_slotsr>   r>   r?   r    s(   



zScheduleBatch._evict_swac                 C  s&   d| j r| j jnd dt| j dS )NzScheduleBatch(forward_mode=r;   z, #req=r  )r  ru   r   r  rL   r>   r>   r?   __str__	  s   zScheduleBatch.__str__)NN)r  r  r  r"   r  r   r  r   r  r0   r  rE   rL  r4   r   r  r&  r'  )r#  r"  r%  r"  )r  r  r  r  r  r"  r  r"  )r  r  rF   )r  r.  )r  r,   )r  r,   r:   r  )r  r9   r  r9   r  r,   )NNF)r  r  r  r.  r  r  )r   r  )rI  r   r:   r  )r  r  r  r9   )arN   rO   rP   r   r   r  r  r  r  r  r  r  r  r   r!  r#  r.  r0  r$  r%  r&  r'  rK  r(  r)  r*  rd  r+  r,  r-  r/  r0  r1  r2  r3  r5  r$  r6  r8  rw  rx  r9  r:  r;  r<  r=  r>  r@  rA  rB  rC  rC  rE  rF  rG  rH  rI  rJ  rK  r   rL  rN  r4  r5  r  rO  r&  rP  rQ  classmethodrY  r[  r\  r  rf  r  ru  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rU  r  r  r  r>   r>   r>   r?   r    s   
 
&
J 
mP
 !


<
T
O1M
#r  c                   @  s  e Zd ZU ded< ded< ded< ded< ded< ded	< d
ed< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded < d!ed"< ded#< ded$< ded%< d&ed'< d(ed)< d*Zded+< d*Zded,< d*Zded-< d*Zd.ed/< d*Zd0ed1< d*Z	d2ed3< d4Z
d
ed5< d*Zd6ed7< d8Zded9< d*Zded:< d*Zd;ed<< d*Zd=ed>< d8Zded?< d8Zded@< d*ZdedA< d*ZdedB< d*ZdedC< d*S )Dr  r)   r  r"  r#  r$  r%  r'  r   r&  r9   r+  rE   r$  r.  r6  r7  r8  r/  r0  r1  r2  r   r3  r4  r5  r;  r  r  r=  r>  z Optional[List[MultimodalInputs]]rd  r?  r@  rA  rB  rC  zOptional[List[str]]r  r*   r!  Nr,  r.  r0  r4   rL  rM  rN  r(   r  rJ  rO  rD  rC  Fr  r  r'  r&  zOptional[List[Req]]r  rK   return_hidden_states_before_normr(  r)  r*  )rN   rO   rP   r   r,  r.  r0  rL  rN  r  rO  rC  r  r  r&  r  rK  r  r(  r)  r*  r>   r>   r>   r?   r   	  s\   
 r  )r8   r9   r:   r;   )rA   r9   r:   r9   )t
__future__r   enumsglang.srt.dllm.configr   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.utils.commonr   rU  r   loggingr  r  r   r   	functoolsr   httpr	   	itertoolsr
   typingr   r   r   r   r   r   r   r   numpynpr   +sglang.srt.constrained.base_grammar_backendr   sglang.srt.disaggregation.baser   5sglang.srt.disaggregation.decode_schedule_batch_mixinr   sglang.srt.disaggregation.utilsr   %sglang.srt.distributed.parallel_stater   sglang.srt.dllm.mixin.reqr   sglang.srt.environr   -sglang.srt.layers.attention.fla.chunk_delta_hr   r  sglang.srt.mem_cache.allocatorr   &sglang.srt.mem_cache.base_prefix_cacher   r   sglang.srt.mem_cache.commonr   r   r    r!    sglang.srt.mem_cache.memory_poolr"    sglang.srt.mem_cache.radix_cacher#   $sglang.srt.mem_cache.swa_memory_poolr$   sglang.srt.metrics.collectorr%   r&   r'   r(   r)   'sglang.srt.sampling.sampling_batch_infor*   #sglang.srt.sampling.sampling_paramsr+   sglang.srt.server_argsr,   r-   sglang.srt.utilsr.   )sglang.srt.utils.cuda_ipc_transport_utilsr/   sglang.srt.configs.model_configr0   +sglang.srt.managers.scheduler_metrics_mixinr1   !sglang.srt.speculative.eagle_infor2    sglang.srt.speculative.spec_infor3   r4   r  r<   	getLoggerrN   r
  r@   rB   rC   rR   re   rh   ri   rl   rr   r   	dataclassr   r   rf   r   r  r  r  r>   r>   r>   r?   <module>   s    (

	r      6        v