o
    ÔÙ¾iñë  ã                   @   sˆ  d Z ddlZddlZddlZddlZddlmZmZ ddlmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ edƒZe e¡Zde dee
e!  fdd„Z"eG dd„ dƒƒZ#eG dd„ dƒƒZ$g d¢Z%de
ee   de&fdd„Z'eG dd„ dƒƒZ(G dd„ dƒZ)G dd„ dƒZ*eG dd „ d ƒƒZ+G d!d"„ d"ƒZ,G d#d$„ d$ƒZ-G d%d&„ d&ƒZ.dS )'z,Utilities for Prometheus Metrics Collection.é    N)Ú	dataclassÚfield)ÚAnyÚDictÚListÚOptionalÚUnion)ÚDisaggregationMode)Úenvs)Úexponential_bucketsÚgenerate_buckets)ÚForwardMode)Ú
ServerArgs)Úget_bool_env_var)ÚGaugeHistogramÚSGLANG_TEST_REQUEST_TIME_STATSÚenv_var_nameÚreturnc                 C   s4   | t jvrdS t j|  }|sdS dd„ | d¡D ƒS )zw
    Get the histogram configuration from the environment variable.
    env value should be like "0.1,0.2,0.5,1,2"
    Nc                 S   s   g | ]}t |ƒ‘qS © )Úfloat©Ú.0Úxr   r   úP/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/metrics/collector.pyÚ
<listcomp>0   s    z/get_histogram_conf_from_env.<locals>.<listcomp>ú,)ÚosÚenvironÚsplit)r   Úenv_var_valuer   r   r   Úget_histogram_conf_from_env%   s   

r    c                   @   sR  e Zd ZU dZejZeed< dZe	ed< dZ
e	ed< dZe	ed< dZe	ed< dZe	ed< dZe	ed	< dZe	ed
< dZe	ed< dZe	ed< dZe	ed< dZe	ed< dZe	ed< dZe	ed< dZe	ed< dZeed< dZe	ed< de	fdd„Zdee	 fdd„Zdee	 fdd„Zdee	 fdd„Zdefdd„Z d e	defd!d"„Z!defd#d$„Z"d%S )&Ú	TimeStatsa  
    Store the timestamps for each stage of a request.

    Unified: wait_queue -> forward -> completion
    Prefill: bootstrap_queue -> wait_queue -> forward -> transfer_queue -> completion
    Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
    Údisagg_modeç        Úlb_entry_timeÚwait_queue_entry_timeÚforward_entry_timeÚcompletion_timeÚ"prefill_bootstrap_queue_entry_timeÚ!prefill_transfer_queue_entry_timeÚ decode_prealloc_queue_entry_timeÚ decode_transfer_queue_entry_timeÚbootstrap_durationÚalloc_waiting_durationÚprefill_start_time_hostÚprefill_end_time_hostÚtransfer_speed_gb_sÚtransfer_total_mbr   Úprefill_retry_countÚprefill_finished_tsr   c                 C   s   | j | j S ©N)r&   r%   ©Úselfr   r   r   Úget_queueing_timeW   s   zTimeStats.get_queueing_timec                 C   s   | j dkr| j | j S d S ©Nr#   )r.   r&   r5   r   r   r   Úget_prefill_launch_delayZ   s   
z"TimeStats.get_prefill_launch_delayc                 C   s$   | j dkr| jdkr| j| j  S d S r8   )r.   r/   r5   r   r   r   Úget_prefill_launch_latency_   s   z$TimeStats.get_prefill_launch_latencyc                 C   s   | j dkr| j S d S r8   )r3   r5   r   r   r   Úget_prefill_finished_tsd   s   
z!TimeStats.get_prefill_finished_tsc                 C   sŠ  | j tjkr:| j| j }| j| j }tr'|dkr|dks'J d|› d|› dƒ‚d|  |¡› d|  |¡› d| jd›S | j tjkr¹| j| j	 }| j| j }| j| j }trs| jdkrs|dkre|dkre|dkssJ d|› d	|› d|› dƒ‚t
d
|| j| j  ƒ}d|  |¡› d|  | j¡› d|  | j¡› d|  |¡› d|  |¡› d|  |¡› d| j	d›d| jd›d| jd›d| j› S | j tjkrC| j| j }| j| j }| j| j }| j| j }tr| jdkr|dkrò|dkrò|dkrò|dksJ d|› d|› d	|› d|› d| ›
ƒ‚t
d
|| j| j  ƒ}d|  |¡› d|  | j¡› d|  | j¡› d|  |¡› d|  |¡› d|  |¡› d|  |¡› d| jd›S dS )Nr   zqueue_duration=z < 0 or forward_duration=z < 0z, forward_duration=z, start_time=ú.3fzbootstrap_duration=z < 0 or queue_duration=r#   zbootstrap_queue_duration(z) = alloc_wait(z) + bootstrap(z
) + other(z); queue_duration=z, start=z, transfer_speed=ú.2fzGB/s, transfer_total=zMB, #retries=zprealloc_duration=z < 0 or transfer_duration=z < 0. self=zprealloc_queue_duration(z); transfer_duration=z; queue_duration=zUnknown Time Stats)r"   r	   ÚNULLr&   r%   r'   r   Úformat_durationÚPREFILLr(   Úmaxr-   r,   r0   r1   r2   ÚDECODEr+   r*   )r6   Úqueue_durationÚforward_durationr,   ÚotherÚprealloc_durationÚtransfer_durationr   r   r   Úconvert_to_durationi   s¬   ÿ&
ÿ

ÿ
ÿþ
ÿ
þýüûúùø	÷ÿÿÿ
ÿ ÿ
ÿþ
ÿ
þýüûúùÿzTimeStats.convert_to_durationÚdurationc                 C   s   |d d›dS )Ng     @@r=   Úmsr   )r6   rI   r   r   r   r?   º   s   zTimeStats.format_durationc                 C   s4   | j tjkrdS | j tjkrdS | j tjkrdS dS )NÚunifiedÚdecodeÚprefillÚunknown)r"   r	   r>   rB   r@   r5   r   r   r   Údisagg_mode_str½   s   zTimeStats.disagg_mode_strN)#Ú__name__Ú
__module__Ú__qualname__Ú__doc__r	   r>   r"   Ú__annotations__r$   r   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   Úintr3   r7   r   r9   r:   r;   ÚstrrH   r?   rO   r   r   r   r   r!   3   s4   
 Qr!   c                   @   sî  e Zd ZU dZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed< dZeed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZ eed< dZ!e"e ed< dZ#eed < dZ$eed!< dZ%eed"< dZ&eed#< dZ'eed$< dZ(eed%< dZ)eed&< dZ*eed'< e+e,d(Z-e.e ed)< e+e,d(Z/e.e ed*< dS )+ÚSchedulerStatsr   Únum_running_reqsÚnum_used_tokensr#   Útoken_usageÚpending_prealloc_token_usageÚswa_token_usageÚmamba_usageÚdecode_sum_seq_lensÚgen_throughputÚnum_queue_reqsÚnum_grammar_queue_reqsÚnum_running_reqs_offline_batchÚcache_hit_rateÚmax_total_num_tokensÚspec_accept_lengthÚspec_accept_rateÚnum_retracted_reqsÚnum_paused_reqsÚnum_prefill_prealloc_queue_reqsÚnum_prefill_inflight_queue_reqsÚnum_decode_prealloc_queue_reqsÚnum_decode_transfer_queue_reqsÚkv_transfer_speed_gb_sÚkv_transfer_latency_msÚkv_transfer_bootstrap_msÚkv_transfer_alloc_msÚkv_transfer_total_mbÚutilizationNÚmax_running_requests_under_SLOÚengine_startup_timeÚengine_load_weights_timeÚnew_token_ratioÚis_cuda_graphÚlora_pool_slots_usedÚlora_pool_slots_totalÚlora_pool_utilizationÚnum_unique_running_routing_keys©Údefault_factoryÚrouting_key_running_req_countsÚrouting_key_all_req_counts)0rP   rQ   rR   rX   rU   rT   rY   rZ   r   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   r   rt   ru   rv   rw   rx   ry   rz   r{   r   Úlistr~   r   r   r   r   r   r   rW   È   sN   
 rW   )
é   é   é   é   é   é
   é   é2   éd   éÈ   Úrouting_keysc                 C   s2   ddl m} |dd„ | D ƒƒ}t|ƒt| ¡ ƒfS )z*Returns (num_unique_keys, per_key_counts).r   )ÚCounterc                 s   s    | ]	}|d ur|V  qd S r4   r   )r   Úkr   r   r   Ú	<genexpr>  s   € z,compute_routing_key_stats.<locals>.<genexpr>)ÚcollectionsrŒ   Úlenr€   Úvalues)r‹   rŒ   Ú
key_countsr   r   r   Úcompute_routing_key_stats  s   r“   c                   @   s4   e Zd ZU eed< edee fdd„ƒZdd„ ZdS )ÚDPCooperationInfoÚnum_prefill_ranksÚforward_modesc                 C   s   t tdd„ | D ƒƒdS )Nc                 s   s     | ]}t |ƒ ¡ rd V  qdS )r   N)r   Ú	is_extend)r   Úmoder   r   r   rŽ     s   € ÿ
ÿz+DPCooperationInfo.create.<locals>.<genexpr>)r•   )r”   Úsum)r–   r   r   r   Úcreate  s
   ÿýzDPCooperationInfo.createc                 C   s
   t  | ¡S r4   )ÚdataclassesÚasdictr5   r   r   r   Ú	to_labels  s   
zDPCooperationInfo.to_labelsN)	rP   rQ   rR   rU   rT   Ústaticmethodr   rš   r   r   r   r   r   r”     s
   
 	r”   c                   @   sˆ  e Zd Z		dAdeeef deded ddfdd	„Zd
ee	e
f ddfdd„Zd
ee	e
f ddfdd„ZdBdd„ZdBdd„Zde	ddfdd„Zdede
ddfdd„Zde
ddfdd„Zde	de
deded ed!eddfd"d#„Zd$e	d%e	d&e	ddfd'd(„Zd)eddfd*d+„Zd,ed-e
ddfd.d/„Z	0	0	0dCd1ee fd2d3„Zd4ed5e
d1ee fd6d7„Zd8eddfd9d:„ZdBd;d<„Zd=e	d>e	ddfd?d@„ZdS )DÚSchedulerMetricsCollectorFNÚlabelsÚenable_loraÚserver_argsr   r   c           	   	      sš  ddl m}m}m}m} || _|| _t ¡ | _	|dd| 
¡ dd| _|dd| 
¡ dd| _|d	d
| 
¡ dd| _|dd| 
¡ dd| _|dd| 
¡ dd| _|dd| 
¡ dd| _|dd| 
¡ dd| _|dd| 
¡ dd| _|dd| 
¡ dd| _|dd| 
¡ dd| _|dd| 
¡ dd| _|dd| 
¡ dd| _|dd| 
¡ dd| _|dd | 
¡ dd| _|d!d"| 
¡ dd| _|d#d$| 
¡ d%| _|d&d'| 
¡ d%| _|d(d)| 
¡ d%| _|d*d+| 
¡ d%| _|d,d-| 
¡ d%| _|d.d/| 
¡ dd| _|d0d1| 
¡ dd| _ |d2d3| 
¡ dd| _!|d4d5| 
¡ dd| _"|d6d7| 
¡ d%| _#|d8d9| 
¡ d%| _$|d:d;| 
¡ d%| _%|d<d=| 
¡ dd| _&|d>d?| 
¡ dd| _'|d@dA| 
¡ dd| _(|dBdC| 
¡ dd| _)|dDdE| 
¡ dd| _*|dFdG| 
¡ dd| _+|dHdI| 
¡ dd| _,|dJdK| 
¡ dd| _-|dLdM| 
¡ dd| _.|dNdO| 
¡ g dP¢dQ| _/|dRdS| 
¡ g dT¢dQ| _0|dUdV| 
¡ d%| _1|dWdX| 
¡ d%| _2|dYdZ| 
¡ d%| _3|d[d\| 
¡ d%| _4|d]d^| 
¡ g d_¢dQ| _5|d`da| 
¡ g db¢dQ| _6g dc¢}|ddde| 
¡ |dQ| _7|dfdg| 
¡ |dQ| _8|dhdit9djdkdldmt:| 
¡ ƒdng do| _;|dpdq| 
¡ dd| _<|drdst:| 
¡ ƒdtg d%| _=|du dkr\t>j? @¡ r\|dvdwt:| 
¡ ƒdxg d%| _A| jr|dydz| 
¡ dd| _B|d{d|| 
¡ dd| _C|d}d~| 
¡ dd| _D|dd€| 
¡ dd| _EtFdd‚t:| 
¡ ƒtGdƒ| _HtFd„d…t:| 
¡ ƒtGdƒ| _I|d†d‡| 
¡ dd| _J|dˆd‰t:| 
¡ ƒdtg d%| _K|dŠd‹t:| 
¡ ƒdŒg d%| _L|ddŽt:| 
¡ ƒdtdg d%| _M|dd‘t:| 
¡ ƒdŒdg d%| _N|jO‰ |d’d“| 
¡ tPtQ‡ fd”d•„|jRpg d–¢D ƒƒdˆ d— hB ƒdQ| _S|d˜d™| 
¡ tPtQ|jTp#g dš¢ƒdhB ƒdQ| _U|d›dœg | 
¡ ¢d‘dž‘dŸ‘d ‘d%| _V|d¡d¢d£d¤gdd| _Wd S )¥Nr   )rŒ   ÚGaugeÚ	HistogramÚSummaryzsglang:num_running_reqszThe number of running requests.Ú
mostrecent)ÚnameÚdocumentationÚ
labelnamesÚmultiprocess_modezsglang:num_used_tokenszThe number of used tokens.zsglang:token_usagezThe token usage.z#sglang:pending_prealloc_token_usagezGThe token usage for pending preallocated tokens (not preallocated yet).zsglang:swa_token_usagezThe token usage for SWA layers.zsglang:mamba_usagez!The token usage for Mamba layers.zsglang:decode_sum_seq_lensz*The sum of all sequence lengths in decode.zsglang:gen_throughputz$The generation throughput (token/s).zsglang:num_queue_reqsz,The number of requests in the waiting queue.zsglang:num_grammar_queue_reqsz4The number of requests in the grammar waiting queue.z%sglang:num_running_reqs_offline_batchzLThe number of running low-priority offline batch requests(label is 'batch').zsglang:cache_hit_ratezThe prefix cache hit rate.zsglang:max_total_num_tokensz4Maximum total number of tokens in the KV cache pool.zsglang:spec_accept_lengthz6The average acceptance length of speculative decoding.zsglang:spec_accept_ratezfThe average acceptance rate of speculative decoding (`accepted tokens / total draft tokens` in batch).zsglang:num_retracted_reqsz!The number of retracted requests.©r§   r¨   r©   z#sglang:num_retracted_requests_totalz#Total number of retracted requests.z'sglang:num_retracted_input_tokens_totalz'Total number of retracted input tokens.z(sglang:num_retracted_output_tokens_totalz(Total number of retracted output tokens.zsglang:num_paused_reqsz3The number of paused requests by async weight sync.z&sglang:num_prefill_prealloc_queue_reqsz5The number of requests in the prefill prealloc queue.z&sglang:num_prefill_inflight_queue_reqsz5The number of requests in the prefill inflight queue.z%sglang:num_decode_prealloc_queue_reqsz4The number of requests in the decode prealloc queue.z%sglang:num_decode_transfer_queue_reqsz4The number of requests in the decode transfer queue.z&sglang:num_bootstrap_failed_reqs_totalz(The number of bootstrap failed requests.z%sglang:num_transfer_failed_reqs_totalz'The number of transfer failed requests.z sglang:num_prefill_retries_totalz Total number of prefill retries.zsglang:kv_transfer_speed_gb_sz+The transfer speed of the KV cache in GB/s.zsglang:kv_transfer_latency_msz+The transfer latency of the KV cache in ms.zsglang:kv_transfer_bootstrap_msz,The bootstrap time of the KV transfer in ms.zsglang:kv_transfer_alloc_msz5The allocation waiting time of the KV transfer in ms.zsglang:kv_transfer_total_mbz7The total number of tokens transferred in the KV cache.zsglang:utilizationzThe utilization.z%sglang:max_running_requests_under_SLOz1The maximum number of running requests under SLO.zsglang:engine_startup_timez*The time taken for the engine to start up.zsglang:engine_load_weights_timez.The time taken for the engine to load weights.zsglang:queue_time_secondsz&Histogram of queueing time in seconds.)$r#   çš™™™™™¹?çš™™™™™É?ç      à?r   r‚   rƒ   é   r„   r†   é   r‡   é   é(   rˆ   é<   éF   éP   éZ   r‰   rŠ   é,  é  éô  éX  é¼  i   i„  éè  é°  ix  i@  é  éÐ  iÄ	  é¸  ©r§   r¨   r©   Úbucketsz'sglang:grammar_compilation_time_secondsz1Histogram of grammar compilation time in seconds.)r#   ç{®Gáz„?ç{®Gáz”?çš™™™™™©?r¬   r­   r®   r   r‚   r„   r†   r‡   r±   r³   r¶   éx   éð   z"sglang:num_grammar_cache_hit_totalzNumber of grammar cache hits.z sglang:num_grammar_aborted_totalz#Number of grammar aborted requests.z sglang:num_grammar_timeout_totalzNumber of grammar timeouts.zsglang:num_grammar_totalz%Number of the total grammar requests.zsglang:grammar_schema_countz"Histogram of grammar schema count.)r   r   r‚   r„   r†   r‡   r±   r²   r³   rµ   r‰   rÆ   éŒ   é    é´   rŠ   r·   r¸   r¹   r»   r¼   zsglang:grammar_ebnf_sizezHistogram of grammar EBNF size.)r   rˆ   r‰   rŠ   r·   r¹   r¼   r¿   rÀ   éˆ  é'  é N  é0u  iPÃ  i † )r#   rÃ   rÄ   rÅ   r¬   r­   r®   r   r‚   r„   r†   r°   r±   r³   r¶   rÆ   rÇ   z&sglang:grammar_tree_traversal_time_avgz<Histogram of average grammar tree traversal time in seconds.z&sglang:grammar_tree_traversal_time_maxz8Histogram of max grammar tree traversal time in seconds.z$sglang:per_stage_req_latency_secondsz&The latency of each stage of requests.çü©ñÒMbP?gìQ¸…ëù?r±   )ÚstartÚwidthÚlengthÚstage)r§   r¨   rÂ   r©   zsglang:is_cuda_graphz&Whether the batch is using CUDA graph.zsglang:cuda_graph_passes_totalz9Total number of forward passes categorized by CUDA graph.r˜   Úmoe_ep_rankzsglang:eplb_balancednessz*Balancedness of MoE in expert parallelism.Úforward_modezsglang:lora_pool_slots_usedz>Number of LoRA adapter slots currently occupied in GPU memory.zsglang:lora_pool_slots_totalzCTotal number of LoRA adapter slots available (max_loras_per_batch).zsglang:lora_pool_utilizationzALoRA pool utilization ratio (used/total). 1.0 means pool is full.z&sglang:num_unique_running_routing_keysz/Number of unique routing keys in running batch.z$sglang:routing_key_running_req_countzIDistribution of routing keys by running request count (gt < count <= le).)r§   r¨   r©   Úbucket_boundsz sglang:routing_key_all_req_countzQDistribution of routing keys by running+waiting request count (gt < count <= le).zsglang:new_token_ratiozThe new token ratio.zsglang:realtime_tokens_totalznTotal number of tokens processed (updated on each log interval). mode: prefill_compute, prefill_cache, decode.z"sglang:gpu_execution_seconds_totalz[Total time that GPU is busy executing a workload. Refer to ForwardMode for category labels.Úcategoryz+sglang:dp_cooperation_realtime_tokens_totalzpTotal number of tokens processed with labels about DP cooperation. mode: prefill_compute, prefill_cache, decode.r•   z1sglang:dp_cooperation_gpu_execution_seconds_totalz|Total time that GPU is busy executing a workload with labels about DP cooperation. Refer to ForwardMode for category labels.z*sglang:prefill_delayer_wait_forward_passesz6Histogram of forward passes waited by prefill delayer.c                 3   s    | ]	}|ˆ k r|V  qd S r4   r   r   ©Ú	max_delayr   r   rŽ   2  s   € ûúz5SchedulerMetricsCollector.__init__.<locals>.<genexpr>)r„   r‡   rˆ   r‰   rŠ   r   z#sglang:prefill_delayer_wait_secondsz5Histogram of wait time in seconds by prefill delayer.)	r   r‚   r„   r†   r‡   rˆ   r‰   rŠ   r¹   z%sglang:prefill_delayer_outcomes_totalzPrefill delayer outcome counts.Úinput_estimationÚoutput_allowÚoutput_reasonÚactual_executionzsglang:cache_config_infoz Cache configuration information.Ú	page_sizeÚ	num_pages)XÚprometheus_clientrŒ   r£   r¤   r¥   r    r¡   ÚtimeÚperf_counterÚlast_log_timeÚkeysrX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   Únum_retracted_reqs_totalÚ num_retracted_input_tokens_totalÚ!num_retracted_output_tokens_totalrh   ri   rj   rk   rl   Únum_bootstrap_failed_reqsÚnum_transfer_failed_reqsÚnum_prefill_retries_totalrm   rn   ro   rp   rq   rr   rs   rt   ru   Ú
queue_timeÚgrammar_compilation_timeÚnum_grammar_cache_hitÚnum_grammar_abortedÚnum_grammar_timeoutÚnum_grammar_totalÚgrammar_schema_countÚgrammar_ebnf_sizeÚgrammar_tree_traversal_time_avgÚgrammar_tree_traversal_time_maxr   r€   Úper_stage_req_latency_secondsrw   Úcuda_graph_passes_totalr
   Ú&SGLANG_ENABLE_EPLB_BALANCEDNESS_METRICÚgetÚeplb_balancednessrx   ry   rz   r{   r   Ú#ROUTING_KEY_REQ_COUNT_BUCKET_BOUNDSÚrouting_key_running_req_countÚrouting_key_all_req_countrv   Úrealtime_tokens_totalÚgpu_execution_seconds_totalÚ$dp_cooperation_realtime_tokens_totalÚ*dp_cooperation_gpu_execution_seconds_totalÚ prefill_delayer_max_delay_passesÚsortedÚsetÚ&prefill_delayer_forward_passes_bucketsÚ#prefill_delayer_wait_forward_passesÚ$prefill_delayer_wait_seconds_bucketsÚprefill_delayer_wait_secondsÚprefill_delayer_outcomes_totalÚcache_config_info)	r6   r    r¡   r¢   rŒ   r£   r¤   r¥   Útree_traversal_time_bucketsr   rØ   r   Ú__init__%  s*  
üüüüüüüüüüüüüüü	ýüýýýüüüüýýýüüüüüüüüüü-üýýýýüüüüû	üýÿýüüüü
ü
üüúú	úú	ü
	÷ÿüþûÿüÿþýüûýüz"SchedulerMetricsCollector.__init__Údatac                 C   ó   |j di | j ¤Ž |¡ d S ©Nr   )r    r  )r6   Úgauger  r   r   r   Ú
_log_gaugea  s   z$SchedulerMetricsCollector._log_gaugec                 C   r  r  ©r    Úobserve©r6   Ú	histogramr  r   r   r   Ú_log_histograme  ó   z(SchedulerMetricsCollector._log_histogramc                 C   ó   | j jdi | j¤Ž d¡ d S ©Nr   r   )rè   r    Úincr5   r   r   r   Úincrement_bootstrap_failed_reqsh  ó   z9SchedulerMetricsCollector.increment_bootstrap_failed_reqsc                 C   r  r  )ré   r    r  r5   r   r   r   Úincrement_transfer_failed_reqsk  r  z8SchedulerMetricsCollector.increment_transfer_failed_reqsÚcountc                 C   ó*   |dkr| j jdi | j¤Ž |¡ d S d S ©Nr   r   )rê   r    r  )r6   r  r   r   r   Úincrement_prefill_retriesn  ó   ÿz3SchedulerMetricsCollector.increment_prefill_retriesrÓ   Úlatencyc                 C   s.   i | j ¥d|i¥}| jj di |¤Ž |¡ d S )NrÓ   r   )r    rõ   r  )r6   rÓ   r"  Úlabels_with_stager   r   r   Úobserve_per_stage_req_latencyr  s   z7SchedulerMetricsCollector.observe_per_stage_req_latencyc                 C   s   |   | j|¡ d S r4   )r  rë   )r6   r"  r   r   r   Úobserve_queue_timev  s   z,SchedulerMetricsCollector.observe_queue_timeÚforward_passesÚwait_secondsrÚ   rÛ   rÜ   rÝ   c                 C   s`   |r|r|   | j|¡ |   | j|¡ | jjdi | j¤|t|ƒ ¡ |t|ƒ ¡ dœ¤Ž d¡ d S )N)rÚ   rÛ   rÜ   rÝ   r   r   )r  r  r  r  r    rV   Úlowerr  )r6   r&  r'  rÚ   rÛ   rÜ   rÝ   r   r   r   Úobserve_prefill_delayer_outcomey  s   	ÿ
ÿ

ûz9SchedulerMetricsCollector.observe_prefill_delayer_outcomerg   Únum_retracted_input_tokensÚnum_retracted_output_tokensc                 C   sR   | j jdi | j¤Ž |¡ | jjdi | j¤Ž |¡ | jjdi | j¤Ž |¡ d S r  )rå   r    r  ræ   rç   )r6   rg   r*  r+  r   r   r   Úincrement_retracted_reqs  s   ÿÿz2SchedulerMetricsCollector.increment_retracted_reqsÚvaluec                 C   s2   |rdnd}| j jdi | j¤d|i¤Ž d¡ d S )NÚdecode_cuda_graphÚdecode_noner˜   r   r   )rö   r    r  )r6   r-  r˜   r   r   r   Úincrement_cuda_graph_passž  s   &z3SchedulerMetricsCollector.increment_cuda_graph_passrÕ   Úbalancednessc                 C   s&   | j jdi | j¤d|i¤Ž |¡ d S )NrÕ   r   )rù   r    r  )r6   rÕ   r1  r   r   r   Úincrement_eplb_balancedness£  s   ÿz5SchedulerMetricsCollector.increment_eplb_balancednessr   Údp_cooperation_infoc                 C   s‚   d|fd|fd|ffD ]3\}}|dkrq| j jdi | j¤d|i¤Ž |¡ |d ur>| jjdi | j¤d|i¤| ¡ ¤Ž |¡ qd S )NÚprefill_computeÚprefill_cacherL   r   r˜   r   )rý   r    r  rÿ   r   )r6   r3  Úprefill_compute_tokensÚprefill_cache_tokensÚdecode_tokensr˜   Údeltar   r   r   Úincrement_realtime_tokensª  s$   ý"
ÿþý€óz3SchedulerMetricsCollector.increment_realtime_tokensr×   Útc                 C   st   t  d|›d|d›¡ | jjdi | j¤d|i¤Ž |¡ |d ur8| jjdi | j¤d|i¤| ¡ ¤Ž |¡ d S d S )Nz GPU execution seconds: category=z t=r<   r×   r   )ÚloggerÚdebugrþ   r    r  r   r   )r6   r×   r;  r3  r   r   r   Úincrement_gpu_execution_secondsÀ  s   "
ÿþýûz9SchedulerMetricsCollector.increment_gpu_execution_secondsÚstatsc                 C   sŽ  |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j	|j	¡ |   | j
|j
¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |   | j|j¡ |jd urå|   | j|j¡ |   | j|j¡ |jd urú|   | j|j¡ |   | j|j¡ |   | j |j ¡ | j!r&|   | j"|j"¡ |   | j#|j#¡ |   | j$|j$¡ |   | j%|j%¡ | j& '| j(|j)¡ | j* '| j(|j+¡ t, -¡ | _.d S r4   )/r  rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   ri   rj   rk   rl   rm   rn   ro   rp   rq   rg   rh   rr   rs   rt   ru   rv   rw   r¡   rx   ry   rz   r{   rû   Úset_by_current_observationsr    r~   rü   r   rá   râ   rã   )r6   r?  r   r   r   Ú	log_statsÏ  s‚   ÿÿÿÿÿÿ
þ
ÿÿÿÿz#SchedulerMetricsCollector.log_statsc                 C   s  |j d ur|  | j|j ¡ |jd ur|  | j|j¡ |jd ur'|  | j|j¡ |j}|rFt|ƒ}t	|ƒt
|ƒ }|  | j|¡ |  | j|¡ |jrV| jjdi | j¤Ž d¡ |jrf| jjdi | j¤Ž d¡ |jdkry| jjdi | j¤Ž |j¡ | jjdi | j¤Ž d¡ d S )Nr   r   r   )Úcompilation_timer  rì   Úschema_countrñ   Ú	ebnf_sizerò   Útree_traversal_timerA   r™   r   rô   ró   Úis_cache_hitrí   r    r  Úis_grammar_abortedrî   Únum_timeoutrï   rð   )r6   Úgrammar_statsÚ
tree_timesÚmax_timeÚavg_timer   r   r   Úlog_grammar_stats#  s.   
ÿ


ÿz+SchedulerMetricsCollector.log_grammar_statsrÞ   rß   c                 C   s   | j j||d d¡ d S )N)rÞ   rß   r   )r	  r    r  )r6   rÞ   rß   r   r   r   Úemit_cache_config_info<  s   z0SchedulerMetricsCollector.emit_cache_config_info)FN)r   N)r   r   r   )rP   rQ   rR   r   rV   Úboolr   r  r   rU   r   r  r  r  r  r   r$  r%  r)  r,  r0  r2  r”   r:  r>  rW   rA  rM  rN  r   r   r   r   rŸ   #  sŽ    ü
þýü
û    @

þýüûúù
øþýü
ûÿÿ
þ
û
þþý
ü
TrŸ   c                   @   s  e Zd Z						d dee deeef deee  deee  deee  de	d	dfd
d„Z
	d!deeef dedededede	dedeeeef  fdd„Zdeeef defdd„Zded	e	fdd„Zdeeef dedefdd„Zdeeef fdd„ZdS )"ÚTokenizerMetricsCollectorNFr¢   r    Úbucket_time_to_first_tokenÚbucket_inter_token_latencyÚbucket_e2e_request_latencyÚcollect_tokens_histogramr   c           
      C   sz  ddl m}m} |pi | _|| _|dd| ¡ d| _|dd| ¡ d| _|rHg d¢}	|d	d
| ¡ t|j	|	ƒd| _
|dd| ¡ t|j|	ƒd| _|ddt| ¡ ƒdg d| _|dd| ¡ d| _|dd| ¡ d| _|dd| ¡ d| _|d u r}g d¢}|d u r…g d¢}|d u rg d¢}|dd| ¡ |d| _|dd| ¡ |d| _|dd| ¡ |d| _|d d!| ¡ g d"¢d| _d S )#Nr   ©rŒ   r¤   zsglang:prompt_tokens_totalz#Number of prefill tokens processed.r«   zsglang:generation_tokens_totalz&Number of generation tokens processed.)r‰   r·   r¹   r»   r¼   iÜ  r¿   rÀ   i   rË   ip  iX  i@  i(#  rÌ   ià.  i˜:  rÍ   iðU  i¨a  rÎ   i¸ˆ  i@œ  iÐ i¸‚ i  ià“ iÀ'	 i » iàÈ zsglang:prompt_tokens_histogramz!Histogram of prompt token length.rÁ   z"sglang:generation_tokens_histogramz%Histogram of generation token length.zsglang:cached_tokens_totalz?Number of cached prompt tokens by source (device/host/storage).Úcache_sourcezsglang:num_requests_totalzNumber of requests processed.zsglang:num_so_requests_totalz/Number of structured output requests processed.z!sglang:num_aborted_requests_totalzNumber of requests aborted.)r¬   r­   çš™™™™™Ù?ç333333ã?çš™™™™™é?r   r‚   r¯   é   é   r†   r‡   r²   r³   rµ   r‰   rŠ   r¸   )r¬   r­   rW  rX  rY  r   r‚   r¯   rZ  r[  r†   r‡   r²   r³   rµ   r‰   rŠ   r¸   rº   r½   r¾   i`	  )çü©ñÒMb`?çü©ñÒMbp?çú~j¼t“x?çü©ñÒMb€?rÃ   g¸…ëQ¸Ž?rÄ   gš™™™™™™?ç¸…ëQ¸ž?gìQ¸…ë¡?ç{®Gáz¤?g¸…ëQ¸®?g{®Gáz´?r¬   r­   rW  rX  rY  ç      ð?g       @g      @g      @g       @z"sglang:time_to_first_token_secondsz,Histogram of time to first token in seconds.z"sglang:inter_token_latency_secondsz,Histogram of inter-token latency in seconds.z"sglang:e2e_request_latency_secondsz2Histogram of End-to-end request latency in secondszsglang:num_retractionsz+Histogram of retraction counts per request.)r   r   r‚   rƒ   r¯   r„   rZ  r…   r[  é	   r†   r°   r‡   é   r±   r²   rˆ   éK   r‰   )rà   rŒ   r¤   r    rT  rä   Úprompt_tokens_totalÚgeneration_tokens_totalr   Úprompt_tokens_bucketsÚprompt_tokens_histogramÚgeneration_tokens_bucketsÚgeneration_tokens_histogramr€   Úcached_tokens_totalÚnum_requests_totalÚnum_so_requests_totalÚnum_aborted_requests_totalÚhistogram_time_to_first_tokenÚhistogram_inter_token_latencyÚhistogram_e2e_request_latencyÚnum_retractions)
r6   r¢   r    rQ  rR  rS  rT  rŒ   r¤   Údefault_bucket_prompt_tokensr   r   r   r  A  s¤   

ýý ÿüþü
ýýýýüüüüz"TokenizerMetricsCollector.__init__Úprompt_tokensÚgeneration_tokensÚcached_tokensÚe2e_latencyÚhas_grammarÚretraction_countÚcached_tokens_detailsc	                    s†  ˆj jdi ˆ ¤Ž |¡ ˆjjdi ˆ ¤Ž |¡ |dkrn|rZdtdtf‡ ‡fdd„}	|	d| dd¡ƒ |	d| dd¡ƒ d|v rY| dd¡}
|
dkrY| d	¡pPd
}|	d|› |
ƒ ni ˆ ¥ddi¥}ˆjjdi |¤Ž |¡ ˆjjdi ˆ ¤Ž d¡ |rˆˆj	jdi ˆ ¤Ž d¡ ˆj
jdi ˆ ¤Ž t|ƒ¡ ˆjrµˆjjdi ˆ ¤Ž t|ƒ¡ ˆjjdi ˆ ¤Ž t|ƒ¡ ˆjjdi ˆ ¤Ž |¡ d S )Nr   Úsourcer-  c                    s8   |dkri ˆ ¥d| i¥}ˆj jdi |¤Ž |¡ d S d S )Nr   rV  r   )rl  r    r  )r|  r-  Úsource_labels©r    r6   r   r   Úreport_cache_source5  s   þzSTokenizerMetricsCollector.observe_one_finished_request.<locals>.report_cache_sourceÚdeviceÚhostÚstorageÚstorage_backendrN   Ústorage_rV  Útotalr   r   )rf  r    r  rg  rV   rU   rø   rl  rm  rn  rr  r  r   rT  ri  rk  rs  )r6   r    ru  rv  rw  rx  ry  rz  r{  r  Ústorage_tokensÚbackendÚlabels_totalr   r~  r   Úobserve_one_finished_request#  s4   ÿ€ÿz6TokenizerMetricsCollector.observe_one_finished_requestr-  c                 C   s   | j jdi |¤Ž |¡ d S r  )rp  r    r  )r6   r    r-  r   r   r   Úobserve_time_to_first_tokenU  r  z5TokenizerMetricsCollector.observe_time_to_first_tokenc                 C   s|   | j jdi | j¤Ž}tdd„ |jD ƒƒ}|dk rdS |d }d}t|jƒD ]\}}||j7 }||kr;||j| k  S q%dS )Nc                 s   s    | ]}|j V  qd S r4   )Ú_value)r   Úbucketr   r   r   rŽ   Z  s   € zPTokenizerMetricsCollector.check_time_to_first_token_straggler.<locals>.<genexpr>r‰   Fg®Gáz®ï?r   r   )rp  r    r™   Ú_bucketsÚ	enumerater‹  Ú_upper_bounds)r6   r-  ÚhisÚtotal_observationsÚp99_thresholdÚcumulative_countÚirŒ  r   r   r   Ú#check_time_to_first_token_stragglerX  s   
ÿz=TokenizerMetricsCollector.check_time_to_first_token_stragglerÚ	internvalÚnum_new_tokensc                 C   s\   || }| j jdi |¤Ž}|j |¡ t|jƒD ]\}}||kr+|j|  |¡  d S qd S r  )rq  r    Ú_sumr  rŽ  r  r  )r6   r    r–  r—  Úadjusted_intervalr  r”  Úboundr   r   r   Úobserve_inter_token_latencye  s   þÿz5TokenizerMetricsCollector.observe_inter_token_latencyc                 C   s   | j jdi |¤Ž d¡ d S r  )ro  r    r  )r6   r    r   r   r   Úobserve_one_aborted_requestt  r  z5TokenizerMetricsCollector.observe_one_aborted_request)NNNNNFr4   )rP   rQ   rR   r   r   r   rV   r   r   rO  r  rU   r   r‰  rŠ  r•  r›  rœ  r   r   r   r   rP  @  sf    ùþ
ý
ü
û
úù
ø l÷
þýüûúùø	
÷2
ÿÿ
ÿrP  c                   @   sf   e Zd ZU eedZee ed< eedZ	ee ed< eedZ
ee ed< eedZee ed< dS )ÚStorageMetricsr|   Úprefetch_pgsÚ
backup_pgsÚprefetch_bandwidthÚbackup_bandwidthN)rP   rQ   rR   r   r€   rž  r   rU   rT   rŸ  r   r   r¡  r   r   r   r   r  x  s
   
 r  c                   @   sh   e Zd Zdeeef fdd„Zdefdd„Zdefdd	„Zd
e	ee
f fdd„Zddee fdd„ZdS )ÚStorageMetricsCollectorr    c                 C   sª   ddl m}m} || _|dd| ¡ d| _|dd| ¡ d| _g d¢}g d	¢}|d
d| ¡ |d| _|dd| ¡ |d| _|dd| ¡ |d| _	|dd| ¡ |d| _
d S )Nr   rU  zsglang:prefetched_tokens_totalz#Number of prefetched prompt tokens.r«   zsglang:backuped_tokens_totalzNumber of backuped tokens.)r   r„   r†   rˆ   r‰   )r¬   r®   r   r„   r†   rˆ   r‰   zsglang:prefetch_pgsz'Histogram of prefetch pages of batches.rÁ   zsglang:backup_pgsz%Histogram of backup pages of batches.zsglang:prefetch_bandwidthz(Histogram of prefetch bandwidth in GB/s.zsglang:backup_bandwidthz&Histogram of backup bandwidth in GB/s.)rà   rŒ   r¤   r    rä   Úprefetched_tokens_totalÚbackuped_tokens_totalÚhistogram_prefetch_pgsÚhistogram_backup_pgsÚhistogram_prefetch_bandwidthÚhistogram_backup_bandwidth)r6   r    rŒ   r¤   Ú	bucket_ioÚbucket_bandwidthr   r   r   r    sL   ýý
üüüüz StorageMetricsCollector.__init__Úprefetched_tokensc                 C   r  r  )r£  r    r  )r6   r«  r   r   r   Úlog_prefetched_tokensÃ  r!  z-StorageMetricsCollector.log_prefetched_tokensÚbackuped_tokensc                 C   r  r  )r¤  r    r  )r6   r­  r   r   r   Úlog_backuped_tokensÇ  r!  z+StorageMetricsCollector.log_backuped_tokensr  c                 C   r  r  r  r  r   r   r   r  Ë  r  z&StorageMetricsCollector._log_histogramNÚstorage_metricsc                 C   s†   |d u rd S t |tƒsJ ‚|jD ]	}|  | j|¡ q|jD ]	}|  | j|¡ q|jD ]	}|  | j|¡ q*|j	D ]	}|  | j
|¡ q7d S r4   )Ú
isinstancer  rž  r  r¥  rŸ  r¦  r   r§  r¡  r¨  )r6   r¯  Úvr   r   r   Úlog_storage_metricsÎ  s   



ÿz+StorageMetricsCollector.log_storage_metricsr4   )rP   rQ   rR   r   rV   r  rU   r¬  r®  r   r   r  r   r  r²  r   r   r   r   r¢  €  s    

þBr¢  c                   @   s   e Zd Zdeddfdd„ZdS )ÚExpertDispatchCollectorÚep_sizer   Nc                 C   s6   ddl m} dd„ t|ƒD ƒ}|dddh|d| _d S )	Nr   )r¤   c                 S   s   g | ]}|‘qS r   r   )r   r”  r   r   r   r   â  s    z4ExpertDispatchCollector.__init__.<locals>.<listcomp>zsglang:eplb_gpu_physical_countzBThe selected count of physical experts on each layer and GPU rank.ÚlayerrÁ   )rà   r¤   ÚrangeÚeplb_gpu_physical_count)r6   r´  r¤   Úep_size_bucketsr   r   r   r  ß  s   üz ExpertDispatchCollector.__init__)rP   rQ   rR   rU   r  r   r   r   r   r³  Þ  s    r³  c                   @   sn   e Zd Zdeeef ddfdd„Zdeddfdd„Zdeddfd	d
„Zde	ddfdd„Z
de	ddfdd„ZdS )ÚRadixCacheMetricsCollectorr    r   Nc                 C   sž   ddl m}m} || _tdƒ}|d u rg d¢}tdƒ}|d u r#g d¢}|dd| ¡ |d| _|d	d
| ¡ d| _|dd| ¡ |d| _|dd| ¡ d| _	d S )Nr   rU  ÚSGLANG_BUCKET_EVICTION_DURATION)rÏ   r\  gú~j¼t“h?r]  g{®Gázt?r^  gyé&1¬|?r_  g;ßO—n‚?rÃ   rÄ   r`  ra  rÅ   r¬   r­   r®   rb  Ú SGLANG_BUCKET_LOAD_BACK_DURATIONz sglang:eviction_duration_secondsz6Time taken to evict memory from GPU to CPU in seconds.rÁ   zsglang:evicted_tokens_totalz-The number of tokens evicted from GPU to CPU.r«   z!sglang:load_back_duration_secondsz5Time taken to load memory from CPU to GPU in seconds.zsglang:load_back_tokens_totalz,The number of tokens loaded from CPU to GPU.)
rà   rŒ   r¤   r    r    rä   Úeviction_duration_secondsÚeviction_num_tokensÚload_back_duration_secondsÚload_back_num_tokens)r6   r    rŒ   r¤   Úbucket_eviction_durationÚbucket_load_back_durationr   r   r   r  ì  sD   ÿÿüýüýz#RadixCacheMetricsCollector.__init__Ú
num_tokensc                 C   ó   | j jdi | j¤Ž |¡ d S r  )r½  r    r  ©r6   rÂ  r   r   r   Úincrement_eviction_num_tokens?  r  z8RadixCacheMetricsCollector.increment_eviction_num_tokensc                 C   rÃ  r  )r¿  r    r  rÄ  r   r   r   Úincrement_load_back_num_tokensB  r  z9RadixCacheMetricsCollector.increment_load_back_num_tokensÚduration_secondsc                 C   rÃ  r  )r¼  r    r  ©r6   rÇ  r   r   r   Úobserve_eviction_durationE  r  z4RadixCacheMetricsCollector.observe_eviction_durationc                 C   rÃ  r  )r¾  r    r  rÈ  r   r   r   Úobserve_load_back_durationH  r  z5RadixCacheMetricsCollector.observe_load_back_duration)rP   rQ   rR   r   rV   r  rU   rÅ  rÆ  r   rÉ  rÊ  r   r   r   r   r¹  ë  s    
þ
ýSr¹  )/rS   r›   Úloggingr   rá   r   r   Útypingr   r   r   r   r   Úsglang.srt.disaggregation.utilsr	   Úsglang.srt.environr
   Úsglang.srt.metrics.utilsr   r   Ú,sglang.srt.model_executor.forward_batch_infor   Úsglang.srt.server_argsr   Úsglang.srt.utilsr   Ú sglang.srt.utils.gauge_histogramr   r   Ú	getLoggerrP   r<  rV   r   r    r!   rW   rú   Útupler“   r”   rŸ   rP  r  r¢  r³  r¹  r   r   r   r   Ú<module>   sR   
 ;      #  :^