o
    پikz                     @  sf  d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZmZmZmZmZmZmZ d d
lmZ d dl m!Z! d dl"m#Z#m$Z$m%Z%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, e
rd dlm-Z-m.Z. e/e0Z1e(dZ2ej34 Z5ej64 Z7ej8G dd dZ9G dd dZ:G dd dZ;dS )    )annotationsN)defaultdict)contextmanager)TYPE_CHECKINGOptionalUnion)EventPublisherFactoryKVEventBatch)DisaggregationMode)envs)	DisaggregationMetricsGetLoadReqInputGetLoadReqOutputGetLoadsReqInputGetLoadsReqOutputLoRAMetricsMemoryMetricsQueueMetricsSpeculativeMetrics)ScheduleBatch)GenerationBatchResult)DPCooperationInfoSchedulerMetricsCollectorSchedulerStatscompute_routing_key_stats)get_bool_env_var)DeviceTimer)SchedulerStatusLogger)EmbeddingBatchResult	SchedulerSGLANG_RECORD_STEP_TIMEc                   @  s:   e Zd ZU dZded< ded< ded< ded< ded< d	S )
PrefillStatsz(Stats for logging prefill batch metrics.intlog_input_tokenslog_hit_tokensfloatnew_token_ratio
running_bsnum_new_seqsN)__name__
__module____qualname____doc____annotations__ r.   r.   _/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/managers/scheduler_metrics_mixin.pyr!   .   s   
 r!   c                   @  s   e Zd Zdd ZdS )	KvMetricsc                 C  s4   d | _ d | _d | _d | _d | _d | _d | _d | _d S N)request_active_slotsrequest_total_slotskv_active_blockskv_total_blocksnum_requests_waitinggpu_cache_usage_percgpu_prefix_cache_hit_ratedata_parallel_rankselfr.   r.   r/   __init__:   s   
zKvMetrics.__init__N)r)   r*   r+   r<   r.   r.   r.   r/   r0   9   s    r0   c                   @  s   e Zd Zd=dd	Zd>ddZd?ddZd@ddZ	dAdBddZ	dAdCdd ZdDd"d#Z	dEd&d'Z
d@d(d)Zd@d*d+Zd@d,d-Zd@d.d/ZdAdFd4d5ZdAdGd9d:ZedHd;d<ZdS )ISchedulerMetricsMixinr;   r   tp_rankr"   pp_rankdp_rankOptional[int]c                 C  sH  d| _ d| _t | _t | _d| _d| _d| _t	t
| _d| _d| _d| _d| _d| _d| _d| _d| _d| _t | _| jdkpD| j| _| jr| jjtjjkrTd}n| jjtj jkr_d}nd}| jj!|||| j"d}|d urt||d< | jj#r|$| jj# t%|| j&| jd| _'t(rt)| j'j*d	| _+| j,r| -| jj. t/0 | _1d S )
Nr           prefilldecodeunified)
model_nameengine_typer>   r?   moe_ep_rankr@   )labelsenable_loraserver_args)reporter)2forward_ct_decodenum_generated_tokenstimeperf_counterlast_decode_stats_ticlast_prefill_stats_ticlast_prefill_tokenslast_gen_throughputlast_input_throughputr   liststep_time_dictspec_num_accepted_tokensspec_num_forward_ctspec_total_num_accepted_tokensspec_total_num_forward_ctkv_transfer_speed_gb_skv_transfer_latency_mskv_transfer_bootstrap_mskv_transfer_alloc_mskv_transfer_total_mbr   statsattn_tp_rank!enable_metrics_for_all_schedulers!current_scheduler_metrics_enabledenable_metricsrK   disaggregation_moder
   PREFILLvalueDECODEserved_model_namerH   extra_metric_labelsupdater   rJ   metrics_collectorENABLE_METRICS_DEVICE_TIMERr   increment_gpu_execution_secondsforward_pass_device_timerenable_kv_cache_eventsinit_kv_eventskv_events_configr   maybe_createscheduler_status_logger)r;   r>   r?   r@   rG   rI   r.   r.   r/   init_metricsF   s`   


z"SchedulerMetricsMixin.init_metricsrs   Optional[str]c                 C  s   | j rt|| j| _d S d S r1   )rq   r   createattn_dp_rankkv_event_publisher)r;   rs   r.   r.   r/   rr      s
   
z$SchedulerMetricsMixin.init_kv_eventsbsnum_accepted_tokensc                 C  s2   |  j || 7  _ |  j|7  _|  j|7  _d S r1   )rX   rY   rN   )r;   r{   r|   r.   r.   r/   update_spec_metrics   s   z)SchedulerMetricsMixin.update_spec_metricsc                 C  s(   d| _ d| _d| _d| _d| _d| _d S Nr   )rM   rN   rX   rY   rZ   r[   r:   r.   r.   r/   reset_metrics   s   
z#SchedulerMetricsMixin.reset_metricsNprefill_statsr!   can_run_cuda_graphbooldp_cooperation_infoOptional[DPCooperationInfo]c                 C  s  t  | j }t  | _| j| | _|j| _| jr;|  \}}}}}	}	}	}	t||}
t||}d|dd|dd}n.| j	rZ| 
 \}}	}}}	}	}	}	|}
|}d|dd|dd}n|  \}
}}	}	d|dd}|j| j_tryd| jd  d	nd
}d| d|j d|j d|j d| d|j dt| j d}| jtjkr|dt| jj d7 }|dt| j d7 }|d| jdd7 }n
|d| jdd7 }| jjr|dt| jj d7 }tdd ddd}||| j   d| 7 }t!"| | j#r| j$j%|j|j|d |j|j }|dkr|j| nd}|j| j_&d| j_'|
| j_(|| j_)| jr1|| j_*| j	r9|| j_+t| j| j_,t| j-| j_.|| j_/| j0| j_0| j1| j_1| j2| j_2d | _1| _2| jtjkrt| jj| j_3t| j| j_4| j5| j_5| j6| j_6| j7| j_7| j8| j_8| j9| j_9n| jtj:krt| j;j| j_<t| j=j| j_>| ?  | @  | j$A| j | B  | C  d S )Nzfull token usage: .2f, swa token usage: , , mamba usage: ztoken usage:  [   ] zPrefill batchz, #new-seq: z, #new-token: z, #cached-token: z#running-req: , #queue-req: #prealloc-req: z#inflight-req: zinput throughput (token/s): waiting-image-req: c                   S     dS Nz
cuda graphr.   r.   r.   r.   r/   <lambda>       z9SchedulerMetricsMixin.log_prefill_stats.<locals>.<lambda>	cpu graph	npu graphcpunpu: )prefill_compute_tokensprefill_cache_tokensr   r   rB   )DrO   rP   rR   rS   rU   r#   is_hybrid_swa_get_swa_token_infomaxis_hybrid_ssm_get_mamba_token_info_get_token_infor&   ra   LOG_FORWARD_ITERS
forward_ctr(   r$   r'   lenwaiting_queuerf   r
   rg   disagg_prefill_bootstrap_queuequeuedisagg_prefill_inflight_queuerK   language_onlymm_receiverwaiting_listr   deviceloggerinfore   rm   increment_realtime_tokensnum_running_reqsnum_running_reqs_offline_batchnum_used_tokenstoken_usageswa_token_usagemamba_usagenum_queue_reqsgrammar_managernum_grammar_queue_reqscache_hit_ratemax_total_num_tokensnum_retracted_reqsnum_paused_reqsnum_prefill_prealloc_queue_reqsnum_prefill_inflight_queue_reqsr\   r]   r^   r_   r`   ri   disagg_decode_prealloc_queuenum_decode_prealloc_queue_reqsdisagg_decode_transfer_queuenum_decode_transfer_queue_reqscalculate_utilizationupdate_lora_metrics	log_stats_emit_kv_metrics_publish_kv_events)r;   r   r   r   gap_latencyfull_num_usedswa_num_usedfull_token_usager   _num_usedr   token_usage_msgr   iter_msgmsggraph_backendtotal_tokensr   r.   r.   r/   log_prefill_stats   s   

















z'SchedulerMetricsMixin.log_prefill_statsrunning_batchr   c              	   C  sl  |p| j }t | j }t | _| j| | _d| _t|j}d}| jrL| 	 \}}}	}
}}}}t
||}t
|	|
}d| d|	dd| d|
dd	}n7| jrq|  \}}}	}}}}}|}|	}d| d|	dd| d	|dd	}n|  \}}}}d
| d|dd}tr| j| || jj  trd| j dnd}d| d| d| }| j rd}d}nG| j| j }| jjpdd }| jjp|}| j| }|dkr| j| nd}|  j| j7  _|  j| j7  _d | _| _|d|dd|dd7 }d}| jtjkr3|d| j j!| j" dd7 }|dt| j j# d7 }|dt| j$j# d7 }|dt| j j% d7 }| jj&rD|dt| j'j( d7 }t)dd ddd}||| j*  d| d | jdd!t| j+ 7 }t,-| | j.r0|| j/_0|| j/_1|| j/_2|| j/_3| jr|
| j/_4| jr|| j/_5|j67 8 | j/_9| j| j/_:t| j+| j/_;t| j<| j/_=|| j/_>| j"| j/_"|| j/_?|| j/_@| jA| j/_A| jB| j/_Bd | _A| _B| jtjCkrt| jDj#| j/_Et| jF| j/_Gn| jtjkrt| j j#| j/_Ht| j$j#| j/_Id"d# |jD }d$d# | j+D }tJ|\| j/_K| j/_LtJ|| \}| j/_M| N  | O  | jPQ| j/ | R  | S  d S )%Nr   z#full token: z, full token usage: r   z, #swa token: r   r   z, mamba num: r   z#token: z, token usage: r   r   r   zDecode batchz, #running-req: r   zaccept len: z, accept rate: rB   zpre-allocated usage: r   z#transfer-req: z#retracted-req: r   c                   S  r   r   r.   r.   r.   r.   r/   r     r   z8SchedulerMetricsMixin.log_decode_stats.<locals>.<lambda>r   r   r   r   z, gen throughput (token/s): r   c                 S     g | ]}|j qS r.   routing_key.0rr.   r.   r/   
<listcomp>      z:SchedulerMetricsMixin.log_decode_stats.<locals>.<listcomp>c                 S  r   r.   r   r   r.   r.   r/   r     r   )Tr   rO   rP   rQ   rN   rT   r   reqsr   r   r   r   r   r   RECORD_STEP_TIMErW   appendrK   decode_log_intervalr   r   spec_algorithmis_nonerX   rY   speculative_num_stepsspeculative_num_draft_tokensrZ   r[   rf   r
   ri   r   num_tokens_pre_allocatedr   r   r   retracted_queuer   r   r   r   r   r   r   r   re   ra   r   r   r   r   r   r   seq_lens_cpusumitemdecode_sum_seq_lensgen_throughputr   r   r   r   spec_accept_ratespec_accept_lengthr   r   rg   r   r   r   r   r   r   r   num_unique_running_routing_keysrouting_key_running_req_countsrouting_key_all_req_countsr   r   rm   r   r   r   )r;   r   r   batchr   r   r   r   r   r   r   r   r   r   r   
mamba_usedr   r   r   r   r   draft_tokens_fallbacknum_draft_tokenstotal_draft_tokensr   r   running_routing_keyswaiting_routing_keysr.   r.   r/   log_decode_stats0  s   




















z&SchedulerMetricsMixin.log_decode_statsr   c                 C  s@   | j r| jj| | |jd | j }r||| j d S d S )N)decode_tokensr   )re   rm   r   
batch_sizer   ru   
maybe_dumpr   )r;   r   r|   xr.   r.   r/    log_decode_stats_every_iteration  s   

z6SchedulerMetricsMixin.log_decode_stats_every_iterationresult2Union[GenerationBatchResult, EmbeddingBatchResult]c                 C  sL   | j sd S t|tsd S |j }d ur$| jj|jj |j	
 d d S d S )N)forward_modebalancedness)re   
isinstancer   expert_distribution_metricsrm   increment_eplb_balancednessr   namelowereplb_balancednessr   )r;   r   r   mr.   r.   r/   log_batch_result_stats  s   


z,SchedulerMetricsMixin.log_batch_result_statsc                 C  s   | j sd S t }| jj|_| j|_t| jj| j	 |_
| j	|_| jj|_| jj|_| jj|_| jd ur6| jnd|_| jjsE| j| d S d S r~   )rq   r0   ra   r   r2   max_running_requestsr3   r"   r   r   r4   r5   r   r6   r7   r   r8   r@   r9   send_metrics_from_schedulerclosed
send_pyobj)r;   
kv_metricsr.   r.   r/   r     s    



z&SchedulerMetricsMixin._emit_kv_metricsc                 C  s<   | j sd S | j }|rtt |d}| j| d S d S )N)tsevents)rq   
tree_cachetake_eventsr	   rO   rz   publish)r;   r  r   r.   r.   r/   r     s   
z(SchedulerMetricsMixin._publish_kv_eventsc           
   
   C  sR  | j sdS z| jjj}|du s|jdu rW dS |j}|j}t }t| drL| jrL| jD ]}|rJt|drJ|j	D ]}t|drI|j
durI||j
 q7q+n%t| drq| jrqt| jdrq| jj	D ]}t|drp|j
durp||j
 q^t|}|dkr}|| nd}|| j_|| j_|| j_W dS  ty }	 ztd|	  W Y d}	~	dS d}	~	ww )	z8Update LoRA pool metrics for monitoring and autoscaling.Nrunning_mbsr   lora_idr   r   rB   zFailed to update LoRA metrics: )rJ   	tp_workermodel_runnerlora_managermemory_poolmax_loras_per_batchsethasattrr  r   r  addr   r   ra   lora_pool_slots_usedlora_pool_slots_totallora_pool_utilization	Exceptionr   warning)
r;   r  mem_poolslots_totalactive_lora_idsr   req
slots_usedutilizationer.   r.   r/   r     s@   


z)SchedulerMetricsMixin.update_lora_metricsc                 C  s^   | j tjkrd| j_d S | jjd ur+| jjdkr-t| jj| jj | jjd | j_d S d S d S )Nr   g?)	rf   r
   rg   ra   r(  max_running_requests_under_SLOr   r   r   r:   r.   r.   r/   r   A  s   
z+SchedulerMetricsMixin.calculate_utilizationr   r   returnr   c                 C  s   | j r|  ^}}}t||}n| jr|  d }n|  d }| jg}| jtj	kr2|
| jj n| jtjkrM|
| jj |
| jj |
| jj |tdd |D 7 }tdd |D }t| jt| jj| ||t dS )Nr   c                 s  s     | ]}|D ]}|j V  qqd S r1   )seqlen)r   r   r&  r.   r.   r/   	<genexpr>a  s    z1SchedulerMetricsMixin.get_load.<locals>.<genexpr>c                 s      | ]}t |V  qd S r1   r   r   r   r.   r.   r/   r.  b      )r@   num_reqsnum_waiting_reqs
num_tokensts_tic)r   r   r   r   r   r   r   rf   r
   rg   r   r   r   ri   r   r   r   r   r   r@   r   r   r   rO   rP   )r;   r   r   r   r5  waiting_queuesr4  r.   r.   r/   get_loadO  s,   zSchedulerMetricsMixin.get_loadr&  r   r   c              
   C  sv  |du rt  }|jrt|jndh}d|v }t| jj}| jg}| jtj	kr.|
| jj n| jtjkrI|
| jj |
| jj |
| jj tdd |D }| jrb|  ^}}}	t||}
n| jrl|  d }
n|  d }
| jdkr||
| j nd}d}|sd|v rz tt| jjjd	t| j j d	t| jjj!d	t"| jd
}W n t#y } zt$%d|  W Y d}~nd}~ww d}|sd|v r| j&' s| j(dkrt)| j*| j( | j+j,d}d}|sd|v rt-| dr| j.durt/| j+j0| j+j1| j+j2d}d}|sd|v rTd}d}d}d}d}d}| jtj	kr)d}t| jj}t| j3}n| jtjkrDd}t| jj}t| jj}t| jj}t4||||||| j+j5| j+j6d}d}|s^d|v rot7t| j| j+j8| j+j9| j+j:d}t;d'i d| j<dt== d|d|d|
d| jdt|dd t| j+j>d!d"t| j+j?dd#t| j+j@dd$| jAd|d%|d|d&|d|S )(a  
        Get comprehensive load metrics for /v1/loads endpoint.

        Args:
            req: Request containing include list and optional dp_rank filter

        Returns:
            GetLoadsReqOutput with core metrics and optional detailed sections
        Ncoreallc                 s  r/  r1   r0  r1  r.   r.   r/   r.    r2  z2SchedulerMetricsMixin.get_loads.<locals>.<genexpr>r   rB   memory   )	weight_gbkv_cache_gbgraph_gbtoken_capacityzMemory metrics not available: spec)accept_lengthaccept_rateloralora_scheduler)r'  r$  r(  disaggnullrC   rD   )modeprefill_prealloc_queue_reqsprefill_inflight_queue_reqsdecode_prealloc_queue_reqsdecode_transfer_queue_reqsdecode_retracted_queue_reqsr\   r]   queues)waitinggrammarpaused	retractedr@   	timestampr   r4  r   r   r      r      r   r(  r
  speculativedisaggregationr.   )Br   includer  r   r   r   r   rf   r
   rg   r   r   r   ri   r   r   r   r   r   r   r   r   r   r   r   r   roundr  r  weight_load_mem_usagetoken_to_kv_pool_allocatorget_kvcache	mem_usagegraph_mem_usager"   AttributeErrorr   debugr   r   r[   r   rZ   ra   r   r  rE  r   r  r  r   r   r   r\   r]   r   r   r   r   r   r@   rO   r   r   r(  r
  )r;   r&  rX  include_allr   r7  r4  r   r   r   r   r   r;  r)  rV  rD  rW  mode_strprefill_preallocprefill_inflightdecode_preallocdecode_transferdecode_retractedrN  r.   r.   r/   	get_loadsl  s   






	
zSchedulerMetricsMixin.get_loadsc                 c  sj    | j rtsd V  d S d|jj  }| jjt||jdd d V  W d    d S 1 s.w   Y  d S )Nforward_)categoryr   )metadata)	re   rn   r   r  r  rp   wrapdictr   )r;   r   rj  r.   r.   r/   record_forward_metrics  s   
"z,SchedulerMetricsMixin.record_forward_metrics)r;   r   r>   r"   r?   r"   r@   rA   )r;   r   rs   rw   )r;   r   r{   r"   r|   r"   )r;   r   r1   )r;   r   r   r!   r   r   r   r   )r;   r   r   r   r   r   )r;   r   r   r   r|   r"   )r;   r   r   r   r   r   )r;   r   r   r   r,  r   )r;   r   r&  r   r,  r   )r;   r   r   r   )r)   r*   r+   rv   rr   r}   r   r   r   r   r	  r   r   r   r   r8  rh  r   rn  r.   r.   r.   r/   r=   E   s,    

G

  
-



	
, r=   )<
__future__r   dataclassesloggingrO   collectionsr   
contextlibr   typingr   r   r   #sglang.srt.disaggregation.kv_eventsr   r	   sglang.srt.disaggregation.utilsr
   sglang.srt.environr   sglang.srt.managers.io_structr   r   r   r   r   r   r   r   r   sglang.srt.managers.schedulerr   sglang.srt.managers.utilsr   sglang.srt.metrics.collectorr   r   r   r   sglang.srt.utilsr   sglang.srt.utils.device_timerr   (sglang.srt.utils.scheduler_status_loggerr   r   r   	getLoggerr)   r   r   SGLANG_LOG_FORWARD_ITERSgetr   "SGLANG_ENABLE_METRICS_DEVICE_TIMERrn   	dataclassr!   r0   r=   r.   r.   r.   r/   <module>   s6    ,



