o
    iD                     @   sP  d dl Z d dlmZmZ d dlmZmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ er@d dlmZmZmZ eG d	d
 d
ZG dd dZeG dd deZeG dd deZeG dd dZeG dd dZeG dd dZeG dd dZeG dd dZG dd dZG dd dZ G dd  d Z!dS )!    N)defaultdictdeque)	dataclassfield)TYPE_CHECKINGAny)CUDAGraphStat)	PerfStats)SpecDecodingStats)EngineCoreEventEngineCoreOutputFinishReasonc                   @   sH   e Zd ZU dZdZeed< 	 dZeed< 	 dZ	eed< 	 dZ
eed< dS )	BaseCacheStatszStores cache hit statistics.Fresetr   requestsquerieshitsN)__name__
__module____qualname____doc__r   bool__annotations__r   intr   r    r   r   K/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/metrics/stats.pyr      s   
 r   c                       sf   e Zd ZdZddeddf fddZdefd	d
Zdd Ze	de
fddZe	defddZ  ZS )CachingMetricszMetrics for caching with a hit rate of the most recent N requests.
    Args:
        interval: The number of the most recent requests to aggregate.
            Defaults to 1000.
      max_recent_requestsreturnNc                    s<   t    || _d| _d| _d| _tttttf   | _	d S Nr   )
super__init__r   aggregated_requestsaggregated_query_totalaggregated_query_hitr   tupler   query_queue)selfr   	__class__r   r   r"   *   s   
zCachingMetrics.__init__statsc                 C   s   |j r|    |jdkrdS | j|j|j|jf |  j|j7  _|  j|j7  _|  j|j7  _t	| jdkrk| j| j
kro| j \}}}|  j|8  _|  j|8  _|  j|8  _t	| jdkrm| j| j
ks?dS dS dS dS )a  Observe the prefix caching for a set of requests.

        This function is called with information gathered when new requests
        are being scheduled and are looking for computed blocks.

        When there are more than `max_recent_requests` requests, the oldest set
        of requests are removed from the metrics.

        Args:
            stats: The prefix cache stats.
        r   N   )r   r   r'   appendr   r   r#   r$   r%   lenr   popleft)r(   r+   old_requestsold_queriesold_hitsr   r   r   observe6   s$   
zCachingMetrics.observec                 C   s    d| _ d| _d| _| j  dS )zReset the metrics.r   N)r#   r$   r%   r'   clearr(   r   r   r   r   ^   s   zCachingMetrics.resetc                 C   s
   | j dkS )z.Return true if no requests have been observed.r   )r#   r5   r   r   r   emptye   s   
zCachingMetrics.emptyc                 C   s   | j dkrdS | j| j  S )z/Calculate the hit rate for the past N requests.r           )r$   r%   r5   r   r   r   hit_ratej   s   
zCachingMetrics.hit_rate)r   )r   r   r   r   r   r"   r   r3   r   propertyr   r6   floatr8   __classcell__r   r   r)   r   r   #   s    (r   c                   @   sV   e Zd ZU dZdZeed< 	 dZeed< 	 dZeed< 	 dedede	d	d
fddZ
d
S )PrefixCacheStatsz
    Stores prefix cache hit statistics.
    - `reset`: Whether `reset_prefix_cache` was invoked.
    - `queries`: Refers to the number of tokens that were queried.
    r   preempted_requestspreempted_queriespreempted_hits
num_tokensnum_hits	preemptedr   Nc                 C   s`   |r|  j d7  _ |  j|7  _|  j|7  _dS |  jd7  _|  j|7  _|  j|7  _dS )z-Aggregate request information into the stats.r,   N)r=   r>   r?   r   r   r   )r(   r@   rA   rB   r   r   r   record   s   zPrefixCacheStats.record)r   r   r   r   r=   r   r   r>   r?   r   rC   r   r   r   r   r<   r   s   
 r<   c                   @   s   e Zd ZdZdS )MultiModalCacheStatsz
    Stores multi-modal cache hit statistics.
    - `reset`: Whether `reset_mm_cache` was invoked.
    - `queries`: Refers to the number of multi-modal data items
      that were queried.
    N)r   r   r   r   r   r   r   r   rD      s    rD   c                   @   s2   e Zd ZU dZeed< eed< eedf ed< dS )KVCacheEvictionEventz&Single KV cache block eviction sample.lifetime_secondsidle_seconds.reuse_gaps_secondsN)r   r   r   r   r:   r   r&   r   r   r   r   rE      s
   
 rE   c                   @   s  e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed	< eed
Zeed< dZedB ed< eed
Zee ed< dZedB ed< dZeeef dB ed< eed
Zeeef ed< eed
Zeeef ed< dZedB ed< dZedB ed< dS )SchedulerStatsz$Stats associated with the scheduler.r   num_running_reqsnum_waiting_reqsstep_countercurrent_waver7   kv_cache_usageencoder_cache_usage)default_factoryprefix_cache_statsNconnector_prefix_cache_statskv_cache_eviction_eventsspec_decoding_statskv_connector_statswaiting_lora_adaptersrunning_lora_adapterscudagraph_stats
perf_stats) r   r   r   r   rJ   r   r   rK   rL   rM   rN   r:   rO   r   r<   rQ   rR   listrS   rE   rT   r
   rU   dictstrr   rV   rW   rX   r   rY   r	   r   r   r   r   rI      s"   
 rI   c                   @   sr   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< dZeed< dZeed	< dZeed
< dZeed< dS )RequestStateStatsz3Stats that need to be tracked across delta updates.r   num_generation_tokensr7   arrival_time	queued_tsscheduled_tsfirst_token_tslast_token_tsfirst_token_latencyFis_corruptedN)r   r   r   r   r^   r   r   r_   r:   r`   ra   rb   rc   rd   re   r   r   r   r   r   r]      s   
 r]   c                   @   s   e Zd ZU dZded< dZeed< dZeed< dZ	eed< d	Z
ed	B ed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< d	S )FinishedRequestStatsz)Stats associated with a finished request.r   finish_reasonr7   e2e_latencyr   num_prompt_tokensr^   Nmax_tokens_paramqueued_timeprefill_timeinference_timedecode_timemean_time_per_output_tokenFre   num_cached_tokens)r   r   r   r   r   rh   r:   ri   r   r^   rj   rk   rl   rm   rn   ro   re   r   rp   r   r   r   r   rf      s   
 rf   c                   @   s   e Zd ZU dZdZeedf ed< dZe	ed< dZ
e	ed< dZe	ed< dZe	ed	< dZe	ed
< dZe	ed< de	de	de	ddfddZdede	fddZdS )PromptTokenStatsab  Breakdown of prompt tokens by source.

    Fields:
        computed: Tokens prefilled locally (actual compute work).
        local_cache_hit: Tokens from local prefix cache.
        external_kv_transfer: Tokens from external KV transfer.
        cached_tokens: Tokens skipped during prefill (from scheduler).
        recomputed_tokens: Cached tokens that were recomputed (see below).
        total: Total prompt tokens.

    Invariants:
        computed + local_cache_hit + external_kv_transfer - recomputed_tokens = total
        local_cache_hit + external_kv_transfer - recomputed_tokens = cached_tokens
    local_computelocal_cache_hitexternal_kv_transfer.ALL_SOURCESr   computedrt   ru   cached_tokensrecomputed_tokenstotalrp   num_external_computed_tokens
prompt_lenr   Nc                 C   sx   |d |krdnd}|  j || 7  _ |  j|7  _|  j|| | 7  _|  j|7  _|  j|7  _|  j|7  _dS )z#Update stats from a prefill output.r,   r   N)rw   ru   rt   rx   ry   rz   )r(   rp   r{   r|   
recomputedr   r   r   update_from_output  s   

z#PromptTokenStats.update_from_outputsourcec                 C   s0   | j | j| jd}||vrtd| || S )z Get token count by source label.rr   zUnknown source: )rw   rt   ru   
ValueError)r(   r   
source_mapr   r   r   get_by_source  s   zPromptTokenStats.get_by_source)r   r   r   r   rv   r&   r\   r   rw   r   rt   ru   rx   ry   rz   r~   r   r   r   r   r   rq      s&   
 
rq   c                   @   s   e Zd ZdZdd ZdefddZedefddZ	d	e
de
fd
dZddde
dedededddedB fddZdeded dededddedB fddZ	d%ddd ed!edB ded"ef
d#d$ZdS )&IterationStatsz8Stats associated with a single set of EngineCoreOutputs.c                 C   sF   t   | _d| _t | _d| _g | _g | _g | _g | _	g | _
d| _d S r    )timeiteration_timestampr^   rq   prompt_token_statsnum_preempted_reqsfinished_requestsmax_num_generation_tokens_itern_params_itertime_to_first_tokens_iterinter_token_latencies_iternum_corrupted_reqsr5   r   r   r   r"   ,  s   

zIterationStats.__init__r   c                 C   s0   d dd t|  D }| jj d| dS )Nz, c                 s   s"    | ]\}}| d | V  qdS )=Nr   ).0kvr   r   r   	<genexpr>9  s     z*IterationStats.__repr__.<locals>.<genexpr>())joinvarsitemsr*   r   )r(   field_to_value_strr   r   r   __repr__8  s   zIterationStats.__repr__c                 C   s   | j jS )z1Total prompt tokens (for backward compatibility).)r   rz   r5   r   r   r   ri   <  s   z IterationStats.num_prompt_tokensstartc                 C   s
   | j | S )z=Calculate an interval relative to this iteration's timestamp.)r   )r(   r   r   r   r   _time_sinceA  s   
zIterationStats._time_sinceoutputr   engine_core_timestampis_prefillingr|   	req_statslora_statesLoRARequestStates	lora_nameNc                 C   s   t |j}|  j|7  _|r(| jj|j|j|d | |j}	| j	
|	 |	|_| j|7  _tjr=|js=|jdkr=d|_|jd urN| |j|j|||| |rT||_n||j }
| j
|
 ||_d S )N)rp   r{   r|   r   T)r.   new_token_idsr^   r   r~   rp   r{   r   r_   r   r-   rd   envsVLLM_COMPUTE_NANS_IN_LOGITSre   num_nans_in_logitseventsupdate_from_events
request_idrb   rc   r   )r(   r   r   r   r|   r   r   r   num_new_generation_tokensrd   itlr   r   r   r~   E  s@   






z!IterationStats.update_from_outputreq_idr   r   c           	      C   s   ddl m} |D ]<}|j|jkr|j|_||| q|j|jkr1|jdkr*|j|_|	|| q|j|j
krD|  jd7  _||| qd S )Nr   )EngineCoreEventTyper7   r,   )vllm.v1.enginer   typeQUEUED	timestampr`   request_waiting	SCHEDULEDra   request_running	PREEMPTEDr   )	r(   r   r   r   r   r   r   r   eventr   r   r   r   |  s   

z!IterationStats.update_from_eventsr   rg   r   ri   rj   rp   c                 C   s   |  |j}|j|j }|j|j }|j|j }	|j|j }
|jd dkr,|	|jd  nd}t||||j||||
|	||j|d}| j	
| |jrR|  jd7  _d S d S )Nr,   r   )rg   rh   ri   r^   rj   rk   rl   rm   rn   ro   re   rp   )r   r_   ra   r`   rb   rc   r^   rf   re   r   r-   r   )r(   rg   ri   rj   r   rp   rh   rk   rl   rn   rm   ro   finished_reqr   r   r   update_from_finished_request  s6   z+IterationStats.update_from_finished_request)r   )r   r   r   r   r"   r\   r   r9   r   ri   r:   r   r   r]   r~   rZ   r   r   r   r   r   r   r   )  s^    
7
r   c                   @   s@   e Zd ZdZdd ZdededefddZed	efd
dZ	dS )	LoRAStatsz9Tracks waiting and running request IDs for a single LoRA.c                 C   s   t  | _t  | _d S N)setwaitingrunningr5   r   r   r   r"     s   zLoRAStats.__init__r   r   r   c                 C   sN   |r|rJ |r| j | n| j | |r| j| d S | j| d S r   )r   adddiscardr   )r(   r   r   r   r   r   r   update  s   zLoRAStats.updater   c                 C   s   | j p| j S r   r   r   r5   r   r   r   r6     s   zLoRAStats.emptyN)
r   r   r   r   r"   r\   r   r   r9   r6   r   r   r   r   r     s    r   c                   @   s   e Zd ZdZddefddZdededB d	ed
efddZdededB fddZdededB fddZ	dededB fddZ
dedB fddZdS )r   z1A per-LoRA count of running and waiting requests.F	log_statsc                 C   s   || _ tt| _d S r   )r   r   r   r   )r(   r   r   r   r   r"     s   zLoRARequestStates.__init__r   r   Nr   r   c                 C   s@   | j r|d u r	d S | j| }|||| |jr| j|= d S d S r   )r   r   r   r6   )r(   r   r   r   r   
lora_statsr   r   r   _request_update  s   
z!LoRARequestStates._request_updatec                 C      | j ||ddd d S )NTFr   r   r(   r   r   r   r   r   r        z!LoRARequestStates.request_waitingc                 C   r   )NFTr   r   r   r   r   r   r     r   z!LoRARequestStates.request_runningc                 C   s   | j ||ddd d S )NFr   r   r   r   r   r   request_finished  r   z"LoRARequestStates.request_finishedscheduler_statsc                 C   sJ   | j r|d u r	d S | j D ]\}}t|j|j|< t|j|j|< qd S r   )r   r   r   r.   r   rV   r   rW   )r(   r   r   r+   r   r   r   update_scheduler_stats  s   z(LoRARequestStates.update_scheduler_stats)F)r   r   r   r   r   r"   r\   r   r   r   r   rI   r   r   r   r   r   r     s     
r   )"r   collectionsr   r   dataclassesr   r   typingr   r   	vllm.envsr   vllm.compilation.cuda_graphr   vllm.v1.metrics.perfr	   vllm.v1.spec_decode.metricsr
   r   r   r   r   r   r   r<   rD   rE   rI   r]   rf   rq   r   r   r   r   r   r   r   <module>   s>   O	> !