o
    .iW:                     @   s>  d dl Z d dlmZmZ d dlmZmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ er@d dlmZmZmZ eG d	d
 d
ZG dd dZeG dd deZeG dd deZeG dd dZeG dd dZeG dd dZeG dd dZG dd dZG dd dZG dd dZ dS )    N)defaultdictdeque)	dataclassfield)TYPE_CHECKINGAny)CUDAGraphStat)	PerfStats)SpecDecodingStats)EngineCoreEventEngineCoreOutputFinishReasonc                   @   sH   e Zd ZU dZdZeed< 	 dZeed< 	 dZ	eed< 	 dZ
eed< dS )	BaseCacheStatszStores cache hit statistics.Fresetr   requestsquerieshitsN)__name__
__module____qualname____doc__r   bool__annotations__r   intr   r    r   r   R/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/v1/metrics/stats.pyr      s   
 r   c                       sf   e Zd ZdZddeddf fddZdefd	d
Zdd Ze	de
fddZe	defddZ  ZS )CachingMetricszMetrics for caching with a hit rate of the most recent N requests.
    Args:
        interval: The number of the most recent requests to aggregate.
            Defaults to 1000.
      max_recent_requestsreturnNc                    s<   t    || _d| _d| _d| _tttttf   | _	d S Nr   )
super__init__r   aggregated_requestsaggregated_query_totalaggregated_query_hitr   tupler   query_queue)selfr   	__class__r   r   r"   *   s   
zCachingMetrics.__init__statsc                 C   s   |j r|    |jdkrdS | j|j|j|jf |  j|j7  _|  j|j7  _|  j|j7  _t	| jdkrk| j| j
kro| j \}}}|  j|8  _|  j|8  _|  j|8  _t	| jdkrm| j| j
ks?dS dS dS dS )a  Observe the prefix caching for a set of requests.

        This function is called with information gathered when new requests
        are being scheduled and are looking for computed blocks.

        When there are more than `max_recent_requests` requests, the oldest set
        of requests are removed from the metrics.

        Args:
            stats: The prefix cache stats.
        r   N   )r   r   r'   appendr   r   r#   r$   r%   lenr   popleft)r(   r+   old_requestsold_queriesold_hitsr   r   r   observe6   s$   
zCachingMetrics.observec                 C   s    d| _ d| _d| _| j  dS )zReset the metrics.r   N)r#   r$   r%   r'   clearr(   r   r   r   r   ^   s   zCachingMetrics.resetc                 C   s
   | j dkS )z.Return true if no requests have been observed.r   )r#   r5   r   r   r   emptye   s   
zCachingMetrics.emptyc                 C   s   | j dkrdS | j| j  S )z/Calculate the hit rate for the past N requests.r           )r$   r%   r5   r   r   r   hit_ratej   s   
zCachingMetrics.hit_rate)r   )r   r   r   r   r   r"   r   r3   r   propertyr   r6   floatr8   __classcell__r   r   r)   r   r   #   s    (r   c                   @   sV   e Zd ZU dZdZeed< 	 dZeed< 	 dZeed< 	 dedede	d	d
fddZ
d
S )PrefixCacheStatsz
    Stores prefix cache hit statistics.
    - `reset`: Whether `reset_prefix_cache` was invoked.
    - `queries`: Refers to the number of tokens that were queried.
    r   preempted_requestspreempted_queriespreempted_hits
num_tokensnum_hits	preemptedr   Nc                 C   s`   |r|  j d7  _ |  j|7  _|  j|7  _dS |  jd7  _|  j|7  _|  j|7  _dS )z-Aggregate request information into the stats.r,   N)r=   r>   r?   r   r   r   )r(   r@   rA   rB   r   r   r   record   s   zPrefixCacheStats.record)r   r   r   r   r=   r   r   r>   r?   r   rC   r   r   r   r   r<   r   s   
 r<   c                   @   s   e Zd ZdZdS )MultiModalCacheStatsz
    Stores multi-modal cache hit statistics.
    - `reset`: Whether `reset_mm_cache` was invoked.
    - `queries`: Refers to the number of multi-modal data items
      that were queried.
    N)r   r   r   r   r   r   r   r   rD      s    rD   c                   @   s2   e Zd ZU dZeed< eed< eedf ed< dS )KVCacheEvictionEventz&Single KV cache block eviction sample.lifetime_secondsidle_seconds.reuse_gaps_secondsN)r   r   r   r   r:   r   r&   r   r   r   r   rE      s
   
 rE   c                   @   s  e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< eed	Zeed
< dZedB ed< eed	Zee ed< dZedB ed< dZeeef dB ed< eed	Zeeef ed< eed	Zeeef ed< dZedB ed< dZedB ed< dS )SchedulerStatsz$Stats associated with the scheduler.r   num_running_reqsnum_waiting_reqsstep_countercurrent_waver7   kv_cache_usage)default_factoryprefix_cache_statsNconnector_prefix_cache_statskv_cache_eviction_eventsspec_decoding_statskv_connector_statswaiting_lora_adaptersrunning_lora_adapterscudagraph_stats
perf_stats)r   r   r   r   rJ   r   r   rK   rL   rM   rN   r:   r   r<   rP   rQ   listrR   rE   rS   r
   rT   dictstrr   rU   rV   rW   r   rX   r	   r   r   r   r   rI      s    
 rI   c                   @   sr   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< dZeed< dZeed	< dZeed
< dZeed< dS )RequestStateStatsz3Stats that need to be tracked across delta updates.r   num_generation_tokensr7   arrival_time	queued_tsscheduled_tsfirst_token_tslast_token_tsfirst_token_latencyFis_corruptedN)r   r   r   r   r]   r   r   r^   r:   r_   r`   ra   rb   rc   rd   r   r   r   r   r   r\      s   
 r\   c                   @   s   e Zd ZU dZded< dZeed< dZeed< dZ	eed< d	Z
ed	B ed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< d	S )FinishedRequestStatsz)Stats associated with a finished request.r   finish_reasonr7   e2e_latencyr   num_prompt_tokensr]   Nmax_tokens_paramqueued_timeprefill_timeinference_timedecode_timemean_time_per_output_tokenFrd   num_cached_tokens)r   r   r   r   r   rg   r:   rh   r   r]   ri   rj   rk   rl   rm   rn   rd   r   ro   r   r   r   r   re      s   
 re   c                   @   s   e Zd ZdZdd ZdefddZdedefdd	Zd
ddede	de
dedddedB fddZdeded de	dedddedB fddZ	d#ddde
de
dB ded e
f
d!d"ZdS )$IterationStatsz8Stats associated with a single set of EngineCoreOutputs.c                 C   sD   t   | _d| _d| _d| _g | _g | _g | _g | _g | _	d| _
d S r    )timeiteration_timestampr]   rh   num_preempted_reqsfinished_requestsmax_num_generation_tokens_itern_params_itertime_to_first_tokens_iterinter_token_latencies_iternum_corrupted_reqsr5   r   r   r   r"      s   

zIterationStats.__init__r   c                 C   s0   d dd t|  D }| jj d| dS )Nz, c                 s   s"    | ]\}}| d | V  qdS )=Nr   ).0kvr   r   r   	<genexpr>   s     z*IterationStats.__repr__.<locals>.<genexpr>())joinvarsitemsr*   r   )r(   field_to_value_strr   r   r   __repr__   s   zIterationStats.__repr__startc                 C   s
   | j | S )z=Calculate an interval relative to this iteration's timestamp.)rr   )r(   r   r   r   r   _time_since   s   
zIterationStats._time_sinceoutputr   engine_core_timestampis_prefilling
prompt_len	req_statslora_statesLoRARequestStates	lora_nameNc                 C   s   t |j}|  j|7  _|r$|  j|7  _| |j}	| j|	 |	|_| j|7  _t	j
r9|js9|jdkr9d|_|jd urJ| |j|j|||| |rP||_n||j }
| j|
 ||_d S )Nr   T)r.   new_token_idsr]   rh   r   r^   rw   r-   rc   envsVLLM_COMPUTE_NANS_IN_LOGITSrd   num_nans_in_logitseventsupdate_from_events
request_idra   rb   rx   )r(   r   r   r   r   r   r   r   num_new_generation_tokensrc   itlr   r   r   update_from_output   s8   






z!IterationStats.update_from_outputreq_idr   r   c           	      C   s   ddl m} |D ]<}|j|jkr|j|_||| q|j|jkr1|jdkr*|j|_|	|| q|j|j
krD|  jd7  _||| qd S )Nr   )EngineCoreEventTyper7   r,   )vllm.v1.enginer   typeQUEUED	timestampr_   request_waiting	SCHEDULEDr`   request_running	PREEMPTEDrs   )	r(   r   r   r   r   r   r   r   eventr   r   r   r   3  s   

z!IterationStats.update_from_eventsr   rf   r   rh   ri   ro   c                 C   s   |  |j}|j|j }|j|j }|j|j }	|j|j }
|jd dkr,|	|jd  nd}t||||j||||
|	||j|d}| j	
| |jrR|  jd7  _d S d S )Nr,   r   )rf   rg   rh   r]   ri   rj   rk   rl   rm   rn   rd   ro   )r   r^   r`   r_   ra   rb   r]   re   rd   rt   r-   ry   )r(   rf   rh   ri   r   ro   rg   rj   rk   rm   rl   rn   finished_reqr   r   r   update_from_finished_requestK  s6   z+IterationStats.update_from_finished_request)r   )r   r   r   r   r"   r[   r   r:   r   r   r   r\   r   rY   r   r   r   r   r   r   rp      sZ    
3
rp   c                   @   s@   e Zd ZdZdd ZdededefddZed	efd
dZ	dS )	LoRAStatsz9Tracks waiting and running request IDs for a single LoRA.c                 C   s   t  | _t  | _d S N)setwaitingrunningr5   r   r   r   r"     s   zLoRAStats.__init__r   r   r   c                 C   sN   |r|rJ |r| j | n| j | |r| j| d S | j| d S r   )r   adddiscardr   )r(   r   r   r   r   r   r   update  s   zLoRAStats.updater   c                 C   s   | j p| j S r   r   r   r5   r   r   r   r6     s   zLoRAStats.emptyN)
r   r   r   r   r"   r[   r   r   r9   r6   r   r   r   r   r     s    r   c                   @   s   e Zd ZdZddefddZdededB d	ed
efddZdededB fddZdededB fddZ	dededB fddZ
dedB fddZdS )r   z1A per-LoRA count of running and waiting requests.F	log_statsc                 C   s   || _ tt| _d S r   )r   r   r   r   )r(   r   r   r   r   r"     s   zLoRARequestStates.__init__r   r   Nr   r   c                 C   s@   | j r|d u r	d S | j| }|||| |jr| j|= d S d S r   )r   r   r   r6   )r(   r   r   r   r   
lora_statsr   r   r   _request_update  s   
z!LoRARequestStates._request_updatec                 C      | j ||ddd d S )NTFr   r   r(   r   r   r   r   r   r        z!LoRARequestStates.request_waitingc                 C   r   )NFTr   r   r   r   r   r   r     r   z!LoRARequestStates.request_runningc                 C   s   | j ||ddd d S )NFr   r   r   r   r   r   request_finished  r   z"LoRARequestStates.request_finishedscheduler_statsc                 C   sJ   | j r|d u r	d S | j D ]\}}t|j|j|< t|j|j|< qd S r   )r   r   r   r.   r   rU   r   rV   )r(   r   r   r+   r   r   r   update_scheduler_stats  s   z(LoRARequestStates.update_scheduler_stats)F)r   r   r   r   r   r"   r[   r   r   r   r   rI   r   r   r   r   r   r     s     
r   )!rq   collectionsr   r   dataclassesr   r   typingr   r   	vllm.envsr   vllm.compilation.cuda_graphr   vllm.v1.metrics.perfr	   vllm.v1.spec_decode.metricsr
   r   r   r   r   r   r   r<   rD   rE   rI   r\   re   rp   r   r   r   r   r   r   <module>   s:   O	 