o
    iצ                     @   sr  U d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ ddlZddlmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddl m!Z! ee"Z#G dd de$Z%e
G dd dZ&e
G dd dZ'e
G dd dZ(G dd dZ)G dd deZ*G dd dZ+i Z,e-e.e/d f e0d< G dd deeZ1G dd de*Z2G d d! d!e*Z3G d"d# d#e*Z4G d$d% d%e1Z5G d&d' d'e*Z6G d(d) d)e*Z7G d*d+ d+e*Z8G d,d- d-e*Z9G d.d/ d/e*Z:G d0d1 d1e1Z;G d2d3 d3e1Z<G d4d5 d5Z=G d6d7 d7Z>G d8d9 d9Z?d:e@d;e.fd<d=ZAdBd:e@d>eBe. d?e@fd@dAZCdS )Cz
Analytic flops/memory estimation module for transformer components,
to help derive MFU (Model Flops Utilization) stats for a running model.
    N)ABCabstractmethod)Iterable)asdict	dataclass)AnyProtocol)	BaseModelFieldValidationErrormodel_validator)Self)
VllmConfig)init_logger)STR_DTYPE_TO_TORCH_DTYPEget_dtype_sizeget_kv_cache_torch_dtype)SchedulerOutputc                   @   s   e Zd ZdZdS )InvalidComponentzt
    Custom exception to indicate that a certain ComponentMetric is not
    applicable to the given VllmConfig.
    N)__name__
__module____qualname____doc__ r   r   J/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/metrics/perf.pyr   !   s    r   c                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZ	e
eef dB ed< dZe
eef dB ed< dZe
eef dB ed	< dZe
eef dB ed
< dS )DebugPerfStats        calc_durationr   num_prefill_requestsnum_decode_requestsNcontext_breakdownnum_flops_per_gpu_breakdown num_read_bytes_per_gpu_breakdown!num_write_bytes_per_gpu_breakdown)r   r   r   r   float__annotations__r   intr   r    dictstrr!   r"   r#   r   r   r   r   r   -   s   
 r   c                   @   sB   e Zd ZU dZeed< dZeed< dZeed< dZe	dB ed< dS )	PerfStatsr   num_flops_per_gpunum_read_bytes_per_gpunum_write_bytes_per_gpuNdebug_stats)
r   r   r   r*   r&   r%   r+   r,   r-   r   r   r   r   r   r)   9   s
   
 r)   c                	   @   s   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed< dZeed	< dZeed
< dedededdfddZdefddZdefddZdefddZededededd fddZdS )ExecutionContexta  
    Represents an execution context for a batch of requests.

    This class aggregates statistics across multiple requests in a batch,
    separately tracking prefill and decode phases.

    Example)
    - Batch with one full prefill (2048 tokens) and one decode (1 token, 8192 context):
      ctx = ExecutionContext()
      ctx.add(2048, 2048, is_prefill=True)
      ctx.add(1, 8192, is_prefill=False)
    r   r   prefill_num_tokensprefill_context_lenprefill_token_context_productr   decode_num_tokensdecode_context_lendecode_token_context_product
num_tokenscontext_len
is_prefillreturnNc                 C   s   |r"|  j d7  _ |  j|7  _|  j|7  _|  j|| 7  _dS |  jd7  _|  j|7  _|  j|7  _|  j|| 7  _dS )z8Add a single request's statistics to this batch context.   N)r   r/   r0   r1   r   r2   r3   r4   )selfr5   r6   r7   r   r   r   add\   s   zExecutionContext.addc                 C      | j | j S )z8Total number of tokens across all requests in the batch.)r/   r2   r:   r   r   r   total_num_tokensi      z!ExecutionContext.total_num_tokensc                 C   r<   )z<Total sum of (num_tokens * context_len) across all requests.)r1   r4   r=   r   r   r   total_token_context_productm   r?   z,ExecutionContext.total_token_context_productc                 C   r<   )zNumber of tokens that require logits computation (unembedding).

        For prefill, only the last token per request needs logits.
        For decode, all tokens need logits.
        )r   r2   r=   r   r   r   num_logits_tokensq   s   z"ExecutionContext.num_logits_tokensc                 C   s   |  }| ||| |S )zwCreate an ExecutionContext from a single request.

        This is a convenience method primarily for testing.
        )r;   )clsr5   r6   r7   ctxr   r   r   from_single_requesty   s   z$ExecutionContext.from_single_request)r   r   r   r   r   r&   r%   r/   r0   r1   r   r2   r3   r4   boolr;   r>   r@   rA   classmethodrD   r   r   r   r   r.   A   s0   
 r.   c                   @   sN   e Zd ZdZdedefddZdededdfdd	Zdeeef fd
dZ	dS )
ParsedArgsz
    Syntactic sugar so that Parsers can use dot notations
    to access/update the parsed arguments.

    e.g.)
        args = ParsedArgs()
        args.x = 3
        args.y = args.x + 1
    namer8   c                 C   s   t dt| j d| d)N'z' has no attribute ')AttributeErrortyper   )r:   rH   r   r   r   __getattr__   s   zParsedArgs.__getattr__valueNc                 C   s   t | || d S N)object__setattr__)r:   rH   rM   r   r   r   rP      s   zParsedArgs.__setattr__c                 C   s   t |  S rN   )varscopyr=   r   r   r   
model_dump      zParsedArgs.model_dump)
r   r   r   r   r(   r   rL   rP   r'   rS   r   r   r   r   rG      s
    
rG   c                   @   s"   e Zd ZdededefddZdS )Parserargsvllm_configr8   c                 C      dS )z
        Parse the vllm config and update the current ParsedArgs and pass it on.
        If the parser isn't applicable to the vllm_config, it will do nothing.
        Nr   r:   rV   rW   r   r   r   parse   s   zParser.parseN)r   r   r   rG   r   rZ   r   r   r   r   rU      s    rU   c                   @   sF   e Zd ZdZdeddfddZdeddfdd	Zd
edefddZ	dS )ParserChainz
    Applies chain of parser in a sequential order.
    Later parsers might overwrite results from previous parsers,
    so parsers should be chained in the appropriate order if they
    are not mutually exclusive.
    parsersr8   Nc                 G   s   t || _d S rN   )listr\   )r:   r\   r   r   r   __init__      zParserChain.__init__parserc                 C   s   | j | d S rN   )r\   append)r:   r`   r   r   r   
add_parser      zParserChain.add_parserrW   c                 C   s"   t  }| jD ]}|||}q|S rN   )rG   r\   rZ   )r:   rW   rV   r`   r   r   r   rZ      s   
zParserChain.parse)
r   r   r   r   rU   r^   rb   r   rG   rZ   r   r   r   r   r[      s
    r[   ComponentMetrics_COMPONENT_METRICS_REGISTRYc                
   @   s.  e Zd ZdZeedefddZeedefddZ	dd Z
ed	edefd
dZedeed   fddZe	ddededeeef fddZe	ddededeeef fddZe	ddededeeef fddZddededefddZddededefddZddededefddZdS )rd   a-  
    Each concrete ComponentMetrics class is associated with:
    - fields that are required for metric derivation
      (fields are specified/validated through pydantic model)
    - parser to parse VllmConfig into fields
    - metric methods that derive flops/bytes for a given execution context
    r8   c                 C      d S rN   r   rB   r   r   r   component_type      zComponentMetrics.component_typec                 C   rX   )a  
        Return a ParserChain that provides values for all required fields.
        The returned parser chain must populate ParsedArgs with values for every
        field defined on this ComponentMetrics class. Missing fields will cause
        a ValidationError when from_vllm_config() is called.
        See individual Parser docstrings for which args they provide, and field
        comments on ComponentMetrics subclasses for which parser provides each field.
        Nr   rg   r   r   r   
get_parser   s   zComponentMetrics.get_parserc                 C   s   | t |  < d S rN   )re   rh   rg   r   r   r   __init_subclass__   rc   z"ComponentMetrics.__init_subclass__rW   c              
   C   sV   |   }||}z| | W S  ty* } ztd|   d| |d}~ww )zj
        Instantiate this class from VllmConfig.
        Raises ValidationError if parsing fails.
        zInvalid z	 config: N)rj   rZ   model_validaterS   r   r   rh   )rB   rW   r`   parsed_argser   r   r   from_vllm_config   s   
z!ComponentMetrics.from_vllm_configc                 C   s   t t S rN   )iterre   valuesrg   r   r   r   registered_metrics   r?   z#ComponentMetrics.registered_metricsTrC   per_gpuc                 C   rf   rN   r   r:   rC   rs   r   r   r   get_num_flops_breakdown      z(ComponentMetrics.get_num_flops_breakdownc                 C   rf   rN   r   rt   r   r   r   get_read_bytes_breakdown   rv   z)ComponentMetrics.get_read_bytes_breakdownc                 C   rf   rN   r   rt   r   r   r   get_write_bytes_breakdown   rv   z*ComponentMetrics.get_write_bytes_breakdownc                 C      t | || S rN   )sumru   rq   rt   r   r   r   get_num_flops      zComponentMetrics.get_num_flopsc                 C   ry   rN   )rz   rw   rq   rt   r   r   r   get_read_bytes   r|   zComponentMetrics.get_read_bytesc                 C   ry   rN   )rz   rx   rq   rt   r   r   r   get_write_bytes  r|   z ComponentMetrics.get_write_bytesNT)r   r   r   r   rF   r   r(   rh   r[   rj   rk   r   r   ro   r   rK   rr   r.   rE   r'   r&   ru   rw   rx   r{   r}   r~   r   r   r   r   rd      sV    


c                   @   &   e Zd ZdZdededefddZdS )BaseConfigParserz
    Parses base model configuration.
    Provides: vocab_size, hidden_size, num_attention_heads, num_hidden_layers,
    weight_byte_size, activation_byte_size, dp_size, tp_size, pp_size, enable_ep
    rV   rW   r8   c                 C   s   |j }| |_| |_t|jd|_t|jd|_|j j	}t
|tj	r(|}nt
|tr6|tv r6t| }n	td| tj}t||_d|_|jj|_|jj|_|jj|_|jj|_|S )Nnum_attention_headsnum_hidden_layersz.Unknown model_dtype %s, defaulting to bfloat16   )model_configget_vocab_size
vocab_sizeget_hidden_sizehidden_sizeget_requiredhf_text_configr   r   dtype
isinstancetorchr(   r   loggerwarningbfloat16r   weight_byte_sizeactivation_byte_sizeparallel_configdata_parallel_sizedp_sizetensor_parallel_sizetp_sizepipeline_parallel_sizepp_sizeenable_expert_parallel	enable_ep)r:   rV   rW   r   model_dtypetorch_dtyper   r   r   rZ     s4   







zBaseConfigParser.parseNr   r   r   r   rG   r   rZ   r   r   r   r   r   
      r   c                   @   r   )BaseAttentionConfigParserzo
    Parses attention-specific configuration.
    Provides: num_key_value_heads, head_dim, cache_byte_size
    rV   rW   r8   c                 C   sB   |j }| |_| |_|j j}|jj}t||}t	||_
|S rN   )r   get_total_num_kv_headsnum_key_value_headsget_head_sizehead_dimr   cache_configcache_dtyper   r   cache_byte_size)r:   rV   rW   r   r   r   kv_cache_torch_dtyper   r   r   rZ   D  s   



zBaseAttentionConfigParser.parseNr   r   r   r   r   r   >      r   c                   @   r   )!AttentionQuantizationConfigParserza
    Parses quantization configuration for attention layers.
    Overrides: weight_byte_size
    rV   rW   r8   c                 C   sB   |j }|d u r	|S | }|dv rd|_|S |dkrd|_|S tN)fp8
fbgemm_fp8r9   mxfp4g      ?quant_configget_namer   r   r:   rV   rW   cfgquant_methodr   r   r   rZ   Y  s   z'AttentionQuantizationConfigParser.parseNr   r   r   r   r   r   S  r   r   c                	   @   sd  e Zd ZU edddZeed< edddZeed< edddZeed< edddZ	eed< edddZ
eed< edddZeed	< edddZeed
< edddZeed< edddZeed< edddZeeB ed< edefddZedefddZ	ddededeeef fddZ	ddededeeef fddZ	ddededeeef fddZdS )AttentionMetrics.r   gtr   r   r   r   r   r   r   r   r   r   r8   c                 C   rX   )Nattnr   rg   r   r   r   rh     ri   zAttentionMetrics.component_typec                 C   s   t t t t S rN   )r[   r   r   r   rg   r   r   r   rj     s
   zAttentionMetrics.get_parserTrC   rs   c           
      C   s   | j | j| j| j| jf\}}}}}| }| }	|r0|| j }td|| j	 }td|| j	 }d| | |d|   | | d| |	 | | d| |	 | | d| | | | | dS )Nr9   r   )qkv_projattn_qkattn_avout_proj)
r   r   r   r   r   r>   r@   r   maxr   )
r:   rC   rs   LDqkvdTTCr   r   r   ru     s"   
z(AttentionMetrics.get_num_flops_breakdownc           
      C   sV  | j | j| j| j| jf\}}}}}| }|r,|| j }td|| j }td|| j }i }	|| | j	 | |	d< t
||d|   | | j | |	d< |jdkrf|j| d|j |  | | j	 | |	d< |jdkr|	dd|j| | | j	 | d|j | | | j |   |	d< || | | j	 | |	d< t
|| | | j | |	d< |	S )	Nr9   	qkv_inputr   
qkv_weightr   
attn_input	out_input
out_weight)r   r   r   r   r   r>   r   r   r   r   r&   r   r/   r0   r2   getr3   r   )
r:   rC   rs   r   r   r   r   r   r   
read_bytesr   r   r   rw     s@   
&


z)AttentionMetrics.get_read_bytes_breakdownc           	      C   s   | j | j| j| j| jf\}}}}}| }|r,|| j }td|| j }td|| j }||d|   | | j	 | d| | | | j
 | || | j	 | dS )z4Calculate write memory traffic for attention layers.r9   r   )
qkv_outputkv_cache
out_output)r   r   r   r   r   r>   r   r   r   r   r   )	r:   rC   rs   r   r   r   r   r   r   r   r   r   rx     s   
z*AttentionMetrics.get_write_bytes_breakdownNr   )r   r   r   r
   r   r&   r%   r   r   r   r   r   r   r   r   r   r$   rF   r(   rh   r[   rj   r.   rE   r'   ru   rw   rx   r   r   r   r   r   o  sN   
 



.
r   c                   @   r   )BaseFfnConfigParserz
    Parses FFN and MoE configuration.
    Provides: intermediate_size, num_experts, num_experts_per_tok,
    moe_intermediate_size, num_shared_experts, num_moe_layers
    rV   rW   r8   c                 C   s   |j j}t|dr|jd ur|j}t|d|jd |_|j  |_t	|ddgd|_
t	|ddgd|_t	|dd	gd|_|jdk}|rI|j|_|S d|_|S )
Ntext_configintermediate_size   num_experts_per_tokmoe_topkr   moe_intermediate_sizen_shared_expertsnum_shared_experts)r   	hf_confighasattrr   getattrr   r   get_num_expertsnum_expertsgetattr_from_listr   r   r   r   num_moe_layers)r:   rV   rW   r   is_moer   r   r   rZ     s&   



zBaseFfnConfigParser.parseNr   r   r   r   r   r     r   r   c                   @   r   )FfnParallelParserzW
    Parses FFN parallelism configuration.

    Provides: ffn_tp_size, ffn_ep_size
    rV   rW   r8   c                 C   s<   |j rd|j|j }}n	|j|j d}}||_||_|S )Nr9   )r   r   r   ffn_tp_sizeffn_ep_size)r:   rV   rW   r   r   r   r   r   rZ     s   zFfnParallelParser.parseNr   r   r   r   r   r     r   r   c                   @   r   )InterleaveMoeLayerStepParserzg
    Parses interleave_moe_layer_step field for models like Llama4.

    Overrides: num_moe_layers
    rV   rW   r8   c                    sX   |j j t dr jd ur j t dr* jdkr*t fddt|jD |_|S )Nr   interleave_moe_layer_stepr   c                    s"   g | ]}|d   j  dkr|qS )r9   r   )r   .0layerr   r   r   
<listcomp>:  s
    z6InterleaveMoeLayerStepParser.parse.<locals>.<listcomp>)	r   r   r   r   r   lenranger   r   rY   r   r   r   rZ   0  s   

z"InterleaveMoeLayerStepParser.parseNr   r   r   r   r   r   )  r   r   c                   @   r   )MoeLayerFreqParserzy
    Parses moe_layer_freq and first_k_dense_replace fields for models like Deepseek.

    Overrides: num_moe_layers
    rV   rW   r8   c                    sX   |j j t dr jd ur j t dr*t dr*t fddt|jD |_|S )Nr   moe_layer_freqfirst_k_dense_replacec                    s(   g | ]}| j kr| j d kr|qS )r   )r   r   r   r   r   r   r   R  s    
z,MoeLayerFreqParser.parse.<locals>.<listcomp>)r   r   r   r   r   r   r   r   rY   r   r   r   rZ   K  s   
	zMoeLayerFreqParser.parseNr   r   r   r   r   r   D  r   r   c                   @   r   )FfnQuantizationConfigParserz\
    Parses quantization configuration for FFN layers.

    Overrides: weight_byte_size
    rV   rW   r8   c                 C   sD   |j }|d u r	|S | }|dv rd|_	 |S |dkr d|_|S tr   r   r   r   r   r   rZ   d  s   z!FfnQuantizationConfigParser.parseNr   r   r   r   r   r   ]  r   r   c                	   @   s  e Zd ZU edddZeed< edddZeed< edddZeed< edddZ	eed< edddZ
eed< edddZeed	< edddZeed
< edZeed< edZeed< edZeed< edZeed< edddZeed< edddZeeB ed< edddefddZedefddZedefddZ	d&dededeeef fdd Z	d&dededeeef fd!d"Z 	d&dededeeef fd#d$Z!d%S )'
FfnMetrics.r   r   r   r   r   r   r   r   r   r   r9   r   r   r   )ger   r   after)moder8   c                 C   sP   | j dkr&| jsJ d| j| jsJ d| j| js&J d| j| S )zJValidate that MoE-related fields are properly set when num_moe_layers > 0.r   zself.num_experts=zself.num_experts_per_tok=zself.moe_intermediate_size=)r   r   r   r   r=   r   r   r   validate_moe_fields  s
   
zFfnMetrics.validate_moe_fieldsc                 C   rX   )Nffnr   rg   r   r   r   rh     ri   zFfnMetrics.component_typec                 C   s   t t t t t t t S rN   )r[   r   r   r   r   r   r   rg   r   r   r   rj     s   zFfnMetrics.get_parserTrC   rs   c                 C   s  | j | j| j}}}| j| j| j| jf\}}}}	| }
|| }|r'|
| nd}|rJ|| j }|| j }|| j	 }|durC|| j	 }|rJ|| j
 }i }|r\d| d | |
 | |d< |rn|rnd| d | | | |d< |r|	rd| d | |	 |
 | |d< |S )z)Calculate flops breakdown for FFN layers.r   Nr      	dense_ffn
routed_ffn
shared_ffn)r   r   r   r   r   r   r   r>   r   r   r   )r:   rC   rs   r   r   DILmEMISr   Ldnum_activated_tokensflopsr   r   r   ru     s2   




 z"FfnMetrics.get_num_flops_breakdownc                 C   s  | j | j| j}}}| j| j| j| jf\}}}}	| }
| j}|| }|r*|
| nd}|rV|| j	 }|| j	 }|| j
 }|durF|| j
 }|rM|| j }|durV|| j }i }|rt|
| | j | |d< td| | | j | |d< td|
 | | j | |d< t|
| | j | |d< t|| | j | |d< |r@|rt||}t|| | j | |d	< td| | | | j | |d
< td| | | j | |d< t|| | j | |d< t|| | | j | |d< |	r@t|
| | j | |d< td| | |	 | j | |d< td|
 | |	 | j | |d< t|
| | j | |d< t|| |	 | j | |d< |S )z-Calculate read memory traffic for FFN layers.r   Ndense_up_gate_inputr   dense_up_gate_weightsdense_silu_inputdense_down_inputdense_down_weightsrouted_up_gate_inputrouted_up_gate_weightsrouted_silu_inputrouted_down_inputrouted_down_weightsshared_up_gate_inputshared_up_gate_weightsshared_silu_inputshared_down_inputshared_down_weights)r   r   r   r   r   r   r   r>   r   r   r   r   r&   r   r   min)r:   rC   rs   r   r   r   r   r   r   r   r   r   r   r  r   num_activated_expertsr   r   r   rw     s   






z#FfnMetrics.get_read_bytes_breakdownc                 C   s  | j | j| j}}}| j| j| j| jf\}}}}	| }
|| }|r'|
| nd}|rJ|| j }|| j }|| j	 }|durC|| j	 }|rJ|| j
 }i }|rwtd|
 | | j | |d< t|
| | j | |d< t|
| | j | |d< |r|rtd| | | j | |d< t|| | j | |d< t|| | j | |d	< |	rtd|
 |	 | | j | |d
< t|
|	 | | j | |d< t|
|	 | | j | |d< |S )z.Calculate write memory traffic for FFN layers.r   Nr   dense_up_gate_outputdense_silu_outputdense_down_outputrouted_up_gate_outputrouted_silu_outputrouted_down_outputshared_up_gate_outputshared_silu_outputshared_down_output)r   r   r   r   r   r   r   r>   r   r   r   r&   r   )r:   rC   rs   r   r   r   r   r   r   r   r   r   r  write_bytesr   r   r   rx   7  sd   




z$FfnMetrics.get_write_bytes_breakdownNr   )"r   r   r   r
   r   r&   r%   r   r   r   r   r   r   r   r   r   r   r   r   r$   r   r   r   rF   r(   rh   r[   rj   r.   rE   r'   ru   rw   rx   r   r   r   r   r   |  sX   
 	

,

[
r   c                	   @   s   e Zd ZU edddZeed< edddZeed< edddZeed< edddZ	eed< eed< e
d	efd
dZe
d	efddZ	ddeded	eeef fddZ	ddeded	eeef fddZ	ddeded	eeef fddZdS )UnembedMetrics.r   r   r   r   r   r   r   r8   c                 C   rX   )Nunembedr   rg   r   r   r   rh     ri   zUnembedMetrics.component_typec                 C   s
   t t S rN   )r[   r   rg   r   r   r   rj     s   zUnembedMetrics.get_parserTrC   rs   c                 C   s8   | j | j}}| }|r|| j }dd| | | iS )z0Calculate flops breakdown for unembedding layer.r  r   )r   r   rA   r   r:   rC   rs   r   Vr   r   r   r   ru     s   
z&UnembedMetrics.get_num_flops_breakdownc                 C   sB   | j | j}}| }|r|| j }|| | j || | j dS )z4Calculate read memory traffic for unembedding layer.)inputweight)r   r   rA   r   r   r   r   r   r   r   rw     s   
z'UnembedMetrics.get_read_bytes_breakdownc                 C   s.   | j }| }|r|| j }d|| | j iS )z5Calculate write memory traffic for unembedding layer.output)r   rA   r   r   )r:   rC   rs   r!  r   r   r   r   rx     s   
z(UnembedMetrics.get_write_bytes_breakdownNr   )r   r   r   r
   r   r&   r%   r   r   r   rF   r(   rh   r[   rj   r.   rE   r'   ru   rw   rx   r   r   r   r   r  }  sD   
 




r  c                	   @   s   e Zd ZdeddfddZdefddZdd	ed
edefddZ	dd	ed
edefddZ
dd	ed
edefddZ	dd	ed
edeeef fddZ	dd	ed
edeeef fddZ	dd	ed
edeeef fddZdedefddZdS )ModelMetricsrW   r8   Nc                 C   s   || _ g | _t D ]7}z||}| j| td| t	| W q
 t
yA } ztd| t	| W Y d}~q
d}~ww dS )z
        Parse vllm_config to instantiate metrics for each component.
        is_enabled() will return False if no component metrics could be instantiated.
        z,Instantiated ComponentMetrics [%s] with (%s)z Failed to instantiate %s from %sN)rW   metricsrd   rr   ro   ra   r   inforh   r(   r   debug)r:   rW   
metric_clsmetricrn   r   r   r   r^     s(   
zModelMetrics.__init__c                 C   s   t | jdkS Nr   )r   r&  r=   r   r   r   
is_enabled  r_   zModelMetrics.is_enabledTrC   rs   c                       t  fdd| jD S )Nc                 3       | ]	}|  V  qd S rN   )r{   r   r*  rC   rs   r   r   	<genexpr>      z-ModelMetrics.get_num_flops.<locals>.<genexpr>rz   r&  rt   r   r0  r   r{        zModelMetrics.get_num_flopsc                    r-  )Nc                 3   r.  rN   )r}   r/  r0  r   r   r1    r2  z.ModelMetrics.get_read_bytes.<locals>.<genexpr>r3  rt   r   r0  r   r}     r4  zModelMetrics.get_read_bytesc                    r-  )Nc                 3   r.  rN   )r~   r/  r0  r   r   r1    r2  z/ModelMetrics.get_write_bytes.<locals>.<genexpr>r3  rt   r   r0  r   r~     r4  zModelMetrics.get_write_bytesc                    H   i }| j D ]}|||}|   fdd| D }|| q|S )Nc                        i | ]\}}  d | |qS .r   r   keyval	componentr   r   
<dictcomp>       z8ModelMetrics.get_num_flops_breakdown.<locals>.<dictcomp>)r&  ru   rh   itemsupdater:   rC   rs   totalr*  	breakdownprefixedr   r<  r   ru        
z$ModelMetrics.get_num_flops_breakdownc                    r5  )Nc                    r6  r7  r   r9  r<  r   r   r>    r?  z9ModelMetrics.get_read_bytes_breakdown.<locals>.<dictcomp>)r&  rw   rh   r@  rA  rB  r   r<  r   rw     rF  z%ModelMetrics.get_read_bytes_breakdownc                    r5  )Nc                    r6  r7  r   r9  r<  r   r   r>    r?  z:ModelMetrics.get_write_bytes_breakdown.<locals>.<dictcomp>)r&  rx   rh   r@  rA  rB  r   r<  r   rx     rF  z&ModelMetrics.get_write_bytes_breakdownscheduler_outputc                 C   s$  t  }t }|jD ]}|j}|j|d}|dkrq
|j| }|j||dd q
|j	}t
|jD ]$\}	}|j|d}|dkrAq1|j|	 }
|
| }|dk}|||| q1| |d}| |d}| |d}tt| t| t| }tjrtt  | |j|jt|||||_|S )zV
        Calculate perf stats for the current step based on scheduled tokens.
        r   T)r7   r9   )time	monotonicr.   scheduled_new_reqsreq_idnum_scheduled_tokensr   num_computed_tokensr;   scheduled_cached_reqs	enumeratereq_idsru   rw   rx   r)   rz   rq   envsVLLM_DEBUG_MFU_METRICSr   r   r   r   r-   )r:   rG  t0rC   new_reqrK  r5   r6   cached_reqsirM  r7   num_flops_breakdownread_bytes_breakdownwrite_bytes_breakdown
perf_statsr   r   r   get_step_perf_stats_per_gpu  sJ   







z(ModelMetrics.get_step_perf_stats_per_gpur   )r   r   r   r   r^   rE   r,  r.   r&   r{   r}   r~   r'   r(   ru   rw   rx   r   r)   r[  r   r   r   r   r%    sF    





r%  c                   @   s@   e Zd Zdd Zdd Zdeddfdd	Zd
edefddZ	dS )PerfMetricsDebugLoggingc                 C   s   |    d S rN   )resetr=   r   r   r   r^   I  rT   z PerfMetricsDebugLogging.__init__c                 C   s4   d| _ d| _d| _d| _i | _i | _i | _i | _d S )Nr   r   )total_calc_durationtotal_num_prefill_requeststotal_num_decode_requeststotal_num_batchestotal_context_breakdown!total_num_flops_per_gpu_breakdown"total_read_bytes_per_gpu_breakdown#total_write_bytes_per_gpu_breakdownr=   r   r   r   r]  L  s   
zPerfMetricsDebugLogging.resetr-   r8   Nc                 C   s   |  j |j7  _ |  j|j7  _|  j|j7  _|  jd7  _t| j| j	| j
| jg|j|j|j|jgD ]\}}t|ts?J | D ]\}}||d| ||< qCq4d S )Nr9   r   )r^  r   r_  r   r`  r   ra  ziprb  rc  rd  re  r    r!   r"   r#   r   r'   r@  r   )r:   r-   dstsrcr:  r;  r   r   r   observeV  s*   zPerfMetricsDebugLogging.observe
log_prefix
delta_timec                 C   s   dd | j  D }dd | j D }dd | j D }td|tj| j| j	| j
| j||||dd| j| dd		d
d d S )Nc                 S   "   i | ]\}}||d  ddqS )   mB.1fTFr   r   kvr   r   r   r>  p      z/PerfMetricsDebugLogging.log.<locals>.<dictcomp>c                 S   rl      eArn  GBr   rp  r   r   r   r>  t  rs  c                 S   rl  rt  r   rp  r   r   r   r>  x  rs  z%sMFU details: %srn  sz.1%)	prefill_reqsdecode_reqsnum_batchesr    flops_breakdownnum_read_bytes_breakdownnum_write_bytes_breakdowndurationmfu_calc_overheadr   )indent)rc  r@  rd  re  r   r(  jsondumpsr_  r`  ra  rb  r^  )r:   log_fnrj  rk  rc  rd  re  r   r   r   logn  s4   
zPerfMetricsDebugLogging.log)
r   r   r   r^   r]  r   ri  r(   r$   r  r   r   r   r   r\  H  s
    
r\  c                   @   sN   e Zd ZdefddZdd Zdeddfd	d
Zej	dfde
ddfddZdS )PerfMetricsLoggingrW   c                 C   s0   || _ |jj| _d | _tjrt | _|   d S rN   )	rW   r   r   r   debug_loggingrQ  rR  r\  r]  )r:   rW   r   r   r   r^     s   
zPerfMetricsLogging.__init__c                 C   s4   t  | _d| _d| _d| _| jr| j  d S d S r+  )rH  rI  last_log_timetotal_num_flops_per_gputotal_read_bytes_per_gputotal_write_bytes_per_gpur  r]  r=   r   r   r   r]    s   
zPerfMetricsLogging.resetrZ  r8   Nc                 C   sZ   |  j |j7  _ |  j|j7  _|  j|j7  _| jr+|jd us"J | j|j d S d S rN   )	r  r*   r  r+   r  r,   r  r-   ri  )r:   rZ  r   r   r   ri    s   zPerfMetricsLogging.observe rj  c                 C   s   | j s| js| jsd S t }|| j }|dkrd}d}n| j | d }| j| j | d }|d||| | jr@| j||| |   d S )Nr   rm  ru  z"%sMFU: %.1f TF/s/GPU %.1f GB/s/GPU)	r  r  r  rH  rI  r  r  r  r]  )r:   r  rj  nowrk  avg_tflops_per_gpuavg_gbps_per_gpur   r   r   r    s8   

zPerfMetricsLogging.log)r   r   r   r   r^   r]  r)   ri  r   r'  r(   r  r   r   r   r   r    s
    

	r  objattrc                 C   s$   t | |std| dt| |S )zMGet an attr from an object, or throw a InvalidComponentError if it's not set.zMissing required attr z
 in config)r   r   r   )r  r  r   r   r   r     s   

r   attrsdefaultc                 C   s&   |D ]}t | |rt| |  S q|S )zdTry to get the first attr that exists in the object
    from a list of attrs. Otherwise return None.)r   r   )r  r  r  r  r   r   r   r     s
   
r   rN   )Dr   r  rH  abcr   r   collections.abcr   dataclassesr   r   typingr   r   r   pydanticr	   r
   r   r   typing_extensionsr   	vllm.envsrQ  vllm.configr   vllm.loggerr   vllm.utils.torch_utilsr   r   r   vllm.v1.core.sched.outputr   r   r   	Exceptionr   r   r)   r.   rG   rU   r[   re   r'   r(   rK   r%   rd   r   r   r   r   r   r   r   r   r   r   r  r%  r\  r  rO   r   r]   r   r   r   r   r   <module>   s`   D	K4 !  B 
KD 