o
    .i                     @   sr  U d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ ddlZddlmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddl m!Z! ee"Z#G dd de$Z%e
G dd dZ&e
G dd dZ'e
G dd dZ(G dd dZ)G dd deZ*G dd dZ+i Z,e-e.e/d f e0d< G dd deeZ1G dd de*Z2G d d! d!e*Z3G d"d# d#e*Z4G d$d% d%e1Z5G d&d' d'e*Z6G d(d) d)e*Z7G d*d+ d+e*Z8G d,d- d-e*Z9G d.d/ d/e*Z:G d0d1 d1e1Z;G d2d3 d3e1Z<G d4d5 d5Z=G d6d7 d7Z>G d8d9 d9Z?d:e@d;e.fd<d=ZAdBd:e@d>eBe. d?e@fd@dAZCdS )Cz
Analytic flops/memory estimation module for transformer components,
to help derive MFU (Model Flops Utilization) stats for a running model.
    N)ABCabstractmethod)Iterable)asdict	dataclass)AnyProtocol)	BaseModelFieldValidationErrormodel_validator)Self)
VllmConfig)init_logger)STR_DTYPE_TO_TORCH_DTYPEget_dtype_sizeget_kv_cache_torch_dtype)SchedulerOutputc                   @   s   e Zd ZdZdS )InvalidComponentzt
    Custom exception to indicate that a certain ComponentMetric is not
    applicable to the given VllmConfig.
    N)__name__
__module____qualname____doc__ r   r   Q/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/v1/metrics/perf.pyr   !   s    r   c                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZ	e
eef dB ed< dZe
eef dB ed< dZe
eef dB ed	< dZe
eef dB ed
< dS )DebugPerfStats        calc_durationr   num_prefill_requestsnum_decode_requestsNcontext_breakdownnum_flops_per_gpu_breakdown num_read_bytes_per_gpu_breakdown!num_write_bytes_per_gpu_breakdown)r   r   r   r   float__annotations__r   intr   r    dictstrr!   r"   r#   r   r   r   r   r   -   s   
 r   c                   @   sB   e Zd ZU dZeed< dZeed< dZeed< dZe	dB ed< dS )	PerfStatsr   num_flops_per_gpunum_read_bytes_per_gpunum_write_bytes_per_gpuNdebug_stats)
r   r   r   r*   r&   r%   r+   r,   r-   r   r   r   r   r   r)   9   s
   
 r)   c                	   @   s   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed< dZeed	< dZeed
< dedededdfddZdefddZdefddZededededd fddZdS )ExecutionContexta  
    Represents an execution context for a batch of requests.

    This class aggregates statistics across multiple requests in a batch,
    separately tracking prefill and decode phases.

    Example)
    - Batch with one full prefill (2048 tokens) and one decode (1 token, 8192 context):
      ctx = ExecutionContext()
      ctx.add(2048, 2048, is_prefill=True)
      ctx.add(1, 8192, is_prefill=False)
    r   r   prefill_num_tokensprefill_context_lenprefill_token_context_productr   decode_num_tokensdecode_context_lendecode_token_context_product
num_tokenscontext_len
is_prefillreturnNc                 C   s   |r"|  j d7  _ |  j|7  _|  j|7  _|  j|| 7  _dS |  jd7  _|  j|7  _|  j|7  _|  j|| 7  _dS )z8Add a single request's statistics to this batch context.   N)r   r/   r0   r1   r   r2   r3   r4   )selfr5   r6   r7   r   r   r   add\   s   zExecutionContext.addc                 C      | j | j S )z8Total number of tokens across all requests in the batch.)r/   r2   r:   r   r   r   total_num_tokensi      z!ExecutionContext.total_num_tokensc                 C   r<   )z<Total sum of (num_tokens * context_len) across all requests.)r1   r4   r=   r   r   r   total_token_context_productm   r?   z,ExecutionContext.total_token_context_productc                 C   s   |  }| ||| |S )zwCreate an ExecutionContext from a single request.

        This is a convenience method primarily for testing.
        )r;   )clsr5   r6   r7   ctxr   r   r   from_single_requestq   s   z$ExecutionContext.from_single_request)r   r   r   r   r   r&   r%   r/   r0   r1   r   r2   r3   r4   boolr;   r>   r@   classmethodrC   r   r   r   r   r.   A   s.   
 r.   c                   @   sN   e Zd ZdZdedefddZdededdfdd	Zdeeef fd
dZ	dS )
ParsedArgsz
    Syntactic sugar so that Parsers can use dot notations
    to access/update the parsed arguments.

    e.g.)
        args = ParsedArgs()
        args.x = 3
        args.y = args.x + 1
    namer8   c                 C   s   t dt| j d| d)N'z' has no attribute ')AttributeErrortyper   )r:   rG   r   r   r   __getattr__   s   zParsedArgs.__getattr__valueNc                 C   s   t | || d S N)object__setattr__)r:   rG   rL   r   r   r   rO      s   zParsedArgs.__setattr__c                 C   s   t |  S rM   )varscopyr=   r   r   r   
model_dump      zParsedArgs.model_dump)
r   r   r   r   r(   r   rK   rO   r'   rR   r   r   r   r   rF   ~   s
    
rF   c                   @   s"   e Zd ZdededefddZdS )Parserargsvllm_configr8   c                 C      dS )z
        Parse the vllm config and update the current ParsedArgs and pass it on.
        If the parser isn't applicable to the vllm_config, it will do nothing.
        Nr   r:   rU   rV   r   r   r   parse   s   zParser.parseN)r   r   r   rF   r   rY   r   r   r   r   rT      s    rT   c                   @   sF   e Zd ZdZdeddfddZdeddfdd	Zd
edefddZ	dS )ParserChainz
    Applies chain of parser in a sequential order.
    Later parsers might overwrite results from previous parsers,
    so parsers should be chained in the appropriate order if they
    are not mutually exclusive.
    parsersr8   Nc                 G   s   t || _d S rM   )listr[   )r:   r[   r   r   r   __init__      zParserChain.__init__parserc                 C   s   | j | d S rM   )r[   append)r:   r_   r   r   r   
add_parser      zParserChain.add_parserrV   c                 C   s"   t  }| jD ]}|||}q|S rM   )rF   r[   rY   )r:   rV   rU   r_   r   r   r   rY      s   
zParserChain.parse)
r   r   r   r   rT   r]   ra   r   rF   rY   r   r   r   r   rZ      s
    rZ   ComponentMetrics_COMPONENT_METRICS_REGISTRYc                
   @   s.  e Zd ZdZeedefddZeedefddZ	dd Z
ed	edefd
dZedeed   fddZe	ddededeeef fddZe	ddededeeef fddZe	ddededeeef fddZddededefddZddededefddZddededefddZdS )rc   a-  
    Each concrete ComponentMetrics class is associated with:
    - fields that are required for metric derivation
      (fields are specified/validated through pydantic model)
    - parser to parse VllmConfig into fields
    - metric methods that derive flops/bytes for a given execution context
    r8   c                 C      d S rM   r   rA   r   r   r   component_type      zComponentMetrics.component_typec                 C   rW   )a  
        Return a ParserChain that provides values for all required fields.
        The returned parser chain must populate ParsedArgs with values for every
        field defined on this ComponentMetrics class. Missing fields will cause
        a ValidationError when from_vllm_config() is called.
        See individual Parser docstrings for which args they provide, and field
        comments on ComponentMetrics subclasses for which parser provides each field.
        Nr   rf   r   r   r   
get_parser   s   zComponentMetrics.get_parserc                 C   s   | t |  < d S rM   )rd   rg   rf   r   r   r   __init_subclass__   rb   z"ComponentMetrics.__init_subclass__rV   c              
   C   sV   |   }||}z| | W S  ty* } ztd|   d| |d}~ww )zj
        Instantiate this class from VllmConfig.
        Raises ValidationError if parsing fails.
        zInvalid z	 config: N)ri   rY   model_validaterR   r   r   rg   )rA   rV   r_   parsed_argser   r   r   from_vllm_config   s   
z!ComponentMetrics.from_vllm_configc                 C   s   t t S rM   )iterrd   valuesrf   r   r   r   registered_metrics   r?   z#ComponentMetrics.registered_metricsTrB   per_gpuc                 C   re   rM   r   r:   rB   rr   r   r   r   get_num_flops_breakdown      z(ComponentMetrics.get_num_flops_breakdownc                 C   re   rM   r   rs   r   r   r   get_read_bytes_breakdown   ru   z)ComponentMetrics.get_read_bytes_breakdownc                 C   re   rM   r   rs   r   r   r   get_write_bytes_breakdown   ru   z*ComponentMetrics.get_write_bytes_breakdownc                 C      t | || S rM   )sumrt   rp   rs   r   r   r   get_num_flops      zComponentMetrics.get_num_flopsc                 C   rx   rM   )ry   rv   rp   rs   r   r   r   get_read_bytes   r{   zComponentMetrics.get_read_bytesc                 C   rx   rM   )ry   rw   rp   rs   r   r   r   get_write_bytes   r{   z ComponentMetrics.get_write_bytesNT)r   r   r   r   rE   r   r(   rg   rZ   ri   rj   r   r   rn   r   rJ   rq   r.   rD   r'   r&   rt   rv   rw   rz   r|   r}   r   r   r   r   rc      sV    


c                   @   &   e Zd ZdZdededefddZdS )BaseConfigParserz
    Parses base model configuration.
    Provides: vocab_size, hidden_size, num_attention_heads, num_hidden_layers,
    weight_byte_size, activation_byte_size, dp_size, tp_size, pp_size, enable_ep
    rU   rV   r8   c                 C   s   |j }| |_| |_t|jd|_t|jd|_|j j	}t
|tj	r(|}nt
|tr6|tv r6t| }n	td| tj}t||_d|_|jj|_|jj|_|jj|_|jj|_|S )Nnum_attention_headsnum_hidden_layersz.Unknown model_dtype %s, defaulting to bfloat16   )model_configget_vocab_size
vocab_sizeget_hidden_sizehidden_sizeget_requiredhf_text_configr   r   dtype
isinstancetorchr(   r   loggerwarningbfloat16r   weight_byte_sizeactivation_byte_sizeparallel_configdata_parallel_sizedp_sizetensor_parallel_sizetp_sizepipeline_parallel_sizepp_sizeenable_expert_parallel	enable_ep)r:   rU   rV   r   model_dtypetorch_dtyper   r   r   rY   	  s4   







zBaseConfigParser.parseNr   r   r   r   rF   r   rY   r   r   r   r   r         r   c                   @   r   )BaseAttentionConfigParserzo
    Parses attention-specific configuration.
    Provides: num_key_value_heads, head_dim, cache_byte_size
    rU   rV   r8   c                 C   sB   |j }| |_| |_|j j}|jj}t||}t	||_
|S rM   )r   get_total_num_kv_headsnum_key_value_headsget_head_sizehead_dimr   cache_configcache_dtyper   r   cache_byte_size)r:   rU   rV   r   r   r   kv_cache_torch_dtyper   r   r   rY   <  s   



zBaseAttentionConfigParser.parseNr   r   r   r   r   r   6      r   c                   @   r   )!AttentionQuantizationConfigParserza
    Parses quantization configuration for attention layers.
    Overrides: weight_byte_size
    rU   rV   r8   c                 C   sB   |j }|d u r	|S | }|dv rd|_|S |dkrd|_|S tN)fp8
fbgemm_fp8r9   mxfp4g      ?quant_configget_namer   r   r:   rU   rV   cfgquant_methodr   r   r   rY   Q  s   z'AttentionQuantizationConfigParser.parseNr   r   r   r   r   r   K  r   r   c                	   @   sd  e Zd ZU edddZeed< edddZeed< edddZeed< edddZ	eed< edddZ
eed< edddZeed	< edddZeed
< edddZeed< edddZeed< edddZeeB ed< edefddZedefddZ	ddededeeef fddZ	ddededeeef fddZ	ddededeeef fddZdS )AttentionMetrics.r   gtr   r   r   r   r   r   r   r   r   r   r8   c                 C   rW   )Nattnr   rf   r   r   r   rg   {  rh   zAttentionMetrics.component_typec                 C   s   t t t t S rM   )rZ   r   r   r   rf   r   r   r   ri     s
   zAttentionMetrics.get_parserTrB   rr   c           
      C   s   | j | j| j| j| jf\}}}}}| }| }	|r0|| j }td|| j	 }td|| j	 }d| | |d|   | | d| |	 | | d| |	 | | d| | | | | dS )Nr9   r   )qkv_projattn_qkattn_avout_proj)
r   r   r   r   r   r>   r@   r   maxr   )
r:   rB   rr   LDqkvdTTCr   r   r   rt     s"   
z(AttentionMetrics.get_num_flops_breakdownc           
      C   sV  | j | j| j| j| jf\}}}}}| }|r,|| j }td|| j }td|| j }i }	|| | j	 | |	d< t
||d|   | | j | |	d< |jdkrf|j| d|j |  | | j	 | |	d< |jdkr|	dd|j| | | j	 | d|j | | | j |   |	d< || | | j	 | |	d< t
|| | | j | |	d< |	S )	Nr9   	qkv_inputr   
qkv_weightr   
attn_input	out_input
out_weight)r   r   r   r   r   r>   r   r   r   r   r&   r   r/   r0   r2   getr3   r   )
r:   rB   rr   r   r   r   r   r   r   
read_bytesr   r   r   rv     s@   
&


z)AttentionMetrics.get_read_bytes_breakdownc           	      C   s   | j | j| j| j| jf\}}}}}| }|r,|| j }td|| j }td|| j }||d|   | | j	 | d| | | | j
 | || | j	 | dS )z4Calculate write memory traffic for attention layers.r9   r   )
qkv_outputkv_cache
out_output)r   r   r   r   r   r>   r   r   r   r   r   )	r:   rB   rr   r   r   r   r   r   r   r   r   r   rw     s   
z*AttentionMetrics.get_write_bytes_breakdownNr~   )r   r   r   r
   r   r&   r%   r   r   r   r   r   r   r   r   r   r$   rE   r(   rg   rZ   ri   r.   rD   r'   rt   rv   rw   r   r   r   r   r   g  sN   
 



.
r   c                   @   r   )BaseFfnConfigParserz
    Parses FFN and MoE configuration.
    Provides: intermediate_size, num_experts, num_experts_per_tok,
    moe_intermediate_size, num_shared_experts, num_moe_layers
    rU   rV   r8   c                 C   s   |j j}t|dr|jd ur|j}t|d|jd |_|j  |_t	|ddgd|_
t	|ddgd|_t	|dd	gd|_|jdk}|rI|j|_|S d|_|S )
Ntext_configintermediate_size   num_experts_per_tokmoe_topkr   moe_intermediate_sizen_shared_expertsnum_shared_experts)r   	hf_confighasattrr   getattrr   r   get_num_expertsnum_expertsgetattr_from_listr   r   r   r   num_moe_layers)r:   rU   rV   r   is_moer   r   r   rY     s&   



zBaseFfnConfigParser.parseNr   r   r   r   r   r     r   r   c                   @   r   )FfnParallelParserzW
    Parses FFN parallelism configuration.

    Provides: ffn_tp_size, ffn_ep_size
    rU   rV   r8   c                 C   s<   |j rd|j|j }}n	|j|j d}}||_||_|S )Nr9   )r   r   r   ffn_tp_sizeffn_ep_size)r:   rU   rV   r   r   r   r   r   rY     s   zFfnParallelParser.parseNr   r   r   r   r   r     r   r   c                   @   r   )InterleaveMoeLayerStepParserzg
    Parses interleave_moe_layer_step field for models like Llama4.

    Overrides: num_moe_layers
    rU   rV   r8   c                    sX   |j j t dr jd ur j t dr* jdkr*t fddt|jD |_|S )Nr   interleave_moe_layer_stepr   c                    s"   g | ]}|d   j  dkr|qS )r9   r   )r   .0layerr   r   r   
<listcomp>2  s
    z6InterleaveMoeLayerStepParser.parse.<locals>.<listcomp>)	r   r   r   r   r   lenranger   r   rX   r   r   r   rY   (  s   

z"InterleaveMoeLayerStepParser.parseNr   r   r   r   r   r   !  r   r   c                   @   r   )MoeLayerFreqParserzy
    Parses moe_layer_freq and first_k_dense_replace fields for models like Deepseek.

    Overrides: num_moe_layers
    rU   rV   r8   c                    sX   |j j t dr jd ur j t dr*t dr*t fddt|jD |_|S )Nr   moe_layer_freqfirst_k_dense_replacec                    s(   g | ]}| j kr| j d kr|qS )r   )r   r   r   r   r   r   r   J  s    
z,MoeLayerFreqParser.parse.<locals>.<listcomp>)r   r   r   r   r   r   r   r   rX   r   r   r   rY   C  s   
	zMoeLayerFreqParser.parseNr   r   r   r   r   r   <  r   r   c                   @   r   )FfnQuantizationConfigParserz\
    Parses quantization configuration for FFN layers.

    Overrides: weight_byte_size
    rU   rV   r8   c                 C   sD   |j }|d u r	|S | }|dv rd|_	 |S |dkr d|_|S tr   r   r   r   r   r   rY   \  s   z!FfnQuantizationConfigParser.parseNr   r   r   r   r   r   U  r   r   c                	   @   s  e Zd ZU edddZeed< edddZeed< edddZeed< edddZ	eed< edddZ
eed< edddZeed	< edddZeed
< edZeed< edZeed< edZeed< edZeed< edddZeed< edddZeeB ed< edddefddZedefddZedefddZ	d&dededeeef fdd Z	d&dededeeef fd!d"Z 	d&dededeeef fd#d$Z!d%S )'
FfnMetrics.r   r   r   r   r   r   r   r   r   r   r9   r   r   r   )ger   r   after)moder8   c                 C   sP   | j dkr&| jsJ d| j| jsJ d| j| js&J d| j| S )zJValidate that MoE-related fields are properly set when num_moe_layers > 0.r   zself.num_experts=zself.num_experts_per_tok=zself.moe_intermediate_size=)r   r   r   r   r=   r   r   r   validate_moe_fields  s
   
zFfnMetrics.validate_moe_fieldsc                 C   rW   )Nffnr   rf   r   r   r   rg     rh   zFfnMetrics.component_typec                 C   s   t t t t t t t S rM   )rZ   r   r   r   r   r   r   rf   r   r   r   ri     s   zFfnMetrics.get_parserTrB   rr   c                 C   s  | j | j| j}}}| j| j| j| jf\}}}}	| }
|| }|r'|
| nd}|rJ|| j }|| j }|| j	 }|durC|| j	 }|rJ|| j
 }i }|r\d| d | |
 | |d< |rn|rnd| d | | | |d< |r|	rd| d | |	 |
 | |d< |S )z)Calculate flops breakdown for FFN layers.r   Nr      	dense_ffn
routed_ffn
shared_ffn)r   r   r   r   r   r   r   r>   r   r   r   )r:   rB   rr   r   r   DILmEMISr   Ldnum_activated_tokensflopsr   r   r   rt     s2   




 z"FfnMetrics.get_num_flops_breakdownc                 C   s  | j | j| j}}}| j| j| j| jf\}}}}	| }
| j}|| }|r*|
| nd}|rV|| j	 }|| j	 }|| j
 }|durF|| j
 }|rM|| j }|durV|| j }i }|rt|
| | j | |d< td| | | j | |d< td|
 | | j | |d< t|
| | j | |d< t|| | j | |d< |r@|rt||}t|| | j | |d	< td| | | | j | |d
< td| | | j | |d< t|| | j | |d< t|| | | j | |d< |	r@t|
| | j | |d< td| | |	 | j | |d< td|
 | |	 | j | |d< t|
| | j | |d< t|| |	 | j | |d< |S )z-Calculate read memory traffic for FFN layers.r   Ndense_up_gate_inputr   dense_up_gate_weightsdense_silu_inputdense_down_inputdense_down_weightsrouted_up_gate_inputrouted_up_gate_weightsrouted_silu_inputrouted_down_inputrouted_down_weightsshared_up_gate_inputshared_up_gate_weightsshared_silu_inputshared_down_inputshared_down_weights)r   r   r   r   r   r   r   r>   r   r   r   r   r&   r   r   min)r:   rB   rr   r   r   r   r   r   r   r   r   r   r   r   r   num_activated_expertsr   r   r   rv     s   






z#FfnMetrics.get_read_bytes_breakdownc                 C   s  | j | j| j}}}| j| j| j| jf\}}}}	| }
|| }|r'|
| nd}|rJ|| j }|| j }|| j	 }|durC|| j	 }|rJ|| j
 }i }|rwtd|
 | | j | |d< t|
| | j | |d< t|
| | j | |d< |r|rtd| | | j | |d< t|| | j | |d< t|| | j | |d	< |	rtd|
 |	 | | j | |d
< t|
|	 | | j | |d< t|
|	 | | j | |d< |S )z.Calculate write memory traffic for FFN layers.r   Nr   dense_up_gate_outputdense_silu_outputdense_down_outputrouted_up_gate_outputrouted_silu_outputrouted_down_outputshared_up_gate_outputshared_silu_outputshared_down_output)r   r   r   r   r   r   r   r>   r   r   r   r&   r   )r:   rB   rr   r   r   r   r   r   r   r   r   r   r   write_bytesr   r   r   rw   /  sd   




z$FfnMetrics.get_write_bytes_breakdownNr~   )"r   r   r   r
   r   r&   r%   r   r   r   r   r   r   r   r   r   r   r   r   r$   r   r   r   rE   r(   rg   rZ   ri   r.   rD   r'   rt   rv   rw   r   r   r   r   r   t  sX   
 	

,

[
r   c                	   @   s   e Zd ZU edddZeed< edddZeed< edddZeed< edddZ	eed< eed< e
d	efd
dZe
d	efddZ	ddeded	eeef fddZ	ddeded	eeef fddZ	ddeded	eeef fddZdS )UnembedMetrics.r   r   r   r   r   r   r   r8   c                 C   rW   )Nunembedr   rf   r   r   r   rg   ~  rh   zUnembedMetrics.component_typec                 C   s
   t t S rM   )rZ   r   rf   r   r   r   ri     s   zUnembedMetrics.get_parserTrB   rr   c                 C   s8   | j | j}}| }|r|| j }dd| | | iS )z0Calculate flops breakdown for unembedding layer.r  r   )r   r   r>   r   r:   rB   rr   r   Vr   r   r   r   rt     s   
z&UnembedMetrics.get_num_flops_breakdownc                 C   sB   | j | j}}| }|r|| j }|| | j || | j dS )z4Calculate read memory traffic for unembedding layer.)inputweight)r   r   r>   r   r   r   r  r   r   r   rv     s   
z'UnembedMetrics.get_read_bytes_breakdownc                 C   s.   | j }| }|r|| j }d|| | j iS )z5Calculate write memory traffic for unembedding layer.output)r   r>   r   r   )r:   rB   rr   r   r   r   r   r   rw     s   
z(UnembedMetrics.get_write_bytes_breakdownNr~   )r   r   r   r
   r   r&   r%   r   r   r   rE   r(   rg   rZ   ri   r.   rD   r'   rt   rv   rw   r   r   r   r   r  u  sD   
 




r  c                	   @   s   e Zd ZdeddfddZdefddZdd	ed
edefddZ	dd	ed
edefddZ
dd	ed
edefddZ	dd	ed
edeeef fddZ	dd	ed
edeeef fddZ	dd	ed
edeeef fddZdedefddZdS )ModelMetricsrV   r8   Nc                 C   s   || _ g | _t D ]7}z||}| j| td| t	| W q
 t
yA } ztd| t	| W Y d}~q
d}~ww dS )z
        Parse vllm_config to instantiate metrics for each component.
        is_enabled() will return False if no component metrics could be instantiated.
        z,Instantiated ComponentMetrics [%s] with (%s)z Failed to instantiate %s from %sN)rV   metricsrc   rq   rn   r`   r   inforg   r(   r   debug)r:   rV   
metric_clsmetricrm   r   r   r   r]     s(   
zModelMetrics.__init__c                 C   s   t | jdkS Nr   )r   r%  r=   r   r   r   
is_enabled  r^   zModelMetrics.is_enabledTrB   rr   c                       t  fdd| jD S )Nc                 3       | ]	}|  V  qd S rM   )rz   r   r)  rB   rr   r   r   	<genexpr>      z-ModelMetrics.get_num_flops.<locals>.<genexpr>ry   r%  rs   r   r/  r   rz        zModelMetrics.get_num_flopsc                    r,  )Nc                 3   r-  rM   )r|   r.  r/  r   r   r0    r1  z.ModelMetrics.get_read_bytes.<locals>.<genexpr>r2  rs   r   r/  r   r|     r3  zModelMetrics.get_read_bytesc                    r,  )Nc                 3   r-  rM   )r}   r.  r/  r   r   r0    r1  z/ModelMetrics.get_write_bytes.<locals>.<genexpr>r2  rs   r   r/  r   r}     r3  zModelMetrics.get_write_bytesc                    H   i }| j D ]}|||}|   fdd| D }|| q|S )Nc                        i | ]\}}  d | |qS .r   r   keyval	componentr   r   
<dictcomp>       z8ModelMetrics.get_num_flops_breakdown.<locals>.<dictcomp>)r%  rt   rg   itemsupdater:   rB   rr   totalr)  	breakdownprefixedr   r;  r   rt        
z$ModelMetrics.get_num_flops_breakdownc                    r4  )Nc                    r5  r6  r   r8  r;  r   r   r=    r>  z9ModelMetrics.get_read_bytes_breakdown.<locals>.<dictcomp>)r%  rv   rg   r?  r@  rA  r   r;  r   rv     rE  z%ModelMetrics.get_read_bytes_breakdownc                    r4  )Nc                    r5  r6  r   r8  r;  r   r   r=    r>  z:ModelMetrics.get_write_bytes_breakdown.<locals>.<dictcomp>)r%  rw   rg   r?  r@  rA  r   r;  r   rw     rE  z&ModelMetrics.get_write_bytes_breakdownscheduler_outputc                 C   s$  t  }t }|jD ]}|j}|j|d}|dkrq
|j| }|j||dd q
|j	}t
|jD ]$\}	}|j|d}|dkrAq1|j|	 }
|
| }|dk}|||| q1| |d}| |d}| |d}tt| t| t| }tjrtt  | |j|jt|||||_|S )zV
        Calculate perf stats for the current step based on scheduled tokens.
        r   T)r7   r9   )time	monotonicr.   scheduled_new_reqsreq_idnum_scheduled_tokensr   num_computed_tokensr;   scheduled_cached_reqs	enumeratereq_idsrt   rv   rw   r)   ry   rp   envsVLLM_DEBUG_MFU_METRICSr   r   r   r   r-   )r:   rF  t0rB   new_reqrJ  r5   r6   cached_reqsirL  r7   num_flops_breakdownread_bytes_breakdownwrite_bytes_breakdown
perf_statsr   r   r   get_step_perf_stats_per_gpu  sJ   







z(ModelMetrics.get_step_perf_stats_per_gpur~   )r   r   r   r   r]   rD   r+  r.   r&   rz   r|   r}   r'   r(   rt   rv   rw   r   r)   rZ  r   r   r   r   r$    sF    





r$  c                   @   s@   e Zd Zdd Zdd Zdeddfdd	Zd
edefddZ	dS )PerfMetricsDebugLoggingc                 C   s   |    d S rM   )resetr=   r   r   r   r]   A  rS   z PerfMetricsDebugLogging.__init__c                 C   s4   d| _ d| _d| _d| _i | _i | _i | _i | _d S )Nr   r   )total_calc_durationtotal_num_prefill_requeststotal_num_decode_requeststotal_num_batchestotal_context_breakdown!total_num_flops_per_gpu_breakdown"total_read_bytes_per_gpu_breakdown#total_write_bytes_per_gpu_breakdownr=   r   r   r   r\  D  s   
zPerfMetricsDebugLogging.resetr-   r8   Nc                 C   s   |  j |j7  _ |  j|j7  _|  j|j7  _|  jd7  _t| j| j	| j
| jg|j|j|j|jgD ]\}}t|ts?J | D ]\}}||d| ||< qCq4d S )Nr9   r   )r]  r   r^  r   r_  r   r`  zipra  rb  rc  rd  r    r!   r"   r#   r   r'   r?  r   )r:   r-   dstsrcr9  r:  r   r   r   observeN  s*   zPerfMetricsDebugLogging.observe
log_prefix
delta_timec                 C   s   dd | j  D }dd | j D }dd | j D }td|tj| j| j	| j
| j||||dd| j| dd		d
d d S )Nc                 S   "   i | ]\}}||d  ddqS )   mB.1fTFr   r   kvr   r   r   r=  h      z/PerfMetricsDebugLogging.log.<locals>.<dictcomp>c                 S   rk      eArm  GBr   ro  r   r   r   r=  l  rr  c                 S   rk  rs  r   ro  r   r   r   r=  p  rr  z%sMFU details: %srm  sz.1%)	prefill_reqsdecode_reqsnum_batchesr    flops_breakdownnum_read_bytes_breakdownnum_write_bytes_breakdowndurationmfu_calc_overheadr   )indent)rb  r?  rc  rd  r   r'  jsondumpsr^  r_  r`  ra  r]  )r:   log_fnri  rj  rb  rc  rd  r   r   r   logf  s4   
zPerfMetricsDebugLogging.log)
r   r   r   r]   r\  r   rh  r(   r$   r  r   r   r   r   r[  @  s
    
r[  c                   @   sN   e Zd ZdefddZdd Zdeddfd	d
Zej	dfde
ddfddZdS )PerfMetricsLoggingrV   c                 C   s0   || _ |jj| _d | _tjrt | _|   d S rM   )	rV   r   r   r   debug_loggingrP  rQ  r[  r\  )r:   rV   r   r   r   r]     s   
zPerfMetricsLogging.__init__c                 C   s4   t  | _d| _d| _d| _| jr| j  d S d S r*  )rG  rH  last_log_timetotal_num_flops_per_gputotal_read_bytes_per_gputotal_write_bytes_per_gpur  r\  r=   r   r   r   r\    s   
zPerfMetricsLogging.resetrY  r8   Nc                 C   sZ   |  j |j7  _ |  j|j7  _|  j|j7  _| jr+|jd us"J | j|j d S d S rM   )	r  r*   r  r+   r  r,   r  r-   rh  )r:   rY  r   r   r   rh    s   zPerfMetricsLogging.observe ri  c                 C   s   | j s| js| jsd S t }|| j }|dkrd}d}n| j | d }| j| j | d }|d||| | jr@| j||| |   d S )Nr   rl  rt  z"%sMFU: %.1f TF/s/GPU %.1f GB/s/GPU)	r  r  r  rG  rH  r  r  r  r\  )r:   r  ri  nowrj  avg_tflops_per_gpuavg_gbps_per_gpur   r   r   r    s8   

zPerfMetricsLogging.log)r   r   r   r   r]   r\  r)   rh  r   r&  r(   r  r   r   r   r   r    s
    

	r  objattrc                 C   s$   t | |std| dt| |S )zMGet an attr from an object, or throw a InvalidComponentError if it's not set.zMissing required attr z
 in config)r   r   r   )r  r  r   r   r   r     s   

r   attrsdefaultc                 C   s&   |D ]}t | |rt| |  S q|S )zdTry to get the first attr that exists in the object
    from a list of attrs. Otherwise return None.)r   r   )r  r  r  r  r   r   r   r     s
   
r   rM   )Dr   r  rG  abcr   r   collections.abcr   dataclassesr   r   typingr   r   r   pydanticr	   r
   r   r   typing_extensionsr   	vllm.envsrP  vllm.configr   vllm.loggerr   vllm.utils.torch_utilsr   r   r   vllm.v1.core.sched.outputr   r   r   	Exceptionr   r   r)   r.   rF   rT   rZ   rd   r'   r(   rJ   r%   rc   r   r   r   r   r   r   r   r   r   r   r  r$  r[  r  rN   r   r\   r   r   r   r   r   <module>   s`   <	K4 !  B 
KD 