o
    
۾i                     @   s   d Z ddlZddlZddlZddlZddlZddlmZ ddlZ	ddl
m
Z
 ddlmZmZ ddlmZ ddlmZ ddlmZ d	ejd
eeef ddfddZdejfddZd	ejfddZdS )z?Benchmark the latency of processing a single batch of requests.    N)Any)tqdm)#convert_to_pytorch_benchmark_formatwrite_to_json)
EngineArgs)
PromptType)BeamSearchParamsargsresultsreturnc                    sR   t | d d i fdddD d}|r'tj| jd  d}t|| d S d S )	Nlatency	latenciesc                    s   i | ]}| | qS  r   ).0kr
   r   K/home/ubuntu/.local/lib/python3.10/site-packages/vllm/benchmarks/latency.py
<dictcomp>   s    z4save_to_pytorch_benchmark_format.<locals>.<dictcomp>)avg_latencypercentiles)r	   metrics
extra_infor   z.pytorch.json)r   ospathsplitextoutput_jsonr   )r	   r
   
pt_recordspt_filer   r   r    save_to_pytorch_benchmark_format   s   
r   parserc                 C   s   | j dtdd | j dtdd | j dtdd | j dtd	d
d | j ddd | j dtddd | j dtddd | j dddd | j dtd dd | j dddd t| } | jdd d S )Nz--input-len    )typedefaultz--output-len   z--batch-size   z--n   z)Number of generated sequences per prompt.)r!   r"   helpz--use-beam-search
store_true)actionz--num-iters-warmup
   z'Number of iterations to run for warmup.z--num-iters   zNumber of iterations to run.z	--profilez0profile the generation process of a single batch)r(   r&   z--output-jsonz0Path to save the latency results in JSON format.z--disable-detokenizez`Do not detokenize responses (i.e. do not include detokenization time in the latency measurement)F)enable_prefix_caching)add_argumentintstrr   add_cli_argsset_defaults)r   r   r   r   r/   "   sJ   
	r/   c                    s2  t  }ddlm}m} |d#i t|jjj	 j
 j ks&J d| jddd j j dtjjd j j
fd}d	d
 | D  fddd$dtffdd}td tt jddD ]}|dd qj jr|j}|jdkrtd|j d n	|jdkrtd |dd d S g }tt jddD ]
}||dd qt|}g d}	t||	}
tdt | d t!|	|
D ]\}}t| d| d q̈ j"rt || t#t!|	|
 d}t$ j"d }t%j&||d!d" W d    n	1 sw   Y  t' | d S d S )%Nr   )LLMSamplingParamszUPlease ensure that max_model_len is greater than the sum of input_len and output_len.g      ?T)ntemperaturetop_p
ignore_eos
max_tokens
detokenizei'  )sizec                 S   s   g | ]}d |iqS )prompt_token_idsr   )r   batchr   r   r   
<listcomp>k   s    zmain.<locals>.<listcomp>c                      s8    j sjdd d S t j jdd d S )NF)sampling_paramsuse_tqdmT)
beam_widthr7   r6   )use_beam_searchgeneratebeam_searchr   r3   
output_lenr   )r	   dummy_promptsllmr=   r   r   llm_generateo   s   zmain.<locals>.llm_generateF
do_profilec                    s@   | r         d S t }  t }|| }|S )N)start_profilestop_profiletimeperf_counter)rG   
start_timeend_timer   )rE   rF   r   r   run_to_completion|   s   zmain.<locals>.run_to_completionzWarming up...zWarmup iterations)desc)rG   torchz8Profiling with torch profiler (results will be saved to z)...cudaz Profiling with cuda profiler ...zBench iterations)r)      2   K   Z   c   zAvg latency: z secondsz% percentile latency: )r   r   r   w   )indentr   )F)(r   from_cli_argsvllmr1   r2   dataclassesasdict
llm_enginemodel_configmax_model_len	input_lenrC   r3   disable_detokenizenprandomrandint
batch_sizetolistboolprintr   rangenum_iters_warmupprofileprofiler_configprofilertorch_profiler_dir	num_itersappendarray
percentilemeanzipr   dictopenjsondumpr   )r	   engine_argsr1   r2   dummy_prompt_token_idsrN   _rm   r   percentagesr   
percentagers   r
   fr   )r	   rD   rE   rF   r=   r   mainP   st   





r   )__doc__argparser\   rx   r   rJ   typingr   numpyrc   r   vllm.benchmarks.lib.utilsr   r   vllm.engine.arg_utilsr   vllm.inputsr   vllm.sampling_paramsr   	Namespacerv   r.   r   ArgumentParserr/   r   r   r   r   r   <module>   s,   

.