o
    -i                     @   sV  d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	 ddl
Z
ddlZddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% dd	l&m'Z'm(Z( dd
l)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3 	d2de4e de5de%de6de6de7e8e4e, dB f fddZ9	d2de4e de5de%de6de6de7e8e4e, f fddZ:		d3de4e de5de$de6de6de6de8fddZ;	d2de4e de<de0de5de5d e6de6de8fd!d"Z=d#ej>d$e?e<e	f ddfd%d&Z@d'd( ZAd)d* ZBd+d, ZCd-ejDfd.d/ZEd#ej>fd0d1ZFdS )4z'Benchmark offline inference throughput.    N)Any)tqdm)AutoModelForCausalLMPreTrainedTokenizerBase)AIMODatasetBurstGPTDatasetConversationDatasetInstructCoderDatasetMultiModalConversationDatasetPrefixRepetitionRandomDatasetRandomDatasetRandomDatasetForRerankingRandomMultiModalDatasetSampleRequestShareGPTDatasetSonnetDatasetVisionArenaDatasetadd_random_dataset_base_args"add_random_multimodal_dataset_args)#convert_to_pytorch_benchmark_formatwrite_to_json)AsyncEngineArgs
EngineArgs)
TextPromptTokensPrompt)LoRARequest)RequestOutput)BeamSearchParams)TokenizerLikeget_tokenizer)merge_async_iteratorsFrequestsnengine_args
do_profiledisable_detokenizereturnc                    s  ddl m}m} |di t| t fdd| D s!J dg }g }| D ]9}	d|	jv r6t|	jd dnt|	jd}
|	j	rLt
|	j	tsGJ |	j	|
d	< ||
 |||d
d
d|	j| d q'd }|jrmdd | D }d}d }|st }|r}    j|||dd}|r   t }n?|d u sJ ddd | D }| d j}| D ]	}	|	j|ksJ qt }|r    |t||dd |r̈   t }|| |fS )Nr   LLMSamplingParamsc                 3   &    | ]} j jj|j|j kV  qd S N
llm_enginemodel_configmax_model_len
prompt_lenexpected_output_len.0requestllm W/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/benchmarks/throughput.py	<genexpr>8       

zrun_vllm.<locals>.<genexpr>pPlease ensure that max_model_len is greater than the sum of prompt_len and expected_output_len for all requests.prompt_token_idsr<   promptmulti_modal_data      ?Tr"   temperaturetop_p
ignore_eos
max_tokens
detokenizec                 S      g | ]}|j qS r7   )lora_requestr2   r7   r7   r8   
<listcomp>Z       zrun_vllm.<locals>.<listcomp>F)rI   use_tqdmz$BeamSearch API does not support LoRAc                 S   rH   r7   r>   r2   r7   r7   r8   rJ   k   rK   )
beam_widthrF   rE   r7   )vllmr(   r)   dataclassesasdictallr?   r   r   r@   
isinstancedictappendr1   enable_loratimeperf_counterstart_profilegeneratestop_profilebeam_searchr   )r!   r"   r#   r$   r%   r(   r)   promptssampling_paramsr4   r?   lora_requestsuse_beam_searchoutputsstartend
output_lenr7   r5   r8   run_vllm.   s~   
	






rd   c                    s   ddl m}m} |di t| t fdd| D s!J dg }g }| D ]}	||	j |||ddd|	j| d q't	
 }
|rI    j||dd	}|rW   t	
 }||
 |fS )z
    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
    multimodal models as it properly handles multimodal inputs and chat
    formatting. For non-multimodal models, use run_vllm() instead.
    r   r'   c                 3   r*   r+   r,   r2   r5   r7   r8   r9      r:   z run_vllm_chat.<locals>.<genexpr>r;   rA   TrB   )rL   Nr7   )rN   r(   r)   rO   rP   rQ   rT   r?   r1   rV   rW   rX   chatrZ   )r!   r"   r#   r$   r%   r(   r)   r\   r]   r4   ra   r`   rb   r7   r5   r8   run_vllm_chat   s:   
	
rf    disable_frontend_multiprocessingc                    s  ddl m} ddlm} |||d4 I d H }|j t fdd| D s*J dg }	g }
g }| D ]?}d|jv rAt|jd d	nt|jd
}|j	rWt
|j	tsRJ |j	|d< |
||ddd|j| d |	| ||j q2g }t }|r| I d H  tt|	|
|D ]\}\}}}|j|||d| d}|| qt| }|2 z3 d H W \}}q6 |r| I d H  t }|| W  d   I d H  S 1 I d H sw   Y  d S )Nr   )r)   )*build_async_engine_client_from_engine_args)rg   c                 3   s"    | ]} j |j|j kV  qd S r+   )r/   r0   r1   r2   r.   r7   r8   r9      s    

z!run_vllm_async.<locals>.<genexpr>r;   r<   r=   r>   r@   rA   TrB   test)rI   
request_id)rN   r)   "vllm.entrypoints.openai.api_serverrh   r.   rQ   r?   r   r   r@   rR   rS   rT   r1   rI   rV   rW   rX   	enumerateziprY   r    rZ   )r!   r"   r#   r$   rg   r%   r)   rh   r6   r\   r]   r^   r4   r?   
generatorsra   isplr	generatorall_gensresrb   r7   ri   r8   run_vllm_async   sn   







0rv   model	tokenizermax_batch_sizetrust_remote_codec              
   C   sj  t |ts	J dtj|tj|d}|jjdkr|j|_	|
 }tt| d}t }	g }
d}d}tt| D ]u}| | j}| | j}| | j}|
| t||}t||}t|
|k r|t| d kr| |d  j}| |d  j}t||t|| dkrq7||
dd	d
j}|j|
 d	|ddd	|d}|s|j|d	d |t|
 g }
d}d}q7t }||	 S )Nz*the hf backend only supports HF tokenizers)dtyperz   llama)totalr      i   ptT)return_tensorspaddingrA   )	input_ids	do_samplenum_return_sequencesrC   rD   	use_cachemax_new_tokens)skip_special_tokens)rR   r   r   from_pretrainedtorchfloat16config
model_type	eos_token	pad_tokencudar   lenrV   rW   ranger?   r0   r1   rT   maxr   rY   batch_decodeupdate)r!   rw   rx   r"   ry   rz   r%   r6   pbarra   batchmax_prompt_lenmax_output_lenrp   r?   r0   rc   next_prompt_lennext_output_lenr   llm_outputsrb   r7   r7   r8   run_hf   s`   	





	r   argsresultsc                    s\   t |  d g d gd fdddD d}|r,tj| jd  d	}t|| d S d S )
Nrequests_per_secondtokens_per_second)r   r   c                    s   i | ]}| | qS r7   r7   )r3   kr   r7   r8   
<dictcomp>H  s    
z4save_to_pytorch_benchmark_format.<locals>.<dictcomp>)elapsed_timenum_requeststotal_num_tokens)r   metrics
extra_infor   z.pytorch.json)r   ospathsplitextoutput_jsonr   )r   r   
pt_recordspt_filer7   r   r8    save_to_pytorch_benchmark_format?  s   

r   c           
      C   sL  | j | jd}|| j| j| jd}| jdks | j d u r\| jdvr\| j|d< t| dd }|d ur1|n| j|d< t| dd }|d urB|n| j	|d	< t| d
d }|d urS|n| j
|d< t}n| jdkrxt}| jdkrld|d< | j
d urv| j
|d< n| jdkr|js|jsJ dt}| j|d< d|d< | j	d ur| j	|d	< | j
d ur| j
|d< n`| jdkrt}nW| jdkr'| j
d ur| j
|d< | j tjv rt}d |d< d|d< d|d< n1| j tjv rt}d|d< n#| j tjv rt}| j|d< | j|d< d|d< n| j tjv rt}| j|d< | j|d< d|d< n| j tjv r&t}d |d< d|d< n| jdkrDt}| j|d< | j|d< | j|d< | j|d< n| jdkrt}t| dd }|d urY|nt| d	d |d	< t| d
d }|d urn|nt| dd |d< t| dd |d< t| dd |d< t| d d |d!< t| d"d |d#< d|d< t| dd }t| dd }|d ur|n||d< | j|d< nQ| jd$kr t }t| dd }|d ur|nt| d	d |d	< t| d
d }|d ur|nt| dd |d< t| d%d&|d'< t| d(d) |d*< | j|d< nt!d+| j d,d- |" D }|d.i |j#d.i |}	t$|	| j%}	|	S )/N)dataset_pathrandom_seed)rx   	lora_path	max_lorasr   random>   prefix_repetition	random-mmrandom-rerankrange_ratiorandom_prefix_len
prefix_lenrandom_input_len	input_lenrandom_output_lenrc   sharegpt	vllm-chatTenable_multimodal_chatsonnetz;Tokenizer/model must have chat template for sonnet dataset.return_prompt_formattedburstgpthfdataset_subsettraindataset_splitr   
suffix_lennum_prefixesr    random_mm_base_items_per_requestbase_items_per_request"random_mm_num_mm_items_range_rationum_mm_items_range_ratiorandom_mm_limit_mm_per_promptlimit_mm_per_promptrandom_mm_bucket_configbucket_configr   random_batch_sizer~   	batchsizeno_rerankerFis_rerankerzUnknown dataset name: c                 S   s   i | ]\}}|d ur||qS r+   r7   )r3   r   vr7   r7   r8   r     s    z get_requests.<locals>.<dictcomp>r7   )&r   seedr   r   num_promptsdataset_namerandom_range_ratiogetattrr   r   rc   r   r   backendchat_templatedefault_chat_templater   r   r   SUPPORTED_DATASET_PATHSr	   r
   	hf_subsethf_splitr   r   r   prefix_repetition_prefix_lenprefix_repetition_suffix_lenprefix_repetition_num_prefixesprefix_repetition_output_lenr   r   
ValueErroritemssamplefilter_requests_for_dpdata_parallel_size)
r   rx   common_kwargssample_kwargsr   r   r   dataset_clsr   r!   r7   r7   r8   get_requestsR  s   
































r   c                    sL   dkr| S t tjd }t tjd }||    fddt| D S )Nr~   RANK
WORLD_SIZEc                    s    g | ]\}}|  kr|qS r7   r7   )r3   rp   rdata_parallel_rankr   r7   r8   rJ     s
    z*filter_requests_for_dp.<locals>.<listcomp>)intr   environrm   )r!   r   global_rank
world_sizer7   r   r8   r     s   r   c                 C   s  | j durtjddd | j | _t| dds| j| _h d}| j|vr+td| j | j sP| jsP| j	dvrPt
d	 d
| _	t| dd}| jdu rP|du rPtd| j	dkrmt| ddduset| dddurmtjddd n@| j	dkr| jtj tjB tjB v r| jdksJ | j dn| jtjtjB v r| jdksJ | j dnt| j d| j	dvr| jdurtjddd | j	dkrt| dddur| jdkrtjddd | j	dkrt| ddrtjddd | j	dvr| jdurtjd dd | j	dv rHt| dd}t| d!d}t| d"d}| jdur$|dur$tjd#dd | jdur6|dur6tjd$dd | jdurH|durHtjd%dd t| d&drY| jdkrYtd't| d&drj| jdu rjtd(| jdkrz| jdu rztd)| jdkr| jdurtd*| jd+v rt| d,ddurtd-| jd.kr| jd/krtd0| jd.kr| jdkrtd1| jd.kr| j| jkrtd2| jdkr| jd3ks| jrtd4dS dS )5z*
    Validate command-line arguments.
    NzzThe '--dataset' argument will be deprecated in the next release. Please use '--dataset-name' and '--dataset-path' instead.   )
stacklevelrx   >   r   miirN   r   zUnsupported backend: >   r   z?When dataset path is not set, it will default to random datasetr   r   zNEither --input-len or --random-input-len must be provided for a random datasetr   r   r   z\--hf-subset and --hf-split will be ignored                 since --dataset-name is not 'hf'.r   z' needs to use vllm-chat as the backend.rN   z" needs to use vllm as the backend.z  is not supported by hf dataset.>   r   r   r   z{--random-range-ratio will be ignored since                 --dataset-name is not 'random', 'random-mm', or 'random-rerank'.r   r   r~   zd--random-batch-size will be ignored since                     --dataset-name is not 'random-rerank'.r   FzZ--no-reranker will be ignored since                 --dataset-name is not 'random-rerank'.>   Nr   r   r   zu--prefix-len will be ignored since --dataset-name                 is not 'random', 'random-mm', 'sonnet', or not set.r   r   z}Both --input-len and --random-input-len are specified. The random version (--random-input-len) will be preferred in this run.zBoth --output-len and --random-output-len are specified. The random version (--random-output-len) will be preferred in this run.zBoth --prefix-len and --random-prefix-len are specified. The random version (--random-prefix-len) will be preferred in this run.rU   z4LoRA benchmarking is only supported for vLLM backendz3LoRA path must be provided when enable_lora is Truez,HF max batch size is required for HF backendz)HF max batch size is only for HF backend.>   r   r   quantizationz&Quantization is only for vLLM backend.r   autoz#dtype must be auto for MII backend.zn must be 1 for MII backend.z8Tokenizer must be the same as the model for MII backend.external_launcherzData parallel is only supported with external launcher mode with synchronous engine in offline benchmark, please use benchmark serving instead)datasetwarningswarnr   r   rw   rx   r   r   r   printr   r   r   keysr
   r   r	   r   r   r   r   rc   r   hf_max_batch_sizer{   r"   r   distributed_executor_backendasync_engine)r   valid_backendsr   r   r   r7   r7   r8   validate_args  s   











	
	r  parserc                 C   s  | j dtg ddd | j dtg dddd	 | j d
td dd | j dtd dd | j dtd dd | j dtd dd | j dtddd | j dtddd | j dtd dd | j dtd dd | j dddd d! | j d"ddd#d! | j d$dd%d& | j d'td d(d | j d)td*d+d | j d,td d-d | j d.td d/d | j d0ddd1d! | j d2td d3d | j d4td d5d | j d6td d7d | j d8td d9d t|  t|  t| } d S ):Nz	--backend)rN   r   r   r   rN   )typechoicesdefaultz--dataset-name)r   r   r   r   r   r   r   r   z$Name of the dataset to benchmark on.r   )r	  r
  helpr  z	--datasetzPath to the ShareGPT dataset, will be deprecated in            the next release. The dataset is expected to be a json in form of list[dict[..., conversations: list[dict[..., value: <prompt_or_response>]]]])r	  r  r  z--dataset-pathzPath to the datasetz--input-lenz$Input prompt length for each requestz--output-lenzMOutput length for each request. Overrides the output length from the dataset.z--nr~   z)Number of generated sequences per prompt.z--num-promptsi  zNumber of prompts to process.z--hf-max-batch-sizez"Maximum batch size for HF backend.z--output-jsonz3Path to save the throughput results in JSON format.z--async-engine
store_trueFz,Use vLLM async engine rather than LLM class.)actionr  r  z"--disable-frontend-multiprocessingz(Disable decoupled async engine frontend.z--disable-detokenizez[Do not detokenize the response (i.e. do not include detokenization time in the measurement))r  r  z--lora-pathztPath to the lora adapters to use. This can be an absolute path, a relative path, or a Hugging Face model identifier.z--prefix-lenr   zRNumber of fixed prefix tokens before the random context in a request (default: 0).z--hf-subsetzSubset of the HF dataset.z
--hf-splitzSplit of the HF dataset.z	--profilezEUse vLLM Profiling. --profiler-config must be provided on the server.z--prefix-repetition-prefix-lenzMNumber of prefix tokens per request, used only for prefix repetition dataset.z--prefix-repetition-suffix-lenz|Number of suffix tokens per request, used only for prefix repetition dataset. Total input length is prefix_len + suffix_len.z --prefix-repetition-num-prefixesz|Number of prefixes to generate, used only for prefix repetition dataset. Prompts per prefix is num_requests // num_prefixes.z--prefix-repetition-output-lenzMNumber of output tokens per request, used only for prefix repetition dataset.)add_argumentstrr   r   r   r   add_cli_args)r  r7   r7   r8   r    s   
				r  c              
   C   s  t |  | jd u rd| _t| j | jdks| jdkr$| jdkr$d| _t| j| j| jd}t| |}t	dd |D }d }| jdkrm| j
r[tt|| jt| | j| j| jd	}nTt|| jt| | j| jd
\}}nB| jdkr| jdksyJ | jrtdt|| j|| j| j| j| j}n| jdkrt|| jt| | j| jd
\}}ntd| j |rd}d}|D ] }t|tsq||jrt |jnd7 }|t!dd |j"D 7 }q|| }	nt!dd |D }	t!dd |D }|	| }|r| jdkrt#d| j d t#dt || dd|	| dd|| dd t#d|  t#d|  | j$rf|t ||	t || |	| d}
t%| j$d}t&j'|
|dd W d    n	1 sZw   Y  t(| |
 d S d S )Nr   r   r   r   )tokenizer_moderz   c                 s   s    | ]}|j d uV  qd S r+   )r@   r2   r7   r7   r8   r9   U  s    zmain.<locals>.<genexpr>rN   )rg   r%   r$   )r%   r$   r~   z/Profiling not implemented yet for backend='hf'.r   zUnknown backend: c                 s   s    | ]
}|rt |jV  qd S r+   )r   	token_ids)r3   or7   r7   r8   r9     s    c                 s   s    | ]	}|j |j V  qd S r+   )r0   r1   r3   r   r7   r7   r8   r9     s    c                 s   s    | ]}|j V  qd S r+   )r1   r  r7   r7   r8   r9     s    z+[91mWARNING[0m: Multi-modal request with z backend detected. The following metrics are not accurate because image tokens are not counted. See vllm-project/vllm/issues/9778 for details.zThroughput: z.2fz requests/s, z total tokens/s, z output tokens/szTotal num prompt tokens:  zTotal num output tokens:  )r   r   r   r   r   w   )indent))r  r   r   r   r  r   rx   rz   r   anyr  uvlooprunrv   r"   r   from_cli_argsrg   r%   profilerd   r   tensor_parallel_sizeNotImplementedErrorr   rw   r  rf   r   rR   r   r<   r   sumr`   r  r   openjsondumpr   )r   rx   r!   is_multi_modalrequest_outputsr   total_prompt_tokenstotal_output_tokensror   r   fr7   r7   r8   mainC  s   





	

	
r*  )F)FF)G__doc__argparserO   r"  r   r   rV   r   typingr   r   r  r   transformersr   r   vllm.benchmarks.datasetsr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   vllm.benchmarks.lib.utilsr   r   vllm.engine.arg_utilsr   r   vllm.inputsr   r   vllm.lora.requestr   vllm.outputsr   vllm.sampling_paramsr   vllm.tokenizersr   r   vllm.utils.async_utilsr    listr   booltuplefloatrd   rf   rv   r  r   	NamespacerS   r   r   r   r  ArgumentParserr  r*  r7   r7   r7   r8   <module>   s   D
X
6
P
D

  5  