o
    پic=                     @   s,  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
mZmZmZ ddlZddlmZmZmZmZmZ ddlmZ ddlmZ ddlmZ ejG dd	 d	Z	
	d'dedee dedededede fddZ!d(ddZ"dedefddZ#e$dkre% Z&e'e& e'e& e&( Z)e*dd+ dv rej,-e)j.re/d e)j.  n6zdd!l0m1Z1 e/d"e)j.  e1e)j.e)_.e/d#e)j.  W n e2y Z3 ze/d$ee3  e3dZ3[3ww e4e)Z5e4e)Z6ej7e8ee5j9: d%d& e#e5e6 e6j;r	 e6j;sdS dS dS ))a   
Benchmark the throughput in the offline mode.
It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).

# Usage
## Sharegpt dataset with default args
python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10

## Random dataset with default args
python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024
    N)DictListOptional)
DatasetRowget_datasetget_tokenizersample_random_requests
set_ulimit)Runtime)Engine)
ServerArgsc                   @   sn  e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	e
ed	< d
Zee
 ed< d
Zee
 ed< dZe
ed< dZe
ed< dZeed< dZe
ed< dZe
ed< dZe
ed< dZe
ed< dZe
ed< dZe
ed< dZeed< d
Zee ed < dZeed!< dZeed"< dZeed#< dZeed$< dZeed%< dZ eed&< d'Z!e
ed(< e"d)e#j$fd*d+Z%e&d,e#j'fd-d.Z(d
S )/	BenchArgsenginebackend result_filenamesharegptdataset_namedataset_pathi  num_promptsNsharegpt_output_lensharegpt_context_leni   random_input_lenrandom_output_leng        random_range_ratio@   gsp_num_groups   gsp_prompts_per_groupi   gsp_system_prompt_len   gsp_question_len   gsp_output_len   seedFdisable_ignore_eosextra_request_bodyapply_chat_templateprofileskip_warmupdo_not_exitprompt_suffixreturn_logproblogprob_start_lenparserc                 C   s  | j dttjd | j dttjd | j dtdg ddd | j d	td
dd | j dttjdd | j dttjdd | j dttjdd | j dttj	dd | j dttj
dd | j dttjdd | j dttjdd | j dttjdd | j dttjdd | j dttjd d | j d!ttjd"d | j d#td$d%d | j d&d'd(d) | j d*d+ttjd,d- | j d.d'd/d) | j d0d'd1d) | j d2d'd3d) | j d4d'd5d) | j d6td
d7d | j d8d'd9d) | j d:td;d<d d S )=Nz	--backend)typedefaultz--result-filenamez--dataset-namer   )r   randomzgenerated-shared-prefixz$Name of the dataset to benchmark on.)r1   r2   choiceshelpz--dataset-pathr   zPath to the dataset.)r1   r2   r5   z--num-promptsz.Number of prompts to process. Default is 1000.z--sharegpt-output-lenzVOutput length for each request. Overrides the output length from the ShareGPT dataset.z--sharegpt-context-lenzrThe context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.z--random-input-lenzANumber of input tokens per request, used only for random dataset.z--random-output-lenzBNumber of output tokens per request, used only for random dataset.z--random-range-ratiozLRange of sampled ratio of input/output length, used only for random dataset.z--gsp-num-groupszHNumber of groups with shared prefix, usedonly for generate-shared-prefixz--gsp-prompts-per-groupzQNumber of prompts per group of shared prefix, usedonly for generate-shared-prefixz--gsp-system-prompt-lenz9System prompt length, usedonly for generate-shared-prefixz--gsp-question-lenz4Question length, usedonly for generate-shared-prefixz--gsp-output-lenzFTarget length in tokens for outputs in generated-shared-prefix datasetz--seedr$   zThe random seed.z--disable-ignore-eos
store_truezDisable ignore EOS token)actionr5   z--extra-request-bodyz${"key1": "value1", "key2": "value2"}z|Append given JSON object to the request payload. You can use this to specifyadditional generate params like sampling params.)metavarr1   r2   r5   z--apply-chat-templatezApply chat templatez	--profilezdUse Torch Profiler. The endpoint must be launched with SGLANG_TORCH_PROFILER_DIR to enable profiler.z--skip-warmupzSkip the warmup batches.z--do-not-exitzUDo not exit the program. This is useful for nsys profile with --duration and --delay.z--prompt-suffixzSSuffix applied to the end of all user prompts, followed by assistant prompt suffix.z--return-logprobz#Enable returning log probabilities.z--logprob-start-lenr.   zStart length for logprob. -1 means only return logprobs for output tokens (default). 0 means return logprobs for all tokens including input.)add_argumentstrr   r   r   intr   r   r   r   r   floatr   r   r   r   r!   r#   r'   )r0    r=   S/home/ubuntu/.local/lib/python3.10/site-packages/sglang/bench_offline_throughput.pyadd_cli_argsB   s  
zBenchArgs.add_cli_argsargsc                    s0   dd t | D }| di  fdd|D S )Nc                 S      g | ]}|j qS r=   )name.0attrr=   r=   r>   
<listcomp>       z+BenchArgs.from_cli_args.<locals>.<listcomp>c                    s   i | ]}|t  |qS r=   )getattrrC   r@   r=   r>   
<dictcomp>   s    z+BenchArgs.from_cli_args.<locals>.<dictcomp>r=   )dataclassesfields)clsr@   attrsr=   rI   r>   from_cli_args   s   zBenchArgs.from_cli_args))__name__
__module____qualname__r   r:   __annotations__r   r   r   r   r;   r   r   r   r   r   r   r<   r   r   r   r!   r#   r%   r&   boolr'   r(   r)   r*   r+   r,   r-   r/   staticmethodargparseArgumentParserr?   classmethod	NamespacerO   r=   r=   r=   r>   r   &   s>   
  
r   Fr.   backend_namereqs
ignore_eosr'   r)   r-   r/   c              
      s~  | t |dtdd |D dddddd	}dd |D }	 fdd|D }
|r?dtjv s1J d	tjtjd d
d |  t }|j|	|
||d}t | }|rit	d}t
t|}|  t|| | dkrrt|}| }||d< tdd |D |d< |d | |d< |d | |d< |d | |d< |d |d  | |d< t|rt|}|d d d |d< |S )Nr.   c                 s   s    | ]}|j V  qd S )N)
prompt_lenrD   rr=   r=   r>   	<genexpr>   s    z'throughput_test_once.<locals>.<genexpr>)	r   successful_requeststotal_latencytotal_input_tokenstotal_output_tokensrequest_throughputinput_throughputoutput_throughputtotal_throughputc                 S   rA   r=   )promptr^   r=   r=   r>   rF      rG   z(throughput_test_once.<locals>.<listcomp>c                    s   g | ]}d |j d qS )r   )temperaturemax_new_tokensr\   )
output_lenr^   r'   r\   r=   r>   rF      s    SGLANG_TORCH_PROFILER_DIRz%Please set SGLANG_TORCH_PROFILER_DIR.T)exist_ok)ri   sampling_paramsr-   r/   runtimerb   c                 s   s    | ]	}|d  d V  qdS )	meta_infocompletion_tokensNr=   )rD   or=   r=   r>   r`     s    
rd   ra   re   rc   rf   rg   rh   internal_statesr   last_gen_throughput)lensumosenvironmakedirsstart_profiletimeperf_countergenerategetenvsetlistdirstop_profilemonitor_trace_filejsonloadsget_server_infoinspectisawaitableasynciorun)rZ   r   r[   r\   r'   r)   r-   r/   measurement_resultsri   rp   stgen_outlatencydirknown_filesserver_infor=   rm   r>   throughput_test_once   sr   









r   r$   c           
   	   C   s   t d| d 	 d}t| tt|}||  }|D ]=}tj||}t d|  d}	 ztj|}	W n t	yI   t d| d Y nw |	|krQ|	}nd}nt| q0q|r_d S q	)	NzMonitoring z for new trace files...TFzNew file detected: r   zFile z is no longer accessible.)
printr}   sleepr   ry   r   pathjoingetsizeFileNotFoundError)
r   	directoryintervalflagcurrent_files	new_filesnew_filenew_file_pathprevious_sizecurrent_sizer=   r=   r>   r   *  s6   

r   server_args
bench_argsc           
   
   C   sf  |j dkrtd,i t| }|stdn|j dkr&td,i t| }ntd| jp/| j}t|}t	  t
|j tj
|j i }|jrOttj}t||}tddt|jdd||jd}|jstd	 t|j |||j |d
|j|jd td td t|j |||j ||j|j|jd}|   |j!rt"|j!d}	|	#t$|d  W d    n1 sw   Y  t%dj&dddd t%d&d|d  t%d&d|d  t%d&d|d  t%d&d|d  t%d&d|d   t%d&d!|d"  t%d&d#|d$  t%d&d%|d&  t%d&d'|d(  t%d&d)|d*  t%d+ |S )-Nr   z%Please provide valid engine argumentsrq   z2Please set backend to either "engine" or "runtime"r"   r   g      ?)	input_lenrl   r   range_ratio	tokenizerr   z

Warmup...F)rZ   r   r[   r\   r'   r)   r-   r/   g      ?z
Benchmark...a
z
{s:{c}^{n}}z% Offline Throughput Benchmark Result 2   =)sncz{:<40} {:<10}zBackend:r   zSuccessful requests:ra   z{:<40} {:<10.2f}zBenchmark duration (s):rb   zTotal input tokens:rc   zTotal generated tokens:rd   z#Last generation throughput (tok/s):rv   zRequest throughput (req/s):re   zInput token throughput (tok/s):rf   z Output token throughput (tok/s):rg   zTotal token throughput (tok/s):rh   z2==================================================r=   )'r   r   rK   asdict
ValueErrorr
   tokenizer_path
model_pathr   r	   r3   r%   npr'   r   r   r@   r   r   minr   r   r*   logginginfor   r&   r-   r/   r}   r   r)   shutdownr   openwritedumpsr   format)
r   r   r   tokenizer_idr   r'   input_requestswarmup_requestsresultfoutr=   r=   r>   throughput_testJ  s   









r   __main__SGLANG_USE_MODELSCOPEfalse)true1zUsing local model path: )snapshot_downloadz$Using ModelScope to download model: zModel downloaded to: zModelScope download failed: z%(message)s)levelr   )Fr.   )r$   )<__doc__rV   r   rK   r   r   r   ry   r3   r}   typingr   r   r   numpyr   sglang.bench_servingr   r   r   r   r	   $sglang.lang.backend.runtime_endpointr
   sglang.srt.entrypoints.enginer   sglang.srt.server_argsr   	dataclassr   r:   rT   r;   r   r   r   rP   rW   r0   r?   
parse_argsr@   r   lowerr   existsr   r   
modelscoper   	ExceptionerO   r   r   basicConfigrH   	log_levelupperr+   r=   r=   r=   r>   <module>   s     3

X 

n




#