o
    پim                 +   @   sV  d Z ddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlZddlZddlZddlZddlZddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'Z'ddl(Z)ddl*Z*ddl+Z+ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6m7Z7 dZ8dZ9ej:;dduoe<dduZ=dde>de>de?fddZ@dd ZAeG dd dZBeG dd dZCde>d e>de>fd!d"ZDde>d#e>de>fd$d%ZEde"e>e>f fd&d'ZFd(e#e> de"e>e>f fd)d*ZGde"e>e>f fd+d,ZH	dd-eBd.e$e1 deCfd/d0ZI	dd-eBd.e$e1 deCfd1d2ZJ	dd-eBd.e$e1 deCfd3d4ZK	dd-eBd.e$e1 deCfd5d6ZL	dd-eBd.e$e1 deCfd7d8ZM	dd-eBd.e$e1 deCfd9d:ZNd;e>deCfd<d=ZOd>e$e#e>  d?e$e#e>  de#e%e>e>f  fd@dAZPdBe#e%e>e>f  dCe>ddfdDdEZQdFe>de>fdGdHZRdFe>de&e5e7f fdIdJZSdFe>de&e5e7f fdKdLZTddMdNZUeMeMeJeKeJeKeJeKeIeNeLdOZVeG dPdQ dQZWdRZXdSZYdTdUdVdWdXZZ	YddZe>d[e>d\e>fd]d^Z[dd_e>d[e$e> fd`daZ\dbdc Z]eG ddde deZ^dfe#e" dge6dhe_die`de e^df f
djdkZa	l		mddne`doe3e4B dpe>dqe$e` dre?de#e^ fdsdtZb			u	vddwe>dne`dge6dqe$e` dxe$e` dye$e> de#e^ fdzd{Zc	ddwe>dne`dge6dqe$e` de#e^ f
d|d}Zd			u	vddwe>dne`dge6dqe$e` dxe$e` dye$e> de#e^ fd~dZede`de_de`de#e` fddZf	m	mdde`de`de`de_dge6dwe>dre?de?de#e^ fddZgde>de%e`e`f fddZhdeifddZj	vddne`de`de`de`de_doe3de>de>de>dpe>de?de#e^ fddZkedddd Zldd Zmdd Zndd Zode`de`de`de`de`de_dge6dejpde#e^ fddZq	v	ddfe#e^ de_de?dhe_de e^df f
ddZr		vddfe$e#e^  de#eC de_dge6dpe>de$e_ de?de%eWe#e` f fddZsh dZtde!dpe>de!fddZu	v	v		v				ddpe>d;e>de>de>dge6dfe#e^ de_de$e` de?de#e> de$e> de$e_ de"e>ef de?de?de?de`de?d>e$e#e>  d?e$e#e>  f(ddĄZvddƄ ZwdejpfddɄZxdejpfdd˄Zyddd΄ZzG ddЄ dej{Z|e}dkr)eddӍZ~e~jde>eieV dldd֍ e~jde>dddٍ e~jde>dddٍ e~jde`ddߍ e~jde>dg ddd e~jde>duddٍ e~jde>ddߍ e~jde>ddߍ e~jde>ddߍ e~jde`dddٍ e~jde`dddٍ e~jde`dddٍ e~jde`dddٍ e~jdde`dd e~jde_dddٍ e~jde`dddٍ e~jde>d ddٍ e~jdddd e~jde>dddٍ e~jd	e>d
ddٍ e~jde_e_dddٍ e~jdddd e~jde`dddٍ e~jde>ddߍ e~jdddd e~jdddd e~jdddd e~jdddd e~jdddd e~jddd d e~jd!e`dd"dٍ e~jd#dd$d e~jd%d&e>d'd( e~jd)dd*d e~jd+dd,d e~jd-dd.d e~jd/e>d0d1d2gg d3d4 e~jd5e`dd6 e~jd7ddvd8 e~jd9d0dd: e~jd;e>dd<dٍ e~jd=e>dd>dٍ e~jd?e>d@de|dAdB e~jdCe>dDg dEdFd e~jdGe_dHdIdٍ e~jdJe>dudKdٍ e~jdLddMd e~ ZejdNe>d@ddOdP ejdQe>d@ddRdP e~jdSddTd e~jdUe`ddVdٍ e~jdWddXd e~dYZejdZe`d[d\dٍ ejd]e`d^d_dٍ ejd`e`dadbdٍ ejdce`dddedٍ ejdfe`dgdhdٍ e~jdie_ddjdٍ ejdkddld ejdmddnd ejdoe`ddpdٍ ejdqddrd e~dsZejdte_ddudٍ ejdve`ddwdٍ ejdxe>dyg dXdzd e~jd{e>dd|dٍ e~jd}e>d0dd~dP e~ aeyt dS dS (  a  
Benchmark online serving with dynamic requests.

Usage:
python3 -m sglang.bench_serving --backend sglang --num-prompt 10

python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
    N)ArgumentParser)deepcopy)	dataclassfieldreplace)datetime)	lru_cache)JSONDecodeError)Path)AnyAsyncGeneratorCallableDictListOptionalTupleUnion)load_dataset)Image)tqdm)AutoProcessorAutoTokenizerPreTrainedTokenizerPreTrainedTokenizerBasePreTrainedTokenizerFastz
Assistant:zX-SMG-Routing-Keytermplotlibgnuplotfalsenamedefaultreturnc                 C   s   t | |}| dv S )N)true1)osgetenvlower)r   r   value r'   H/home/ubuntu/.local/lib/python3.10/site-packages/sglang/bench_serving.py_get_bool_env_var@   s   r)   c                  C   s"   d} d}t j| d}t j||dS )Ni`T  i   total)timeoutread_bufsize)aiohttpClientTimeoutClientSession)BENCH_AIOHTTP_TIMEOUT_SECONDS BENCH_AIOHTTP_READ_BUFSIZE_BYTESaiohttp_timeoutr'   r'   r(   _create_bench_client_sessionE   s   r4   c                   @   s   e Zd ZU eeee eeeef  f ed< eed< eed< eed< eed< eed< e	ee  ed< eee
f ed< d	Ze	e ed
< d	Ze	e ed< d	S )RequestFuncInputpromptapi_url
prompt_len
output_lenmodel	lora_name
image_dataextra_request_bodyN	timestamprouting_key)__name__
__module____qualname__r   strr   r   __annotations__intr   r   r>   floatr?   r'   r'   r'   r(   r5   R   s   
 "r5   c                   @   s   e Zd ZU dZeed< dZeed< dZe	ed< dZ
e	ed< eedZee	 ed	< eedZee ed
< dZeed< dZeed< dZeed< dZe	ed< edefddZdS )RequestFuncOutput generated_textFsuccess        latencyttft)default_factoryitltext_chunksr   r8   errorr9   
start_timerequest_func_inputc                 C   s   t  }| j|_|S N)rG   r8   )rS   outputr'   r'   r(   init_newm   s   zRequestFuncOutput.init_newN)r@   rA   rB   rI   rC   rD   rJ   boolrL   rF   rM   r   listrO   r   rP   r8   rE   rQ   r9   rR   staticmethodr5   rV   r'   r'   r'   r(   rG   `   s   
 rG   textprefixc                 C   s   |  |r| t|d  S | S rT   )
startswithlen)rZ   r[   r'   r'   r(   remove_prefixt   s   r^   suffixc                 C   s    |  |r| d t|  S | S rT   )endswithr]   )rZ   r_   r'   r'   r(   remove_suffixx   s    ra   c                  C   s<   t jd} | rdd|  iS t jd}|rd| iS i S )NOPENAI_API_KEYAuthorizationzBearer API_KEY)r#   environget)openai_api_keyapi_keyr'   r'   r(   get_auth_headers|   s   
ri   header_listc                 C   s   dd | D S )Nc                 S   s*   i | ]}| d \}}}|r|r||qS )=)	partition).0hk_vr'   r'   r(   
<dictcomp>   s   * z(parse_custom_headers.<locals>.<dictcomp>r'   )rj   r'   r'   r(   parse_custom_headers   s   rs   c                  C   s(   t  } ttdd  }r| t| | S )Nheader)ri   getattrargsupdaters   )headersrn   r'   r'   r(   get_request_headers   s   ry   rS   pbarc              
      s  | j }|dsJ t 4 I d H }d| jdd| jd| jdd| j}tjr-|d= |d= t	| }d	}t
 }|}z|j||d
4 I d H q}	|	jdkr|	j2 z=3 d H W }
|
 }
|
s\qOt|
dd}t|}| j|d 7  _t
 }|d	kr|| }||_n|j||  |}qO6 || |_d|_| j|_n|	jpdd |	 I d H  |_d|_W d   I d H  n1 I d H sw   Y  W n ty   d|_t }d t!j"| |_Y nw |r|#d |W  d   I d H  S 1 I d H sw   Y  d S )Ngenerate_streamTgư>      ?i   )accumulate_tokens
text_inputtemperaturetop_p
max_tokensstream
min_lengthend_idr   r   rK   urljson   utf-8zdata:text_outputrH   : F   )$r7   r`   r4   r6   r9   r=   rv   disable_ignore_eosrG   rV   timeperf_counterpoststatuscontentstripr^   decoder   loadsrI   rM   rO   appendrL   rJ   reasonrZ   rQ   	Exceptionsysexc_infojoin	tracebackformat_exceptionrw   )rS   rz   r7   sessionpayloadrU   rM   stmost_recent_timestampresponsechunk_byteschunkdatar>   r   r'   r'   r(   async_request_trt_llm   sp   	




( 
0r   c              
      s  | j }|dsJ d| j}t 4 I d H >}| j|d| jtj d}d| jvr.d|d< d| jvr9tj	 |d< |
| j | jrL| j|d< | j|d	< | jrW|
d
| ji t }| jrb| j|t< t| }d}| j}	d}
t }||_|}z|j|||d4 I d H }|jdkr|j2 zj3 d H W }| }|sqt|dd}t | }|dkrqt|}|d d d rt }|
dkrt | }
|
|_n|j|d d d  |j ||  |}||d d d 7 }|!dpi !d|	}	q6 ||_"d|_#||_$|	|_n|j%pdd |& I d H  |_'d|_#W d   I d H  n1 I d H s+w   Y  W n t(yK   d|_#t)* }d+t,j-| |_'Y nw W d   I d H  n1 I d H s]w   Y  |rj|
d |S )Ncompletionsz7OpenAI Completions API URL must end with 'completions'.r   )r:   r6   best_ofr   r   r   rK   
ignore_eosr:   	lora_pathr<   rH   r   r   rx   r   r   data: [DONE]choicesr   rZ   usagecompletion_tokensTr   F).r7   r`   r6   r4   r:   r9   rv   disable_streamr=   r   rw   r;   r<   ry   r?   _ROUTING_KEY_HEADERrG   rV   r   r   rR   r   r   r   r   r^   r   r   r   rM   rP   r   rO   rf   rI   rJ   rL   r   rZ   rQ   r   r   r   r   r   r   )rS   rz   r7   r6   r   r   rx   rU   rI   r9   rM   r   r   r   r   r   rL   r   r>   r   r'   r'   r(    async_request_openai_completions   s   
	





#*0*_
r   c              
      s  | j }|dsJ dttddr4tt }t| }d|_t		 }t
d| d| dt| d	 t| jtr>| j}n"| jrYd
d | jD }|d| jd d|dg}nd| jdg}t 4 I dH h}| j|| jtj d}	d| jvr}d|	d< d| jvrtj |	d< |	| j | jr| j|	d< | j|	d< t }
| jr| j|
t< t| }d}| j}d}t	 }||_|}z|j||	|
d4 I dH }|jdkrtjr|  I dH }|d d d d |_!d|_"t	 | |_#|j#|_$|%di %d ||_n|j&2 zq3 dH W }|' }|sqt(|)d!d"}t	 | }|d#kr%qt *|}|%di gd %d$i }|%dd}|ret	 }|dkrQ|| }||_$n|j+| |j,||  |}||7 }|%dpli %d |}q6 ||_!d|_"||_#||_n|j-pdd% |. I dH  |_/d|_"W d  I dH  n1 I dH sw   Y  W n t0y   d|_"t12 }d3t4j5| |_/Y nw W d  I dH  n1 I dH sw   Y  ttddrt		 }t|}d|_!t
d| d| d&||  d't| d		 |r|d( |S ))a  Makes a request to the OpenAI Chat Completions API.

    Handles both streaming and non-streaming responses, including support
    for image data in messages. Calculates and returns various performance
    metrics.

    Args:
        request_func_input: Input parameters for the request.
        pbar: Optional tqdm progress bar to update.

    Returns:
        RequestFuncOutput: Output of the request, including generated text,
                           latency, TTFT, ITL, and success status.
    zchat/completionszAOpenAI Chat Completions API URL must end with 'chat/completions'.print_requestsFz...zrid=z time=z- message="request start" request_func_input=""c                 S      g | ]	}d d|idqS )	image_urlr   )typer   r'   )rm   img_urlr'   r'   r(   
<listcomp>o  s    z9async_request_openai_chat_completions.<locals>.<listcomp>rZ   r   rZ   userroler   N)r:   messagesmax_completion_tokensr   r   rK   r   r:   r   rH   r   r   r   r   messager   Tr   r   r   r   r   deltar   z time_delta=z message="request end" output="r   )6r7   r`   ru   rv   rC   uuiduuid4r   r6   r   print
isinstancerX   r<   r   r4   r:   r9   r   r=   r   rw   r;   ry   r?   r   rG   rV   r   rR   r   r   r   rI   rJ   rL   rM   rf   r   r   r^   r   r   rP   rO   r   rZ   rQ   r   r   r   r   r   r   )rS   rz   r7   ridinput_partialrequest_start_timer   content_itemsr   r   rx   rU   rI   r9   rM   r   r   r   response_jsonr   r   rL   r   r   r   r>   r   curr_toutput_partialr'   r'   r(   %async_request_openai_chat_completionsJ  s   








&*C*q$
r   c              
      s  | j }| j}t 4 I d H }| j|dd| jtj tj d| j}t	 }t
| }d}d}	t }
|
}z|j|||d4 I d H }|jdkr|j2 zS3 d H W }| }|sWqJt|dd}t |
 }|d	krjqJt|}|d
 d d rt }|	dkrt |
 }	|	|_n|j||  |}||d
 d d 7 }qJ6 ||_d|_||_| j|_n|jpdd | I d H  |_d|_W d   I d H  n1 I d H sw   Y  W n ty   d|_t ! }d"t#j$| |_Y nw W d   I d H  n1 I d H sw   Y  |r|%d |S )NrK   r   )r:   r6   r   r   r   r   r   rH   r   r   r   r   r   r   r   rZ   Tr   F)&r7   r6   r4   r:   r9   rv   r   r   r=   ry   rG   rV   r   r   r   r   r   r   r^   r   r   r   rM   rO   r   rI   rJ   rL   r   rZ   rQ   r   r   r   r   r   r   rw   )rS   rz   r7   r6   r   r   rx   rU   rI   rM   r   r   r   r   r   rL   r   r>   r   r'   r'   r(   async_request_truss  s   




(*(C
r   c                    s  | j }| j}t 4 I d H 3}t|trdnd|dd| jtj ddtj d| j	dtj
d	tjd
di| j}| jr>| j|d< t }| jrI| j|t< t| }d}| j}	d}
t }||_|}d}z|j|||d4 I d H }|jdkr|j2 zg3 d H W }| }|sqvt|dd}t | }|dkrqvt|}d|v r|d rt }|d }|d d }	|
dkrt | }
|
|_n|	| }|dkrqv|| }|| }|j |g|  |}|	}qv6 ||_!d|_"||_#|	|_n|j$pdd |% I d H  |_&d|_"W d   I d H  n1 I d H sw   Y  W n" t'y7   d|_"t() }d*t+j,| |_&t-d|j& Y nw W d   I d H  n1 I d H sIw   Y  |rV|.d |S )NrZ   	input_idssampling_paramsrK   )r   max_new_tokensr   r   r   return_logprobreturn_routed_expertslogprob_start_lenr<   rH   r   r   r   r   r   r   	meta_infor   Tr   Fzoutput.error=r   )/r7   r6   r4   r   rC   r9   rv   r   r   r;   r   r   r=   r<   ry   r?   r   rG   rV   r   r   rR   r   r   r   r   r^   r   r   r   rM   rO   extendrI   rJ   rL   r   rZ   rQ   r   r   r   r   r   r   r   rw   )rS   rz   r7   r6   r   r   rx   rU   rI   r9   rM   r   r   last_output_lenr   r   r   rL   r   r>   num_new_tokens	chunk_gap
adjust_itlr   r'   r'   r(   async_request_sglang_generateN  s   




%*2*Z
r   c                    s   t  rT   )NotImplementedErrorrS   rz   r'   r'   r(   async_request_gserver  s   r   r7   c           	   
      s  t  4 I d H }t }z| drittdd }ttdd }|r&|d u r&d}ttdd }|d u r6tdd}ttj	tj
|tt  }|jddd	 t|}ttd
g ||ttdd |ttdd d}ni }td| d| |j| |d4 I d H &}|jdkrd|_n|jpdd | I d H  |_d|_W d   I d H  n1 I d H sw   Y  W n ty   d|_t }dtj| |_Y nw W d   I d H  |S W d   I d H  |S 1 I d H sw   Y  |S )N/start_profileprofile_num_stepsprofile_by_stage   profile_output_dirSGLANG_TORCH_PROFILER_DIR/tmpT)exist_okparentsprofile_activitiesprofile_stagesprofile_prefix)
activities	num_stepsr   r   
output_dirr   zasync_request_profile api_url=z body=r   r   rH   r   F)r4   rG   r`   ru   rv   r#   r$   r
   pathabspathnormpathrC   r   mkdirr   r   r   rJ   r   rZ   rQ   r   r   r   r   r   r   )	r7   r   rU   r   r   r   bodyr   r   r'   r'   r(   async_request_profile  s^   





(+++r   profile_prefill_urlprofile_decode_urlc                 C   s\   g }| rt | D ]\}}|d| |f q|r,t |D ]\}}|d| |f q|S )zBuild profile URLs list from prefill/decode URL arguments.

    Returns:
        List of (worker_type, url) tuples. e.g., [("Prefill-0", "http://..."), ("Decode-0", "http://...")]
    zPrefill-zDecode-)	enumerater   )r   r   profile_urlsidxr   r'   r'   r(   _build_profile_urls  s   	r   r   modec              
      s   |dkrdnd}|dkrdnd}|dkrdnd}t | d | D ]0\}}t|| d	I d
H }|jrAt d| d| d|  q"t d| d| d| d|j  q"d
S )zCall profile endpoint (start/stop) on PD separated workers.

    Args:
        profile_urls: List of (worker_type, url) tuples
        mode: "start" or "stop"
    startr   /stop_profileStartingStoppingstartedstoppedz profiler...r7   Nz	Profiler z for z worker at z
Failed to z profiler for r   )r   r   rJ   rQ   )r   r   endpointactionaction_pastworker_typer   profile_outputr'   r'   r(   _call_profile_pd  s   r	  pretrained_model_name_or_pathc                 C   sF   t dd dkr!dd l}ddlm} || |jjg dd}|S | S )NSGLANG_USE_MODELSCOPEr   r!   r   )snapshot_download)z.*.ptz.*.safetensorsz.*.bin)model_idlocal_files_onlyignore_file_pattern)r#   r$   r%   huggingface_hub.constants
modelscoper  	constantsHF_HUB_OFFLINE)r
  huggingface_hubr  
model_pathr'   r'   r(   	get_model  s   r  c                 C   f   | d ur| dks
J |  ds|  drddlm} || S | d ur,tj| s,t| } tj| ddS )NrH   .json.modelr   )get_tokenizerTtrust_remote_code)	r`   &sglang.srt.utils.hf_transformers_utilsr  r#   r   existsr  r   from_pretrained)r
  r  r'   r'   r(   r  #      r  c                 C   r  )NrH   r  r  r   )get_processorTr  )	r`   r  r!  r#   r   r  r  r   r  )r
  r!  r'   r'   r(   r!  :  r   r!  c           	      C   s4  t | dd}| jdkr"|rJ t| j| j|| j| j| j| jd}|S | j	dr?t
| j| j| j| j|| j| jdk| d}|S | jdkrct|}t| j| j| j| j| j|| j| j| j| j| jd}|S | jd	kr|rlJ t| j| j| j| j| jt | d
d|| d}|S | jdkrt|}t| j|| j| jdd}|S | jdkr| jstjd| j d }n| j}tj!|st"t#| j  | t$|d}dd |D }W d    n1 sw   Y  |d | j }|S | jdkr|rJ t%| j| j|| j| j| j| jd}|S | jdkrt&| j| j|| jd}|S t'd| j )Ntokenize_promptFsharegpt)dataset_pathnum_requests	tokenizerfixed_output_lencontext_lenprompt_suffixapply_chat_templaterandom)	input_lenr9   num_promptsrange_ratior&  r$  random_samplereturn_textimage)r%  image_countr,  r9   r.  	processorimage_contentimage_formatimage_resolutionbackendrandom_image_countgenerated-shared-prefixgsp_range_ratior|   )
num_groupsprompts_per_groupsystem_prompt_lenquestion_lenr9   r.  r&  rv   mmmuT)r%  r3  r7  r'  r/  mooncaker   z_trace.jsonlrc                 S   s   g | ]}|  rt|qS r'   )r   r   r   )rm   liner'   r'   r(   r         zget_dataset.<locals>.<listcomp>customopenai)r$  r%  r&  r'  zUnknown dataset: )(ru   dataset_namesample_sharegpt_requestsr$  r-  sharegpt_output_lensharegpt_context_lenr)  r*  r\   sample_random_requestsrandom_input_lenrandom_output_lenrandom_range_ratior!  sample_image_requestsr2  r4  r5  r6  r7  r8  'sample_generated_shared_prefix_requestsgsp_num_groupsgsp_prompts_per_groupgsp_system_prompt_lengsp_question_lengsp_output_lensample_mmmu_requestsr#   r   r   mooncake_workloadr  download_and_cache_fileMOONCAKE_DATASET_URLopensample_custom_requestssample_openai_requests
ValueError)	rv   r&  r  r"  input_requestsr3  
local_pathfall_requests_datar'   r'   r(   get_datasetQ  s   
^T
H

9
-


ra  )sglangsglang-native
sglang-oaisglang-oai-chatvllm	vllm-chatlmdeploylmdeploy-chattrtgservertrussc                   @   s&  e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< eed
< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed < d!Zeed"< d#Zeed$< d%S )&BenchmarkMetrics	completedtotal_inputtotal_input_texttotal_input_visiontotal_outputtotal_output_retokenizedrequest_throughputinput_throughputoutput_throughputoutput_throughput_retokenizedtotal_throughputtotal_throughput_retokenizedmean_ttft_msmedian_ttft_msstd_ttft_msp99_ttft_msmean_tpot_msmedian_tpot_msstd_tpot_msp99_tpot_msmean_itl_msmedian_itl_ms
std_itl_ms
p95_itl_ms
p99_itl_ms
max_itl_msmean_e2e_latency_msmedian_e2e_latency_msstd_e2e_latency_msp90_e2e_latency_msp99_e2e_latency_msconcurrencyrK   max_output_tokens_per_sr   max_concurrent_requestsN)r@   rA   rB   rE   rD   rF   r  r  r'   r'   r'   r(   rm    sF   
 rm  z)anon8231489123/ShareGPT_Vicuna_unfilteredz)ShareGPT_V3_unfiltered_cleaned_split.jsonzjhttps://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/arxiv-trace/mooncake_trace.jsonlzihttps://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/conversation_trace.jsonlzfhttps://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/synthetic_trace.jsonlzfhttps://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/toolagent_trace.jsonl)r@  conversation	synthetic	toolagentdatasetrepo_idfilename	repo_typec                 C   s   ddl m} || ||dS )z7Download a file from Hugging Face and cache it locally.r   )hf_hub_download)r  r  r  )r  r  )r  r  r  r  r'   r'   r(   download_and_cache_hf_file  s   r  r   c              	   C   s  |du rt jd| dd }t|r|S td|  d|  tj| dd}|  t	|j
d	d
}d}t|d?}t||dddd}|j|dD ]}|| |t| qLW d   n1 sew   Y  W d   |S W d   |S 1 s}w   Y  |S )z!Read and cache a file from a url.Nr   /r   zDownloading from z to T)r   zcontent-lengthr      wbB)descr+   unit
unit_scaleunit_divisor)
chunk_size)r#   r   r   splitis_file_valid_jsonr   requestsrf   raise_for_statusrE   rx   rY  r   iter_contentwriterw   r]   )r   r  r   
total_sizer  r_  barr   r'   r'   r(   rW     s6   
(rW  c              
   C   s   t j| sdS zt| }t| W d    W dS 1 sw   Y  W dS  tyC } zt|  d|d W Y d }~dS d }~ww )NFTz" exists but json loading fails (e=z), thus treat as invalid file)r#   r   isfilerY  r   loadr	   r   )r   r_  er'   r'   r(   r  "  s    

r  c                   @   s   e Zd ZU eed< eed< eed< dZee ed< dZee ed< dZ	ee
e  ed< dZee ed< dZee ed	< dZeeeef  ed
< dd ZdS )
DatasetRowr6   r8   r9   Ntext_prompt_lenvision_prompt_lenr<   r>   r?   r=   c                 C   s:   | j d u r	| j| _ | jd u rd| _| jd u ri | _d S d S )Nr   )r  r8   r  r=   )selfr'   r'   r(   __post_init__>  s   



zDatasetRow.__post_init__)r@   rA   rB   rC   rD   rE   r  r   r  r<   r   r>   rF   r?   r=   r   r   r  r'   r'   r'   r(   r  2  s   
 r  r]  r&  slowdown_factor
num_roundsc              
   C  sv  | sdS | j dd d t }| d d }| D ]}|d | d }|| }t | }	||	 }
|
dkr=t|
I dH  d}|d	g }|D ]}|| d
dgd  7 }qG|d7 }|dd}g }t|D ]P}|dd|d  d| d z|j	|dddd}W n t
y   ddd |D }Y nw t||}t|||dV  d
dg| }|d|d qgqdS )z
    An async generator that yields requests based on the timestamps in the Mooncake trace file,
    with support for multi-round sessions.
    Nc                 S   s   | d S )Nr>   r'   rA  r'   r'   r(   <lambda>T  s    z0get_mooncake_request_over_time.<locals>.<lambda>keyr   r>        @@rH   hash_ids hi   z&Tell me a story based on this context.output_length   r   zRound r   r   r   FT)tokenizeadd_generation_promptreturn_dict
c                 S   s"   g | ]}|d   d|d  qS )r   r   r   r'   )rm   msgr'   r'   r(   r     s   " z2get_mooncake_request_over_time.<locals>.<listcomp>r6   r8   r9   story	assistant)sortr   r   asynciosleeprf   r   ranger   r*  r   r]   encoder  )r]  r&  r  r  rR   trace_start_time_msrecordrelative_arrival_time_starget_arrival_time_scurrent_elapsed_time_ssleep_duration_suser_query_baser  hash_idoutput_len_per_roundchat_historyifull_prompt_textr8   placeholder_responser'   r'   r(   get_mooncake_request_over_timeG  s`   



r  rb  Tr%  r3  r7  r'  r/  c                 C   s  t d zt d tdddd}t dt| d W n ty5 } zt d	|  td
| d}~ww t|| kr[|rNttt|| }||}n|tt	| t|}n
t d|  d |}t dt| d g }	t
|D ]z\}
}zZ|d}|durt|dr|jdkr|d}t }|j|dd t| d}d| }nW qu|d}d| d}|dur|nd}t||g|g|||}|	| W qu ty } zt d|
 d|  W Y d}~qud}~ww t dt|	 d |	S ) az  
    Sample requests from the MMMU dataset using HuggingFace datasets.

    Args:
        num_requests: Number of requests to sample.
        fixed_output_len: If provided, use this fixed output length for all requests.
        random_sample: Whether to randomly sample or take the first N.

    Returns:
        List of tuples (prompt, prompt_token_len, output_token_len).
    z(Loading MMMU dataset from HuggingFace...z'Attempting to load MMMU Math dataset...z	MMMU/MMMUMathtest)r  z<Successfully loaded MMMU Math dataset from HuggingFace with z	 examplesz"Failed to load MMMU Math dataset: zFailed to load MMMU dataset: NzDataset has less than z examples, using all examplesz	Selected z examples for benchmarkingimage_1saveRGBARGBPNG)formatr   zdata:image/png;base64,questionz
Question: z


Answer: r  zError processing example r   	
Created z MMMU prompts)r   r   r]   r   r\  r+  sampler  selectminr   rf   hasattrr   convertioBytesIOr  pybase64	b64encodegetvaluer   create_mm_data_rowr   )r%  r3  r7  r'  r/  mmmu_datasetr  indicessample_datasetfiltered_datasetr  exampler1  bufferedimg_strr<   r  text_promptr9   data_rowr'   r'   r(   rU    sb   





 rU  rH   Fr$  r(  r)  c                 C   s  |d ur|dk rt dt| s| dkrtttd} t| }t|}W d    n1 s.w   Y  dd |D }dd |D }t	| g }	t
t|D ]n}
t|	|krX ne||
 d }|rit|t| t }|r|jd	|d
gdddd}|jr||jd}||}||
 d }||}t|}|d u rt|n|}|dk s|dk rqN|r|| |krqN|	t|||d qNtdtdd |	D   tdtdd |	D   |	S )N   output_len too smallrH   r  r  c              
   S   ,   g | ]}t |d |dg dkr|qS conversationsr     r]   rf   rm   r   r'   r'   r(   r     
    z,sample_sharegpt_requests.<locals>.<listcomp>c              
   S   D   g | ]}| d | dg d d | d | dg d d fqS r  r  r   r&   r   rf   r   r'   r'   r(   r         r   r   r   TFr  r  r  r   r  r  #Input tokens: c                 S      g | ]}|j qS r'   r8   rm   xr'   r'   r(   r   D      #Output tokens: c                 S   r  r'   r9   r
  r'   r'   r(   r   E  r  )r\  r  r  SHAREGPT_REPO_IDSHAREGPT_FILENAMErY  r   r  r+  shuffler  r]   ra   ASSISTANT_SUFFIXr*  	bos_tokenr   r  r   r  r   npsum)r$  r%  r&  r'  r(  r)  r*  r_  r  r  r  r6   prompt_token_ids
completioncompletion_token_idsr8   r9   r'   r'   r(   rG    st   	

	


rG  c              
      sz  g }t | d2}|D ]'}|dkrt||kr n| r1z
|t| W q
 tjy0   Y q
w q
W d   n1 s<w   Y  h d g }|D ]I}|dg }	|	sTqI|p[|dd}
 fdd	| D }t|j	|	d
d
d}d|v rt
|d }t||}||7 }|t|	||
|d qItdt| d tdtdd |D   tdtdd |D   |S )a  
    Load OpenAI-compatible chat completion requests from a JSONL file.

    Each line should be a JSON object with:
    - "messages": list of {"role": str, "content": str}
    - "max_tokens": int (used as output_len if fixed_output_len not set)
    - "tools": optional list of tool definitions
    - "temperature": optional temperature value
    - "top_p": optional top_p value
    - Other OpenAI API parameters are also extracted and passed through
    rA  r   N>   r:   r   r   r   r   r   r  c                    s   i | ]\}}| vr||qS r'   r'   )rm   ro   rq   EXCLUDED_FIELDSr'   r(   rr   v  rC  z*sample_openai_requests.<locals>.<dictcomp>T)r  r  tools)r6   r8   r9   r=   zLoaded z OpenAI-format requestsr  c                 S   r  r'   r	  r
  r'   r'   r(   r     r  z*sample_openai_requests.<locals>.<listcomp>r  c                 S   r  r'   r  r
  r'   r'   r(   r     r  )rY  r]   r   r   r   r   r	   rf   itemsr*  dumpsr  r  r   r  r  )r$  r%  r&  r'  r  r_  rB  r  r   r   r9   
extra_bodyr8   	tools_strtools_tokensr'   r  r(   r[  I  sV   	r[  c              
   C   sT  |dur|dk rt dg }tj| std|  t| ddd(}|D ]}	|	 }	|	rBz
|t	|	 W q% tj
yA   Y q%w q%W d   n1 sMw   Y  g }
|D ]5}|d|d	g }t|d
kr|d d|d dd}|d d|d dd}|
||f qV|
}t| g }tt|D ]n}t||kr ne|| d }|rt|t| t }|r|jd|dgdddd}|jr||jd}||}|| d }||}t|}|du rt|n|}|d
k s|d
k rq|r|| |krq|t|||d qtdtdd |D   tdtdd |D   |S )zg
    Sample requests from a custom JSONL dataset: supports 'content'/'value' as conversation keys.
    Nr  r  zDataset not found at rA  r   )encodingr  r  r  r   r   r&   rH   r   r   r   TFr  r  r  c                 S   r  r'   r	  r
  r'   r'   r(   r     r  z*sample_custom_requests.<locals>.<listcomp>r  c                 S   r  r'   r  r
  r'   r'   r(   r     r  )r\  r#   r   r  FileNotFoundErrorrY  r   r   r   r   r	   rf   r]   r+  r  r  ra   r  r*  r  r   r  r  r   r  r  )r$  r%  r&  r'  r(  r)  r*  r  r_  rB  processed_datasetr   convs	user_turnassist_turnr  r  r6   r  r  r  r8   r9   r'   r'   r(   rZ    s   




rZ  full_lenr.  numc                 C   s(   t jjtt| | d| d |d S )Nr   size)r  r+  randintmaxrE   tolistr'  r.  r(  r'   r'   r(   compute_random_lens  s   r/  r,  r9   r-  r0  c                    s  t | ||d}t |||d}	|r(t }
t|D ] td|  |
 | < q|rt|s4tttd}t	|}t
|}W d    n1 sHw   Y  dd |D }dd |D }t| g }|D ]X}t|  |krp nM|d }|}t|}|dkrqd||  kr|d |   }n|  | d | }|| d |   }|}|r|}|t||  |	  d qdn7tjjdj|d	g }t|D ]&  fd
dt|  D }|r|}|t||  |	  d qtdt|  tdt|	  |S )Nr.  r   r  c              
   S   r  r  r  r   r'   r'   r(   r   )  r  z*sample_random_requests.<locals>.<listcomp>c              
   S   r  r  r  r   r'   r'   r(   r   /  r  r   r  r)  c                    s&   g | ]}t     | j qS r'   )rE   
vocab_size)rm   jr  offsetsr&  r'   r(   r   ^  s    r  r  )r/  rE   num_special_tokens_to_addr  r,  r  r  r  r  rY  r   r  r+  r  r]   r  r   r   r  r  r+  r0  r   r  )r,  r9   r-  r.  r&  r$  r/  r0  
input_lensoutput_lensnum_special_tokensr_  r  r]  r   r6   r  r8   r   ratioinput_contentr'   r2  r(   rJ     s   




	

rJ  r6  c                 C   s   ddddd}| |v r||  S |    }d|v rH|d}t|dkrH|d  rH|d	  rHt|d }t|d	 }|dkrH|dkrH||fS td
|  d)zParse image resolution into (width, height).

    Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
    (e.g., '1080x1920' means height=1080, width=1920).
    )i   ip  )i  i8  )i   i  )i  ih  )4k1080p720p360pr  r  r   r   zUnsupported image resolution: zX. Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920).)r   r%   r  r]   isdigitrE   r\  )r6  resolution_to_sizerespartsheightwidthr'   r'   r(   parse_image_resolutionq  s$   
$
rD  imagesc              
   C   sF  z+t |jdkr| dd}ndd |D }|d| d |jd|d	gd
dd}W n tyJ } ztd| d d|  }W Y d }~nd }~ww ||g|dddd  }	z|jd| d	gd
dd}
||
gdddd  }W n ty   t|dr|j	n|}t
|| }Y nw |	| }|dv }t|r| n||	||||dS )NPhi4MMProcessorzimage 1z|endoftext10|c                 S   r   )r1  r   )r   r1  r'   )rm   image_base64r'   r'   r(   r     s    z&create_mm_data_row.<locals>.<listcomp>rZ   r   r   r   TF)r  r  zError applying chat template: z, fallback to <image> tagz<image>pt)rZ   rE  paddingreturn_tensorsr   )rZ   rI  rJ  r&  )rb  rd  re  rf  rg  rh  ri  )r6   r8   r9   r  r  r<   )r   r@   r   r   r*  r   r   numelr  r&  r]   r  r  )r  rE  images_base64r9   r3  r7  r   
prompt_strr  r8   text_only_promptr  tokenizer_to_user  use_raw_promptr'   r'   r(   r    sr   


	
r  r2  r4  r5  r8  c              
      s"  t |\}}|
rtjjd|d | d}t|}n
t| |}||  }|| dkr?|dkr?tjd| d| d| dtd	d
 t	||| d}t	||| d}||fdt
dt
dttt
fffdd g }d}t| D ]I}t
|| }t|jt|dr||jndt
|| }t fddt|D  \}}}|tt|7 }t|t|t|t
|| ||	}|| qitdtdd |D   tdtdd |D   td|  |
rtdt| dt| dt|d ntd| d  td!t| d" d" d#||   d$	 |S )%a  Generate requests with images.

    - If ``random_image_count`` is True, each request includes a random number of images between 1 and ``image_count``.
    - If ``random_image_count`` is False, each request includes exactly ``image_count`` images.
    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
      or custom 'heightxwidth' (e.g., 1080x1920).
    - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
      only counts text tokens and excludes image data.
    r   r)  i  d   zHigh resolution (r  z) with zP total images may take a long time. Consider reducing resolution or image count.r  
stacklevelr.  rC  rB  r    c                    s    dkrt j|| dfdt jd}nt j|| dd t j}t|}t	 }|j
|dd t| d}d d	| }t|d}|||fS )
Nblank      )dtypeU   )r  qualityr   zdata:image/z;base64,)r  fulluint8r+  randastyper   	fromarrayr  r  r  r  r  r  r   r]   r  )rC  rB  arrimgbufencodedr<   image_bytes)r4  r5  r'   r(   _gen_random_image_data_uri  s   

z9sample_image_requests.<locals>._gen_random_image_data_urir   image_token_idNc                    s   g | ]}  qS r'   r'   )rm   rp   )rd  r'   r(   r   2  r  z)sample_image_requests.<locals>.<listcomp>r  c                 S   r  r'   r	  r
  r'   r'   r(   r   A  r  r  c                 S   r  r'   r  r
  r'   r'   r(   r   B  r  z#Total images: z#Images per request: min=z, max=z, mean=z.2fz#Images per request: z (fixed)r  r  z images with average z bytes per request)rD  r  r+  r+  r  rZ  warningswarnUserWarningr/  rE   r   rC   r  gen_mm_promptr&  r  re  ziprX   r  r   r   r  r,  meanr]   )r%  r2  r,  r9   r.  r3  r4  r5  r6  r7  r8  rC  rB  image_countstotal_imagesr5  r6  r  total_image_bytesr  request_image_countr  rE  rL  images_bytesr  r'   )rd  r4  r5  r(   rN    s   


($rN  r   )maxsizec                 C   s   t |   S )z:Get all available token ids from the tokenizer vocabulary.)rX   	get_vocabvaluesr&  r'   r'   r(   get_available_tokensR  s   ru  c                 C   s    t | }tj||d}| |S zNGenerate a random prompt of specified token length using tokenizer vocabulary.)ro   )ru  r+  r   r   )r&  	token_numall_available_tokensselected_tokensr'   r'   r(   
gen_promptX  s   
rz  c                 C   s6   t |   }|r|| tj||d}| |S rv  )rX   rr  rs  remover+  r   r   )r&  image_pad_idrw  rx  ry  r'   r'   r(   ri  _  s
   

ri  c                 C   s\   t  d d d }d| j d| j d| j d| j d| j d| j d|jj	 d}|| S )z6Create cache directory under ~/.cache/sglang/benchmarkz.cacherb  	benchmarkgen_shared_prefix_rp   z.pkl)
r
   homeseedrP  rQ  rR  rS  rT  	__class__r@   )rv   r&  	cache_dir	cache_keyr'   r'   r(   get_gen_prefix_cache_pathh  s   r  r;  r<  r=  r>  rv   c                    s  t |dd}t |dd t|}	|dko| o dk}
|	 rC|
rCtd|	  t|	d}t|W  d   S 1 s>w   Y  td| d	 d
|d|d|d|d d t j	dd }t
 d}t||| dtt|||    d|  tt|||  d| }~~~fddt| D } fddt| D }g }d}d}tt| ddD ]m}|| }|r| d| d| nd}ttdddD ]O}|| | }| d|d  g|dd  } dkr|d n|}t |ddr
dnt|d }t|||f }|t||||d ||7 }||7 }qqt |dds<t| td  td!|   td"  td#   td$t|  t |ddstd%|  td&|  td'tfd(d)|D t| d*d+ d,d |D }td-tfd.d)|D t| d*d/ |
r|	jjd0d0d1 td2|	  t|	d3}t|| W d   |S 1 sw   Y  |S )4zWGenerate benchmark requests with shared system prompts using random tokens and caching.gsp_send_routing_keyFgsp_num_turnsr   z*
Loading cached generated input data from rbNz*
Generating new input data... (num_groups=z, z, system_prompt_len=z, question_len=z, output_len=z, range_ratio=z, num_turns=)   z%Y%m%d%H%M%Sr.  c                    s   g | ]	}t  | qS r'   )rz  rm   r  )system_prompt_lensr&  r'   r(   r     s    z;sample_generated_shared_prefix_requests.<locals>.<listcomp>c                    s(   g | ]  fd dt D qS )c                    s(   g | ]  fd dt D qS )c              	      s$   g | ]}t t |f qS r'   )rz  rE   )rm   t)gpquestion_lensr&  r'   r(   r     s    zQsample_generated_shared_prefix_requests.<locals>.<listcomp>.<listcomp>.<listcomp>r  rm   )r  	num_turnsr  r&  r  r(   r     s    zFsample_generated_shared_prefix_requests.<locals>.<listcomp>.<listcomp>r  r  )r  r<  r  r&  )r  r(   r     s    r   zGenerating system prompt)r  rp   zGenerating questions)r  leavez

gsp_fast_prepare)r6   r8   r9   r?   gsp_orderedz,
Generated shared prefix dataset statistics:zNumber of groups: zPrompts per group: zNumber of turns: zTotal prompts: zTotal input tokens: zTotal output tokens: zAverage system prompt length: c                 3       | ]
}t  |V  qd S rT   r]   r  )rm   sprt  r'   r(   	<genexpr>      z:sample_generated_shared_prefix_requests.<locals>.<genexpr>z.1fz tokensc                 S   s$   g | ]}|D ]	}|D ]}|q
qqS r'   r'   )rm   groupconvqr'   r'   r(   r     s   $ zAverage question length: c                 3   r  rT   r  )rm   r  rt  r'   r(   r    r  z tokens
T)r   r   z Caching generated input data to r  )ru   r  r  r   rY  pickler  r   r   hexr   nowstrftimer/  r  arrayreshaper  r   r]   r  rE   r   r  r+  r  r  parentr   dump)r;  r<  r=  r>  r9   r.  r&  rv   send_routing_key
cache_pathshould_cacher_  run_random_strrun_start_timestampr6  system_prompts	questionsr]  total_input_tokenstotal_output_tokens	group_idxsystem_promptr?   
prompt_idxturn_questionsturn_promptsfull_promptr8   output_len_valall_questionsr'   )r  r<  r  r  r&  r(   rO  u  s   
 






&&
rO  r|   request_rateuse_trace_timestampsc                 C  s   |rIt d| d | jdd d t }| r| d jnd}| D ]$}|j| d }|||  }|t  }	|	dkrCt|	I d H  |V  q"d S t| }
|
D ]}|V  |tdkr[qOt	j
d	| }t|I d H  qOd S )
NzCUsing trace timestamps for request generation with slowdown factor .c                 S   s   | j S rT   )r>   r  r'   r'   r(   r  	  s    zget_request.<locals>.<lambda>r  r   r  infr|   )r   r  r   r   r>   r  r  iterrF   r  r+  exponential)r]  r  r  r  rR   r  requesttrace_time_starget_arrival_timesleep_durationinput_requests_iterintervalr'   r'   r(   get_request  s0   

r  outputsdur_saccept_lengthplot_throughputc           .      C   s  g }g }d}	d}
d}d}g }g }g }g }g }|d uo!|dko!|dv }t t|D ]}|| jr|| j}|| t|j|| jdd}|| | d urc|	| | j7 }	|
| | j7 }
|| | j	7 }|dkrx||| j
|| j |d   |rt|| jD ]\}}t|j|| j| dd}|| }||g|  qn||| j7 }||| j ||| j
 |d7 }q(|d |d q(|dkrtjddd d	}d}d
d |D }|rtdd |D }tdd |D }tt|| d }t|} t|}!|D ]n}"|"jsq
|"j|"j g}#|#d }$|"jD ]}%|$|%7 }$|#|$ q |#D ] }&t|&| }'d|'  krC|k rNn q/| |'  d7  < q/t|"j| }(t|"j|"j
 | })t |(t|)d |D ]}*|!|*  d7  < qkq
t| dkrtt| }tt|!}|rtrdd l}+|+ },|,jtt| | ddd |,jtt|!|!ddd |,  nt d |r|n|}t!d:i d|d|	d|
d|dt"|dt"|d|| d|	| dt"|| dt"|| d|	t"| | d|	t"| | d t#|pdd! d"t$|pdd! d#t%|p(dd! d$t&|p3dd%d! d&t#|p?dd! d't$|pJdd! d(t%|pUdd! d)t&|p`dd%d! d*t#|pldd! d+t$|pwdd! d,t%|pdd! d-t&|pdd.d! d/t&|pdd%d! d0t|pdd! d1t#|d! d2t$|d! d3t%|d! d4t&|d5d! d6t&|d%d! d7t"|| d8|d9|}-|-|fS );Nr   )rd  re  F)add_special_tokensr   zYAll requests failed. This is likely due to a misconfiguration on the benchmark arguments.r  rR  rK   c                 S   s   g | ]}|j r|qS r'   rJ   rm   rU   r'   r'   r(   r   m      z%calculate_metrics.<locals>.<listcomp>c                 s       | ]}|j V  qd S rT   )rR   r  r'   r'   r(   r  o      z$calculate_metrics.<locals>.<genexpr>c                 s   s    | ]	}|j |j V  qd S rT   )rR   rL   r  r'   r'   r(   r  p  s    
zOutput tokens per secondzTime (s))titlexlabelzConcurrent requests per secondz8tip: install termplotlib and gnuplot to plot the metricsrn  ro  rp  rq  rr  rs  rt  ru  rv  rw  rx  ry  rz    r{  r|  r}  c   r~  r  r  r  r  r  r  r  _   r  r  r  r  r  r  Z   r  r  r  r  r'   )'r  r]   rJ   r9   r   r  rI   r8   r  r  rL   rM   r   rO   rP   r   rf  rg  r  r,  rE   r  ceilzerosrR   rF   TERM_PLOTLIB_AVAILABLEr   figureplotarangeshowr   rm  r  rk  medianstd
percentile).r]  r  r  r&  r7  r  r  r6  retokenized_output_lensro  rp  rq  rn  itlstpotsttftse2e_latenciesretokenized_itlsuse_retokenized_itlr  r9   retokenized_output_lenro   rO   
num_tokensadjusted_itlr  r  successful_outputsmin_start_timemax_end_timeduration_secondstokens_per_secondconcurrent_requests_per_secondrU   token_timescurrent_time	itl_value
token_timesecond_bucketrequest_start_secondrequest_end_secondsecondtplfigmetricsr'   r'   r(   calculate_metrics&  sh  	



"	





	

 !"#$'r  >   rg  ri  re  request_funcc                    sF   |t v sJ dt  d| 	 ddtdtt dtt f fdd}|S )	Nz(Multi-turn only supports chat backends: z, got rS   rz   r    c                    s   | j }g }g }tt|D ]9}|d|| d tt| t|d} ||t|d kr2|nd dI d H }|| |d|jd q|S )Nr   r   )r6   r   )rz   r  )r6   r  r]   r   r   copyr   rI   )rS   rz   promptsprev_messagesr  round_indexinner_inputrU   r  r'   r(   r_    s"   

z'wrap_multi_turn_request_func.<locals>.frT   )MULTI_TURN_BACKENDSr5   r   r   r   rG   )r  r7  r_  r'   r  r(   wrap_multi_turn_request_func  s   
r  base_urlr  max_concurrencydisable_tqdm
lora_nameslora_request_distributionlora_zipf_alphar=   profilepd_separatedflush_cachewarmup_requestsc           @         s  | t v r
t |  ntd|  |d j}t|to't|dko't|d t}|r0t| d|r7t	|nd fdd}t
d| d tjdkr|d }|d	g }d
}|D ]}|| ddgd  7 }q[|d7 }|dd}t||}t|||d d}n|d }|	d urt|	dkr|	d } nd } t||j||jt|jd| |j|d}!g }"t|D ]}#|"t|!d qtj|" I d H }$|rdd |$D }$|dkrtdd |$D std|$d j t
dtj d d| v rtds|rtj|d t  d t!"d g }%|r$|r$t#||}%|%s$t
d  t
d! |rL|r6|%r5t$|%d"I d H  nt
d# t%|d$ d%I d H }&|&j&rLt
d& t!' }'g }(t|})| dkrtjdkrt
d' t(||||}*t
d(t| d)| d*|  |)tj)9 })nt*||}*|
d+krd}+n!|
d,krt+, fd-dtt|	D },|,t+-|, }-nd }+d }-|rd nt.|)d.}.|*2 zo3 d H W }/|	d ur t|	dkr |
d/krt/0|	} n*|
d+kr|	|+ } |+d0 t|	 }+n|
d,ksJ d1|
 d2t+j/j0|	|-d3} nd } i ||/j1}0t||/j||/j|/j| |/j|0|/j2|/j3d4
}1|(t||1|.d5 q6 tj|( I d H }2|r>d6d |2D }2|ro|rP|%rOt$|%d7I d H  nt4td8d d u rot
d9 t%|d: d%I d H }&|&j&rot
d; |.d urx|.5  d| v rtj|d< t  d}3|3j6d=kr|37 }4d>|4v r|4d> d }4d?|4v r|4d? r|4d? d d@d }5nd }5nd }5nd }5t!' |' }6t8|rd n||2|6|| |5tj9dA\}7}8t
dBj:dCdDdEdF t
dG:dH|  t
dG:dI|rdJn| t
dG:dK|r|ndL t
dG:dM|7j; t
dN:dO|6 t
dG:dP|7j< t
dG:dQ|7j= tjdRv r0t
dG:dS|7j> t
dG:dT|7j? t
dG:dU|7j@ t
dN:dV|7jA t
dN:dW|7jB t
dN:dX|7jC t
dN:dY|7jD t
dG:dZ|7jE t
dN:d[|7jF t
dN:d\|7jG |5rt
dN:d]|5 t
d^j:d_dDd`dF t
dN:da|7jH t
dN:db|7jI t
dN:dc|7jJ t
dN:dd|7jK t
d^j:dedDd`dF t
dN:df|7jL t
dN:dg|7jM t
dN:dh|7jN t
d^j:didDd`dF t
dN:dj|7jO t
dN:dk|7jP t
dN:dl|7jQ t
d^j:dmdDd`dF t
dN:dn|7jR t
dN:do|7jS t
dN:dp|7jT t
dN:dq|7jU t
dN:dr|7jV t
ds tj|d< t  d}9|9j6d=krS|97 nd }3|7jMd ur|7jRd ur|7jCd uri dtt4tdtd dutjWdvtjdw|r}dJn|dx|dytjXdztjYd{tjZd|tj[d}|3d~|6d|7j;d|7j<d|7j=d|7j>d|7j?d|7j@i d|7jAd|7jBd|7jCd|7jFd|7jHd|7jId|7j\d|7jJd|7jKd|7jLd|7jMd|7j]d|7jNd|7jOd|7jPd|7j^d|7jQ|7jR|7jS|7j_|7jT|7jU|7jG|5|7jD|7jEd	}:nt
d|  t
d tj`rtj`};nYtab cd}<tjdkrJtjW d|< dtjd dtjY dtjZ dtje dtjf d};n.tjgdrgtjW d|< dtjd dtjY dtjZ d
};ntjW d|< dtjd dtj d};dd |2D |8dd |2D dd |2D dd |2D dd |2D d}=th|;d }>tjir|:|=B }?n|:}?|>jt7k|?d  W d    |:|=B S 1 sw   Y  |:|=B S )NzUnknown backend: r   )r7  c              	      sj   d u r | |dI d H S 4 I d H   | |dI d H W  d   I d H  S 1 I d H s.w   Y  d S )Nr   r'   r   )r  	semaphorer'   r(   limited_request_func&	  s   0z'benchmark.<locals>.limited_request_funczStarting warmup with z sequences...r@  r  rH   r  r  i   z/Can you tell me a detailed story in 1000 words?r      )r6   r8   r9   r<   )r:   r6   r7   r8   r9   r;   r<   r=   )rS   c                 S      g | ]	}|D ]}|qqS r'   r'   rm   rU   r  r'   r'   r(   r   d	      zbenchmark.<locals>.<listcomp>c                 s   r  rT   r  r  r'   r'   r(   r  g	  r  zbenchmark.<locals>.<genexpr>zUWarmup failed - Please make sure benchmark arguments are correctly specified. Error: zWarmup completed with z* sequences. Starting main benchmark run...rb  SGLANG_IS_IN_CIz/flush_cacherx   r|   zQWarning: PD separated mode requires --profile-prefill-url or --profile-decode-urlzBSkipping profiler start. Please specify worker URLs for profiling.r   zStarting profiler...r   r  zProfiler startedzEUsing time-based Mooncake request scheduler, ignoring --request-rate.z*Starting Mooncake trace replay. Sessions: z, Rounds per session: z. Slowdown factor: distinctskewedc                    s   g | ]} |  qS r'   r'   r  )r	  r'   r(   r   	  r  r*   uniformr   z&Unexpected lora_request_distribution: z. Expected 'skewed'.r  )
r:   r6   r7   r8   r9   r;   r<   r=   r>   r?   r   c                 S   r  r'   r'   r  r'   r'   r(   r   	  r  stopr   zStopping profiler...r   zProfiler stoppedz/get_server_infor   r   internal_statesavg_spec_accept_length)r]  r  r  r&  r7  r  r  z
{s:{c}^{n}}z Serving Benchmark Result 2   rk   )sncz{:<40} {:<10}zBackend:zTraffic request rate:tracezMax request concurrency:znot setzSuccessful requests:z{:<40} {:<10.2f}zBenchmark duration (s):zTotal input tokens:zTotal input text tokens:r1  r?  zTotal input vision tokens:zTotal generated tokens:z%Total generated tokens (retokenized):zRequest throughput (req/s):zInput token throughput (tok/s):z Output token throughput (tok/s):z%Peak output token throughput (tok/s):zPeak concurrent requests:zTotal token throughput (tok/s):zConcurrency:zAccept length:z{s:{c}^{n}}zEnd-to-End Latency-zMean E2E Latency (ms):zMedian E2E Latency (ms):zP90 E2E Latency (ms):zP99 E2E Latency (ms):zTime to First TokenzMean TTFT (ms):zMedian TTFT (ms):zP99 TTFT (ms):z'Time per Output Token (excl. 1st token)zMean TPOT (ms):zMedian TPOT (ms):zP99 TPOT (ms):zInter-Token LatencyzMean ITL (ms):zMedian ITL (ms):zP95 ITL (ms):zP99 ITL (ms):zMax ITL (ms):z2==================================================tagr7  rF  r  r  rH  rK  rL  rM  server_infodurationrn  r  total_input_text_tokenstotal_input_vision_tokensr  total_output_tokens_retokenizedrt  ru  rv  rx  r  r  r  r  r  rz  r{  r|  r}  r~  r  r  r  )	r  r  r  r  r  r  r  r  r  z*Error running benchmark for request rate: z------------------------------z%m%dr1  rp   imgs_z.jsonlr+  c                 S   r  r'   r	  r  r'   r'   r(   r   
  r  c                 S   r  r'   )rM   r  r'   r'   r(   r   
  r  c                 S   r  r'   )rO   r  r'   r'   r(   r   
  r  c                 S   r  r'   )rI   r  r'   r'   r(   r   
  r  c                 S   r  r'   )rQ   r  r'   r'   r(   r   
  r  )r5  r6  r  r  generated_textserrorsar  )lASYNC_REQUEST_FUNCSr\  r6   r   rX   r]   rC   r  r  	Semaphorer   rv   rF  rf   r   r  r  r5   r8   r  r9   r<   r  r   create_taskgatheranyrQ   r  r)   r  r   ri   r   r  r   r	  r   rJ   r   r  mooncake_num_roundsr  r  r  r  r   r+  choicer=   r>   r?   ru   closestatus_coder   r  r  r  rn  ro  rp  rq  rr  rs  rt  ru  rv  r  r  rx  r  r  r  r  r  rz  r{  r}  r~  r  r  r  r  r  r  r  r7  rH  rK  rL  rM  r  r|  r  r  output_filer   r  r  r-  r2  r6  r\   rY  output_detailsr  r  )@r7  r7   r  r  r&  r]  r  r  r  r  r  r	  r=   r
  r  r  r  r  mooncake_slowdown_factorr2  r   r   first_promptis_multi_turnr  warmup_recordr  prompt_textr  r9   r8   test_requestr;   
test_inputwarmup_tasksrp   warmup_outputspd_profile_urlsr  benchmark_start_timetasks
pbar_totalrequest_generatorlora_idxweights
lora_probsrz   r  merged_extra_bodyrS   r  r$  server_info_jsonr  benchmark_durationr  r6  respresultoutput_file_namer  result_detailsfileresult_for_dumpr'   )r	  r  r  r(   r}    s  











 



&






	
 !"#$%&
1, 


r}  c              
   C   sP   zt j| dd}d|jv W S  ty' } ztd|  W Y d }~dS d }~ww )NTr  chat_templatez)Fail to load tokenizer config with error=F)r   r  init_kwargsr   r   )r  r&  r  r'   r'   r(   check_chat_template
  s   rT  args_c                 C   s   | a dS )zSet the global args.N)rv   )rU  r'   r'   r(   set_global_args
  s   rV  c              
   C   s  | a tt ds
d t _tt dsdt _tt dsdt _tt ds"dt _tt ds*dt _tt ds2dt _tt d	s:d
t _tt d	sBd
t _tt dsJdt _	tt dsRd t _
tt ddr_t jdks_J tdt   t  tt j tjt j i }t jrtt j}t jrt jdksJ dt jd u rdddddddddt jdt _t jrt j dn
dt j dt j d}t jdv rt jrt j dn
dt j dt j d}nt jdv rt jrt j dn
dt j dt j d}nt jdv rt jrt j d n
dt j dt j d }ndt jd!kr1t jrt j d"n
dt j dt j d"}t jd u r0td# td n9t jd$krOt jr>t jnt j dt j }t jpLd%t _nt jd&krjt jr_t j d'n
dt j dt j d'}t jd u rzdt j dt j nt j}t jd u rt jd&krtd( td ztj|t d)}| d*g }|r|d+ d, nd t _W n& ty } ztd-| d.|  td/ td W Y d }~nd }~ww t jd u rtd0 td tt jstd1 t j d2v rd3t _!t jrJ d4t j"d5v rt j#d urt$t j#dksJ d6t j%dks(J d7t j% d8tt  d9 t j}t j
p8t j}	t j&d urBt j&nt j}
t'|
}t(t ||	}tt d:sXdt _)t j#d urat j"nd }t j#d urrt j"d;krrt j%nd }t*+t,dLi d<|d=|d>|d?|	d@|dA|dBt j-dt jdCt j.dDt j#dE|dF|dG|dHt j/dIt j0d:t j)dt jdt jd	t jdt j	dJtt dJd dKtt dKd S )MNr  r  r   r7  Fr"  r  r  r8  r|   r2  served_model_namer   re  zbenchmark_args=rb  zE`--tokenize-prompt` only compatible with `--backend sglang` currentlyi0u  i%[  i@  i'  i  )rb  rc  rd  rh  rf  rj  rk  rl  z
/v1/modelszhttp://:)rb  rc  z	/generate)rd  rf  rh  z/v1/completions)re  rg  ri  z/v1/chat/completionsrj  z#/v2/models/ensemble/generate_streamz@Please provide a model using `--model` when using `trt` backend.rk  r   rl  z/v1/models/model:predictzmPlease provide a model with `--model` when using truss backend. e.g. --model meta-llama/Llama-3.1-8B-Instructr  r   r   idzFailed to fetch model from z	. Error: zEPlease specify the correct host and port using `--host` and `--port`.zDNo model specified or found. Please provide a model using `--model`.z
WARNING It is recommended to use the `Chat` or `Instruct` model for benchmarking.
Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.
r!  Tz5`--tokenize-prompt` not compatible with image dataset)r  r  znMore than 1 LoRA adapter must be specified via --lora-name to use 'distinct' or 'skewed' request distribution.z+Got invalid value for --lora-zipf-alpha of z. It must be greater than 1.r  r  r  r7  r7   r  r  r&  r]  r  r  r  r  r	  r=   r
  r  r   r   r'   )1rv   r  r  r  r7  r"  r  r  r8  r2  rW  ru   r7  r   
set_ulimitr+  r  r  r=   r   r   portrf   r  hostr:   r   exitr  ri   r   rT  rF  r*  r  r;   r]   r	  r&  r  ra  r  r  runr}  r  r  r
  r  )rU  r=   	model_urlr7   r  r   
model_listr  r7  r  tokenizer_idr&  r]  r  r	  r'   r'   r(   run_benchmark
  sn  











	


 $

	
rb    c              
   C   sl   t j}t |\}}|| k r4zt || |f W d S  ty3 } ztd|  W Y d }~d S d }~ww d S )NzFail to set RLIMIT_NOFILE: )resourceRLIMIT_NOFILE	getrlimit	setrlimitr\  r   )target_soft_limitresource_typecurrent_softcurrent_hardr  r'   r'   r(   rZ    s   rZ  c                   @   s   e Zd ZdddZdS )LoRAPathActionNc                 C   s.   t || jg  |D ]}t|| j| q	d S rT   )setattrdestru   r   )r  parser	namespacers  option_stringr;   r'   r'   r(   __call__  s   zLoRAPathAction.__call__rT   )r@   rA   rB   rr  r'   r'   r'   r(   rl    s    rl  __main__z(Benchmark the online serving throughput.)descriptionz	--backendz>Must specify a backend, depending on the LLM Inference Engine.)r   r   r   helpz
--base-urlz7Server or API base url if not using http host and port.)r   r   ru  z--hostz0.0.0.0zDefault host is 0.0.0.0.z--portznIf not set, the default port is configured according to its default value for different LLM Inference Engines.)r   ru  z--dataset-namer#  )	r#  rD  rE  r+  z
random-idsr9  r?  r1  r@  z$Name of the dataset to benchmark on.)r   r   r   ru  z--dataset-pathzPath to the dataset.z--modelzZName or path of the model. If not set, the default model will request /v1/models for conf.z--served-model-namezjThe name of the model as served by the serving service. If not set, this defaults to the value of --model.z--tokenizerz@Name or path of the tokenizer. If not set, using the model conf.z--num-promptsr  z.Number of prompts to process. Default is 1000.z--sharegpt-output-lenzVOutput length for each request. Overrides the output length from the ShareGPT dataset.z--sharegpt-context-lenzrThe context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.z--random-input-lenr  zKNumber of input tokens per request, used only for random and image dataset.z--random-output-lenzLNumber of output tokens per request, used only for random and image dataset.)r   r   ru  z--random-range-ratiorK   zVRange of sampled ratio of input/output length, used only for random and image dataset.z--image-countzDNumber of images per request (only available with the image dataset)z--image-resolutionr;  zwResolution of images for image dataset. Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920).z--random-image-count
store_truezEnable Random Image Count)r  ru  z--image-formatjpegz:Format of images for image dataset. Supports jpeg and png.z--image-contentr+  z@Content for images for image dataset. Supports random and blank.z--request-rater  zNumber of requests per second. If this is inf, then all the requests are sent at time 0. Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.z--use-trace-timestampsz]Use timestamps from the trace file for request scheduling. Only valid for 'mooncake' dataset.z--max-concurrencya  Maximum number of concurrent requests. This can be used to help simulate an environment where a higher level component is enforcing a maximum number of concurrent requests. While the --request-rate argument controls the rate at which requests are initiated, this argument will control how many are actually allowed to execute at a time. This means that when used in combination, the actual request rate may be lower than specified with --request-rate, if the server is not processing requests fast enough to keep up.z--output-filezOutput JSONL file name.z--output-detailszOutput details of benchmarking.z--print-requestszQPrint requests immediately during benchmarking. Useful to quickly realize issues.z--disable-tqdmz%Specify to disable tqdm progress bar.z--disable-streamzDisable streaming mode.z--return-logprobzReturn logprob.z--return-routed-expertszReturn routed experts.z--seedzThe random seed.z--disable-ignore-eoszDisable ignoring EOS.z--extra-request-bodyz${"key1": "value1", "key2": "value2"}z|Append given JSON object to the request payload. You can use this to specifyadditional generate params like sampling params.)metavarr   ru  z--apply-chat-templatezApply chat templatez	--profilezdUse Torch Profiler. The endpoint must be launched with SGLANG_TORCH_PROFILER_DIR to enable profiler.z--plot-throughputzTPlot throughput and concurrent requests over time. Requires termplotlib and gnuplot.z--profile-activities+CPUGPU)rz  r{  CUDA_PROFILER)r   nargsr   r   z--profile-num-steps)r   r   z--profile-by-stage)r  r   z--profile-stages)r}  r   z--profile-output-dirz$Output directory for profile traces.z--profile-prefixz#Prefix for profile trace filenames.z--lora-name*zaThe names of LoRA adapters. You can provide a list of names in the format {name} {name} {name}...)r   r}  r   r  ru  z--lora-request-distributionr  )r  r  r  uk  What distribution to sample the LoRA adapters specified in --lora-name. Borrowed from the Punica paper. 'distinct' distribution means selecting a new LoRA adapter for every request. 'skewed' distribution follows the Zipf distribution, where the number of requests to model i specified in --lora-name is α times the number of requests for model i+1, where α > 1.z--lora-zipf-alphag      ?zYThe parameter to use for the Zipf distribution when --lora-request-distribution='skewed'.z--prompt-suffixzSSuffix applied to the end of all user prompts, followed by assistant prompt suffix.z--pd-separatedz"Benchmark PD disaggregation serverz--profile-prefill-urla*  URL(s) of the prefill worker(s) for profiling in PD separated mode. Can specify multiple URLs: --profile-prefill-url http://localhost:30000 http://localhost:30001. NOTE: Cannot be used together with --profile-decode-url. In PD separated mode, prefill and decode workers must be profiled separately.)r   r}  r   ru  z--profile-decode-urla)  URL(s) of the decode worker(s) for profiling in PD separated mode. Can specify multiple URLs: --profile-decode-url http://localhost:30010 http://localhost:30011. NOTE: Cannot be used together with --profile-prefill-url. In PD separated mode, prefill and decode workers must be profiled separately.z--flush-cachez,Flush the cache before running the benchmarkz--warmup-requestsz5Number of warmup requests to run before the benchmarkz--tokenize-promptzYUse integer ids instead of string for inputs. Useful to control prompt lengths accuratelyz)generated-shared-prefix dataset argumentsz--gsp-num-groups@   zBNumber of system prompt groups for generated-shared-prefix datasetz--gsp-prompts-per-group   zMNumber of prompts per system prompt group for generated-shared-prefix datasetz--gsp-system-prompt-leni   zMTarget length in tokens for system prompts in generated-shared-prefix datasetz--gsp-question-lenr  zHTarget length in tokens for questions in generated-shared-prefix datasetz--gsp-output-lenr  zFTarget length in tokens for outputs in generated-shared-prefix datasetz--gsp-range-ratiozIRange of sampled ratio of input/output length, used only for gsp dataset.z--gsp-fast-preparezSpeedup preparing by removing statistics computation, which will make some output statistics inaccurate but suitable for pressure tests.z--gsp-send-routing-keyztSend routing key in requests via X-SMG-Routing-Key header. Requests with the same prefix share the same routing key.z--gsp-num-turnsz}Number of turns for multi-turn conversations. If > 1, each prompt becomes a list of questions sharing the same system prefix.z--gsp-orderedzUKeep requests in order without shuffling. By default, requests are shuffled randomly.zmooncake dataset argumentsz--mooncake-slowdown-factorzSlowdown factor for replaying the mooncake trace. A value of 2.0 means the replay is twice as slow. NOTE: --request-rate is IGNORED in mooncake mode.z--mooncake-num-roundszNumber of conversation rounds for each session in the mooncake dataset. A value > 1 will enable true multi-turn session benchmarking.z--mooncake-workloadr  z-Underlying workload for the mooncake dataset.z--tagzThe tag to be dumped to output.z--headerzkCustom HTTP headers in Key=Value format. Example: --header MyHeader=MY_VALUE MyAnotherHeader=myanothervalue)r   rT   )r  )rb  NT)NNrH   F)TT)F)Fr|   )NF)FFr   Fr|   r   NN)rc  )__doc__argparser  r  importlib.util	importlibr  r   r#   r  r+  rd  shutilr   r   r   r   rf  r   r   dataclassesr   r   r   r   	functoolsr   r	   pathlibr
   typingr   r   r   r   r   r   r   r   r.   numpyr  r  r  datasetsr   PILr   tqdm.asyncior   transformersr   r   r   r   r   r  r   util	find_specwhichr  rC   rW   r)   r4   r5   rG   r^   ra   ri   rs   ry   r   r   r   r   r   r   r   r   r	  r  r  r!  ra  r-  rm  r  r  rX  r  rW  r  r  rF   rE   r  rU  rG  r[  rZ  r/  rJ  rD  rX   r  rN  ru  rz  ri  r  	NamespacerO  r  r  r  r  r}  rT  rV  rb  rZ  Actionrl  r@   ro  add_argumentkeysadd_mutually_exclusive_groupprofile_url_groupadd_argument_groupr  mooncake_group
parse_argsrv   r'   r'   r'   r(   <module>   s  	(
I
q
 7
R
h
/


"



f%
"

O
[
c
S
`	
q
U	

y
		
 

.

 00	




   S	 Q


	


   