o
    پi                  5   @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZmZ d dlmZmZmZ d dlmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z( dZ)de*dee+ fddZ,dee+ dee+ dee- fddZ.ej/G dd dZ0G dd deZ1dede"fddZ2dede"fddZ3	 		!	dLde*d"eee4  d#e4d$e-d%e*d&ee d'e*d(ee* fd)d*Z5d+e0j6d+e0j7e0j8e0j9e0j:d+e0j;d!de0j<e0j=e0j>e0j?fde*d,e4d#e4d-e4d.e-d/e@d0e4d1e-d2e*d3e*d4eeB d5e@d6e4d7e@d8ee* d9ee* d%e*d:e*d;e@d$e-d'e*d(ee* d<e4d=e4d>e4d?e4f4d@dAZAdBdC ZBdDdE ZCdFee1 dGe0de"fdHdIZDe fde"dGe0defdJdKZEdS )M    N)SimpleNamespace)CallableListOptionalTuple)	BaseModel)tabulate)AutoProcessorPreTrainedTokenizer)get_datasetget_processorget_tokenizer)run_profile)launch_server)
ServerArgs)is_blackwellkill_process_tree)is_in_ciwrite_github_step_summaryiX  urlreturnc              
   C   s   zLt j| d dd}|  d}d}|jdD ]/}|dr1td|}|r0|t|	d7 }q|d	rGtd
|}|rG|t|	d7 }q||fW S  t
yf } ztd|  W Y d}~dS d}~ww )z
    Get cached_tokens_total and prompt_tokens_total from Prometheus /metrics endpoint.
    Returns (cached_tokens_total, prompt_tokens_total) or None if metrics are not available.
    z/metrics   timeout        
zsglang:cached_tokens_total{z2sglang:cached_tokens_total\{[^}]*\}\s+([\d.eE+-]+)   zsglang:prompt_tokens_total{z2sglang:prompt_tokens_total\{[^}]*\}\s+([\d.eE+-]+)z2Warning: Failed to get cache tokens from metrics: N)requestsgetraise_for_statustextsplit
startswithresearchfloatgroup	Exceptionprint)r   responsecached_tokens_totalprompt_tokens_totallinematche r/   _/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/bench_one_batch_server_internal.pyget_cache_tokens_from_metrics!   s4   


r1   beforeafterc                 C   sH   | du s|du r
dS |d | d  }|d | d  }|dkr"|| S dS )z
    Calculate cache hit rate from before/after metrics snapshots.
    Returns cached_tokens_delta / prompt_tokens_delta for the benchmark run.
    Nr   r   r/   )r2   r3   cached_deltaprompt_deltar/   r/   r0   calculate_cache_hit_rateD   s   r6   c                   @   s  e Zd ZU dZeed< dZee ed< dZ	ee ed< dZ
ee ed< d	Zeed
< dZeed< dZeed< d	Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZeed< dZeed< dZeed< dZeed < d!Zeed"< d#Zeed$< dZ eed%< d&Z!eed'< dZ"ee ed(< d)Z#eed*< d+Z$eed,< d	Z%eed-< d.Z&eed/< dZ'ee(e  ed0< e)d1e*j+fd2d3Z,e-d4e*j.fd5d6Z/dS )7	BenchArgsdefaultrun_name)r   
batch_size)   	input_len)   
output_lenr   temperatureFreturn_logprobr   client_stream_intervalinput_len_step_percentage base_urlskip_warmupshow_reportprofiler   profile_stepsprofile_by_stageNprofile_prefixprofile_output_dirdataset_pathrandomdataset_namegsp_num_groupsi   gsp_system_prompt_len   gsp_question_len   gsp_output_lenparallel_batchzresult.jsonlresult_filenamepydantic_result_filenameTappend_to_github_summary*   seedcache_hit_ratesglangbackendserver_args_for_metricsparserc                 C   s0  | j dttjd | j dtdtjd | j dtdtjd | j dtdtjd | j dttj	d | j d	d
d | j dttj
d | j dttjd | j dttjd | j dd
d | j dd
d | j dd
d | j dttjd | j dd
d | j dttjd | j dttjd | j dttjdd | j dttjg ddd | j dttjdd | j dttjd d | j d!ttjd"d | j d#ttjd$d | j d%d
d | j d&ttjd'd | j d(ttjd)d | j d*d+d,d-d. | j d/ttjd | j d0ttjd1d | j d2ttjd3d4gd5d | j d6td7d d8d9 d S ):Nz
--run-name)typer8   z--batch-size+)r`   nargsr8   z--input-lenz--output-lenz--temperaturez--return-logprob
store_true)actionz--client-stream-intervalz--input-len-step-percentagez
--base-urlz--skip-warmupz--show-reportz	--profilez--profile-stepsz--profile-by-stagez--profile-prefixz--profile-output-dirz--dataset-pathzPath to the dataset.)r`   r8   helpz--dataset-name)mmmurM   generated-shared-prefixz$Name of the dataset to benchmark on.)r`   r8   choicesre   z--gsp-num-groupszRNumber of shared prefix groups. batch_size requests are distributed across groups.z--gsp-system-prompt-lenz7Length of the shared system prompt in tokens per group.z--gsp-question-lenz;Length of the unique question suffix in tokens per request.z--gsp-output-lenz=Output length in tokens for generated-shared-prefix requests.z--parallel-batchz--result-filenamezDStore the results line by line in the JSON Line format to this file.z--pydantic-result-filenamezEStore the results as pydantic models in the JSON format to this file.z--no-append-to-github-summarystore_falserX   z=Disable appending the output of this run to github ci summary)rd   destre   z--seedz--cache-hit-ratezzCache hit rate for benchmarking (0.0-1.0). 0.0 means no cache hits (flush all), 0.4 means 40%% of input tokens are cached.z	--backendr\   vllmz%Backend server type (sglang or vllm).z--server-args-for-metrics*zRServer launch arguments to record in metrics output (for tracking configurations).)r`   rb   r8   re   )add_argumentstrr7   r9   intr:   r<   r>   r%   r?   rA   rB   rD   rH   rJ   rK   rL   rN   rO   rP   rR   rT   rV   rW   rZ   r[   r]   )r_   r/   r/   r0   add_cli_argsw   s   



zBenchArgs.add_cli_argsargsc                    s0   dd t | D }| di  fdd|D S )Nc                 S      g | ]}|j qS r/   )name.0attrr/   r/   r0   
<listcomp>       z+BenchArgs.from_cli_args.<locals>.<listcomp>c                    s   i | ]}|t  |qS r/   )getattrrt   rq   r/   r0   
<dictcomp>       z+BenchArgs.from_cli_args.<locals>.<dictcomp>r/   )dataclassesfields)clsrq   attrsr/   rz   r0   from_cli_args   s   zBenchArgs.from_cli_args)0__name__
__module____qualname__r9   rn   __annotations__r:   r   ro   r<   r>   r?   r%   r@   boolrA   rB   rD   rE   rF   rG   rH   rI   rJ   r   rK   rL   rN   rO   rP   rR   rT   rU   rV   rW   rX   rZ   r[   r]   r^   r   staticmethodargparseArgumentParserrp   classmethod	Namespacer   r/   r/   r/   r0   r7   V   sF   
 xr7   c                   @   s   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< eed
< eed< dZee ed< dZ	ee ed< defddZ
dS )BenchOneCaseResultr9   r:   r<   r>   latencyinput_throughputoutput_throughputoverall_throughput	last_ttftlast_gen_throughput
acc_lengthNr[   profile_linkrV   c                 C   s   t |dM}| j| j| j| jt| jdt| jdt| jdt| j	dt| j
dt| jdt| jd| jd ur<t| jdnd d}|t|d  W d    d S 1 sUw   Y  d S )Na      )r9   r:   r<   r>   r   r   r   r   r   r   r   r[   r   )openr9   r:   r<   r>   roundr   r   r   r   r   r   r   r[   writejsondumps)selfrV   foutresr/   r/   r0   dump_to_jsonl  s$   







"z BenchOneCaseResult.dump_to_jsonl)r   r   r   rn   r   ro   r%   r[   r   r   r   r/   r/   r/   r0   r      s   
 r   launch_server_funcserver_argsc              
   C   sT   z z| | W n t y } z|d }~ww W tt dd d S tt dd w )NF)include_parent)r'   r   osgetpid)r   r   r.   r/   r/   r0   launch_server_internal  s   &r   c                 C   s   t jt| |fd}|  d|j d|j }t }t | tk rVzddi}tj	| d|td}|j
dkr=||fW S W n
 tjyH   Y nw td	 t | tk s#td
)N)targetrq   zhttp://:zContent-Typezapplication/json; charset=utf-8
/v1/models)headersr      
   z1Server failed to start within the timeout period.)multiprocessingProcessr   starthostporttimeDEFAULT_TIMEOUTr   r   status_codeRequestExceptionsleepTimeoutError)r   r   procrD   
start_timer   r)   r/   r/   r0   launch_server_process%  s2   


r   rM   r\   	input_idsr<   r[   rN   
image_datar]   
model_namec                    s   t ||   dkrdS td|d dd  d  fdd	|D }|d
kr4||dddd}	| d }
n|dddddd}	|dkrJ|durJ||	d< | d }
tj|
|	td}|  td dS )a  Warm up the cache by sending prefix tokens to populate the radix/prefix cache.

    Args:
        url: Server URL
        input_ids: List of input token id lists
        input_len: Length of input tokens
        cache_hit_rate: Fraction of input tokens to cache (0.0-1.0)
        dataset_name: Name of the dataset (used to determine if image data should be included)
        image_data: Optional image data for VLM models
        backend: Backend server type ("sglang" or "vllm")
        model_name: Model name (required for vllm backend)
    r   NzWarming up cache with d   .1fz% hit rate (z tokens per request)c                    s   g | ]}|d   qS Nr/   ru   idscached_token_lenr/   r0   rw   `  r|   z!_warmup_cache.<locals>.<listcomp>rk   r   r   F)modelprompt
max_tokensr?   stream/v1/completionsT)r?   max_new_tokens
ignore_eos)r   sampling_paramsr   rf   r   	/generate)r   r   zCache warmup completed)ro   r(   r   postr   r   )r   r   r<   r[   rN   r   r]   r   cache_warmup_input_idscache_warmup_payloadgen_urlwarmup_responser/   r   r0   _warmup_cacheA  sD   
	r   Fr:   r>   r?   r@   stream_intervalrB   r9   rV   	tokenizerrG   rH   rI   rJ   rK   rL   rU   rO   rP   rR   rT   c           =         s  |dkrt j| d td}|  nt j| d td}|  d}||vr/td| d| t||}t||||d||d	v|tj||| d
 | |||d}t	d |p\t	 dd }t
||d}|dkr|d | }fdd|D } tdd | D t|  }|}d }!n|dkr fdd|D } dd |D }!n	dd |D } d }!|dkr|| ||ddd}"|rd
|"d< | d }#n?d}$|$rg }%t|D ]}&|%dd d  qd }'nd }'||d|'|d!|dd"|rd#|ini }"| |"d$< |!d ur|!|"d%< | d& }#|d'krt| | ||||!||d( d }(|r t| |d)d*g|||d+}(t| })t }*t j|#|"dtd,}|  d'}+|dkrt },|jdd-D ]W}-|-d.}-|-r|-d/r|-d0d   }.|.d1krd n9t|.}/d2|/v rvtd3|/ d4|/d5g D ]}0|0d6 }1|1|,vr|,|1 t|,|krt |* }+q|qEn]|jdd-D ]V}-|-d.}-|-r|-d/r|-d7kr n@t|-d0d  d8}/d2|/v rtd3|/ d4|/d9 d: d u s|/d9 d: d; d<ksJ |/d9 d= d
krt |* }+qt |* }2|| |+ }3|| |2|+  }4|||  |2 }5|dkr!d>}6d>}7n.t j| d? td}|  | }8|8d@i g}9|9dA dBd pCd>}6|9dA dCd pNd>}7t| }:t|)|:};tdD|  tdE|  tdF|  tdG|2dHdI tdJ|3dHdK |d
krtdL|4dHdK tdM|+dHdI tdN|6dHdK |7dAkrtdO|7dHdP |;d urtdQ|;dR t |||||2|3|4|5|+|6|7|;|(dS}<|	r|<!|	 |<S )TNrk   z/reset_prefix_cacher   z/flush_cache)rM   rf   rg   z)Unsupported dataset for batch benchmark: z. Supported: g      ?)rf   rg   r   )rN   num_promptsrandom_input_lenrandom_output_lenrandom_range_ratiorL   tokenize_promptr]   rZ   rO   gsp_prompts_per_grouprP   rR   rT   r   name_or_path)model_idrg   c                       g | ]}  |jqS r/   encoder   ru   req)r   r/   r0   rw     r|   z run_one_case.<locals>.<listcomp>c                 s   s    | ]}t |V  qd S r   )lenr   r/   r/   r0   	<genexpr>  s    zrun_one_case.<locals>.<genexpr>rf   c                    r   r/   r   r   )	tok_innerr/   r0   rw     r|   c                 S   rr   r/   )r   r   r/   r/   r0   rw     rx   c                 S   rr   r/   )r   r   r/   r/   r0   rw     rx   T)r   r   r   r?   r   r   logprobsr   FzHuman: What is the capital city of france? can you give as many trivial information as possible about that city? answer in json.
2   z
Assistant:z$$ANY$$)r?   r   r   json_schemar   )r   r@   r   rU   r   r   r   r   )r   r   r<   r[   rN   r   r]   r   CPUGPU)r   	num_steps
activities
output_dirrI   rJ   )r   r   r   )decode_unicodezutf-8zdata:r   z[DONE]errorzRequest has failed. .rh   indexzdata: [DONE]r   	meta_infofinish_reasonr`   lengthcompletion_tokens/get_server_infointernal_statesr   r   avg_spec_accept_lengthzbatch size: zinput_len: zoutput_len: z	latency: .2fz szinput throughput: z tok/szoutput throughput: zlast_ttft: zlast generation throughput: zacc_length:  zcache hit rate: .4f)r9   r:   r<   r>   r   r   r   r   r   r   r   r[   r   )"r   r   r   r   
ValueErrorminr   r7   rZ   ry   r   sumr   rangeappendr   r   r1   r   perf_counterset
iter_linesdecoder"   stripr   loadsRuntimeErrorr   addr6   r(   r   r   )=r   r:   r<   r>   r?   r@   r   rB   r9   rV   r   rG   rH   rI   rJ   rK   rN   rL   rU   r[   r]   r   rO   rP   rR   rT   r)   supported_datasetsactual_gsp_groupsdataset_argsdataset_model_idinput_requestsr   r   payloadr   use_structured_outputstexts_r   r   metrics_beforeticr   first_token_indiceschunkdata_strdatachoiceidxr   r   r   r   r   r   server_infointernal_statemetrics_aftermetrics_cache_hit_rateresultr/   )r   r   r0   run_one_case  s|  






















r   c                 C   sP   | ||  |kr&t dd| d|d|d| ||   d|d d  dS d	S )
N========Skip benchmark batch_size=z * (input_len=z + output_len=z) = z! > skip_token_capacity_threshold=z due to kv cache limit.TFr(   )r:   r<   r>   skip_token_capacity_thresholdr/   r/   r0   !should_skip_due_to_token_capacity  s   *r%  c                 C   s.   | |krt dd| d|d d  dS dS )Nr!  r"  z' > skip_max_running_requests_threshold=z# due to max running requests limit.TFr#  )r:   #skip_max_running_requests_thresholdr/   r/   r0   'should_skip_due_to_max_running_requests  s   r'  results
bench_argsc                 C   s  d|j  d|j d}|jdkr|d|jd dd7 }|d	7 }t r&d
}nd}d}| jdd d g }g d}|jr@|d | D ]s}||j }	|jdkrS|jdnd}
d|j	 |j
 }d|j|  d |	 }d|j
 d |	 }|jd ur{|jdnd}|j	|j |jd|jd|j
d|
|d|d|d|g
}|jr|jr|d|j d n|d || qB|t||dd7 }|d	7 }|S )Nz
Input lens: z. Output lens: r   r   z Cache hit rate: r   r   z%.r   r   r   gffffff?c                 S   s   | j S r   )r<   )xr/   r/   r0   <lambda>  s    z$get_report_summary.<locals>.<lambda>)key)
z
batch sizez	input lenzlatency (s)zinput throughput (tok/s)zoutput throughput (tok/s)z
acc lengthzITL (ms)zinput cost ($/1M)zoutput cost ($/1M)zcache hit raterG   r   r   zn/ai  g    .Ai  r   z
[Profile]()github)r   tablefmt)r<   r>   r[   r   sortrG   r  tp_sizer   r:   r   r   r   r   r   )r(  r)  r   summaryhourly_cost_per_gpu
input_utilrowsr   r   hourly_costaccept_lengthitl_ms
input_costoutput_costr[   rowr/   r/   r0   get_report_summary  sR   



r<  c                 C   s  t |j tj |j |jrd |j}}nt|| \}}|jdkrztd| d tj|d t	d}|
  | dg }|sFtd|d d	 }td
|  td| d |jdkret|}nt|}td d|i}	td}
td}nhd }tj|d t	d}|
  | }	d|	v r|	d }nd|	v r|	d d d }|jdkrt|}nt|}|	di g}|d di dd}
|d dd}|	dd pd}|dksJ d||| }t|j|j|j|jd}td| td| td| td |
 |jsJtt|j}td! td"|  |D ]#}t|f|d#d$|j|j|j|jd%d%||j|j|j |j|d&| q"td' g }g }zt!"|j|j#|j$D ]=\}}}t%||smt&||||
roqZ|'t||||f|j|j|j|j|j(|j)||j|j|j |j*|j|d(| qZ|j+rLzt!"|j|j#|j$D ]w\}}}t%||st&||||
rq|j,pd%d)| d*|  }|'t||||fi d+|jd,|jd-|jd.|jd/|j(d0|j)d1|d2|jd3|jd4|j d5|j*d6|j+d7|j-d8|j.d9|d:|j/d;|jd|| qW n t0y: } ztd<|  W Y d }~nd }~ww t1||d=d>D ]	\}}|j2|_2qBW |rUt3|j4 n
|r_t3|j4 w w td?|j)  |j5sp||	fS t6||| }t| t7 r|j8rt9| ||	fS )@Nrk   zConnecting to vLLM server at z...r   r   r  z-No models found on vLLM server via /v1/modelsr   idzFound model: zLoading tokenizer for rf   zTokenizer loaded.r   infr   tokenizer_pathprefillr   memory_usagetoken_capacityi ʚ;%effective_max_running_requests_per_dpr   dp_sizer   zNeffective_max_running_requests_per_dp is not set, max_running_requests_per_dp=)rO   rP   rR   rT   zmax_running_requests_per_dp=zdp_size=z$skip_max_running_requests_threshold=zskip_token_capacity_threshold=z======== Warmup Begin ========zWarmup with batch_size=r;   r=   rC   )r:   r<   r>   r?   r@   r   rB   r9   rV   r   rN   rL   rU   r]   r   z======== Warmup End   ========
)r?   r@   r   rB   r9   rV   r   rN   rL   rU   r[   r]   r   zbs-z-il-r?   r@   r   rB   r9   rV   r   rN   rL   rU   r[   rG   rH   rI   rJ   rK   r]   z8Error profiling, some profile traces may not be dumped: F)strictz
Results are saved to ):rM   rZ   nprD   r   r]   r(   r   r   r   r   r   r  rN   r   r   r%   dictrO   rP   rR   rT   rE   listr  r:   r   r?   r@   rA   rB   rL   rU   	itertoolsproductr<   r>   r'  r%  r  r9   rV   r[   rG   rJ   rH   rI   rK   r'   zipr   r   pidrF   r<  r   rX   r   )r   r)  r   r   rD   r)   
model_listr   r   r  r$  r&  r?  r  max_running_requests_per_dprD  
gsp_kwargsbatch_size_uniquebsr(  profile_resultsilolrJ   r.   r   profile_resr2  r/   r/   r0   run_benchmark_internal  s  









	
'
rV  )rM   Nr\   N)Fr   r}   rI  r   r   r   rM   r#   r   typesr   typingr   r   r   r   numpyrF  r   pydanticr   r   transformersr	   r
   sglang.bench_servingr   r   r   sglang.profilerr   "sglang.srt.entrypoints.http_serverr   sglang.srt.server_argsr   sglang.srt.utilsr   r   sglang.test.test_utilsr   r   r   rn   tupler1   r%   r6   	dataclassr7   r   r   r   ro   r   rH   rJ   rK   rN   rL   r[   rO   rP   rR   rT   r   r   r%  r'  r<  rV  r/   r/   r/   r0   <module>   s"   #
  &	!

N	

  
H