o
    پi"o                  
   @   s"  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
mZ ddlmZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z: ddl;m<Z< ej=j>j?gdd e3 ej=j>j@fe4 ej=j>jAffD  ZBdeCfddZDeCdddfddZEejFG dd dZGdd ZHdd  ZId!d" ZJ	dEd#d$ZKG d%d& d&eZLejMd'd( ZNejMd)d* ZOd+e"fd,d-ZPd.d/ ZQd0d1 ZRd2d3 ZSd4d5 ZTd6d7 ZUd8d9 ZVd:d; ZWd<d= ZXd>d? ZYeZd@kre[ Z\e-]e\ eG]e\ e\^ Z_e-`e_ZaeG`e_Zbejcedeeajef dAdB zeYeaeb W eajgdCkr}e5eh ddD dS dS eajgdCkre5eh ddD w w dS )Fa	  
Benchmark the latency of running a single static batch without a server.

This script does not launch a server and uses the low-level APIs.
It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).

# Usage (latency test)
## with dummy weights:
python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
## sweep through multiple data points and store (append) the results in a jsonl file:
python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
## run with profiling:
python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
## run with profiling to custom directory:
export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile
## run with CUDA profiler (nsys):
nsys profile --force-overwrite=true -o bench_one_batch python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile --profile-activities CUDA_PROFILER
# Usage (correctness test):
python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct

## Reference output (of the correctness test above, can be gpu dependent):
input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]

prefill logits (first half): tensor([[-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
        [-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
        [ -9.1875, -10.2500,   2.7129,  ...,  -4.3359,  -4.0664,  -4.1328]],
       device='cuda:0')

prefill logits (final): tensor([[-8.3125, -7.1172,  3.3457,  ..., -4.9570, -4.1328, -3.4141],
        [-8.9141, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0781],
        [-9.6328, -9.0547,  4.0195,  ..., -5.3047, -4.7148, -4.4570]],
       device='cuda:0')

========== Prompt 0 ==========
<s> The capital of France is Paris.
The capital of the United States is Washington, D.C.


========== Prompt 1 ==========
<s> The capital of the United Kindom is London.
The capital of the United Kingdom is London.
The capital of the

========== Prompt 2 ==========
<s> Today is a sunny day and I like to go for a walk in the park.
I'm going to the park
    N)SimpleNamespace)Tuple)ModelConfig)destroy_distributed_environment)_set_envs_and_config)initialize_moe_config)initialize_fp4_gemm_config)initialize_fp8_gemm_config)ReqScheduleBatch)prepare_mlp_sync_batch_raw)ForwardBatch)ModelRunner)SamplingParams)PortArgs
ServerArgs)SpeculativeAlgorithm)
configure_loggerget_bool_env_varis_cuda_alikeis_xpukill_process_treemaybe_reindex_device_idrequire_mlp_syncrequire_mlp_tp_gatherset_gpu_proc_affinitysuppress_other_loggers)get_tokenizerc                 C   s   g | ]\}}|r|qS  r   ).0	availableprofiler_activityr   r   J/home/ubuntu/.local/lib/python3.10/site-packages/sglang/bench_one_batch.py
<listcomp>]   s    r#   Fc              
   C   s   d| v r-zt j   |d W dS  ty, } z|d|  W Y d}~dS d}~ww g }d| v r;|t jjj d| v rG|t jjj	 |rXt jj
|d|d}|  |S dS )	zt
    Abstracted function to start profiling based on profile_activities.
    Returns profiler object (or None).
    CUDA_PROFILERz1CUDA Profiler started (nsys will begin capturing)zFailed to start CUDA profiler: NCPUGPUT)
activities
with_stackrecord_shapes)torchcudacudartcudaProfilerStart	ExceptionappendprofilerProfilerActivityr%   CUDAprofilestart)profile_activitiesprofile_record_shapes
rank_printer'   r0   r   r   r"   start_profileg   s0   
r9   c              
   C   s   d|v r+zt j   |d W n! ty* } z|d|  W Y d}~nd}~ww | dur3|   |ra| durS|rSt| | |rGd| nd}|d| d|  d|v rc|d	| d
 dS dS dS )z
    Abstracted function to stop profiling based on profile_activities.
    Optionally saves trace results and prints completion messages.
    r$   z/CUDA Profiler stopped (nsys should dump traces)zFailed to stop CUDA profiler: Nzfor  ztorch profiler chrome trace z
 saved to zCUDA profiler trace for z
 completed)r*   r+   r,   cudaProfilerStopr.   stop_save_profile_trace_results)r0   r5   r7   
save_tracetrace_filenamestager8   
stage_descr   r   r"   stop_profile   s,   
rB   c                   @   s   e Zd ZU dZeed< dZee ed< dZ	ee ed< dZ
ee ed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZeed< dZeed< edejfddZedejfddZd S )!	BenchArgsdefaultrun_name)   
batch_size)i   	input_len)   
output_lenr:   prompt_filenamezresult.jsonlresult_filenameFcorrectness_test   cut_lenr   log_decode_stepr3   r6   r%   r&   r5   allprofile_stageprofile_filename_prefixparserc                 C   s  | j dttjd | j dtdtjd | j dtdtjd | j dtdtjd | j dttjd | j d	ttj	d | j d
dd | j dttj
d | j dttjdd | j dddd | j dddd | j dtdddgg ddd | j dttjg ddd | j d ttjd!d d S )"Nz
--run-name)typerD   z--batch-size+)rV   nargsrD   z--input-lenz--output-lenz--prompt-filenamez--result-filenamez--correctness-test
store_true)actionz	--cut-lenz--log-decode-stepz>Log decode latency by step, default is set to zero to disable.)rV   rD   helpz	--profilezEnable profiling.)rZ   r[   z--profile-record-shapesz*Record tensor shapes in profiling results.z--profile-activitiesr%   r&   )r%   r&   r$   zrProfiler activities: CPU, GPU, CUDA_PROFILER. If CPU/GPU, use torch profiler. If CUDA_PROFILER, use CUDA profiler.)rV   rX   rD   choicesr[   z--profile-stage)rR   prefilldecodez5Which stage to profile: all, prefill, or decode only.)rV   rD   r\   r[   z--profile-filename-prefixzPrefix of the profiling file names. The full profiling result file(s) be "[profile_filename_prefix]_batch[batch_size]_input[input_len]_output[output_len].trace.json.gz")add_argumentstrrC   rE   intrG   rH   rJ   rK   rL   rO   rP   rS   rT   )rU   r   r   r"   add_cli_args   sf   



zBenchArgs.add_cli_argsargsc                    s0   dd t | D }| di  fdd|D S )Nc                 S   s   g | ]
}|j t|jfqS r   )namerV   rD   )r   attrr   r   r"   r#      s    z+BenchArgs.from_cli_args.<locals>.<listcomp>c                    s    i | ]\}}||t  |qS r   )getattr)r   re   	attr_typerc   r   r"   
<dictcomp>   s     z+BenchArgs.from_cli_args.<locals>.<dictcomp>r   )dataclassesfields)clsrc   attrsr   rh   r"   from_cli_args   s   zBenchArgs.from_cli_argsN)__name__
__module____qualname__rE   r`   __annotations__rG   r   ra   rH   rJ   rK   rL   rM   boolrO   rP   r3   r6   r5   rS   rT   staticmethodargparseArgumentParserrb   classmethod	Namespacern   r   r   r   r"   rC      s&   
 6rC   c           	      C   s   t   |dkr	tndd }|| j| j  }t| }t|| j||| j|| jdd|j| d}|d|j	  t
| j| j| jd}| jdkrHt  ||fS )Nr   c                  _      d S Nr   rc   kwargsr   r   r"   <lambda>       zload_model.<locals>.<lambda>rF   )model_configmem_fraction_staticgpu_idtp_ranktp_sizemoe_ep_rankmoe_ep_sizepp_rankpp_size	nccl_portserver_argszmax_total_num_tokens=)tokenizer_modetrust_remote_code)r   printr   ep_sizer   from_server_argsr   r   r   max_total_num_tokensr   tokenizer_pathr   r   distbarrier)	r   	port_argsr   r   r7   r   r   model_runner	tokenizerr   r   r"   
load_model   s4   

r   c           
         s   |r|ng d} fdd|D }t dtjd}g }tt|D ]9}t|| | jks-J || d | j }t||| ||d}	|	j|	_d|	_	|	
t|	jt|	j  ||	 q ||fS )N)zThe capital of France isz#The capital of the United Kindom iszToday is a sunny day and I likec                    s   g | ]}  |qS r   )encoder   pr   r   r"   r#     s    z7prepare_inputs_for_correctness_test.<locals>.<listcomp>r   temperaturemax_new_tokensridorigin_input_textorigin_input_idssampling_params)r   rC   rJ   rangelenrO   r
   r   fill_idslogprob_start_lenset_extend_input_lenprefix_indicesr/   )

bench_argsr   custom_promptsprompts	input_idsr   reqsitmp_input_idsreqr   r   r"   #prepare_inputs_for_correctness_test  s0   	r   c                 C   s|   t t|D ]5}|| }| j|| | jd  7  _|jj|d | jf |jj|_d|_	|
t|jt|j  q|S )Nr   )r   r   r   rO   req_to_token_poolreq_to_tokentor   dtyper   r   )r   r   r   r   r   r   r   r   r"   *prepare_extend_inputs_for_correctness_test8  s   
r   c                 C   s   |r|nt jjdd| |ft jd}tdtjd}g }tt|D ]'}t	|dt
|| |d}|j|_d|_|t|jt|j  || q |S )Nr   i'  )r   r   r:   r   r   )nprandomrandintint32r   rC   rJ   r   r   r
   listr   r   r   r   r   r/   )rG   rH   custom_inputsr   r   r   r   r   r   r   r"   )prepare_synthetic_inputs_for_latency_testF  s*   
r   c                   @   sD   e Zd ZdefddZdefddZdefddZdefdd	Zd
S )TreeCacheNamespacereturnc                 C      dS NFr   selfr   r   r"   supports_swad     zTreeCacheNamespace.supports_swac                 C   r   r   r   r   r   r   r"   supports_mambag  r   z!TreeCacheNamespace.supports_mambac                 C   r   r   r   r   r   r   r"   is_chunk_cachej  r   z!TreeCacheNamespace.is_chunk_cachec                 C   s
   |    S rz   )r   r   r   r   r"   is_tree_cachem  s   
z TreeCacheNamespace.is_tree_cacheN)ro   rp   rq   rs   r   r   r   r   r   r   r   r"   r   c  s
    r   c              	   C   s   t |jj|j|jd}tj| |j|j||jdt	j
d}|  t|| | }t||}||j}|||}||j|fS )N)	page_sizedevicetoken_to_kv_pool_allocatorF)r   r   r   
tree_cacher   enable_overlapspec_algorithm)r   r   r   r   r   r   init_newr   r   r   NONEprepare_for_extend_maybe_prepare_mlp_sync_batchget_model_worker_batchr   forwardlogits_outputsamplenext_token_logits)r   r   dummy_tree_cachebatchmodel_worker_batchforward_batchr   next_token_idsr   r   r"   extendq  s*   	
r   c                 C   sN   | |_ |  t|| | }t||}||j}|||}||j	fS rz   )

output_idsprepare_for_decoder   r   r   r   r   r   r   r   )input_token_idsr   r   r   r   r   r   r   r   r"   r^     s   

r^   r   c                 C   sB   t |jrt| |jjd|jd |jjt|j|jjt d	 d S d S )NrF   )dp_sizeattn_tp_sizetp_groupget_idle_batchdisable_cuda_graphr   disable_overlap_scheduleoffload_tags)	r   r   r   r   r   r   r   r   set)r   r   r   r   r"   r     s   

r   c                 C   s^   | sg S t j| s|d|  d g S t| d}| W  d   S 1 s(w   Y  dS )zCRead custom prompts from the file specified by `--prompt-filename`.zCustom prompt file z# not found. Using default inputs...rN)ospathexistsopen	readlines)prompt_filer7   pfr   r   r"   _read_prompts_from_file  s   
$r   c                   C   s   t jddS )NSGLANG_TORCH_PROFILER_DIRz/tmp)r   environgetr   r   r   r"   _get_torch_profiler_output_dir  s   r   c              
   C   s6   t  }|  d| d| d| d| d
}tj||S )N_batch_input_output_z.trace.json.gz)r   r   r   join)rT   rG   rH   rJ   r@   
output_dirfilenamer   r   r"   _create_torch_profiler_filename  s   "r   c                 C   sH   t jt j|}t j|dd | | t| jddjdd d S )NT)exist_ok)group_by_input_shapeself_cpu_time_total)sort_by)	r   r   dirnameabspathmakedirsexport_chrome_tracer   key_averagestable)r0   r   
parent_dirr   r   r"   r=     s   
r=   c                    sf  t | d| d |dkrtndd }t| |||\}}t|j|}t|||\ }	|d d |jdkrGt|	|\}
}|d|
 d	 t| |	|}	t|	|\}
}|d
|
 d	  fddt	t
 D }t	|jd d D ] }t||\} }t	t
|	D ]}|| ||  qqut	t
|	D ]}|d| d |||| d qd S )N TPprefixr   c                  _   ry   rz   r   r{   r   r   r"   r}     r~   z"correctness_test.<locals>.<lambda>z
input_ids=
zprefill logits (first half): z 
zprefill logits (final): c                    s   g | ]} | | g qS r   r   )r   r   r   r   r   r"   r#     s    z$correctness_test.<locals>.<listcomp>rF   z========== Prompt z ==========)r   r   r   r   rK   r   rO   r   r   r   r   rJ   r^   tolistr/   )r   r   r   r   r   r7   r   r   r   r   r   r   r   r   next_token_ids_listr   r   r  r"   rM     s6   
rM   c                 C   s   t |   d S rz   )r*   get_device_modulesynchronize)r   r   r   r"   r    s   r  c           "      C   s  |j ||  }||kr|d| d| d| d d S |j  |j  | |||d}d}d }|	o5|dv }|r?t||
|d}t| t }t||\}}}t| t | }|rmt	||||d}t
|||d	|dd
 ||7 }|| | }|d|dd|dd ||d< ||d< g }|d }|	o|dv }t|d D ]s}t| d }|r||krt||
|d}t }t|||\}}t| t | }|r||krt	||||d}t
|||d	|dd
 ||7 }|| }|| |dk s|dkr|| dkr|d| d| d|dd|dd	 q|dkr5t|} ||  }!|d| dd|!dd | |d< |!|d< || | | }|d|dd|dd ||d < ||d!< |S )"Nz
skipping (z, z) due to max batch size limit)rE   rG   rH   rJ   r   )rR   r]   )r6   r7   r]   T)r7   r>   r?   r@   zPrefill. latency: z6.5fz s, throughput: z9.2fz token/sprefill_latencyprefill_throughput   )rR   r^   rF   r^      zDecode z. Batch size: z, latency: zDecode.  median latency: z s, median throughput: median_decode_latencymedian_decode_throughputzTotal. latency: z6.3ftotal_latencyoverall_throughput)r   r   clearr   r9   r  timeperf_counterr   r   rB   r   r^   r/   r   median)"rE   r   r7   r   rG   rH   rJ   r   rP   r3   r6   r5   rT   rS   r   max_batch_sizemeasurement_resultstot_latencyr0   enable_profile_prefillticr   r   r   r  r?   
throughputdecode_latenciesprofile_step_of_interestenable_profile_decoder   latencymed_decode_latencymed_decode_throughputr   r   r"   latency_test_run_once   s   


	
	
  

r+  c                    s  t |  t|  t|  tdrt| j| j| j| t| d| d |dkr)t	ndd }t
| |||\} t|jd |jd }|d t|j||||jd |jd td|jd | jdd	d	d
dd|d |d t|j|} fdd|D }t|}	g }
t|j|j|jD ]~\}}}g }|r|	|kr|}n8|	|kr|d|	 d| d| d t|d | }n|d|	 d| d t|}||d g||	   t|||}t|j||||||| j|j|dkr|jnd |dkr|jnd |j|j|j |}|d ur|
!| q|dkr8|j"r8t#|j"d}|
D ]}|$t%&|d  qW d    n	1 s3w   Y  | jdkrCt'  d S d S )NSGLANG_SET_CPU_AFFINITYr
  r  r   c                  _   ry   rz   r   r{   r   r   r"   r}     r~   zlatency_test.<locals>.<lambda>z
Warmup ...    FrQ   r:   rR   )rP   r3   r6   r5   rT   rS   r   zBenchmark ...c                    s   g | ]	}  | qS r   )r   stripr   r   r   r"   r#     s    z latency_test.<locals>.<listcomp>zCustom input size (z) is larger than batch_size (z). Using the first z	 prompts.z) is smaller than batch_size (z6). Pad to the desired batch_size with the last prompt.r   ar  rF   )(r   r	   r   r   r   r   r   nnodesr   r   r   r   rG   rH   r+  rE   minrJ   r   r   rK   r   	itertoolsproductcopydeepcopyr   rP   r3   r6   r5   rT   rS   r/   rL   r   writejsondumpsr   )r   r   r   r   r   r7   r   r   r   custom_input_lenresult_listbsilolbs_aligned_inputsretfoutresultr   r   r"   latency_test  s   



rB  c              	   C   s   t |j| _t|  | jr|jrt}nt}ntdt	| }| j
dkr.|| ||dd d S g }t| j
D ]+}t|}tj|| ||||fd}|  || W d    n1 s[w   Y  q5|D ]}|  qc|  d S )Nz`Provide --model-path for running the tests or provide --result-filename for plotting the resultsrF   r   )targetrc   )maxrG   cuda_graph_max_bsr   
model_pathrM   rB  
ValueErrorr   r   r   r   r   multiprocessingProcessr4   r/   r   	terminate)r   r   	work_funcr   workersr   r   procr   r   r"   main  s>   




rN  __main__z%(message)s)levelformatrF   )include_parentrz   )i__doc__ru   r4  rj   r2  r7  loggingrH  r   r  typesr   typingr   numpyr   r*   torch.distributeddistributedr   sglang.srt.configs.model_configr   %sglang.srt.distributed.parallel_stater   sglang.srt.entrypoints.enginer   sglang.srt.layers.moer   (sglang.srt.layers.quantization.fp4_utilsr   (sglang.srt.layers.quantization.fp8_utilsr	   "sglang.srt.managers.schedule_batchr
   r   +sglang.srt.managers.scheduler_dp_attn_mixinr   ,sglang.srt.model_executor.forward_batch_infor   &sglang.srt.model_executor.model_runnerr   #sglang.srt.sampling.sampling_paramsr   sglang.srt.server_argsr   r    sglang.srt.speculative.spec_infor   sglang.srt.utilsr   r   r   r   r   r   r   r   r   r   &sglang.srt.utils.hf_transformers_utilsr   r0   r1   r%   r2   XPUr5   r   r9   rB   	dataclassrC   r   r   r   r   r   no_gradr   r^   r   r   r   r   r=   rM   r  r+  rB  rN  ro   rv   rU   rb   
parse_argsrc   rn   r   r   basicConfigrf   	log_levelupperr   getpidr   r   r   r"   <module>   s    10
 
!Q#


1 o
+



