o
    پi$                     @   s   d dl Z d dlmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZmZmZ d dlmZmZmZ d dlmZmZmZ 							dd
ee dedee dedee dee dee defddZdS )    N)ListOptional)AccuracyTestParamsAccuracyTestResultrun_accuracy_testwrite_accuracy_github_summary)NightlyBenchmarkRunner)PerformanceTestParamsPerformanceTestResultrun_performance_test)DEFAULT_URL_FOR_TESTModelLaunchSettingsis_in_ci)ToolCallTestParamsToolCallTestResultrun_tool_call_testNightlyTestFmodels	test_namebase_urlis_vlmaccuracy_paramsperformance_paramstool_call_paramsreturnc           !      C   s  |pt }|du}|du}|du}	td td|  tdt|   |r.td|j  |r8td|j  |	r>td td |r[|}
|
jpN|rMd	nd
}t|||d}|  nd}g }d}| D ]}td td|j  td|j	  td|j
  td |jdddg d}|rt|||j|j|j||j|jd}||d< |jsd}|d |j td td |rt|||d}||d< |jsd}|d |j td td |	rt|||d}||d< |jsd}|d |j td td || qc|r|r|  |r't r'dd |D }t||j| td t| d |rEtd|j  td|j  td  t|D ]\}}td!|d"  d#|d$   |r|d r|d }
|
jrwd%|
jd&d'nd(}|
jrd)|
jd*nd(}td+|
jrd,nd- | |  |r|d r|d }td.|jrd,nd-  |j durtd/|j d0 |	r|d r|d }td1|jrd,nd- d2|j! d3|j" d4 |d rtd5|d   qMtd td6|rd7nd8  td9 |sg }t|D ]\}}|#dr|d j p7|#do*|d j p7|#do7|d j }|d s@|rg }|#drS|d jsS|d: |#drd|d jsd|d; |#dr|d js|d }|d<|j! d3|j" d4 |rd=$|nd>}d?$d@dA |d D }|dB|d"  d2|d$  dC| dD|  qdE$|} t%dF|  ||dGS )Ha  Run performance, accuracy, and/or tool call tests for a list of models.

    Args:
        models: List of ModelLaunchSettings to test
        test_name: Name for the test (used in reports)
        base_url: Server base URL (default: DEFAULT_URL_FOR_TEST)
        is_vlm: Whether these are VLM models (affects defaults)
        accuracy_params: Parameters for accuracy tests (None to skip accuracy)
        performance_params: Parameters for performance tests (None to skip perf)
        tool_call_params: Parameters for tool call tests (None to skip tool call)

    Returns:
        dict with test results:
        {
            "all_passed": bool,
            "results": [
                {
                    "model": str,
                    "perf_result": PerformanceTestResult/None,
                    "accuracy_result": AccuracyTestResult/None,
                    "errors": list,
                },
                ...
            ]
        }
    NzQ
================================================================================z	RUNNING: z
  Models: z  Accuracy dataset: z  Performance batches: z  Tool call tests: enabledzP================================================================================performance_profiles_vlms performance_profiles_text_models)profile_dirr   r   TzTESTING MODEL CONFIG: z  TP Size: z  Extra Args: )modelperf_resultaccuracy_resulttool_call_resulterrors)r   perf_runnerbatch_sizes
input_lensoutput_lensr   dataset_namespec_accept_length_thresholdr   Fr"   z+
Waiting 20 seconds for resource cleanup...   )r   paramsr   r    r!   c                 S   s   g | ]
}|d  r|d  qS )r     ).0rr+   r+   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/run_combined_tests.py
<listcomp>   s
    z&run_combined_tests.<locals>.<listcomp>z=
============================================================z Results Summaryz	Dataset: z
Baseline: z<============================================================z
Model    z: r   z
, output: z.1fz tok/s z, accept_len: z.2fz  Performance: PASSFAILz  Accuracy: z	  Score: z.3fz  Tool Call: z (/)z
  Errors: z	OVERALL: zALL TESTS PASSEDzSOME TESTS FAILEDz=============================================================
performanceaccuracyztool_call (z, unknownz; c                 s   s    | ]}t |V  qd S )N)str)r,   er+   r+   r.   	<genexpr>   s    z%run_combined_tests.<locals>.<genexpr>z  Model z): z - 
zTests failed:
)
all_passedresults)&r   printlendatasetr$   r   r   setup_profile_directory
model_pathtp_size
extra_argsr   r%   r&   r'   r(   passedappenderrortimesleepr   r   extendfailureswrite_final_reportr   r   baseline_accuracy	enumerateoutput_throughputavg_spec_accept_lengthscore
num_passed	num_totalgetjoinAssertionError)!r   r   r   r   r   r   r   run_perfrun_accuracyrun_tool_callperfr   r#   all_resultsr=   r   model_resultr   
acc_result	tc_resultaccuracy_resultsithroughput_str
accept_stracctcfailure_linesr-   has_failed_testfailed_testsfailed_test_str	error_strfailure_summaryr+   r+   r.   run_combined_tests   s<  #
	



&


"
rl   )r   NFNNN)rI   typingr   r    sglang.test.accuracy_test_runnerr   r   r   r   sglang.test.nightly_utilsr   #sglang.test.performance_test_runnerr	   r
   r   sglang.test.test_utilsr   r   r   !sglang.test.tool_call_test_runnerr   r   r   r9   booldictrl   r+   r+   r+   r.   <module>   s<    	