o
    پi"                     @   s  d dl mZ d dlmZ d dlmZmZmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZmZ eG dd dZeG d	d
 d
Zdededee ddfddZ								d(dedededee dee dee dedee dee dee dee deeee ee f fddZ			 d)deded!ee d"ededeeee ee f fd#d$Z	d*ded%edee defd&d'ZdS )+    )	dataclass)SimpleNamespace)ListOptionalTuple)kill_process_treerun_eval)!DEFAULT_TIMEOUT_FOR_SERVER_LAUNCHDEFAULT_URL_FOR_TESTModelLaunchSettingspopen_launch_serverwrite_github_step_summaryc                   @   s   e Zd ZU dZeed< eed< dZee	 ed< dZ
ee	 ed< dZee	 ed< dZeed	< dZee ed
< dZee ed< dZee ed< dZee	 ed< dS )AccuracyTestParamsz Parameters for accuracy testing.datasetbaseline_accuracyNnum_examplesnum_threads
max_tokensFreturn_latencythinking_modetemperaturetop_prepeat)__name__
__module____qualname____doc__str__annotations__floatr   r   intr   r   r   boolr   r   r   r    r#   r#   T/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/accuracy_test_runner.pyr      s   
 r   c                   @   sj   e Zd ZU dZeed< eed< eed< ee ed< eed< ee ed< dZ	ee ed	< dZ
ee ed
< dS )AccuracyTestResultzResult of an accuracy test.modelr   passedscorer   errorNlatencyvariant)r   r   r   r   r   r   r"   r   r    r*   r+   r#   r#   r#   r$   r%   !   s   
 r%   	test_namer   resultsreturnNc           
      C   s   d|  d| d}|d7 }|d7 }|D ]?}|j rdnd}|jdur&|jd	nd
}|jd	}|jr3|jnd}|jr;|jn|j}	|d|	 d| d| d| d| d7 }qt| dS )zWrite accuracy test results to GitHub step summary.

    Args:
        test_name: Name of the test
        dataset: Dataset name used for evaluation
        results: List of AccuracyTestResult objects
    z#### z - Accuracy (z)
z/| config | status | score | baseline | error |
z/| ------ | ------ | ----- | -------- | ----- |
u   ✅u   ❌Nz.4fzN/A-z| z | z |
)r'   r(   r   r)   r+   r&   r   )
r,   r   r-   summaryresultstatus_emoji	score_strbaseline_str	error_strconfig_namer#   r#   r$   write_accuracy_github_summary/   s   
*r7   Fr&   base_urlr   r   r   r   r   r   r   r   c              
   C   s4  d}zzht | j|| jt| jd}t|| j|||pdd}|dur#||_|r(d|_|dur/||_|dur6||_	|	dur=|	|_
|
durD|
|_t|}|r[t|tr[|\}}t|d|d< n|}dd|fW W |rkt|j S S  ty } zdd	t| dfW  Y d}~W |rt|j S S d}~ww |rt|j w w )
zRun evaluation using simple_eval backend (run_eval.py).

    Returns:
        Tuple of (success, error_message, metrics_dict)
    N
other_argstimeoutenvi   )r8   r&   	eval_namer   r   T   r*   FzAccuracy test exception: )r   
model_path
extra_argsr
   r<   r   r   r   r   r   r   r   r	   
isinstancetupleroundr   pid	Exceptionr   )r&   r8   r   r   r   r   r   r   r   r   r   processargsr1   metricsr*   er#   r#   r$   _run_simple_evalK   s^    rJ         num_questions	num_shotsc           
      C   s   ddl m} d}zgzAt| j|| jt| jd}t|d|pd|ddt|	dd	 d
}||}d|v r<d|vr<|d |d< dd|fW W |rJt
|j S S  tyo }	 zddt|	 dfW  Y d}	~	W |rjt
|j S S d}	~	ww |rxt
|j w w )zRun evaluation using few_shot backend (few_shot_gsm8k.py).

    Returns:
        Tuple of (success, error_message, metrics_dict)
    r   r   Nr9         zhttp://127.0.0.1:)rN   	data_pathrM   max_new_tokensparallelhostportaccuracyr(   TFzFew-shot evaluation exception: )sglang.test.few_shot_gsm8kr	   r   r?   r@   r
   r<   r   r!   splitr   rD   rE   r   )
r&   r8   rM   rN   r   run_few_shot_evalrF   rG   rH   rI   r#   r#   r$   _run_few_shot_eval   sH   
 r\   paramsc           
         s  |pt }tdd  td| j  td j  td j  td d t fdddD } jd	krL|sLt| | j jpEd
d\}}}nt	| | j j j
 j j j j j jd\}}}|std| j d|  t| j jdd j|| jdS |dp|dp|dd}| jk}|d}	|rtd|dd jd nd|dd jd}td|  t| j j|| j|s|nd|	| jdS )a8  Run accuracy test for a single model.

    Args:
        model: ModelLaunchSettings with model config
        params: AccuracyTestParams with dataset, baseline, and optional settings
        base_url: Server base URL (default: DEFAULT_URL_FOR_TEST)

    Returns:
        AccuracyTestResult with test outcome
    
z<============================================================zRunning ACCURACY test for z  Dataset: z  Baseline: c                 3   s    | ]
}t  |d uV  qd S N)getattr).0fieldr]   r#   r$   	<genexpr>   s
    
z$run_accuracy_test.<locals>.<genexpr>)r   r   r   r   gsm8krL   )r&   r8   rM   r   )r&   r8   r   r   r   r   r   r   r   r   r   u   ✗ Accuracy test failed for z: FN)r&   r   r'   r(   r   r)   r+   r(   
mean_scorerX   g        r*   u   ✓ Accuracy z.3fz >= baseline z	Accuracy z below baseline u   ✗ )r&   r   r'   r(   r   r)   r*   r+   )r   printr?   r   r   anyr\   r   r   rJ   r   r   r   r   r   r   r%   r+   get)
r&   r]   r8   has_extended_paramssuccessr)   rH   r(   r'   r*   r#   rc   r$   run_accuracy_test   sz   




rl   )NNNFNNNN)NrK   rL   r_   )dataclassesr   typesr   typingr   r   r   sglang.srt.utilsr   sglang.test.run_evalr	   sglang.test.test_utilsr
   r   r   r   r   r   r%   r   r7   r!   r"   r    dictrJ   r\   rl   r#   r#   r#   r$   <module>   s    	
 	

L
5