o
    پi'                     @   s  d Z ddlZddlZddlZddlZddlmZmZmZm	Z	 dd Z
dededefd	d
Zdd ZddgZedkre Zejdeddd ejdeddd ejdedd ejdedd ejdeddd ejdedd  ejd!ed" ejd#ed$d  ejd%ed&d  ejd'ed(d  ejd)ed*d  ejd+edd,d ejd-ed" ejd.deed/d0 ejd1ed2d3d ejd4edd5d ejd6ed7d ejd8ed9d ejd:ed;d<d ejd=edd>d e Zee dS dS )?zX
Usage:
python3 -m sglang.test.run_eval --port 30000 --eval-name mmlu --num-examples 10
    N)ChatCompletionSamplerEvalmake_report
set_ulimitc                 C   s6   t | dd }|tv r|dkrd}nd}d|diiS i S )Nthinking_modedeepseek-v3thinkingenable_thinkingchat_template_kwargsT)getattrTHINKING_MODE_CHOICES)argsr   thinking_param r   H/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/run_eval.pyget_thinking_kwargs   s   r   base_urleval_objreturnc           
   
   C   s   t | }|r
| ni }t| dd }|d ur||d< t| jt| ddt| dd|t| ddt| dd |r6|nd d	}t }||}t | }	||	|fS )
Ntop_k
max_tokens   top_p      ?temperature        reasoning_effort)modelr   r   r   r   r   
extra_body)r   copyr   r   r   timeperf_counter)
r   r   r   thinking_kwargsr   r   samplerticresultlatencyr   r   r   run_eval_once!   s$   





r'   c                    s  ddl m} t  dtjvrdtjd<  jr j dn
d j d j d jdkr=dd	l	m
} d
}|| j jn jdkrYddlm} tdd}d}||| j jnֈ jdkrlddlm} | j jnÈ jdkrddlm} | j jdgdn jdkrddlm} d}|| j jn jdkrddlm} | j jn jdkrddlm}	  j}
 jr jdnd }|	 j|
 j j|t dd t dd dnT jd krdd!lm} | j jt d"d d#n; jd$krdd%lm } | j jn' jd&kr'dd'l!m"} | j jt d(d)t d*d d+nt#d, j t d-d.d.krt$ \}}}|j%d/|j&iB }t'd0|d1d2 t'd3|d/ d1 | j d4|d/ |j jd5d6 | j d7||j jd5d6 n}dd8l(m)} | j*d9 fd:d;t+ j*D }g }|D ]}|, \}}}|-|j& qt.|t/| }d<d; |D }t'd= t'd> j* d?|d1 t'd@|  t'd= |j%dA|iB }|dB|iB }| j dC||j j j*dDd6 0   j dE|j1dFdE }dG| dH}t'dI|  t2|dJ}|3t4| W d    n	1 s.w   Y  t'| dG| dK}t2|dJ}|3t5j6|dLdM W d    n	1 sXw   Y  t'dN|  t dOdPro||fS |S )QNr   )dump_metricOPENAI_API_KEYEMPTYz/v1zhttp://:mmlu)MMLUEvalz@https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csvmath)MathEvalzgpt-4-turbo)r   zEhttps://openaipublic.blob.core.windows.net/simple-evals/math_test.csvmgsm)MGSMEvalmgsm_enen)	languagesgpqa)GPQAEvalzHhttps://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv	humaneval)	HumanEvallongbench_v2)LongBenchV2Eval,max_context_lengthmin_context_length)r   data_sourcenum_examplesnum_threads
categoriesr<   r=   mmmu)MMMUVLMEvalresponse_answer_regex)rD   aime25)
AIME25Evalgsm8k)	GSM8KEval	num_shots   gsm8k_data_path)r?   r@   rI   	data_pathzInvalid eval name: repeat   scorezTotal latency: .3fz szScore: _score)r   eval)labels_latency)ThreadPoolExecutor)max_workersc                    s   g | ]
} t qS r   )submitr'   ).0_r   r   r   executorr   r   
<listcomp>   s    zrun_eval.<locals>.<listcomp>c                 S   s   g | ]}|d qS )rP   r   )rX   sr   r   r   r\      s    z====================zRepeat: z, mean: zScores: scores
mean_score_mean_score)r   rR   rM   rY   /z/tmp/z.htmlzWriting report to wz.json   )indentzWriting results to return_latencyF)7sglang.test.test_utilsr(   r   osenvironr   hostport	eval_namesglang.test.simple_eval_mmlur-   r?   r@   sglang.test.simple_eval_mathr/   r   sglang.test.simple_eval_mgsmr1   sglang.test.simple_eval_gpqar6   !sglang.test.simple_eval_humanevalr8   $sglang.test.simple_eval_longbench_v2r:   dataset_pathrA   splitr   r    sglang.test.simple_eval_mmmu_vlmrC   sglang.test.simple_eval_aime25rF   sglang.test.simple_eval_gsm8krH   
ValueErrorr'   metricsrO   printconcurrent.futuresrU   rM   ranger%   appendsumlenshutdownreplaceopenwriter   jsondumps)r   r(   r-   filenamer/   equality_checkerr1   r6   r8   r:   r>   rA   rC   rF   rH   r%   r&   r#   rx   rU   futuresscores_repeatfr_   	file_stemreport_filenamefhresult_filenamer   rZ   r   run_eval=   s   

&










	






r   r   qwen3__main__z
--base-urlz7Server or API base url if not using http host and port.)typedefaulthelpz--hostz0.0.0.0zDefault host is 0.0.0.0.z--portznIf not set, the default port is configured according to its default value for different LLM Inference Engines.)r   r   z--modelzZName or path of the model. If not set, the default model will request /v1/models for conf.z--repeatrN   zrepeat the evaluation n timesz--eval-namer,   )r   r   z--num-examples)r   z--num-threadsi   z--max-tokensr   z--temperaturer   z--top-pr   z--top-kzTop-k sampling parameterz--reasoning-effortz--thinking-modezmEnable thinking mode in Deepseek V3.1/3.2, or Qwen3.--reasoning-parser must be set when launching the server.)r   r   choicesr   z--dataset-pathzTHUDM/LongBench-v2zAPath to dataset file or HuggingFace dataset name for LongBench-v2z--categoriesz?Comma-separated list of categories to evaluate for LongBench-v2z--max-context-lengthz5Maximum context length in characters for LongBench-v2z--min-context-lengthz5Minimum context length in characters for LongBench-v2z--num-shotsrJ   z2Number of few-shot examples for GSM8K (default: 5)z--gsm8k-data-pathz*Path to GSM8K data file (e.g., test.jsonl))__doc__argparser   rg   r    sglang.test.simple_eval_commonr   r   r   r   r   strdictr'   r   r   __name__ArgumentParserparseradd_argumentintfloat
parse_argsr   r   r   r   r   <module>   s     "	