o
    پi5                     @   s  d Z ddlZddlZddlmZ ddlZddlmZ ddl	m
Z
mZmZmZmZmZmZ i dddd	d
ddd	dd	dddddddddd	ddddddddddddddi dd	ddddddddd dd!dd"dd#dd$dd%dd&dd'dd(dd)dd*d	d+di d,dd-dd.dd/dd0d	d1d	d2d	d3d	d4dd5dd6d	d7dd8dd9d	d:dd;d	d<dddddd	dd=ZG d>d? d?eZdS )@z
Measuring Massive Multitask Language Understanding
Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, Jacob Steinhardt
https://arxiv.org/abs/2009.03300
    N)Optional)simple_eval_common)ANSWER_PATTERN_MULTICHOICE
HTML_JINJAEval
EvalResultSamplerBaseSingleEvalResultformat_multichoice_questionabstract_algebrastemanatomyother	astronomybusiness_ethicsclinical_knowledgecollege_biologycollege_chemistrycollege_computer_sciencecollege_mathematicscollege_medicinecollege_physicscomputer_securityconceptual_physicseconometricssocial_scienceselectrical_engineeringelementary_mathematicsformal_logic
humanitiesglobal_factshigh_school_biologyhigh_school_chemistryhigh_school_computer_sciencehigh_school_european_historyhigh_school_geography#high_school_government_and_politicshigh_school_macroeconomicshigh_school_mathematicshigh_school_microeconomicshigh_school_physicshigh_school_psychologyhigh_school_statisticshigh_school_us_historyhigh_school_world_historyhuman_aginghuman_sexualityinternational_lawjurisprudencelogical_fallaciesmachine_learning
management	marketingmedical_geneticsmiscellaneousmoral_disputesmoral_scenarios	nutrition
philosophy
prehistoryprofessional_accountingprofessional_lawprofessional_medicineprofessional_psychology)public_relationssecurity_studies	sociologyus_foreign_policyvirologyworld_religionsc                   @   s8   e Zd Zdedee defddZdedefdd	Z	d
S )MMLUEvalfilenamenum_examplesnum_threadsc                 C   sB   t |}dd | D }|rtd||}|| _|| _d S )Nc                 S   s   g | ]\}}|  qS  )to_dict).0_rowrL   rL   P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/simple_eval_mmlu.py
<listcomp>Z   s    z%MMLUEval.__init__.<locals>.<listcomp>r   )pandasread_csviterrowsrandomRandomsampleexamplesrK   )selfrI   rJ   rK   dfrY   rL   rL   rQ   __init__X   s   

zMMLUEval.__init__samplerreturnc                    s.   dt f fdd}t|| j| j}t|S )NrP   c           	         s    j t| ddg} |}|pd}tt|}|r|dnd }|| d kr)dnd}tjt	j
|t|dd|| d |d	}|t|ddg }t| d
 d}t||||i|dS )Nuser)contentrole    Answerg      ?g        	assistant)prompt_messagesnext_messagescorecorrect_answerextracted_answerSubjectr   )htmlrh   metricsconvo)_pack_messager
   researchr   groupcommon	jinja_envfrom_stringr   renderdictsubject2categorygetr	   )	rP   rf   response_textmatchrj   rh   rl   rn   categoryr]   rL   rQ   fna   s*   
zMMLUEval.__call__.<locals>.fn)rw   rs   map_with_progressrY   rK   aggregate_results)rZ   r]   r~   resultsrL   r}   rQ   __call__`   s   
zMMLUEval.__call__N)
__name__
__module____qualname__strr   intr\   r   r   r   rL   rL   rL   rQ   rH   W   s    rH   )__doc__rV   rp   typingr   rS   sglang.testr   rs   sglang.test.simple_eval_commonr   r   r   r   r   r	   r
   rx   rH   rL   rL   rL   rQ   <module>   s   $
	
 !"#$%&'()*+,-./01234=