o
    پi                      @   sv   d Z ddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZmZmZ d Zdedee fd	d
ZG dd de
ZdS )a  
AIME 2025 - American Invitational Mathematics Examination 2025
Dataset: opencompass/AIME2025
https://huggingface.co/datasets/opencompass/AIME2025

The American Invitational Mathematics Examination (AIME) is a challenging
competition math exam. All answers are integers from 000 to 999.
    N)Optional)simple_eval_common)ANSWER_PATTERN
HTML_JINJAEval
EvalResultSamplerBaseSingleEvalResulta  
Solve the following AIME (American Invitational Mathematics Examination) problem step by step. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem.

Note: AIME answers are always integers from 000 to 999 (inclusive). If you get a non-integer answer, you likely made a computational error.

{question}

Remember to put your answer on its own line after "Answer:", and express your answer as an integer from 000 to 999.
answerreturnc              	   C   sj   | du rdS t |  } ztt| }d|  krdkr&n W | S t |W S W | S  ttfy4   Y | S w )zb
    Normalize AIME answer to standard format.
    AIME answers are integers from 000 to 999.
    Nr   i  )strstripintfloat
ValueError	TypeError)r
   num r   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/simple_eval_aime25.pynormalize_aime_answer$   s   
r   c                   @   s4   e Zd Zdee defddZdedefddZd	S )

AIME25Evalnum_examplesnum_threadsc           	      C   s   zddl m} W n ty   tdw |dddd}|dddd}d	d
 |D }dd
 |D }|| }|rA|d t|t| }|| _|| _d S )Nr   )load_datasetzfThe 'datasets' package is required for AIME25 evaluation. Please install it with: pip install datasetszopencompass/AIME2025z
AIME2025-Itest)splitzAIME2025-IIc                 S   "   g | ]}|d  t |d dqS questionr
   )r   r
   r   .0rowr   r   r   
<listcomp>I       z'AIME25Eval.__init__.<locals>.<listcomp>c                 S   r   r   r   r    r   r   r   r#   M   r$   )datasetsr   ImportErrorminlenexamplesr   )	selfr   r   r   dataset1dataset2	examples1	examples2r)   r   r   r   __init__9   s(   
zAIME25Eval.__init__samplerr   c                    s.   dt f fdd}t|| j| j}t|S )Nr"   c           
         s    j tjdi | ddg} |}|pd}tt|}|r%|d nd }t|}t| d }||kr7dnd}t	j
tj|t|dd|| d |d	}|t|ddg }	t|||	d
t|idS )Nuser)contentrole    r
   g      ?g        	assistant)prompt_messagesnext_messagescorecorrect_answerextracted_answerchars)htmlr9   convometricsr   )_pack_messageQUERY_TEMPLATEformatresearchr   groupr   r   common	jinja_envfrom_stringr   renderdictr	   r(   )
r"   r7   response_textmatchr;   normalized_extractednormalized_correctr9   r=   r>   r0   r   r   fnZ   s.   

zAIME25Eval.__call__.<locals>.fn)rJ   rF   map_with_progressr)   r   aggregate_results)r*   r0   rP   resultsr   rO   r   __call__Y   s   !
zAIME25Eval.__call__N)	__name__
__module____qualname__r   r   r/   r   r   rT   r   r   r   r   r   8   s    
 r   )__doc__rC   typingr   sglang.testr   rF   sglang.test.simple_eval_commonr   r   r   r   r   r	   r   rA   r   r   r   r   r   r   r   <module>   s   	 	