o
    پi                     @   sl   d Z ddlZddlZddlmZ ddlZddlmZ ddl	m
Z
mZmZmZmZmZmZ G dd deZdS )z
GPQA: A Graduate-Level Google-Proof Q&A Benchmark
David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, Samuel R. Bowman
https://arxiv.org/abs/2311.12022
    N)Optional)simple_eval_common)ANSWER_PATTERN_MULTICHOICE
HTML_JINJAEval
EvalResultSamplerBaseSingleEvalResultformat_multichoice_questionc                	   @   s@   e Zd Z	ddedee dedefddZded	efd
dZ	dS )GPQAEval   filenamenum_examplesnum_threads	n_repeatsc                    sv   t |}dd | D }td |r#|dksJ d ||}|| } fdd|D }|| _|| _|| _d S )Nc                 S   s   g | ]\}}|  qS  )to_dict).0_rowr   r   P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/simple_eval_gpqa.py
<listcomp>$   s    z%GPQAEval.__init__.<locals>.<listcomp>r   r   z)n_repeats only supported for num_examplesc                    s$   g | ]}|d   tddiB qS )permutation   )samplerange)r   examplerngr   r   r   *   s    )	pandasread_csviterrowsrandomRandomr   examplesr   r   )selfr   r   r   r   dfr$   r   r   r   __init__   s   



zGPQAEval.__init__samplerreturnc                    s.   dt f fdd}t|| j| j}t|S )Nr   c                    s  | d | d | d | d g  fdd| d D    | d }d| }t d	  d
  d  d | d d}jt|ddg}|}|d u rKd}tt|}|rX|d
nd }||kr`dnd}tj	
tj|t|dd|||d}	|t|ddg }
t|	||
dt|idS )NzCorrect AnswerzIncorrect Answer 1zIncorrect Answer 2zIncorrect Answer 3c                    s   g | ]} | qS r   r   )r   ichoicesr   r   r   9   s    z1GPQAEval.__call__.<locals>.fn.<locals>.<listcomp>r   ABCDr   r         Question)ABCDr0   user)contentrole g      ?g        	assistant)prompt_messagesnext_messagescorecorrect_answerextracted_answerchars)htmlr<   convometrics)indexdict_pack_messager
   researchr   groupcommon	jinja_envfrom_stringr   renderr	   len)r   correct_indexr=   choices_dictr:   response_textmatchr>   r<   r@   rA   r(   r+   r   fn2   sN   

zGPQAEval.__call__.<locals>.fn)rD   rI   map_with_progressr$   r   aggregate_results)r%   r(   rS   resultsr   rR   r   __call__1   s   +
zGPQAEval.__call__N)r   )
__name__
__module____qualname__strr   intr'   r   r   rW   r   r   r   r   r      s    
r   )__doc__r"   rF   typingr   r   sglang.testr   rI   sglang.test.simple_eval_commonr   r   r   r   r   r	   r
   r   r   r   r   r   <module>   s   $