o
    7ti(                     @   s&  d dl Z d dlZd dlZd dlZd dlmZ d dlZzd dlm	Z	 W n	 e
y+   Y nw eddZeddZdZd	d
 Zdd Zd/ddZdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&ed'ee d(ed)efd*d+Zd,ed'ee d(ed)efd-d.Z dS )0    N)ListOpenAIAPI_TYPEopenaiMODEL_VERSIONzgpt-4.1-minia  You are an impartial grader for multiple-choice questions.
You are given:
1) the model's free-form output (student_answer),
2) the available options (each with a letter and text),
3) the correct answer (by letter and/or text).

Your job:
- Extract which single option the student intended (by letter if present, otherwise by best semantic match to the option text).
- Compare that choice to the correct answer.
- Output only a single character: 1 if correct, 0 if incorrect.
No explanation. No extra characters. Just 1 or 0.c                 C   s8   t ddD ]}d| d}d}|| v r| ||} q| S )N      z<image >z<image>)rangereplace)input_stringiquestion_text
query_text r   L/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/tasks/pisa/utils.pyreplace_images_tokens   s   r   c                 C   s4   dd t t| D }ddd t|| D }|S )Nc                 S   s   g | ]
}t td | qS )A)chrord).0r   r   r   r   
<listcomp>)   s    z!parse_options.<locals>.<listcomp>
c                 S      g | ]\}}| d | qS )z. r   )r   option_letteroptionr   r   r   r   +   s    )r   lenjoinzip)optionsoption_letterschoices_strr   r   r   parse_options(   s   r#    c                 C   s6   | d }t t| d  }d| d| d| }|S )Nquestionchoicesz>Given the provided image <image>, answer following questions:
r   z

)r#   astliteral_eval)doc	mc_promptr%   parsed_optionsr   r   r   construct_prompt3   s   r,   c                 C   s   t | }|S )N)r,   )r)   r%   r   r   r   pisa_doc_to_text:   s   r-   c                 C   s   d}| | d u r
d S | | gS )Nimager   )r)   	image_keyr   r   r   pisa_doc_to_visual?   s   
r0   c           	      K   sf   t t| d  \}}t|d ||}| d }||v r!||nd}|dur+||knd}dt|iS )z:Default evaluation of answers based on substring matching.r&   r   answerNFacc)get_multi_choice_infor'   r(   parse_multi_choice_responseindexfloat)	r)   resultskwargs	index2ansall_choicesparsed_predgold_ipred_i
is_correctr   r   r   pisa_process_resultsF   s   r?   c           	      K   s   t ddusJ dzddlm} W n ty   tdw tt| d  \}}| d }|||  }t|d d	d
 |	 D t
td|  d| dk}dt|iS )z.Evaluation of answers based on LLM as a judge.OPENAI_API_KEYNz/OPENAI_API_KEY environment variable is not set.r   r   z1Please install openai package to use LLM judging.r&   r1   c                 S   r   )) r   )r   kvr   r   r   r   g   s    z3pisa_process_results_llm_judged.<locals>.<listcomp>r   rA   r   r2   )osgetenvr   r   ImportErrorr3   r'   r(   	judge_mcqitemsr   r   r6   )	r)   r7   r8   r   r9   r:   r<   correct_answerr>   r   r   r   pisa_process_results_llm_judgedU   s.   
rJ   c                 C   s>   d}t | tr| D ]}||krd} |S q	|S | |krd}|S )z$Evaluate a multiple choice instance.FT)
isinstancelist)r<   r=   correctr1   r   r   r   eval_multi_choicer   s   
rN   c                 C   s   d}t | trg }| D ]	}|t| qnt| }|D ]'}t |tr8|D ]}t |tr6||v r6|s4d} nq%q||v rC|s@d} |S q|S )z
    Evaluate an open question instance
    https://github.com/pisa-Benchmark/pisa/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L191
    FT)rK   rL   extendnormalize_strstr)r<   r=   rM   norm_answersr1   prednorm_ansr   r   r   	eval_open   s,   

rU   c                 C   s  dD ]}|  |} qd|  d } d}d}g }|D ]}d| d| v r)|| d}qt|dkrA|D ]}| d| v r@|| q2t|dkrX|D ]}| d| v rW|| qIt|dkr~t|  d	kr~| D ]\}}	|	 |  v r}|| d}qjt|dkrt|}
|
S t|d
krg }|r|r|D ]}| d| d}|| qn)|D ]}| d| d}|| qn|D ]}|  ||  }|| q|t	
| }
|
S |d }
|
S )z
    Parse the prediction from the generated response.
    Return the predicted index e.g., A, B, C, D.
    https://github.com/pisa-Benchmark/pisa/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L10
    ),.!?;:' TF()r   rW      r   )stripappendr   splitrH   lowerrandomchoicerfindnpargmax)responser:   r9   char	index_ansans_with_brack
candidatesrf   r5   ans
pred_indexstart_indexescanr   r   r   r4      sb   




r4   c                 C   s@   d}d}d}t || }t || }t || }|| | }|S )z>
    Exact all forms of numbers from a string with regex.
    z-?\b\d{1,3}(?:,\d{3})+\bz-?\d+(?:\.\d+)?[eE][+-]?\d+z3-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d]))refindall)stringpattern_commaspattern_scientificpattern_simplenumbers_with_commasnumbers_scientificnumbers_simpleall_numbersr   r   r   extract_numbers   s   r}   c                 C   s,   zt | dd W dS  ty   Y dS w )NrV   r$   TF)r6   r   
ValueError)ru   r   r   r   check_is_number   s   r   c                 C   sb   |   } t| }|r| dd} t| } t| d} | gS |  } t| dkr.d|  | d gS | gS )zHNormalize the str to lower case and make them float numbers if possible.rV   r$      r   r]   )ra   r   r   r6   roundrd   r   )ru   	is_numberr   r   r   rP      s   
rP   c                 C   sP   d}g }i }t | D ]\}}||tt|| < |tt||  q
||fS )zi
    Given the list of options for multiple choice question
    Return the index2ans and all_choices
    r   )	enumerater   r   rb   )r    	start_chrr:   r9   r   r   r   r   r   r3     s   r3   student_answerr    rM   returnc                 C   s*   d|    dtd| d|   dS )z
    options: like ["A) red", "B) blue", "C) green", "D) yellow"]
    correct: either a letter like "B" or the full option text. Both are provided to help you.
    zStudent Answer:
z

Options:

   z'

Correct Answer (letter and/or text):
z

Instructions:
- If student gives multiple letters, pick the *final* one.
- If no clear letter, pick the best-matching option by meaning.
- Output only 1 or 0.
)ra   r   r   )r   r    rM   r   r   r   build_user_prompt#  s   r   rS   c                 C   sd   t tdd}t| ||}|jjjtdddtdd|dgd}|j	d j
j }|d	kr0dS dS )
Nr@   )api_keyr   r   system)rolecontentuser)modeltemperature
max_tokensmessages1)r   rD   rE   r   chatcompletionscreater   SYSTEM_PROMPTr&   messager   ra   )rS   r    rM   clientuser_promptresprawr   r   r   rG   8  s   	rG   )r$   )!r'   rD   re   rs   typingr   numpyrh   r   r   rF   rE   r   r   r   r   r#   r,   r-   r0   r?   rJ   rN   rU   r4   r}   r   rP   r3   rQ   r   intrG   r   r   r   r   <module>   s<    	
=	"