o
    پi3                     @   s6  d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
mZmZmZmZmZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ dZd	Zeeef Zee ZG d
d dZeG dd dZeG dd dZG dd dZ G dd dej!Z"G dd deZ#d$ Z%dZ&dZ'd$ Z(dZ)dd Z*dededefd d!Z+d"e,d#efd$d%Z-	&	dEd'ee d(ee d)eeeee f  d*efd+d,Z.d-e/d.ee d/e0fd0d1Z1ej2e3 ej4e5d2d3gd4Z6d5Z7d6ed*efd7d8Z8e8e6j9d8< d9Z:d:ed*efd;d<Z;d=ee fd>d?Z<d@dA Z=dFdCdDZ>dS )G    N)defaultdict)	dataclassfield)
ThreadPool)AnyDictListOptionalTuple)OpenAI)tqdmzYou are a helpful assistant.zYou are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.
Knowledge cutoff: 2023-12
Current date: 2024-04-01c                   @   "   e Zd ZdZdedefddZdS )SamplerBasezw
    Base class for defining a sampling model, which can be evaluated,
    or used as part of the grading process.
    message_listreturnc                 C      t  NNotImplementedError)selfr    r   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/simple_eval_common.py__call__$      zSamplerBase.__call__N)__name__
__module____qualname____doc__MessageListstrr   r   r   r   r   r      s    r   c                   @   sJ   e Zd ZU dZee ed< eeeef  ed< e	e ed< e	e
 ed< dS )
EvalResultzN
    Result of running an evaluation (usually consisting of many samples)
    scoremetricshtmlsconvosN)r   r   r   r   r	   float__annotations__r   r   r   r   r   r   r   r   r    (   s   
 r    c                   @   sX   e Zd ZU dZee ed< eedZ	e
eef ed< dZee ed< dZee ed< dS )SingleEvalResultz.
    Result of evaluating a single sample
    r!   )default_factoryr"   Nhtmlconvo)r   r   r   r   r	   r%   r&   r   dictr"   r   r   r)   r*   r   r   r   r   r   r'   4   s   
 r'   c                   @   r   )Evalz0
    Base class for defining an evaluation.
    samplerr   c                 C   r   r   r   )r   r-   r   r   r   r   E   r   zEval.__call__N)r   r   r   r   r   r    r   r   r   r   r   r,   @   s    r,   c                       s   e Zd Z fddZ  ZS )LargerHttpxClientc                    s,   t d}t jddd}t j||d d S )Ni  )max_keepalive_connectionsmax_connections)timeoutlimits)httpxTimeoutLimitssuper__init__)r   timeout_configr2   	__class__r   r   r7   J   s   
zLargerHttpxClient.__init__)r   r   r   r7   __classcell__r   r   r9   r   r.   I   s    r.   c                   @   s   e Zd ZdZ								d$dedee dee d	ed
edee dedeeee	f  fddZ
			d%dedededefddZdefddZdede	fddZd ed!efd"d#ZdS )&ChatCompletionSamplerz2
    Sample from OpenAI's chat completion API
    N              ?   base_urlmodelsystem_messagetemperaturetop_preasoning_effort
max_tokens
extra_bodyc	           	      C   s   t |t d| _|d u r| jj jd j}|| _|| _|| _	|| _
|| _|| _|| _d| _td| jd| j	d| jd| jd| j
 d S )	N)r@   http_clientr   urlz;ChatCompletionSampler initialized with self.system_message=z self.temperature=z self.max_tokens=z self.reasoning_effort=z self.extra_body=)r   r.   clientmodelslistdataidrA   rB   rC   rD   rF   rE   rG   image_formatprint)	r   r@   rA   rB   rC   rD   rE   rF   rG   r   r   r   r7   X   s   *zChatCompletionSampler.__init__base64png   imageencodingformatfoveac                 C   s$   ddd| d| d| id}|S )N	image_urlrI   zdata:image/;,)typerX   r   )r   rT   rU   rV   rW   	new_imager   r   r   _handle_imaget   s
   z#ChatCompletionSampler._handle_imagetextc                 C   s
   d|dS )Nr^   )r[   r^   r   )r   r^   r   r   r   _handle_text   s   
z"ChatCompletionSampler._handle_textrolecontentc                 C   s   t ||dS )N)r`   ra   )r   )r   r`   ra   r   r   r   _pack_message   s   z#ChatCompletionSampler._pack_messager   r   c              
   C   s   | j r| d| j g| }d}|dk rzz| jjjj| j|| j| j| j	| j
| jd}|jd jjp1dW S  tjyK } ztd| W Y d }~dS d }~w tyu } zd| }td| d	| d
| t| |d7 }W Y d }~nd }~ww |dk std dS )Nsystemr      )rA   messagesrC   rD   rF   rE   rG    zBad Request Error   z'Rate limit exception so wait and retry z after z sec   zCAll retry attempts exhausted for request. Returning empty response.)rB   rb   rJ   chatcompletionscreaterA   rC   rD   rF   rE   rG   choicesmessagera   openaiBadRequestErrorrP   	Exceptiontimesleep)r   r   trialresponseeexception_backoffr   r   r   r      sF   
	

zChatCompletionSampler.__call__)NNNr=   r>   Nr?   N)rQ   rR   rS   )r   r   r   r   r   r	   r%   intr   r   r7   r]   r_   rb   r   r   r   r   r   r   r<   S   sT    	

r<   a   
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{Question}

A) {A}
B) {B}
C) {C}
D) {D}
z(?i)Answer\s*:\s*([A-D])z(?i)Answer\s*:\s*([^\n]+)a  
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications

Examples:

    Expression 1: $2x+3$
    Expression 2: $3+2x$

Yes

    Expression 1: 3/2
    Expression 2: 1.5

Yes

    Expression 1: $x^2+2x+1$
    Expression 2: $y^2+2y+1$

No

    Expression 1: $x^2+2x+1$
    Expression 2: $(x+1)^2$

Yes

    Expression 1: 3245/5
    Expression 2: 649

No
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)

    Expression 1: 2/(-3)
    Expression 2: -2/3

Yes
(trivial simplifications are allowed)

    Expression 1: 72 degrees
    Expression 2: 72

Yes
(give benefit of the doubt to units)

    Expression 1: 64
    Expression 2: 64 square feet

Yes
(give benefit of the doubt to units)

---

YOUR TASK


Respond with only "Yes" or "No" (without quotes). Do not include a rationale.

    Expression 1: %(expression1)s
    Expression 2: %(expression2)s
aA  
<h3>Prompt conversation</h3>
{% for message in prompt_messages %}
{{ message_to_html(message) | safe }}
{% endfor %}
<h3>Sampled message</h3>
{{ message_to_html(next_message) | safe }}
<h3>Results</h3>
<p>Correct Answer: {{ correct_answer }}</p>
<p>Extracted Answer: {{ extracted_answer }}</p>
<p>Score: {{ score }}</p>
c                 C   s   t jdi | S )Nr   )QUERY_TEMPLATE_MULTICHOICErV   )rowr   r   r   format_multichoice_question  s   rz   r-   expr1expr2c                 C   s4   t ||d }| t|ddg}|pd  dkS )N)expression1expression2user)ra   r`   rf   yes)EQUALITY_TEMPLATEr+   lowerstrip)r-   r{   r|   promptrt   r   r   r   check_equality
  s   r   valuesstatc                 C   sV   |dkr	t | S |dkrt | S |dkrt | S |dkr$t | S td|)NmeanstdminmaxzUnknown stat =)npr   r   r   r   
ValueError)r   r   r   r   r   _compute_stat  s   



r   r   r   single_eval_resultsdefault_stats
name2statsr   c                 C   s   |pi }t t}g }g }| D ]1}|du rq|j D ]\}}|| | q|jdur3|d |j ||j ||j qi }	| D ]#\}}
|||}|D ]}|dkrZ|n| d| }t	|
||	|< qRqFt
|	dd|	||dS )zO
    Aggregate results from multiple evaluations into a single EvalResult.
    Nr!   r   :)r!   r"   r#   r$   )r   rL   r"   itemsappendr!   r)   r*   getr   r    pop)r   r   r   name2valuesr#   r$   single_eval_resultnamevaluefinal_metricsr   statsr   keyr   r   r   aggregate_results  s4   

r   fxsnum_threadsc                 C   sv   t drtt| t|t|dS tt|t|}tt|| |t|dW  d   S 1 s4w   Y  dS )zO
    Apply f to each element of xs, using a ThreadPool, and show progress.
    debug)totalN)	osgetenvrL   mapr   lenr   r   imap)r   r   r   poolr   r   r   map_with_progressA  s
   
$r   r)   xml)loader	undefined
autoescapez
<div class="message {{ role }}">
    <div class="role">
    {{ role }}
    {% if variant %}<span class="variant">({{ variant }})</span>{% endif %}
    </div>
    <div class="content">
    <pre>{{ content }}</pre>
    </div>
</div>
rm   c                 C   s&   t tj| d | d | dddS )z?
    Generate HTML snippet (inside a <div>) for a message.
    r`   ra   variantN)r`   ra   r   )	jinja_envfrom_string_message_templaterenderr   )rm   r   r   r   message_to_html^  s
   

r   a  <!DOCTYPE html>
<html>
    <head>
        <style>
            .message {
                padding: 8px 16px;
                margin-bottom: 8px;
                border-radius: 4px;
            }
            .message.user {
                background-color: #B2DFDB;
                color: #00695C;
            }
            .message.assistant {
                background-color: #B39DDB;
                color: #4527A0;
            }
            .message.system {
                background-color: #EEEEEE;
                color: #212121;
            }
            .role {
                font-weight: bold;
                margin-bottom: 4px;
            }
            .variant {
                color: #795548;
            }
            table, th, td {
                border: 1px solid black;
            }
            pre {
                white-space: pre-wrap;
            }
        </style>
    </head>
    <body>
    {% if metrics %}
    <h1>Metrics</h1>
    <table>
    <tr>
        <th>Metric</th>
        <th>Value</th>
    </tr>
    <tr>
        <td><b>Score</b></td>
        <td>{{ score | float | round(3) }}</td>
    </tr>
    {% for name, value in metrics.items() %}
    <tr>
        <td>{{ name }}</td>
        <td>{{ value }}</td>
    </tr>
    {% endfor %}
    </table>
    {% endif %}
    <h1>Examples</h1>
    {% for html in htmls %}
    {{ html | safe }}
    <hr>
    {% endfor %}
    </body>
</html>
eval_resultc                 C   s   t tj| j| j| jdS )z=
    Create a standalone HTML report from an EvalResult.
    r!   r"   r#   )r   r   _report_templater   r!   r"   r#   )r   r   r   r   make_report  s
   
r   r#   c                 C   s   t tjdi | dS )zF
    Create a standalone HTML report from a list of example htmls
    Nr   )r   r   r   r   )r#   r   r   r   make_report_from_example_htmls  s   
r   c           
   
   C   s   t d|  d|  z`tj|dd}|  t|jdd}d}t| d3}td	|d
ddd}||D ]}|	|}|
| q6W d    n1 sMw   Y  W d    n1 s\w   Y  t d|   W d S  tjy~ }	 ztd|	 d }	~	ww )NzDownloading dataset z from T)streamzcontent-lengthr   i    wbDownloadingiBi   )descr   unit
unit_scaleunit_divisorz Dataset downloaded and saved to zFailed to download dataset: )rP   requestsr   raise_for_statusrw   headersopenr   iter_contentwriteupdateRequestExceptionrp   )
pathrI   rt   
total_size
block_sizer   progress_barrM   sizeru   r   r   r   download_dataset  s4   
 r     c              
   C   sl   t j}t |\}}|| k r4zt || |f W d S  ty3 } ztd|  W Y d }~d S d }~ww d S )NzFail to set RLIMIT_NOFILE: )resourceRLIMIT_NOFILE	getrlimit	setrlimitr   rP   )target_soft_limitresource_typecurrent_softcurrent_hardru   r   r   r   
set_ulimit  s   r   )r   N)r   )?r   r   rq   collectionsr   dataclassesr   r   multiprocessing.poolr   typingr   r   r   r	   r
   r3   jinja2numpyr   rn   r   r   r   OPENAI_SYSTEM_MESSAGE_APIOPENAI_SYSTEM_MESSAGE_CHATGPTr   Messager   r   r    r'   r,   Clientr.   r<   r   rx   ANSWER_PATTERN_MULTICHOICEANSWER_PATTERNr   
HTML_JINJArz   r   rL   r   r   callablerw   r   Environment
BaseLoaderStrictUndefinedselect_autoescaper   r   r   globalsr   r   r   r   r   r   r   r   r   <module>   s   
	
Y	:=
$
B	