o
    پi'                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
Z
d dlZd dlZd dlmZ d dlmZ d dlmZ ddd	d
ddd	d
ddd	d
dZdd ZdddddZdddddZG dd de
jZdd Zdd Zdd Zdd  Zd!d" Zeeeed#Zd$d% Zd&d' Ze d(kre j!d)d*Z"e"j#d+e$d,d-d. e"j#d/e$d0d1d. e"j#d2e$d3d4d5 e"j#d6e%dd7d. e"j#d8e%d9d: e"j#d;e$d<d=d. e"& Z'e(ee' eee'j) e'j*e'j+ ej,d<d3d> dS dS )?    N)defaultdict)	dataclass)load_dataset)AsyncOpenAI)tqdmz meta-llama/Llama-3.1-8B-Instructz!meta-llama/Llama-3.1-70B-Instructz"meta-llama/Llama-3.1-405B-Instruct)8b70b405b)b10oaisglc              	      s$  t j|d| d}t j|rtd| d d S |4 I d H c | jjt| | |d|dI d H }	t|	t	j
rUt|d}
td|
 W d    n1 sPw   Y  t|	t	jjjs_J t|d}
t|	|
 W d    n1 suw   Y  W d   I d H  d S 1 I d H sw   Y  d S )	N	response_.pklzFile z already exists, skipping.g        )modelprompttemperature
max_tokenswbbad_response)ospathjoinexistsprintcompletionscreateprovider_to_models
isinstanceopenaiBadRequestErroropenpickledumptypes
completion
Completion)clientr   	semaphoreindexprovider
model_size
output_dirr   output_fileresponsef r/   K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/eval/llama3_eval.pyfetch_responses'   s*   
.r1      i   i   )evals__mmlu__details!evals__mmlu__0_shot__cot__detailsevals__mmlu_pro__detailsevals__gsm8k__detailsr3   r4   r5   r6   )mmlummlu_cotmmlu_progsm8kc                       s*   e Zd Zdejdejf fddZ  ZS )CustomAsyncHTTPXClientrequestreturnc                    s<   t dtd d|_t j|g|R i |I d H S )Nhttps://model-MODEL_ID#.api.baseten.co/development/predict)httpxURLr   getenvurlsupersend)selfr<   argskwargs	__class__r/   r0   rF   P   s
    zCustomAsyncHTTPXClient.send)__name__
__module____qualname__rA   RequestResponserF   __classcell__r/   r/   rJ   r0   r;   O   s    "r;   c                 C   sb   | dvrt dd krdt jd< tddtdt d dt d d	t d
tddd|  S )Nr
   OPENAI_API_KEYEMPTYzhttp://127.0.0.1:8000/v1/)base_urlzApi-Key r>   r?   r@   )api_keyrT   http_clientzhttp://127.0.0.1:30000/v1/)r   r
   r   )r   rC   environr   r;   )r)   r/   r/   r0   
get_clientW   s   
rX   c           
         s   t ddt| j  }t| j}| jd u r t|d d | _|d d d | j }tj	| j
dd g }tt| j  }t| j}tt|ddD ]\}}|tt|d	|d
  ||| j| j| j
|d qItt|t|ddD ]}	|	I d H  qtd S )N(meta-llama/Llama-3.1-405B-Instruct-evalsLlama-3.1-405B-Instruct-latestinput_final_promptsT)exist_okzCreating tasks)descz<|begin_of_text|>r   )r   zProcessing tasks)totalr^   )r   TASK_TO_EVAL_SETtaskasyncio	Semaphoreconcurrencynum_exampleslenr   makedirsr+   TASK_TO_MAX_TOKENSrX   r)   	enumerater   appendcreate_taskr1   r*   as_completed)
rH   dsr'   promptstasksr   r&   idxr   futurer/   r/   r0   	benchmarkg   sB   


rr   c                 C   s,   | d ur| j d j   ddS d S )Nr   . )choicestextlstriprstripupperreplace)r-   r/   r/   r0   get_mmlu_answer   s    r{   c                 C   s   d}t || jd j}|r|dddddS d}t || jd j}|r2|dddS d}t || jd j}|rI|dddS d	}t || jd j}|r`|dddS d S )
NzThe best answer is (.+)\.?r   r2   rs   rt   *zthe best answer is (.+)\.?zThe correct answer is (.+)\.?zthe correct answer is (.+)\.?researchru   rv   grouprz   )r-   patternmatchr/   r/   r0   get_mmlu_cot_answer   s"   r   c                 C   sD   d}t || jd j}|r |d}dD ]}||d}q|S d S )NzThe final answer is (.+)\.?r   r2   )%$rt   r}   )r-   r   r   s	ok_symbolr/   r/   r0   get_answer_gsm8k   s   
r   )r3   r4   r6   r5   c           
      C   s   t dd|  }dd |d d D }d|v sd|v rUd	|v r(t d
d|  }nt dd|  }i }|d D ]
}|||d d < q6g }|D ]	}	|||	  qE||d< |S |S )NrY   rZ   c                 S   s   g | ]}|d  qS )r   r/   .0xr/   r/   r0   
<listcomp>   s    z)get_dataset_from_task.<locals>.<listcomp>r[   input_final_prompts_hashr   r   70z'meta-llama/Llama-3.1-70B-Instruct-evalszLlama-3.1-70B-Instruct-z&meta-llama/Llama-3.1-8B-Instruct-evalszLlama-3.1-8B-Instruct-r   )r   rj   )
ra   response_pathr*   ds_405bds_405b_hash_orderref_model_dshash_to_rowrowreordered_rowsprompt_hashr/   r/   r0   get_dataset_from_task   s0   r   c              
      s  t | ||}g }t|d }td|D ]}tttj|d| dd}|	| qt
G dd d t fdd	}t||d D ]5\}}	t|  |}
|	d
 }|
|	d v }|rc||  jd7  _|	d rp||  jd7  _||  jd7  _qD  }| D ]*\}}|j|j |_|j|j |_| j|j7  _| j|j7  _| j|j7  _q|j|j |_|j|j |_tdtdd | D  tdtdd | D  td|j td|j d S )Nr[   r   r   r   rbc                   @   s>   e Zd ZU dZeed< dZeed< dZeed< dZe	ed< dS )zanalyze.<locals>.Statsr   correctr_   meta_correctNaverage)
rL   rM   rN   r   int__annotations__r_   r   r   floatr/   r/   r/   r0   Stats   s
   
 r   c                      s     S )Nr/   r/   r   r/   r0   <lambda>   s    zanalyze.<locals>.<lambda>subtask_nameinput_correct_responsesr2   
is_correctzMacro averagec                 S      g | ]}|j qS r/   )r   r   r/   r/   r0   r         zanalyze.<locals>.<listcomp>zMeta Macro averagec                 S   r   r/   )meta_averager   r/   r/   r0   r     r   zMicro averagezMeta Micro average)r   rf   ranger!   loadr    r   r   r   rj   r   r   zipTASK_TO_ANSWER_EXTRACTORr   r   r_   itemsr   r   r   npmeanvalues)ra   r   r*   rm   	responsesr_   ir-   subtask_name_to_statsds_rowmodel_answersubtaskis_eval_correctmicro_statsstatsr/   r   r0   analyze   sH   r   __main__z.Script to run model with specified parameters.)descriptionz--model-sizer   z#Size of the model (e.g., 8b or 70b))typedefaulthelpz
--providerr   z#Provider name (e.g., sgl, oai, b10)z--taskTz,Task (e.g., mmlu, mmlu_cot, mmlu_pro, gsm8k))r   requiredr   z--num-exampleszNumber of examples to processz--concurrency   )r   r   z--output-dirztmp-output-dirzDirectory to save responses)ignore_errors)-argparserb   r   r!   r~   shutilcollectionsr   dataclassesr   rA   numpyr   r   datasetsr   r   r   r   r1   rh   r`   AsyncClientr;   rX   rr   r{   r   r   r   r   r   rL   ArgumentParserparseradd_argumentstrr   
parse_argsrH   runra   r+   r*   rmtreer/   r/   r/   r0   <module>   s   	):