o
    پi                     @   sP  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlZd dl	Z	d dl
mZ d dlmZ d dlmZ dedejfd	d
Zdd ZdejdededejdededefddZdd Zdd Zedkre jddZejdddd  ejd!d"d#d  ejd$ed%d&d' ejd(d)d*d  ejd+ed,d-d' e Zeee ee dS dS ).    N)Path)List)
BERTScorerload_dataset)tqdmapi_urlreturnc                 C   s$   t dd u rdt jd< tj| dS )NOPENAI_API_KEYEMPTY)base_url)osgetenvenvironopenaiAsyncOpenAI)r    r   K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/eval/loogle_eval.py
get_client   s   
r   c                   C   s   t ddddS )Nzbigai-nlco/LooGLE
longdep_qatest)splitr   r   r   r   r   get_dataset   s   r   clientcontextquestion	semaphoreindexmodel
output_dirc                    sR  |d| d }|  rd S d| d| d}dddd	|dg}	|4 I d H X z| jjj||	d
ddI d H }
W n< tjyu } z/t|d}tdt	|i| W d    n1 s\w   Y  W Y d }~W d   I d H  d S d }~ww W d   I d H  n1 I d H sw   Y  t|d}t|
| W d    d S 1 sw   Y  d S )N	response_.pklz:Please answer the question based on the long texts below.
z
Question: z
Answer:systemzYou are a helpful assistant.)rolecontentuserg        i   )r   messagestemperature
max_tokenswberror)
existschatcompletionscreater   BadRequestErroropenpickledumpstr)r   r   r   r   r   r   r   output_filepromptr&   responseefr   r   r   fetch_response   sD   	
("r9   c           	         s   t  }t| j}|jddd t| j}t| j}g }t	|D ] \}}|| j
kr, n|tt||d |d ||| j| q!tt|t|ddD ]}|I d H  qNd S )NT)parentsexist_okr   r   zRunning benchmark)totaldesc)r   r   r   mkdirr   r   asyncio	Semaphoremax_concurrency	enumeratenum_promptsappendcreate_taskr9   r   r   as_completedlen)	argsdatasetr   r   r   tasksidxex_r   r   r   	benchmarkB   s8   



rN   c                 C   sl  t  }t| j}tj rdnd}td|d}g }g }tt|ddD ]@\}}|| j	kr. n6|d| d }	|	
 s>t|	tt|	d	}
t|
trPd
|
v rPq#||
jd jj  ||d  q#|sltd d S d}g }ttdt||ddD ]'}||||  }||||  }|j||dd\}}}|dd |D  q|t|t| }td|d d S )Ncudacpuen)langdevicezLoading responses)r=   r    r!   rbr*   r   answerzNo valid responses to score!@   zScoring batchesF)verbosec                 S   s   g | ]}t |qS r   )float).0xr   r   r   
<listcomp>   s    zanalyse.<locals>.<listcomp>zAverage BERTScore (F1): z.2%)r   r   r   torchrO   is_availabler   rB   r   rC   r+   FileNotFoundErrorr1   loadr0   
isinstancedictrD   choicesmessager$   stripprintrangerG   scoreextendsum)rH   rI   r   rS   scorerhypsrefsrK   rL   pkl_filer6   
batch_sizeall_f1ih_batchr_batchrM   	f1_scoresavgr   r   r   analyseb   s:   

ru   __main__z'Run benchmark and evaluation in one go.)descriptionz	--api-urlzhttp://127.0.0.1:30000/v1u    OpenAI‑compatible API base URL)defaulthelpz--modelz-meta-llama/Llama-4-Maverick-17B-128E-Instructz*Model name or ID, only used for model namez--max-concurrency   zMaximum concurrent requests)typerx   ry   z--output-dirztmp-output-dirzDirectory for cached responsesz--num-promptsi'  zNumber of prompts to run)argparser?   r   r1   pathlibr   typingr   r   r\   
bert_scorer   datasetsr   r   r3   r   r   r   r@   intr9   rN   ru   __name__ArgumentParserparseradd_argument
parse_argsrH   runr   r   r   r   <module>   sr    
) '