o
    پia                     @   sx   d Z ddlZddlmZ ddlmZ ddlmZ ddlZ	ddl
Z
ddlZedeeef dB fddZG d	d
 d
ZdS )zc
This module provides a mixin class for running lm-eval harness evaluations
against SGLang servers
    N)contextmanager)Path)Anynew_envc                 c   s    | sdV  dS i }g }zB|   D ]\}}|tjv r"tj| ||< n|| t|tj|< qdV  W |  D ]	\}}|tj|< q7|D ]	}tj|d qCdS |  D ]	\}}|tj|< qS|D ]	}tj|d q_w )z9Context manager to temporarily set environment variables.N)itemsosenvironappendstrpop)r   
old_valuesnew_keyskeyvalue r   P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/kits/lm_eval_kit.pyscoped_env_vars   s.   

r   c                   @   s\   e Zd ZU dZg Zee ed< dZeed< dZ	e
ed< dd Zd	eeef d
efddZdS )LMEvalMixinz>
    Mixin class for running lm-eval harness evaluations.
    
other_args model_config_nameg{Gz?default_rtolc           	      C   s   t | jd  tt| jjdd}| |}|d| j	}d}|d D ];}|d D ]4}|d }|d	 |d
  |d
  }t
|d
  d|d
  d|dd|dd| 	 |o_tj|||d}q,q&| |d dS )z,Run lm-eval evaluation and validate results.z/flush_cachezutf-8)encodingrtolTtasksmetricsr   resultsnamez | z: ground_truth=z.3fz | measured=z | rtol=)r   zlm-eval validation failedN)requestsgetbase_urlyaml	safe_loadr   r   	read_textlaunch_lm_evalr   printnpisclose
assertTrue)	selfeval_configr   r   successtaskmetricground_truthmeasured_valuer   r   r   test_lm_eval2   s2   
zLMEvalMixin.test_lm_evalr*   returnc           	      C   s   ddl }|dd}|dd}|dd}|d	 | jd
 |d}|dd}t|1 |j||dd |d D |dd|dd|dd|dd|d|d	}W d   |S 1 s_w   Y  |S )zf
        Args:
            eval_config: Configuration dictionary with model and task settings
        r   N
batch_sizeautobackendzlocal-completionsnum_concurrent   
model_namez/v1/completions)modelr    r5   env_varsc                 S   s   g | ]}|d  qS )r   r   ).0r,   r   r   r   
<listcomp>d   s    z.LMEvalMixin.launch_lm_eval.<locals>.<listcomp>r   num_fewshotlimitapply_chat_templateFfewshot_as_multiturn
gen_kwargs)	r8   
model_argsr   r<   r=   r>   r?   r@   r2   )lm_evalr   r    r   simple_evaluate)	r)   r*   rB   r2   r4   r5   rA   r9   r   r   r   r   r$   N   s2   





zLMEvalMixin.launch_lm_evalN)__name__
__module____qualname____doc__r   listr
   __annotations__r   r   floatr0   dictr   r$   r   r   r   r   r   )   s   
 r   )rG   r   
contextlibr   pathlibr   typingr   numpyr&   r   r!   rK   r
   r   r   r   r   r   r   <module>   s    