o
    پiD                  	   @   s   d Z ddlZddlZddlmZmZ ddlmZmZm	Z	 zddl
mZ ddlmZ ddlmZ W n eefy>   ed  w dd	lmZ dd
lmZmZmZmZmZ 		ddeeef dee dedefddZG dd deZ dS )a  
HumanEval: Evaluating Large Language Models Trained on Code
Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba
https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
    N)ThreadPoolExecutoras_completed)DictListOptional)read_problemsestimate_pass_at_k)check_correctnesszD
Please install human-eval at https://github.com/openai/human-eval.
)simple_eval_common)
HTML_JINJAEval
EvalResultSamplerBaseSingleEvalResult         @samplecompletions	n_workerstimeoutc                 C   s   t |d9}g }t|D ]\}}| |||f}|jtg|R  }	||	 qg }
t|D ]}	|	 }|
| q+W d   n1 sAw   Y  dd |
D }|S )z
    Evaluates the functional correctness of generated samples, and writes
    results to f"{sample_file}_results.jsonl.gz"
    )max_workersNc                 S   s   g | ]}t |d  qS )passed)int).0r r   U/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/simple_eval_humaneval.py
<listcomp>6   s    z3evaluate_functional_correctness.<locals>.<listcomp>)r   	enumeratesubmitr
   appendr   result)r   r   r   r   executorfuturesi
completionargsfutureresultsr"   r   r   r   r   evaluate_functional_correctness    s   
r*   c                   @   sP   e Zd Zdg ddfdee dededee def
d	d
ZdedefddZ	dS )	HumanEval   )      r,   x   num_examplesnum_threadsnum_samples_per_task	ks_passesr   c                 C   s^   d| _ t | _t| j | _|| _| jr!t| j | j|| _|| _	|| _
|| _|| _d S )Nr   )seedr   exampleslistvalues_num_examplesrandomRandomr   _num_samples_per_task
_ks_passes_timeout_num_threads)selfr0   r1   r2   r3   r   r   r   r   __init__;   s   
zHumanEval.__init__samplerreturnc                    sJ   ddd  dt ttf f fdd}tj|jjd}t|S )NzRead the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.
c                 S   sR   | pd} t dt j}|| }t|dkr|d n| }||dd d  }|S )N z```python\n(.*?)```r-   r   z:
    r.   )recompileDOTALLfindalllenfind)r&   patternmatchesextracted_answerr   r   r   	find_codeR   s   
z%HumanEval.__call__.<locals>.find_coder   c                    s   j d| d  dgfddtjD }t| |}t|t| t|t| }tjt	j
t|d dd|d	gt| |d
}dd |D  }t||| fddjD dS )Nuserprompt)rolecontentc                    s   g | ]} qS r   r   )r   _)rM   prompt_messagesrA   r   r   r   b   s    
z2HumanEval.__call__.<locals>.fn.<locals>.<listcomp>r   	assistantrQ   rP   r-   )rS   next_messagescorecorrect_answerrL   c                 S   s   g | ]}t |d dqS )rT   rU   )dict)r   r&   r   r   r   r   q   s    c                    s,   i | ]}|krd | t g g|qS )zpass@r   )r   k)correcttotalr   r   
<dictcomp>x   s
    z2HumanEval.__call__.<locals>.fn.<locals>.<dictcomp>)htmlrW   convometrics)_pack_messageranger;   r*   rH   sumcommon	jinja_envfrom_stringr   renderrY   r   r<   )r   r   r)   rW   r^   r_   rM   instructionrA   r?   )r[   rS   r\   r   fn\   s:   
zHumanEval.__call__.<locals>.fn)r1   )r   strrd   map_with_progressr5   r>   aggregate_results)r?   rA   rj   r)   r   rh   r   __call__O   s    
$

zHumanEval.__call__N)
__name__
__module____qualname__r   r   r   r@   r   r   rn   r   r   r   r   r+   :   s     
r+   )r   r   )!__doc__r9   rD   concurrent.futuresr   r   typingr   r   r   human_eval.datar   human_eval.evaluationr	   human_eval.executionr
   ImportErrorModuleNotFoundErrorprintsglang.testr   rd   sglang.test.simple_eval_commonr   r   r   r   r   rk   r   floatr*   r+   r   r   r   r   <module>   s6   

