o
    5tiJ                  	   @   s`  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZmZ edZe eZeddd Zed	d
d	 Zeddd Zeddd Zeddd Zeddd Zeddd Zeddd Zeddd Z eddd Z!eddd Z"eddd Z#ed d!d  Z$ed d"d#gd d$d%d& Z%ed'd(d)d#gdd$d*d+ Z&ed,d(d)d#gdd$d-d. Z'ed/d(d#dd$d0d1 Z(ed2d(d)d#gdd$d3d4 Z)		"	"	"dd5d6Z*ed7d(d8dd$d9d: Z+edd"d)dd$d;d< Z,ed=d(d#dd$d>d? Z-ed@d"dAdd$dBdC Z.edDd"dAdd$dEdF Z/edd"dAdd$dGdH Z0dIdJ Z1dKee dLe2fdMdNZ3dOdP Z4edd(g dQdd$dRd Z5edSd(d#dd$dTdU Z6edd(d#dd$dVdW Z7edd(d8dd$dXdY Z8edd(d8dd$dZd[ Z9edd(d8dd$d\d] Z:ed^d(d)dd$d_d^ Z;d`da Z<dbdc Z=ddde Z>dfdg Z?dhdi Z@G djdk dkZAdle	ee ge2f dmee dneBdLeCe2 fdodpZDdle	ee ge2f dmee dneBdLe2fdqdrZEdse	ee ge2f dteBdLee	ee ge2f  fdudvZFdwe
e2 dxe
eB fdydzZGddwe
e2 dxe
eB fd{d|ZHdd}d~ZIdS )    N)Iterable)CallableListOptionalSequenceTypeVar)register_aggregationregister_metricTbypassc                 C   s   dS )Ni   arrr   r   G/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/api/metrics.py
bypass_agg   s   r   nanmeanc                 C   s*   t | dkstt| rtjS t| S )Nr   )lenallnpisnannanr   r   r   r   r   r      s   
meanc                 C   s   t | t|  S N)sumr   r   r   r   r   r   "      medianc                 C   s   | t | d  S N   )r   r   r   r   r   r   '   r   
perplexityc                 C      t t|  S r   )mathexpr   itemsr   r   r   r   .   r   weighted_perplexityc                 C   r   r   )r    r!   weighted_meanr"   r   r   r   r$   3   r   bits_per_bytec                 C   s   t |  td S r   )r%   r    logr"   r   r   r   r&   8   s   f1c                 C   s<   ddl m} tt|  }|d }|d }|||}t|S )Nr   )f1_score   )sklearn.metricsr)   listzipr   max)r#   r)   unzipped_listgoldspredsfscorer   r   r   r)   =   s   

r)   matthews_corrcoefc                 C   s2   ddl m} tt|  }|d }|d }|||S )Nr   )r3   r*   )r+   r3   r,   r-   )r#   r3   r/   r0   r1   r   r   r   r3   I   s
   
bleuc                 C   <   t t|  d }t t|  d }t||\}}t||jS )a#  The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
    for evaluating a generated sentence to a reference sentence. It counts matching
    n-grams in the candidate translation to n-grams in the reference text, where
    1-gram or unigram would be each token and a bigram comparison would be each
    word pair. The comparison is made regardless of word order
    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
    Paper: https://www.aclweb.org/anthology/P02-1040/

    Higher is better
    r   r*   )r,   r-   _sacreformat	sacrebleucorpus_bleuscorer#   refsr1   r   r   r   r4   S   s   chrfc                 C   r5   )a(  chrF++ is a tool for automatic evaluation of machine translation output
    based on character n-gram precision and recall enhanced with word n-grams.
    Source: https://github.com/m-popovic/chrF
    Paper: https://www.aclweb.org/anthology/W15-3049.pdf

    Higher is better  # TODO I think
    r   r*   )r,   r-   r6   r7   corpus_chrfr9   r:   r   r   r   r<   e   s   	terc                 C   r5   )a,  Translation Error Rate is an error metric for machine translation that
    measures the number of edits required to change a system output into one
    of the references
    Source: http://www.cs.umd.edu/~snover/tercom/
    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf

    Lower is better
    r   r*   )r,   r-   r6   r7   
corpus_terr9   r:   r   r   r   r>   t   s   
brier_scorec                 C   sR   t t|  \}}t|j\}}t |}t|| }ttj|| d ddS )Nr   r*   )axis)r,   r-   r   arrayshapeeyer   r   )r#   goldpredictionsbs	num_classgold_one_hotr   r   r   r@      s
   Fmultiple_choice)metrichigher_is_betteroutput_typeaggregationc                 C      | S r   r   r"   r   r   r   brier_score_fn      rP   accTloglikelihoodc                 C   rO   r   r   r"   r   r   r   acc_fn   rQ   rT   acc_normc                 C   rO   r   r   r"   r   r   r   acc_norm_fn   rQ   rV   acc_mutual_infoc                 C   rO   r   r   r"   r   r   r   acc_mutual_info_fn   rQ   rX   	acc_bytesc                 C   rO   r   r   r"   r   r   r   acc_bytes_fn   rQ   rZ   c                    s   |d ur"|D ] t  fdd| D } t  fdd|D }qn
t | } t |}|r:t j| } t j|}|rUtjddtj}t jj| |d} t jj||d}|rptj	ddtj	}t jj| |d} t jj||d}| |k}dt 
|iS )Nc                       g | ]	}t  d |qS  resub.0xsr   r   
<listcomp>       z+exact_match_hf_evaluate.<locals>.<listcomp>c                    r[   r\   r^   ra   rd   r   r   rf      rg   r]   )tableexact_match)r   rB   asarraycharlowerstringpunctuation	maketrans	translatedigitsr   )rF   
referencesregexes_to_ignoreignore_caseignore_punctuationignore_numbers
repl_table
score_listr   rd   r   exact_match_hf_evaluate   s(   

ry   ri   generate_untilc                  K   s   t di | S )Nr   )ry   )kwargsr   r   r   exact_match_fn   s   r|   c                 C   rO   r   r   r"   r   r   r   perplexity_fn  rQ   r}   
likelihoodc                 C   rO   r   r   r"   r   r   r   likelihood_fn  rQ   r   word_perplexityloglikelihood_rollingc                 C   rO   r   r   r"   r   r   r   word_perplexity_fn  rQ   r   byte_perplexityc                 C   rO   r   r   r"   r   r   r   byte_perplexity_fn   rQ   r   c                 C   rO   r   r   r"   r   r   r   bits_per_byte_fn*  rQ   r   c                    s,   t |  tt fdd| D t|  S )Nc                       g | ]}|  d  qS r   r   ra   mur   r   rf   6      zpop_stddev.<locals>.<listcomp>r   r    sqrtr   r   r   r   r   r   
pop_stddev4  s   $r   r   returnc                    s0   t |  tt fdd| D t| d  S )Nc                    r   r   r   ra   r   r   r   rf   ;  r   z!sample_stddev.<locals>.<listcomp>r*   r   r   r   r   r   sample_stddev9  s   (r   c                 C   s   t | tt|  S r   )r   r    r   r   r   r   r   r   mean_stderr>     r   )rS   rJ   rz   c                 C   s   d S r   r   r"   r   r   r   r   B  rQ   mccc                 C   rO   r   r   r"   r   r   r   mcc_fnL  rQ   r   c                 C   rO   r   r   r"   r   r   r   f1_fnV  rQ   r   c                 C   rO   r   r   r"   r   r   r   bleu_fn`  rQ   r   c                 C   rO   r   r   r"   r   r   r   chrf_fnj  rQ   r   c                 C   rO   r   r   r"   r   r   r   ter_fnt  rQ   r   acc_allc           
      C   s   i }t t|  d }t t|  d }t||D ]-\}}|d d }|d d }||f|vr3g |||f< |d dk}|||f ||k qtdd | D }	|	S )	Nr   r*   idx	paragraphquestionlabelc                 S      g | ]}t t|qS r   intr   ra   r   r   r   rf     r   zacc_all.<locals>.<listcomp>)r,   r-   appendr   r   values)
r#   question_scoring_dictr1   docsdocpredparagraph_idquestion_id
gold_labelrR   r   r   r   r   ~  s   c           	      C   s   i }t t|  d }t t|  d }t||D ]!\}}|d d }||vr)g ||< |d dk}|| ||k qtdd | D }|S )Nr   r*   r   r   r   c                 S   r   r   r   ra   r   r   r   rf     r   z"acc_all_stderr.<locals>.<listcomp>)r,   r-   r   r   r   )	r#   r   r1   r   r   r   r   r   rR   r   r   r   acc_all_stderr  s   r   c                 C   s*   g }|D ]}| ||}| | qt|S )z<Compute max metric between prediction and each ground truth.)r   r.   )	metric_fn
predictionground_truthsscores_for_ground_truthsground_truthr9   r   r   r   metric_max_over_ground_truths  s
   
r   c                 C   s   t |  \}}t|t| S r   )r-   r   )r#   abr   r   r   r%     s   r%   c                 C   s   t | to
t | t S r   )
isinstancer   str)objr   r   r   is_non_str_iterable  r   r   c                 C   s   t | st| } t | d sdd | D } tt|  } t |s#t|}t |d rAt|d dks:J d|d  dd |D }| |fS )zMFormat refs and preds for sacrebleu corpus calculation. It is very particularr   c                 S   s   g | ]}|gqS r   r   )rb   refr   r   r   rf     s    z _sacreformat.<locals>.<listcomp>r*   zPred must be a str, was c                 S   s   g | ]}|d  qS )r   r   )rb   r   r   r   r   rf         )r   r,   r-   r   )r;   r1   r   r   r   r6     s   "r6   c                   @   sV   e Zd ZdZdeee gef deddfddZ	de
eee f dee fd	d
ZdS )_bootstrap_internaluh   
    Pool worker: `(i, xs)` → `n` bootstrap replicates
    of `f(xs)`using a RNG seeded with `i`.
    fnr   Nc                 C   s   || _ || _d S r   )r   r   )selfr   r   r   r   r   __init__  s   
z_bootstrap_internal.__init__vc              	   C   sP   |\}}t  }|| g }t| jD ]}|| |j|t|d q|S )Nk)	randomRandomseedranger   r   r   choicesr   )r   r   ixsrndres_r   r   r   __call__  s   
 z_bootstrap_internal.__call__)__name__
__module____qualname____doc__r   r   r
   floatr   r   tupler,   r   r   r   r   r   r     s    $&r   r   r   itersc           	   	   C   sx   g }t d|}ddlm} td| j  |t|| D ]}t|}t|D ]}|| |j|t	|d q(q|S )uz   
    Single-process fallback: compute `iters` bootstrap replicates
    of statistic`f(xs)`, chunked (≤ 1000 draws).
      r   tqdmzbootstrapping for stddev: r   )
minr   printr   r   r   r   r   r   r   )	r   r   r   r   
chunk_sizer   r   r   r   r   r   r   _bootstrap_internal_no_mp  s   

r   c           	   	      s   t ds\ddl}g }td|}ddlm} td| j || -}||	t
| | fddt|| D || d	D ]}|| q>W d   t|S 1 sSw   Y  t|S t|  |}t|S )
u   
    Bootstrap estimate of the standard error of statistic `f(xs)`
    using up to `iters` resamples, chunked (≤ 1000 draws)

    Executes in parallel unless the env-var `DISABLE_MULTIPROC` is set;
    DISABLE_MULTIPROCr   Nr   r   zbootstrapping for stddev:c                    s   g | ]}| fqS r   r   )rb   r   r   r   r   rf     r   z$bootstrap_stderr.<locals>.<listcomp>)total)osgetenvmultiprocessingr   r   r   r   Pool	cpu_countimapr   r   extendr   r   )	r   r   r   mpr   r   r   pool	bootstrapr   r   r   bootstrap_stderr  s,   
	


r   rK   bootstrap_itersc                    sN    dkrdS t tttttttg}|v r fddS tt	t
ti}|dS )aG  
    Return a function that estimates the standard error of `metric(xs)`.

    * If `bootstrap_iters > 0` and the metric is in the pre-approved
      bootstrappable list, use `bootstrap_stderr` with that many draws.
    * If the metric has a closed-form SE (e.g. `mean`, `acc_all`), use it.
    * Otherwise, return `None`.
    r   Nc                    s   t |  dS )N)r   )r   )rc   r   rK   r   r   <lambda>G  s    z#stderr_for_metric.<locals>.<lambda>)r   r3   r)   r   r4   r<   r>   r   r   r   r   r   get)rK   r   bootstrappablestderrr   r   r   stderr_for_metric+  s   r   stderrssizesc                 C   sN   t | t |ks
J tdd t|| D t|t |  }t|t| S )Nc                 S   s$   g | ]\}}|d  |d  | qS )r*   r   r   )rb   sizer   r   r   r   rf   Z  s   $ z(pooled_sample_stderr.<locals>.<listcomp>)r   r   r-   r   r   )r   r   pooled_sample_varr   r   r   pooled_sample_stderrN  s
   r   c           	      C   s   |d usJ dt | t |krt |t |ksJ | d d }|d }|d }t| dd  |dd  |dd  D ]9\}}}|| ||  ||  }|d | |d |d   || d  || || || d   || d   }q:t|S )NzHNeed to pass a list of each subtask's metric for this stderr aggregationr   r   r*   )r   r-   r   r   )	r   r   metricsvariance	curr_size
curr_scorer   r   r9   r   r   r   combined_sample_stderr`  s(   
$
.

r   c                 C   sF   |s	dgt | }t | t |ksJ tdd t| |D t| S )Nr*   c                 S   s   g | ]\}}|| qS r   r   )rb   rK   r   r   r   r   rf     r   z-aggregate_subtask_metrics.<locals>.<listcomp>)r   r   r-   )r   r   weight_by_sizer   r   r   aggregate_subtask_metrics  s    r   )NFFFr   )T)Jloggingr    r   r   r_   rm   collections.abcr   typingr   r   r   r   r   numpyr   r7   lm_eval.api.registryr   r	   r
   	getLoggerr   eval_loggerr   r   r   r   r   r$   r&   r)   r3   r4   r<   r>   r@   rP   rT   rV   rX   rZ   ry   r|   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r%   r   r6   r   r   r,   r   r   r   r   r   r   r   r   r   r   <module>   s   









	



	





&












	

'
# 