o
    6ti                     @   s  d dl Z d dlmZmZ d dlmZ d dlmZmZm	Z	 d dl
Z
z
d dlZe ZW n ey7   ed Y nw eG dd dZdeee
jf fd	d
ZdedefddZde	de	e defddZdededefddZdededefddZdededefddZde	e dedeeef fddZde	e deeef fddZde	e deeef fdd Zde	e deeef fd!d"ZdS )#    N)Counterdefaultdict)	dataclass)CallableDictListzOCan not import pymorphy3. If you try to score libra, do `pip install pymorphy3`c                   @   s*   e Zd ZU eed< ee ed< eed< dS )PredictionResultpred_answeranswerslengthN)__name__
__module____qualname__str__annotations__r    r   r   M/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/tasks/libra/utils.pyr      s   
 r   returnc                     s   | dg | dd}| dd}| dd | dd}ttj|||d	d
 j } |vr;td  d| rStj|||d	d
  fdd}d
|iS tj|||d	d
 }d
|iS )zFilter dataset by page lengths for Libra task.

    in CLI metadata --metadata '{"valid_pages": ["8p", "32p"], "dataset_repo_name": "ai-forever/LIBRA"}'
    valid_pagesdataset_repo_namezai-forever/LIBRAdataset_nameNfilter_colnamer   token)r   testzColumn z not found in dataset c                    s   |   v S N)get)docr   r   r   r   <lambda>1   s    z0filter_dataset_by_page_lengths.<locals>.<lambda>)r   listdatasetsload_datasetfeatureskeys
ValueErrorfilter)argskwargsr   r   r   dataset_columnsdataset_filteredr   r   r   filter_dataset_by_page_lengths   s8   r*   sentencec                 C   sT   t | } g }|  D ]}tdd| }t|d j }|| q
d	|S )a@  Normalize an input sentence by removing punctuation and converting words to their base (lemmatized) form.
    :param sentence: str
        Input sentence.
    :return: str
        A normalized sentence where:
        - All characters except letters, digits, and underscores are removed.
        - All words are converted to lowercase.
        - Words are lemmatized using `normalizer`.
    :raises ValueError:
        If `sentence` is not a string.
    :example:
    >>> normalize_answer("Hello, world! This is a test sentence.")
    'hello world this is a test sentence'
    u   [^a-zа-яй0-9_]+ r    )
r   splitresublower
normalizerparsenormal_formappendjoin)r+   new_sentencewordr   r   r   r   normalize_answer9   s   
r9   r   resultsc                 C   s$   |d }|| d | d d}d|iS )a  Processes evaluation results by extracting prediction and relevant metadata.

    :param doc: A single instance from the evaluation dataset, containing reference answers and metadata.
    :param results: A list containing the predicted answer(s). The first element is used as the main prediction.
    :return: A dictionary where the key is the metric name ("libra_score") and the value is a dictionary
             with the predicted answer, reference answers, and context length.
    r   positive_outputsr   )r	   r
   r   libra_scorer   )r   r:   
prediction	data_dictr   r   r   process_resultsQ   s   r?   r=   ground_truthc                 C   s   d}t |t | v rd}|S )N              ?r9   )r=   r@   resultr   r   r   exact_match_scored   s   rE   c                 C   s`   t | t |@ }t| }|dkrdS d| t|  }d| t| }d| | ||  }|S )Nr   rB      )r   sumvalueslen)r=   r@   commonnum_same	precisionrecallf1r   r   r   f1_scorek   s   rO   c                 C   sV   t d| }d}|D ]}t|t|kr|d7 }q
t|dkr!dn|t| }t|S )Nz\d+r      rA   )r/   findallr   rI   float)r=   r@   numbers	right_numnumberfinal_scorer   r   r   count_scorev   s   rW   scoring_functionc           	      C   s   t dd }| D ]<}|d }t|d }tdd |d D }|| d  d7  < |D ]}|||d	}|d
krC|| d
  |7  <  nq+qdd | D S )a9  Aggregates score by 'length' by scoring_function.

    :param results: List of dictionaries containing 'pred_answer', 'answers', and 'length'.
    :return: Dictionary with 'length' as keys and average score as values.

    :example:
    >>> results = [
    ...     {"pred_answer": "1", "answers": ["1", "one"], "length": "8p"},
    ...     {"pred_answer": "0", "answers": ["zero", "none"], "length": "8p"},
    ...     {"pred_answer": "one", "answers": ["1", "one"], "length": "16p"}
    ... ]
    >>> aggregate_results(results=results)
    {'8p': 0.5, '16p': 1.0}
    c                   S   s   ddgS )Nr   r   r   r   r   r   r      s    z#aggregate_results.<locals>.<lambda>r   r	   c                 S   s   g | ]}t |qS r   rC   ).0textr   r   r   
<listcomp>   s    z%aggregate_results.<locals>.<listcomp>r
   rP   )r=   r@   r   c                 S   s   i | ]\}\}}||| qS r   r   )rY   keycorrecttotalr   r   r   
<dictcomp>   s    z%aggregate_results.<locals>.<dictcomp>)r   r9   setitems)	r:   rX   scoresrD   r   r	   r
   answermetricr   r   r   aggregate_results   s   re   c                 C   
   t | tS r   )re   rE   r:   r   r   r   aggregate_results_em      
rh   c                 C   rf   r   )re   rO   rg   r   r   r   aggregate_results_f1   ri   rj   c                 C   rf   r   )re   rW   rg   r   r   r   aggregate_results_count_score   ri   rk   )r/   collectionsr   r   dataclassesr   typingr   r   r   r    	pymorphy3MorphAnalyzerr2   ImportErrorprintr   r   Datasetr*   r9   r?   rR   rE   rO   rW   re   rh   rj   rk   r   r   r   r   <module>   s>    


!"