o
    7ti+                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlZdd eej	D 
ejZg dZdgZdd	 Zd
d Zdd Zdd Zdd Zdd ZdejdejfddZdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' ZdS )(z
Code based on Official evaluation script for the MLQA dataset.
Repo: https://github.com/facebookresearch/MLQA/blob/main/mlqa_evaluation_v1.py
    N)Counterc                 C   s(   h | ]}t t|d rt|qS )P)unicodedatacategorychr
startswith).0i r
   L/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/tasks/mlqa/utils.py	<setcomp>   s    r   )eneshividearzhc                 C      |   S N)splittextr
   r
   r   whitespace_tokenize      r   c                 C   sv   g }d}| D ]%}t d|s|tv r'|dkr!t|}|| d}|| q||7 }q|dkr9t|}|| |S )N z[\u4e00-\u9fa5])researchPUNCTr   extendappend)r   segs_outtemp_strcharssr
   r
   r   mixed_segmentation   s   


r%   c                 C   s8   dd }dd }dd }dd }||||| ||S )	zALower text and remove punctuation, articles and extra whitespace.c                 S   s   |dkrt dd| S |dkrt dd| S |dkr| S |dkr't dd| S |d	kr2t d
d| S |dkr=t dd| S |dkrC| S td|)Nr   z\b(a|an|the)\b r   z$\b(un|una|unos|unas|el|la|los|las)\br   r   u$   \b(của|là|cái|chiếc|những)\br   z>\b(ein|eine|einen|einem|eines|einer|der|die|das|den|dem|des)\br   u   \sال^|الr   Unknown Language {})r   sub	Exceptionformat)r   langr
   r
   r   remove_articles3   s&   z)normalize_answer.<locals>.remove_articlesc                 S   sF   |t v r	t| }n|tv rt| }ntd|ddd |D S )Nr'   r&   c                 S   s   g | ]
}|  d kr|qS )r   )strip)r   tr
   r
   r   
<listcomp>P   s    z=normalize_answer.<locals>.white_space_fix.<locals>.<listcomp>)WHITESPACE_LANGSr   MIXED_SEGMENTATION_LANGSr%   r)   r*   join)r   r+   tokensr
   r
   r   white_space_fixI   s   

z)normalize_answer.<locals>.white_space_fixc                 S   s   d dd | D S )Nr   c                 s   s    | ]	}|t vr|V  qd S r   )r   )r   chr
   r
   r   	<genexpr>S   s    z8normalize_answer.<locals>.remove_punc.<locals>.<genexpr>)r2   r   r
   r
   r   remove_puncR      z%normalize_answer.<locals>.remove_puncc                 S   r   r   )lowerr   r
   r
   r   r9   U   r   znormalize_answer.<locals>.lowerr
   )sr+   r,   r4   r7   r9   r
   r
   r   normalize_answer0   s
   	r;   c           
      C   s|   t | | }t || }t|t|@ }t| }|dkr"dS d| t| }d| t| }d| | ||  }	|	S )Nr   g      ?   )r;   r   r   sumvalueslen)

predictionground_truthr+   prediction_tokensground_truth_tokenscommonnum_same	precisionrecallf1r
   r
   r   f1_score[   s   rI   c                 C   s   t | |t ||kS r   )r;   )r@   rA   r+   r
   r
   r   exact_match_scoreh   r8   rJ   c                 C   s,   g }|D ]}| |||}| | qt|S r   )r    max)	metric_fnr@   ground_truthsr+   scores_for_ground_truthsrA   scorer
   r
   r   metric_max_over_ground_truthsl   s
   rP   datasetreturnc                 C   s   dd }|  |S )Nc                 S   s    | d | d | d d d}|S )Ncontextquestionanswersr   )rS   rT   rU   r
   )docout_docr
   r
   r   _process_docu   s
   
z"process_docs.<locals>._process_doc)map)rQ   rX   r
   r
   r   process_docst   s   
rZ   c                 C   s:   | d }|d   }tt|||}tt|||}||dS )NrU   r   )exact_matchrH   )r-   rP   rJ   rI   )rV   resultsr+   rM   r@   r[   rH   r
   r
   r   process_results_lang   s   
r]   c                 C      t | |dS )Nr   r]   rV   r\   r
   r
   r   process_results_en      ra   c                 C   r^   )Nr   r_   r`   r
   r
   r   process_results_es   rb   rc   c                 C   r^   )Nr   r_   r`   r
   r
   r   process_results_hi   rb   rd   c                 C   r^   )Nr   r_   r`   r
   r
   r   process_results_vi   rb   re   c                 C   r^   )Nr   r_   r`   r
   r
   r   process_results_de   rb   rf   c                 C   r^   )Nr   r_   r`   r
   r
   r   process_results_ar   rb   rg   c                 C   r^   )Nr   r_   r`   r
   r
   r   process_results_zh   rb   rh   )__doc__r   stringsysr   collectionsr   datasetsrange
maxunicodeunionpunctuationr   r0   r1   r   r%   r;   rI   rJ   rP   DatasetrZ   r]   ra   rc   rd   re   rf   rg   rh   r
   r
   r
   r   <module>   s:    +