o
    6ti'                     @   s   d dl Z d dlmZ d dlZd dlZd dlZd dlZd dlm	  m
  mZ d dlmZmZ d dlmZ dd Zdd Zd	d
 Zdd Zdd Zdd Zdd ZdejdejfddZdd Zdd Zdd Zdd Zdd  ZdS )!    N)product)rouge_scorerscoring)general_detokenizec                 C   s   | d   | dd   S )Nr      )lower)text r	   V/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/tasks/galician_bench/utils.pylowercase_first_letter   s   r   c                 C      dd }|  |S )Nc                 S   s0   t dd| d | d< t dd| d | d< | S )Nz + r   summary)resubdocr	   r	   r
   _process_doc   s   z+process_summarization.<locals>._process_docmap)datasetr   r	   r	   r
   process_summarization   s   
r   c                    R   g   fdd} g krt  }td| dt |  d   | dd |S )Nc                       | d dvr;| d dvr;t | d  | d< t | d  | d< | d dr1| d d d | d< t| d | d< | S  |  | S )NFraseN 
   Paráfrase.,;r   stripendswithr   appendr   
empty_docsr	   r
   r         
z.process_docs_paraphrases.<locals>._process_docFound  empty documents out of the  total docs in the dataset: c                 S      | d dvo| d dvS )Nr   r   r   r	   r   r	   r	   r
   <lambda>3       z*process_docs_paraphrases.<locals>.<lambda>lenprintfilterr   r   r   len_empty_docsr	   r'   r
   process_docs_paraphrases      r6   c                    r   )Nc                    r   )N	sentence1r   	sentence2r   r"   r#   r   r'   r	   r
   r   ;   r)   z'process_docs_paws.<locals>._process_docr*   r+   r,   c                 S   r-   )Nr8   r   r9   r	   r   r	   r	   r
   r.   O   r/   z#process_docs_paws.<locals>.<lambda>r0   r4   r	   r'   r
   process_docs_paws8   r7   r:   c                 C   s   | S )z&
    # passthrough for efficiency
    r	   )itemsr	   r	   r
   rouge1T   s   r<   c                 C   s<   t t|  d }t t|  d }td}|j||dd S )z
    Higher is better
    r   r   rouge)predictions
referencesr<   )listzipevaluateloadcompute)r;   refspredsr   r	   r	   r
   
rouge1_agg[   s   
rG   c           	      C   s~   t | \}}t| d d d}|d | ||d  }}tt|tt|}}|t|t|  }dt|iS )Nmc2_targetslabelsr   acc)rA   r@   indexnpexparraysum)	r   resultslls	is_greedy	split_idxll_truell_falsep_truep_falser	   r	   r
   process_results_mc2f   s   "rX   r   returnc                 C   s
   |  tS )N)r   preprocess_function_gen)r   r	   r	   r
   process_docs_gens   s   
r[   c                 C   sF   dd }|| d }|| d }d|vr| d | d  ||dS )Nc                 S   sH   g }| D ]}|  }t|r!|d dkr||d  q|| q|S )Nr"   r   )r$   r1   r&   )answersformatted_answersanswerr	   r	   r
   _format_answersx   s   
z0preprocess_function_gen.<locals>._format_answersincorrect_answerscorrect_answersu   Non teño ningún comentario.question)rb   ra   r`   )r&   r$   )examplesr_   r`   ra   r	   r	   r
   rZ   w   s   

rZ   c                 C   r   )Nc                 S   s   t | d  | d< t | d  | d< | d dr#| d d d n| d | d< t| d | d< | d ds>| d d n| d | d< dddd	}|| d
  | d
< | S )Nr8   r9   )r   r    !?r"   r   r   r      )
entailmentneutralcontradiction
gold_label)r   r$   r%   r   )r   label_to_intr	   r	   r
   
process_fn   s   z#process_doc_nli.<locals>.process_fnr   )r   rl   r	   r	   r
   process_doc_nli   s   
rm   c                    s  |d  | d | d }}|| } fdd|D }t |d t| }t |t|d  }|}|| }	t||k}
 fdd|D }dd |D }t |d t| }t |t|d  }|}|| }t||k}dd |D }t |d t| }t |t|d  }|}|| }t||k}d	d |D }t |d t| }t |t|d  }|}|| }t||k}||
|	|||||||||d
S )Nr   ra   r`   c                    s   g | ]
}t |gg gqS r	   )bleu.0ref
completionr	   r
   
<listcomp>   s    z'process_results_gen.<locals>.<listcomp>c                    s   g | ]	}t |g gqS r	   )r=   ro   rr   r	   r
   rt      s    c                 S      g | ]}|d  qS )r<   r	   rp   scorer	   r	   r
   rt          c                 S   ru   )rouge2r	   rv   r	   r	   r
   rt      rx   c                 S   ru   )	rougeLsumr	   rv   r	   r	   r
   rt      rx   )bleu_maxbleu_acc	bleu_diff
rouge1_max
rouge1_accrouge1_diff
rouge2_max
rouge2_accrouge2_diff
rougeL_max
rougeL_accrougeL_diff)rL   nanmaxr1   int)r   rP   	true_refs
false_refsall_refsbleu_scoresbleu_correctbleu_incorrectr{   r}   r|   rouge_scoresrouge1_scoresrouge1_correctrouge1_incorrectr~   r   r   rouge2_scoresrouge2_correctrouge2_incorrectr   r   r   rougeL_scoresrougeL_correctrougeL_incorrectr   r   r   r	   rr   r
   process_results_gen   sR   r   c              
   C   s    t j|| dddddddj}|S )aW  
    Returns `t5` style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    rM   g        Fintl)smooth_methodsmooth_valueforce	lowercasetokenizeuse_effective_order)	sacrebleucorpus_bleurw   )rE   rF   rw   r	   r	   r
   rn      s   
	
rn   c                    sr   g d}t |}dd }t }t| |D ]\}}||}||}|||| q|   fdd|D S )aN  
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`.
    )r<   ry   rz   c                 S   s   |  dd} | S )Nz . z.
)replace)r   r	   r	   r
   _prepare_summary  s   zrouge.<locals>._prepare_summaryc                    s   i | ]}| | j jd  qS )d   )midfmeasure)rp   typeresultr	   r
   
<dictcomp>!  s    zrouge.<locals>.<dictcomp>)r   RougeScorerr   BootstrapAggregatorrA   
add_scoresrw   	aggregate)rE   rF   rouge_typesscorerr   
aggregatorrq   predr	   r   r
   r=     s   

r=   )r   	itertoolsr   datasetsrB   numpyrL   r   'transformers.data.metrics.squad_metricsdatametricssquad_metricsrouge_scorer   r   lm_eval.utilsr   r   r   r6   r:   r<   rG   rX   Datasetr[   rZ   rm   r   rn   r=   r	   r	   r	   r
   <module>   s,    
G