o
    6tiM                     @   s  d Z ddlZddlmZ ddlmZmZmZ z
ddlm	Z	 dZ
W n ey/   dZ
ed Y nw z
dd	lmZ dZW n eyI   dZed
 Y nw dedefddZdee dee deeef fddZdee dee deeef fddZdeeef dee deeef fddZdedefddZdeeef defddZdeeef dee fddZd d! Zd"edefd#d$ZdS )%zD
Utility functions for CNN/DailyMail summarization task evaluation.
    N)defaultdict)AnyDictList)rouge_scorerTFzIWarning: rouge_score not installed. Install with: pip install rouge-score)scorezGWarning: bert_score not installed. Install with: pip install bert-scoretextreturnc                 C   s   t dd| } |  } | S )z
    Normalize text for evaluation by removing extra whitespace and lowercasing.

    Args:
        text: Input text string

    Returns:
        Normalized text string
    \s+ )resubstrip)r    r   U/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/tasks/cnn_dailymail/utils.pynormalize_text   s   r   predictions
referencesc           	      C   s   t sddddS tjg ddd}g }g }g }t| |D ]*\}}t|}t|}|||}||d j ||d j ||d j q|rQt|t	| nd|r\t|t	| nd|rit|t	| dS ddS )z
    Calculate ROUGE scores for a list of predictions and references.

    Args:
        predictions: List of generated summaries
        references: List of reference summaries

    Returns:
        Dictionary with rouge1, rouge2, and rougeL scores
            rouge1rouge2rougeLTuse_stemmerr   r   r   )
ROUGE_AVAILABLEr   RougeScorerzipr   r   appendfmeasuresumlen)	r   r   scorerrouge1_scoresrouge2_scoresrougeL_scorespredrefscoresr   r   r   calculate_rouge_scores,   s&   r)   c                 C   sh   t sddddS dd | D } dd |D }t| |dddd	\}}}|  |  |  dS )
z
    Calculate BERTScore for a list of predictions and references.

    Args:
        predictions: List of generated summaries
        references: List of reference summaries

    Returns:
        Dictionary with precision, recall, and F1 scores
    r   bertscore_precisionbertscore_recallbertscore_f1c                 S      g | ]}t |qS r   r   ).0pr   r   r   
<listcomp>g       z'calculate_bertscore.<locals>.<listcomp>c                 S   r.   r   r/   )r0   rr   r   r   r2   h   r3   endistilbert-base-uncasedFlang
model_typeverbose)BERTSCORE_AVAILABLE
bert_scoremeanitem)r   r   PRF1r   r   r   calculate_bertscoreR   s$   
	

rB   docresultsc                 C   s   |rt |dkrd}nt|tr|d n|}| dd}t|}t|}i }trFtjg ddd}|||}|d j	|d j	|d	 j	d}i }t
rjt|g|gd
ddd\}}	}
|d  |	d  |
d  d}t|}i ||d|iS )a]  
    Process results for a single document.

    This function is called by lm-eval-harness for each document after generation.

    Args:
        doc: The document dictionary containing 'highlights' (reference summary)
        results: List containing the generated text(s)

    Returns:
        Dictionary with metric scores for this document
    r    
highlightsr   Tr   r   r   r   r5   r6   Fr7   r*   summary_length)r!   
isinstancelistgetr   r   r   r   r   r   r;   r<   r>   calculate_summary_length)rC   rD   generated_summaryreference_summaryrouge_resultsr"   r(   bertscore_resultsr?   r@   rA   rG   r   r   r   process_results|   s>   


rP   
generationc                 C   s(   |   } tdd| } tdd| } | S )z
    Post-process the generated text to clean it up.

    Args:
        generation: Raw generated text

    Returns:
        Cleaned generated text
    z\n+r   r
   )r   r   r   )rQ   r   r   r   postprocess_generation   s   rR   c                 C   s    |  dd}t| }|dkS )z
    Filter out articles that are too long.

    Args:
        doc: Document dictionary

    Returns:
        True if document should be kept, False if it should be filtered out
    articlerE   i  )rJ   r!   split)rC   rS   
word_countr   r   r   filter_long_articles   s   
rV   c                 C   s   g S )z
    For multiple-choice format (not used in summarization, but kept for compatibility).

    Args:
        doc: Document dictionary

    Returns:
        List of choices (empty for generation tasks)
    r   )rC   r   r   r   doc_to_choice   s   
rW   c                 C   s   dd }|  |S )z
    Pre-process the entire dataset before evaluation.

    Args:
        dataset: HuggingFace dataset

    Returns:
        Processed dataset
    c                 S   sN   |  dd}t|}|  dd}t|}i | ||t| t| dS )zProcess a single document.rS   rE   rF   )rS   rF   article_lengthrG   )rJ   r   r!   rT   )rC   rS   rF   r   r   r   _process_doc   s   

z"process_docs.<locals>._process_doc)map)datasetrY   r   r   r   process_docs   s   
r\   	generatedc                 C   s   t t|  S )z
    Calculate the length of generated summary in words.

    Args:
        generated: Generated summary text

    Returns:
        Number of words in summary
    )r!   r   rT   )r]   r   r   r   rK     s   
rK   )__doc__r   collectionsr   typingr   r   r   rouge_scorer   r   ImportErrorprintr<   r   r;   strr   floatr)   rB   rP   rR   boolrV   rW   r\   intrK   r   r   r   r   <module>   sN    

&

**> 