o
    8wi1                     @   s   d Z ddlZddlmZ ddlmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ ddlmZ eeZG d	d
 d
eZdejfddZ	ddeee  dejfddZdS )zUProvides a metrics class for the BERTscore metric.

Authors
* Sylvain de Langen 2024
    N)defaultdict)IterableOptional)TextEncoder)cosine_similarity_matrix)
get_logger)MetricStatsc                   @   sj   e Zd ZdZ				ddedededed	ef
d
dZdd Zdd Z	dddZ
dd Zdd Zdd ZdS )BERTScoreStatsa	  Computes BERTScore with a provided HuggingFace Transformers text encoder,
    using the method described in the paper
    `BERTScore: Evaluating Text Generation with BERT <https://arxiv.org/abs/1904.09675>`_.

    BERTScore operates over contextualized tokens (e.g. the output of BERT, but
    many other models would work). Since cosine similarities are used, the
    output range would be between `-1` and `1`.
    See the linked resources for more details.

    Special tokens (as queried from the tokenizer) are entirely ignored.

    Authors' reference implementation of the metric can be found
    `here <https://github.com/Tiiiger/bert_score>`_. The linked page extensively
    describes the approach and compares how the BERTScore relates to human
    evaluation with many different models.

    .. warning::
        Out of the box, this implementation may not strictly match the results
        of the reference implementation. Please read the argument documentation
        to understand the differences.

    Arguments
    ---------
    lm : speechbrain.lobes.models.huggingface_transformers.TextEncoder
        HF Transformers tokenizer and text encoder wrapper to use as a LM.
    batch_size : int, optional
        How many pairs of utterances should be considered at once. Higher is
        faster but may result in OOM.
    use_idf : bool, optional
        If enabled (default), tokens in the reference are weighted by
        Inverse Document Frequency, which allows to weight down the impact of
        common words that may carry less information. Every sentence appended
        is considered a document in the IDF calculation.
    sentence_level_averaging : bool, optional
        When `True`, the final recall/precision metrics will be the average of
        recall/precision for each tested sentence, rather of each tested token,
        e.g. a very long sentence will weigh as much as a very short sentence in
        the final metrics. The default is `True`, which matches the reference
        implementation.
    allow_matching_special_tokens : bool, optional
        When `True`, non-special tokens may match against special tokens during
        greedy matching (e.g. `[CLS]`/`[SEP]`). Batch size must be 1 due to
        padding handling.
        The default is `False`, which is different behavior from the reference
        implementation (see
        `bert_score#180 <https://github.com/Tiiiger/bert_score/issues/180>`_).
    @   TFlm
batch_sizeuse_idfsentence_level_averagingallow_matching_special_tokensc                 C   s*   |    || _|| _|| _|| _|| _d S N)clearr   r   r   r   r   )selfr   r   r   r   r    r   X/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/speechbrain/utils/bertscore.py__init__F   s   
zBERTScoreStats.__init__c                 C   s"   g | _ g | _g | _g | _i | _dS )zClears the collected statisticsN)idspredictionstargetsscoressummary)r   r   r   r   r   U   s
   
zBERTScoreStats.clearc                 C   s(   | j | | j| | j| dS )aN  
        Appends inputs, predictions and targets to internal
        lists

        Arguments
        ---------
        ids: list
            the string IDs for the samples
        predict: list
            the model's predictions in tokenizable format
        target: list
            the ground truths in tokenizable format
        N)r   extendr   r   )r   r   predicttargetr   r   r   append]   s   zBERTScoreStats.appendNc                 C   sH   t   |   W d   n1 sw   Y  |dur!| j| S | jS )a  Summarize the classification metric scores. Performs the actual LM
        inference and BERTScore estimation.

        Full set of fields:
         - `bertscore-recall`, optionally weighted by idf of ref tokens
         - `bertscore-precision`, optionally weighted by idf of hyp tokens
         - `bertscore-f1`

        Arguments
        ---------
        field : str
            If provided, only returns selected statistic. If not,
            returns all computed statistics.

        Returns
        -------
        float or dict
            Returns a float if ``field`` is provided, otherwise
            returns a dictionary containing all computed stats.
        N)torchno_grad_update_summaryr   )r   fieldr   r   r   	summarizeo   s   


zBERTScoreStats.summarizec              	   C   s  | j r| jdksJ dt| jj}| | j}d }}d }}tdt| j	| jD ]}| j
||| j  }| j||| j  }	| j	||| j  }
dd |	D }	dd |
D }
| j|	dd	\}}| j|
dd	\}}| }| }|d
  }|d
  }t||}| ||}| ||}| j sd|| ddf< d|dd| ddf< |jdd\}}|jdd\}}| || }| || }d|| < d|| < || }|| }t|D ]&\}}| j|||  ||    ||  ||    d q| jr'|| |  7 }|d7 }|| |  7 }|d7 }q*|| 7 }|| 7 }|| 7 }|| 7 }q*|| }|| }d||  ||  }| j|||d dS )zPerforms the actual LM inference and BERTscore estimation, updating
        the `summary` field. Automatically called by `summarize`.   zZBatch size must be 1 when passing `allow_matching_special_tokens` due to padding handling.g        r   c                 S      g | ]}d  |qS  join).0refr   r   r   
<listcomp>       z2BERTScoreStats._update_summary.<locals>.<listcomp>c                 S   r%   r&   r(   )r*   hypr   r   r   r,      r-   T)return_tokens	input_idsN   )dim)keyrecall	precisiong      ?g       @)zbertscore-recallzbertscore-precisionzbertscore-f1)r   r   get_bert_token_maskr   	tokenizer_make_weightsr   rangelenr   r   cpur   _select_by_tokens	transposemax	enumerater   r   sumitemr   r   update)r   token_maskstoken_weights
recall_sumrecall_weightprecision_sumprecision_weight	chunk_idxr   ref_texthyp_textref_toks
ref_hiddenhyp_toks
hyp_hiddensimilarity_matrixref_maskhyp_maskrecall_values_precision_valuesrecall_weightsprecision_weightsbatch_recallbatch_precisioniutt_idr6   r7   f1r   r   r   r!      s   





zBERTScoreStats._update_summaryc                 C   s6   | j rt| jdkrtdt| jj|S t| jjS )zMakes a token weight tensor, optionally including IDF. If not using
        IDF, currently simply returns a tensor full of ones.r$   zxToken IDF weighting was enabled, but 1 text is not enough. Compute the summary over more texts or disable IDF weighting.)r   r<   r   
ValueErrorget_bertscore_token_weightsr   r9   )r   corpusr   r   r   r:      s   zBERTScoreStats._make_weightsc                 C   s   |j d| d|jS )zFrom a batch of tokenized texts `input_tokens`, returns an
        identically shaped tensor where each item `token_id` becomes
        `token_weight[token_id]`.r   )r3   index)index_selectflattenreshapeshape)r   token_weightinput_tokensr   r   r   r>     s
   z BERTScoreStats._select_by_tokens)r
   TTFr   )__name__
__module____qualname____doc__r   intboolr   r   r   r#   r!   r:   r>   r   r   r   r   r	      s.    3

ir	   returnc                 C   s|   |   }t| }tj|d ftjd}g }| j D ]}t|tr+|	||  q|D ]	}|	||  q-qd||< |S )a4  Returns a token mask with special tokens masked.

    Arguments
    ---------
    tokenizer : transformers.PreTrainedTokenizer
        HuggingFace tokenizer for the BERT model.

    Returns
    -------
    torch.BoolTensor
        A mask tensor that can be indexed by token ID (of shape `[vocab_size]`).
    r$   )dtypeF)
	get_vocabr@   valuesr   onesrn   special_tokens_map
isinstancestrr   )r9   vocabmax_idxweightsspecial_tokens	tok_entrytokr   r   r   r8     s   
r8   ra   c           	         s   t |   }|du rt|fS tdd t|D ]\}}| d|d }t|}|D ]
}|  d7  < q/q|d   fddt	|d D }t
|S )	a  Returns token weights for use with the BERTScore metric.
    When specifying `corpus`, the weights are the Inverse Document Frequency
    (IDF) of each token, extracted from the `corpus`.

    The IDF formula is adapted from the BERTScore paper, where words missing
    from the reference corpus are weighted with `+1` smoothing.

    Arguments
    ---------
    tokenizer : transformers.PreTrainedTokenizer
        HuggingFace tokenizer for the BERT model.
    corpus : Iterable[str], optional
        Iterable corpus to compute the IDF from. Each iterated value is
        considered a document in the corpus in the IDF calculation.
        If omitted, no IDF weighting is done.

    Returns
    -------
    torch.Tensor
        A floating-point tensor that can be indexed by token ID, of shape
        `[vocab_size]`, where each entry is by how much the impact of a given
        token should be multiplied.
    Nc                   S   s   dS )Nr   r   r   r   r   r   <lambda>O  s    z-get_bertscore_token_weights.<locals>.<lambda>r'   r0   r$   c                    s&   g | ]}t  d  | d   qS )r$   )mathlog)r*   token_iddocument_count	freq_dictr   r   r,   Z  s    z/get_bertscore_token_weights.<locals>.<listcomp>)r@   rq   rr   r   rs   r   rA   r)   setr;   tensor)	r9   ra   rx   document_idxdocumenttokensunique_wordsunique_wordry   r   r   r   r`   /  s   

r`   r   )rl   r~   collectionsr   typingr   r   r   1speechbrain.lobes.models.huggingface_transformersr   speechbrain.utils.distancesr   speechbrain.utils.loggerr   speechbrain.utils.metric_statsr   ri   loggerr	   
BoolTensorr8   rv   Tensorr`   r   r   r   r   <module>   s&     z"
