o
    .wi{m                  #   @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	 d dl
Z
d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZ d d
lmZ d dlmZ er\er\d dlmZmZ esadgZed Z eG dd deZ!G dd dZ"dedede#de$de$defddZ%ddde&e'e$f fddZ(dede$d e$d!e$def
d"d#Z)d$d%d&e&e'ef d'e*de#d(e&e'e$f defd)d*Z+e
, d$d%d+ed'e*de#d(e&e'e$f d,e#defd-d.Z-d/e	e'ee' f d0e	e'ee' f ddd1e$de.eeeef f
d2d3Z/	4dGd$d%d5ed6ed'e*de#d7e"d(e&e'e$f d,e#defd8d9Z0	:	;	<	4					=	 	4	>dHd/e	e'ee' f d0e	e'ee' f d?e	e'e j1f d'e*d@e de#dAee* dBee* dCee	e'e
j2f  d1ee$ de$dDe$d,e#dEe#de	ee.eef f fdFdZ3dS )I    N)Sequence)unique)TYPE_CHECKINGListOptionalUnion)Tensor)
functional)
DataLoader)Literal)TokenizedDataset_get_progress_bar_input_data_collator_load_tokenizer_and_model)EnumStr)_TRANSFORMERS_GREATER_EQUAL_4_4)PreTrainedModelPreTrainedTokenizerBaseinfolm)	kl_divergencealpha_divergencebeta_divergenceab_divergencerenyi_divergencel1_distancel2_distancel_infinity_distancefisher_rao_distancec                   @   sF   e Zd ZdZedefddZdZdZdZ	dZ
d	Zd
ZdZdZdZdS )_IMEnumz8A helper Enum class for storing the information measure.returnc                   C   s   dS )NzInformation measure r    r    r    `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/torchmetrics/functional/text/infolm.py_name:   s   z_IMEnum._namer   r   r   r   r   r   r   r   r   N)__name__
__module____qualname____doc__staticmethodstrr"   KL_DIVERGENCEALPHA_DIVERGENCEBETA_DIVERGENCEAB_DIVERGENCERENYI_DIVERGENCEL1_DISTANCEL2_DISTANCEL_INFINITY_DISTANCEFISHER_RAO_DISTANCEr    r    r    r!   r   6   s    r   c                	   @   s(  e Zd ZdZ		ddedee dee ddfddZd	ed
edefddZ	e
d	ed
edefddZd	ed
edefddZd	ed
edefddZd	ed
edefddZd	ed
edefddZe
d	ed
edefddZe
d	ed
edefddZe
d	ed
edefddZe
d	ed
edefddZdS ) _InformationMeasureu  A wrapper class used for the calculation of different information measures.

    This metric can be used to measure the information between the discrete reference distributions of predicted and
    reference sentences. The class also handles input validation for `alpha` and `beta` parameters.

    Args:
        information_measure:
            A name of information measure to be used. Please use one of: ['kl_divergence', 'alpha_divergence',
            'beta_divergence', 'ab_divergence', 'renyi_divergence', 'l1_distance', 'l2_distance', 'l_infinity_distance',
            'fisher_rao_distance']
        alpha:
            Alpha parameter of the divergence used for alpha, AB and Rényi divergence measures.
        beta:
            Beta parameter of the divergence used for beta and AB divergence measures.

    Raises:
        ValueError:
            If information measure is one from alpha, AB or Rényi divergence and parameter `alpha` is `None`.
        ValueError:
            If information measure is one from beta or divergence and parameter `beta` is `None`.
        ValueError:
            If information measure is alpha divergence and parameter `alpha` equals 0 or 1.
        ValueError:
            If information measure is beta divergence and parameter `beta` equals 0 or -1.
        ValueError:
            If information measure is AB divergence and parameter `alpha`, `beta` or `alpha + beta` equal 0.
        ValueError:
            If information measure is Rényi divergence and parameter `alpha` equals 1.

    Ninformation_measurealphabetar   c                 C   sb  t || _t jt jt jf}| j|v r t|ts td| d| jt j	t jfv r6t|ts6td| d| jt jkrMt|trE|dv rMtd| d| jt j	krdt|tr\|dv rdtd| d| jt jkr|d u s|d u st
dd	 ||fD sd
|||| fv rtd| d| jt jkrt|tr|dkrtd| d|pd
| _|pd
| _d S )Nz0Parameter `alpha` is expected to be defined for .z/Parameter `beta` is expected to be defined for )r      zFParameter `alpha` is expected to be float differened from 0 and 1 for )r   zFParameter `beta` is expected to be float differened from 0 and -1 for c                 s   s    | ]	}t |t V  qd S )N)
isinstancefloat).0pr    r    r!   	<genexpr>   s    z/_InformationMeasure.__init__.<locals>.<genexpr>r   zRParameters `alpha`, `beta` and their sum are expected to be differened from 0 for r7   z@Parameter `alpha` is expected to be float differened from 1 for )r   from_strr3   r*   r,   r-   r9   r:   
ValueErrorr+   anyr4   r5   )selfr3   r4   r5   _bad_measuresr    r    r!   __init__i   s:   


z_InformationMeasure.__init__preds_distributiontarget_distributionc                 C   s$   t | d| jj }t|||S )N_calculate_)getattrr3   valuetorch
nan_to_num)rA   rD   rE   information_measure_functionr    r    r!   __call__   s   z_InformationMeasure.__call__c                 C   s   t j|t | |  ddS )a  Calculate Kullback-Leibler divergence between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            Kullback-Leibler divergence between discrete distributions of predicted and reference sentences.

        r8   dim)rI   sumlogrD   rE   r    r    r!   _calculate_kl_divergence   s   z,_InformationMeasure._calculate_kl_divergencec                 C   s:   | j | j d  }dtj|| j  |d| j    dd | S )a  Calculate alpha divergence between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            Alpha divergence between discrete distributions of predicted and reference sentences.

        r7   r8   rM   )r4   rI   rO   )rA   rD   rE   _alpha_denomr    r    r!   _calculate_alpha_divergence   s   $z/_InformationMeasure._calculate_alpha_divergencec                 C   s   t t j|| j| j  dd}|| j| j| j   }t t j|| j| j  dd}|| j| j| j   }t t j|| j || j  dd}|| j| j  }|| | S )a  Calculate AB divergence between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            AB divergence between discrete distributions of predicted and reference sentences.

        r8   rM   )rI   rP   rO   r5   r4   )rA   rD   rE   abcr    r    r!   _calculate_ab_divergence   s     $z,_InformationMeasure._calculate_ab_divergencec                 C   s   d| _ | ||S )a  Calculate beta divergence between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            Beta divergence between discrete distributions of predicted and reference sentences.

        g      ?)r4   rX   rA   rD   rE   r    r    r!   _calculate_beta_divergence   s   z._InformationMeasure._calculate_beta_divergencec                 C   s2   t t j|| j |d| j   dd| jd  S )u  Calculate Rényi divergence between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            Rényi divergence between discrete distributions of predicted and reference sentences.

        r7   r8   rM   )rI   rP   rO   r4   rY   r    r    r!   _calculate_renyi_divergence   s   &z/_InformationMeasure._calculate_renyi_divergencec                 C      t j||  dddS )a  Calculate L1 distance between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            L1 distance between discrete distributions of predicted and reference sentences.

        r7   r8   r<   rN   rI   normrQ   r    r    r!   _calculate_l1_distance      z*_InformationMeasure._calculate_l1_distancec                 C   r\   )a  Calculate L2 distance between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            L2 distance between discrete distributions of predicted and reference sentences.

           r8   r]   r^   rQ   r    r    r!   _calculate_l2_distance   ra   z*_InformationMeasure._calculate_l2_distancec                 C   s   t j||  tdddS )a  Calculate L-infinity distance between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            L-infinity distance between discrete distributions of predicted and reference sentences.

        infr8   r]   )rI   r_   r:   rQ   r    r    r!   _calculate_l_infinity_distance
  s   z2_InformationMeasure._calculate_l_infinity_distancec              	   C   s(   dt t t | | ddd S )a  Calculate Fisher-Rao distance between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            Fisher-Rao distance between discrete distributions of predicted and reference sentences.

        rb   r8   r   r7   )rI   acosclampsqrtrO   rQ   r    r    r!   _calculate_fisher_rao_distance  s   (z2_InformationMeasure._calculate_fisher_rao_distance)NN)r#   r$   r%   r&   $_ALLOWED_INFORMATION_MEASURE_LITERALr   r:   rC   r   rL   r'   rR   rT   rX   rZ   r[   r`   rc   re   ri   r    r    r    r!   r2   I   s8    "
$r2   	input_idsattention_maskidf
batch_sizenum_workersr   c                 C   s   t | ||}t|||dS )aH  Prepare dataloader.

    Args:
        input_ids:
            Indices of input sequence tokens in the vocabulary.
        attention_mask:
            Mask to avoid performing attention on padding token indices.
        idf:
            A bool indicating whether normalization using inverse document frequencies should be used.
        batch_size:
            A batch size used for model processing.
        num_workers:
            A number of workers to use for a dataloader.

    Return:
        An instance of ``torch.utils.data.DataLoader`` used for iterating over examples.

    )rn   ro   )r   r
   )rk   rl   rm   rn   ro   datasetr    r    r!   _get_dataloader+  s   rq   	tokenizerr   c                 C   s   | j | j| j| jdS )a  Build a dictionary of model/tokenizer special tokens.

    Args:
        tokenizer:
            Initialized tokenizer from HuggingFace's `transformers package.

    Return:
        A dictionary containing: mask_token_id, pad_token_id, sep_token_id and cls_token_id.

    mask_token_idpad_token_idsep_token_idcls_token_idrs   )rr   r    r    r!   _get_special_tokens_mapD  s
   rx   ru   rv   rw   c                 C   s$   |  ||  |B |  |B }| S )a(  Generate a token mask for differentiating all special tokens in the input batch.

    There are 0s for special tokens and 1s otherwise.

    Args:
        input_ids:
            Indices of input sequence tokens in the vocabulary.
        pad_token_id:
            An id of ``<PAD>`` tokens that are used to make arrays of tokens the same size for batching purpose
        cls_token_id:
            An id of ``<CLS>`` token that represents the class of the input. (It might be ``<BOS>`` token for some
            models.)
        sep_token_id:
            An id of ``<SEP>`` token that separates two different sentences in the same input. (It might be ``<EOS>``
            token for some models.)

    Return:
        Tensor mask of 0s and 1s that masks all special tokens in the ``input_ids`` tensor.

    )eq)rk   ru   rv   rw   
token_maskr    r    r!   _get_token_maskW  s   r{   modelr   batchtemperaturespecial_tokens_mapc                 C   sR  |d j d }g }t|d |d |d |d }t|D ]P}|d  }	|d |	dd|f< | |	|d j}
|
dd|ddf }
tj|
| d	d
}|r_||d dd|f d|j	9 }|
|d  ~	~
~qtj|dd
}td||j	|}|r||d |j	 }|jdd
|jdd
d S |jdd
|jdd
d S )a_  Calculate a discrete probability distribution for a batch of examples. See `InfoLM`_ for details.

    Args:
        model:
            Initialized model from HuggingFace's `transformers package.
        batch:
            An input batch dictionary containing ``input_ids`` and ``attention_mask``.
        temperature:
            A temperature for calibrating language modelling. For more information, please reference `InfoLM`_ paper.
        max_length:
            A maximum length of input sequences. Sequences longer than `max_length` are to be trimmed.
        idf:
            An indication of whether normalization using inverse document frequencies should be used.
        special_tokens_map:
            A dictionary mapping tokenizer special tokens into the corresponding integer values.

    Return:
        A discrete probability distribution.

    rk   r7   ru   rv   rw   rt   Nrl   r8   rM   input_ids_idfzbsv, bs -> bsv)shaper{   rangeclonelogitsFsoftmax	unsqueezetodeviceappendcpurI   cateinsumrO   )r|   r}   r~   rm   r   seq_lenprob_distribution_batch_listrz   mask_idxrk   logits_distributionprob_distributionprob_distribution_batchmasked_input_ids_idfr    r    r!   _get_batch_distributionp  s0   &r   
dataloaderverbosec           	   	   C   sH   | j }g }t||D ]}t||}|t| |||| q
tj|ddS )a  Calculate a discrete probability distribution according to the methodology described in `InfoLM`_.

    Args:
        model:
            Initialized model from HuggingFace's `transformers package.
        dataloader:
            An instance of `torch.utils.data.DataLoader` used for iterating over examples.
        temperature:
            A temperature for calibrating language modelling. For more information, please reference `InfoLM`_ paper.
        max_length:
            A maximum length of input sequences. Sequences longer than `max_length` are to be trimmed.
        idf:
            An indication of whether normalization using inverse document frequencies should be used.
        special_tokens_map:
            A dictionary mapping tokenizer special tokens into the corresponding integer values.
        verbose:
            An indication of whether a progress bar to be displayed during the embeddings calculation.

    Return:
        A discrete probability distribution.

    r   rM   )r   r   r   r   r   rI   r   )	r|   r   r~   rm   r   r   r   r   r}   r    r    r!   _get_data_distribution  s   
r   predstarget
max_lengthc                 C   sd   t | ttfst| } t |ttfst|}|| d|ddd}||d|ddd}|j|j|j|jfS )a8  Update the metric state by a tokenization of ``preds`` and ``target`` sentencens.

    Args:
        preds:
            An iterable of hypothesis corpus.
        target:
            An iterable of reference corpus.
        tokenizer:
            Initialized tokenizer from HuggingFace's `transformers package.
        max_length:
            A maximum length of input sequences. Sequences longer than `max_length` are to be trimmed.

    Return:
        Tokenizerd ``preds`` and ``target`` sentences represented with ``input_ids`` and ``attention_mask`` tensors.

    r   Tpt)paddingr   
truncationreturn_tensors)r9   r(   listrk   rl   )r   r   rr   r   preds_inputtarget_inputr    r    r!   _infolm_update  s   r   Tpreds_dataloadertarget_dataloaderinformation_measure_clsc           
      C   sF   t | |||||}t | |||||}	||jj }|	|jj }	|||	S )al  Calculate selected information measure using the pre-trained language model.

    Args:
        model:
            Initialized model from HuggingFace's `transformers package.
        preds_dataloader:
            Loader iterating over tokenizer predicted sentences.
        target_dataloader:
            Loader iterating over tokenizer reference sentences.
        temperature:
            A temperature for calibrating language modelling. For more information, please reference `InfoLM`_ paper.
        idf:
            An indication of whether normalization using inverse document frequencies should be used.
        information_measure_cls:
            Information measure class containing all parameters necessary for calculating information measure values
            using ``preds_distribution`` and ``target_distribution``.
        special_tokens_map:
            A dictionary mapping tokenizer special tokens into the corresponding integer values.
        verbose:
            An indication of whether a progress bar to be displayed during the embeddings calculation.

    Return:
        A corpus-level InfoLM score.

    )r   rp   sorting_indices)
r|   r   r   r~   rm   r   r   r   rD   rE   r    r    r!   _infolm_compute  s   #
r   bert-base-uncased      ?r   @   Fmodel_name_or_pathr3   r4   r5   r   num_threadsreturn_sentence_level_scorec              	   C   s   t ||\}}t|||}|	p|jj}	t|}t| |||	\}}}}t||||
|}t||||
|}t||||||||}|rE| |fS | S )uo  Calculate `InfoLM`_ [1].

    InfoML corresponds to distance/divergence between predicted and reference sentence discrete distribution using
    one of the following information measures:

        - `KL divergence`_
        - `alpha divergence`_
        - `beta divergence`_
        - `AB divergence`_
        - `Rényi divergence`_
        - L1 distance
        - L2 distance
        - L-infinity distance
        - `Fisher-Rao distance`_

    `InfoLM`_ is a family of untrained embedding-based metrics which addresses some famous flaws of standard
    string-based metrics thanks to the usage of pre-trained masked language models. This family of metrics is mainly
    designed for summarization and data-to-text tasks.

    If you want to use IDF scaling over the whole dataset, please use the class metric.

    The implementation of this metric is fully based HuggingFace `transformers`' package.

    Args:
        preds:
            An iterable of hypothesis corpus.
        target:
            An iterable of reference corpus.
        model_name_or_path:
            A name or a model path used to load `transformers` pretrained model.
        temperature:
            A temperature for calibrating language modelling. For more information, please reference `InfoLM`_ paper.
        information_measure:
            A name of information measure to be used. Please use one of: ['kl_divergence', 'alpha_divergence',
            'beta_divergence', 'ab_divergence', 'renyi_divergence', 'l1_distance', 'l2_distance', 'l_infinity_distance',
            'fisher_rao_distance']
        idf:
            An indication of whether normalization using inverse document frequencies should be used.
        alpha:
            Alpha parameter of the divergence used for alpha, AB and Rényi divergence measures.
        beta:
            Beta parameter of the divergence used for beta and AB divergence measures.
        device:
            A device to be used for calculation.
        max_length:
            A maximum length of input sequences. Sequences longer than `max_length` are to be trimmed.
        batch_size:
            A batch size used for model processing.
        num_threads:
            A number of threads to use for a dataloader.
        verbose:
            An indication of whether a progress bar to be displayed during the embeddings calculation.
        return_sentence_level_score:
            An indication whether a sentence-level InfoLM score to be returned.

    Returns:
        A corpus-level InfoLM score.
        (Optionally) A list of sentence-level InfoLM scores if `return_sentence_level_score=True`.

    Example:
        >>> from torchmetrics.functional.text.infolm import infolm
        >>> preds = ['he read the book because he was interested in world history']
        >>> target = ['he was interested in world history because he read the book']
        >>> infolm(preds, target, model_name_or_path='google/bert_uncased_L-2_H-128_A-2', idf=False)
        tensor(-0.1784)

    References:
        [1] InfoLM: A New Metric to Evaluate Summarization & Data2Text Generation by Pierre Colombo, Chloé Clavel and
        Pablo Piantanida `InfoLM`_

    )	r   r2   configr   rx   r   rq   r   mean)r   r   r   r~   r3   rm   r4   r5   r   r   rn   r   r   r   rr   r|   r   r   preds_input_idspreds_attention_masktarget_input_idstarget_attention_maskr   r   info_lm_scorer    r    r!   r   "  s,   W)T)r   r   r   TNNNNr   r   TF)4oscollections.abcr   enumr   typingr   r   r   r   rI   r   torch.nnr	   r   torch.utils.datar
   typing_extensionsr   4torchmetrics.functional.text.helper_embedding_metricr   r   r   r   torchmetrics.utilities.enumsr   torchmetrics.utilities.importsr   transformersr   r   __doctest_skip__rj   r   r2   boolintrq   dictr(   rx   r{   r:   r   no_gradr   tupler   r   PathLiker   r   r    r    r    r!   <module>   s   c



9
(
*
	
1	
