import os
from enum import unique
from typing import Dict, List, Optional, Sequence, Tuple, Union

import torch
from torch import Tensor
from torch.nn import functional as F
from torch.utils.data import DataLoader
from typing_extensions import Literal

from torchmetrics.functional.text.helper_embedding_metric import (
    TokenizedDataset,
    _get_progress_bar,
    _input_data_collator,
    _load_tokenizer_and_model,
)
from torchmetrics.utilities.enums import EnumStr
from torchmetrics.utilities.imports import _TRANSFORMERS_AVAILABLE

if _TRANSFORMERS_AVAILABLE:
    from transformers import PreTrainedModel, PreTrainedTokenizerBase
else:
    PreTrainedModel = PreTrainedTokenizerBase = None
    __doctest_skip__ = ["infolm"]

_ALLOWED_INFORMATION_MEASURE_LITERAL = Literal[
    "kl_divergence",
    "alpha_divergence",
    "beta_divergence",
    "ab_divergence",
    "renyi_divergence",
    "l1_distance",
    "l2_distance",
    "l_infinity_distance",
    "fisher_rao_distance",
]


@unique
class _IMEnum(EnumStr):
    """A helper Enum class for storing the information measure."""

    KL_DIVERGENCE = "kl_divergence"
    ALPHA_DIVERGENCE = "alpha_divergence"
    BETA_DIVERGENCE = "beta_divergence"
    AB_DIVERGENCE = "ab_divergence"
    RENYI_DIVERGENCE = "renyi_divergence"
    L1_DISTANCE = "l1_distance"
    L2_DISTANCE = "l2_distance"
    L_INFINITY_DISTANCE = "l_infinity_distance"
    FISHER_RAO_DISTANCE = "fisher_rao_distance"

    @classmethod
    def from_str(cls, value: str) -> Optional["EnumStr"]:
        """
        Raises:
            ValueError:
                If required information measure is not among the supported options.
        """
        _allowed_im = [im.lower() for im in _IMEnum._member_names_]

        enum_key = super().from_str(value)
        if enum_key is not None and enum_key in cls:
            return enum_key
        raise ValueError(f"Invalid information measure. Expected one of {_allowed_im}, but got {enum_key}.")
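

# A minimal sketch of the lookup behaviour above (illustrative only, never executed on
# import; it assumes `EnumStr.from_str` matches the given string against the member
# names, which is how it is used here):
#
# >>> _IMEnum.from_str("kl_divergence") is _IMEnum.KL_DIVERGENCE
# True
# >>> _IMEnum.from_str("unknown_measure")  # raises ValueError listing the allowed options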
edefddZ	e
d	ededefddZd	ededefddZd	ededefddZd	ededefddZd	ededefddZe
d	ededefddZe
d	ededefddZe
d	ededefddZe
d	ededefddZdS )!_InformationMeasureu  A wrapper class used for the calculation the result of information measure between the discrete reference
    distributions of predicted and reference sentences. The class also handles input validation for `alpha` and
    `beta` parameters.

    Args:
        information_measure:
            A name of information measure to be used. Please use one of: ['kl_divergence', 'alpha_divergence',
            'beta_divergence', 'ab_divergence', 'renyi_divergence', 'l1_distance', 'l2_distance', 'l_infinity_distance',
            'fisher_rao_distance']
        alpha:
            Alpha parameter of the divergence used for alpha, AB and Rényi divergence measures.
        beta:
            Beta parameter of the divergence used for beta and AB divergence measures.

    Raises:
        ValueError:
            If information measure is one from alpha, AB or Rényi divergence and parameter `alpha` is `None`.
        ValueError:
            If information measure is one from beta or AB divergence and parameter `beta` is `None`.
        ValueError:
            If information measure is alpha divergence and parameter `alpha` equals 0 or 1.
        ValueError:
            If information measure is beta divergence and parameter `beta` equals 0 or -1.
        ValueError:
            If information measure is AB divergence and parameter `alpha`, `beta` or `alpha + beta` equal 0.
        ValueError:
            If information measure is Rényi divergence and parameter `alpha` equals 1.
    Ninformation_measurealphabetar!   c                 C   sN  t || _| jt jt jt jfv rt|tstd| d| jt j	t jfv r4t|ts4td| d| jt jkrKt|trC|dv rKtd| d| jt j	krbt|trZ|dv rbtd| d| jt jkrt
dd	 ||fD s|d
|||| fv rtd| d| jt jkrt|tr|dkrtd| d|pd
| _|pd
| _d S )Nz0Parameter `alpha` is expected to be defined for r(   z/Parameter `beta` is expected to be defined for )r      zFParameter `alpha` is expected to be float differened from 0 and 1 for )r   zFParameter `beta` is expected to be float differened from 0 and -1 for c                 s   s    | ]	}t |t V  qd S )N)
isinstancefloat)r$   pr"   r"   r&   	<genexpr>   s    z/_InformationMeasure.__init__.<locals>.<genexpr>r   zRParameters `alpha`, `beta` and their sum are expected to be differened from 0 for rF   z@Parameter `alpha` is expected to be float differened from 1 for )r   r+   rC   r7   r9   r:   rH   rI   r,   r8   anyrD   rE   )selfrC   rD   rE   r"   r"   r&   __init__p   s4   



(
z_InformationMeasure.__init__preds_distributiontarget_distribtuionc                 C   s"   t | d| j }t|||S )N_calculate_)getattrrC   torch

    def __call__(self, preds_distribution: Tensor, target_distribution: Tensor) -> Tensor:
        information_measure_function = getattr(self, f"_calculate_{self.information_measure.value}")
        return torch.nan_to_num(information_measure_function(preds_distribution, target_distribution))

    @staticmethod
    def _calculate_kl_divergence(preds_distribution: Tensor, target_distribution: Tensor) -> Tensor:
        """Calculate Kullback-Leibler divergence between discrete distributions of predicted and reference
        sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            Kullback-Leibler divergence between discrete distributions of predicted and reference sentences.
        """
        return torch.sum(target_distribution * torch.log(preds_distribution / target_distribution), dim=-1)

    def _calculate_alpha_divergence(self, preds_distribution: Tensor, target_distribution: Tensor) -> Tensor:
        """Calculate alpha divergence between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            Alpha divergence between discrete distributions of predicted and reference sentences.
        """
        _alpha_denom = self.alpha * (self.alpha - 1)
        alpha_divergence = (
            1 - torch.sum(target_distribution**self.alpha * preds_distribution ** (1 - self.alpha), dim=-1)
        ) / _alpha_denom
        return alpha_divergence

    def _calculate_ab_divergence(self, preds_distribution: Tensor, target_distribution: Tensor) -> Tensor:
        """Calculate AB divergence between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            AB divergence between discrete distributions of predicted and reference sentences.
        """
        a = torch.log(torch.sum(target_distribution ** (self.beta + self.alpha), dim=-1))
        a /= self.beta * (self.beta + self.alpha)
        b = torch.log(torch.sum(preds_distribution ** (self.beta + self.alpha), dim=-1))
        b /= self.alpha * (self.beta + self.alpha)
        c = torch.log(torch.sum(target_distribution**self.beta * preds_distribution**self.alpha, dim=-1))
        c /= self.alpha * self.beta
        ab_divergence = a + b - c
        return ab_divergence

    def _calculate_beta_divergence(self, preds_distribution: Tensor, target_distribution: Tensor) -> Tensor:
        """Calculate beta divergence between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            Beta divergence between discrete distributions of predicted and reference sentences.
        """
        # Beta divergence is AB divergence evaluated with `alpha = 1`
        self.alpha = 1.0
        beta_divergence = self._calculate_ab_divergence(preds_distribution, target_distribution)
        return beta_divergence

    def _calculate_renyi_divergence(self, preds_distribution: Tensor, target_distribution: Tensor) -> Tensor:
        """Calculate Rényi divergence between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            Rényi divergence between discrete distributions of predicted and reference sentences.
        """
        renyi_divergence = torch.log(
            torch.sum(target_distribution**self.alpha * preds_distribution ** (1 - self.alpha), dim=-1)
        ) / (self.alpha - 1)
        return renyi_divergence

    @staticmethod
    def _calculate_l1_distance(preds_distribution: Tensor, target_distribution: Tensor) -> Tensor:
        """Calculate L1 distance between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            L1 distance between discrete distributions of predicted and reference sentences.
        """
        return torch.norm(target_distribution - preds_distribution, p=1, dim=-1)

    @staticmethod
    def _calculate_l2_distance(preds_distribution: Tensor, target_distribution: Tensor) -> Tensor:
        """Calculate L2 distance between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            L2 distance between discrete distributions of predicted and reference sentences.
        """
        return torch.norm(target_distribution - preds_distribution, p=2, dim=-1)

    @staticmethod
    def _calculate_l_infinity_distance(preds_distribution: Tensor, target_distribution: Tensor) -> Tensor:
        """Calculate L-infinity distance between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            L-infinity distance between discrete distributions of predicted and reference sentences.
        """
        return torch.norm(target_distribution - preds_distribution, p=float("inf"), dim=-1)

    @staticmethod
    def _calculate_fisher_rao_distance(preds_distribution: Tensor, target_distribution: Tensor) -> Tensor:
        """Calculate Fisher-Rao distance between discrete distributions of predicted and reference sentences.

        Args:
            preds_distribution:
                Discrete reference distribution of predicted sentences over the vocabulary.
            target_distribution:
                Discrete reference distribution of reference sentences over the vocabulary.

        Return:
            Fisher-Rao distance between discrete distributions of predicted and reference sentences.
        """
        return 2 * torch.acos(torch.clamp(torch.sqrt(preds_distribution * target_distribution).sum(-1), 0, 1))


def _get_dataloader(
    input_ids: Tensor, attention_mask: Tensor, idf: bool, batch_size: int, num_workers: int
) -> DataLoader:
    """Prepare dataloader.

    Args:
        input_ids:
            Indices of input sequence tokens in the vocabulary.
        attention_mask:
            Mask to avoid performing attention on padding token indices.
        idf:
            An indication of whether normalization using inverse document frequencies should be used.
        batch_size:
            A batch size used for model processing.
        num_workers:
            A number of workers to use for a dataloader.

    Return:
        An instance of ``torch.utils.data.DataLoader`` used for iterating over examples.
    """
    dataset = TokenizedDataset(input_ids, attention_mask, idf)
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)
    return dataloader


def _get_special_tokens_map(tokenizer: PreTrainedTokenizerBase) -> Dict[str, int]:
    """Build a dictionary of model/tokenizer special tokens.

    Args:
        tokenizer:
            Initialized tokenizer from HuggingFace's `transformers` package.

    Return:
        A dictionary containing: mask_token_id, pad_token_id, sep_token_id and cls_token_id.
    mask_token_idpad_token_idsep_token_idcls_token_idr   )r~   special_tokens_mapsr"   r"   r&   _get_special_tokens_mapC  s   r   r   r   r   c                 C   s$   |  ||  |B |  |B }| S )a&  Generate a token mask for differentiating all special tokens in the input batch. There are 0s for special
    tokens and 1s otherwise.

    Args:
        input_ids:
            Indices of input sequence tokens in the vocabulary.
        pad_token_id:
            An id of ``<PAD>`` tokens that are used to make arrays of tokens the same size for batching purposes.
        cls_token_id:
            An id of ``<CLS>`` token that represents the class of the input. (It might be ``<BOS>`` token for some
            models.)
        sep_token_id:
            An id of ``<SEP>`` token that separates two different sentences in the same input. (It might be ``<EOS>``
            token for some models.)

    Return:
        Tensor mask of 0s and 1s that masks all special tokens in the ``input_ids`` tensor.
    )eq)rv   r   r   r   
token_maskr"   r"   r&   _get_token_maskV  s   r   modelbatchtemperaturespecial_tokens_mapc                 C   sZ  |d j d }g }t|d |d |d |d }t|D ]P}|d  }	|d |	dd|f< | |	|d j}
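

# For instance (illustrative only, never executed on import; 0/101/102 are hypothetical
# pad/cls/sep ids):
#
# >>> ids = torch.tensor([[101, 7592, 2088, 102, 0]])
# >>> _get_token_mask(ids, pad_token_id=0, cls_token_id=101, sep_token_id=102)
# tensor([[False,  True,  True, False, False]])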


def _get_batch_distribution(
    model: PreTrainedModel, batch: Dict[str, Tensor], temperature: float, idf: bool, special_tokens_map: Dict[str, int]
) -> Tensor:
    """Calculate a discrete probability distribution for a batch of examples according to the methodology described
    in `InfoLM`_.

    Args:
        model:
            Initialized model from HuggingFace's `transformers package.
        batch:
            An input batch dictionary containing ``input_ids`` and ``attention_mask``.
        temperature:
            A temperature for calibrating language modelling. For more information, please reference `InfoLM`_ paper.
        special_tokens_map:
            A dictionary mapping tokenizer special tokens into the corresponding integer values.
        idf:
            An indication of whether normalization using inverse document frequencies should be used.

    Return:
        A discrete probability distribution.
    """
    seq_len = batch["input_ids"].shape[1]
    prob_distribution_batch_list: List[Tensor] = []
    token_mask = _get_token_mask(
        batch["input_ids"],
        special_tokens_map["pad_token_id"],
        special_tokens_map["cls_token_id"],
        special_tokens_map["sep_token_id"],
    )
    for mask_idx in range(seq_len):
        # Mask the `mask_idx`-th token and read off the model distribution at that position
        input_ids = batch["input_ids"].clone()
        input_ids[:, mask_idx] = special_tokens_map["mask_token_id"]
        logits_distribution = model(input_ids, batch["attention_mask"]).logits
        logits_distribution = logits_distribution[:, mask_idx, :]  # [batch_size, vocab_size]
        prob_distribution = F.softmax(logits_distribution / temperature, dim=-1)
        if idf:
            prob_distribution *= batch["input_ids_idf"][:, mask_idx].unsqueeze(1).to(prob_distribution.device)
        prob_distribution_batch_list.append(prob_distribution.unsqueeze(1).cpu())
        # Clean from memory
        del input_ids, logits_distribution, prob_distribution

    prob_distribution_batch = torch.cat(prob_distribution_batch_list, dim=1)  # [batch_size, seq_len, vocab_size]
    # Zero out the positions that correspond to special tokens
    prob_distribution_batch = torch.einsum("bsv, bs -> bsv", prob_distribution_batch.to(token_mask.device), token_mask)
    if idf:
        masked_input_ids_idf = token_mask * batch["input_ids_idf"].to(token_mask.device)
        prob_distribution_batch = prob_distribution_batch.sum(dim=1) / masked_input_ids_idf.sum(dim=1).unsqueeze(1)
    else:
        prob_distribution_batch = prob_distribution_batch.sum(dim=1) / token_mask.sum(dim=1).unsqueeze(1)

    return prob_distribution_batch


@torch.no_grad()
def _get_data_distribution(
    model: PreTrainedModel,
    dataloader: DataLoader,
    temperature: float,
    idf: bool,
    special_tokens_map: Dict[str, int],
    verbose: bool,
) -> Tensor:
    """Calculate a discrete probability distribution according to the methodology described in `InfoLM`_.
    Args:
        model:
            Initialized model from HuggingFace's `transformers` package.
        dataloader:
            An instance of `torch.utils.data.DataLoader` used for iterating over examples.
        temperature:
            A temperature for calibrating language modelling. For more information, please reference `InfoLM`_ paper.
        special_tokens_map:
            A dictionary mapping tokenizer special tokens into the corresponding integer values.
        idf:
            An indication of whether normalization using inverse document frequencies should be used.
        verbose:
            An indication of whether a progress bar is to be displayed during the embeddings calculation.

    Return:
        A discrete probability distribution.
    """
    device = model.device
    prob_distribution: List[Tensor] = []

    for batch in _get_progress_bar(dataloader, verbose):
        batch = _input_data_collator(batch, device)
        prob_distribution.append(_get_batch_distribution(model, batch, temperature, idf, special_tokens_map))

    return torch.cat(prob_distribution, dim=0)


def _infolm_update(
    preds: Union[str, Sequence[str]],
    target: Union[str, Sequence[str]],
    tokenizer: PreTrainedTokenizerBase,
    max_length: int,
) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """Update the metric state by a tokenization of ``preds`` and ``target`` sentences.

    Args:
        preds:
            An iterable of hypothesis corpus.
        target:
            An iterable of reference corpus.
        tokenizer:
            Initialized tokenizer from HuggingFace's `transformers` package.
        max_length:
            A maximum length of input sequences. Sequences longer than `max_length` are to be trimmed.

    Return:
        Tokenized ``preds`` and ``target`` sentences represented with ``input_ids`` and ``attention_mask`` tensors.
    """
    if not isinstance(preds, (str, list)):
        preds = list(preds)
    if not isinstance(target, (str, list)):
        target = list(target)

    preds_input = tokenizer(preds, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
    target_input = tokenizer(target, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")

    return preds_input.input_ids, preds_input.attention_mask, target_input.input_ids, target_input.attention_mask


def _infolm_compute(
    model: PreTrainedModel,
    preds_dataloader: DataLoader,
    target_dataloader: DataLoader,
    temperature: float,
    idf: bool,
    information_measure_cls: _InformationMeasure,
    special_tokens_map: Dict[str, int],
    verbose: bool = True,
) -> Tensor:
    """Calculate selected information measure using the pre-trained language model.

    Args:
        model:
            Initialized model from HuggingFace's `transformers` package.
        preds_dataloader:
            Loader iterating over tokenized predicted sentences.
        target_dataloader:
            Loader iterating over tokenized reference sentences.
        temperature:
            A temperature for calibrating language modelling. For more information, please reference `InfoLM`_ paper.
        idf:
            An indication of whether normalization using inverse document frequencies should be used.
        information_measure_cls:
            Information measure class containing all parameters necessary for calculating information measure values
            using ``preds_distribution`` and ``target_distribution``.
        special_tokens_map:
            A dictionary mapping tokenizer special tokens into the corresponding integer values.
        verbose:
            An indication of whether a progress bar is to be displayed during the embeddings calculation.

    Return:
        A corpus-level InfoLM score.
    """
    preds_distribution = _get_data_distribution(model, preds_dataloader, temperature, idf, special_tokens_map, verbose)
    target_distribution = _get_data_distribution(
        model, target_dataloader, temperature, idf, special_tokens_map, verbose
    )
    # Restore the original sentence order (the dataset may sort inputs internally)
    preds_distribution = preds_distribution[preds_dataloader.dataset.sorting_indices]
    target_distribution = target_distribution[target_dataloader.dataset.sorting_indices]
    infolm_score = information_measure_cls(preds_distribution, target_distribution)
    return infolm_score


def infolm(
    preds: Union[str, Sequence[str]],
    target: Union[str, Sequence[str]],
    model_name_or_path: Union[str, os.PathLike] = "bert-base-uncased",
    temperature: float = 0.25,
    information_measure: _ALLOWED_INFORMATION_MEASURE_LITERAL = "kl_divergence",
    idf: bool = True,
    alpha: Optional[float] = None,
    beta: Optional[float] = None,
    device: Optional[Union[str, torch.device]] = None,
    max_length: Optional[int] = None,
    batch_size: int = 64,
    num_threads: int = 0,
    verbose: bool = True,
    return_sentence_level_score: bool = False,
) -> Union[Tensor, Tuple[Tensor, Tensor]]:
    """
    Calculate `InfoLM`_ [1] - i.e. calculate a distance/divergence between predicted and reference sentence discrete
    distribution using one of the following information measures:

        - `KL divergence`_
        - `alpha divergence`_
        - `beta divergence`_
        - `AB divergence`_
        - `Rényi divergence`_
        - L1 distance
        - L2 distance
        - L-infinity distance
        - `Fisher-Rao distance`_

    `InfoLM`_ is a family of untrained embedding-based metrics which addresses some famous flaws of standard
    string-based metrics thanks to the usage of pre-trained masked language models. This family of metrics is mainly
    designed for summarization and data-to-text tasks.

    If you want to use IDF scaling over the whole dataset, please use the class metric.

    The implementation of this metric is fully based on the HuggingFace `transformers` package.

    Args:
        preds:
            An iterable of hypothesis corpus.
        target:
            An iterable of reference corpus.
        model_name_or_path:
            A name or a model path used to load `transformers` pretrained model.
        temperature:
            A temperature for calibrating language modelling. For more information, please reference `InfoLM`_ paper.
        information_measure:
            A name of information measure to be used. Please use one of: ['kl_divergence', 'alpha_divergence',
            'beta_divergence', 'ab_divergence', 'renyi_divergence', 'l1_distance', 'l2_distance', 'l_infinity_distance',
            'fisher_rao_distance']
        idf:
            An indication of whether normalization using inverse document frequencies should be used.
        alpha:
            Alpha parameter of the divergence used for alpha, AB and Rényi divergence measures.
        beta:
            Beta parameter of the divergence used for beta and AB divergence measures.
        device:
            A device to be used for calculation.
        max_length:
            A maximum length of input sequences. Sequences longer than `max_length` are to be trimmed.
        batch_size:
            A batch size used for model processing.
        num_threads:
            A number of threads to use for a dataloader.
        verbose:
            An indication of whether a progress bar is to be displayed during the embeddings calculation.
        return_sentence_level_score:
            An indication of whether a sentence-level InfoLM score should be returned.

    Returns:
        A corpus-level InfoLM score.
        (Optionally) A list of sentence-level InfoLM scores if `return_sentence_level_score=True`.

    Example:
        >>> from torchmetrics.functional.text.infolm import infolm
        >>> preds = ['he read the book because he was interested in world history']
        >>> target = ['he was interested in world history because he read the book']
        >>> infolm(preds, target, model_name_or_path='google/bert_uncased_L-2_H-128_A-2', idf=False)
        tensor(-0.1784)

    References:
        [1] InfoLM: A New Metric to Evaluate Summarization & Data2Text Generation by Pierre Colombo, Chloé Clavel and
        Pablo Piantanida `InfoLM`_
    """
    tokenizer, model = _load_tokenizer_and_model(model_name_or_path, device)
    information_measure_cls = _InformationMeasure(information_measure, alpha, beta)
    max_length = max_length or model.config.max_length
    special_tokens_map = _get_special_tokens_map(tokenizer)

    preds_input_ids, preds_attention_mask, target_input_ids, target_attention_mask = _infolm_update(
        preds, target, tokenizer, max_length
    )
    preds_dataloader = _get_dataloader(preds_input_ids, preds_attention_mask, idf, batch_size, num_threads)
    target_dataloader = _get_dataloader(target_input_ids, target_attention_mask, idf, batch_size, num_threads)

    info_lm_score = _infolm_compute(
        model,
        preds_dataloader,
        target_dataloader,
        temperature,
        idf,
        information_measure_cls,
        special_tokens_map,
        verbose,
    )

    if return_sentence_level_score:
        return info_lm_score.mean(), info_lm_score

    return info_lm_score.mean()
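

# An end-to-end sketch mirroring the docstring example (illustrative only, never executed
# on import; the tiny BERT checkpoint is the one used in the example above, and the exact
# score depends on the downloaded weights):
#
# >>> preds = ["he read the book because he was interested in world history"]
# >>> target = ["he was interested in world history because he read the book"]
# >>> score, sentence_scores = infolm(
# ...     preds,
# ...     target,
# ...     model_name_or_path="google/bert_uncased_L-2_H-128_A-2",
# ...     idf=False,
# ...     return_sentence_level_score=True,
# ... )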