o
    %ݫi                     @   s  d Z ddlmZmZ ddlZddlmZmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZ ddlmZmZmZmZ G d	d
 d
Zd$ddZd%ddZG dd deZG dd deZG dd dZG dd deZdd Z	d&ddZG dd deZG d d! d!Z d"d# Z!dS )'u   The ``metric_stats`` module provides an abstract class for storing
statistics produced over the course of an experiment and summarizing them.

Authors:
 * Peter Plantinga 2020
 * Mirco Ravanelli 2020
 * Gaëlle Laperrière 2021
 * Sahar Ghannay 2021
    )CallableOptionalN)Paralleldelayed)extract_concepts_values
merge_char
split_word)print_alignmentsprint_wer_summary)undo_padding)EDIT_SYMBOLS_str_equalswer_details_for_batchwer_summaryc                   @   s>   e Zd ZdZdddZdd Zdd	 ZdddZdddZd
S )MetricStatsa  A default class for storing and summarizing arbitrary metrics.

    More complex metrics can be created by sub-classing this class.

    Arguments
    ---------
    metric : function
        The function to use to compute the relevant metric. Should take
        at least two arguments (predictions and targets) and can
        optionally take the relative lengths of either or both arguments.
        Not usually used in sub-classes.
    n_jobs : int
        The number of jobs to use for computing the metric. If this is
        more than one, every sample is processed individually, otherwise
        the whole batch is passed at once.
    batch_eval : bool
        When True it feeds the evaluation metric with the batched input.
        When False and n_jobs=1, it performs metric evaluation one-by-one
        in a sequential way. When False and n_jobs>1, the evaluation
        runs in parallel over the different inputs using joblib.

    Example
    -------
    >>> from speechbrain.nnet.losses import l1_loss
    >>> loss_stats = MetricStats(metric=l1_loss)
    >>> loss_stats.append(
    ...      ids=["utterance1", "utterance2"],
    ...      predictions=torch.tensor([[0.1, 0.2], [0.2, 0.3]]),
    ...      targets=torch.tensor([[0.1, 0.2], [0.1, 0.2]]),
    ...      reduction="batch",
    ... )
    >>> stats = loss_stats.summarize()
    >>> stats['average']
    0.050...
    >>> stats['max_score']
    0.100...
    >>> stats['max_id']
    'utterance2'
       Tc                 C   s   || _ || _|| _|   d S N)metricn_jobs
batch_evalclearselfr   r   r    r   R/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/utils/metric_stats.py__init__H   s   zMetricStats.__init__c                 C   s   g | _ g | _i | _dS )z=Creates empty container for storage, removing existing stats.N)scoresidssummaryr   r   r   r   r   N   s   
zMetricStats.clearc                 O   s   | j | | jr| j|i | }n(d|vsd|vr td| jdkr0tdd| ji|}ntd| j| jd|}| j	| dS )	8  Store a particular set of metric scores.

        Arguments
        ---------
        ids : list
            List of ids corresponding to utterances.
        *args : tuple
            Arguments to pass to the metric function.
        **kwargs : dict
            Arguments to pass to the metric function.
        predicttarget>Must pass 'predict' and 'target' as kwargs if batch_eval=Falser   r   r   r   Nr   )
r   extendr   r   detach
ValueErrorr   sequence_evaluationmultiprocess_evaluationr   )r   r   argskwargsr   r   r   r   appendT   s   
zMetricStats.appendNc                 C   s   t t | j}t t | j}tt| jt| j t| j| | j| t| j| | j| d| _	|dur>| j	| S | j	S )a  Summarize the metric scores, returning relevant stats.

        Arguments
        ---------
        field : str
            If provided, only returns selected statistic. If not,
            returns all computed statistics.

        Returns
        -------
        float or dict
            Returns a float if ``field`` is provided, otherwise
            returns a dictionary containing all computed stats.
        )average	min_scoremin_id	max_scoremax_idN)
torchargmintensorr   argmaxfloatsumlenr   r   )r   field	min_index	max_indexr   r   r   	summarizev   s   
zMetricStats.summarizeFc                 C   s   | j s|   d| j d  d}|d| j d  d7 }|d| j d  d7 }|d	| j d
  d7 }|d| j d  d7 }|| |rIt| dS dS )zWrite all relevant statistics to file.

        Arguments
        ---------
        filestream : file-like object
            A stream for the stats to be written to.
        verbose : bool
            Whether to also print the stats to stdout.
        zAverage score: r-   
zMin error: r.    zid: r/   zMax error: r0   r1   N)r   r<   writeprint)r   
filestreamverbosemessager   r   r   write_stats   s   

zMetricStats.write_stats)r   Tr   )F)	__name__
__module____qualname____doc__r   r   r,   r<   rD   r   r   r   r   r      s    
(
"r      c              
      s   |dur%|| d    }dd t||D }dd t||D }	 zt|dd fd	d
t||D }W |S  tyU } zt| td W Y d}~nd}~ww q&)z6Runs metric evaluation if parallel over multiple jobs.Nr   c                 S       g | ]\}}|d |   qS r   cpu.0plengthr   r   r   
<listcomp>        z+multiprocess_evaluation.<locals>.<listcomp>c                 S   rJ   r   rK   rN   trP   r   r   r   rQ      rR   T   )r   timeoutc                 3   s"    | ]\}}t  ||V  qd S r   )r   )rN   rO   rT   r   r   r   	<genexpr>   s    
z*multiprocess_evaluation.<locals>.<genexpr>z)Evaluation timeout...... (will try again))sizeroundintrL   zipr   	Exceptionr@   )r   r!   r"   lengthsr   r   er   rW   r   r)      s"   r)   c                 C   sz   |dur%|| d    }dd t||D }dd t||D }g }t||D ]\}}| ||}|| q,|S )z4Runs metric evaluation sequentially over the inputs.Nr   c                 S   rJ   r   rK   rM   r   r   r   rQ      rR   z'sequence_evaluation.<locals>.<listcomp>c                 S   rJ   r   rK   rS   r   r   r   rQ      rR   )rY   rZ   r[   rL   r\   r,   )r   r!   r"   r^   r   rO   rT   scorer   r   r   r(      s   
r(   c                   @   s^   e Zd ZdZdddddddefdeeegef fddZ						dd
dZ	dddZ
dd Zd	S )ErrorRateStatsa  A class for tracking error rates (e.g., WER, PER).

    Arguments
    ---------
    merge_tokens : bool
        Whether to merge the successive tokens (used for e.g.,
        creating words out of character tokens).
        See ``speechbrain.dataio.dataio.merge_char``.
    split_tokens : bool
        Whether to split tokens (used for e.g. creating
        characters out of word tokens).
        See ``speechbrain.dataio.dataio.split_word``.
    space_token : str
        The character to use for boundaries. Used with ``merge_tokens``
        this represents character to split on after merge.
        Used with ``split_tokens`` the sequence is joined with
        this token in between, and then the whole sequence is split.
    keep_values : bool
        Whether to keep the values of the concepts or not.
    extract_concepts_values : bool
        Process the predict and target to keep only concepts and values.
    tag_in : str
        Start of the concept ('<' for example).
    tag_out : str
        End of the concept ('>' for example).
    equality_comparator : Callable[[str, str], bool]
        The function used to check whether two words are equal.

    Example
    -------
    >>> cer_stats = ErrorRateStats()
    >>> i2l = {0: 'a', 1: 'b'}
    >>> cer_stats.append(
    ...     ids=['utterance1'],
    ...     predict=torch.tensor([[0, 1, 1]]),
    ...     target=torch.tensor([[0, 1, 0]]),
    ...     target_len=torch.ones(1),
    ...     ind2lab=lambda batch: [[i2l[int(x)] for x in seq] for seq in batch],
    ... )
    >>> stats = cer_stats.summarize()
    >>> stats['WER']
    33.33...
    >>> stats['insertions']
    0
    >>> stats['deletions']
    0
    >>> stats['substitutions']
    1
    F_T equality_comparatorc	           	      C   s<   |    || _|| _|| _|| _|| _|| _|| _|| _d S r   )	r   merge_tokenssplit_tokensspace_tokenr   keep_valuestag_intag_outrd   )	r   re   rf   rg   rh   r   ri   rj   rd   r   r   r   r     s   
zErrorRateStats.__init__Nc                 C   s   | j | |durt||}|durt||}|dur$||}||}| jr5t|| jd}t|| jd}| jrFt|| jd}t|| jd}| jrct|| j	| j
| j| jd}t|| j	| j
| j| jd}t|||d| jd}| j| dS )a^  Add stats to the relevant containers.

        * See MetricStats.append()

        Arguments
        ---------
        ids : list
            List of ids corresponding to utterances.
        predict : torch.tensor
            A predicted output, for comparison with the target output
        target : torch.tensor
            The correct reference output, for comparison with the prediction.
        predict_len : torch.tensor
            The predictions relative lengths, used to undo padding if
            there is padding present in the predictions.
        target_len : torch.tensor
            The target outputs' relative lengths, used to undo padding if
            there is padding present in the target.
        ind2lab : callable
            Callable that maps from indices to labels, operating on batches,
            for writing alignments.
        N)spaceT)compute_alignmentsrd   )r   r%   r   re   r   rg   rf   r   r   rh   ri   rj   r   rd   r   )r   r   r!   r"   predict_len
target_lenind2labr   r   r   r   r,     sJ   

zErrorRateStats.appendc                 C   s4   t | j| _| jd | jd< |dur| j| S | jS )zhSummarize the error_rate and return relevant statistics.

        * See MetricStats.summarize()
        WER
error_rateN)r   r   r   )r   r9   r   r   r   r<   c  s
   
zErrorRateStats.summarizec                 C   s*   | j s|   t| j | t| j| dS )zoWrite all relevant info (e.g., error rate alignments) to file.
        * See MetricStats.write_stats()
        N)r   r<   r
   r	   r   r   rA   r   r   r   rD   r  s   zErrorRateStats.write_stats)NNNr   )rE   rF   rG   rH   r   r   strboolr   r,   r<   rD   r   r   r   r   ra      s&    4	


Mra   c                	   @   sZ   e Zd ZdZ	ddedeeee ee gef defddZ	dd	 Z
dddZdd Zd
S )WeightedErrorRateStatsa  Metric that reweighs the WER from :class:`~ErrorRateStats` with any
    chosen method. This does not edit the sequence of found edits
    (insertion/deletion/substitution) but multiplies their impact on the metric
    by a value between 0 and 1 as returned by the cost function.

    Arguments
    ---------
    base_stats : ErrorRateStats
        The base WER calculator to use.
    cost_function : Callable[[str, Optional[str], Optional[str]], float]
        Cost function of signature `fn(edit_symbol, a, b) -> float`, where the
        returned value, between 0 and 1, is the weight that should be assigned
        to a particular edit in the weighted WER calculation.
        In the case of insertions and deletions, either of `a` or `b` may be
        `None`. In the case of substitutions, `a` and `b` will never be `None`.
    weight_name : str
        Prefix to be prepended to each metric name (e.g. `xxx_wer`)
    weighted
base_statscost_functionweight_namec                 C   s   |    || _|| _|| _d S r   )r   rw   rx   ry   )r   rw   rx   ry   r   r   r   r     s   
zWeightedErrorRateStats.__init__c                 O   s   t d)a}  Append function, which should **NOT** be used for the weighted error
        rate stats. Please append to the specified `base_stats` instead.

        `WeightedErrorRateStats` reuses the scores from the base
        :class:`~ErrorRateStats` class.

        Arguments
        ---------
        *args : tuple
            Ignored.
        **kwargs : dict
            Ignored.
        z]Cannot append to a WeightedErrorRateStats. You should only append to the base ErrorRateStats.)r'   )r   r*   r+   r   r   r   r,     s   zWeightedErrorRateStats.appendNc              
   C   s  d}d}d}d}t | jjD ]\}}d}d}	d}
d}|d D ]N\}}}|dur-|d | nd}|dur9|d | nd}|td krh| |||}|td krS||7 }n|td kr^|
|7 }
n
|td	 krh|	|7 }	|d
7 }q||	 |
 }|| }| j| jj| |d ||	|
|d ||7 }||	7 }||
7 }||7 }q|| | }|| }| j d|d | j d|| j d|| j d|| j d|i| _|dur| j| S | jS )a  Returns a dict containing some detailed WER statistics after
        weighting every edit with a weight determined by `cost_function`
        (returning `0.0` for no error, `1.0` for the default error behavior, and
        anything in between).

        Does not require :meth:`~ErrorRateStats.summarize` to have been called.

        Full set of fields, **each of which are prepended with
        `<weight_name_specified_at_init>_`**:
        - `wer`: Weighted WER (ratio `*100`)
        - `insertions`: Weighted insertions
        - `substitutions`: Weighted substitutions
        - `deletions`: Weighted deletions
        - `num_edits`: Sum of weighted insertions/substitutions/deletions

        Additionally, a `scores` list is populated by this function for each
        pair of sentences. Each entry of that list is a dict, with the fields:
        - `key`: the ID of the utterance.
        - `WER`, `insertions`, `substitutions`, `deletions`, `num_edits` with
          the same semantics as described above, but at sentence level rather
          than global.

        Arguments
        ---------
        field : str, optional
            The field to return, if you are only interested in one of them.
            If specified, a single `float` is returned, otherwise, a dict is.

        Returns
        -------
        dict from str to float, if `field is None`
            A dictionary of the fields documented above.
        float, if `field is not None`
            The single field selected by `field`.
                	alignmentN
ref_tokens
hyp_tokenseqinsdelsub      ?g      Y@)keyrp   
insertionssubstitutions	deletions	num_edits_wer_insertions_substitutions
_deletions
_num_edits)		enumeraterw   r   r   rx   r,   r   ry   r   )r   r9   weighted_insertionsweighted_substitutionsweighted_deletionstotali	utteranceutt_weighted_insertionsutt_weighted_substitutionsutt_weighted_deletions	utt_totaledit_symbola_idxb_idxab
pair_scoreutt_weighted_editsutt_weighted_wer_ratioweighted_editsweighted_wer_ratior   r   r   r<     sv   %






z WeightedErrorRateStats.summarizec                 C   sR   | j s|   td| j d|d | j  D ]\}}t| d| |d qdS )zWrite all relevant info to file; here, only the weighted info as
        returned by `summarize`.
        See :meth:`~ErrorRateStats.write_stats`.
        zWeighted WER metrics (z):file: N)r   r<   r@   ry   items)r   rA   kvr   r   r   rD      s   z"WeightedErrorRateStats.write_stats)rv   r   )rE   rF   rG   rH   ra   r   rs   r   r6   r   r,   r<   rD   r   r   r   r   ru   }  s    

pru   c                   @   s\   e Zd ZdZdeegeej f de	de	de	fddZ
ded	ee d
ee de	fddZdS )EmbeddingErrorRateSimilaritya  Implements the similarity function from the EmbER metric as defined by
    https://www.isca-archive.org/interspeech_2022/roux22_interspeech.pdf

    This metric involves a dictionary to map a token to a single word embedding.
    Substitutions in the WER get weighted down when the embeddings are similar
    enough. The goal is to reduce the impact of substitution errors with small
    semantic impact. Only substitution errors get weighted.

    This is done by computing the cosine similarity between the two embeddings,
    then weighing the substitution with `low_similarity_weight` if
    `similarity >= threshold` or with `high_similarity_weight` otherwise (e.g.
    a substitution with high similarity could be weighted down to matter 10% as
    much as a substitution with low similarity).

    .. note ::
        The cited paper recommended `(1.0, 0.1, 0.4)` as defaults for fastTexst
        French embeddings, chosen empirically. When using different embeddings,
        you might want to test other values; thus we don't provide defaults.

    Arguments
    ---------
    embedding_function : Callable[[str], Optional[torch.Tensor]]
        Function that returns an embedding (as a :class:`torch.Tensor`) from a
        word. If no corresponding embedding could be found for the word, should
        return `None`. In that case, `low_similarity_weight` will be chosen.
    low_similarity_weight : float
        Weight applied to the substitution if `cosine_similarity < threshold`.
    high_similarity_weight : float
        Weight applied to the substitution if `cosine_similarity >= threshold`.
    threshold : float
        Cosine similarity threshold used to select by how much a substitution
        error should be weighed for this word.
    embedding_functionlow_similarity_weighthigh_similarity_weight	thresholdc                 C   s   || _ || _|| _|| _d S r   )r   r   r   r   )r   r   r   r   r   r   r   r   r   Q  s   
z%EmbeddingErrorRateSimilarity.__init__r   r   r   returnc                 C   s   |t d t d fv rdS |t d krW|du s|dkr| jS |du s%|dkr(| jS | |}|du r4| jS | |}|du r@| jS tjjj||dd }|| jkrT| j	S | jS d	S )
a-  Returns the weight that should be associated with a specific edit
        in the WER calculation.

        Compatible candidate for the cost function of
        :class:`~WeightedErrorRateStats` so an instance of this class can be
        passed as a `cost_function`.

        Arguments
        ---------
        edit_symbol: str
            Edit symbol as assigned by the WER functions, see `EDIT_SYMBOLS`.
        a: str, optional
            First word to compare (if present)
        b: str, optional
            Second word to compare (if present)

        Returns
        -------
        float
            Weight to assign to the edit.
            For actual edits, either `low_similarity_weight` or
            `high_similarity_weight` depending on the embedding distance and
            threshold.
        r   r   r   r   Nrc   r   dimrz   )
r   r   r   r2   nn
functionalcosine_similarityitemr   r   )r   r   r   r   a_embb_emb
similarityr   r   r   __call__]  s,   


z%EmbeddingErrorRateSimilarity.__call__N)rE   rF   rG   rH   r   rs   r   r2   Tensorr6   r   r   r   r   r   r   r   .  s(    "
r   c                   @   s6   e Zd ZdZdddZdd Zdd Z	
dddZd	S )BinaryMetricStatsz?Tracks binary metrics, such as precision, recall, F1, EER, etc.r   c                 C   s   |    || _d S r   )r   positive_label)r   r   r   r   r   r     s   
zBinaryMetricStats.__init__c                 C   s   g | _ g | _g | _i | _dS )zClears the stored metrics.N)r   r   labelsr   r   r   r   r   r        
zBinaryMetricStats.clearc                 C   s0   | j | | j|  | j|  dS )a  Appends scores and labels to internal lists.

        Does not compute metrics until time of summary, since
        automatic thresholds (e.g., EER) need full set of scores.

        Arguments
        ---------
        ids : list
            The string ids for the samples.
        scores : list
            The scores corresponding to the ids.
        labels : list
            The labels corresponding to the ids.
        N)r   r%   r   r&   r   )r   r   r   r   r   r   r   r,     s   zBinaryMetricStats.appendN:0yE>c                 C   s  t | jtrt| j| _t| j| _|du r| j| j| jkjdd }| j| j| jkjdd }|durzt||krXt	|\}}|dd t
dt|tt|| D  }t||krzt	|\}}|dd t
dt|tt|| D  }t||\}	}| j|k }
| j}t|
|  }| jd< td	|
 d	|   }| jd
< t|
d	|   }| jd< td	|
 |  }| jd< ||| |  | jd< ||| |  | jd< || || |  | jd< || jd< ||| |  | jd< ||| |  | jd< d	|d  | d	|d  | |d |  |  | jd< || ||  || ||  ||  ||  | d  | jd< |durM| j| S | jS )aF  Compute statistics using a full set of scores.

        Full set of fields:
         - TP - True Positive
         - TN - True Negative
         - FP - False Positive
         - FN - False Negative
         - FAR - False Acceptance Rate
         - FRR - False Rejection Rate
         - DER - Detection Error Rate (EER if no threshold passed)
         - threshold - threshold (EER threshold if no threshold passed)
         - precision - Precision (positive predictive value)
         - recall - Recall (sensitivity)
         - F-score - Balance of precision and recall (equal if beta=1)
         - MCC - Matthews Correlation Coefficient

        Arguments
        ---------
        field : str
            A key for selecting a single statistic. If not provided,
            a dict with all statistics is returned.
        threshold : float
            If no threshold is provided, equal error rate is used.
        max_samples: float
            How many samples to keep for positive/negative scores.
            If no max_samples is provided, all scores are kept.
            Only effective when threshold is None.
        beta : float
            How much to weight precision vs recall in F-score. Default
            of 1. is equal weight, while higher values weight recall
            higher, and lower values weight precision higher.
        eps : float
            A small value to avoid dividing by zero.

        Returns
        -------
        summary
            if field is specified, only returns the score for that field.
            if field is None, returns the full set of fields.
        NT)as_tuplec                 S      g | ]}|qS r   r   rN   r   r   r   r   rQ         z/BinaryMetricStats.summarize.<locals>.<listcomp>r   c                 S   r   r   r   r   r   r   r   rQ     r   TPr   TNFPFNFARFRRDERr   	precisionrecallg       @zF-scoreg      ?MCC)
isinstancer   listr2   stackr   r   nonzeror8   sortranger[   EERr6   mulr7   r   )r   r9   r   max_samplesbetaepspositive_scoresnegative_scoresrb   eerpredtruer   r   r   r   r   r   r   r<     sv   +
$  

"

zBinaryMetricStats.summarize)r   )NNNr   r   )rE   rF   rG   rH   r   r   r,   r<   r   r   r   r   r     s    
r   c                 C   s  t t | |g\}}t |}|dd |dd  d }t t ||g\}}d}d}d}t|D ]B\}}	| |	k}
|
d | jd  }~
||	k}|d |jd  }~||  	 t|| k sm|dkrw|}|	 }|	 }q5|| d }t|t|| fS )aq  Computes the EER (and its threshold).

    Arguments
    ---------
    positive_scores : torch.tensor
        The scores from entries of the same class.
    negative_scores : torch.tensor
        The scores from entries of different classes.

    Returns
    -------
    EER : float
        The EER score.
    threshold : float
        The corresponding threshold for the EER score.

    Example
    -------
    >>> positive_scores = torch.tensor([0.6, 0.7, 0.8, 0.5])
    >>> negative_scores = torch.tensor([0.4, 0.3, 0.2, 0.1])
    >>> val_eer, threshold = EER(positive_scores, negative_scores)
    >>> val_eer
    0.0
    r   r   N   )
r2   r   catuniquer   r7   r6   shapeabsr   )r   r   
thresholdsrb   intermediate_thresholdsr:   	final_FRR	final_FARr   
cur_threshpos_scores_thresholdr   neg_scores_thresholdr   r   r   r   r   r   )  s*   
$r   r   {Gz?c                 C   s$  t t | |g\}}t |}|dd |dd  d }t t ||g\}}t t|| dg } | dd|k}|d | j	d  }	~ ~t t||dg }|dd|k}
|
d |j	d  }~~
||	 | || d|   }t j
|dd\}}t|t|| fS )a  Computes the minDCF metric normally used to evaluate speaker verification
    systems. The min_DCF is the minimum of the following C_det function computed
    within the defined threshold range:

    C_det =  c_miss * p_miss * p_target + c_fa * p_fa * (1 -p_target)

    where p_miss is the missing probability and p_fa is the probability of having
    a false alarm.

    Arguments
    ---------
    positive_scores : torch.tensor
        The scores from entries of the same class.
    negative_scores : torch.tensor
        The scores from entries of different classes.
    c_miss : float
         Cost assigned to a missing error (default 1.0).
    c_fa : float
        Cost assigned to a false alarm (default 1.0).
    p_target: float
        Prior probability of having a target (default 0.01).

    Returns
    -------
    minDCF : float
        The minDCF score.
    threshold : float
        The corresponding threshold for the minDCF score.

    Example
    -------
    >>> positive_scores = torch.tensor([0.6, 0.7, 0.8, 0.5])
    >>> negative_scores = torch.tensor([0.4, 0.3, 0.2, 0.1])
    >>> val_minDCF, threshold = minDCF(positive_scores, negative_scores)
    >>> val_minDCF
    0.0
    r   r   r   Nr   r   )r2   r   r   r   r8   	unsqueeze	transposer7   r6   r   min)r   r   c_missc_fap_targetr   rb   r   r   p_missr   p_fac_detc_minr:   r   r   r   minDCFd  s*   )
r   c                       s   e Zd ZdZ fddZd%ddZd%ddZd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Z  ZS )&ClassificationStatsaC  Computes statistics pertaining to multi-label classification tasks, as
    well as tasks that can be loosely interpreted as such for the purpose of evaluations.

    Example
    -------
    >>> import sys
    >>> from speechbrain.utils.metric_stats import ClassificationStats
    >>> cs = ClassificationStats()
    >>> cs.append(
    ...     ids=["ITEM1", "ITEM2", "ITEM3", "ITEM4"],
    ...     predictions=[
    ...         "M EY K AH",
    ...         "T EY K",
    ...         "B AE D",
    ...         "M EY K",
    ...     ],
    ...     targets=[
    ...         "M EY K",
    ...         "T EY K",
    ...         "B AE D",
    ...         "M EY K",
    ...     ],
    ...     categories=[
    ...         "make",
    ...         "take",
    ...         "bad",
    ...         "make"
    ...     ]
    ... )
    >>> cs.write_stats(sys.stdout)
    Overall Accuracy: 75%
    <BLANKLINE>
    Class-Wise Accuracy
    -------------------
    bad -> B AE D : 1 / 1 (100.00%)
    make -> M EY K: 1 / 2 (50.00%)
    take -> T EY K: 1 / 1 (100.00%)
    <BLANKLINE>
    Confusion
    ---------
    Target: bad -> B AE D
      -> B AE D   : 1 / 1 (100.00%)
    Target: make -> M EY K
      -> M EY K   : 1 / 2 (50.00%)
      -> M EY K AH: 1 / 2 (50.00%)
    Target: take -> T EY K
      -> T EY K   : 1 / 1 (100.00%)
    >>> summary = cs.summarize()
    >>> summary['accuracy']
    0.75
    >>> summary['classwise_stats'][('bad', 'B AE D')]
    {'total': 1.0, 'correct': 1.0, 'accuracy': 1.0}
    >>> summary['classwise_stats'][('make', 'M EY K')]
    {'total': 2.0, 'correct': 1.0, 'accuracy': 0.5}
    >>> summary['keys']
    [('bad', 'B AE D'), ('make', 'M EY K'), ('take', 'T EY K')]
    >>> summary['predictions']
    ['B AE D', 'M EY K', 'M EY K AH', 'T EY K']
    >>> summary['classwise_total']
    {('bad', 'B AE D'): 1.0, ('make', 'M EY K'): 2.0, ('take', 'T EY K'): 1.0}
    >>> summary['classwise_correct']
    {('bad', 'B AE D'): 1.0, ('make', 'M EY K'): 1.0, ('take', 'T EY K'): 1.0}
    >>> summary['classwise_accuracy']
    {('bad', 'B AE D'): 1.0, ('make', 'M EY K'): 0.5, ('take', 'T EY K'): 1.0}
    c                    s   t   |   d | _d S r   )superr   r   r   	__class__r   r   r     s   
zClassificationStats.__init__Nc                 C   s@   | j | | j| | j| |dur| j| dS dS )a/  
        Appends inputs, predictions and targets to internal
        lists

        Arguments
        ---------
        ids: list
            the string IDs for the samples
        predictions: list
            the model's predictions (human-interpretable,
            preferably strings)
        targets: list
            the ground truths (human-interpretable, preferably strings)
        categories: list
            an additional way to classify training
            samples. If available, the categories will
            be combined with targets
        N)r   r%   predictionstargets
categories)r   r   r   r   r   r   r   r   r,     s   zClassificationStats.appendc                    sz   |    |  }|  || || j| jd| _dD ]  fdd| jd  D | jd  < q|dur:| j| S | jS )a  Summarize the classification metric scores

        The following statistics are computed:

        accuracy: the overall accuracy (# correct / # total)
        confusion_matrix: a dictionary of type
            {(target, prediction): num_entries} representing
            the confusion matrix
        classwise_stats: computes the total number of samples,
            the number of correct classifications and accuracy
            for each class
        keys: all available class keys, which can be either target classes
            or (category, target) tuples
        predictions: all available predictions all predictions the model
            has made

        Arguments
        ---------
        field : str
            If provided, only returns selected statistic. If not,
            returns all computed statistics.

        Returns
        -------
        float or dict
            Returns a float if ``field`` is provided, otherwise
            returns a dictionary containing all computed stats.
        )accuracyconfusion_matrixclasswise_statskeysr   r   correctr   c                    s   i | ]	\}}||  qS r   r   )rN   r   	key_statsstatr   r   
<dictcomp>4  s    z1ClassificationStats.summarize.<locals>.<dictcomp>r   
classwise_N)_build_lookups_compute_confusion_matrix_compute_accuracy_compute_classwise_stats_available_keys_available_predictionsr   r   )r   r9   r   r   r  r   r<     s   

zClassificationStats.summarizec                 C   s&   t dd t| j| jD t| j S )Nc                 s   s    | ]	\}}||kV  qd S r   r   )rN   
predictionr"   r   r   r   rX   >  s
    
z8ClassificationStats._compute_accuracy.<locals>.<genexpr>)r7   r\   r   r   r8   r   r   r   r   r   r	  =  s
   z%ClassificationStats._compute_accuracyc                 C   sH   |   | _tttdd | jD | _| | j| _| | j| _	d S )Nc                 s   s    | ]}|V  qd S r   r   rN   r  r   r   r   rX   F  s    z5ClassificationStats._build_lookups.<locals>.<genexpr>)
	_get_keysr  r   sortedsetr   r  _index_lookup_keys_lookup_predictions_lookupr   r   r   r   r  C  s   

z"ClassificationStats._build_lookupsc                 C   sV   t t| jt| j}|  D ]\}}| j| }| j| }|||f  d7  < q|S )Nr   )r2   zerosr8   r  r  _get_confusion_entriesr  r  )r   r   r   r  key_idxprediction_idxr   r   r   r  M  s   

z-ClassificationStats._compute_confusion_matrixc                    sh    j dd}jsjndd jD }t fddt|D }|| }dd tj|||D S )Nr   r   c                 S   s   g | ]\}}|qS r   r   )rN   rb   r"   r   r   r   rQ   a      z@ClassificationStats._compute_classwise_stats.<locals>.<listcomp>c                    s0   g | ]\}}|j v r |j | f nd qS )r   )r  )rN   idxr"   r   r   r   r   rQ   d  s    
c                 S   s.   i | ]\}}}}||  |  |  d qS )r   )r   )rN   r   
item_totalitem_correctitem_accuracyr   r   r   r  n  s    
z@ClassificationStats._compute_classwise_stats.<locals>.<dictcomp>)r7   r   r  r2   r4   r   r\   )r   r   r   key_targetsr  r   r   r  r   r
  W  s    

z,ClassificationStats._compute_classwise_statsc                 C   s,   | j rt| j | j}n| j}ttt|S r   )r   r\   r   r   r  r  )r   r   r   r   r   r  y  s   zClassificationStats._get_keysc                 C   s>   | j rdd t| j | j| jD }nt| j| j}t|}|S )Nc                 s   s"    | ]\}}}||f|fV  qd S r   r   )rN   categoryr"   r  r   r   r   rX     s
    

z=ClassificationStats._get_confusion_entries.<locals>.<genexpr>)r   r\   r   r   r   )r   resultr   r   r   r    s   z*ClassificationStats._get_confusion_entriesc                 C   s   dd t |D S )Nc                 S   s   i | ]\}}||qS r   r   )rN   r  r   r   r   r   r    s    z5ClassificationStats._index_lookup.<locals>.<dictcomp>)r   )r   r   r   r   r   r    s   z!ClassificationStats._index_lookupc                 C   s   g | _ g | _g | _g | _dS )zClears the collected statisticsN)r   r   r   r   r   r   r   r   r     r   zClassificationStats.clearc                 C   sX   | j du r	|   td| j d d|d t|d | | t|d | | dS )zOutputs the stats to the specified filestream in a human-readable format

        Arguments
        ---------
        filestream: file
            a file-like object
        NzOverall Accuracy: r   z.0%r   )r   r<   r@   _write_classwise_stats_write_confusionrr   r   r   r   rD     s   



zClassificationStats.write_statsc              
      s    j d|d  fdd jD }tdd | D } jD ]/} jd | }  ||}t| dt|d	  d
t|d  d|d dd|d qd S )NzClass-Wise AccuracyrA   c                    s   i | ]}|  |qS r   )_format_key_label)rN   r   r   r   r   r    s    z>ClassificationStats._write_classwise_stats.<locals>.<dictcomp>c                 s       | ]}t |V  qd S r   r8   )rN   labelr   r   r   rX     s    z=ClassificationStats._write_classwise_stats.<locals>.<genexpr>r   r   r   / r    (r   .2%)r   )	_write_headerr  maxvaluesr   _pad_to_lengthr%  r@   r[   )r   rA   
key_labelslongest_key_labelr   statspadded_labelr   r   r   r"    s   


0z*ClassificationStats._write_classwise_statsc                 C   s   | j d|d tdd | jD }| jd  }|jdd}t| j||D ]K\}}}| |}t	d| |d	 t
|d
k\}	| }|	D ])}
||
  }| j|
 }| ||}t	d| d| d| d|| dd	|d	 qFq%d S )N	Confusionr$  c                 s   r&  r   r'  r  r   r   r   rX     s    
z7ClassificationStats._write_confusion.<locals>.<genexpr>r   r   r   zTarget: r   r   z  -> r   r)  r*  r+  r,  )r-  r.  r  r   r[   r7   r\   r  r%  r@   r2   wherer   r0  )r   rA   longest_predictionr   totalsr   key_predictionsr   target_labelindexesindexcountr  r4  r   r   r   r#    s2   

"z$ClassificationStats._write_confusionc                 C   s$   t ||d t dt| |d d S )Nr   -)r@   r8   )r   headerrA   r   r   r   r-    s   z!ClassificationStats._write_headerc                 C   s   t d|t| }|d|  S )Nr   r>   )r.  r8   )r   r(  rP   paddingr   r   r   r0    s   z"ClassificationStats._pad_to_lengthc                 C   s(   | j r|\}}| d| }|S |}|S )Nz -> )r   )r   r   r   r"   r(  r   r   r   r%    s   z%ClassificationStats._format_key_labelr   )rE   rF   rG   rH   r   r,   r<   r	  r  r  r
  r  r  r  r   rD   r"  r#  r-  r0  r%  __classcell__r   r   r   r   r     s&    B

0

"r   c                   @   s4   e Zd ZdZdddZdd Zdd	 ZdddZd
S )MultiMetricStatsaT	  A wrapper that evaluates multiple metrics simultaneously

    Arguments
    ---------
    metric : function
        The function to use to compute the relevant metrics. Should take
        at least two arguments (predictions and targets) and can
        optionally take the relative lengths of either or both arguments.
        The function should return a dict or a namedtuple
    n_jobs : int
        The number of jobs to use for computing the metric. If this is
        more than one, every sample is processed individually, otherwise
        the whole batch is passed at once.
    batch_eval : bool
        When True it feeds the evaluation metric with the batched input.
        When False and n_jobs=1, it performs metric evaluation one-by-one
        in a sequential way. When False and n_jobs>1, the evaluation
        runs in parallel over the different inputs using joblib.

    Example
    -------
    >>> def metric(a, b):
    ...    return {
    ...        "sum": a + b,
    ...        "diff": a - b,
    ...        "sum_sq": a**2 + b**2
    ...    }
    >>> multi_metric = MultiMetricStats(metric, batch_eval=True)
    >>> multi_metric.append([1, 2], a=torch.tensor([2.0, 1.0]), b=torch.tensor([1.0, 2.0]))
    >>> multi_metric.append([3, 4], a=torch.tensor([4.0, 5.0]), b=torch.tensor([0.0, 1.0]))
    >>> multi_metric.append([5, 6], a=torch.tensor([2.0, 4.0]), b=torch.tensor([4.0, 2.0]))
    >>> multi_metric.append([7, 8], a=torch.tensor([2.0, 4.0]), b=torch.tensor([4.0, 2.0]))
    >>> multi_metric.summarize() #doctest: +NORMALIZE_WHITESPACE
    {'sum': {'average': 5.0,
      'min_score': 3.0,
      'min_id': 1,
      'max_score': 6.0,
      'max_id': 4},
     'diff': {'average': 1.0,
      'min_score': -2.0,
      'min_id': 5,
      'max_score': 4.0,
      'max_id': 3},
     'sum_sq': {'average': 16.5,
      'min_score': 5.0,
      'min_id': 1,
      'max_score': 26.0,
      'max_id': 4}}
    >>> multi_metric.summarize(flat=True) #doctest: +NORMALIZE_WHITESPACE
    {'sum_average': 5.0,
     'sum_min_score': 3.0,
     'sum_min_id': 1,
     'sum_max_score': 6.0,
     'sum_max_id': 4,
     'diff_average': 1.0,
     'diff_min_score': -2.0,
     'diff_min_id': 5,
     'diff_max_score': 4.0,
     'diff_max_id': 3,
     'sum_sq_average': 16.5,
     'sum_sq_min_score': 5.0,
     'sum_sq_min_id': 1,
     'sum_sq_max_score': 26.0,
     'sum_sq_max_id': 4}
    r   Fc                 C   s&   t || _|| _|| _g | _i | _d S r   )_dictifyr   r   r   r   metricsr   r   r   r   r   &  s
   

zMultiMetricStats.__init__c                    s   | j | | jr| j|i |}n6d|vsd|vrtd| jdkr-t| jfi | ntd| j| jd|  d 	 } fdd|D }|
 D ]\}}|| jvr`td	d
 dd| j|< | j| || qLdS )r    r!   r"   r#   r   r$   r   c                    s&   i | ]  t  fd dD qS )c                    s   g | ]}|  qS r   r   )rN   r`   r   r   r   rQ   O  r  z6MultiMetricStats.append.<locals>.<dictcomp>.<listcomp>)r2   r4   )rN   
scores_rawrE  r   r  N  s    z+MultiMetricStats.append.<locals>.<dictcomp>c                 S   s   | S r   r   )xr   r   r   <lambda>U  s    z)MultiMetricStats.append.<locals>.<lambda>T)r   Nr   )r   r%   r   eval_simpler'   r   r(   r   r)   r   r   rD  r   r,   )r   r   r*   r+   r   r   r   metric_scoresr   rF  r   r,   -  s.   


zMultiMetricStats.appendc                 O   s"   | j |i |}dd | D S )z3Evaluates the metric in a simple, sequential mannerc                 S   s   i | ]	\}}||  qS r   )r&   )rN   r   r`   r   r   r   r  [  s    z0MultiMetricStats.eval_simple.<locals>.<dictcomp>)r   r   )r   r*   r+   r   r   r   r   rJ  X  s   zMultiMetricStats.eval_simpleNc                    s2    fdd| j  D }|rdd | D }|S )a  Summarize the metric scores, returning relevant stats.

        Arguments
        ---------
        field : str
            If provided, only returns selected statistic. If not,
            returns all computed statistics.
        flat : bool
            whether to flatten the dictionary

        Returns
        -------
        dict
            Returns a dictionary of all computed stats
        c                    s   i | ]
\}}||  qS r   )r<   )rN   r   r   r9   r   r   r  m  s    z.MultiMetricStats.summarize.<locals>.<dictcomp>c                 S   s2   i | ]\}}|  D ]\}}| d | |q
qS )rb   )r   )rN   r   fieldsr9   valuer   r   r   r  q  s    )rD  r   )r   r9   flatr!  r   rL  r   r<   ]  s   
zMultiMetricStats.summarize)r   F)NF)rE   rF   rG   rH   r   r,   rJ  r<   r   r   r   r   rB    s    
B+rB  c                    s   d fdd}|S )a  A wrapper that converts functions returning
    namedtuples to functions returning dicts while leaving
    functions returning dicts intact

    Arguments
    ---------
    f : callable
        a function

    Returns
    -------
    result : callable
        a wrapped function
    Nc                     s0    | i |}du rt |dr| S |S )zThe wrapper functionN_asdict)hasattrrP  )r*   r+   r!  f
has_asdictr   r   wrapper  s   
z_dictify.<locals>.wrapperr   )rS  rU  r   rR  r   rC  y  s   rC  )NrI   r   )r   r   r   )"rH   typingr   r   r2   joblibr   r   speechbrain.dataio.dataior   r   r   speechbrain.dataio.werr	   r
   speechbrain.utils.data_utilsr   speechbrain.utils.edit_distancer   r   r   r   r   r)   r(   ra   ru   r   r   r   r   r   rB  rC  r   r   r   r   <module>   s8    
 

 0 2i <
H  9 