o
    %ݫij                     @   s  d Z ddlZddlmZ dddddZd	ed
efddZe efdeeegef fddZ	efdeeegef fddZ
efdeeegef fddZdd Zdd Zdd Zdefdeeegef fddZddefdeeegef fddZd d! Zd"d# Zd*d%d&Zd+d(d)ZdS ),zVEdit distance and WER computation.

Authors
 * Aku Rouhe 2020
 * Salima Mdhaffar 2021
    N)Callable=IDS)eqinsdelsubabc                 C   s   | |kS N )r   r   r   r   S/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/utils/edit_distance.py_str_equals   s   r   equality_comparatorc                 C   s`   |t | ||d }|d dkrtd|d< |S t|d |d |d g}d	| |d  |d< |S )
a  Computes word error rate and the related counts for a batch.

    Can also be used to accumulate the counts over many batches, by passing
    the output back to the function in the call for the next batch.

    Arguments
    ---------
    refs : iterable
        Batch of reference sequences.
    hyps : iterable
        Batch of hypothesis sequences.
    stats : collections.Counter
        The running statistics.
        Pass the output of this function back as this parameter
        to accumulate the counts. It may be cleanest to initialize
        the stats yourself; then an empty collections.Counter() should
        be used.
    equality_comparator : Callable[[str, str], bool]
        The function used to check whether two words are equal.

    Returns
    -------
    collections.Counter
        The updated running statistics, with keys:

        * "WER" - word error rate
        * "insertions" - number of insertions
        * "deletions" - number of deletions
        * "substitutions" - number of substitutions
        * "num_ref_tokens" - number of reference tokens

    Example
    -------
    >>> import collections
    >>> batches = [[[[1,2,3],[4,5,6]], [[1,2,4],[5,6]]],
    ...             [[[7,8], [9]],     [[7,8],  [10]]]]
    >>> stats = collections.Counter()
    >>> for batch in batches:
    ...     refs, hyps = batch
    ...     stats = accumulatable_wer_stats(refs, hyps, stats)
    >>> print("%WER {WER:.2f}, {num_ref_tokens} ref tokens".format(**stats))
    %WER 33.33, 9 ref tokens
    r   num_ref_tokensr   nanWER
insertions	deletionssubstitutions      Y@)_batch_statsfloatsum)refshypsstatsr   updated_stats	num_editsr   r   r   accumulatable_wer_stats   s   1r"   c                 C   sj   t | t |krtdt }t| |D ]\}}t|||d}t|}||7 }|d  t |7  < q|S )a  Internal function which actually computes the counts.

    Used by accumulatable_wer_stats

    Arguments
    ---------
    refs : iterable
        Batch of reference sequences.
    hyps : iterable
        Batch of hypothesis sequences.
    equality_comparator : Callable[[str, str], bool]
        The function used to check whether two words are equal.

    Returns
    -------
    collections.Counter
        Edit statistics over the batch, with keys:

        * "insertions" - number of insertions
        * "deletions" - number of deletions
        * "substitutions" - number of substitutions
        * "num_ref_tokens" - number of reference tokens

    Example
    -------
    >>> from speechbrain.utils.edit_distance import _batch_stats
    >>> batch = [[[1,2,3],[4,5,6]], [[1,2,4],[5,6]]]
    >>> refs, hyps = batch
    >>> print(_batch_stats(refs, hyps))
    Counter({'num_ref_tokens': 6, 'substitutions': 1, 'deletions': 1})
    z=The reference and hypothesis batches are not of the same sizer   r   )len
ValueErrorcollectionsCounterzipop_table	count_ops)r   r   r   r   
ref_tokens
hyp_tokenstableeditsr   r   r   r   _   s   "r   c                    s  dd t t d D }dgt d  } fddt t| d D }t t| d D ]
}td || d< q-t t d D ]
}td |d |< q@td |d d< t| dd	D ]t\}}|d  d7  < t dd	D ]W\}}	||d  d }
|| d }|||	rdnd}||d  | }||
k r||k r|||< |rtd
 || |< qk||
k r|||< td || |< qk|
||< td || |< qk|dd |dd< qY|S )a5  Table of edit operations between a and b.

    Solves for the table of edit operations, which is mainly used to
    compute word error rate. The table is of size ``[|a|+1, |b|+1]``,
    and each point ``(i, j)`` in the table has an edit operation. The
    edit operations can be deterministically followed backwards to
    find the shortest edit path to from ``a[:i-1] to b[:j-1]``. Indexes
    of zero (``i=0`` or ``j=0``) correspond to an empty sequence.

    The algorithm itself is well known, see

    `Levenshtein distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_

    Note that in some cases there are multiple valid edit operation
    paths which lead to the same edit distance minimum.

    Arguments
    ---------
    a : iterable
        Sequence for which the edit operations are solved.
    b : iterable
        Sequence for which the edit operations are solved.
    equality_comparator : Callable[[str, str], bool]
        The function used to check whether two words are equal.

    Returns
    -------
    list
        List of lists, Matrix, Table of edit operations.

    Example
    -------
    >>> ref = [1,2,3]
    >>> hyp = [1,2,4]
    >>> for row in op_table(ref, hyp):
    ...     print(row)
    ['=', 'I', 'I', 'I']
    ['D', '=', 'I', 'I']
    ['D', 'D', '=', 'I']
    ['D', 'D', 'D', 'S']
    c                 S   s   g | ]}|qS r   r   .0jr   r   r   
<listcomp>   s    zop_table.<locals>.<listcomp>   r   c                    s&   g | ]}d d t t d D qS )c                 S   s   g | ]}t d  qS )r   )EDIT_SYMBOLSr.   r   r   r   r1      s    z'op_table.<locals>.<listcomp>.<listcomp>r2   )ranger#   )r/   ir   r   r   r1      s    r	   r   r   )startr
   N)r4   r#   r3   	enumerate)r   r   r   prev_rowcurr_rowr,   r5   r0   a_tokenb_tokeninsertion_costdeletion_costsubstitutionsubstitution_costr   r6   r   r(      s<   /
r(   c                 C   s^  g }t | d }t | d d }|dkr|dks|dkr,|d8 }|dtd d|f ny|dkr@|d8 }|dtd |df ne| | | td krZ|d8 }|dtd d|f nK| | | td krt|d8 }|dtd |df n1| | | td kr|d8 }|d8 }|dtd ||f n|d8 }|d8 }|dtd ||f |dkr|dkr|S )a0  Get the edit distance alignment from an edit op table.

    Walks back an edit operations table, produced by calling ``table(a, b)``,
    and collects the edit distance alignment of a to b. The alignment
    shows which token in a corresponds to which token in b. Note that the
    alignment is monotonic, one-to-zero-or-one.

    Arguments
    ---------
    table : list
        Edit operations table from ``op_table(a, b)``.

    Returns
    -------
    list
        Schema: ``[(str <edit-op>, int-or-None <i>, int-or-None <j>),]``
        List of edit operations, and the corresponding indices to a and b.
        See the EDIT_SYMBOLS dict for the edit-ops.
        The i indexes a, j indexes b, and the indices can be None, which means
        aligning to nothing.

    Example
    -------
    >>> # table for a=[1,2,3], b=[1,2,4]:
    >>> table = [['I', 'I', 'I', 'I'],
    ...          ['D', '=', 'I', 'I'],
    ...          ['D', 'D', '=', 'I'],
    ...          ['D', 'D', 'D', 'S']]
    >>> print(alignment(table))
    [('=', 0, 0), ('=', 1, 1), ('S', 2, 2)]
    r2   r   r   Nr	   r
   r   )r#   insertr3   )r,   	alignmentr5   r0   r   r   r   rB      s2   "rB   c                 C   s  t  }t| d }t| d d }|dkr|dks|dkr+|d  d7  < |d8 }nY|dkr<|d  d7  < |d8 }nH| | | td krS|d  d7  < |d8 }n1| | | td krj|d  d7  < |d8 }n| | | td kr||d  d7  < |d8 }|d8 }|dkr|dkr|S )	a  Count the edit operations in the shortest edit path in edit op table.

    Walks back an edit operations table produced by table(a, b) and
    counts the number of insertions, deletions, and substitutions in the
    shortest edit path. This information is typically used in speech
    recognition to report the number of different error types separately.

    Arguments
    ---------
    table : list
        Edit operations table from ``op_table(a, b)``.

    Returns
    -------
    collections.Counter
        The counts of the edit operations, with keys:

        * "insertions"
        * "deletions"
        * "substitutions"

        NOTE: not all of the keys might appear explicitly in the output,
        but for the missing keys collections. The counter will return 0.

    Example
    -------
    >>> table = [['I', 'I', 'I', 'I'],
    ...          ['D', '=', 'I', 'I'],
    ...          ['D', 'D', '=', 'I'],
    ...          ['D', 'D', 'D', 'S']]
    >>> print(count_ops(table))
    Counter({'substitutions': 1})
    r2   r   r   r   r   r	   r
   r   )r%   r&   r#   r3   )r,   r-   r5   r0   r   r   r   r)   -  s,   "



r)   c                 C   s   t t| |S r   )dictr'   )idsseqsr   r   r   _batch_to_dict_formati  s   rF   Fc                 C   s&   t | |}t | |}t|||d|dS )a&  Convenient batch interface for ``wer_details_by_utterance``.

    ``wer_details_by_utterance`` can handle missing hypotheses, but
    sometimes (e.g. CTC training with greedy decoding) they are not needed,
    and this is a convenient interface in that case.

    Arguments
    ---------
    ids : list, torch.tensor
        Utterance ids for the batch.
    refs : list, torch.tensor
        Reference sequences.
    hyps : list, torch.tensor
        Hypothesis sequences.
    compute_alignments : bool, optional
        Whether to compute alignments or not. If computed, the details
        will also store the refs and hyps. (default: False)
    equality_comparator : Callable[[str, str], bool]
        The function used to check whether two words are equal.

    Returns
    -------
    list
        See ``wer_details_by_utterance``

    Example
    -------
    >>> ids = [['utt1'], ['utt2']]
    >>> refs = [[['a','b','c']], [['d','e']]]
    >>> hyps = [[['a','b','d']], [['d','e']]]
    >>> wer_details = []
    >>> for ids_batch, refs_batch, hyps_batch in zip(ids, refs, hyps):
    ...     details = wer_details_for_batch(ids_batch, refs_batch, hyps_batch)
    ...     wer_details.extend(details)
    >>> print(wer_details[0]['key'], ":",
    ...     "{:.2f}".format(wer_details[0]['WER']))
    utt1 : 33.33
    strict)compute_alignmentsscoring_moder   )rF   wer_details_by_utterance)rD   r   r   rH   r   r   r   r   wer_details_for_batchn  s   
-
rK   rG   c                 C   s  g }|   D ]\}}|ddddt|ddddd|r|nddd}||v r0|ddi || }	n1|dkr>|ddi g }	n#|dkrO|ddi || q|dkr[td	| d
 td| t||	|d}
t|
}t|dksx|d dkr{d}nt|}|dt|	dkrdndt|	 |dt|	  t
d| |d |d |d |rt|
nd|r|nd|r|	ndd || q|S )a;	  Computes a wealth WER info about each single utterance.

    This info can then be used to compute summary details (WER, SER).

    Arguments
    ---------
    ref_dict : dict
        Should be indexable by utterance ids, and return the reference tokens
        for each utterance id as iterable
    hyp_dict : dict
        Should be indexable by utterance ids, and return
        the hypothesis tokens for each utterance id as iterable
    compute_alignments : bool
        Whether alignments should also be saved.
        This also saves the tokens themselves, as they are probably
        required for printing the alignments.
    scoring_mode : {'strict', 'all', 'present'}
        How to deal with missing hypotheses (reference utterance id
        not found in hyp_dict).

        * 'strict': Raise error for missing hypotheses.
        * 'all': Score missing hypotheses as empty.
        * 'present': Only score existing hypotheses.
    equality_comparator : Callable[[str, str], bool]
        The function used to check whether two words are equal.

    Returns
    -------
    list
        A list with one entry for every reference utterance. Each entry is a
        dict with keys:

        * "key": utterance id
        * "scored": (bool) Whether utterance was scored.
        * "hyp_absent": (bool) True if a hypothesis was NOT found.
        * "hyp_empty": (bool) True if hypothesis was considered empty
          (either because it was empty, or not found and mode 'all').
        * "num_edits": (int) Number of edits in total.
        * "num_ref_tokens": (int) Number of tokens in the reference.
        * "WER": (float) Word error rate of the utterance.
        * "insertions": (int) Number of insertions.
        * "deletions": (int) Number of deletions.
        * "substitutions": (int) Number of substitutions.
        * "alignment": If compute_alignments is True, alignment as list,
          see ``speechbrain.utils.edit_distance.alignment``.
          If compute_alignments is False, this is None.
        * "ref_tokens": (iterable) The reference tokens
          only saved if alignments were computed, else None.
        * "hyp_tokens": (iterable) the hypothesis tokens,
          only saved if alignments were computed, else None.

    Raises
    ------
    KeyError
        If scoring mode is 'strict' and a hypothesis is not found.
    FN)keyscored
hyp_absent	hyp_emptyr!   r   r   r   r   r   rB   r*   r+   rN   allTpresentrG   zKey z; in reference but missing in hypothesis and strict mode on.zInvalid scoring mode: r   r    r   r2   r   r   r   )rM   rO   r!   r   r   r   r   r   rB   r*   r+   )itemsr#   updateappendKeyErrorr$   r(   r)   r   valuesmaxrB   )ref_dicthyp_dictrH   rI   r   details_by_utterancerL   r*   utterance_detailsr+   r,   opsr   r   r   r   rJ     st   ?





rJ   c                 C   s   d } }}d } } } } }}	| D ]>}
|	d7 }	|
d rJ|d7 }||
d 7 }||
d 7 }||
d 7 }||
d 7 }||
d 7 }|
d dkrJ|d7 }|
d	 rR|d7 }q|dkr^d
| | }nd}|d
| | ||||||	|||d}|S )aD  
    Computes summary stats from the output of details_by_utterance

    Summary stats like WER

    Arguments
    ---------
    details_by_utterance : list
        See the output of wer_details_by_utterance

    Returns
    -------
    dict
        Dictionary with keys:

        * "WER": (float) Word Error Rate.
        * "SER": (float) Sentence Error Rate (percentage of utterances
          which had at least one error).
        * "num_edits": (int) Total number of edits.
        * "num_scored_tokens": (int) Total number of tokens in scored
          reference utterances (a missing hypothesis might still
          have been scored with 'all' scoring mode).
        * "num_erroneous_sents": (int) Total number of utterances
          which had at least one error.
        * "num_scored_sents": (int) Total number of utterances
          which were scored.
        * "num_absent_sents": (int) Hypotheses which were not found.
        * "num_ref_sents": (int) Number of all reference utterances.
        * "insertions": (int) Total number of insertions.
        * "deletions": (int) Total number of deletions.
        * "substitutions": (int) Total number of substitutions.

        NOTE: Some cases lead to ambiguity over number of
        insertions, deletions and substitutions. We
        aim to replicate Kaldi compute_wer numbers.
    r   r2   rM   r   r   r   r   r!   rN   r   g        )r   SERr!   num_scored_tokensnum_erroneous_sentsnum_scored_sentsnum_absent_sentsnum_ref_sentsr   r   r   r   )r[   r   delssubsr_   ra   r!   r`   rb   rc   detsr   wer_detailsr   r   r   wer_summary)  sF   &
rh   c                 C   s2  i }| D ]W}||d  }| |t|dddddddddd
}t }|d r/|ddi |d rV|d|d |d	 |d
 |d |d d |d dkrV|ddi || qg }| D ]4\}}||d< |d dkrd|d  |d  |d< d|d  |d  |d< nd|d< d|d< || qb|S )a  Compute word error rate and another salient info grouping by speakers.

    Arguments
    ---------
    details_by_utterance : list
        See the output of wer_details_by_utterance
    utt2spk : dict
        Map from utterance id to speaker id


    Returns
    -------
    dict
        Maps speaker id to a dictionary of the statistics, with keys:

        * "speaker": Speaker id,
        * "num_edits": (int) Number of edits in total by this speaker.
        * "insertions": (int) Number insertions by this speaker.
        * "dels": (int) Number of deletions by this speaker.
        * "subs": (int) Number of substitutions by this speaker.
        * "num_scored_tokens": (int) Number of scored reference
          tokens by this speaker (a missing hypothesis might still
          have been scored with 'all' scoring mode).
        * "num_scored_sents": (int) number of scored utterances
          by this speaker.
        * "num_erroneous_sents": (int) number of utterance with at least
          one error, by this speaker.
        * "num_absent_sents": (int) number of utterances for which no
          hypotheses was found, by this speaker.
        * "num_ref_sents": (int) number of utterances by this speaker
          in total.
    rL   r   )
speakerr   rd   re   r_   ra   r!   r`   rb   rc   rN   rb   r2   rM   r   r   r   r   r!   )ra   r_   r   rd   re   r!   r`   ri   ra   r   r_   r   r^   N)
setdefaultr%   r&   rT   rS   rU   )r[   utt2spkdetails_by_speakerrf   ri   spk_dets	utt_statsdetails_by_speaker_dictsr   r   r   wer_details_by_speakert  sf   "
rp      c                 C   s   dd | D }t |dd dd}g }g }|rTt||k s"t||k rT|d}|d r7t||k r7|| n|d sFt||k rF|| |rTt||k s"t||k s"||fS )	a  
    Finds the k utterances with highest word error rates.

    Useful for diagnostic purposes, to see where the system
    is making the most mistakes.
    Returns results utterances which were not empty
    i.e. had to have been present in the hypotheses, with output produced

    Arguments
    ---------
    details_by_utterance : list
        See output of wer_details_by_utterance.
    top_k : int
        Number of utterances to return.

    Returns
    -------
    list
        List of at most K utterances,
        with the highest word error rates, which were not empty.
        The utterance dict has the same keys as
        details_by_utterance.
    c                 S   s   g | ]}|d  r|qS )rM   r   r/   rf   r   r   r   r1     s
    z top_wer_utts.<locals>.<listcomp>c                 S      | d S Nr   r   dr   r   r   <lambda>      ztop_wer_utts.<locals>.<lambda>TrL   reverser   rO   )sortedr#   poprU   )r[   top_kscored_utterancesutts_by_wertop_non_empty	top_emptyuttr   r   r   top_wer_utts  s$   


r   
   c                 C   s<   dd | D }t |dd dd}t||kr|d| S |S )a  
    Finds the K speakers with the highest word error rates.

    Useful for diagnostic purposes.

    Arguments
    ---------
    details_by_speaker : list
        See output of wer_details_by_speaker.
    top_k : int
        Number of speakers to return.

    Returns
    -------
    list
        List of at most K dicts (with the same keys as details_by_speaker)
        of speakers sorted by WER.
    c                 S   s   g | ]
}|d  dkr|qS )ra   r   r   rr   r   r   r   r1     s    z top_wer_spks.<locals>.<listcomp>c                 S   rs   rt   r   ru   r   r   r   rw     rx   ztop_wer_spks.<locals>.<lambda>Try   N)r{   r#   )rl   r}   scored_speakersspks_by_werr   r   r   top_wer_spks  s   r   )rq   )r   )__doc__r%   typingr   r3   strr   r&   boolr"   r   r(   rB   r)   rF   rK   rJ   rh   rp   r   r   r   r   r   r   <module>   sR    
E
2
^?<	
;
 K
]+