o
    wi                     @   sf  d dl Z d dlZd dlmZmZmZmZmZmZm	Z	m
Z
 d dlZe	dZ	ddee dee dedeee
eef f fd	d
Z	ddee dee dededeeeef  f
ddZ			 d deee  deee  deeee   dededefddZdededeeef fddZ	d!deee  deee  deee  deeee  df fddZdZdS )"    N)DictIterableListOptionalSequenceTupleTypeVarUnionSymbolFrefhypsclite_modereturnc           	      C   s   t ttt| t|B }dd | D }g }g }| D ]	}|||  q|D ]	}|||  q)t|||}t||d< z|d t| |d< W |S  t	yj   |d dkrad|d< Y |S t
d|d< Y |S w )	at  
    Compute the edit distance between sequences ``ref`` and ``hyp``.
    Both sequences can be strings or lists of strings or ints.

    Optional ``sclite_mode`` sets INS/DEL/SUB costs to 3/3/4 for
    compatibility with sclite tool.

    Returns a dict with keys:
    * ``ins`` -- the number of insertions (in ``hyp`` vs ``ref``)
    * ``del`` -- the number of deletions (in ``hyp`` vs ``ref``)
    * ``sub`` -- the number of substitutions
    * ``total`` -- total number of errors
    * ``ref_len`` -- the number of symbols in ``ref``
    * ``err_rate`` -- the error rate  (total number of errors divided by ``ref_len``)
    c                 S      i | ]\}}||qS  r   .0kvr   r   P/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/kaldialign/__init__.py
<dictcomp>       z!edit_distance.<locals>.<dictcomp>ref_lentotalerr_rater   g        inf)dict	enumeratesortedsetitemsappend_kaldialignedit_distancelenZeroDivisionErrorfloat)	r   r   r   int2symsym2intrefihypisymansr   r   r   r#   
   s(   
r#   
eps_symbolc                 C   s   t ttt| t|B |hB }dd | D }g }g }| D ]	}|||  q |D ]	}|||  q,|| }	t|||	|}
g }tt	|
D ]}|||
| d  ||
| d  f qJ|S )a  
    Compute the alignment between sequences ``ref`` and ``hyp``.
    Both sequences can be strings or lists of strings or ints.

    ``eps_symbol`` is used as a blank symbol to indicate insertion or deletion.

    Optional ``sclite_mode`` sets INS/DEL/SUB costs to 3/3/4 for
    compatibility with sclite tool.

    Returns a list of pairs of alignment symbols. The presence of ``eps_symbol``
    in the first pair index indicates insertion, and in the second pair index, deletion.
    Mismatched symbols indicate substitution.
    c                 S   r   r   r   r   r   r   r   r   G   r   zalign.<locals>.<dictcomp>r      )
r   r   r   r   r    r!   r"   alignranger$   )r   r   r-   r   r'   r(   aibir+   eps_int	alignmentaliidxr   r   r   r/   3   s   "(r/   '  refshypshyps2replicationsseedc                 C   s.  ddl m}m}m} t|t| ks!J dt|  dt| d|dks)J d|dks1J dt| ts;t|tr?J dt| ||\} }}|| |}||||d	\}	}
t|	|
}|d
u ra|S t|t| ksxJ dt|  dt| d|| |}||||d	\}}|||||d	}|t|||dS )aq  
    Compute a boostrapping of WER to extract the 95% confidence interval (CI)
    using the bootstrap method of Bisani and Ney [1].
    The implementation is based on Kaldi's ``compute-wer-bootci`` script [2].

    Args:
        refs: A list of reference sequences (str, list[str], list[list[[int]])
        hyps: A list of hypothesis sequences from system1 (str, list[str], list[list[int]])
        hyps2: A list of hypothesis sequences from system2 (str, list[str], list[list[int]]).
            When provided, we'll compute CI for both systems as well as the probability
            of system2 improving over system1.
        replications: The number of replications to use for bootstrapping.
        seed: The random seed to reproduce the results.

    Returns:
        A dict with results. When scoring a single system (``hyp2_seqs=None``), the keys are:
            - "wer" (mean WER estimate),
            - "ci95" (95% confidence interval size),
            - "ci95min" (95% confidence interval lower bound)
            - "ci95max" (95% confidence interval upper bound)
        When scoring two systems, the keys are "system1", "system2", and "p_s2_improv_over_s1".
        The first two keys contain dicts as described for the single-system case, and the last key's
        value is a float in the range [0, 1].

    [1] Bisani, M., & Ney, H. (2004, May). Bootstrap estimates for confidence intervals in ASR performance evaluation.
        In 2004 IEEE International Conference on Acoustics, Speech, and Signal Processing (Vol. 1, pp. I-409). IEEE.

    [2] https://github.com/kaldi-asr/kaldi/blob/master/src/bin/compute-wer-bootci.cc
    r   )_get_boostrap_wer_interval
_get_edits_get_p_improvz"Inconsistent number of reference (z) and hypothesis (z) sequences.z2The number of replications must be greater than 0.zThe seed must be 0 or greater.z=The input must be a list of strings or list of lists of ints.)r;   r<   Nz.) sequences for the second system (hyp2_seqs).)system1system2p_s2_improv_over_s1)	r"   r=   r>   r?   r$   
isinstancestr_convert_to_int_build_results)r8   r9   r:   r;   r<   r=   r>   r?   edit_sym_per_hypmeanintervalans1edit_sym_per_hyp2mean2	interval2p_improvr   r   r   bootstrap_wer_ci\   sJ   $




rO   rH   rI   c                 C   s   | || | | | dS )N)werci95ci95minci95maxr   )rH   rI   r   r   r   rF      s
   rF   hyp2.c                    sz   | |g}|d ur| | ttdd |D }tt|}dd | D   fdd|D }|d u r9| d  t|S )Nc                 s   s(    | ]}|D ]
}|D ]}|V  q
qqd S Nr   )r   sourceseqsymbolr   r   r   	<genexpr>   s   & z"_convert_to_int.<locals>.<genexpr>c                 S   r   r   r   r   r   r   r   r      r   z#_convert_to_int.<locals>.<dictcomp>c                       g | ]} fd d|D qS )c                    rZ   )c                    s   g | ]} | qS r   r   )r   itemr(   r   r   
<listcomp>   s    z9_convert_to_int.<locals>.<listcomp>.<listcomp>.<listcomp>r   )r   rW   r\   r   r   r]          z._convert_to_int.<locals>.<listcomp>.<listcomp>r   )r   rV   r\   r   r   r]      r^   z#_convert_to_int.<locals>.<listcomp>)r!   r   r   r   r   r    tuple)r   r   rT   sourcessymbolsr'   intsr   r\   r   rE      s   

rE   z0.9.3)F)Nr7   r   rU   )mathrandomtypingr   r   r   r   r   r   r   r	   r"   r
   boolrD   intr&   r#   r/   rO   rF   rE   __version__r   r   r   r   <module>   sn    (
-
,


J



