o
    wiWQ                     @   s  d dl Z d dl mZ d dlmZmZmZmZ d dlZd dl	Z
d dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ g d	Zd
ee dee dee fddZ	d5dedededededeeef dedeeeef eeef f fddZd6ddZ				d7deded eee  d!ed"ed#edeeeef  fd$d%Z d8d'd(Z!d)ee d*ee deeeef fd+d,Z"	-d9d)ee d*ee d.edeeeef fd/d0Z#d1eee  d2eee  deee ee ee f fd3d4Z$dS ):    N)permutations)DictListOptionalTuple)SegmentTimeline)DiarizationErrorRate)word_error_rate)linear_sum_assignment)logging)score_labelscalculate_session_cpWER"calculate_session_cpWER_bruteforceconcat_perm_word_error_ratepred_labels
ref_labelsreturnc           	      C   s   t |dkrg S t | dkrd}n	tdd | D }g }|D ]4}| \}}}t|t|}}||k rIt||}| d| d| }|| q||k rR|| q|S )a_  
    For evaluation of online diarization performance, generate partial reference labels
    from the last prediction time.

    Args:
        pred_labels (list[str]): list of partial prediction labels
        ref_labels (list[str]): list of full reference labels

    Returns:
        ref_labels_out (list[str]): list of partial reference labels
    r   c                 S   s   g | ]
}t | d  qS )   )floatsplit).0labels r   ]/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/asr/metrics/der.py
<listcomp>:   s    z*get_partial_ref_labels.<locals>.<listcomp> )lenmaxr   r   minappend)	r   r   last_pred_timeref_labels_outlabelstartendspeakerend_timer   r   r   get_partial_ref_labels%   s"   

r(      DERCERFAMISSdiar_eval_countder_stat_dictdecic                 C   s   t d|  |t d| |t d| |t d| |d}|d  | 7  < |d  |7  < t d|d  | ||d< t d|d  | ||d< t t|d |d ||d< t t|d	 |d
 ||d
< ||fS )a  
    For evaluation of online diarization performance, add cumulative, average, and maximum DER/CER.

    Args:
        DER (float): Diarization Error Rate from the start to the current point
        CER (float): Confusion Error Rate from the start to the current point
        FA (float): False Alarm from the start to the current point
        MISS (float): Miss rate from the start to the current point
        diar_eval_count (int): Number of evaluation sessions
        der_stat_dict (dict): Dictionary containing cumulative, average, and maximum DER/CER
        deci (int): Number of decimal places to round

    Returns:
        der_dict (dict): Dictionary containing DER, CER, FA, and MISS
        der_stat_dict (dict): Dictionary containing cumulative, average, and maximum DER/CER
    d   )r*   r+   r,   r-   cum_DERcum_CERavg_DERavg_CERr*   max_DERr+   max_CER)roundr   )r*   r+   r,   r-   r.   r/   r0   der_dictr   r   r   get_online_DER_statsJ   s   r:    c           	      C   s|   t |d}t| d*}| }|D ]}| }| \}}}}|tt|t| qW d   |S 1 s7w   Y  |S )z
    Generate pyannote timeline segments for uem file

     <UEM> file format
     UNIQ_SPEAKER_ID CHANNEL START_TIME END_TIME
    )urirN)r   open	readlinesstripr   addr   r   )	uem_file	uniq_nametimelineflinesline_
start_timer'   r   r   r   uem_timeline_from_filer   s   

rJ         ?Tall_referenceall_hypothesisall_uemcollarignore_overlapverbosec                 C   sX  d}t |t |kr"td| |d}i d}}	tt||D ]\}
\}}|\}}|\}}t | t | kr=|	d7 }	|ret | t | kretd|dd  dt |  d	t |   d}|durv|||||
 d
d n(| | dddur| | dd}t||d}||||d
d n|||d
d |	||||< q|	t | }t
|}|d dkrtd|d |d  }|d |d  }|d |d  }||||f}|rtdd tdd tdd tdd td|   td| d| d|dd|dd|dd|dd |dd |||fS |r*td! dS )"a  
    Calculate DER, CER, FA and MISS rate from hypotheses and references. Hypothesis results are
    coming from Pyannote-formatted speaker diarization results and References are coming from
    Pyannote-formatted RTTM data.

    Args:
        AUDIO_RTTM_MAP (dict): Dictionary containing information provided from manifestpath
        all_reference (list[uniq_name,Annotation]): reference annotations for score calculation
        all_hypothesis (list[uniq_name,Annotation]): hypothesis annotations for score calculation
        all_uem (list[list[float]]): List of UEM segments for each audio file. If UEM file is not provided,
                                     it will be read from manifestpath
        collar (float): Length of collar (in seconds) for diarization error rate calculation
        ignore_overlap (bool): If True, overlapping segments in reference and hypothesis will be ignored
        verbose (bool): If True, warning messages will be printed

    Returns:
        metric (pyannote.DiarizationErrorRate): Pyannote Diarization Error Rate metric object.
                                                This object contains detailed scores of each audiofile.
        mapping (dict): Mapping dict containing the mapping speaker label for each audio input
        itemized_errors (tuple): Tuple containing (DER, CER, FA, MISS) for each audio file.
            - DER: Diarization Error Rate, which is sum of all three errors, CER + FA + MISS.
            - CER: Confusion Error Rate, which is sum of all errors
            - FA: False Alarm Rate, which is the number of false alarm segments
            - MISS: Missed Detection Rate, which is the number of missed detection segments

    < Caveat >
    Unlike md-eval.pl, "no score" collar in pyannote.metrics is the maximum length of
    "no score" collar from left to right. Therefore, if 0.25s is applied for "no score"
    collar in md-eval.pl, 0.5s should be applied for pyannote.metrics.
    N   )rO   skip_overlapr   r   z!Wrong Spk. Count with uniq_id:...iz, Ref: z, Hyp: T)uemdetaileduem_filepath)rB   rC   )rU   totalz"Total evaluation time is 0. Abort.	confusionzfalse alarmzmissed detectionzdisplay.max_rowszdisplay.max_columnszdisplay.widthzdisplay.max_colwidth
zCumulative Results for collar z sec and ignore_overlap z	: 
| FA: z.4fz	 | MISS: z | CER: z | DER: z | Spk. Count Acc. z|Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate)r   r	   	enumeratezipr   r   infogetrJ   optimal_mappingabs
ValueErrorpd
set_optionreportwarning)AUDIO_RTTM_MAPrL   rM   rN   rO   rP   rQ   metricmapping_dictcorrect_spk_countidx	reference
hypothesisref_keyr   rH   
hyp_labelsuem_objrB   spk_count_accr*   r+   r,   r-   itemized_errorsr   r   r   r      sv   '



r   allc                 C   sr   g }|dkr
dg}n|dkrdg}n|dkrdg}n|dkr#g d}nt d	|D ]\}}t| ||||d
}q)|S )a  
    Evaluate with a selected diarization evaluation scheme

    AUDIO_RTTM_MAP (dict):
        Dictionary containing information provided from manifestpath
    all_reference (list[uniq_name,annotation]):
        reference annotations for score calculation
    all_hypothesis (list[uniq_name,annotation]):
        hypothesis annotations for score calculation
    diar_eval_mode (str):
        Diarization evaluation modes

        diar_eval_mode == "full":
            DIHARD challenge style evaluation, the most strict way of evaluating diarization
            (collar, ignore_overlap) = (0.0, False)
        diar_eval_mode == "fair":
            Evaluation setup used in VoxSRC challenge
            (collar, ignore_overlap) = (0.25, False)
        diar_eval_mode == "forgiving":
            Traditional evaluation setup
            (collar, ignore_overlap) = (0.25, True)
        diar_eval_mode == "all":
            Compute all three modes (default)
    fullg        FfairrK   F	forgivingrK   Trq   )rs   ru   rw   z7`diar_eval_mode` variable contains an unsupported value)re   rL   rM   rO   rP   )r`   r   )audio_rttm_map_dictrL   rM   diar_eval_modeeval_settingsrO   rP   
diar_scorer   r   r   evaluate_der   s&   
r|   spk_hypothesisspk_referencec                 C   s   g g }}g }t |D ]	\}}|| qd|}t| D ]}d|}	||	 t|	g|gd}
||
 qt|}|| }|| }|||fS )aM  
    Calculate cpWER with actual permutations in brute-force way when LSA algorithm cannot deliver the correct result.

    Args:
        spk_hypothesis (list):
            List containing the hypothesis transcript for each speaker. A list containing the sequence
            of words is assigned for each speaker.

            Example:
            >>> spk_hypothesis = ["hey how are you we that's nice", "i'm good yes hi is your sister"]

        spk_reference (list):
            List containing the reference transcript for each speaker. A list containing the sequence
            of words is assigned for each speaker.

            Example:
            >>> spk_reference = ["hi how are you well that's nice", "i'm good yeah how is your sister"]

    Returns:
        cpWER (float):
            cpWER value for the given session.
        min_perm_hyp_trans (str):
            Hypothesis transcript containing the permutation that minimizes WER. Words are separated by spaces.
        ref_trans (str):
            Reference transcript in an arbitrary permutation. Words are separated by spaces.
    r   
hypotheses
references)rZ   r    joinr   r
   npargmin)r}   r~   
p_wer_listpermed_hyp_listsref_word_listspk_id	word_list	ref_transhyp_word_list	hyp_transp_wer
argmin_idxmin_perm_hyp_transcpWERr   r   r   r     s   





r   Fuse_lsa_onlyc                    s    |g}t tj| }t t|}}|s#||k r#t |\}}}	nHg }
|D ]\}}t|g|gd}|
| q't|
	t t|g}t
|\}} fddt|D }d|}d|}	t|g|	gd}|||	fS )a  
    Calculate a session-level concatenated minimum-permutation word error rate (cpWER) value. cpWER is
    a scoring method that can evaluate speaker diarization and speech recognition performance at the same time.
    cpWER is calculated by going through the following steps.

    1. Concatenate all utterances of each speaker for both reference and hypothesis files.
    2. Compute the WER between the reference and all possible speaker permutations of the hypothesis.
    3. Pick the lowest WER among them (this is assumed to be the best permutation: `min_perm_hyp_trans`).

    cpWER was proposed in the following article:
        CHiME-6 Challenge: Tackling Multispeaker Speech Recognition for Unsegmented Recordings
        https://arxiv.org/pdf/2004.09249.pdf

    Implementation:
        - Brute force permutation method for calculating cpWER has a time complexity of `O(n!)`.
        - To reduce the computational burden, linear sum assignment (LSA) algorithm is applied
          (also known as Hungarian algorithm) to find the permutation that leads to the lowest WER.
        - In this implementation, instead of calculating all WER values for all permutation of hypotheses,
          we only calculate WER values of (estimated number of speakers) x (reference number of speakers)
          combinations with `O(n^2)`) time complexity and then select the permutation that yields the lowest
          WER based on LSA algorithm.
        - LSA algorithm has `O(n^3)` time complexity in the worst case.
        - We cannot use LSA algorithm to find the best permutation when there are more hypothesis speakers
          than reference speakers. In this case, we use the brute-force permutation method instead.

          Example:
              >>> transcript_A = ['a', 'b', 'c', 'd', 'e', 'f'] # 6 speakers
              >>> transcript_B = ['a c b d', 'e f'] # 2 speakers

              [case1] hypothesis is transcript_A, reference is transcript_B
              [case2] hypothesis is transcript_B, reference is transcript_A

              LSA algorithm based cpWER is:
                [case1] 4/6 (4 deletion)
                [case2] 2/6 (2 substitution)
              brute force permutation based cpWER is:
                [case1] 0
                [case2] 2/6 (2 substitution)

    Args:
        spk_hypothesis (list):
            List containing the hypothesis transcript for each speaker. A list containing the sequence
            of words is assigned for each speaker.

            Example:
            >>> spk_hypothesis = ["hey how are you we that's nice", "i'm good yes hi is your sister"]

        spk_reference (list):
            List containing the reference transcript for each speaker. A list containing the sequence
            of words is assigned for each speaker.

            Example:
            >>> spk_reference = ["hi how are you well that's nice", "i'm good yeah how is your sister"]

    Returns:
        cpWER (float):
            cpWER value for the given session.
        min_perm_hyp_trans (str):
            Hypothesis transcript containing the permutation that minimizes WER. Words are separated by spaces.
        ref_trans (str):
            Reference transcript in an arbitrary permutation. Words are separated by spaces.
    r   c                    s   g | ]} | qS r   r   )r   kr}   r   r   r     s    z+calculate_session_cpWER.<locals>.<listcomp>r   )list	itertoolsproductr   r   r
   r    torchtensorreshaper   r   argsortr   )r}   r~   r   hyp_ref_pair	all_pairsnum_hyp_spksnum_ref_spksr   r   r   lsa_wer_listspk_hyp_transspk_ref_transspk_wercost_werrow_hyp_indcol_ref_ind
hyp_permedr   r   r   r   G  s    B


r   spk_hypothesesspk_referencesc           
      C   s   t | t |krtdt |  dt | dg g g }}}t| |D ]\}}t||\}}}	|| || ||	 q$|||fS )a  
    Launcher function for `calculate_session_cpWER`. Calculate session-level cpWER and average cpWER.
    For detailed information about cpWER, see docstrings of `calculate_session_cpWER` function.

    As opposed to `cpWER`, `WER` is the regular WER value where the hypothesis transcript contains
    words in temporal order regardless of the speakers. `WER` value can be different from cpWER value,
    depending on the speaker diarization results.

    Args:
        spk_hypotheses (list):
            List containing the lists of speaker-separated hypothesis transcripts.
        spk_references (list):
            List containing the lists of speaker-separated reference transcripts.

    Returns:
        cpWER (float):
            List containing cpWER values for each session
        min_perm_hyp_trans (list):
            List containing transcripts that lead to the minimum WER in string format
        ref_trans (list):
            List containing concatenated reference transcripts
    zIn concatenated-minimum permutation word error rate calculation, hypotheses and reference lists must have the same number of elements. But got arguments:z and z correspondingly)r   r`   r[   r   r    )
r   r   cpWER_valueshyps_spkrefs_spkr}   r~   r   min_hypothesisconcat_referencer   r   r   r     s   


r   )r)   )r;   )NrK   TT)rq   )F)%r   r   typingr   r   r   r   numpyr   pandasra   r   pyannote.corer   r   pyannote.metrics.diarizationr	    nemo.collections.asr.metrics.werr
   3nemo.collections.asr.parts.utils.optimization_utilsr   
nemo.utilsr   __all__strr(   r   intr:   rJ   r   boolr   r|   r   r   r   r   r   r   r   <module>   s   ",


(


`(04
d

