o
    }oi                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZ d dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZ d d	lmZ zd dlZd
ZW n eyc   dZY nw dgZde de!fddZ"de de fddZ#de dee
e  e f fddZ$de
e	e e%f  dee
e  e f fddZ&	d1de
e	e e%f  de de'dee
e  e f fd d!Z(d"e	e e	e e%f f d#e	e e	e e%f f d$e
e  fd%d&Z)d'e d(e%d)e%d*e de f
d+d,Z*d-e
e  de'fd.d/Z+G d0d dZ,dS )2    N)OrderedDict)datetime)DictListTuple)concat_perm_word_error_rate)word_error_rate)ClusteringDiarizer)audio_rttm_mapget_uniqname_from_filepathlabels_to_rttmfilerttm_to_labelswrite_rttm2manifest)loggingTFOfflineDiarWithASR	file_pathsession_trans_dictc                 C   s@   t | d}tj||dd W d   dS 1 sw   Y  dS )a  
    Write a json file from the session_trans_dict dictionary.

    Args:
        file_path (str):
            Target filepath where json file is saved
        session_trans_dict (dict):
            Dictionary containing transcript, speaker labels and timestamps
    w   )indentN)openjsondump)r   r   outfile r   f/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/diarization_utils.pydump_json_to_file/   s   
"r   w_pathvalc                 C   s>   t | d}||d  W d   dS 1 sw   Y  dS )z
    Write a text file from the string input.

    Args:
        w_path (str):
            Target path for saving a file
        val (str):
            String variable to be written
    r   
N)r   write)r   r   outputr   r   r   	write_txt=   s   
"r"   ctm_file_pathreturnc                 C   s   g i }}t |  }|D ]"}| }|d }||vrg ||< || |d  ||d  qdd | D }d|}||fS )a  
    Convert ctm file into a list containing transcription (space seperated string) per each speaker.

    Args:
        ctm_file_path (str):
            Filepath to the reference CTM files.

    Returns:
        spk_reference (list):
            List containing the reference transcripts for each speaker.

            Example:
            >>> spk_reference = ["hi how are you well that's nice", "i'm good yeah how is your sister"]

        mix_reference (str):
            Reference transcript from CTM file. This transcript has word sequence in temporal order.

            Example:
            >>> mix_reference = "hi how are you i'm good well that's nice yeah how is your sister"
       r   c                 S      g | ]}d  |qS  join.0	word_listr   r   r   
<listcomp>i       z'convert_ctm_to_text.<locals>.<listcomp>r(   )r   	readlinessplitappendvaluesr*   )r#   mix_referenceper_spk_ref_trans_dictctm_contentctm_line	ctm_splitspkspk_referencer   r   r   convert_ctm_to_textK   s   

r;   word_dict_seq_listc                 C   sp   g i }}| D ]}|d }||vrg ||< ||  |d  | |d  qdd | D }d|}||fS )ac  
    Convert word_dict_seq_list into a list containing transcription (space seperated string) per each speaker.

    Args:
        word_dict_seq_list (list):
            List containing words and corresponding word timestamps in dictionary format.

            Example:
            >>> word_dict_seq_list =             >>> [{'word': 'right', 'start_time': 0.0, 'end_time': 0.04, 'speaker': 'speaker_0'},  
                 {'word': 'and', 'start_time': 0.64, 'end_time': 0.68, 'speaker': 'speaker_1'},
                   ...],
    
    Returns:
        spk_hypothesis (list):
            Dictionary containing the hypothesis transcript for each speaker. A list containing the sequence
            of words is assigned for each speaker.

            Example:
            >>> spk_hypothesis= ["hi how are you well that's nice", "i'm good yeah how is your sister"]

        mix_hypothesis (str):
            Hypothesis transcript from ASR output. This transcript has word sequence in temporal order.

            Example:
            >>> mix_hypothesis = "hi how are you i'm good well that's nice yeah how is your sister"
    speakerwordc                 S   r&   r'   r)   r+   r   r   r   r.      r/   z1convert_word_dict_seq_to_text.<locals>.<listcomp>r(   )r2   r3   r*   )r<   mix_hypothesisper_spk_hyp_trans_dict	word_dictr9   spk_hypothesisr   r   r   convert_word_dict_seq_to_textn   s   

rC   null   uniq_iddecimalsc                 C   st   g }d}| D ]1}|d }|d }t |d |d  |}|d }	| d| d| d| d|	 d| }
||
 q|S )a  
    Convert word_dict_seq_list into a list containing transcription in CTM format.

    Args:
        word_dict_seq_list (list):
            List containing words and corresponding word timestamps in dictionary format.

            Example:
            >>> word_dict_seq_list =             >>> [{'word': 'right', 'start_time': 0.0, 'end_time': 0.34, 'speaker': 'speaker_0'},  
                 {'word': 'and', 'start_time': 0.64, 'end_time': 0.81, 'speaker': 'speaker_1'},
                   ...],
    
    Returns:
        ctm_lines_list (list):
            List containing the hypothesis transcript in CTM format.

            Example:
            >>> ctm_lines_list= ["my_audio_01 speaker_0 0.0 0.34 right 0",
                                  my_audio_01 speaker_0 0.64 0.81 and 0",


    r   r=   
start_timeend_timer>   r(   )roundr2   )r<   rF   rG   	ctm_lines
confidencerA   r9   sttdurr>   ctm_line_strr   r   r   convert_word_dict_seq_to_ctm   s   &rP   der_resultswer_resultscsv_columnsc                 C   s~   i }|   D ]0}|dkrqdd |D ||< ||| d< || v r)|| | |  ||v r6|| ||  qt| }|S )a  
    Merge WER results and DER results into a single dictionary variable.

    Args:
        der_results (dict):
            Dictionary containing FA, MISS, CER and DER values for both aggregated amount and
            each session.
        wer_results (dict):
            Dictionary containing session-by-session WER and cpWER. `wer_results` only
            exists when CTM files are provided.

    Returns:
        total_result_dict (dict):
            Dictionary containing both DER and WER results. This dictionary contains unique-IDs of
            each session and `total` key that includes average (cp)WER and DER/CER/Miss/FA values.
    totalc                 S   s   i | ]}|d qS )-r   r,   xr   r   r   
<dictcomp>   s    z)get_total_result_dict.<locals>.<dictcomp>rF   )keysupdatelistr3   )rQ   rR   rS   total_result_dictrF   total_result_jsonsr   r   r   get_total_result_dict   s   r^   r>   stt_secend_secr=   c                 C   s(   | dd }| d| d| d|  S )aA  
    Get a string formatted line for Audacity label.

    Args:
        word (str):
            A decoded word
        stt_sec (float):
            Start timestamp of the word
        end_sec (float):
            End timestamp of the word

    Returns:
        speaker (str):
            Speaker label in string type
    _	z	[] )r1   )r>   r_   r`   r=   r9   r   r   r   get_audacity_label   s   re   labelsc                 C   s   dd | D }t t|S )at  
    Count the number of speakers in a segment label list.
    Args:
        labels (list):
            List containing segment start and end timestamp and speaker labels.

            Example:
            >>> labels = ["15.25 21.82 speaker_0", "21.18 29.51 speaker_1", ... ]

    Returns:
        n_spk (int):
            The number of speakers in the list `labels`

    c                 S   s   g | ]}| d d  qS )r(   rb   )r1   striprV   r   r   r   r.     s    z.get_num_of_spk_from_labels.<locals>.<listcomp>)lenset)rf   spk_setr   r   r   get_num_of_spk_from_labels   s   rk   c                   @   s2  e Zd ZdZdd Zedeeef fddZede	e fddZ
d	d
 Zdd ZdedefddZdd Zdeeeee	e f f fddZede	e dede	e fddZdeee	e f fddZdfddZe	dgd eeeeef f d!eeeeef f d"ed#edeeeeef f f
d$d%Z	&dhd'ed(ejd)edefd*d+Zd,e	e deee	e f deee	e	e  f fd-d.Zd/eee	e f d0eee	e f d1eee	e f deeeeef f fd2d3Z	4	5did6e	e d7e	e d8e	e	e  d9e	e	e  d#edeeeeef f fd:d;Zded7e	e d<e	eeef  deeeeef f fd=d>Zd?ed@edeeef fdAdBZdCe	e defdDdEZd<e	eeef  de	eeef  fdFdGZ e	4	4djd,e	e dHeeeeef f dIe	e dJe	e deeeeef f f
dKdLZ!edMe	e	e  de	e fdNdOZ"e	PdkdQeeeeef f dReeeeef f d"edSe	e dTef
dUdVZ#dldXedYedefdZd[Z$ded\eeeeef f d]e	e d^eeeeef f d_e	eeef  f
d`daZ%edQeeeeef f dReeeeef f fdbdcZ&d_e	eeef  fdddeZ'd4S )mr   a  
    A class designed for performing ASR and diarization together.

    Attributes:
        cfg_diarizer (OmegaConf):
            Hydra config for diarizer key
        params (OmegaConf):
            Parameters config in diarizer.asr
        ctc_decoder_params (OmegaConf)
            Hydra config for beam search decoder
        realigning_lm_params (OmegaConf):
            Hydra config for realigning language model
        manifest_filepath (str):
            Path to the input manifest path
        nonspeech_threshold (float):
            Threshold for VAD logits that are used for creating speech segments
        fix_word_ts_with_VAD (bool):
            Choose whether to fix word timestamps by using VAD results
        root_path (str):
            Path to the folder where diarization results are saved
        vad_threshold_for_word_ts (float):
            Threshold used for compensating word timestamps with VAD output
        max_word_ts_length_in_sec (float):
            Maximum limit for the duration of each word timestamp
        word_ts_anchor_offset (float):
            Offset for word timestamps from ASR decoders
        run_ASR:
            Placeholder variable for an ASR launcher function
        realigning_lm:
            Placeholder variable for a loaded ARPA Language model
        ctm_exists (bool):
            Boolean that indicates whether all files have the corresponding reference CTM file
        frame_VAD (dict):
            Dictionary containing frame-level VAD logits
        AUDIO_RTTM_MAP:
            Dictionary containing the input manifest information
        color_palette (dict):
            Dictionary containing the ANSI color escape codes for each speaker label (speaker index)
    c                 C   s   || _ |jj| _|jj| _|jj| _|j| _| jj	| _
| jj| _|j| _d| _d| _d| _d | _d | _d| _i | _|   |  | _|  | _d S )Ngffffff?g333333?g        F)cfg_diarizerasr
parametersparamsctc_decoder_parametersctc_decoder_paramsrealigning_lm_parametersrealigning_lm_paramsmanifest_filepathasr_based_vad_thresholdnonspeech_thresholdfix_word_ts_with_VADout_dir	root_pathvad_threshold_for_word_tsmax_word_ts_length_in_secword_ts_anchor_offsetrun_ASRrealigning_lm
ctm_exists	frame_VADmake_file_listsget_color_palettecolor_paletteget_csv_columnsrS   )selfrl   r   r   r   __init__/  s$   





zOfflineDiarWithASR.__init__r$   c                   C   s   dddddddddd	d
dS )Nz[1;32mz[1;34mz[1;30mz[1;31mz[1;35mz[1;36mz[1;37mz[1;33mz[0;34m[0;37m)	speaker_0	speaker_1	speaker_2	speaker_3	speaker_4	speaker_5	speaker_6	speaker_7	speaker_8	speaker_9whiter   r   r   r   r   r   F  s   z$OfflineDiarWithASR.get_color_palettec                   C   s   g dS )N)
rF   DERCERFAMISS	est_n_spk	ref_n_spkcpWERWERmappingr   r   r   r   r   r   V  s   z"OfflineDiarWithASR.get_csv_columnsc                 C   s   t | j| _dd | j D | _g | _t| jD ],\}}t|}d| j| v rE| j| d durE|| j| d v rE| j| j| d  qt	| jt	| jkrUd| _
dS dS )zU
        Create lists containing the filepaths of audio clips and CTM files.
        c                 S   s   g | ]\}}|d  qS )audio_filepathr   )r,   ra   valuer   r   r   r.   j  s    z6OfflineDiarWithASR.make_file_lists.<locals>.<listcomp>ctm_filepathNT)r
   rt   AUDIO_RTTM_MAPitemsaudio_file_listctm_file_list	enumerater   r2   rh   r   )r   kaudio_file_pathrF   r   r   r   r   e  s   
z"OfflineDiarWithASR.make_file_listsc                 C   sJ   | j d | j d f| _ddg| _td| j d   t| j d d S )zS
        Load ARPA language model for realigning speaker labels for words.
        min_number_of_wordsmax_number_of_wordsz</s>z<s>zLoading LM for realigning: arpa_language_modelr   )rs   N_rangestt_end_tokensr   infoarpaloadfr   r   r   r   _load_realigning_LMz  s   
z&OfflineDiarWithASR._load_realigning_LMrF   n_spkc                 C   s   t d|d|g g dS )z
        Initialize json (in dictionary variable) formats for session level result and Gecko style json.

        Returns:
            (dict): Session level result dictionary variable
        initialized )status
session_idtranscriptionspeaker_countwords	sentencesod)r   rF   r   r   r   r   _init_session_trans_dict  s   z+OfflineDiarWithASR._init_session_trans_dictc                 C   s   t dg dS )z
        Initialize a dictionary format for Gecko style json.

        Returns:
            (dict):
                Gecko style json dictionary.
        g       @)schemaVersion
monologuesr   r   r   r   r   _init_session_gecko_dict  s   z+OfflineDiarWithASR._init_session_gecko_dictword_ts_dictc           	      C   s   i | _ t| D ]6\}\}}| || j}| |}tj| j	d}tj
|s.t| t|||}| j| |d| j |< q	dS )z
        Take the non_speech labels from logit output. The logit output is obtained from
        `run_ASR` function.

        Args:
            word_ts_dict (dict):
                Dictionary containing word timestamps.
        
pred_rttms)r   rttm_filepathN)VAD_RTTM_MAPr   r   )get_speech_labels_from_decoded_predictionrv   get_str_speech_labelsospathr*   ry   existsmakedirsr   r   )	r   r   idxrF   word_timestampsspeech_labels_floatspeech_labelsoutput_pathfilenamer   r   r   _save_VAD_labels_list  s   	

z(OfflineDiarWithASR._save_VAD_labels_listinput_word_tsrv   c                 C   s   g }t | }|g kr|S t|d }|dkrOt|dkrG|| d ||d  d  |krG||}||d }||d |d |d g |d8 }|dks|S )a>  
        Extract speech labels from the ASR output (decoded predictions)

        Args:
            input_word_ts (list):
                List containing word timestamps.

        Returns:
            word_ts (list):
                The ranges of the speech segments, which are merged ranges of input_word_ts.
        r%   r   )copydeepcopyrh   popinsert)r   rv   r   word_tscounttrangeBtrangeAr   r   r   r     s   
 
z<OfflineDiarWithASR.get_speech_labels_from_decoded_predictionc                 C   s   |j jjjr#| | tj| jd}t	| j
|}d|j j_||j j_t|d}| }|j jjdurC|j jsC| j|j|j jjjd i }t| jD ]\}}t|}	tj| jd|	d }
t|
||	< qJ||fS )a  
        Launch the diarization process using the given VAD timestamp (oracle_manifest).

        Args:
            diar_model_config (OmegaConf):
                Hydra configurations for speaker diarization
            word_and_timestamps (list):
                List containing words and word timestamps

        Returns:
            diar_hyp (dict):
                A dictionary containing rttm results which are indexed by a unique ID.
            score Tuple[pyannote object, dict]:
                A tuple containing pyannote metric instance and mapping dictionary between
                speakers in hypotheses and speakers in reference RTTM files.
        zasr_vad_manifest.jsonN)cfg)vad_processing_dirsmoothing_typer   .rttm)diarizerrm   rn   asr_based_vadr   r   r   r*   ry   r   r   vad
model_pathexternal_vad_manifestr	   diarize
oracle_vad_get_frame_level_VADvad_pred_dir	smoothingr   r   r   r   )r   diar_model_configr   oracle_manifest
diar_modelscorediar_hypr   r   rF   	pred_rttmr   r   r   run_diarization  s&   




z"OfflineDiarWithASR.run_diarizationFc           	   	   C   s   t |tr
|s
d}n|}| jD ]9}tj||d | }g }t|d}| D ]}|t	|
  q(W d   n1 s>w   Y  || j|< qdS )a?  
        Read frame-level VAD outputs.

        Args:
            vad_processing_dir (str):
                Path to the directory where the VAD results are saved.
            smoothing_type (bool or str): [False, median, mean]
                type of smoothing applied softmax logits to smooth the predictions.
        frame.rN)
isinstanceboolr   r   r   r*   r   r0   r2   floatrg   r   )	r   r   r   ext_typerF   	frame_vadframe_vad_float_listfpliner   r   r   r     s   

z'OfflineDiarWithASR._get_frame_level_VADr   audio_rttm_map_dicttrans_info_dictry   rG   c              	   C   sz  | \}}}|j }i }	d}
|D ]}|\}}d|| v r!|| d }n
tj|d|d }t|}|| d }t|}t|}t|}|d |d  |d  |d	  |d |d	  |d |d	  |d |d	  f\}}}}t||t||t||t|||||| d
|	|< |
t||k7 }
qt||d |d	  |d |d	  |d |d	  f\}}}}|||||
t	|j  d|	d	< |	S )a!  
        Gather diarization evaluation results from pyannote DiarizationErrorRate metric object.

        Args:
            metric (DiarizationErrorRate metric):
                DiarizationErrorRate metric pyannote object
            trans_info_dict (dict):
                Dictionary containing word timestamps, speaker labels and words from all sessions.
                Each session is indexed by unique ID as a key.
            mapping_dict (dict):
                Dictionary containing speaker mapping labels for each audio file with key as unique name
            decimals (int):
                The number of rounding decimals for DER value

        Returns:
            der_results (dict):
                Dictionary containing scores for each audio file along with aggregated results
        r   hyp_rttm_filepathr   r   r   	confusionzfalse alarmzmissed detectionrT   )r   r   r   r   r   r   r   )r   r   r   r   spk_counting_acc)
results_r   r   r*   r   rk   rJ   intabsrh   )
diar_scorer   r   ry   rG   metricmapping_dictra   resultsrQ   count_correct_spk_countingresultkeyr   r   pred_labelsref_rttm
ref_labelsr   r   _DER_CER_FA_MISSr   r   r   r   r   r   r   gather_eval_results  sP   

	
z&OfflineDiarWithASR.gather_eval_results
   vad_index_word_end
vad_framesoffsetc                 C   sx   || }t d| j | }|t|k r*|| | jk rn|d7 }||kr$n|t|k stt|d |}t|d d}|S )aN  
        Find the closest silence frame from the given starting position.

        Args:
            vad_index_word_end (float):
                The timestamp of the end of the current word.
            vad_frames (numpy.array):
                The numpy array containing  frame-level VAD probability.
            params (dict):
                Contains the parameters for diarization and ASR decoding.

        Returns:
            cursor (float):
                A timestamp of the earliest start of a silence region from
                the given time point, vad_index_word_end.
        d   r%   g      Y@   )r   r{   rh   rz   minrJ   )r   r  r  r  cursorlimitr   r   r   _get_the_closest_silence_starta  s   z1OfflineDiarWithASR._get_the_closest_silence_startr   c                 C   s  i }t | D ]\}\}}t|}g }t |D ]o\}	}
|	|d k r|t|
d |
d  d}t||	d  d |
d  d d}|| jv r^td|
d  }| || j| }t||
d  d}n|}t||}tt| j	||}|
|
d |
d | g q|
|
d |
d g q|||< q|S )a  
        Compensate the word timestamps based on the VAD output.
        The length of each word is capped by self.max_word_ts_length_in_sec.

        Args:
            audio_file_list (list):
                List containing audio file paths.
            word_ts_dict (dict):
                Dictionary containing timestamps of words.

        Returns:
            enhanced_word_ts_dict (dict):
                Dictionary containing the enhanced word timestamp values indexed by unique-IDs.
        r%   r   r  g{Gz?r  )r   r   rh   rJ   r   r   r  r  maxr{   r2   )r   r   r   enhanced_word_ts_dictr   rF   word_ts_seq_listNenhanced_word_ts_bufferr   r   word_lenlen_to_next_wordr  closest_sil_sttvad_est_lenmin_candidatefixed_word_lenr   r   r   _compensate_word_ts_list  s*   "



z+OfflineDiarWithASR._compensate_word_ts_listr   word_hypword_ts_hypc                 C   s   i }| j r| ji krtd | | j|}n|}| jd r)ts$td| 	 | _
g }t| jD ]4\}}t|}	||	 ||	 }
}||	 ||	 }}| j|
|||d}| j
r[| |}| |	||||	< q0td| j d |S )a[  
        Match the diarization result with the ASR output.
        The words and the timestamps for the corresponding words are matched in a for loop.

        Args:
            diar_hyp (dict):
                Dictionary of the Diarization output labels in str. Indexed by unique IDs.

                Example:
                >>>  diar_hyp['my_audio_01'] = ['0.0 4.375 speaker_1', '4.375 5.125 speaker_0', ...]

            word_hyp (dict):
                Dictionary of words from ASR inference. Indexed by unique IDs.

                Example:
                >>> word_hyp['my_audio_01'] = ['hi', 'how', 'are', ...]

            word_ts_hyp (dict):
                Dictionary containing the start time and the end time of each word.
                Indexed by unique IDs.

                Example:
                >>> word_ts_hyp['my_audio_01'] = [[0.0, 0.04], [0.64, 0.68], [0.84, 0.88], ...]

        Returns:
            trans_info_dict (dict):
                Dictionary containing word timestamps, speaker labels and words from all sessions.
                Each session is indexed by a unique ID.
        zkVAD timestamps are not provided. Fixing word timestamps without VAD. Please check the hydra configurations.r   zbLM for realigning is provided but arpa is not installed. Install arpa using PyPI: pip install arpa)r   r   word_rfnd_tsdiar_labelsz0Diarization with ASR output files are saved in: /pred_rttms)rw   r   r   warningr%  r   rs   ARPAImportErrorr   r~   r   r   get_word_level_json_listrealign_words_with_lm_make_json_outputr   ry   )r   r   r&  r'  r   word_ts_refinedr<   r   r   rF   r   r)  r   r(  r   r   r   "get_transcript_with_speaker_labels  s6    



z5OfflineDiarWithASR.get_transcript_with_speaker_labelsNr  r   r)  r   r(  c                 C   s   |du r|}|d   \}}}d\}	}
g }tt|||D ]@\}\}}}| |}	|	t|krE|
d7 }
t|
t|d }
||
   \}}}t|d |}t|d |}|||||d q|S )a  
        Assign speaker labels to each word and save the hypothesis words and speaker labels to
        a dictionary variable for future use.

        Args:
            uniq_id (str):
                A unique ID (key) that identifies each input audio file.
            diar_labels (list):
                List containing the Diarization output labels in str. Indexed by unique IDs.

                Example:
                >>>  diar_labels = ['0.0 4.375 speaker_1', '4.375 5.125 speaker_0', ...]

            words (list):
                Dictionary of words from ASR inference. Indexed by unique IDs.

                Example:
                >>> words = ['hi', 'how', 'are', ...]

            word_ts (list):
                Dictionary containing the start time and the end time of each word.
                Indexed by unique IDs.

                Example:
                >>> word_ts = [[0.0, 0.04], [0.64, 0.68], [0.84, 0.88], ...]
            
            word_ts_refined (list):
                Dictionary containing the refined (end point fixed) word timestamps based on hypothesis
                word timestamps. Indexed by unique IDs.

                Example:
                >>> word_rfnd_ts = [[0.0, 0.60], [0.64, 0.80], [0.84, 0.92], ...]

        Returns:
            word_dict_seq_list (list):
                List containing word by word dictionary containing word, timestamps and speaker labels.

                Example:
                >>> [{'word': 'right', 'start_time': 0.0, 'end_time': 0.04, 'speaker': 'speaker_0'},  
                     {'word': 'and', 'start_time': 0.64, 'end_time': 0.68, 'speaker': 'speaker_1'},  
                     {'word': 'i', 'start_time': 0.84, 'end_time': 0.88, 'speaker': 'speaker_1'},  
                     ...]
        Nr   )r   r   r%   )r>   rH   rI   r=   )	r1   r   zip_get_word_timestamp_anchorr   r  rh   rJ   r2   )r   r   r)  r   r(  rG   start_point	end_pointr=   word_posturn_idxr<   word_idxr>   word_ts_stt_endrefined_word_ts_stt_endr_   r`   r   r   r   r.    s   3
z+OfflineDiarWithASR.get_word_level_json_listr<   c                 C   s  g g }}|d   \}}}|}	g g }
}|||dd}t|}td| d| d | j||d}|  }t|D ]t\}}|d |d	 }}|| |d
 |d }}||	krt|dkro|d d|	d|d g }|d 	 |d< |
| |||dd}n||d< ||}}||||dd |d  |	 d 7  < |t
|||| |}	q;||d< |d 	 |d< |
| |d d|d|d d||d< |
|d< | |||||
 |S )a<	  
        Generate json output files and transcripts from the ASR and diarization results.

        Args:
            uniq_id (str):
                A unique ID (key) that identifies each input audio file.
            diar_labels (list):
                List containing the diarization hypothesis timestamps

                Example:
                >>>  diar_hyp['my_audio_01'] = ['0.0 4.375 speaker_1', '4.375 5.125 speaker_0', ...]

            word_dict_seq_list (list):
                List containing words and corresponding word timestamps in dictionary format.

                Example:
                >>> [{'word': 'right', 'start_time': 0.0, 'end_time': 0.04, 'speaker': 'speaker_0'},  
                     {'word': 'and', 'start_time': 0.64, 'end_time': 0.68, 'speaker': 'speaker_1'},  
                     {'word': 'i', 'start_time': 0.84, 'end_time': 0.88, 'speaker': 'speaker_1'},  
                     ...]

        Returns:
            session_result_dict (dict):
                A dictionary containing overall results of diarization and ASR inference.
                `session_result_dict` has following keys: `status`, `session_id`, `transcription`, `speaker_count`,
                `words`, `sentences`.

                Example:
                >>> session_trans_dict =                     {
                        'status': 'Success',
                        'session_id': 'my_audio_01',
                        'transcription': 'right and i really think ...',
                        'speaker_count': 2,
                        'words': [{'word': 'right', 'start_time': 0.0, 'end_time': 0.04, 'speaker': 'speaker_0'},  
                                  {'word': 'and', 'start_time': 0.64, 'end_time': 0.68, 'speaker': 'speaker_1'},  
                                  {'word': 'i', 'start_time': 0.84, 'end_time': 0.88, 'speaker': 'speaker_1'},  
                                  ...
                                  ]
                        'sentences': [{'sentence': 'right',  'start_time': 0.0, 'end_time': 0.04, 'speaker': 'speaker_0'},
                                      {'sentence': 'and i really think ...', 
                                       'start_time': 0.92, 'end_time': 4.12, 'speaker': 'speaker_0'},
                                      ...
                                      ]
                    }
        r   r   )r=   rH   rI   textzCreating results for Session: z n_spk: r(   )rF   r   r>   r=   rH   rI   r   N)nameid)r=   termsr<  WORD)startendr<  typer   r   r   )r1   rk   r   r   r   r   r   r2   rh   rg   re   r*   _write_and_log)r   rF   r)  r<   word_seq_listaudacity_label_wordsr5  r6  r=   prev_speakerr   
terms_listsentencer   r   
gecko_dictr   rA   r>   r_   r`   r   r   r   r0  3  sH   
1




z$OfflineDiarWithASR._make_json_outputr   word_seq_lenc                 C   s   || j d k rt|| j d }t|| | j d }||fS ||| j d  kr<t|| j d }t|| | j d }||fS | j d | j d }}||fS )a  
        Calculate word ranges for realignment operation.
        N1, N2 are calculated to not exceed the start and end of the input word sequence.

        Args:
            k (int):
                Index of the current word
            word_seq_len (int):
                Length of the sentence

        Returns:
            N1 (int):
                Start index of the word sequence
            N2 (int):
                End index of the word sequence
        r%   r   )r   r  r  )r   r   rK  N1N2r   r   r   _get_realignment_ranges  s   z*OfflineDiarWithASR._get_realignment_rangesr:  c                 C   s   | j d dkr|d }n.| j d dkr|d }n"| j d dkr*|d |d  d }ntd| j d	  d
 |d }|| j }|S )a  
        Determine a reference point to match a word with the diarization results.
        word_ts_anchor_pos determines the position of a word in relation to the given diarization labels:
            - 'start' uses the beginning of the word
            - 'end' uses the end of the word
            - 'mid' uses the mean of start and end of the word

        word_ts_anchor_offset determines how much offset we want to add to the anchor position.
        It is recommended to use the default value.

        Args:
            word_ts_stt_end (list):
                List containing start and end of the decoded word.

        Returns:
            word_pos (float):
                Floating point number that indicates temporal location of the word.
        word_ts_anchor_posrA  r   rB  r%   midr  zword_ts_anchor_pos: word_ts_anchorz= is not a supported option. Using the default 'start' option.)ro   r   r   r|   )r   r:  r7  r   r   r   r4    s   


z-OfflineDiarWithASR._get_word_timestamp_anchorc              
   C   s  t |}g g }}t|D ]\}}|d |d }}|| || qg }	t|}
t|D ]\}}| jd |  k rF|| jd  k rn n|| |
|d  ks\|| |
|d  kr| ||\}}| jd	||| | | j
 ||||   }| jd	||| |d  | j
 ||d ||   }||g}t|ddd }||d  ||d  | jd  kr|d dkr|
|d  ||< || |d< |	| q0|	S )	a$  
        Realign the mapping between speaker labels and words using a language model.
        The realigning process calculates the probability of the certain range around the words,
        especially at the boundary between two hypothetical sentences spoken by different speakers.

        Example:
            k-th word: "but"

            hyp_former:
                since i think like tuesday </s> <s>  but he's coming back to albuquerque
            hyp_latter:
                since i think like tuesday but </s> <s>  he's coming back to albuquerque

        The joint probabilities of words in the sentence are computed for these two hypotheses. In addition,
        logprob_diff_threshold parameter is used for reducing the false positive realigning.

        Args:
            word_dict_seq_list (list):
                List containing words and corresponding word timestamps in dictionary format.

        Returns:
            realigned_list (list):
                List of dictionaries containing words, word timestamps and speaker labels.
        r>   r=   r   r%   r(   Nrb   logprob_diff_threshold)rh   r   r2   r   r   r   rN  r~   log_sr*   r   npargsortrs   )r   r<   rK  hyp_w_dict_listspk_listr   	line_dictr>   	spk_labelrealigned_listorg_spk_listrL  rM  
hyp_former
hyp_latterlog_pp_orderr   r   r   r/    s4   


((*2"z(OfflineDiarWithASR.realign_words_with_lmhyp_trans_info_dicthyp_ctm_file_listref_ctm_file_listc                 C   s  i }|durg g }}g g }}g g }	}
t t| |D ]m\}\}}t|}|
| |t|kr5td|durN|t|| krJt|| \}}ntd|dura||v rat|| d \}}ntdt|\}}|| || || || |	t|g|g qt||\}}}i |d< t||d|d d< t||d|d d	< t|
||	D ]\}}}i ||< ||| d
< ||| d< q|S )a  
        Evaluate the result transcripts based on the provided CTM file. WER and cpWER are calculated to assess
        the performance of ASR system and diarization at the same time.

        Args:
            audio_file_list (list):
                List containing file path to the input audio files.
            hyp_trans_info_dict (dict):
                Dictionary containing the hypothesis transcriptions for all sessions.
            hyp_ctm_file_list (list):
                List containing file paths of the hypothesis transcriptions in CTM format for all sessions.
            ref_ctm_file_list (list):
                List containing file paths of the reference transcriptions in CTM format for all sessions.

            Note: Either `hyp_trans_info_dict` or `hyp_ctm_file_list` should be provided.

        Returns:
            wer_results (dict):
                Session-by-session results including DER, miss rate, false alarm rate, WER and cpWER
        Nz:audio_file_list has mismatch in uniq_id with ctm_file_pathz;Hypothesis CTM files are provided but uniq_id is mismatchedr   z=Hypothesis information is not provided in the correct format.rT   )
hypotheses
referencesaverage_cpWERaverage_WERr   r   )	r   r3  r   r2   
ValueErrorr;   rC   r   r   )r   r`  ra  rb  rR   spk_hypothesesspk_referencesmix_hypothesesmix_references
WER_valuesuniq_id_listr   r   r#   rF   rB   r?   r:   r4   cpWER_valueshyps_spkrefs_spkr   r   r   r   r   evaluate  sD   









zOfflineDiarWithASR.evaluater   c                 C   s(   g }| D ]\}}| d|| q|S )a  
        Convert floating point speech labels list to a list containing string values.

        Args:
            speech_labels_float (list):
                List containing start and end timestamps of the speech segments in floating point type
            speech_labels (list):
                List containing start and end timestamps of the speech segments in string format
        z{:.3f} {:.3f} speech)r2   format)r   r   rA  rB  r   r   r   r   _  s   z(OfflineDiarWithASR.get_str_speech_labelsctm_eval.csvrQ   rR   rS   csv_file_namec           
      C   s   | d}t j|dd td| d|  t| ||}z4t| d| d}tj||d}|  |D ]}	|	|	 q6W d   W dS 1 sJw   Y  W dS  t
y`   td	 Y dS w )
aQ  
        This function is for development use when a CTM file is provided.
        Saves the session-level diarization and ASR result into a csv file.

        Args:
            wer_results (dict):
                Dictionary containing session-by-session results of ASR and diarization in terms of
                WER and cpWER.
        r*  T)exist_okzWriting /r   )
fieldnamesNz0I/O error has occurred while writing a csv file.)r   r   r   r   r^   r   csv
DictWriterwriteheaderwriterowIOError)
rQ   rR   ry   rS   rt  target_pathr]   csvfilewriterdatar   r   r   !write_session_level_result_in_csvo  s   
&z4OfflineDiarWithASR.write_session_level_result_in_csvZ   
string_outmax_chars_in_linec                 C   s   | j d r	tdnd}|d}g }|D ]G}g }t|| |krV|dkr*|d| nd}t|t||D ]}	||	|	|  }
t|
 dkrO||
 }|| q4|| q|| qd|}|S )a[  
        Break the lines in the transcript.

        Args:
            string_out (str):
                Input transcript with speaker labels
            max_chars_in_line (int):
                Maximum characters in each line

        Returns:
            return_string_out (str):
                String variable containing line breaking
        colored_textz[1;00mr   r   Nr   )ro   rh   r1   rangerg   r2   extendr*   )r   r  r  color_str_lensplit_string_outreturn_string_out	org_chunkbuffer	color_stri	trans_strc_trans_strr   r   r   _break_lines  s"   


zOfflineDiarWithASR._break_linesr   rF  rJ  r   c                 C   s   |  |}| jd r| |}d|d< t|d }t| j d| d| t| j d| d| t| j d| dd	| t| j d| d
|  t| j d| dd	| dS )ad  
        Write output files and display logging messages.

        Args:
            uniq_id (str):
                A unique ID (key) that identifies each input audio file
            session_trans_dict (dict):
                Dictionary containing the transcription output for a session
            audacity_label_words (list):
                List containing word and word timestamp information in Audacity label format
            gecko_dict (dict):
                Dictionary formatted to be opened in  Gecko software
            sentences (list):
                List containing sentence dictionary
        break_linessuccessr   r   z/pred_rttms/z.jsonz_gecko.jsonz.ctmr   z.txtz.w.labelN)	print_sentencesro   r  rP   r   ry   r"   r*   rg   )r   rF   r   rF  rJ  r   r  ctm_lines_listr   r   r   rD    s   


"z!OfflineDiarWithASR._write_and_logc                 C   s   d| d d dd| d d dd| d d dd	| d d
 dd| d d d
}|durMt |dkrMt|d|d d dd|d d d  dS t| dS )a  
        Print a slew of error metrics for ASR and Diarization.

        Args:
            der_results (dict):
                Dictionary containing FA, MISS, CER and DER values for both aggregated amount and
                each session.
            wer_results (dict):
                Dictionary containing session-by-session WER and cpWER. `wer_results` only
                exists when CTM files are provided.
        z
DER                : rT   r   z.4fz,                      
FA                 : r   z,                      
MISS               : r   z,                      
CER                : r   z,                      
Spk. counting acc. : r   Nr   z
cpWER              : re  z,                      
WER                : rf  )rh   r   r   )rQ   rR   DER_infor   r   r   print_errors  s$   




zOfflineDiarWithASR.print_errorsc                 C   s   d}|D ]x}|d }|d }|d }|d }| j d r#| j|d}nd}d}	t|d	kr0d
}
nd}
tt|dtt|d}}t||	 |
dd }t||	 |
dd }| j d rld| d| d}
nd}
|| |
 | d| d7 }q|S )a=  
        Print a transcript with speaker labels and timestamps.

        Args:
            sentences (list):
                List containing sentence-level dictionaries.

        Returns:
            string_out (str):
                String variable containing transcript and the corresponding speaker label.
        r   r=   rH   rI   r<  r  r   i   i  z%H:%M:%S.%fz%M:%S.%fr   N
print_time[z - rd   z: r   )ro   r   getr   r  r   fromtimestampstrftime)r   r   r  rI  r=   r5  r6  r<  colordatetime_offsettime_strstart_point_strend_point_strr   r   r   r    s*   

z"OfflineDiarWithASR.print_sentences)F)r   )r  )Nr  )NN)rs  )r  )(__name__
__module____qualname____doc__r   staticmethodr   strr   r   r   r   r   r   r   r   r   r   r   r   r   r  rT  ndarrayr  r%  r2  r.  r0  r   rN  r4  r/  rq  r   r  r  rD  r  r  r   r   r   r   r     s   ("

(M
!
*
I


C
k*"6M  
%4)rD   rE   )-r   rx  r   r   collectionsr   r   r   typingr   r   r   numpyrT   nemo.collections.asr.metrics.derr    nemo.collections.asr.metrics.werr   nemo.collections.asr.modelsr	   .nemo.collections.asr.parts.utils.speaker_utilsr
   r   r   r   r   
nemo.utilsr   r   r,  r-  __all__r  dictr   r"   r;   r   rC   r   rP   r^   re   rk   r   r   r   r   r   <module>   sX   *#+
&
!