o
    }oiP!                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dlm
Z
mZ d dlmZ d dlmZmZmZmZmZ d dlmZ d dlZd dlmZ d dlZd dlZd dlZd dl Z d dl!m"Z"m#Z# d dl$m%Z%m&Z& d d	l'm(Z( d d
l)m*Z* d dl+m,Z, d dl-m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 	 eG dd dZ5dde6de5fddZ7de8de6fddZ9dd Z:de8de8de;fddZ<de;de;fd d!Z=d"e6deej>e6f fd#d$Z?	dd%e6d&e6d'e@d(e@d)e@d*eAd+e6de6fd,d-ZBd.d/ ZCejDjEd0ej>d1ee6e@f d&e6dej>fd2d3ZFd4e6d1e8de6fd5d6ZGejDjEd7ej>dej>fd8d9ZHejDjEd7ej>d:e@dej>fd;d<ZIdej>d=eAde@fd>d?ZJ	dd@e6dAe@dBe@dCej>dee@e@f f
dDdEZKejDjEdCej>d1ee6e@f dej>fdFdGZLejDjEdHej>dIej>dej>fdJdKZMejDjEd7ej>dej>fdLdMZNejDjEdNej>d1ee6e@f dej>fdOdPZOdCej>d1e8dee6e8f fdQdRZPejDjEdCej>d1ee6e@f dej>fdSdTZQdUe6d1e8de6fdVdWZR		XddYe6dZe8d[e@d*eAd+e6d\eSde6fd]d^ZTd_d` ZUdae6dbe6dee%e%f fdcddZVdee8de;fdfdgZW	h	0	i	j	kddee8dle6dme6dne6doe6dpe6d[e@d*eAdee8e8f fdqdrZXdee8deSfdsdtZYddle6dme6doe6deeZe8e8f fdudvZ[				w	 				j	x	yddze6d{ee6 d|ee6 d}ee6 d~eAdBe@de@d:e@d1e8de@deAdeAdej\fddZ]	jddNej>de@d)e@dej^fddZ_de6de;de;fddZ`	Xdd(e@d)e@de6d+e6deSde6fddZade6fddZbde6fddZc	N	dde6de6de6de6fddZd	dde6de6de6de6fddZed"e6dejffddZgdeee@  deee@  fddZhde6deee@  fddZide6deeee@  eee@  f fddZj	ddNeee@  dee@ deee@  fddZk	dd7eee@  de@dBe@de@deSde6fddZl					jdde6de6dee@ de6deSdBe@de@fddZmdddZndde6dee6 de%fddZoddee@ d[e@fddZp	jddee6ee@ f dee6ee@ f d[e@dee%e%f fddńZqde"fddȄZrde6de8de8de8d[e@f
dd΄Zs		Xddej>de#deAdeSfddՄZt	X	ddeej> dee6ee6ee@eAf f f de#deAdeSdeAdeee@  fddۄZudS )    N)	dataclass)repeat)ceilfloor)Path)DictListOptionalTupleUnion)
DictConfig	OmegaConf)
AnnotationSegment)	detection)roc_auc_score)ParameterGrid)tqdm)EncDecClassificationModelEncDecFrameClassificationModel)get_full_path)loggingc                   @   sZ   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed	< d
S )PostProcessingParamsaH  
    Postprocessing parameters for end-to-end speaker diarization models.
    These parameters can significantly affect DER performance depending on the evaluation style and the dataset.
    It is recommended to tune these parameters based on the evaluation style and the dataset
    to achieve the desired DER performance.
          ?onsetoffset        	pad_onset
pad_offsetmin_duration_onmin_duration_offN)__name__
__module____qualname____doc__r   float__annotations__r   r   r   r   r     r'   r'   ^/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/vad_utils.pyr   1   s   
 r   postprocessing_yamlreturnc                 C   s   t t }| du rtd |S t| d-}t|d }td|  d | D ]\}}t	||r:t
||| q+W d   |S 1 sFw   Y  |S )a%  
    Load postprocessing parameters from a YAML file.

    Args:
        postprocessing_yaml (str):
            Path to a YAML file for postprocessing configurations.

    Returns:
        postprocessing_params (dataclass):
            Postprocessing parameters loaded from the YAML file.
    NzeNo postprocessing YAML file has been provided. Default postprocessing configurations will be applied.r
parameterszPostprocessing YAML file 'z' has been loaded.)r   
structuredr   r   infoopenyaml	safe_loaditemshasattrsetattr)r)   postprocessing_paramsfileyaml_paramskeyvaluer'   r'   r(   load_postprocessing_from_yamlB   s$   

r:   configc                    s  d| v r| d r| d }nd}d| v rt j| d |n|}t| d tkrSg }t| d ddd}| D ]}|t	|
  q6W d   n1 sMw   Y  nt| d tkr`| d }ntd	d
| d | d t| d tkrzt| d jndd | ddur| d dkrtj| d d}t|t }tt|t|t|ddd}W d   n1 sw   Y  n fddt|dddD }t j|rtd t | t|ddd"}	|D ]}
|
D ]}t||	 |	d |	  qqW d   |S 1 sw   Y  |S )z
    Perform VAD on long audio snippet might cause CUDA out of memory issue.
    Automatically split manifest entry by split_duration to avoid the potential memory issue.
    prepared_manifest_vad_inputzmanifest_vad_input.jsonout_dirinputr+   utf-8encodingNzThe input for manifest preparation would either be a string of the filepath to manifest or a list of {'audio_filepath': i, 'offset': 0, 'duration': null}.infersplit_durationwindow_length_in_sec )labelrC   rD   manifest_dirnum_workers   	processeszsplitting manifestTtotaldescleavec                    s   g | ]}t | qS r'   write_vad_infer_manifest).0input_el	args_funcr'   r(   
<listcomp>   s    z$prepare_manifest.<locals>.<listcomp>rN   rO   z/The prepared manifest file exists. Overwriting!a
)ospathjointypestrr/   	readlinesappendjsonloadsstriplist
ValueErrorr   parentgetmultiprocessingPoolzipr   r   imapwrite_vad_infer_manifest_starlenexistsr   r.   removedumpwriteflush)r;   manifest_vad_inputdefault_path
input_listmanifestlinepinputsresultsfoutresr+   r'   rT   r(   prepare_manifest`   sj   

 






r}   c                 C      t |  S z?
    A workaround for tqdm with starmap of multiprocessing
    rP   argsr'   r'   r(   rl         rl   r6   rU   c                 C   s  g }|d }|d }|d }| d }|  dd}|  dd}t| s6t|d	 | }	|	 r6|	  }z{d
}
tj||
||d\}}tj||
d}|}|}d}|dkr||krq|dkrb|}d}n
d}|| }||8 }|}d}n&|dksy|dkr|d}nd}|dkr|}|}n|| }||8 }|| }||8 }|||d|d}|| ||7 }|dksUW |S W |S  t	y } z2d}t
|ddd}||d t|  W d   n1 sw   Y  W Y d}~|S W Y d}~|S d}~ww )a!  
    Used by prepare_manifest.
    Given a list of files, split them with maximum split_duration and write them to the manifest.
    Args:
        files (dict) : file to be processed
        args_func:
            label (str): label for audio snippet.y
            split_duration (float): max duration of each audio clip (each line in json)
            window_length_in_sec (float) : length of window for generating the frame. Used for taking care of joint.
    Returns:
        res (list) : list of generated metadata line of json for file
    rF   rC   rD   audio_filepathdurationNr   r   rG   >  )srr   r   yr   singleendstartnext_)r   r   rF   textr   z	error.logwr?   r@   :)rg   r   is_fileabsoluteas_posixlibrosaloadget_durationr`   	Exceptionr/   rq   r^   )r6   rU   r|   rF   rC   rD   filepathin_duration	in_offsetnew_filepathr   x_srr   leftcurrent_offsetstatuswrite_duration
offset_incmetadataeerr_filer{   r'   r'   r(   rQ      sx   

**&
rQ   datac                 C   s0  t | dkr	dgS dgt |  }tt | D ]}|dkr-| | | |d  kr(dnd||< q|t | d krF| | | |d  krAdnd||< q| | | |d  kr_| | | |d  kr_d||< q| | | |d  krx| | | |d  krxd||< q| | | |d  kr| | | |d  krd||< qd||< q|S )a/  
    Generate a list of status for each snippet in manifest. A snippet should be in single, start, next or end status.
    Used for concatenating to full audio file.
    Args:
        data (list): list of filepath of audio snippet
    Returns:
        status (list): list of status of each snippet.
    rI   r   Nr   r   r   r   )rm   range)r   r   ir'   r'   r(   get_vad_stream_status   s    	""(
(
(

r   r   c                 C   sf   g }t | ddd}| D ]	}|t| qW d   n1 s"w   Y  t| j}t||fS )z2
    Load torch.Tensor and the name from file
    r+   r?   r@   N)r/   r_   r`   r%   r   stemtorchtensor)r   framefrw   namer'   r'   r(   load_tensor_from_file  s   
r   frame_pred_dirsmoothing_methodoverlaprD   shift_length_in_secrH   r=   c                 C   s   t  | d }|r|}ntj| d| d t| }tj|s&t| |||||d}	|durf|dkrftj|d }
t	|t
|	}tt|
t|t|dd	d
}W d   |S 1 s_w   Y  |S t|dddD ]}t||	 qm|S )a  
    Generate predictions with overlapping input windows/segments.
    Then a smoothing filter is applied to decide the label for a frame spanned by multiple windows.
    Two common smoothing filters are supported: majority vote (median) and average (mean).
    This function uses multiprocessing to speed up.
    Args:
        frame_pred_dir (str): Directory of frame prediction file to be processed.
        smoothing_method (str): median or mean smoothing filter.
        overlap (float): amounts of overlap of adjacent windows.
        window_length_in_sec (float): length of window for generating the frame.
        shift_length_in_sec (float): amount of shift of window for generating the frame.
        out_dir (str): directory of generated predictions.
        num_workers(float): number of process for multiprocessing
    Returns:
        overlap_out_dir(str): directory of the generated predictions.
    z/*.frameoverlap_smoothing_output_r   )r   rD   r   r=   r   NrI   rJ   zgenerating predsTrL   FrW   )globrZ   r[   r\   r^   rn   mkdirrh   ri   rj   r   rd   r   rk   &generate_overlap_vad_seq_per_file_starrm   !generate_overlap_vad_seq_per_file)r   r   r   rD   r   rH   r=   frame_filepathlistoverlap_out_dirper_argsrx   ry   rz   frame_filepathr'   r'   r(   generate_overlap_vad_seq   s@   


r   c                 C   r~   r   )r   r   r'   r'   r(   r   _  r   r   r   r   c                 C   s  |d }|d }|d }| dd}t|| }t|| d }t|d|  }	t|	| }
|
dk r:td|
 dtt| | }|d	krt|}t|}t| D ]+\}}||
 d
kr_qT|| }|| }||| | |||< ||| d |||< qT|| }||d
k d }|||d
k< |S |dkrdd t|D }t| D ]0\}}||
 d
krq|| }|| }t||D ]}||d krt|| |	d
fd
||< qqt
dd |D }t|}||  d }|||< |S td)a   
    Use generated frame prediction (generated by shifting window of shift_length_in_sec (10ms)) to generate
    prediction with overlapping input window/segments. See description in generate_overlap_vad_seq.
    Use this for single instance pipeline.
    r   rD   r   	frame_len{Gz?rI   zrNote we jump over frame sequence to generate overlapping input segments. 
         Your input makes jump_on_frame=a   < 1 which is invalid because it cannot jump and will stuck.
         Please try different window_length_in_sec, shift_length_in_sec and overlap choices. 
         jump_on_target = int(seg * (1 - overlap)) 
         jump_on_frame  = int(jump_on_frame/shift) meanr   medianc                 S   s   g | ]}t d qS )r   )r   emptyrR   r   r'   r'   r(   rV     s    z7generate_overlap_vad_seq_per_tensor.<locals>.<listcomp>c                 S   s   g | ]	}t j|d dqS )r   )q)r   nanquantile)rR   lr'   r'   r(   rV     s    z0smoothing_method should be either mean or median)rg   intre   rm   r   zeros	enumerater   cat	unsqueezestackisnan)r   r   r   r   rD   r   r   shiftsegjump_on_targetjump_on_frame
target_lenpreds
pred_countr   og_predr   r   last_non_zero_predjnan_idxlast_non_nan_predr'   r'   r(   #generate_overlap_vad_seq_per_tensorf  s^   


r   r   c                 C   s   |d }|d }t | \}}i }|D ]}t|| tks$t|| tkr*|| ||< qt|||}tj||d | }	t|	ddd}
|D ]}|
	|dd qFW d	   |	S 1 s]w   Y  |	S )
z<
    A wrapper for generate_overlap_vad_seq_per_tensor.
    r=   r   .r   r?   r@   .4frY   N)
r   r]   r%   r   r   rZ   r[   r\   r/   rq   )r   r   r=   r   r   r   per_args_floatr   r   overlap_filepathr   predr'   r'   r(   r     s$    
r   segmentsc                 C   s   | j tdgks| j tddgks| j tddgkr| S | | dddf  d  } | dddf | dddf k}tjjj|ddgddd}| | df }tjjj|ddgddd}| | df }tj||fdd	}|S )
z
    Merged the given overlapped segments.
    For example:
    torch.Tensor([[0, 1.5], [1, 3.5]]) -> torch.Tensor([0, 3.5])
    r      rI   Nr   constantr   )moder9   dim)shaper   Sizesortnn
functionalpadr   )r   merge_boundaryhead_paddedheadtail_paddedtailmergedr'   r'   r(   merge_overlap_segment  s    r   	thresholdc                 C   s(   | | dddf | dddf  |k S )z
    Remove segments which duration is smaller than a threshold.
    For example,
    torch.Tensor([[0, 1.5], [1, 3.5], [4, 7]]) and threshold = 2.0
    ->
    torch.Tensor([[1, 3.5], [4, 7]])
    NrI   r   r'   )r   r   r'   r'   r(   filter_short_segments  s   (	r   percc                 C   s.   t | }tt| tt|| d d  S )z)
    Calculate percentile given data
    d   rI   )rm   r%   sortedr   mathr   )r   r   sizer'   r'   r(   
percentile  s   &r   scaler   r   sequencec                 C   sx   | dkr	d}d}n| dkrt |}t|}n| dkr$t|d}t|d}||||   }||||   }t|t|fS )zE
    Calculate onset and offset threshold given different scale.
    r   r   rI   relativer   c   )minmaxr   r%   )r   r   r   r   minimaxir'   r'   r(   cal_vad_onset_offset  s   


r  c                 C   s2  | dd}| dd}| dd}| dd}| dd}d	}d}d
}	td
}
td
t| D ]E}	|ri| |	 |k rh|	| | td
|| krbttd
|| |	| | gd
}t|
|fd
}
|	| }d	}q0| |	 |kru|	| }d}q0|rttd
|| |	| | gd
}t|
|fd
}
t	|
}
|
S )a  
    Binarize predictions to speech and non-speech

    Reference
    Paper: Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of RNN-based Voice
           Activity Detection", InterSpeech 2015.
    Implementation: https://github.com/pyannote/pyannote-audio/blob/master/pyannote/audio/utils/signal.py

    Args:
        sequence (torch.Tensor) : A tensor of frame level predictions.
        per_args:
            onset (float): onset threshold for detecting the beginning and end of a speech
            offset (float): offset threshold for detecting the end of a speech.
            pad_onset (float): adding durations before each speech segment
            pad_offset (float): adding durations after each speech segment;
            frame_length_in_sec (float): length of frame.

    Returns:
        speech_segments(torch.Tensor): A tensor of speech segment in the form of:
                                      `torch.Tensor([[start1, end1], [start2, end2]])`.
    frame_length_in_secr   r   r   r   r   r   r   Fr   T)
rg   r   r   r   rm   r  r   r   r   r   )r   r   r  r   r   r   r   speechr   r   speech_segmentsnew_segr'   r'   r(   binarization  s>   
&r
  original_segmentsto_be_removed_segmentsc                 C   s(   |D ]}| |  |jdd  } q| S )aH  
    Remove speech segments list in to_be_removed_segments from original_segments.
    (Example) Remove torch.Tensor([[start2, end2],[start4, end4]])
              from torch.Tensor([[start1, end1],[start2, end2],[start3, end3], [start4, end4]]),
              ->
              torch.Tensor([[start1, end1],[start3, end3]])
    rI   r   )eqalllogical_not)r  r  r   r'   r'   r(   remove_segmentsJ  s   	r  c                 C   sB   | | dddf   d  } t| dddf | dddf fS )z
    Get the gap segments.
    For example,
    torch.Tensor([[start1, end1], [start2, end2], [start3, end3]]) -> torch.Tensor([[end1, start2], [end2, start3]])
    Nr   rI   r   )r   r   column_stack)r   r'   r'   r(   get_gap_segmentsX  s   &r  r  c                 C   s   | j tdgkr| S |dd}|dd}|dd}|dkrH|dkr*t| |} |dkrFt| }t|t||}t| |fd} t| } | S |dkrdt| }t|t||}t| |fd} t| } |dkrmt| |} | S )a  
    Filter out short non-speech and speech segments.

    Reference:
        Paper: Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of RNN-based Voice
        Activity Detection", InterSpeech 2015.
        Implementation:
        https://github.com/pyannote/pyannote-audio/blob/master/pyannote/audio/utils/signal.py

    Args:
        speech_segments (torch.Tensor):
            A tensor of speech segments in the format
            torch.Tensor([[start1, end1], [start2, end2]]).
        per_args:
            min_duration_on (float):
                Threshold for short speech segment deletion.
            min_duration_off (float):
                Threshold for small non-speech deletion.
            filter_speech_first (float):
                Whether to perform short speech segment deletion first. Use 1.0 to represent True.

    Returns:
        speech_segments (torch.Tensor):
            A tensor of filtered speech segments in the format
            torch.Tensor([[start1, end1], [start2, end2]]).
    r   r   r   r    filter_speech_first      ?)	r   r   r   rg   r   r  r  r   r   )r  r   r   r    r  non_speech_segmentsshort_non_speech_segmentsr'   r'   r(   	filteringc  s4   



r  c                 C   s   | dd}t| dd|d |d | \|d< |d< d|v r,|d r(d|d< nd	|d< i }|D ]}t|| tksBt|| tkrH|| ||< q0||fS )
z1
    Preparing for generating segment table.
    r=   Nr   r   r   r   r  r  r   )rg   r  r]   r%   r   )r   r   r=   r   r   r'   r'   r(   prepare_gen_segment_table  s   
 r  c                 C   s|   d}t | |}t||}|jtdgkr|S t|d\}}|ddddf |ddddf  | }t||f}|S )za
    See description in generate_overlap_vad_seq.
    Use this for single instance pipeline.
    r   r   NrI   r   )r
  r  r   r   r   r   r  )r   r   UNIT_FRAME_LENr  r   durr'   r'   r(   %generate_vad_segment_table_per_tensor  s   

,r  pred_filepathc                 C   sZ  t | \}}t||\}}t||}|ddrdnd}|| }tj||}	|jd dkrat|	ddd"}
|ddrB|
	d	 n|
	d
 W d   |	S W d   |	S 1 sZw   Y  |	S t|	ddd;}
|D ]/}|ddr|
	d| d|d dd|d dd qk|
	|d dd|d dd qkW d   |	S 1 sw   Y  |	S )z=
    A wrapper for generate_vad_segment_table_per_tensor
    use_rttmF.rttm.txtr   r   r?   r@   z.SPEAKER <NA> 1 0 0 <NA> <NA> speech <NA> <NA>
z0 0 speech
NzSPEAKER z 1 r    r   z <NA> <NA> speech <NA> <NA>
z speech
)
r   r  r  rg   rZ   r[   r\   r   r/   rq   )r  r   r   r   r=   r   r   ext	save_name	save_pathfpr   r'   r'   r(   #generate_vad_segment_table_per_file  s8   


,$
r%  Fvad_pred_dirr5   r  r  c                    s  d  fddt D }|s-d}|D ]}|d t| t||  }qt j|}t j|s8t | |||d}	i |	|}	d}|dur}|dkr}t| }
t	|t
|	}tt|
t|t|d	d
d W d   |S 1 svw   Y  |S t|d	d
dD ]}t||	 q|S )a  
    Convert frame level prediction to speech segment in start and end times format.
    And save to csv file  in rttm-like format
            0, 10, speech
            17,18, speech
    Args:
        vad_pred_dir (str): directory of prediction files to be processed.
        postprocessing_params (dict): dictionary of thresholds for prediction score.
        See details in binarization and filtering.
        frame_length_in_sec (float): frame length.
        out_dir (str): output dir of generated table/csv file.
        num_workers(float): number of process for multiprocessing
    Returns:
        out_dir(str): directory of the generated table.
    )r   r   r   c                    s$   g | ]}|  rtj|qS r'   )endswithrZ   r[   r\   rR   r   suffixesr&  r'   r(   rV        $ z.generate_vad_segment_table.<locals>.<listcomp>
seg_output-)r  r=   r  NrI   zcreating speech segmentsTrL   rW   )rZ   listdirr^   r[   r\   rn   r   rh   ri   rj   r   rd   r   rk   (generate_vad_segment_table_per_file_starrm   r%  )r&  r5   r  rH   r=   r  vad_pred_filepath_listout_dir_namer8   r   rx   ry   vad_pred_filepathr'   r)  r(   generate_vad_segment_table  sB   


r3  c                 C   r~   r   )r%  r   r'   r'   r(   r/  5  r   r/  vad_table_filepathgroundtruth_RTTM_filec              	   C   s   t j| ddd}t j|dddd}|jddddd	}t }| D ]\}}|d |t|d |d |d  < q"t }| D ]\}}d
|tt|d t|d t|d  < q?||fS )a6  
    Construct a Pyannote object for evaluation.
    Args:
        vad_table_filepath(str) : path of vad rttm-like table.
        groundtruth_RTTM_file(str): path of groundtruth rttm file.
    Returns:
        reference(pyannote.Annotation): groundtruth
        hypothesis(pyannote.Annotation): prediction
    r   Nsepheaderr7  	delimiterr8  r   r  speaker         columnsSpeechr   rI   )pdread_csvrenamer   iterrowsr   r%   )r4  r5  r   rF   	referenceindexrow
hypothesisr'   r'   r(   &vad_construct_pyannote_object_per_file<  s   $,rK  paramsc                 C   sH   d}d| v r| d }d}|  d tt| }|r"|D ]}||d< q|S )zB
    Get the parameter grid given a dictionary of parameters.
    Fr  T)poprd   r   )rL  has_filter_speech_firstr  params_gridr   r'   r'   r(   get_parameter_gridY  s   

rP  r|   DetERr      vad_predgroundtruth_RTTMresult_filevad_pred_methodfocus_metricc           !      C   s  d}i }	zt |  W n   tdt|||\}
}}t }t| }|D ]$}|D ]}t|| tjks>t|| tj	krFt
|| ||< q*zt||||d}|
D ]}|| }tj||d }t||\}}||| qRtj|dd |jdd}|jd	g d
  }|jd	g d  }|jd	g d  }|dks|dks|dksJ d|||d|	t|< td| d|	t|   |	t| |d  }~|  t|d ddd}|| d|	t|  d W d   n1 sw   Y  ||k r|}|	t| }|}td|| W q% ty, } ztd| d|  W Y d}~q%d}~w tjjyJ }  ztd| d|   W Y d} ~ q%d} ~ ww ||fS )a  
    Tune thresholds on dev set. Return best thresholds which gives the lowest
    detection error rate (DetER) in thresholds.

    Args:
        params (dict): dictionary of parameters to be tuned on.
        vad_pred_method (str): suffix of prediction file. Use to locate file.
                               Should be either in "frame", "mean" or "median".
        groundtruth_RTTM_dir (str): Directory of ground-truth rttm files or a file contains the paths of them.
        focus_metric (str): Metrics we care most when tuning threshold. Should be either in "DetER", "FA", "MISS"
        frame_length_in_sec (float): Frame length.
        num_workers (int): Number of workers.
    Returns:
        best_threshold (float): Threshold that gives lowest DetER.
    r   z(Please check if the parameters are valid)r  rH   r  T)ignore_errorsFdisplayr   )zdetection error rate%)zfalse alarmr[  )missr[  rQ  FAMISSz>Metric we care most should be only in 'DetER', 'FA' or 'MISS'!)z	DetER (%)zFA (%)zMISS (%)z
parameter z, z (%)rX   r?   r@   rY   NzCurrent bestzPass z, with error ) check_if_param_validre   pred_rttm_mapr   DetectionErrorRaterP  r]   npfloat64int64r%   r3  rZ   r[   r\   rK  shutilrmtreereportilocitemr^   r   r.   resetr/   rq   printRuntimeErrorrC  errorsEmptyDataError)!rL  rS  rT  rU  rV  rW  r  rH   	min_scoreall_perfpaired_filenamesgroundtruth_RTTM_dictvad_pred_dictmetricrO  paramr   vad_table_dirfilenamer5  r4  rG  rJ  rg  rQ  r]  r^  scorer$  best_thresholdoptimal_scoresr   e1r'   r'   r(   vad_tune_threshold_on_devk  sl   
$ 
  r|  c                 C   s   | D ],}|dkrt | d tkstdq|dkrq|dkrq| | D ]
}|dks-tdq#qtdd | d	 D rEtdd | d
 D sItddS )z,
    Check if the parameters are valid.
    r  zCInvalid inputs! filter_speech_first should be either True or False!r   r   r   z]Invalid inputs! All float parameters except pad_onset and pad_offset should be larger than 0!c                 s   s    | ]}|d kV  qdS )rI   Nr'   rR   r   r'   r'   r(   	<genexpr>  s    z'check_if_param_valid.<locals>.<genexpr>r   r   zJInvalid inputs! The onset and offset thresholds should be in range [0, 1]!T)r]   boolre   r  )rL  r   r   r'   r'   r(   r_    s&   ,r_  c                 C   s^  i }t j|r&t|ddd}|  }W d   n1 s w   Y  nt j|r7tt j|d}nt	d|D ]}t j
|ddd	 }|||< q=i }t j| rvt| ddd}|  }	W d   n1 spw   Y  nt j| rtt j| d
| }	nt	d|	D ]}t j
|ddd	 }|||< q| | @ }
|
||fS )z<
    Find paired files in vad_pred and groundtruth_RTTM
    r+   r?   r@   Nz*.rttmzcgroundtruth_RTTM should either be a directory contains rttm files or a file contains paths to them!r   rI   r   z*.zavad_pred should either be a directory containing vad pred files or a file contains paths to them!)rZ   r[   isfiler/   read
splitlinesisdirr   r\   re   basenamersplitkeys)rS  rT  rV  rr  r$  groundtruth_RTTM_filesr   rw  rs  vad_pred_filesrq  r'   r'   r(   r`    s<   


r`  r   rI      path2audio_filepath2_vad_predpath2groundtruth_rttmgroundtruth_labelssample_rater   unit_frame_lenlabel_repeatxticks_stepc                 C   s  t jddgd tj| |d||d\}}tj||d}t||| |	}t||	 d }d}|rJt|\}}|t||	 t|| |	  }t	|}t 
 }|t|j| |d	 |d
t|d g |jddd |d |ddg | }|r|rtd|s|std|r|durt||kdd
}n*|r|durt||\}}t||}t||}|t||	 t|| |	  }nd}|r|drt||}n&|rdd |D }|
dkrt||
}|t||	 t|| |	  }nd}|dur|jt||	 |ddd |dur |jt||	 |ddd |dur3|jt||	 |ddd |jddd |jddd |d |ddg |td
t|d | tj||d S )!a  
    Plot Audio and/or VAD output and/or groundtruth labels for visualization
    Args:
        path2audio_file (str):  path to audio file.
        path2_vad_pred (str): path to vad prediction file,
        path2groundtruth_rttm(str): path to groundtruth RTTM file.
        ground_truth_labels(str): a list of groundtruth label.
        sample_rate (int): sample rate of audio file.
        offset (float): offset in seconds.
        duration (float): duration in seconds.
        threshold (float): threshold for prediction score (from 0 to 1).
        per_args(dict): a dict that stores the thresholds for postprocessing.
        unit_frame_len (float): unit frame length in seconds for VAD predictions.
        label_repeat (int): repeat the label for this number of times to match different
                            frame lengths in preds and labels.
        xticks_step (int): step size for xticks.
    rR  r   figsizeTr[   r   monor   r   r   rI   Ngrayr   r   baxis
labelcolorSignalr   z3threshold and per_args cannot be used at same time!z?One and only one of threshold and per_args must have been used!r  c                 S      g | ]}t |qS r'   r%   r(  r'   r'   r(   rV   O      zplot.<locals>.<listcomp>r+   rF   rF   r   zg--zspeech problower rightlocshadowzPreds and Probas皙皙?rate)pltfigurer   r   r   rb  aranger   r   rm   subplotplotr   set_xlimtick_params
set_ylabelset_ylimtwinxre   wherer  r  gen_pred_from_speech_segmentsr'  extract_labelsr   legend
set_xticksipdAudio)r  r  r  r  r  r   r   r   r   r  r  r  audior  timelen_predframe_snippetr   r   ax1ax2pred_snippetr   r  r   rF   r'   r'   r(   r    sj   

 


""



r  probc                 C   sd   t |j}dd | D } | jdd d | D ]}t|d | }t|d | }d|||< q|S )zX
    Generate prediction arrays like 000111000... from speech segments {[0,1][2,4]}
    c                 S   r  r'   rd   r}  r'   r'   r(   rV   l  r  z1gen_pred_from_speech_segments.<locals>.<listcomp>c                 S      | d S Nr   r'   r   r'   r'   r(   <lambda>m      z/gen_pred_from_speech_segments.<locals>.<lambda>r8   r   rI   )rb  r   r   r   r   )r  r  r   r   r   r   r   r'   r'   r(   r  e  s   r  path2ground_truth_labelr  c                 C   s~   t j| dddd}|jddddd}g }|D ]%}||d |k|d |d  |k@  }t|d	kr7|d	 q|d
 q|S )z
    Extract ground-truth label for given time period.
    path2ground_truth_label (str): path of groundtruth RTTM file
    time (list) : a list of array representing time period.
    \s+Nr9  r   r  r;  r<  r@  rI   r   )rC  rD  rE  rm   r`   )r  r  r   labelsposrw   r'   r'   r(   r  v  s   $r  rs   use_featc              
      s  t || }t |d }|| }d}	g }
t|ddd#}|D ]}t|d dd }|
|d	d  qW d
   n1 sAw   Y  tdt|
 d t	|
}t
t  t  dD ]\}} fdd|D }tj jj |r |d |d d}n
 |d |d d}tj|dd}t|jdkr|jd dkr|d}|d
d
df }|dkr|}n+|| dkr|d
|  }n|| dkr|||  }n|| dkr||d
 }n|}|  }|	t|7 }	tj||
| d }t|ddd}tt|D ]}|d||  q	W d
   n	1 s"w   Y  W d
   n	1 s2w   Y  ~|| dksF|| dkrVtd|
|  d|	 d d}	qc|S )zB
    Generate VAD frame level prediction and write to out_dir
    r   r   r+   r?   r@   r   /r   z.wavNzInference on z audio files/json lines!)rM   c                    s   g | ]}|  jqS r'   )todevicer(  	vad_modelr'   r(   rV     s    z+generate_vad_frame_pred.<locals>.<listcomp>rI   )processed_signalprocessed_signal_length)input_signalinput_signal_lengthr   r=  r   r   r   z.framerX   z	{0:0.4f}
r   z Overall length of prediction of z is !)r   r/   ra   rb   splitr`   r   r.   rm   r   r   r   test_dataloaderr   ampautocastr  r]   softmaxr   squeezecputolistrZ   r[   r\   r   rq   formatdebug)r  rD   r   rs   r=   r  	time_unittrunctrunc_lall_lenr   r   rw   r6   r   r   
test_batch	log_probsprobsr   to_saveoutpathr{   r'   r  r(   generate_vad_frame_pred  s`   $
r  
model_pathc                 C   d   |  drtd|   tj| d}|S |  dr"tj| d}|S td|   tj| d}|S z,
    Initiate VAD model with model path
    z.nemozUsing local VAD model from )restore_pathz.ckpt)checkpoint_pathzUsing NGC cloud VAD model )
model_name)r'  r   r.   r   restore_fromload_from_checkpointfrom_pretrainedr  r  r'   r'   r(   init_vad_model     

r  c                 C   r  r  )r'  r   r.   r   r  r  r  r  r'   r'   r(   init_frame_vad_model  r  r  !asr_stitched_output_manifest.jsonsegmented_output_manifestspeech_segments_tensor_dirstitched_output_manifestc                 C   s  t j|st | g }t| ddd}|D ]}t|}|| qW d   n1 s.w   Y  t|ddd#}t	 }d}	t
|dkrtdt
|D ]}
||
d  d ||
d  d ||
d  d	  }}t||gd
}t||fd
}||
d  d }|	|7 }	||
d  d dd ddd
 }||
d  d ||
 d krt j||d }t|| ||
d  d ||	d}t|| |d |  t	 }d}	qO|	d7 }	qOnd}
||
 d ||
 d ||
 d	  }}t||gd
}t||fd
}||
 d }|	|7 }	||
 d dd ddd
 }t j||d }t|| ||
 d ||	d}t|| |d |  td| d|  |W  d   S 1 sbw   Y  dS )z3
    Stitch the prediction of speech segments.
    r+   r?   r@   Nr   rE   rI   r   r   r   	pred_textr   r  r   r   z.pt)r   speech_segments_filepathr  rY   r   z&Finish stitch segmented ASR output to z8, the speech segments info has been stored in directory )rZ   r[   rn   r   r/   ra   rb   r`   r   Tensorrm   r   r   r   r   r  r  r\   saverp   rq   rr   r   r.   )r  r  r  segmented_outputr   rw   r6   r{   r  all_pred_textr   r   r   r	  r  r   speech_segments_tensor_pathmetar'   r'   r(   stitch_segmented_asr_output  sv   

&

&"

&r  vad_asr_out.jsoninput_manifestaligned_vad_asr_output_manifestc                 C   sB  t  }t|ddd}|D ]}t|}|||d < qW d   n1 s%w   Y  g }t| ddd8}|D ]-}t|}|d }||v rV|| d |d< || d |d< nd|d< d|d< || q6W d   n1 snw   Y  t|d	dd}	|D ]}
t|
|	 |	d
 |	  q}W d   |S 1 sw   Y  |S )z
    Generate aligned manifest for evaluation.
    Because some pure noise samples might not appear in stitched_output_manifest.
    r+   r?   r@   r   Nr  r  rE   r   rY   )dictr/   ra   rb   r`   rp   rq   rr   )r  r  r  stitched_outputr   rw   r6   outsampler{   r   r'   r'   r(   construct_manifest_eval2  s<   




r  c                 C   s   t |  std|  tj| dddd}|jddddd	}|d t|d< |d t|d< |d |d  |d
< |jdgd}t	t
|d |d
 |d< |S )z4
    Load rttm file and extract speech segments
    zFile not found: r  Nr9  r   r  r;  r<  r@  r   )bysegment)r   rn   re   rC  rD  rE  astyper%   sort_valuesrd   rj   )r   r   r'   r'   r(   load_rttm_fileV  s   r  	intervalsc                 C   sd   | j dd d g }| D ]#}|r|d d |d k r || qt|d d |d |d d< q|S )z=
    Merge speech segments into non-overlapping segments
    c                 S   r  r  r'   r  r'   r'   r(   r  m  r  z!merge_intervals.<locals>.<lambda>r  r   rI   r   )r   r`   r  )r  r   intervalr'   r'   r(   merge_intervalsi  s    r  	rttm_filec                 C   s*   t t| d }dd |D }t|}|S )zm
    load speech segments from rttm file, where each segment is represented
    as [start, end] interval
    r  c                 S   r  r'   r  r(  r'   r'   r(   rV     r  z2load_speech_segments_from_rttm.<locals>.<listcomp>)rd   r  r  )r  r  r'   r'   r(   load_speech_segments_from_rttm{  s   r  c                 C   s   t t| d }dd |D }|jdd d g }g }|D ]5}|r+|d d |d	 k r1|| q||d	 t|d d |d g t|d d |d |d d< q||fS )
a'  
    Load speech segments from RTTM file, merge and extract possible overlaps

    Args:
        rttm_file (str): Path to RTTM file

    Returns:
        merged (List[List[float]]): merged speech intervals without overlaps
        overlaps (List[List[float]]): intervals with overlap speech
    r  c                 S   r  r'   r  r(  r'   r'   r(   rV     r  z:load_speech_overlap_segments_from_rttm.<locals>.<listcomp>c                 S   r  r  r'   r  r'   r'   r(   r    r  z8load_speech_overlap_segments_from_rttm.<locals>.<lambda>r  r   rI   r   )rd   r  r   r`   r  r  )r  r  r   overlapsr  r'   r'   r(   &load_speech_overlap_segments_from_rttm  s   $ r  max_durationc                 C   sR   g }d}| D ]}|d }| ||g |d }q|dur'||k r'| ||g |S )a  
    Get non-speech segments from given speech segments and maximum duration

    Args:
        speech_segments (List[List[float]]): speech segment intervals loaded by load_speech_segments()
        max_duration (Optional[float]): maximum duration of the audio, used to calculate the last silence segment

    Returns:
        nonspeech_segments (List[List[float]]): intervals of non-speech segments
    r   r   rI   N)r`   )r  r  nonspeech_segmentsr   sp_segr   r'   r'   r(   get_nonspeech_segments  s   
r  Tframe_lengthas_strc           
      C   s   g }t t|| }d}t|D ]S}|||  }	|t| d k r=| | d |	k r=|d7 }|t| d k r=| | d |	k s)| | d dkr_| | d |	  krW| | d kr_n n|d q|d q|rqddd |D S dd |D S )a~  
    Generate frame-level binary labels for audio, '0' for non-speech and '1' for speech

    Args:
        segments (List[List[float]]): speech segments loaded by load_speech_segments_from_rttm
        frame_length (float): frame length in seconds, e.g. 0.01 for 10ms frames
        offset (float): Offset of the audio clip
        duration (float): duration of the audio clip
    r   rI   r   c                 S   r  r'   )r^   r(  r'   r'   r(   rV     r  z$get_frame_labels.<locals>.<listcomp>c                 S   r  r'   r  r(  r'   r'   r(   rV     r  )r   rb  r   r   rm   r`   r\   )
r   r  r   r   r  r  n_framessidr   tr'   r'   r(   get_frame_labels  s     8r"  rE   r   
audio_filer#  showc                 C   sN  t jddgd tj| dd||d\}}tj||d}	t|}
t|
|||	}dd	 | D }t|}t 	 }|
|  |t|j| |d
 |dt|	d g |jddd |d |ddg | }|jt|| |ddd |jddd |jddd |d |ddg |rt   |rt | tj|ddS )zA
    Plot audio signal and frame-level labels from RTTM file
    rR  r   r  r   Tr  r   c                 S   r  r'   r  r(  r'   r'   r(   rV     r  z)plot_sample_from_rttm.<locals>.<listcomp>r  r   rI   r   r  r  r  r   r+   rF   r  r  r  Labelsr  r  r  )r  r  r   r   r   r  r"  r  rm   r  	set_titler  rb  r  r   r  r   r  r  r  r  r  r$  savefigr  r  )r#  r  r  r#  r$  r   r  r  r  r  r   r  lengthr  r  r'   r'   r(   plot_sample_from_rttm  s2   



r)  皙?c                 C   s  t | }t |}t|  } t| }||k r|| }|| }t|| |k rf| }t |t| dkrI|dgt|t |t|   7 }t| }|dt|d}t	|  |  S |dkrq|d|  }|dt
|d}|  S ||kr|| }|| }t|| |k r|jt|dd  }|d| }|S |jt
|dd  }|dkr||| d 7 }|S |  S )a  
    Aligns labels to frames when the frame length (e.g., 10ms) is different from the label length
    (e.g., 20ms). The threshold 0.2 is not critical, as the actual ratio will always be close to an
    integer unless using frame/label lengths that are not multiples of each other (e.g., 15ms frame
    length and 20ms label length), which is not valid. The value 0.2 is chosen for easier unit testing.

    Args:
        probs (List[float]):
            List of probabilities.
        labels (List[int]):
            List of labels.
        threshold (float):
            Threshold for rounding the ratio to an integer.

    Returns:
        labels (List[int]):
            List of labels aligned to frames.
    r   r   rI   Nr   )rm   r   r   r%   longr   r  viewamaxalign_labels_to_framesr   repeat_interleave)r  r  r   
frames_len
labels_lenratior|   r'   r'   r(   r.    s<   "r.  speaker_overridec                 C   s   t  }tj| dddd}|jddddd}| D ])\}}|dur2||t|d |d |d  < q|d |t|d |d |d  < q|S )	a  
    Read rttm file and construct a Pyannote object.
    Args:
        rttm_file(str) : path of rttm file.
        speaker_override(str) : if not None, all speakers will be replaced by this value.
    Returns:
        annotation(pyannote.Annotation): annotation object
    r  Nr9  r   r  r;  r<  r@  )r   rC  rD  rE  rF  r   )r  r3  
annotationr   rH  rI  r'   r'   r(   read_rttm_as_pyannote_objectC  s   	 $r5  r  c                 C   s|   g }d}t | D ]"\}}|dkr|dkr|| }q|dkr*|||d | g d}q|dkr<||t| d | g |S )a  
    Convert a list of labels to a list of speech segments.
    Args:
        labels (List[float]): list of labels
        frame_length_in_sec (float): frame length in seconds
    Returns:
        segments (List[Tuple[float, float]]): list of speech segments
    r   rI   )r   r`   rm   )r  r  r   r   r   rF   r'   r'   r(   !convert_labels_to_speech_segmentsW  s   	r6  
predictiongroundtruthc           
   	   C   s:  t  }t|tr| drt| dd}nPt|trE| drEtj| ddd}| D ]\}}d|tt	|d t	|d t	|d	  < q*nt|t
r`t| |}|D ]}d|t|d |d	 < qQntd
t  }	t|tr{|dr{t|dd}	|	|fS t|t
rt||}|D ]}d|	t|d |d	 < q|	|fS td)a}  
    Construct a Pyannote object for evaluation.
    Args:
        prediction (str) : path of VAD predictions stored as RTTM or CSV-like txt.
        groundtruth (str): path of groundtruth rttm file.
        frame_length_in_sec(float): frame length in seconds
    Returns:
        reference(pyannote.Annotation): groundtruth
        hypothesis(pyannote.Annotation): prediction
    r  r  )r3  r  r   Nr6  r   rI   zAprediction must be a path to rttm file or a list of frame labels.zBgroundtruth must be a path to rttm file or a list of frame labels.)r   
isinstancer^   r'  r5  rC  rD  rF  r   r%   rd   r6  re   )
r7  r8  r  rJ  r   rH  rI  r   r  rG  r'   r'   r(   ,frame_vad_construct_pyannote_object_per_fileo  s0   ,



r:  cfgc                 C   sn  t  }i }i }g }t| j  }t|d}| D ]}t|	 }t
|d |d}	t|	|d< t|	j}
|
|v r@td||
 || | jrd|v rSdnd}||d}|rt
||d}t|}t|| jjj|d |d	 d
}|| ||
< dd | D ||
< q|dddurdd |d  D ||
< qtdqW d   n1 sw   Y  |||fS )aE  
    Load manifest file and prepare label/rttm mapping
    Args:
        cfg: DictConfig object
    Returns:
        manifest_orig (List[Dict]): original manifest data
        key_labels_map (Dict): mapping from unique_audio_name to its labels
        key_rttm_map (Dict): mapping from unique_audio_name to its rttm file
    r+   r   )r#  manifest_filez=Please make sure each line is with different audio_filepath! rttm_filepathr  Nr   r   )r   r  r   r   c                 S   r  r'   r  r(  r'   r'   r(   rV     r  z1frame_vad_infer_load_manifest.<locals>.<listcomp>rF   c                 S   r  r'   r  r(  r'   r'   r(   rV     r  zJMust have either `label` or `rttm_filepath` in manifest when evaluate=True)setr   r  r   r   r/   r_   ra   rb   rc   r   r^   r   re   addr`   evaluaterg   r  r"  vadr,   r   r  )r;  unique_audio_nameskey_labels_mapkey_rttm_mapmanifest_origr<  finrw   entryr   uniq_audio_namerttm_keyr  r   	label_strr'   r'   r(   frame_vad_infer_load_manifest  sH   




"rK  pred_dirrC  rD  key_pred_rttm_mapc              	   C   s.  g }g }t  }i }tt| d}	t|	dt|	dD ]j}
g }|
d}| D ]}|	 }|s3q*|
t| q*W d   n1 sEw   Y  |
j}|||< t||| d||< |||  |||  ||v rr|| }n|| }t|| ||d\}}||| qt||d}|jd	d
}||fS )an  
    Perform evaluation on frame-VAD results
    Args:
        pred_dir: directory of frame-VAD prediction files with in `<unique_audio_name>.frame` format
        key_labels_map: dictionary of mapping each <unique_audio_name> to its labels
        key_rttm_map: dictionary of mapping each <unique_audio_name> to its GROUNDTRUTH rttm file
        key_pred_rttm_map: dictionary of mapping each <unique_audio_name> to its PREDICTED rttm file
        frame_length_in_sec: frame length in seconds, e.g. 0.02s
    Returns:
        auroc: AUROC score in 0~100%
        report: Pyannote detection.DetectionErrorRate() report
    z*.framezEvaluating VAD results)rN   rM   r+   N)r  r  )r7  r8  r  )y_truey_scoreFrY  )r   ra  rd   r   r   r   rm   r/   r_   rc   r`   r%   r   r.  extendr:  r   rg  )rL  rC  rD  rM  r  	all_probs
all_labelsrt  key_probs_mappredictions_list
frame_pred
pred_probsrF  rw   r8   r8  rG  rJ  aurocrg  r'   r'   r(   frame_vad_eval_detection_error  s@   

rX     ts_vad_binary_veccfg_vad_paramsunit_10ms_frame_countbypass_postprocessingc                 C   sN   t | |}|st||}t||}|S d|_d|_d|_d|_t||}|S )a   
    Post-processing on diarization results using VAD style post-processing methods.
    These post-processing methods are inspired by the following paper:
    Medennikov, Ivan, et al. "Target-Speaker Voice Activity Detection:
                              a Novel Approach for Multi-Speaker Diarization in a Dinner Party Scenario." (2020).

    Args:
        ts_vad_binary_vec (Tensor):
            Sigmoid values of each frame and each speaker.
            Dimension: (num_frames,)
        cfg_vad_params (OmegaConf):
            Configuration (omega config) of VAD parameters.
        unit_10ms_frame_count (int, optional):
            an integer indicating the number of 10ms frames in a unit.
            For example, if unit_10ms_frame_count is 8, then each frame is 0.08 seconds.
        bypass_postprocessing (bool, optional):
            If True, diarization post-processing will be bypassed.

    Returns:
        speech_segments (Tensor):
            start and end of each speech segment.
            Dimension: (num_segments, 2)

            Example:
                tensor([[  0.0000,   3.0400],
                        [  6.0000,   6.0800],
                        ...
                        [587.3600, 591.0400],
                        [591.1200, 597.7600]])
    r   r   )r   r/  r
  r  r   r   r   r   )rZ  r[  r\  r]  ts_vad_binary_framesr  r'   r'   r(   ts_vad_post_processing  s   $


r_  r   batch_preds_listaudio_rttm_map_dict	precisionc                    s   g }|rdnd}t t| t||dD ]S\}\}	}
|
d }| | jdd}dd t|jd	 D }t|jd	 D ](}t|d
d
|f |||d}|| }| } fdd|D }|| 	| q:|
| q|S )aT  
    Converts floating point number tensor diarization results to timestamps using VAD style
    post-processing methods.

    Args:
        batch_preds_list (List[Tensor]):
            Tensor diarization results for each sample.
            Dimension: [(num_frames, num_speakers), ...]
        audio_rttm_map_dict (Dict[str, Dict[str, Union[float, int]]]):
            Dictionary mapping unique audio file names to their rttm file entries.
        cfg_vad_params (OmegaConf):
            Configuration (omega config) of VAD parameters.
        unit_10ms_frame_count (int):
            an integer indicating the number of 10ms frames in a unit.
            For example, if unit_10ms_frame_count is 8, then each frame is 0.08 seconds.
        bypass_postprocessing (bool, optional):
            If True, diarization post-processing will be bypassed.
        precision (int, optional):
            The number of decimal places to round the timestamps. Defaults to 2.

    Returns:
        total_speaker_timestamps (List[List[List[float]]]):
            A list of lists of timestamp tensors for each session (utterance)
            Levels:
                - Session-level (uniq_id) [session1_list, session2_list,...]
                    - Segment-level: [[start1, end1], [start2, end2],...]]
                        - List of start and end timestamp [start, end]
    BinarizationzPost-processing)rM   rN   r   r   r   c                 S   s   g | ]}g qS r'   r'   r   r'   r'   r(   rV   Z  s    z*predlist_to_timestamps.<locals>.<listcomp>r   N)r[  r\  r]  c                    s$   g | ]\}}t | t | gqS r'   )round)rR   sttr   rb  r'   r(   rV   d  r+  )r   r   r2   rm   r  r   r   r_  r  rP  r`   )r`  ra  r[  r\  r]  rb  total_speaker_timestamps
pp_message
sample_idxuniq_idaudio_rttm_valuesr   speaker_assign_matspeaker_timestampsspk_idts_matts_seg_raw_listts_seg_listr'   rf  r(   predlist_to_timestamps/  s*   $rr  )N)NF)r|   r   rQ  r   rR  )r   )NNNr   r   NNNr   rI   r  )r   )F)r  r  )r  )T)NrE   Tr   r   )r*  )rY  F)Fr   )vr   ra   r   rh   rZ   re  dataclassesr   	itertoolsr   r   r   pathlibr   typingr   r   r	   r
   r   IPython.displayrZ  r  r   matplotlib.pyplotpyplotr  numpyrb  pandasrC  r   r0   	omegaconfr   r   pyannote.corer   r   pyannote.metricsr   sklearn.metricsr   sklearn.model_selectionr   r   nemo.collections.asr.modelsr   r   4nemo.collections.common.parts.preprocessing.manifestr   
nemo.utilsr   r   r^   r:   r  r}   rl   rd   rQ   r   r  r   r%   r   r   r   jitscriptr   r   r   r   r   r  r
  r  r  r  r  r  r%  r  r3  r/  rK  rP  r|  r_  r>  r`  r  r  arrayr  r  r  r  r  r  r  	DataFramer  r  r  r  r  r"  r)  r.  r5  r6  r:  rK  rX  r_  rr  r'   r'   r'   r(   <module>   s  >P
?
G	

$B
$ F$"
>


	
\&&	

b

>
N
$"*






+?

(4
6
6
