o
    }oiM5                    @   s8  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
mZ d dlZd dlZd dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lm Z  dd Z!de"de"fddZ#dddZ$dddZ%dd Z&dd Z'dd Z(dd Z)dd  Z*dd"d#Z+d$d% Z,d&d' Z-dd(d)Z.d*d+ Z/d,d- Z0d.e	e" d/e	e1 fd0d1Z2	2dd3e3fd4d5Z4d6d7 Z5dd9d:Z6dd;d<Z7d=d> Z8d?d@ Z9dAe	e: dBe	e: de3fdCdDZ;dAe	e: dBe	e: fdEdFZ<dGe	e	e1  de	e	e1  fdHdIZ=ddJe:dKe1de1fdLdMZ>ddJe1dKe1de:fdNdOZ?ddQe	e	e:  dKe1dRe1de	e	e:  fdSdTZ@dUe	e: dVe	e	e:  de	e	e:  fdWdXZA	8ddYe"dZe"d[e3dKe1de"f
d\d]ZB		^	_	`	ddae"dbe"dce:dde:dee:d[e3fdfdgZC	h	P		i	jddke:dce:dde:dle:dee:dKe1dme3dne1doe1de	e	e:  fdpdqZDdke:dce:dde:dle:de	e	e:  f
drdsZEdte:due:dve1dne1dejFf
dwdxZGdydz ZHd{ejFde	e	e:  fd|d}ZId~e	e	e:  de1de	e" fddZJde:de:dejFdejFde:de
ejFejFf fddZKde:de	e	e:  de
e:e1f fddZLdejFde:de:de	e	e:  de1dce:dne1de
e1e	ejF e	e	e:  e	e1 f fddZMde:de:dne1dejFdejFde	e1 dce:dde:de
e	ejF e	e	e:  e	e1 f fddZNdee"eOf dee1ejFf fddZPde	e" de	e" fddZQde1de:de1fddZRde	ee:e1f  de	ejF de
e	e" e	e" f fddZSdZe"fddZTde	e" de	ejF fddZUdd ZVdZe"fddZWde"dee"e	ee:e1f  f de	ejF fddZXd~e	e
e:e:f  de"dee"e"f de	e
e"ef  de	e
e"ef  de	e
e"ef  de"dB fddZYde	e	e:  de"fddZZdddZ[G ddĄ dăZ\dS )    N)deepcopy)DictListTupleUnion)
ListConfig)
AnnotationSegmentTimeline)tqdm)repeat_signal)LongFormSpeakerClustering)get_argmin_matsplit_input_data)loggingc                 C   s0   t | tu rtjtj| d }|S td)z1
    Return base name from provided filepath
    r   zinput must be filepath string)typestrospathsplitextbasename	TypeError)filepathuniq_id r   b/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/speaker_utils.pyget_uniqname_from_filepath$   s   r   linereturnc                 C   s   t |  }t|d }|S )zJ
    Retrieve `uniq_id` from the `audio_filepath` in a manifest line.
    audio_filepath)jsonloadsstripr   )r   dicr   r   r   r   get_uniq_id_from_manifest_line/   s   r$      c                 C   s   t | d }| d du r| d du r|S | d r)ttt| d |td| }nd}| d rDttt| d | d  |td| }nd}| d| d| }|S )	z9
    Return basename with offset and end time labels
    rttm_filepathoffsetNduration
   r   NULL_)r   r   introundpow)metadecimalsbare_uniq_idr'   endtimer   r   r   r   get_uniq_id_with_dur8   s   "*r3   Fc           	      C   s  i }t | dy}| }tdt| |D ]_}| }t|}|d |	dd|	dd|	dd|	dd|	d	d|	d
d|	ddd}|rSt
|}nd| v r^|d }nt|d d}||vrn|||< qtd|d  dW d   |S 1 sw   Y  |S )a  
    This function creates AUDIO_RTTM_MAP which is used by all diarization components to extract embeddings,
    cluster and unify time stamps

    Args:
        manifest (str): Path to the manifest file
        attach_dur (bool, optional): If True, attach duration information to the unique name. Defaults to False.

    Returns:
        AUDIO_RTTM_MAP (dict) : Dictionary with unique names as keys and corresponding metadata as values.
    rzNumber of files to diarize: {}r   r&   Nr'   r(   textnum_speakersuem_filepathctm_filepath)r   r&   r'   r(   r5   r6   r7   r8   r   )r   zfile z^ is already part of AUDIO_RTTM_MAP, it might be duplicated, Note: file basename must be unique)open	readlinesr   infoformatlenr"   r    r!   getr3   keysr   KeyError)	manifest
attach_durAUDIO_RTTM_MAPinp_filelinesr   r#   r/   uniqnamer   r   r   audio_rttm_mapL   s>   












!!rG   c                 C   s~  dd | |fD }dd | ||fD }t |st |rt |r+| g|gdg}}}n| ||}}}ttt|t|t|gdkoIt|dk}t|t|ddd koct|t|ddd k}t|dkrwt d	d t||D }	n|d |d k}	d
di}
t |||	grt|dkrdd tt||D |
d< nd|d |d fi|
d< ||
d< |
S tdt|rtddS )a  
    Check whether multiscale parameters are provided correctly. window_lengths_in_sec, shift_lengfhs_in_sec and
    multiscale_weights should be all provided in omegaconf.listconfig.ListConfig type. In addition, the scales
    should be provided in descending order, from the longest scale to the base scale (the shortest).

    Example:
        Single-scale setting:
            parameters.window_length_in_sec=1.5
            parameters.shift_length_in_sec=0.75
            parameters.multiscale_weights=null

        Multiscale setting (base scale - window_length 0.5 s and shift_length 0.25):
            parameters.window_length_in_sec=[1.5,1.0,0.5]
            parameters.shift_length_in_sec=[0.75,0.5,0.25]
            parameters.multiscale_weights=[1,1,1]

    In addition, you can also specify session-by-session multiscale weight. In this case, each dictionary key
    points to different weights.
    c                 S   s   g | ]}t |tqS r   )
isinstancefloat.0varr   r   r   
<listcomp>       z'parse_scale_configs.<locals>.<listcomp>c                 S   s   g | ]
}t |tttfqS r   )rH   r   listtuplerJ   r   r   r   rM      s    g      ?   r   Nc                 S   s   g | ]\}}||kqS r   r   )rK   wsr   r   r   rM      s    use_single_scale_clusteringFc                 S   s   i | ]\}\}}|||fqS r   r   )rK   krS   rT   r   r   r   
<dictcomp>   s    z'parse_scale_configs.<locals>.<dictcomp>
scale_dictmultiscale_weightsz-Multiscale parameters are not properly setup.z^You must provide a list config for all three parameters: window, shift and multiscale weights.)	allr=   setrO   sortedzip	enumerate
ValueErrorany)window_lengths_in_secshift_lengths_in_secrY   check_float_configcheck_list_configwindow_lengthsshift_lengthslength_checkscale_order_checkshift_length_checkmultiscale_args_dictr   r   r   parse_scale_configs~   sJ   
"
2rk   c                 C   s`  dd | d d   D }|d r*t|}d|d d i|d< |d dd |d< n|}| d \}}|  D ]w}g g g }}}	t|d   D ]5}
| |
 \}}t|| t|| kr`td	t|| }|||  |	|| jd  || qHt|d 	d
 || d< tj|dd
|| d< tj|dd
|| d< t|	|| d< q6|S )a  
    The embeddings and timestamps in multiscale_embeddings_and_timestamps dictionary are
    indexed by scale index. This function rearranges the extracted speaker embedding and
    timestamps by unique ID to make the further processing more convenient.

    Args:
        multiscale_embeddings_and_timestamps (dict):
            Dictionary of embeddings and timestamps for each scale.
        multiscale_args_dict (dict):
            Dictionary of scale information: window, shift and multiscale weights.

    Returns:
        embs_and_timestamps (dict)
            A dictionary containing embeddings and timestamps of each scale, indexed by unique ID.
    c                 S   s   i | ]}|i qS r   r   rK   r   r   r   r   rW          z+get_embs_and_timestamps.<locals>.<dictcomp>r   rU   rX   rY   NrQ   z;Mismatch of counts between embedding vectors and timestampsdim
embeddings
timestampsmultiscale_segment_counts)r?   r   r\   r=   r_   torchtensorappendshape	unsqueezerI   cat)$multiscale_embeddings_and_timestampsrj   embs_and_timestamps_multiscale_args_dictrp   rq   r   embeddings_listtime_stamps_listsegment_index_list	scale_idxtime_stamps_tensorr   r   r   get_embs_and_timestamps   s.   
r   c                 C   s^   dd | d   D }t|d   D ]}| | }|  D ]}d|| i|| d |< qq|S )aU  
    The timestamps in `multiscale_timestamps` dictionary are indexed by scale index.
    This function rearranges the extracted speaker embedding and timestamps by unique ID
    to make the further processing more convenient.

    Args:
        multiscale_timestamps (dict):
            Dictionary of timestamps for each scale.
        multiscale_args_dict (dict):
            Dictionary of scale information: window, shift and multiscale weights.

    Returns:
        timestamps_dict (dict)
            A dictionary containing embeddings and timestamps of each scale, indexed by unique ID.
    c                 S   s   i | ]}|d i iqS )rX   r   rl   r   r   r   rW     rN   z"get_timestamps.<locals>.<dictcomp>r   rX   time_stamps)r?   r\   )multiscale_timestampsrj   timestamps_dictr   r   r   r   r   r   get_timestamps   s   r   c                 C   s   t | }g }tt|d D ]Q}||  \}}}||d   \}}}	t|t|krRtt|t| d }
d|
||	g||d < ||d |
 d |  q||d | d |  q|d  \}}}||d | d |  |S )z'
    Return contiguous time stamps
    rQ   g       @ rR   )r   ranger=   splitrI   r   joinru   )stampsrE   contiguous_stampsistartendspeaker
next_startnext_endnext_speakeravgr   r   r   get_contiguous_stamps  s   r   c           
      C   s   t | }g }tt|d D ]<}||  \}}}||d   \}}}	t|t|kr=||	kr=d|||	g||d < q||d | d |  q|d  \}}}||d | d |  |S )z0
    Merge time stamps of the same speaker.
    rQ   r   rR   )r   r   r=   r   rI   r   ru   )
rE   r   overlap_stampsr   r   r   r   r   r   r   r   r   r   merge_stamps'  s   r    c                 C   sJ   t |d}| D ]}|  \}}}t|t|}}||t||< q|S )z\
    Convert the given labels to pyannote object to calculate DER and for visualization
    uri)r   r"   r   rI   r	   )labels	uniq_name
annotationlabelr   r   r   r   r   r   labels_to_pyannote_object;  s   
r   c                 C   s   t j||d }t|d2}| D ]&}| }| \}}}t|t| }	t|}d|||	|}
||
 qW d   |S 1 sCw   Y  |S )zU
    Write rttm file with uniq_id name in out_rttm_dir with timestamps in labels
    .rttmrS   z6SPEAKER {} 1   {:.3f}   {:.3f} <NA> <NA> {} <NA> <NA>
N)	r   r   r   r9   r"   r   rI   r<   write)r   r   out_rttm_dirfilenamefr   r   r   r   r(   logr   r   r   labels_to_rttmfileH  s   
		r   c                 C   s   t t| |S )z8
    Convert string to float then round the number.
    )r-   rI   )xround_digitsr   r   r   string_to_floatY  s   r   c                 C   sH   |    }t|d |}t|d |t|d | }|d }|||fS )a  
    Convert a line in RTTM file to speaker label, start and end timestamps.

    Args:
        rttm_line (str):
            A line in RTTM formatted file containing offset and duration of each segment.
        round_digits (int):
            Number of digits to be rounded.

    Returns:
        start (float)
            Start timestamp in floating point number.
        end (float):
            End timestamp in floating point number.
        speaker (str):
            speaker string in RTTM lines.
    r%         )r"   r   r   )	rttm_liner   rttmr   r   r   r   r   r   convert_rttm_line`  s
   
r   c              	   C   sh   g }t | d#}| D ]}t|dd\}}}|d||| qW d   |S 1 s-w   Y  |S )z7
    Prepare time stamps label list from rttm file
    r4   r%   )r   z{} {} {}N)r9   r:   r   ru   r<   )rttm_filenamer   r   r   r   r   r   r   r   r   rttm_to_labelsy  s   
r   c                 C   s\   t j|dd|  d}t|d}|D ]}|| qW d   dS 1 s'w   Y  dS )aj  
    Write cluster labels that are generated from clustering into a file.
    Args:
        base_scale_idx (int): The base scale index which is the highest scale index.
        lines_cluster_labels (list): The start and end time-stamps of each segment with the predicted cluster label.
        out_rttm_dir (str): The path where output rttm files are saved.
    z../speaker_outputssubsegments_scalez_cluster.labelrS   N)r   r   r   r9   r   )base_scale_idxlines_cluster_labelsr   out_label_namer   clus_label_liner   r   r   write_cluster_labels  s   "r   segment_rangescluster_labelsc           
      C   s`   g }t |D ]\}}dt| }| | \}}|| d| d|  qt|}t|}	|	|fS )a  
    Generate cluster (speaker labels) from the segment_range list and cluster label list.

    Args:
        segment_ranges (list):
            List containing intervals (start and end timestapms, ranges) of each segment
        cluster_labels (list):
            List containing a cluster label sequence

    Returns:
        diar_hyp (list):
            List containing merged speaker-turn-level timestamps and labels in string format
            Example:
                >>>  diar_hyp = ['0.0 4.375 speaker_1', '4.375 5.125 speaker_0', ...]

        lines (list)
            List containing raw segment-level timestamps and labels in raw digits
                >>>  diar_hyp = ['0.0 0.25 speaker_1', '0.25 0.5 speaker_1', ..., '4.125 4.375 speaker_1']
    speaker_r   )r^   r   ru   r   r   )
r   r   rE   idxr   tagsttr   
cont_linesdiar_hypr   r   r   generate_cluster_labels  s   r   Tverbosec                    s  g }g }d}g }	d}
|j dkrtd d}
t|
d}|ddr.tj|}tj|d t	|
 dd| d	D ]\ }|   }|jrS|d
d}|du rRtdnd}|d jd d }|j|d |d |d |d t|t|jt|jt|j|dd|ddd
}~|
rtj  nt  |j| }|  }t||jd krtdt||\}}|rt| | |	 fdd|D  t| d}|  |g |dd}|durt!j"#|r|st$|}t| d}|  |g q9d}g }q9|rt%||	| ||fS )a  
    Performs spectral clustering on embeddings with time stamps generated from VAD output

    Args:
        embs_and_timestamps (dict): This dictionary contains the following items indexed by unique IDs.
            'embeddings' : Tensor containing embeddings. Dimensions:(# of embs) x (emb. dimension)
            'timestamps' : Tensor containing ime stamps list for each audio recording
            'multiscale_segment_counts' : Tensor containing the number of segments for each scale
        AUDIO_RTTM_MAP (dict):
            AUDIO_RTTM_MAP for mapping unique id with audio file path and rttm path
        out_rttm_dir (str):
            Path to write predicted rttms
        clustering_params (dict):
            Clustering parameters provided through config that contains max_num_speakers (int),
            oracle_num_speakers (bool), max_rp_threshold(float), sparse_search_volume(int)
            and enhance_count_threshold (int).
        use_torch_script (bool):
            Boolean that determines whether to use torch.jit.script for speaker clustering
        device (torch.device):
            Device we are running on ('cpu', 'cuda').
        verbose (bool):
            Enable TQDM progress bar.

    Returns:
        all_reference (list[uniq_name,Annotation]): reference annotations for score calculation
        all_hypothesis (list[uniq_name,Annotation]): hypothesis annotations for score calculation

    FTcudaz[cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.)r   export_script_modulezspeaker_clustering_script.pt
clustering)descleavedisabler6   NzNProvided option as oracle num of speakers but num_speakers in manifest is nullrR   rr   r   rQ   rp   rq   rY   chunk_cluster_countembeddings_per_chunk)
embeddings_in_scalestimestamps_in_scalesrr   rY   oracle_num_speakersmax_num_speakersmax_rp_thresholdsparse_search_volumer   r   z9Mismatch of length between cluster_labels and timestamps.c                    s   g | ]
}  d | dqS )r   
r   )rK   seg_liner   r   r   rM   
  s    z&perform_clustering.<locals>.<listcomp>r   r&   )&r   r   warningr   r>   rs   jitscriptsaver   itemsr   r_   rv   forward_inferr,   r   rI   r   r   r   empty_cachegccollectr   cpunumpyr=   r   r   extendr   ru   r   r   existsr   r   )rz   rC   r   clustering_paramsdevicer   all_hypothesisall_referenceno_referencesr   r   speaker_clusteringaudio_rttm_valuesuniq_embs_and_timestampsr6   r   r   rq   r   rE   
hypothesis	rttm_file
ref_labels	referencer   r   r   perform_clustering  sr   





r   c                 C   sp   |    }t|dkr"t|d t|d |d }}}||fS t|d t|d |d }}}||fS )z:
    Extract VAD timestamp from the given RTTM lines.
    r%   r   r   r   rQ      )r"   r   r=   rI   )r   vad_outr   durr+   r   r   r   get_vad_out_from_rttm_line  s   $$r      c                 C   sj   | | d }| |  ddr$t| | d |}t| | d |}||fS t|}|j|j }d}||fS )aG  
    Extract offset and duration information from AUDIO_RTTM_MAP dictionary.
    If duration information is not specified, a duration value is extracted from the audio file directly.

    Args:
        AUDIO_RTTM_MAP (dict):
            Dictionary containing RTTM file information, which is indexed by unique file id.
        uniq_id (str):
            Unique file id
    Returns:
        offset (float):
            The offset value that determines the beginning of the audio stream.
        duration (float):
            The length of audio stream that is expected to be used.
    r   r(   Nr'           )r>   r-   sf	SoundFileframes
samplerate)rC   r   r0   
audio_pathr(   r'   soundr   r   r   get_offset_and_duration)  s   
r   c           	      C   sT   || d }|D ]\}}|t ||t || |d|d}t||  | d qdS )a  
    Write the json dictionary into the specified manifest file.

    Args:
        outfile:
            File pointer that indicates output file path.
        AUDIO_RTTM_MAP (dict):
            Dictionary containing the input manifest information
        uniq_id (str):
            Unique file id
        overlap_range_list (list):
            List containing overlapping ranges between target and source.
        decimals (int):
            Number of decimals to round the offset and duration values.
    r   UNKr   r'   r(   r   r   r   N)r-   r    dumpr   )	outfilerC   r   overlap_range_listr0   r   r   r   r/   r   r   r   write_overlap_segmentsD  s   r   c                 C   sV   | r$t j| r$t| d}| }W d   |S 1 sw   Y  |S td|  )z
    Read rttm files and return the rttm information lines.

    Args:
        rttm_file_path (str):
            An absolute path to an RTTM file

    Returns:
        lines (list):
            List containing the strings from the RTTM file.
    r4   NzkRequested to construct manifest from rttm with oracle VAD option or from NeMo VAD but received filename as )r   r   r   r9   r:   FileNotFoundError)rttm_file_pathr   rE   r   r   r   read_rttm_linesa  s   

r  c           	      C   s   t  }t|d#}|D ]}| }t|}|d dkr#||d  qW d   n1 s.w   Y  t |  }|| }|D ]}| |= t| d q?t	| dkrWt
ddS )a?  
    This function will check the valid speech segments in the manifest file which is either
    generated from NeMo voice activity detection(VAD) or oracle VAD.
    If an audio file does not contain any valid speech segments, we ignore the audio file
    (indexed by uniq_id) for the rest of the processing steps.
    r4   r(   r   r   NzN is ignored since the file does not contain any speech signal to be processed.zCAll files present in manifest contains silence, aborting next steps)r[   r9   r"   r    r!   addr?   r   r   r=   r_   )	rC   vad_manifestvad_uniq_idsvad_filer   r#   provided_uniq_idssilence_idsr   r   r   r   validate_vad_manifestx  s$   
r  rangeArangeBc                 C   s4   | d | d }}|d |d }}||ko||kS )ad  
    Check whether two ranges have overlap.

    Args:
        rangeA (list, tuple):
            List or tuple containing start and end value in float.
        rangeB (list, tuple):
            List or tuple containing start and end value in float.
    Returns:
        (bool):
            Boolean that indicates whether the input ranges have overlap.
    r   rQ   r   )r	  r
  start1end1start2end2r   r   r   
is_overlap  s   r  c                 C   sB   t | |sJ d|  d| t| d |d t| d |d gS )a|  
    Calculate the overlapping range between rangeA and rangeB.

    Args:
        rangeA (list, tuple):
            List or tuple containing start and end value in float.
        rangeB (list, tuple):
            List or tuple containing start and end value in float.

    Returns:
        (list):
            List containing the overlapping range between rangeA and rangeB.
    z#There is no overlap between rangeA:z and rangeB:r   rQ   )r  maxmin)r	  r
  r   r   r   get_overlap_range  s   $r  intervals_inc                 C   s  t | }|dkr
g S |dkr| S g }d}d}dd | D } t| }tj|dd\}}dd | D }|}	|	d d |	d d }
}td|D ]0}|	| d |	| d }}||krat||}qHt|
t|}
}||
|g |}
t||}qHt|
t|}
}||
|g |S )a/  
    Interval merging algorithm which has `O(N*logN)` time complexity. (N is number of intervals)
    Merge the range pairs if there is overlap exists between the given ranges.
    This algorithm needs a sorted range list in terms of the start time.
    Note that neighboring numbers lead to a merged range.

    Example:
        input: [(1, 10), (11, 20)]
        output: [(1, 20)]

    Refer to the original code at https://stackoverflow.com/a/59378428

    Args:
        intervals_in (list):
            List containing ranges.
            Example:
                >>> intervals_in
                [(102, 103), (104, 109), (107, 120)]

    Returns:
        merged_list (list):
            List containing the combined ranges.
            Example:
                >>> merged_list
                [(102, 120)]
    r   rQ   c                 S   $   g | ]}t |d  t |d gqS r   rQ   r,   rK   r   r   r   r   rM        $ z'merge_int_intervals.<locals>.<listcomp>rn   c                 S   r  r  r  r  r   r   r   rM     r  )	r=   rs   rt   sortr   r   r  r,   ru   )r  num_intervalsmerged_liststt2r  interval_tensor_sortedr+   _sorted_int	intervalsr   r   r   r   r   r   merge_int_intervals  s2   
r!  r   r0   c                 C   s&   t jt | d|  gdd  S )z3
    Convert floating point number to integer.
    r)   r   r0   )rs   r-   rt   r,   itemr   r0   r   r   r   fl2int  s   &r%  c                 C   s"   t jt | d|  g|d S )z3
    Convert integer to floating point number.
    r)   r"  )rs   r-   rt   r#  r$  r   r   r   int2fl  s   "r&  r   rangesmarginc           	         sv   g }g }| D ]"}t t|d   t t|d  }}||k r(|||g qt|}g } fdd|D }|S )a  
    Combine overlaps with floating point numbers. Since neighboring integers are considered as continuous range,
    we need to add margin to the starting range before merging then subtract margin from the result range.

    Args:
        ranges (list):
            List containing ranges.
            Example: [(10.2, 10.83), (10.42, 10.91), (10.45, 12.09)]
        decimals (int):
            Number of rounding decimals
        margin (int):
            margin for determining overlap of the two ranges when ranges are converted to integer ranges.
            Default is margin=2 which follows the python index convention.

        Examples:
            If margin is 0:
                [(1, 10), (10, 20)] -> [(1, 20)]
                [(1, 10), (11, 20)] -> [(1, 20)]
            If margin is 1:
                [(1, 10), (10, 20)] -> [(1, 20)]
                [(1, 10), (11, 20)] -> [(1, 10), (11, 20)]
            If margin is 2:
                [(1, 10), (10, 20)] -> [(1, 10), (10, 20)]
                [(1, 10), (11, 20)] -> [(1, 10), (11, 20)]

    Returns:
        merged_list (list):
            List containing the combined ranges.
            Example: [(10.2, 12.09)]
    r   rQ   c                    s,   g | ]}t |d    t |d  gqS r  )r&  r  r0   r(  r   r   rM   %     , z)merge_float_intervals.<locals>.<listcomp>)r,   r%  ru   r!  )	r'  r0   r(  
ranges_intmerged_ranges_intr   r   r   merged_ranges_floatr   r)  r   merge_float_intervals  s   *r.  target_rangesource_range_listc                 C   s@   t | dkrg S g }|D ]}t|| rt|| }|| q|S )a  
    Get the ranges that has overlaps with the target range from the source_range_list.

    Example:
        source range:
            |===--======---=====---====--|
        target range:
            |--------================----|
        out_range:
            |--------===---=====---==----|

    Args:
        target_range (list):
            A range (a start and end value pair) that defines the target range we want to select.
            target_range = [(start, end)]
        source_range_list (list):
            List containing the subranges that need to be selected.
            source_range = [(start0, end0), (start1, end1), ...]
    Returns:
        out_range (list):
            List containing the overlap between target_range and
            source_range_list.
    r   )r=   r  r  ru   )r/  r0  	out_ranges_range	ovl_ranger   r   r   get_sub_range_list)  s   


r4  rC   manifest_fileinclude_uniq_idc                 C   s   t |dk}| D ]_}| | d }t|}t| ||\}}	g }
|D ]}t|\}}|
||| g q t|
|}t|dkrGtd| d q|	dkrUtd| d qt	||||	 gd}t
|| ||| qW d   |S 1 ssw   Y  |S )	a  
    Write manifest file based on rttm files (or vad table out files). This manifest file would be used by
    speaker diarizer to compute embeddings and cluster them. This function takes care of overlapping VAD timestamps
    and trimmed with the given offset and duration value.

    Args:
        AUDIO_RTTM_MAP (dict):
            Dictionary containing keys to unique names, that contains audio filepath and rttm_filepath as its contents,
            these are used to extract oracle vad timestamps.
        manifest (str):
            The path to the output manifest file.

    Returns:
        manifest (str):
            The path to the output manifest file.
    rS   r&   r   z	File ID: z6: The VAD label is not containing any speech segments.z/: The audio file has negative or zero duration.)r0  r/  N)r9   r  r   r   ru   r.  r=   r   r   r4  r   )rC   r5  r6  r0   r   r   r   
rttm_linesr'   r(   vad_start_end_list_rawr   r   r   vad_start_end_listr   r   r   r   write_rttm2manifestL  s.   

r:        ?      ?皙?segments_manifest_filesubsegments_manifest_filewindowshiftmin_subsegment_durationc              
   C   s:  |du rt  }t j|d}t| d}t|da}| }	|	D ]R}
|
 }
t|
}|d |d |d |d f\}}}}t	||||d	}|rQd
|v rQ|d
 }nd}|D ]}|\}}||krr|||||d}t
|| |d qUq!W d   n1 s~w   Y  W d   |S W d   |S 1 sw   Y  |S )a}  
    Generate subsegments manifest from segments manifest file
    Args:
        segments_manifest file (str): path to segments manifest file, typically from VAD output
        subsegments_manifest_file (str): path to output subsegments manifest file
                                        (default (None) : writes to current working directory)
        window (float): window length for segments to subsegments length
        shift (float): hop length for subsegments shift
        min_subsegments_duration (float): exclude subsegments smaller than this duration value

    Returns:
        returns path to subsegment manifest file
    Nzsubsegments.jsonr4   rS   r   r'   r(   r   r'   r@  rA  r(   r   r   r   )r   getcwdr   r   r9   r:   r"   r    r!   get_subsegments_scriptabler   r   )r>  r?  r@  rA  rB  r6  pwdsegments_manifestsubsegments_manifestsegmentssegmentr#   audior'   r(   r   subsegmentsr   
subsegmentr   r   r/   r   r   r   )segments_manifest_to_subsegments_manifestu  sL   
$

(rN  {Gz?>  d   r'   r(   use_asr_style_frame_countsample_ratefeat_per_secc	                 C   sL  g }	| }
|
| }||  kr|krn nd}n9|du rBt d||  t||  t}t |t||  t}|
||  }nt d|| |  t}|dkrgt|||kre|	|
t||g |	S |dkrt| ||d| }t|t| | |t| }tj	||d}||k}tj
|| || gdd}| }	|	S )ab  
    Return subsegments from a segment of audio file.

    Example:
        (window, shift) = 1.5, 0.75
        Segment:  [12.05, 14.45]
        Subsegments: [[12.05, 13.55], [12.8, 14.3], [13.55, 14.45], [14.3, 14.45]]

    Args:
        offset (float): Start time of audio segment
        window (float): Window length for segments to subsegments length
        shift (float): Hop length for subsegments shift
        duration (float): Duration of segment
        min_subsegment_duration (float): Exclude subsegments smaller than this duration value
        decimals (int): Number of decimal places to round to
        use_asr_style_frame_count (bool): If True, use asr style frame count to generate subsegments.
                                          For example, if duration is 10 secs and frame_shift is 0.08 secs,
                                          it results in (10/0.08)+1 = 125 + 1 frames.

    Returns:
        subsegments (List[tuple[float, float]]): subsegments generated for the segments as
                                                 list of tuple of start and duration of each subsegment
    rQ   Tr   Nr"  rn   )npceilr,   astyper  ru   rs   arange	ones_liker-   stacktolist)r'   r@  rA  r(   rB  r0   rR  rS  rT  rL  r   	slice_endslicesnum_feat_frames	start_coldur_col_rawdur_col
valid_maskvalid_subsegmentsr   r   r   get_subsegments  s0   "$
rd  c                 C   s~   g }| }|| }t || | }|dk rdn|d }t|D ]}	|| }
|
|kr+|}
|||
| g | |	d |  }q|S )a  
    This function returns subsegments from a segment of an audio file.
    Although this implementation is inefficient due to the use of a for-loop for segmentation,
    it is designed to be torch-jit-scriptable.
    Use `get_subsegments` for a more efficient implementation.

    Args:
        offset (float): start time of audio segment
        window (float): window length for segments to subsegments length
        shift (float): hop length for subsegments shift
        duration (float): duration of segment
    Returns:
        subsegments (List[tuple[float, float]]): subsegments generated for the segments
                                                 as list of tuple of start and duration of
                                                 each subsegment
    r   rQ   )mathrV  r   ru   )r'   r@  rA  r(   rL  r   r\  baser]  slice_idr   r   r   r   rE    s   rE  	start_secend_secslice_lengthc                 C   s2   t || }tt || t || }| || S )a  
    Extract time-series signal from the given audio buffer based on the start and end
    timestamps.

    Args:
        start_sec (float):
            Start of the targeted segments in second
        end_sec (float):
            Start of the targeted segments in second
        slice_length (int):
            Length of the entire audio segment that the samples are extracted from
        sample_rate (int):
            Sampling rate of the time-series audio signal

    Returns:
        (Tensor) Trimmed ime-series audio signal samples
    )r,   r  )sigrh  ri  rj  rS  	start_idxend_idxr   r   r   get_target_sig  s   rn  c                 C   s8   t | jd D ]}| | }|d |d k rtdqdS )a  
    Check whether the range list has any faulty timestamp order.

    Args:
        range_tensor (list):
            List containing the start and end time of the segments.
            Example:
                >>> range_tensor = [[0.5, 3.12], [3.51, 7.26], ... ]
    r   rQ   zIRange start time should be preceding the end time but we got: {range_tup}T)r   rv   r_   )range_tensorrV   	range_tupr   r   r   check_ranges%  s   
rq  ro  c                    s    fddt  jd D S )zL
    For online segmentation. Force the list elements to be float type.
    c                    s,   g | ]}t  | d  t  | d gqS r  )rI   )rK   rV   ro  r   r   rM   :  r*  z"tensor_to_list.<locals>.<listcomp>r   )r   rv   rr  r   rr  r   tensor_to_list6  s   rs  speaker_timestampsmodel_spk_numc              
   C   s\   g }t |D ]%}| | }t|}|D ]}||d dd|d ddt| g qq|S )a  
    Generate diarization output lines list from the speaker timestamps list by merging overlapping intervals.

    Args:
        speaker_timestamps (list):
            List containing the start and end time of the speech intervals for each speaker.
            Example:
                >>> speaker_timestamps = [[0.5, 3.12], [3.51, 7.26],... ]
        model_spk_num (int):
            Number of speakers in the model.

    Returns:
        speaker_lines_total (list):
            List containing the diarization output lines in the format:
            "start_time end_time speaker_id"
            Example:
                >>> speaker_lines_total = ["0.5 3.12 speaker_0", "3.51 7.26 speaker_1",...]
    r   z.3fr   rQ   	 speaker_)r   r.  r   r,   )rt  ru  speaker_lines_totalspk_idxts_invervalsmerged_ts_intervalsts_intervalr   r   r   !generate_diarization_output_lines=  s   .r|  frame_start
buffer_endvad_timestampscumulative_speech_labelscursor_for_old_segmentsc           	      C   s   g }|| k rt |t | g}t|}t|}tt | t |g|d}t||d}t|| dd}t|| dd}t|}t|}||fS )ad  
    Bring the new speech labels from the current buffer. Followingly:

    1. Concatenate the old speech labels from self.cumulative_speech_labels for the overlapped region.
        - This goes to new_speech_labels.
    2. Update the new 1 sec of speech label (speech_label_for_new_segments) to self.cumulative_speech_labels.
    3. Return the speech label from cursor_for_old_segments to buffer end.

    Args:
        frame_start (float):
            Start of the middle audio chunk in the audio buffer
        buffer_end (float):
            End of the audio buffer
        vad_timestamps (Tensor):
            Tensor containing VAD intervals (start and end timestamps)
        cumulative_speech_labels (torch.Tensor):
            Cumulative speech/non-speech timestamps (equivalent to VAD timestamps)
        cursor_for_old_segments (float):
            Floating point number that indicates the point where new segments should replace
            the old segments

    Returns:
        speech_label_for_new_segments (Tensor):
            The intervals (start and end) timestamps where the new incoming speech segments should
            be collected from
        cumulative_speech_labels (Tensor):
            Cumulative speech/non-speech timestamps (equivalent to VAD timestamps) with newly added
            speech/non-speech timestamps from the `vad_timestamps` input
    )r/  r0  r   )r(  )rI   rs  r4  r.  rs   rt   )	r}  r~  r  r  r  update_overlap_rangenew_incoming_speech_labelsupdate_overlap_speech_labelsspeech_label_for_new_segmentsr   r   r   get_speech_labels_for_updateY  s$   $

r  segment_range_tsc                 C   sl   | }t |}d}t |dkr,|d|d   }| |d kr%|d7 }|d }nnt |dkst || }||fS )a  
    Function for updating a cursor online speaker diarization.
    Remove the old segments that overlap with the new frame (self.frame_start)
    cursor_for_old_segments is set to the onset of the t_range popped lastly.


    Args:
        frame_start (float):
            Start of streaming pipeline frame
        segment_range_ts (float):
            Interval (start and end timestamps) of the targeted segments

    Returns:
        cursor_for_old_segments (float):
            Floating point number that indicates the point where new segments should replace
            the old segments
        cursor_index (int):
            The index of the first newly accepted segments
    r   TrR   rQ   )r=   )r}  r  r  cursor_indexcountt_ranger   r   r   get_new_cursor_for_update  s   
r  rk  buffer_startrL  
ind_offsetc                 C   s"  g }g }g }	t || }
d}|D ]b}|d |d }}||kr q|d7 }|| }t|| }||kr9tt||}t| |||
|}t|dkrKtdt||
k rYt|t||
}|| }|| }|	| |||g || qt|	t|  krt|kstd td||	||fS )a\  
    Create short speech segments from slices for online processing purpose.

    Args:
        sig (Tensor):
            Tensor containing the raw time-series signal
        buffer_start (float):
            Start point of the time-series signal buffer
        buffer_end (float):
            End point of the time-series signal buffer
        subsegments (list):
            List containing the interval information (start and duration) of each segment
        ind_offset (int):
            Offset for index that compensates the point of the current position in the streaming session
        window (float):
            Window length in second
        shift (float):
            Shift length in second

    Returns:
        sigs_list  (list):
            list of sliced input signal
        audio_lengths (list):
            list of audio sample lengths
    r   r   rQ   z6len(signal) is zero. Signal length should not be zero.z)Signal information lists have a mismatch.)r,   rI   r  rn  r=   r_   r   ru   )rk  r  r~  rL  r  r@  rS  sig_rangel_listsig_indexes	sigs_listrj  ri  subsegrh  r   
buffer_lensignalstart_abs_secend_abs_secr   r   r   get_online_segments_from_slices  s:   "
r  speech_labels_for_updateaudio_buffersegment_indexesc              
   C   s  g }g }	g }
t |dkr|d }nd}t|D ]Q\}}t|d  |  t|d  |  g}td|d |d g}t|d |||d |d  d}t|| |||||d\}}}}|| |	| |
| qt |t |	  krzt |
ks}J  J ||	|
fS )a1  
    Generate subsegments for online processing from the given segment information.
    This function extracts subsegments (embedding vector level) time-series from the
    raw time-series buffer based on the segment interval (start and end timestamps) information.

    Args:
        buffer_start (float):
            Start point of the time-series signal buffer
        buffer_end (float):
            End point of the time-series signal buffer
        sample_rate (int):
            Sampling rate of the audio input
        speech_labels_for_update (Tensor):
            Tensor containing intervals (start and end timestamps) of the speech segments
        audio_buffer (Tensor):
            Tensor containing the raw time-series signal
        segment_indexes (list):
            List containing the unique indices of segments
        window (float):
            Window length in second
        shift (float):
            Shift length in second

    Returns:
        sigs_list (list):
            List containing the tensors of the old and the newly added time-series signals
        sig_rangel_list (list):
            List containing the old and the newly added intervals (timestamps) of the speech segments
        sig_indexes (list):
            List containing the old and the newly added unique indices of segments
    r   rR   rQ   rC  )rk  r  r~  rL  r@  r  rS  )r=   r^   rI   r#  r  rE  r  r   )r  r~  rS  r  r  r  r@  rA  r  r  r  r  r   	range_spl
range_offsrange_trL  sigsr'  indsr   r   r   "get_online_subsegments_from_buffer	  s:   )
(


(
r  r   c                 C   sP   i }t | d | d | d d\}}t|}tt|D ]
}|| }|||< q|S )a"  
    Calculate cosine similarity values among speaker embeddings for each scale then
    apply multiscale weights to calculate the fused similarity matrix.

    Args:
        uniq_embs_and_timestamps: (dict)
            The dictionary containing embeddings, timestamps and multiscale weights.
            If uniq_embs_and_timestamps contains only one scale, single scale diarization
            is performed.

    Returns:
        scale_mapping_argmat (dict)
            Dictionary containing scale mapping information matrix for each scale.
    rp   rq   rr   )r   r   rr   )r   r   r   r=   )r   scale_mapping_argmatr   r   session_scale_mapping_listr   mapping_argmatr   r   r   get_scale_mapping_argmatV  s   

r  cont_stampsovl_spk_idxc              	   C   s   dd t t|D }t t|D ]'}t| D ] \}}| \}}}||| v r7|| | d| d|  qqg }	|D ]}
t|
dkrL|	t|
 q=|	S )a  
    Generate timestamps that include overlap speech. Overlap-including timestamps are created based on
    the segments that are created for clustering diarizer. Overlap speech is assigned to the existing
    speech segments in `cont_stamps`.

    Args:
        cont_stamps (list):
            Non-overlapping (single speaker per segment) diarization output in string format. Each line
            contains the start and end time of segments and corresponding speaker labels.
        ovl_spk_idx (list):
            List containing segment index of the estimated overlapped speech. The start and end of
            segments are based on the single-speaker (i.e., non-overlap-aware) RTTM generation.

    Returns:
        total_ovl_cont_list (list):
            Rendered diarization output in string format. Each line contains the start and end time of
            segments and corresponding speaker labels. This format is identical to `cont_stamps`.
    c                 S      g | ]}g qS r   r   rK   r+   r   r   r   rM         z&get_overlap_stamps.<locals>.<listcomp>r   rv  r   )r   r=   r^   r   ru   r   r   )r  r  ovl_spk_cont_listrx  r   cont_a_liner   r   r   total_ovl_cont_listovl_cont_listr   r   r   get_overlap_stampsr  s   r  estimated_num_of_spksmin_thresholdoverlap_infer_spk_limitc                 C   s    || d |d  |d   }|S )a  
    This function controls the magnitude of the sigmoid threshold based on the estimated number of
    speakers. As the number of speakers becomes larger, diarization error rate is very sensitive
    to overlap speech detection. This function linearly increases the threshold in proportion to
    the estimated number of speakers so more confident overlap speech results are reflected when
    the number of estimated speakers is relatively high.

    Args:
        estimated_num_of_spks (int):
            Estimated number of speakers from the clustering result.
        min_threshold (float):
            Sigmoid threshold value from the config file. This threshold value is the minimum
            threshold when `estimated_num_of_spks=2`.
        overlap_infer_spk_limit (int):
            If the `estimated_num_of_spks` is less than `overlap_infer_spk_limit`, overlap speech
            estimation is skipped.

    Returns:
        adaptive_threshold (float):
            Threshold value that is scaled based on the `estimated_num_of_spks`.
    r   rQ   r   )r  r  r  adaptive_thresholdr   r   r   get_adaptive_threshold  s   r  clus_labels
msdd_predsc                 K   s  | d |jd }dd t|D }|t|d k }g }|d r,t||d |d }n|d }t| D ]\}}	| d |d|f |k    }
|d|f   }|d rbt|	d	 }nt	
|d|f   d
d
d d }t|
dkr|rt	
|d
d
d }|d
|d   D ]}|t|kr|| | q||	d  d|	d  d|  q4t|}t|}t||}||fS )a  
    Generate speaker timestamps from the segmentation information. If `use_clus_as_main=True`, use
    clustering result for main speaker labels and use timestamps from the predicted sigmoid values.
    In this function, the main speaker labels in `maj_labels` exist for every subsegment step, while
    overlap speaker labels in `ovl_labels` only exist for segments where overlap speech occurs.

    Args:
        clus_labels (list):
            List containing integer-valued speaker clustering results.
        msdd_preds (list):
            List containing tensors of the predicted sigmoid values. Each tensor has shape of:
            (Session length, estimated number of speakers).
        params:
            Parameters for generating RTTM output and evaluation. Parameters include:
                infer_overlap (bool): If False, overlap speech will not be detected.
                use_clus_as_main (bool): Add overlap-speech detection from MSDD to clustering results.
                                         If False, only MSDD output is used for constructing output
                                         RTTM files.
                overlap_infer_spk_limit (int): Above this limit, overlap-speech detection is bypassed.
                use_adaptive_thres (bool): Boolean that determines whether to use adaptive thresholds
                                           depending on the estimated number of speakers.
                max_overlap_spks (int): Maximum number of overlap speakers detected. Default is 2.
                threshold (float): Sigmoid threshold for MSDD output.

    Returns:
        maj_labels (list):
            List containing string-formatted single-speaker speech segment timestamps and corresponding
            speaker labels.
            Example: [..., '551.685 552.77 speaker_1', '552.99 554.43 speaker_0', '554.97 558.19 speaker_0', ...]
        ovl_labels (list):
            List containing string-formatted additional overlapping speech segment timestamps and
            corresponding speaker labels. Note that `ovl_labels` includes only overlapping speech that
            is not included in `maj_labels`.
            Example: [..., '152.495 152.745 speaker_1', '372.71 373.085 speaker_0', '554.97 555.885 speaker_1', ...]
    r   rR   c                 S   r  r   r   r  r   r   r   rM     r  z/generate_speaker_timestamps.<locals>.<listcomp>r  use_adaptive_thres	thresholduse_clus_as_mainr   NrQ   max_overlap_spksr   rv  )squeezerv   r   r,   r  r^   r   r   r[  rU  argsortsumru   r   r   r  )r  r  paramsr  overlap_speaker_listinfer_overlapmain_speaker_linesr  seg_idxcluster_labelspk_for_seg
sm_for_segmain_spk_idxidx_arrr  r  
maj_labels
ovl_labelsr   r   r   generate_speaker_timestamps  s8   
&

 ($
r  c                 C   sv   g }t | ddd(}t| D ]\}}| }t|}t|d }|| qW d   |S 1 s4w   Y  |S )zRRetrieve `uniq_id` values from the given manifest_file and save the IDs to a list.r4   utf-8encodingr   N)r9   r^   r:   r"   r    r!   r   ru   )r5  uniq_id_listrA   r   r   r#   r   r   r   r   get_uniq_id_list_from_manifest  s   

r  r  
preds_listc                 C   sF   dd | D }t |D ]\}}t|j}|| |j|| g q|S )a  
    Create session-level dictionary containing data needed to construct RTTM diarization output.

    Args:
        uniq_id_list (list):
            List containing the `uniq_id` values.
        test_data_collection (collections.DiarizationLabelEntity):
            Class instance that is containing session information such as targeted speaker indices,
            audio filepath and RTTM filepath.
        preds_list (list):
            List containing tensors of predicted sigmoid values.

    Returns:
        session_dict (dict):
            Dictionary containing session-level target speakers data and predicted simoid values in tensor format.
    c                 S   s   i | ]}|g qS r   r   r  r   r   r   rW     rm   z#get_id_tup_dict.<locals>.<dictcomp>)r^   r   
audio_fileru   target_spks)r  test_data_collectionr  session_dictr   r   r   r   r   r   get_id_tup_dict  s
   
r  c                 C   s   t j|d}|dkrt j|rt| t | t| dd}t j|d}t	d|  t j|s=t
||dd i }|d  D ]3\}\}	}
t j|d	| d
}t j|spt|||	|
dd t	d| d|  t|}|||< qEt||}|S )a  
    This function is needed for preparing diarization training data for multiscale diarization decoder (MSDD).
    Prepare multiscale timestamp data for training. Oracle VAD timestamps from RTTM files are used as VAD timestamps.
    In this function, timestamps for embedding extraction are extracted without extracting the embedding vectors.

    Args:
        manifest_filepath (str):
            Input manifest file for creating audio-to-RTTM mapping.
        _out_dir (str):
            Output directory where timestamp json files are saved.

    Returns:
        multiscale_args_dict (dict):
            - Dictionary containing two types of arguments: multi-scale weights and subsegment timestamps
              for each data sample.
            - Each data sample has two keys: `multiscale_weights` and `scale_dict`.
                - `multiscale_weights` key contains a list containing multiscale weights.
                - `scale_dict` is indexed by integer keys which are scale index.
            - Each data sample is indexed by using the following naming convention:
                `<uniq_id>_<start time in ms>_<end time in ms>`

                Example: `fe_03_00106_mixed_626310_642300`
    speaker_outputsr   T)rB   zoracle_vad_manifest.jsonz/Extracting oracle VAD timestamps and saving at )r6  rX   r   z.json)r>  r?  r@  rA  r6  z3Subsegmentation for timestamp extracted for: scale-z at )r   r   r   r   shutilrmtreemakedirsrG   r   r;   r:  r   rN  extract_timestampsr   )manifest_filepath_out_dirrj   global_rankspeaker_dirsplit_audio_rttm_map_speaker_manifest_pathmultiscale_timestamps_by_scaler   r@  rA  subsegments_manifest_pathr   multiscale_timestamps_dictr   r   r   prepare_split_data  s8   



r  c           	      C   s   t d|  d i }t| ddd<}t| D ],\}}| }t|}|d }||vr2g ||< |d }||d  }|| ||g qW d	   |S 1 sQw   Y  |S )
a  
    This method extracts timestamps from segments passed through manifest_file.

    Args:
        manifest_file (str):
            Manifest file containing segmentation information.
    Returns:
        time_stamps (dict):
            Dictionary containing lists of timestamps.
    zExtracting timestamps from z  for multiscale subsegmentation.r4   r  r  r   r'   r(   N)	r   r;   r9   r^   r:   r"   r    r!   ru   )	r5  r   rA   r   r   r#   r   r   r   r   r   r   r  X  s"   



r  manifest_file_pathclus_label_dictc                 K   sT  t | }g }g g }}d}t| ddd}	t|	 D ]x\}
}t|}|| }|| }|t| t|||
 fi |\}}|d rH|| }n|}t||d}|d rdt	|dd	 d
}t
|||d  |||g |dd}|durtj|r|st|}t||d}|||g qd}g }qW d   ||fS 1 sw   Y  ||fS )a  
    Create RTTM files that include detected overlap speech. Note that the effect of overlap detection is only
    notable when RTTM files are evaluated with `ignore_overlap=False` option.

    Args:
        manifest_file_path (str):
            Path to the input manifest file.
        clus_label_dict (dict):
            Dictionary containing subsegment timestamps in float type and cluster labels in integer type.
            Indexed by `uniq_id` string.
        msdd_preds (list):
            List containing tensors of the predicted sigmoid values.
            Each tensor has shape of: (Session length, estimated number of speakers).
        params:
            Parameters for generating RTTM output and evaluation. Parameters include:
                infer_overlap (bool): If False, overlap-speech will not be detected.
            See docstrings of `generate_speaker_timestamps` function for other variables in `params`.

    Returns:
        all_hypothesis (list):
            List containing Pyannote's `Annotation` objects that are created from hypothesis RTTM outputs.
        all_reference
            List containing Pyannote's `Annotation` objects that are created from ground-truth RTTM outputs
    Fr4   r  r  r  r   r   c                 S   s   t |  d S )Nr   )rI   r   )r   r   r   r   <lambda>  r  z(make_rttm_with_overlap.<locals>.<lambda>)keyr&   NT)rG   r9   r^   r:   r$   ru   r=   r  r   r\   r   r>   r   r   r   r   )r  r  r  r  rC   manifest_file_lengths_listr   r   r   rA   r   r   r   manifest_dicr  r  r  
hyp_labelsr   r   r   r   r   r   r   make_rttm_with_overlapr  s>   


r  r   r   r   r   all_uemsr   c                 C   s  t |ddt |dd}}t| t| d}	t|	|d}
|durItj|rIt| d| dd}|
	| W d   n1 sDw   Y  |
||
g |d	d}|durtj|r||| gg}t|}|}t||d}t||d
}|
| |
||g |||fS )a  
    Convert speaker timestamps to pyannote.core.Timeline object.

    Args:
        speaker_timestamps (List[Tuple[float, float]]):
            Timestamps of each speaker: start time and end time of each speaker.
        uniq_id (str):
            Unique ID of each speaker.
        audio_rttm_values (Dict[str, str]):
            Dictionary of manifest values.
        all_hypothesis (List[Tuple[str, pyannote.core.Timeline]]):
            List of hypothesis in pyannote.core.Timeline object.
        all_reference (List[Tuple[str, pyannote.core.Timeline]]):
            List of reference in pyannote.core.Timeline object.
        all_uems (List[Tuple[str, pyannote.core.Timeline]]):
            List of uems in pyannote.core.Timeline object.
        out_rttm_dir (str | None):
            Directory to save RTTMs

    Returns:
        all_hypothesis (List[Tuple[str, pyannote.core.Timeline]]):
            List of hypothesis in pyannote.core.Timeline object with an added Timeline object.
        all_reference (List[Tuple[str, pyannote.core.Timeline]]):
            List of reference in pyannote.core.Timeline object with an added Timeline object.
        all_uems (List[Tuple[str, pyannote.core.Timeline]]):
            List of uems in pyannote.core.Timeline object with an added Timeline object.
    r'   Nr(   )rt  ru  r   /r   rS   r&   r   )rI   r>   r|  r=   r   r   r   r   r9   
write_rttmru   r   get_uem_object)rt  r   r   r   r   r  r   r'   r   r  r   r   r   	uem_linesorg_ref_labelsr   r   uem_objr   r   r   timestamps_to_pyannote_object  s(   "$

r  r  c                 C   s8   t |d}| D ]}|\}}|tt|t| q|S )a  
    Generate pyannote timeline segments for uem file.

     <UEM> file format
     UNIQ_SPEAKER_ID CHANNEL START_TIME END_TIME

    Args:
        uem_lines (list): list of session ID and start, end times.
            Example:
            [[0.0, 30.41], [60.04, 165.83]]
        uniq_id (str): Unique session ID.

    Returns:
        timeline (pyannote.core.Timeline): pyannote timeline object.
    r   )r
   r  r	   rI   )r  r   timelineuem_stt_end
start_timeend_timer   r   r   r    s
   
r  绽|=c                 C   sP   | | j dd } |r| | jdd|  } tjtjj| ddddd}| | } | S )z
    Mean and l2 length normalize the input speaker embeddings

    Args:
        embs: embeddings of shape (Batch,emb_size)
    Returns:
        embs: normalized embeddings of shape (Batch,emb_size)
    r   )axisr   rR   )ordr  rQ   )meanstdrU  expand_dimslinalgnorm)embsuse_stdepsembs_l2_normr   r   r   embedding_normalize  s   	r  c                   @   sz   e Zd ZdZdefddZdejdejdeej deee	  d	ee d
e	de	de
eej eee	  ee f fddZdS )OnlineSegmentora:  
    Online Segmentor for online (streaming) diarizer.
    - The class instances created by this class takes time-series signal from the audio buffer and
      creates subsegments for embedding extraction.
    - Since online segmentation is based on a short audio buffer, the methods in this class extracts
      a few subsegments from the given intervals for the raw time-series signal.

    Attributes:
        frame_start (float):
            Start of the middle chunk
        buffer_start (float):
            Start of the entire buffer
        buffer_end (float):
            End of the entire buffer
        sample_rate (int):
            Sampling rate of the input time-series signal
        cumulative_speech_labels (Tensor):
            Torch tensor matrix containing culmulative VAD (speech activity) timestamps
    rS  c                 C   s(   d| _ d| _d| _|| _tg | _d S )Nr   )r}  r  r~  rS  rs   rt   r  )selfrS  r   r   r   __init__&  s
   zOnlineSegmentor.__init__r  r  segment_raw_audior  r  r@  rA  r   c              
   C   sF  | j dkrt|dkr%|jd dkr%t|d d d|d d< |}|| _n@t| j|\}	}
|d|
 }|d|
 }|d|
 }t|t|  krRt|ksWtd tdt| j| j	| j||	\}| _t
| j | j	| j|||||d\}}}|| || || t|t|  krt|kstd td|||fS )at  
        Remove the old segments that overlap with the new frame (self.frame_start)
        cursor_for_old_segments is pointing at the onset of the t_range popped most recently.

        Frame is in the middle of the buffer.

        |___Buffer___[___________]____________|
        |____________[   Frame   ]____________|

        | <- buffer start
        |____________| <- frame start


        Args:
            audio_buffer (Tensor):
                Tensor containing raw time-series signal
            vad_timestamps (Tensor):
                Tensor containing VAD intervals (start and end timestamps)
            segment_raw_audio (list):
                List containing the previously added tensors of the raw time-series signal segments
            segment_range_ts (list):
                List containing the previously added intervals (start and end timestamps) of each segment
            segment_indexes (list):
                List containing the previously added global integer indicies of the segments from
                start to current cursor
            window (float):
                Window length in second
            shift (float):
                Shift length in second

        Returns:
            segment_raw_audio (list):
                List containing the newly added tensors of the raw time-series signal
            segment_range_ts (list):
                List containing the newly added interval (start and end timestamps) of each segment
            segment_indexes (list):
                List containing the newly added global integer indicies of the segments from
                start to current cursor
        r   r   Nz8Scale-wise segment information has a mismatch in length.)r  r~  rS  r  r  r  r@  rA  z-Segment information has a mismatch in length.)r  r=   rv   r  r  r  r}  r_   r  r~  r  rS  r   )r  r  r  r  r  r  r@  rA  r  r  r  r  r  r  r   r   r   run_online_segmentation-  sL   
1
	



z'OnlineSegmentor.run_online_segmentationN)__name__
__module____qualname____doc__r,   r  rs   Tensorr   rI   r   r  r   r   r   r   r    s(    
	r  )r%   )F)r   )T)r   )r   r   )Fr   )Nr;  r<  r=  F)rO  r   FrP  rQ  )Fr  )]r   r    re  r   r  copyr   typingr   r   r   r   r   rU  	soundfiler   rs   omegaconf.listconfigr   pyannote.corer   r	   r
   r   (nemo.collections.asr.data.audio_to_labelr   4nemo.collections.asr.parts.utils.longform_clusteringr   3nemo.collections.asr.parts.utils.offline_clusteringr   r   
nemo.utilsr   r   r   r$   r3   rG   rk   r   r   r   r   r   r   r   r   r   r   r,   r   boolr   r   r   r   r  r  rI   r  r  r!  r%  r&  r.  r4  r:  rN  rd  rE  r  rn  rq  rs  r|  r  r  r  r  dictr  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r   <module>   s  
	
2L.


j

";,*+$
+
=	


&<
"
C


%
 
H	
$M 
G?
=

9
