o
    }oi                  
   @   s  d dl mZmZ d dlZd dlmZmZmZmZm	Z	m
Z
mZ d dlmZ dejdejdejdejd	ejf
d
dZdejd	ejfddZejjdejdejd	ejfddZdejdeded	ejfddZdedejded	ejfddZdejdejdejd	eejejf fdd Zd!ejd"ed	eejejf fd#d$Zd%ejd&ed'edejfd(d)Zd*ejd+ed	efd,d-ZG d.d/ d/ejjZdS )0    )ListTupleN)NMESCSpeakerClusteringSpectralClusteringget_scale_interpolated_embsgetAffinityGraphMatgetCosAffinityMatrixsplit_input_data)linear_sum_assignmentU_setcmm_Pcmm_QPandQreturnc                 C   s  dd t t| D }dd |D }tt|t|f|j}tt|t|f|j}d|tt||f< d|tt||f< dt|j|j|j }t	|\}	}
tdt| |j}t |
j
d D ]}||vrx|||< qm|
| ||< qm|S )aP  
    Find a mapping that minimizes the matching cost between the label P and Q.
    One-hot encodding is employed to represent sequence and calculate the cost.

    Args:
        U_set (list):
            Whole set of the estimated speakers
        cmm_P (Tensor):
            Length-matched old sequence
        cmm_Q (Tensor):
            Length-matched new sequence
        PandQ (Tensor):
            Tensor containing the indices of the speakers that are in both old and new sequences

    Returns:
        mapping_array (np.array):
            Mapped labels that minimizes the cost
    c                 S   s   g | ]}|gqS  r   .0xr   r   f/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/online_clustering.py
<listcomp>F   s    z+get_lsa_speaker_mapping.<locals>.<listcomp>c                 S   s   g | ]}t | qS r   )intitemr   r   r   r   r   G   s       r   )rangelentorchzerostodevicearangematmulTr   shape)r   r   r   r   all_spks_labelscommon_indsenc_Penc_Qcost_col_indmapping_arrayr   r   r   r   get_lsa_speaker_mapping1   s   
r-   Y_newc                 C   s`   | j }t|  d tj|}tt|d |}tt||||< ||  S )a~  
    Force the unique indices of the labels to use the lowest numbers.

    Example:
        >>> Y_new = [3, 3, 3, 4, 4, 5]
        >>> get_minimal_indices(Y_new)
        Return:
            [0, 0, 0, 1, 1, 2]

    Args:
        Y_new (Tensor):
            Tensor containing cluster labels

    Returns:
        (Tensor): Newly mapped cluster labels that has minimized indicies
    r   r   )	r    r   uniquesortr   longr!   maxr   )r.   r    Y_new_enlistedsequencer   r   r   get_minimal_indices_   s
    r5   Y_oldc                 C   s   t |}t| dkr|}ne| |j|}}tt||g}t||g}tj|dd\}}|t|dd  }	t	|j
d |j
d }
|d|
 |d|
 }}t|dkrftddg|j}nt||||	d}|| }t |}|S )a  
    Run Hungarian (linear sum assignment) algorithm to find the best permutation mapping between
    the cumulated labels in history and the new clustering output labels.

    Args:
        Y_old (Tensor):
            Cumulated diarization labels. This will be concatenated with history embedding speaker label
            then compared with the predicted label Y_new.
        Y_new (Tensor):
            Contains predicted labels for reduced history embeddings concatenated with the predicted label.
            Permutation is not matched yet.

    Returns:
        mapping_array[Y] (Tensor):
            An output numpy array where the input Y_new is mapped with mapping_array.
    r   T)return_countsr   N)r   r   r   r   )r5   r   r   r    r   r/   catwheregtminr$   tensorr-   )r6   r.   matched_outputP_rawQ_rawr   PQa_cat_bcountsr   min_lenPQr,   r   r   r   stitch_cluster_labelsw   s    rF   removable_counts_matremain_countnum_clusc                 C   sp  | j }tjtdg||  d tdg|gdd}| jddd }|dd |dd  d| }t|dd|| }tj|dd}|}	d}
t|D ]
\}
}||k r\ nqR|
dkrt	|
D ] }| |d||    || 8  < |	t
||  ||  8 }	qe|dksJ d|	||
  }|	||
  }| |d||
    |8  < | |d|   d8  < | 
 S )	a  
    Calculate removable counts based on the arguments and calculate how many counts should be
    removed from the each cluster. This function has `O(N)` (N = num_clus) time complexity to
    return the desired `removable_counts_mat`.

    Example:

        The original input to `get_merge_quantity` function:
        >>> pre_clus_labels = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2]
        >>> num_to_be_removed = 3
        >>> min_count_per_cluster = 2

        Histogram: (`min_count_per_cluster`=2 is removed)
        0 |*****
        1 |***
        2 |*

        Inputs:
            >>> removable_counts_mat = [5, 3, 1]
            >>> remain_count = 6
            >>> num_clus = 3
        
        Interim results:
            >>> diff_counts 
            [1, 2, 2]
            >>> gradual_counts
            [3, 4, 2]
            >>> cumsum_counts
            [3, 7, 9]

        Return:
            >>> removable_counts_mat 
            [2, 1, 0]

    Args:
        removable_counts_mat (Tensor):
            Tensor containing how many vectors could be removed from each cluster
        remain_count (int):
            Integer value that indicates the number of vectors removed from the total set
        num_clus (int):
            Number of clusters in the given label sequence (cardinality of a label set)

    Returns:
        removable_counts_mat (Tensor):
            Tensor containing the number of vectors should be removed from each cluster
    r   dimT
descendingr   Nr   z&remain_count should never be negative.)r    r   r8   r<   r   r0   r!   cumsum	enumerater   r   r   )rG   rH   rI   r    zero_padded_countsremovable_count_argsdiff_countsgradual_countscumsum_countsremain_count_remindnumknd
num_labels
rem_labelsr   r   r   calculate_removable_counts   s0   /.  r[   num_to_be_removedpre_clus_labelsmin_count_per_clusterc           	      C   s
  | |j d d krtd|  d|j d |  }t|}tt|}||| k r3td||  t|gt| |j}t	||f
dd }|tt|8 }|| }t|||}t| | krltdt|dkr|t|| dkstd| |S )a  
    Determine which embeddings we need to reduce or merge in history buffer.
    We want to merge or remove the embedding in the bigger cluster first.
    At the same time, we keep the minimum number of embedding per cluster
    with the variable named min_count_per_cluster.

    Constraint:
        - Each cluster should keep the number of vectors over `min_count_per_cluster`.
        - In total, `num_to_be_removed` of vectors should be removed from the total buffer.
        - While merging embeddings, minimize the gap between quantities between clusters.

    Example:
        >>> num_to_be_removed = 3
        >>> pre_clus_labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2]
        >>> min_count_per_cluster = 2
        >>> get_merge_quantity(num_to_be_removed, pre_clus_labels, min_count_per_cluster)
        Return:   
            torch.tensor([2, 1, 0]) 
        >>> # Sum should be equal to `num_to_be_removed` which is 3

    Args:
        num_to_be_removed: (int)
            the quantity of the newly obtained embedding from the new stream of input.
        pre_clus_labels: (Tensor)
            the speaker labels of (the history_embedding_buffer_emb) + (the new embeddings to be added)
        min_count_per_cluster: (int)
            Minimum vector quantity for each cluster

    Returns:
        removable_counts_mat: (Tensor)
            Tensor containing the number of vectors should be removed from each cluster
    r   r   znum_to_be_removed: z/ should be less than pre_clus_labels length - 1z4The remaining embedding vectors should be more than zKSum of `removable_counts_mat` is not equal to `num_to_be_removed` variable.zREvery value in `removable_counts_mat` should be always non-negative value but got )r$   
ValueErrorr   bincountr   r/   r<   r   r    stackr;   r   sumr[   all)	r\   r]   r^   rH   spk_freq_countrI   min_seg_countmin_seg_count_matrG   r   r   r   get_merge_quantity   s&   #
 rg   selected_indsemb_ndxpre_cluster_labelsc           
      C   s   |j d |j d krtdtj|| ddf dd}||  }|  }g }t|j d D ]}||vr7|| q,t|}t|} |j d dkrW|d}	|d}|	|fS t	|| |f}	t
|| |d f}|	|fS )a  
    Merge feature (embedding) vectors estimated to be the same cluster label.

    Args:
        selected_inds (Tensor):
            Selected indices for merging
        emb_ndx (Tensor):
            Feature (embedding) vectors
            Dimension: (original vector counts) x (feature dimension)
        pre_cluster_labels (Tensor):
            Original cluster labels before merging

    Returns:
        merged_vecs (Tensor):
            Merged feature vectors that are concatenated
            Dimension: (merged vector counts) x (feature dimension)
        merged_clus_labels (Tensor):
            Cluster labels for the merged feature vectors
            Dimension: (merged vector counts)
    r   z9pre_cluster_labels and emb_ndx have mismatch in dimensionNrJ   )r$   r_   r   meantolistr   appendr<   	unsqueezevstackhstack)
rh   ri   rj   avg_embmerged_clus_labelsselected_inds_listbypass_inds_listkbypass_indsmerged_vecsr   r   r   merge_vectors/  s&   




rx   affinity_mat	n_closestc                 C   sz   t | jd d }||krtd| d| d| | d}tj|ddd|d  }tj|dd|d d }||fS )	a  
    Get the indices of the embedding vectors we want to merge.

    Example:
        >>> n_closest = 2
        >>> affinity_mat = [[1.0, 0.2, 0.8],
                            [0.2, 1.0, 0.4],
                            [0.8, 0.4, 1.0]]
        >>> affinity_mat.sum(0) 
        [2.0, 1.6, 2.2]

        # The closest two embedding vectors are at index 0 and 2.

    Args:
        affinity_mat: (Tensor)
            Symmetric affinity matrix of the given embedding vector set.
        n_closest (int):
            The amount of vector counts that are expected to be removed from the set
            Example:
                Input: 10 vectors in a set
                n_closest = 5
                (5+1) vectors are merged into 1 vector
                Output: 5 vectors in a set

    Returns:
        idx_aff_sum (torch.Tensor):
            Indices of the closest `n_closest` embedding vectors
        rest_inds (torch.Tensor):
            Indices of the complementary set of the indices in `idx_aff_sum`
    r   r   zGot n_closest of z: z is bigger than comb_limit TrL   N)r   r$   r_   rb   r   argsort)ry   rz   
comb_limitsum_cmatidx_aff_sum	rest_indsr   r   r   get_closest_embeddingsZ  s   
r   pre_embstarget_spk_idxmerge_quantityc                 C   sT  | j d |j d krtdt||kd }|j d }|dkr||j d d kr8td| d|j d d  t| }|dd|f |ddf }|j d |j d krXtdt||\}}	|| | | }
}||	 d  || f}t|||
\}}|| |j d krtd|j d  d	||  d
n| | }|| }|tdf}|||fS )aA  
    Reduce the number of embedding vectors by merging the closest embedding vectors.
        - This merging algorithm is based on the assumption that the closest embeddings 
          are the most redundant embedding vectors.
        - The closest embedding vectors are chosen by selecting the highest top-N sum of 
          each column in a given affinity matrix.
        - If merge_quantity is N, we choose (N+1) vectors into 1 embedding vector. 
          Thus, we reduce N embeddings in the original embedding vector set.

    Example:
        >>> merge_quantity = 1 # We merge 1+1 = 2 embedding vectors
        >>> affinity_mat = [[1.0, 0.2, 0.8],
                            [0.2, 1.0, 0.4],
                            [0.8, 0.4, 1.0]]
        >>> affinity_mat.sum(0) 
        [2.0, 1.6, 2.2]

        The first and the third embedding vectors are merged into one embedding vector.
        >>> index_mapping # (bypassed indices, merged indices)
        ([1], [0, 2]) 

    Args:
        pre_embs (Tensor):
            Potential Embedding vectors to be merged
        affinity_mat (Tensor):
            The affinity matrix of the `pre_embs`
        target_spk_idx (int):
            The targeted speaker index for merging
        merge_quantity (int):
            The count of embeddings to be reduced
        pre_clus_labels (list)
            The original cluster (speaker) index

    Returns:
        merged_embs (torch.Tensor):    
            The merged embedding vectors.
        merged_clus_labels (torch.Tensor):
            The cluster (speaker) indices for the merged embedding vectors.
        index_mapping (Tuple[torch.Tensor, torch.Tensor]):
            A tuple containing the indices of the original embeddings that were not merged (`bypassed indices`)
            and the indices of the new merged embeddings (`merged indices`).
    r   z<Dimension mismatch between `pre_embs` and `pre_clus_labels`.r   zmerge_quantity z4 should not be larger than target_emb_index length: NzrDimension mismatch between targeted speaker affinity `affinity_mat` and targeted speaker index `target_emb_index`.zReducer output z' is not matched to the target quantity .)	r$   r_   r   r9   r	   r   r0   rx   r!   )r   r   r   r]   target_emb_indexorg_sizetotal_affinity_matry   rh   r   spk_cluster_labelsselected_embsindex_mappingmerged_embsrr   r   r   r   run_reducer  s8   -

r   matlabelc                 C   s   t t| |kd d S )a4  
    Get the index of the first element are specified by `index` variable.

    Args:
        mat (Tensor):
            Source matrix filled with indices
        label (int):
            Label which we want to find the first occuring index

    Returns:
        (int): The first index of the given label
    r   )r   r   r9   )r   r   r   r   r   get_first_arg_index  s   r   c                       s  e Zd ZdZ											
					dAdededededededededededededededef fddZdej	d ed!e
eef fd"d#Zd$ed!ej	fd%d&Zd ed$ed!efd'd(Zdej	d ed!e
eej	f fd)d*Zd+ej	d,ej	d!e
eeej	ej	f fd-d.Zd+ej	d/ej	d!ej	fd0d1Zd+ej	d/ej	d!e
ej	ef fd2d3Zd+ej	d/ej	d!e
ej	ef fd4d5Zd6ej	d7ed!ej	fd8d9Z	dBdedededed eded!ej	fd:d;Z	<					=	dCd>ej	d/ej	dededededed eded!ej	fd?d@Z  ZS )DOnlineSpeakerClusteringau  
    Online clustering method for speaker diarization based on cosine similarity.

    Regular Clustering Attributes:

        max_num_speakers (int):
            The upper bound for the number of speakers in each session
        max_rp_threshold (float):
            Limits the range of parameter search.
            Clustering performance can vary depending on this range.
            Default is 0.15.
        enhanced_count_thres (int):
            For the short audio recordings, clustering algorithm cannot
            accumulate enough amount of speaker profile for each cluster.
            Thus, function `getEnhancedSpeakerCount` employs anchor embeddings
            (dummy representations) to mitigate the effect of cluster sparsity.
            enhanced_count_thres = 40 is recommended.
        sparse_search_volume (int):
            Number of p_values we search during NME analysis.
            Default is 30. The lower the value, the faster NME-analysis becomes.
            Lower than 20 might cause a poor parameter estimation.
        fixed_thres (float):
            A fixed threshold for finding p-closest neighbors in affinity matrix for clustering.
            If fixed_thres value is provided, NME-analysis process will be skipped.
            This value should be optimized on a development set to obtain a quality result.
            Default is None and performs NME-analysis to estimate the threshold.
        min_samples_for_nmesc (int):
            The minimum number of samples required for NME clustering. This avoids
            zero p_neighbour_lists. If the input has fewer segments than min_samples,
            it is directed to the enhanced speaker counting mode.
        sparse_search (bool):
            Toggle sparse search mode. If True, limit the size of p_value_list to sparse_search_volume.
        cuda (bool):
            Use cuda for Eigen decomposition if cuda=True.

    Additional Online Processing Attributes:

        history_buffer_size (int):
            - This is a buffer where diarization history is saved in the form of averaged speaker embedding vector.
            - The values in [50, 200] range is recommended while the system requires bigger buffer size for
              sessions with larger number of speakers.
        current_buffer_size (int):
            - This is a buffer which process the most recent speaker embedding vector inputs.
              current-buffer is first-in-first-out (FIFO) queue where the embeddings accepted earlier
              get to merged and saved to history buffer.
            - In general, [50, 200] range is recommended and the performance can be sensitive on this buffer size.
        min_spk_counting_buffer_size (int):
            Integer number for speaker counting buffer. Number of speakers are estimated through a small buffer
            and the number is obtained by taking majority vote.
        min_frame_per_spk (int):
            Below this number, the system considers the whole input segments as a single speaker.
        p_update_freq (int):
            Frequency (interval) of updating p_value for NMESC algorithm.
        p_value_skip_frame_thres (int):
            After `frame_index` passes this number, `p_value` estimation is skipped for inference speed
        p_value_queue_size (int):
            `p_value` buffer for major voting
        use_temporal_label_major_vote (bool):
            Boolean that determines whether to use temporal majorvoting for the final speaker labels
        temporal_label_major_vote_buffer_size (int):
            Buffer size for major-voting the
        num_spk_stat (list):
            List of number of speakers for major voting. Number of speakers are estimated through 
            majority voting of `self.num_spk_stat` list.
        p_value_hist (list):
            List of p_values for major voting.
            To save the computation time, p_value is estimated every `p_update_freq` frames and
            saved to `self.p_value_hist`.

    Attributes for counters and buffers in streaming system:
        
        is_online (bool):
            - If self.is_online is False:
                FIFO queue does not push out any speaker embedding vector
            - If self.is_online is True:
                FIFO queue starts push out speaker embedding vectors and saving them into
                history buffer.
        max_embed_count (int):
            The maximum number of segments the streaming system has ever seen.
            This value keeps increasing as the system processes more and more segments.
        memory_margin (int):
            The margin that is added to keep the segmentation data in the streaming system
        minimum_segments_per_buffer (int):
            Maximum number of embedding vectors kept in history buffer per speaker.
            Example:
                history_buffer_size (history_n) = 100
                max_num_speakers = 4
                minimum_segments_per_buffer = 25
        history_buffer_seg_end (int):
            Index that indicates the boundary between history embedding sets and current processing buffer
            when history embedding vectors and current input embedding vectors are concatenated into a
            single matrix.

    Attributes for history buffer:

        history_embedding_buffer_emb (Tensor)
            Tensor containing speaker embedding vectors for saving the history of the previous
            speaker profile in the given audio session
        history_embedding_buffer_label (Tensor)
            Speaker label (cluster label) for embedding vectors saved in the history buffer
        Y_fullhist (Tensor)
            Tensor containing the speaker label hypothesis from start to current frame
       333333?(         
               2   F   max_num_speakersmax_rp_thresholdenhanced_count_thresfixed_thressparse_search_volumehistory_buffer_sizecurrent_buffer_sizemin_spk_counting_buffer_sizemin_frame_per_spkp_update_freqp_value_skip_frame_thresp_value_queue_sizeuse_temporal_label_major_vote%temporal_label_major_vote_buffer_sizecudac                    s   t    || _|| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _|| _|| _|| _|| _|| _tdg| _tdg| _d| _d| _d| _t| j| j | _d| _tg | _tg | _tg | _d S )Nr      Fr   )super__init__r   r   r   r   r   	history_n	current_nr   r   r   r   r   r   r   r   r   r<   num_spk_statp_value_hist	is_onlinemax_embed_countmemory_marginr   minimum_segments_per_bufferhistory_buffer_seg_endhistory_embedding_buffer_embhistory_embedding_buffer_label
Y_fullhist)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	__class__r   r   r   T  s4   
z OnlineSpeakerClustering.__init__mat_inframe_indexr   c           	      C   s   t || j| jdd| j| jdd|j| jd}t| jdks(|| j	k rB|| j
 dkrB| \}}| j| t| j| jkrB| jd dd | jD }tt|d  }||}|d |d  }}||fS )	a  
        To save the running time, the p-value is only estimated in the beginning of the session.
        After switching to online mode, the system uses the most common estimated p-value.
        Estimating p-value requires a plenty of computational resource. The less frequent estimation of
        p-value can speed up the clustering algorithm by a huge margin.

        Args:
            mat_in (Tensor):
                Tensor containing the affinity matrix for the current segments
            frame_index (int):
                Unique index for each segment and embedding vector

        Returns:
            est_num_of_spk: (int)
                The estimated number of speakers.
            p_hat_value: (int)
                The estimated p-value from NMESC method.
        TF   )
r   r   sparse_searchmaj_vote_spk_countr   r   nme_mat_sizeparallelismr    r   r   c                 S      g | ]}t |qS r   r   )r   pr   r   r   r         z=OnlineSpeakerClustering.onlineNMEanalysis.<locals>.<listcomp>r   )r   r   r   r   r   r    r   r   r   r   r   forwardrm   r   popr   moder<   r   getEigRatior   )	r   r   r   nmescest_num_of_spkp_hat_valuep_hat_int_listoutputg_pr   r   r   onlineNMEanalysis  s0   
z)OnlineSpeakerClustering.onlineNMEanalysisr   c                 C   s`   t |}| j| t| j| jkr| jd t dd | jD }t |}t |}|S )a3  
        Use a queue to avoid unstable speaker counting results.

        Args:
            est_num_of_spk (int):
                Estimated number of speakers

        Returns:
            est_num_of_spk (torch.Tensor):
                Estimated number of speakers from the speaker counting buffer.
        r   c                 S   r   r   r   )r   sr   r   r   r     r   zBOnlineSpeakerClustering.speaker_counter_buffer.<locals>.<listcomp>)	r   r<   r   rm   r   r   r   r`   argmax)r   r   num_spk_stat_tensornum_spks_bincountr   r   r   speaker_counter_buffer  s   


z.OnlineSpeakerClustering.speaker_counter_bufferc                 C   s   t |td|| j  S )a  
        Limit the estimated number of speakers in proportion to the number of speakers.

        Args:
            frame_index (int):
                Unique index for each segment and embedding vector
            est_num_of_spk (int):
                Estimated number of speakers
        
        Returns:
            (int) Estimated number of speakers capped by `self.min_frame_per_spk`
        r   )r;   r   r   )r   r   r   r   r   r   limit_frames_per_speaker  s   z0OnlineSpeakerClustering.limit_frames_per_speakerc                 C   s<   |  ||\}}t||}| |}| || }||fS )aS  
        Online version of speaker estimation involves speaker counting buffer and application of per-speaker
        frame count limit.

        Args:
            mat_in (Tensor):
                Raw affinity matrix containing similarity values of each pair of segments
            frame_index (int)
                Unique frame index of online processing pipeline

        Returns:
            est_num_of_spk (int):
                Estimated number of speakers
            affinity_mat (Tensor):
                Affinity matrix after applying the affinity threshold with `p_hat_value`
        )r   r   r   r   r   )r   r   r   r   r   ry   raw_est_num_of_spkr   r   r   online_spk_num_estimation  s
   

z1OnlineSpeakerClustering.online_spk_num_estimationemb_insegment_indexes_matrixc                 C   sx  t |d d }t || j }d}td}td}d}|| jkrt| jdkrD|| j| j  }t||}	|d|	 }| j	d| }n1| j
|}
}||
 }t t||
}t t||}t| j||| f}t| j| j	|
| f}|| jkrtd| d| j d|dkrtd	|jd |jd krtd
|jd  d|jd  dnd}|| _
t|| j| _||||fS )a
  
        This function performs the following tasks:
            1. Decide whether to extract more embeddings or not (by setting `is_update`)
        (Only if we need update):
            2. Calculate how many embeddings should be updated (set `new_emb_n` variable)
            3. Update history embedding vectors and save it to `pre_embs`.

        We only save the index and clustering label of each embedding.

        - Case-1: The very first step
            This else statement is for the very first diarization loop.
            This is the very first reduction frame.

        - Case-2: Number of embedding vectors is increased, therefore we need to update.
            Since there are new embeddings, we push the same amount (new_emb_n)
            of old embeddings to the history buffer.
            We should also update self.history_buffer_seg_end which is a pointer.
                update to history emb: emb_in[emb_idx_stt:emb_idx_end]
                update to history label: self.Y_fullhist[label_stt:_end]

        - Case-3: Number of embedding vectors is decreased
            If the number of embeddings is decreased compared to the last trial,
            then skip embedding merging.

        Variables:
            hist_curr_boundary (int):
                The current boundary of between history buffer and current buffer.
                This is the new history-current buffer boundary while self.history_buffer_seg_end is the old one.
                Thus, the new set of embedding vectors are collected from 
                `label_stt=self.hist_buffer_seg_end` to `label_end=hist_curr_boundary`.
            total_segments_processed_count (int):
                The number of segments that are processed so far in integer format.

        Args:
            emb_in (Tensor):
                Tensor containing embedding vectors
                Dimensions: (number of embedding vectors) x (embedding dimension)
            segment_indexes_matrix (Tensor):
                Tensor containing unique segment (embedding vector) index

        Returns:
            is_update (bool):
                Boolean indicates whether to update speaker embedding vectors.
            new_emb_n (int):
                The amount of embedding vectors that are exceeding FIFO queue size.
                new_emb_n is also an amount of embedding vectors that needs to be merged in history buffer.
            pre_embs (Tensor):
                Embedding vector matrix before merging.
                The subset of `pre_embs` embedding vectors will be merged.
                Dimensions: (number of embedding vectors) x (embedding dimension)
            pre_clus_labels (Tensor):
                A set of clustering labels for each embedding vector in `pre_embs`.
        r   r   r   TNzknew_emb_n should be less than or equal to current buffer size (self.current_n). Getting too many segments: z# for the given current buffer size zj. Please either (1) increase buffer size or (2) use longer segment lengths to get less number of segments.zGSegment counting error. `new_emb_n` should be a positve integer number.zF`pre_embs` and `pre_clus_labels` should have the same length, but got z and z respectively.F)r   r   r   emptyr   r   r   r   r   r   r   ro   rp   r   r_   r$   r2   )r   r   r   total_segments_processed_counthist_curr_boundary	new_emb_nr   r]   	is_updatehist_curr_boundary_emb_idx	label_stt	label_endemb_idx_sttemb_idx_endr   r   r   prepare_embedding_update  sV   8




	z0OnlineSpeakerClustering.prepare_embedding_updatebase_segment_indexesc                 C   sv   t || jkd }|| jd | jk r5| j|| jd  }t || d |df}t || |f}|S || }|S )a:  
        This function deals with edge cases when the number of segments decreases and the number of embedding falls
        short for the labels.

        - ASR decoder occasionally returns less number of words compared to the previous frame.
        - In this case, we obtain fewer embedding vectors for the short period of time. To match the pre-defined
          length, the last embedding vector is repeated to fill the voidness.
        - The repeated embedding will be soon replaced by the actual embeddings once the system takes new frames.

        Args:
            emb_in (Tensor):
                If self.is_online is False:
                    `pre_embs` contains only current speaker embedding inputs, which is FIFO queue
                If self.is_online is True:
                    `pre_embs` contains history buffer and FIFO queue
            base_segment_indexes (Tensor):
                Tensor containing unique segment (embedding vector) index

        Returns:
            emb_curr (Tensor):
                Length preserved speaker embedding vectors
        r   r   r   )r   r9   r   r$   r   tilero   )r   r   r   curr_clustered_segmentsdelta_countfill_in_embemb_currr   r   r   make_constant_length_embZ  s   z0OnlineSpeakerClustering.make_constant_length_embc                 C   s>  |  ||\}}}}g g }}|rat||| jd}	tt|	D ]\}
}t||
| |d\}}}|| || qt	|| _
t|| _| j
jd | jkrTtdt| j| jkr`tdn|| j
 || j | ||}|| || j| j d  t	|}t|}|jd t|krtd||fS )aU	  
        Merge the given embedding vectors based on the calculate affinity matrix.
        if `is_update` is True, update the history buffer .

        Args:
            emb_in (Tensor):
                If self.is_online is False:
                    `emb` contains only current speaker embedding inputs, which is FIFO queue
                If self.is_online is True:
                    `emb` contains history buffer and FIFO queue
            base_segment_indexes (Tensor):
                Tensor containing unique segment (embedding vector) index

        Returns:
            history_embedding_buffer_emb (Tensor):
                Matrix containing merged embedding vectors of the previous frames.
                This matrix is referred to as "history buffer" in this class.
            is_update (bool):
                Boolean indicates whether to update speaker

        Example:

            at the frame index where `is_online` turns to True:

            |------hist-buffer------|-----FIFO-queue-----|

            self.history_n = 20
            self.current_n = 10

            Step (1)
            |-----------------------|ABCDEF--------------|

            If we get two more segments, "NN" as in the description:
            history buffer = 20
            current buffer = 12

            Step (2)
            |-----------------------|ABCDEF--------------XY|
                                    |---------emb_in-------| 

            The newly accepted embeddings go through a FIFO queue (first come, first merge)
            history buffer = 22
            current buffer = 10

            Step (3)
            |-----------------------AB|CDEF--------------XY|
            |---------pre_embs--------|

            After merging (reducing) the embedding set gets back to the original size:
            history buffer = 20
            current buffer = 10

            Step (4)
            |======================|CDEF--------------XY|
            |-----hist_emb_buff----|
            
            After clustering, `self.Y_fullhist` is updated as:

            |0000000000011111111111|11110000110010010011|

            The dimension of `self.Y_fullhist` is (`history_n + current_n`) x 1

        self.history_buffer_seg_end (int):
            The total number of segments that have been merged from the beginning of the session.
            (=`hist_curr_boundary`)
        )r\   r]   r^   )r   r   r   r]   r   z3History embedding size is not maintained correctly.z/History label size is not maintained correctly.NzU`history_and_current_emb` has a mismatch in length with `history_and_current_labels`.)r   rg   r   rO   listr   r   rm   r   ro   r   rp   r   r$   r   r_   r   r   r   r   )r   r   r   r   r   r   r]   	total_embtotal_cluster_labelsclass_target_volspk_idxsub_cluster_numr   rr   r*   r   history_and_current_embhistory_and_current_labelsr   r   r   update_speaker_history_buffer|  sD   E




z5OnlineSpeakerClustering.update_speaker_history_bufferc                 C   sr   |j d | j| j  }t| jdkr|dkrtd|dkr.d| _| j||d\}}||fS d| _|}d}||fS )aZ  
        Choose whether we want to add embeddings to the memory or not.
        The processing buffer has size of (self.current_n + self.history_n).

        Case-1: If margin_seg_n > 0, this means we have more embedding vectors than we can hold in the processing buffer.
            - `is_online` should be `True`
            - reduce the number of embedding vectors by merging the closest ones.
                call `update_speaker_history_buffer` function

        Case-2: If margin_seg_n <= 0, this means that we can accept more embedding vectors and yet to fill the processing buffer.
            - `is_online` should be `False`
            - Replace `merged_emb` variable with the raw input `emb_in`.
            - `add_new` is `True`, since we are adding more embedding vectors to `merged_emb` variable.

        Args:
            emb_in (Tensor):
                If self.is_online is False:
                    `emb` contains only current speaker embedding inputs
            base_segment_indexes (Tensor):
                Tensor containing unique segment (embedding vector) index

        Returns:
            merged_emb (Tensor):
                Matrix containing merged embedding vectors of the previous frames.
                This matrix is referred to as "history buffer" in this class.
                If self.is_online is False:
                    `merged_emb` contains only current speaker embedding inputs
                If self.is_online is True:
                    `merged_emb` is a concatenated matrix with history embedding and current embedding inputs
            add_new (bool):
                Boolean that indicates whether there is a new set of segments. Depending on the VAD timestamps,
                the number of subsegments can be ocassionally decreased. If `add_new=True`, then it adds the newly
                acquired cluster labels.
        r   zThe number of incoming embedding vectors is larger than the total processing buffer size.Please either (1) increase the history and current buffer size (2) or use longer segment lengths to reduce number of segments.Tr   r   F)r$   r   r   r   r   r_   r   r   )r   r   r   margin_seg_n
merged_embadd_newr   r   r   get_reduced_mat  s   #
z'OnlineSpeakerClustering.get_reduced_matY_mergedr   c                 C   s   | j rJt| j| j| jd f}t||d|j}|rE|| j	d j
d | jkr.tdt| jd| j || j	d f}|| _|S | j}|S t| j|d|j}|| _|S )aN  
        This function matches the newly generated clustering label sequence with the existing speaker labels in the history buffer.
        `self.history_buffer_seg_end` is an integer index that tells to which point is history embedding contains from `self.Y_fullhist`.

        If embedding reducing is done correctly, we should discard  (0, self.history_n) amount and take
        (self.history_n, len(Y_merged)) from the new clustering output `Y_merged`.

        Args:
            Y_merged (Tensor):
                The newly generated clustering label sequence that may have different permutations with the existing
                speaker labels in the history buffer.
            add_new (bool):
                This variable indicates whether there is a new set of segments. Depending on the VAD timestamps,
                the number of subsegments can be occasionally decreased. If `add_new=True`, then it adds the newly
                acquired cluster labels.

        Returns:
            Y_out (Tensor):
                Permutation-matched speaker labels based on history buffer
        N)r6   r.   r   z!Update point sync is not correct.)r   r   rp   r   r   r   rF   r   r    r   r$   r   r_   )r   r   r   r6   	Y_matchedY_outr   r   r   match_labels'  s   $z$OnlineSpeakerClustering.match_labelsc	           
   
   C   s   | j ||||||||d}	|	S )z
        Wrapper function for torch.jit.script compatibility.
        NOTE: jit scripted classes only contain the methods which are included in the computation graph in the forward pass.
        )curr_embr   r   r   r   r   r   r   )forward_infer)
r   r  r   r   r   r   r   r   r   Yr   r   r   r   Q  s   
zOnlineSpeakerClustering.forward   r   r  c
                 C   s   || _ || _|| _|| _|| _|	r,|jtdks!|jtdkr,td| d| d| j||d\}
}|
j	d dkrEtj
dtjd	}nt|
}| ||\}}t||	|
jd
}|||
j}| j||d}|S )a  
        Perform speaker clustering in online mode. Embedding vector set `emb` is expected to be containing
        history embeddings to count the number of speakers.

        Args:
            curr_emb (Tensor):
                Current embedding vector input.
            base_segment_indexes (Tensor):
                Tensor containing unique segment (embedding vector) index
            max_num_speakers (int):
                Maximum number of speakers to be detected during online diarization session
            max_rp_threshold (float):
                Limits the range of parameter search.
                Clustering performance can vary depending on this range.
                Default is 0.25.
            max_rp_threshold (float):
                Limits the range of parameter search.
                Clustering performance can vary depending on this range.
                Default is 0.15.
            frame_index (int):
                Unique index for each segment (also each embedding vector)
            cuda (bool):
                Boolean that determines whether cuda is used or not
            device (torch.device):
                `torch.device` variable

        Returns:
            Y (Tensor):
                Speaker labels for history embeddings and current embedding inputs
        cpuzCUDA is enabled but the input z or z is not on the GPU.r   r   r   )r   )dtype)
n_clustersr   r    )r   r   )r   r   r   r   r   r    r   r_   r   r$   r   int32r	   r   r   r   r   r   )r   r  r   r   r   r   r   r   r   r   r   r   r  r   r   ry   spectral_modelrr   r   r   r   r  l  s    *$z%OnlineSpeakerClustering.forward_infer)r   r   r   r   r   r   r   r   r   r   r   r   Fr   F)F)r  r   r   r   r   r   F)__name__
__module____qualname____doc__r   floatboolr   r   Tensorr   r   r   r   r   r   r   r   r   r   r   r  __classcell__r   r   r   r   r     s    j	
 1-"
m"
$w43	

	
r   )typingr   r   r   3nemo.collections.asr.parts.utils.offline_clusteringr   r   r   r   r   r	   r
   3nemo.collections.asr.parts.utils.optimization_utilsr   r  r-   r5   jitscriptrF   r   r[   rg   rx   r   r   r   nnModuler   r   r   r   r   <module>   sb   !$	
.)P
>
$+,
U