o
    }oi?                     @   s  d dl mZmZmZ d dlZd dlmZmZ edfdej	dej	dej	fdd	Z
d
ej	dej	fddZedfdej	dej	dejdej	fddZdedfd
ej	dedededejf
ddZddd edfd
ej	dededededejdej	fddZd ej	d!edejdej	fd"d#Zd ej	dejdej	fd$d%Zdgd ej	d'ed(edej	fd)d*Zd+ej	d'edej	fd,d-Zd.ej	d/ej	d0ej	dejdeej	ej	f f
d1d2Zd3ej	d4ej	dej	fd5d6Zd7eej	 deej	 fd8d9Zd:ej	dej	fd;d<Zedfd=ej	d>eej	 d7eej	 dejdeej	eej	 f f
d?d@Zedfd=ej	d>eej	 d7eej	 dejdej	f
dAdBZd
ej	dej	fdCdDZdEej	dFedejdeej	ej	f fdGdHZdEej	dFedejdej	fdIdJZ dKej	dej	fdLdMZ!d:ej	dNedOedPedej	f
dQdRZ"	S	T	U	V	Wdhd:ej	dXedOedNedPedFedej	fdYdZZ#d>ej	d7ej	d[ej$deeej	 eej	 f fd\d]Z%	Wdid ej	d^edFedeej	ej	ej	f fd_d`Z&G dadb dbZ'G dcdd ddZ(G dedf dfej)j*Z+dS )j    )DictListTupleN)eigheigvalshgǺ6?emb_aemb_breturnc                 C   s   | j d dks|j d dkrtd| j  d|j  | tj| ddd|  }|tj|ddd|  }t||dd}|d |S )a  
    Calculate cosine similarities of the given two set of tensors. The output is an N by N
    matrix where N is the number of feature vectors.

    Args:
        a (Tensor):
            Matrix containing speaker representation vectors. (N x embedding_dim)
        b (Tensor):
            Matrix containing speaker representation vectors. (N x embedding_dim)

    Returns:
        res (Tensor):
            N by N matrix containing the cosine similarities of the values.
    r      z;Number of feature vectors should be greater than 1 but got z and dim)shape
ValueErrortorchnorm	unsqueezemm	transposefill_diagonal_)r   r   epsa_normb_normres r   g/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/offline_clustering.pycos_similarity(   s   
r   Xc                 C   s&   |   |  }}| | ||  }|S )a*  
    Min-max scale the input affinity matrix X, which will lead to a dynamic range of [0, 1].

    Args:
        X (Tensor):
            Matrix containing cosine similarity values among embedding vectors (N x N)

    Returns:
        v_norm (Tensor):
            Min-max normalized value of X.
    )minmax)r   v_minv_maxv_normr   r   r   ScalerMinMaxA   s   r"   cpuspecEmbAspecEmbBdevicec                 C   sP   |  || |} }| jdd|jdd}}|| d }|jdd }|S )a  
    Calculate Euclidean distances from the given feature tensors.

    Args:
        specEmbA (Tensor):
            Matrix containing spectral embedding vectors from eigenvalue decomposition (N x embedding_dim).
        specEmbB (Tensor):
            Matrix containing spectral embedding vectors from eigenvalue decomposition (N x embedding_dim).

    Returns:
        dis (Tensor):
            Euclidean distance values of the two sets of spectral embedding vectors.
    r
   r   r   g       @)tor   sumsqueeze)r$   r%   r&   ABdisr   r   r   getEuclideanDistanceR   s
   r.      
n_clustersrandom_staten_local_trialsc                 C   s  t | | |} | j\}}t j||| jd}t d|d }t j|gdt j	d}	| | 
d|d< |
d|	d< ||}|d d| jd | jd d|  }
|
djddjdd}| }td|D ]}t ||  }t|jdkrt j|ddd }nt j|dd}t |||}|jd }| | d| jd | jd | d| |d }|djdd|d}t ||}|jdd}t |}|| }|| }|| }| | ||< ||	|< qj||	fS )	a  
    Choose initial centroids for initializing k-means algorithm. The performance of
    k-means algorithm can vary significantly by the initial centroids. To alleviate
    this problem, k-means++ algorithm chooses initial centroids based on the probability
    proportional to the distance from the formally chosen centroids. The centroids
    selected by k-means++ algorithm improve the chance of getting more accurate and
    stable clustering results. The overall implementation of k-means++ algorithm is
    inspired by the numpy based k-means++ implementation in:
        https://github.com/scikit-learn/scikit-learn

    Originally, the implementation of the k-means++ algorithm in scikit-learn is based
    on the following research article:
        Arthur, David, and Sergei Vassilvitskii. k-means++: The advantages of careful
        seeding. Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete
        algorithms, Society for Industrial and Applied Mathematics (2007)

    Args:
        X (Tensor):
            Matrix containing cosine similarity values among embedding vectors (N x N)
        n_clusters (int):
            Maximum number of speakers for estimating number of speakers.
            Shows stable performance under 20.
        random_state (int):
            Seed variable for setting up a random state.
        n_local_trials (int):
            Number of trials for creating initial values of the center points.
        device (torch.device)
            Torch device variable.

    Returns:
        centers (Tensor):
            The coordinates for center points that are used for initializing k-means algorithm.
        indices (Tensor):
            The indices of the best candidate center points.
    dtyper   r
   r'   )r   Nr
      r   )r   manual_seedr(   r   zerosr4   randintlongfullintr*   repeatviewpowr)   r   rangeranditemlencumsumsearchsortedminimumargmin)r   r0   r1   r2   r&   	n_samples
n_featurescenters	center_idindicesclosest_dist_diffclosest_dist_sqcurrent_potc	rand_valstorch_cumsumcandidate_idsN_cidistance_diffdistancedistance_to_candidatescandidates_potbest_candidater   r   r   kmeans_plusplus_torchi   s<   
*


(
4

rZ   g-C6?   num_clusters	threshold
iter_limitc                 C   s.  |   |} | jd }t| |||d}|d }t| }	t|D ]q}
t| ||d}t	|jdkr6 |	S tj
|dd}	| }t|D ].}t|	|k |}t| d|}|jd dkrk| tt	| d }|jdd||< qEt|| d}ttttj|ddd}||k r |	S q#|	S )a  
    Run k-means algorithm on the given set of spectral embeddings in X. The threshold
    and iter_limit variables are set to show the best performance on speaker diarization
    tasks. The overall implementation of k-means algorithm is inspired by the k-means
    algorithm implemented in https://github.com/scikit-learn/scikit-learn.

    References:
        Arthur, David, and Sergei Vassilvitskii. k-means++: The advantages of careful
        seeding. Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete
        algorithms, Society for Industrial and Applied Mathematics (2007).

    Args:
        X (Tensor):
            Cosine similarity matrix calculated from speaker embeddings
        num_clusters (int):
            The estimated number of speakers.
        threshold (float):
            This threshold limits the change of center values. If the square of
            the center shift values are bigger than this threshold, the iteration stops.
        iter_limit (int):
            The maximum number of iterations that is allowed by the k-means algorithm.
        device (torch.device):
            Torch device variable

    Returns:
        selected_cluster_indices (Tensor):
            The assigned cluster labels from the k-means clustering.
    r   )r0   r1   r&   r&   r
   r   r5   r6   )floatr(   r   rZ   r   r8   r:   r@   r.   rC   rG   clonenonzeror*   index_selectr9   meanr?   r)   sqrt)r   r\   r]   r^   r1   r&   
input_sizeplusplus_init_statesrJ   selected_cluster_indices
iter_counteuc_distcenter_initsindexselected_clusterchosen_indicescenter_delta_powcenter_shift_powr   r   r   kmeans_torch   s2   %
"rq   affinity_mat	seg_indexc                 C   s   | j d }tj|tjd|}tj|tjd|}d||< ||}t|D ]H}| }tj|||d || kr? |S |tdk	 
  }t| dkrY|d}|D ]}	| |	 |}
tj||
d|d q[q(|S )a  
    Find the largest affinity_mat connected components for each given node.
    This is for checking whether the affinity_mat is fully connected.

    Args:
        affinity_mat (Tensor):
            A square matrix (tensor) containing normalized cosine distance values
        seg_index (int):
            The segment index that is targeted to be explored.

    Returns:
        connected_nodes (Tensor):
            A tensor containing booleans that indicate whether the node is connected.
    r   r3   T)out)r   r   r8   boolr(   r@   r)   
logical_ortensorrb   tr*   rC   sizer   )rr   rs   r&   num_of_segmentsconnected_nodesnodes_to_exploreklast_num_componentrL   i	neighborsr   r   r   getTheLargestComponent	  s&   


r   c                 C   s   t | d| | jd kS )zM
    Check whether the given affinity matrix is a fully connected graph.
    r   )r   r)   r   )rr   r&   r   r   r   isGraphFullyConnected.  s   r   binaryp_valuemask_methodc                 C   s(  | j }t|  }tj| dddddd|f }td| j ||jt	| j d f< |ddd|f 
 }t	|d |dj
 }|dksR|du rft|j d | j |||f< |S |dkrx| ||f  |||f< |S |dkrt| ||f  |||f< |S td	| )
a(  
    Binarize top-p values for each row from the given affinity matrix.

    Args:
        affinity_mat (Tensor):
            A square matrix (tensor) containing normalized cosine similarity values
        p_value (int):
            The number of top values that are selected from each row.
        mask_method (str):
            The method that is used to manipulate the affinity matrix. The default method is 'binary'.

    Returns:
        binarized_affinity_mat (Tensor):
            A binarized affinity matrix based on the given mask method.
    r
   T)r   
descendingNr   r   dropsigmoidzUnknown mask method: )r   r   
zeros_likehalfargsortonesr(   r&   Tarangeflattenr=   r   r   )rr   r   r   r   binarized_affinity_matsorted_matrixindices_rowindices_colr   r   r   getKneighborsConnections5  s$    
	r   affinity_mat_rawc                 C   s(   |dkr| nt | |}d||j  }|S )z[
    Calculate a binarized graph matrix and
    symmetrize the binarized graph matrix.
    r   g      ?)r   r   )r   r   r   symm_affinity_matr   r   r   getAffinityGraphMatZ  s   r   matmax_Nn_listc                 C   sX   t d}t| |}t|D ]\}}t||}t| |}|s"||kr' ||fS q||fS )z
    Generate connections until fully connect all the nodes in the graph.
    If the graph is not fully connected, it might generate inaccurate results.
    r
   )r   rw   r   	enumerater   )r   r   r   r&   r   rr   r   fully_connectedr   r   r   getMinimumConnectiond  s   



r   mapping_argmatscore_mat_sizec                 C   sB   t j|t jd| j}t j| dd\}}| | j||< |S )z
    Count the numbers in the mapping dictionary and create lists that contain
    repeated indices that will be used for creating a repeated affinity matrix.
    This repeated matrix is then used for fusing multiple affinity values.
    r3   T)return_counts)r   r8   int32r(   r&   uniquer<   )r   r   repeat_listidxscountsr   r   r   getRepeatedListv  s   r   timestamps_in_scalesc                 C   s   t tt| }g }|D ]}| | }|tj|dd qt|}|| }g }|D ]/}|| }t||jd df}	t||jd df	 }
tj
t|	|
 dd}|| q)|S )a  
    Calculate the mapping between the base scale and other scales. A segment from a longer scale is
    repeatedly mapped to a segment from a shorter scale or the base scale.

    Args:
        timestamps_in_scales (list):
            List containing timestamp tensors for each scale.
            Each tensor has dimensions of (Number of base segments) x 2.

    Returns:
        session_scale_mapping_list (list):
            List containing argmin arrays indexed by scale index.
    r
   r   r   )listr@   rC   appendr   rd   r   tiler   rx   rG   abs)r   
scale_listsegment_anchor_list	scale_idxtime_stamps_floatbase_scale_idxbase_scale_anchorsession_scale_mapping_listcurr_scale_anchorcurr_matbase_mat
argmin_matr   r   r   get_argmin_mat  s   r   embc                 C   sF   | j d dkrtdgg| j}|S |  } t| | }t|}|S )ar  
    Calculate cosine similarity values among speaker embeddings then min-max normalize
    the affinity matrix.

    Args:
        emb (Tensor):
            Matrix containing embedding vectors. emb variable should be float(FP32) type to make the data-type
            compatible with torch.mm operation for both CPU and GPU(CUDA).
            dimension: (Number of embedding vectors) x (embedding dimension)

    Returns:
        sim_d (Tensor):
            Matrix containing cosine similarity values among the given embedding vectors.
            dimension: (Number of embedding vectors) x (Number of embedding vectors)
    r   r
   )r   r   rw   r(   r&   r`   r   r"   )r   sim_dr   r   r   getCosAffinityMatrix  s   
r   multiscale_weightsembeddings_in_scalesc                 C   s   g }|  |} t|}ttt|}|D ]-}|| }||  |}	| |}t|t|	jd  |}
tj	|	|
dd}|
| qt|}t|ddd|    }t|jdk re|d}| |}||fS )a  
    Generate a scale-interpolated single embedding vector by calculating the weighted sum
    of the multiple embedding vectors from different scales. The output is a set of embedding
    vectors corresponding to the base-scale segments.

    Args:
        multiscale_weights (Tensor):
            Tensor containing Multiscale weights
            Dimensions: (Number of scales) x 1
        embeddings_in_scales (list):
            List containing split embedding tensors by each scale
        timestamps_in_scales (list):
            List containing split timestamps tensors by each scale
        device (torch.device):
            Torch device variable

    Returns:
        context_emb (Tensor):
            A set of scale-interpolated embedding vectors.
            Dimensions: (Number of base-scale segments) x (Dimensions of embedding vector)
        session_scale_mapping_list (list):
            List containing argmin arrays indexed by scale index.
    r   repeatsr   r6   r
   )r(   r   r   r@   rC   r   r   rw   r   repeat_interleaver   stackmatmulpermuterx   r*   r   )r   r   r   r&   rep_mat_listr   r   r   r   emb_tr   	rep_emb_tstacked_scale_embscontext_embr   r   r   get_scale_interpolated_embs  s"   


"

r   c                 C   s   t j| dd|} t|}ttt|}t t|d t|d |}|D ]?}|| }||  |}	t	|	}
t
|t |
jd |}t j|
|dd|}t j||dd|}|| | | 7 }q)|S )at  
    Calculate cosine similarity values among speaker embeddings for each scale then
    apply multiscale weights to calculate the fused similarity matrix.
    NOTE: Due to CUDA memory limit, the embedding vectors in embeddings_in_scales are stored in `cpu` device.

    Args:
        multiscale_weights (Tensor):
            Tensor containing multiscale weights
            Dimensions: (Number of scales) x 1
        embeddings_in_scales (list):
            List containing split embedding tensors by each scale
        timestamps_in_scales (list):
            List containing split timestamps tensors by each scale
        device (torch.device):
            Torch device variable

    Returns:
        fused_sim_d (Tensor):
            An affinity matrix that is obtained by calculating the weighted sum of 
            the multiple affinity matrices from the different scales.
    r   r   r'   r   r
   )r   r*   r(   r   r   r@   rC   r8   r   r   r   rw   r   r   )r   r   r   r&   r   r   fused_sim_dr   r   r   score_mat_torchr   repeated_tensor_0repeated_tensor_1r   r   r   getMultiScaleCosAffinityMatrix  s   "r   c                 C   s4   |  d tjt| dd}t|}||  }|S )zA
    Calculate a laplacian matrix from an affinity matrix X.
    r   r
   r   )r   r   r)   r   
diag_embed)r   DLr   r   r   getLaplacian  s
   

r   	laplaciancudac                 C   sN   |r|du rt j }|  |} n
|  t d} t| \}}||fS )zK
    Calculate eigenvalues and eigenvectors from the Laplacian matrix.
    Nr#   )r   r   current_devicer`   r(   r&   r   )r   r   r&   lambdasdiffusion_mapr   r   r   eigDecompose  s   
r   c                 C   sF   |r|du rt j }|  |} n
|  t d} t| }|S )z?
    Calculate only eigenvalues from the Laplacian matrix.
    Nr#   )r   r   r   r`   r(   r&   r   )r   r   r&   r   r   r   r   
eigValueSh-  s   
r   r   c                 C   s,   t | r
t | } | dd | dd  S )z3
    Calculate the gaps between lambda values.
    r
   Nr'   )r   
is_complexreal)r   r   r   r   getLamdaGaplist;  s   

r   anchor_sample_nanchor_spk_nsigmac              	   C   s   | j d }tj| dd}t|| j}g }t|D ]A}ttd||df| j}t||j	| j}	t
t||	tjt|	ddd d j	}	|||	  }
||
 q||  t|}|S )a  
    Add randomly generated synthetic embeddings to make eigenanalysis more stable.
    We refer to these embeddings as anchor embeddings.

    emb (Tensor):
        The input embedding from the embedding extractor.
    anchor_sample_n (int):
        Number of embedding samples per speaker.
        anchor_sample_n = 10 is recommended.
    anchor_spk_n (int):
        Number of speakers for synthetic embedding.
        anchor_spk_n = 3 is recommended.
    sigma (int):
        The amplitude of synthetic noise for each embedding vector.
        If the sigma value is too small, under-counting could happen.
        If the sigma value is too large, over-counting could happen.
        sigma = 50 is recommended.
    r
   r   r   )r   r   stdrw   r(   r&   r@   r   randnr   r   diagr   r   r   r   vstack)r   r   r   r   emb_dimstd_orgnew_emb_list_emb_m	emb_noiseemb_gen
new_emb_npr   r   r   addAnchorEmbD  s"   
 (

r         
   2   Frandom_test_countc                 C   s   g }t |D ].}t| t| |||}t|}	t|	| jd ddddd|d}
|
 \}}||	  qt
ttt
|d 	 | d}|S )	aS  
    Calculate the number of speakers using NME analysis with anchor embeddings. Add dummy speaker
    embedding vectors and run speaker counting multiple times to enhance the speaker counting accuracy
    for the short audio samples.

    Args:
        emb (Tensor):
            The input embedding from the embedding extractor.
        cuda (bool):
            Use cuda for the operations if cuda==True.
        random_test_count (int):
            Number of trials of the enhanced counting with randomness.
            The higher the count, the more accurate the enhanced counting is.
        anchor_spk_n (int):
            Number of speakers for synthetic embedding.
            anchor_spk_n = 3 is recommended.
        anchor_sample_n (int):
            Number of embedding samples per speaker.
            anchor_sample_n = 10 is recommended.
        sigma (float):
            The amplitude of synthetic noise for each embedding vector.
            If the sigma value is too small, under-counting could happen.
            If the sigma value is too large, over-counting could happen.
            sigma = 50 is recommended.

    Returns:
        comp_est_num_of_spk (Tensor):
            The estimated number of speakers. `anchor_spk_n` is subtracted from the estimated
            number of speakers to factor out the dummy speaker embedding vectors.
    r   333333?Tr         i,  )max_num_speakersmax_rp_thresholdsparse_searchsparse_search_volumefixed_thresnme_mat_sizer   r
   )r@   r   r7   r   r   NMESCr   forwardr   rB   rw   r   mode)r   r   r   r   r   r   est_num_of_spk_listseedemb_augr   nmescest_num_of_spkr   comp_est_num_of_spkr   r   r   getEnhancedSpeakerCounti  s&   &

(r  multiscale_segment_countsc                 C   s   t | jdkrtdt | j dt |jdkr$tdt |j dt|| jd   kr7|jd ksOn td|jd  d| jd  d|jd  d	| }tj| |dd
} tj||dd
}t| t|} }| |fS )a  
    Split multiscale embeddings and multiscale timestamps and put split scale-wise data into python lists.
    This formatting function is needed to make the input type as `torch.Tensor`.

    Args:
        embeddings_in_scales (Tensor):
            Concatenated Torch tensor containing embeddings in multiple scales
        timestamps_in_scales (Tensor):
            Concatenated Torch tensor containing timestamps in multiple scales
        multiscale_segment_counts (LongTensor):
            Concatenated Torch LongTensor containing number of segments per each scale

    Returns:
        embeddings_in_scales (list):
            List containing split embedding tensors by each scale
        timestamps_in_scales (list):
            List containing split timestamps tensors by each scale
    r6   z>embeddings_in_scales Tensor should have 2 dimensions, but got .z>timestamps_in_scales Tensor should have 2 dimensions, but got r   zmultiscale_segment_counts, embeddings_in_scales, and timestamps_in_scales should have the same length,                            but got z, z, and z respectively.r   )rC   r   r   r   r)   tolistsplitr   )r   r   r  split_indexr   r   r   split_input_data  s.   (r  r   c                 C   sZ   t | }t||| jd}t|d }t|}t|dt||jd  d }|||fS )ar  
    Estimate the number of speakers using eigendecomposition on the Laplacian Matrix.

    Args:
        affinity_mat (Tensor):
            N by N affinity matrix
        max_num_speakers (int):
            Maximum number of clusters to consider for each session
        cuda (bool):
            If cuda available eigendecomposition is computed on GPUs.

    Returns:
        num_of_spk (Tensor):
            The estimated number of speakers
        lambdas (Tensor):
            The lambda values from eigendecomposition
        lambda_gap (Tensor):
            The gap between the lambda values from eigendecomposition
    r   r&   r   Nr
   )	r   r   r&   r   sortr   argmaxr   r   )rr   r   r   r   r   
lambda_gap
num_of_spkr   r   r   estimateNumofSpeakers  s   "
r  c                   @   s   e Zd ZdZddddedfdeded	ed
edejf
ddZdej	fddZ
dedfdej	d
edejdej	fddZddej	ded
edej	fddZdS )SpectralClusteringz
    Perform spectral clustering by calculating spectral embeddings then run k-means clustering
    algorithm on the spectral embeddings.
       r   r
   Fr#   r0   r1   n_random_trialsr   r&   c                 C   s(   || _ || _t|d| _|| _|| _dS )a  
        Initialize the variables needed for spectral clustering and k-means++.

        Args:
            n_clusters (int):
                Number of the estimated (or oracle) number of speakers
            random_state (int):
                Random seed that determines a random state of k-means initialization.
            n_random_trials (int):
                Number of trials with different random seeds for k-means initialization.
                k-means++ algorithm is executed for multiple times then the final result
                is obtained by taking a majority vote.
            cuda (bool):
                if cuda=True, spectral clustering is done on GPU.
            device (torch.device):
                Torch device variable
        r
   N)r0   r1   r   r  r   r&   )selfr0   r1   r  r   r&   r   r   r   __init__  s
   
zSpectralClustering.__init__r	   c                 C   s4   |j d |j d krtd| j|| j| jd}|S )z
        Call self.clusterSpectralEmbeddings() function to predict cluster labels.

        Args:
            X (Tensor):
                Affinity matrix input

        Returns:
            labels (Tensor):
                Clustering label output
        r   r
   z+The affinity matrix is not a square matrix.r	  )r   r   clusterSpectralEmbeddingsr   r&   )r  r   labelsr   r   r   r     s   zSpectralClustering.forwardaffinityc                 C   s|   | j || j|d}g }t| j| j| j D ]}t|| j||d}|| qt|}t	t	|dd d }	||	 }
|
S )a  
        Perform k-means clustering on spectral embeddings. To alleviate the effect of randomness,
        k-means clustering is performed for (self.n_random_trials) times then the final labels are obtained
        by taking a majority vote. If speed is the major concern, self.n_random_trials should be set to 1.
        n_random_trials=30 is recommended to see an improved result.

        Args:
            affinity (Tensor):
                Affinity matrix input
            cuda (torch.bool):
                Use cuda for spectral clustering if cuda=True
            device (torch.device):
                Torch device variable

        Returns:
            labels (Tensor):
                clustering label output

        )n_spksr   )r   r\   r1   r&   r   r
   )
getSpectralEmbeddingsr0   r@   r1   r  rq   r   r   r   r   )r  r  r   r&   spectral_emb
labels_setrandom_state_seed_labelsstacked_labelslabel_indexr  r   r   r   r  #  s   

z,SpectralClustering.clusterSpectralEmbeddingsrr   r  c           
      C   sl   t |}t|||jd\}}|ddd|f }t|dd dd }|j|ddf }	|	d| jS )a  
        Calculate eigenvalues and eigenvectors to extract spectral embeddings.

        Args:
            affinity (Tensor):
                Affinity matrix input
            cuda (torch.bool):
                Use cuda for spectral clustering if cuda=True
            device (torch.device):
                Torch device variable

        Returns:
            labels (Tensor):
                clustering label output
        r	  Nr
   r'   )r   r   r&   r   r   ry   r:   r   )
r  rr   r  r   r   r   diffusion_map_r   inv_idx	embeddingr   r   r   r  F  s   z(SpectralClustering.getSpectralEmbeddingsN)r  F)__name__
__module____qualname____doc__r   r&   r<   ru   r  Tensorr   r  r  r   r   r   r   r    s<    


$#r  c                   @   s   e Zd ZdZdddddddddded	fd
ejdedede	dedede	dede	de	de	dejfddZ
deejejf fddZdedejfddZdedejfddZdejfd d!Zd"S )#r   a  
    Normalized Maximum Eigengap based Spectral Clustering (NME-SC)
    uses Eigengap analysis to get an estimated p-value for
    affinity binarization and an estimated number of speakers.

    p_value (also referred to as p_neighbors) is for taking
    top p number of affinity values and convert those to 1 while
    convert the rest of values to 0.

    p_value can be also tuned on a development set without performing
    NME-analysis. Fixing p_value brings about significantly faster clustering
    speed, but the performance is limited to the development set.

    References:
        Tae Jin Park et al., Auto-Tuning Spectral Clustering for Speaker Diarization
        Using Normalized Maximum Eigengap, IEEE Signal Processing Letters 27 (2019),
        https://arxiv.org/abs/2003.02405

    Args:
        Please refer to def __init__().

    Methods:
        NMEanalysis():
            Performs NME-analysis to estimate p_value and the number of speakers
        subsampleAffinityMat(nme_mat_size):
            Subsamples the number of speakers to reduce the computational load
        getPvalueList():
            Generates a list containing p-values that need to be examined.
        getEigRatio(p_neighbors):
            Calculates g_p, which is a ratio between p_neighbors and the maximum eigengap
        getLamdaGaplist(lambdas):
            Calculates lambda gap values from an array contains lambda values
        estimateNumofSpeakers(affinity_mat):
            Estimates the number of speakers using lambda gap list
    r   r   Tr/      r   Fr#   r   r   r   r   r   r   use_subsampling_for_nmer   maj_vote_spk_countparallelismr   r&   c                 C   sx   || _ || _|| _|| _|| _|| _td| _|| _	d| _
td| _|| _| jd| _|| _|| _|	| _|
| _dS )a  
        Args:
            mat (Tensor):
                Cosine similarity matrix calculated from the provided speaker embeddings.
            max_num_speakers (int):
                Maximum number of speakers for estimating number of speakers.
                Shows stable performance under 20.
            max_rp_threshold (float):
                Limits the range of parameter search.
                Clustering performance can vary depending on this range.
                Default is 0.25.
            sparse_search (bool):
                To increase the speed of parameter estimation, sparse_search=True
                limits the number of p_values we search.
            sparse_search_volume (int):
                Number of p_values we search during NME analysis.
                Default is 30. The lower the value, the faster NME-analysis becomes.
                However, a value lower than 20 might cause a poor parameter estimation.
            nme_mat_size (int):
                Targeted size of matrix for NME analysis.
            use_subsampling_for_nme (bool):
                Use subsampling to reduce the calculational complexity.
                Default is True.
            fixed_thres (float or None):
                A fixed threshold which can be used instead of estimating the
                threshold with NME analysis. If fixed_thres is float,
                it skips the NME analysis part.
            maj_vote_spk_count (bool):
                If True, take a majority vote on all p-values in the given range to estimate the number of speakers.
                The majority voting may contribute to surpress overcounting of the speakers and improve speaker
                counting accuracy.
            parallelism (bool):
                If True, turn on parallelism based on torch.jit.script library.
            cuda (bool):
                Use cuda for Eigen decomposition if cuda=True.
            device (torch.device):
                Torch device variable
        r6   g|=r   N)r   r   r(  r   r   r   r   rw   min_p_valuer   r   r   r   r   p_value_listr   r&   r)  r*  )r  r   r   r   r   r   r   r(  r   r)  r*  r   r&   r   r   r   r    s    5
zNMESC.__init__r	   c                 C   s  | j r
| | j}ntd}g }i }|  | _| jjd }t|}t|}| j	rQg }t
| jD ]\}}	|tj| j|	 q2|D ]}
|tj|
 qDnt
| jD ]\}}	|| |	 qVt
| jD ]!\}}	|| }|d |d  }}|||< |||	 < |||< qht|}| j| }t| j|}t|| jdst| j| j| j| jd\}}|| tj}| jrtt|d }||fS ||  }||fS )aB  
        Subsample the input matrix to reduce the computational load.

        Returns:
            est_num_of_spk (Tensor):
                Estimated number of speakers from NMESC approach
            p_hat_value (Tensor):
                Estimated p-value (determines how many neighboring values to be selected)
        r
   r   r_   )r(  subsampleAffinityMatr   r   rw   getPvalueListr,  r   r8   r*  r   r   jitforkgetEigRatiowaitr<   rB   rG   r   r   r   r&   r   r   typer)  r   )r  subsample_ratioresultsest_spk_n_dictp_volumeeig_ratio_listr   futuresp_idxr   futureoutputg_pr   index_nn
rp_p_valuerr   p_hat_valuer   r   r   r     sJ   








zNMESC.forwardc                 C   sT   t t dt | jjd | t j}| jdd| dd| f | _|S )a  
        Perform subsampling of affinity matrix.
        This subsampling is for calculational complexity, not for performance.
        The smaller nme_mat_size is,
            - the bigger the chance of missing a speaker.
            - the faster p-value estimation speed (based on eigen decomposition).

        The recommended nme_mat_size is 250~750.
        However, if there are speakers who speak for very short period of time in the recording,
        this subsampling might make the system miss underrepresented speakers.
        Use this variable with caution.

        Args:
            nme_mat_size (int):
                The targeted matrix size

        Returns:
            subsample_ratio (float):
                The ratio between nme_mat_size and the original matrix size
        r
   r   N)r   r   rw   r   r   r3  r<   rB   )r  r   r4  r   r   r   r-    s   ,$zNMESC.subsampleAffinityMatp_neighborsc           
      C   s   t | j|}t|| j| j\}}}tj|d| j dd}|d }|| t| | j	  }|| jj
d  || j	  }	t|	|gS )a  
        For a given p_neighbors value, calculate g_p, which is a ratio between p_neighbors and the
        maximum eigengap values.
        References:
            Tae Jin Park et al., Auto-Tuning Spectral Clustering for Speaker Diarization Using
            Normalized Maximum Eigengap, IEEE Signal Processing Letters 27 (2019),
            https://arxiv.org/abs/2003.02405

        Args:
            p_neighbors (int):
                Determines how many binary graph connections we want to keep for each row.

        Returns:
            est_num_of_spk (int):
                Estimated number of speakers
            g_p (float):
                The ratio between p_neighbors value and the maximum eigen gap value.
        NT)r   r   )r   r   r  r   r   r   r   r   rB   r   r   r   )
r  rA  rr   r   r   lambda_gap_listarg_sorted_idxmax_keymax_eig_gapr=  r   r   r   r1    s   

zNMESC.getEigRatioc                 C   s  | j dur,| j dkr,ttt| jjd | j  tj| j	| _
| j
d }nQttt| jjd | j tj| j	| _
| jrtt| j
t| jtj}t|td}t| j
|}tjd| j
|dtj}n	td| j
d }|jd dkrtd|S )an  
        Generates a p-value (p_neighbour) list for searching. p_value_list must include 2 (min_p_value)
        since at least one neighboring segment should be selected other than itself.

        If fixed_thres value is specified, then only one p-value is specified.
        If fixed_thres is not provided, multiple p-values are searched.
            If sparse_search is True:
                - Limit the number of p-values to be searched to sparse_search_volume.
                - N should be at least 2 to include a number greater than 1.
            If sparse_search is False:
                - Scan all the p_values from 1 to max_N
                - If sparse_search is False, NMESC analysis could take more time compared to sparse_search = True.

        Returns:
            p_value_list (Tensor):
                Tensor containing the p_values to be searched.
        Ng        r   r6   r
   )startendstepsz!p_value_list should not be empty.)r   r   r   floorrw   r   r   r3  r<   r+  r   r   r   r   r   r   linspacer   r   )r  r,  search_volumeNrH  r   r   r   r.  ;  s"   ((zNMESC.getPvalueListN)r"  r#  r$  r%  r   r&   r&  r<   r`   ru   r  r   r   r-  r1  r.  r   r   r   r   r   ^  sT    '	

F<r   c                       s   e Zd Z						d)dedededed	ed
ef fddZddddedddfdejdedede	dedejde	dedej
fddZdeeejf dej
fdd Z				!			d*d"ejd#ejd$ej
d%ejdedede	d&edede	dedej
fd'd(Z  ZS )+SpeakerClustering   r'  TFmin_samples_for_nmescr   r   r)  r*  r   c                    sp   t    || _|| _|| _|| _|| _|| _t	dg| _
t	dg| _| jr0td| _dS td| _dS )a  
        Clustering method for speaker diarization based on cosine similarity.
        NME-SC part is converted to torch.tensor based operations in NeMo 1.9.

        Args:
            min_samples_for_nmesc (int):
                The minimum number of samples required for NME clustering. This avoids
                zero p_neighbour_lists. If the input has fewer segments than min_samples,
                it is directed to the enhanced speaker counting mode.
            nme_mat_size (int):
                The targeted matrix size for NME analysis.
            sparse_search (bool):
                Toggle sparse search mode. If True, limit the size of p_value_list to sparse_search_volume.
            maj_vote_spk_count (bool):
                If True, take a majority vote on all p-values in the given range to estimate the number of speakers.
                The majority voting may contribute to surpress overcounting of the speakers and improve speaker
                counting accuracy.
            parallelism (bool):
                Use dynamic parallelism feature in torch.jit compiler to accelerate the p-value search.
            cuda (bool):
                Boolean variable for toggling cuda availability.
        r   r   r#   N)superr  rO  r   r   r*  r   r)  r   r&  r   r   r&   )r  rO  r   r   r)  r*  r   	__class__r   r   r  e  s   
&zSpeakerClustering.__init__r'   r  r   r/   r   r
   r   oracle_num_speakersr   r   r   est_num_of_spk_enhancedr   kmeans_random_trialsr	   c	                 C   s   t |||| j||| j| j| j| j| jd}	|jd | jkr)|		 \}
}t
||}n||	_|		 \}
}|}|dkr=t|}n|dkrHt| }nt|
 }t||| j| jd}|	|}|S )a  
        This function takes a cosine similarity matrix `mat` and returns the speaker labels for the segments 
        in the given input embeddings. 
       
        Args: 
            mat (Tensor):
                Cosine similarity matrix (affinity matrix) calculated from the provided speaker embeddings.
            oracle_num_speakers (int):
                The number of speakers in a session, as specified by the reference transcript.
                Can be used as `chunk_cluster_count` in long-form clustering mode.
            max_num_speakers (int):
                The upper bound for the number of speakers in each session.
            max_rp_threshold (float):
                Limits the range of parameter search.
                The clustering performance can vary based on this range.
                The default value is 0.15.
            sparse_search_volume (int):
                The number of p_values considered during NME analysis.
                The default is 30. Lower values speed up the NME-analysis but might lead to poorer parameter estimations. Values below 20 are not recommended.
            est_num_of_spk_enhanced (int):
                The number of speakers estimated from enhanced speaker counting.
                If the value is -1, the enhanced speaker counting is skipped.
            fixed_thres (float):
                If a `fixed_thres` value is provided, the NME-analysis process will be skipped.
                This value should be optimized on a development set for best results.
                By default, it is set to -1.0, and the function performs NME-analysis to estimate the threshold.
            kmeans_random_trials (int):
                The number of random trials for initializing k-means clustering. More trials can result in more stable clustering. The default is 1. 
                
        Returns:
            Y (LongTensor):
                Speaker labels (clustering output) in integer format for the segments in the given input embeddings.
        )
r   r   r   r   r   r   r)  r*  r   r&   r   )r0   r  r   r&   )r   r   r   r)  r*  r   r&   r   rO  r   r   r   r<   rB   r  )r  r   rS  r   r   r   rT  r   rU  r   r   r@  rr   r0   spectral_modelYr   r   r   forward_unit_infer  s:   ,

z$SpeakerClustering.forward_unit_infer
param_dictc                 C   s   |d }|d }|d }|d }t |d  }t |d  }t |d  }t |d  }	t|d	  }
t|d
  }| j||||||
|||	|d
S )a  
        A function wrapper designed for inference in exported script format.

        Note:
            Dict is used to allow easy inference of the exported jit model in Triton server using easy to understand
            naming convention.
            See https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#special-conventions-for-pytorch-backend

        Args:
            param_dict (dict):
                    Dictionary containing the arguments for speaker clustering.
                    See `forward_infer` function for the argument information.

            Returns:
                (LongTensor): Speaker labels for the segments in the given input embeddings.
        
embeddings
timestampsr  r   rS  r   enhanced_count_thresr   r   r   )
r   r   r  r   rS  r   r   r\  r   r   )r<   rB   r`   forward_infer)r  rY  r   r   r  r   rS  r   r\  r   r   r   r   r   r   r     s,   zSpeakerClustering.forward(   r   r   r  r   r\  c              
   C   s   t |||\| _| _| jd }|jd dkrtjdtjdS |jd t|| jkr5|dk r5t	|| j
d}ntd}|dkr@|}t|| j| j| jd}| j|||||	|||
dS )	a  
        Calculate the affinity matrix using timestamps and speaker embeddings, run NME analysis to estimate the best
        p-value, and perform spectral clustering based on the estimated p-value and the calculated affinity matrix.

        Caution:
            For compatibility with libtorch, python boolean `False` has been replaced with `torch.LongTensor(-1)`.

        Args:
            embeddings_in_scales (Tensor):
                List containing concatenated Torch tensor embeddings across multiple scales.
                The length of the list is equal to the number of scales.
                Each tensor has dimensions of (Number of base segments) x (Embedding Dimension).
            timestamps_in_scales (Tensor):
                List containing concatenated Torch tensor timestamps across multiple scales.
                The length of the list is equal to the number of scales.
                Each tensor has dimensions of (Total number of segments across all scales) x 2.
                Example:
                    >>> timestamps_in_scales[0] =                         torch.Tensor([[0.4, 1.4], [0.9, 1.9], [1.4, 2.4], ... [121.2, 122.2]])
            multiscale_segment_counts (LongTensor):
                A Torch tensor containing the number of segments for each scale.
                The tensor has dimensions of (Number of scales).
                Example:
                    >>> multiscale_segment_counts = torch.LongTensor([31, 52, 84, 105, 120])
            multiscale_weights (Tensor):
                Multi-scale weights used when merging affinity scores.
                Example:
                    >>> multiscale_weights = torch.tensor([1.4, 1.3, 1.2, 1.1, 1.0])
            oracle_num_speakers (int):
                The number of speakers in a session as given by the reference transcript.
            max_num_speakers (int):
                The upper bound for the number of speakers in each session.
            max_rp_threshold (float):
                Limits the range of parameter search.
                The clustering performance can vary based on this range.
                The default value is 0.15.
            enhanced_count_thres (int):
                For shorter audio recordings, the clustering algorithm might not accumulate enough speaker profiles for each cluster.
                Thus, the function `getEnhancedSpeakerCount` uses anchor embeddings (dummy representations) to mitigate the effects of cluster sparsity.
                A value of 80 is recommended for `enhanced_count_thres`.
            sparse_search_volume (int):
                The number of p_values considered during NME analysis.
                The default is 30. Lower values speed up the NME-analysis but might lead to poorer parameter estimations. Values below 20 are not recommended.
            fixed_thres (float):
                If a `fixed_thres` value is provided, the NME-analysis process will be skipped.
                This value should be optimized on a development set for best results.
                By default, it is set to -1.0, and the function performs NME-analysis to estimate the threshold.
            kmeans_random_trials (int):
                The number of random trials for initializing k-means clustering. More trials can result in more stable clustering. The default is 1.

        Returns:
            (LongTensor): Speaker labels for the segments in the provided input embeddings.
        r'   r   r
   r5   r3   )r   r   )r   r   r   r&   )r   rS  r   r   r   rT  rU  r   )r  r   r   r   r   r8   int64r   rO  r  r   rw   r   r&   rX  )r  r   r   r  r   rS  r   r   r\  r   r   rU  r   rT  r   r   r   r   r]    s6   C

zSpeakerClustering.forward_infer)rN  r'  TFFF)r'   r  r   r^  r/   r   r
   )r"  r#  r$  r<   ru   r  r   rw   r&  r`   
LongTensorrX  r   strr   r]  __classcell__r   r   rQ  r   rM  d  s    -	

P.	
rM  )r   )r   r   r   r   F)F),typingr   r   r   r   torch.linalgr   r   rw   r&  r   r"   r&   r.   r<   rZ   r`   rq   r   r   ra  r   r   r   r   r   r   r   r   r   ru   r   r   r   r   r  r`  r  r  r  r   nnModulerM  r   r   r   r   <module>   s  !&

W
 L% %


4
** "	'
;
,
q  