o
    oi                     @   sJ   d dl Zd dlmZ d dlmZ d dlmZ d dlm	Z	 G dd dZ
dS )    N)UMAP)HDBSCAN)	normalize)Datasetc                   @   sN   e Zd ZdZ		ddedefddZdejd	d
fddZde	d	e	fddZ
d
S )SpeakerMakerUnsuperviseda  
    Unsupervised speaker clustering system using UMAP dimensionality reduction
    and HDBSCAN clustering to assign speaker labels to audio samples.

    This class performs automatic speaker identification by:
    1. Filtering out invalid embeddings (NaN values)
    2. Reducing high-dimensional speaker embeddings (e.g., 128-dim) to lower dimensions using UMAP
    3. Normalizing the reduced embeddings with L2 normalization
    4. Clustering the normalized embeddings using HDBSCAN to identify distinct speakers
    5. Assigning speaker_id labels to each sample
    6. Filtering out noise samples (speaker_id == -1)

    Attributes:
        cfg (dict): Configuration dictionary with 'UMAP' and 'HDBSCAN' parameters
        embedding_column (str): Name of the column containing speaker embeddings
        pre_reducer (UMAP): UMAP dimensionality reduction model
        clusterer (HDBSCAN): HDBSCAN clustering model

    Example:
        >>> cluster_config = {
        ...     'UMAP': {'n_components': 5, 'metric': 'cosine'},
        ...     'HDBSCAN': {'min_cluster_size': 10}
        ... }
        >>> speaker_maker = SpeakerMakerUnsupervised(cluster_config)
        >>> labeled_dataset = speaker_maker(dataset)
    wavlm_embedding
speaker_idembedding_columnspeaker_columnc                 C   sB   || _ || _|| _tdi | j d | _tdi | j d | _dS )aC  
        Initialize the unsupervised speaker clustering system.

        Args:
            cluster_config (dict): Configuration dictionary containing:
                - 'UMAP': dict of UMAP parameters (n_components, metric, etc.)
                - 'HDBSCAN': dict of HDBSCAN parameters (min_cluster_size, etc.)
            embedding_column (str): Name of the dataset column containing speaker
                embeddings. Defaults to 'wavlm_embedding'.
            speaker_column (str): Name of the output column for cluster IDs.
                Defaults to 'speaker_id'.
        r   r   N )cfgr	   r
   r   pre_reducerr   	clusterer)selfcluster_configr	   r
   r   r   O/home/ubuntu/kanitts-2-dataset-pipeline/utils/speaker_emb/speaker_clustering.py__init__$   s
   z!SpeakerMakerUnsupervised.__init__labelsreturnNc           	      C   s2  t |}t||dk }t |}|dk }|| }|dkr%|| d nd}td td td td|  td|  td	| d
|dd td| d
d| dd |dkrt||dk }td td|   td|   td| d tdt	|d td dS )a  
        Log clustering results and statistics.

        Prints information about:
        - Total number of samples processed
        - Number of unique speakers identified
        - Number of noise samples (speaker_id == -1)
        - Percentage of samples assigned to valid clusters

        Args:
            labels (np.ndarray): Array of speaker labels from clustering.
                Labels >= 0 represent speaker clusters, -1 represents noise.
        r   d   z=
============================================================zSpeaker Clustering Resultsz<============================================================zTotal samples processed:      zUnique speakers identified:   zValid cluster assignments:    z (z.1fz%)zNoise samples (unassigned):   z
Cluster size statistics:z  Min samples per speaker:    z  Max samples per speaker:    z  Mean samples per speaker:   z  Median samples per speaker: z.0fz=============================================================
N)
lennpuniquesumprintbincountminmaxmeanmedian)	r   r   total_samplesunique_speakersnum_speakers	num_noise	num_validvalid_percentagespeaker_countsr   r   r   proc_log8   s*   z!SpeakerMakerUnsupervised.proc_logdatasetc                    s  t d t dt| d t d t d j d t| j }t d|j  t d  j|}t d	|j  t d
 t|dd}t d  j	
|}t d j d |j j|d} | t d j d d}|j fddd|d}t dt| d t d |S )aP  
        Perform unsupervised speaker clustering on the dataset.

        This method processes speaker embeddings through the following pipeline:
        1. Filter out samples with NaN embeddings
        2. Extract embeddings into numpy array
        3. Apply UMAP dimensionality reduction
        4. Normalize reduced embeddings with L2 normalization
        5. Cluster normalized embeddings using HDBSCAN
        6. Add 'speaker_id' column to dataset
        7. Remove noise samples (speaker_id == -1)

        Args:
            dataset (Dataset): HuggingFace Dataset containing speaker embeddings
                in the column specified by self.embedding_column

        Returns:
            Dataset: Dataset with added 'speaker_id' column, noise samples removed.
                Only samples with valid speaker assignments (speaker_id >= 0) are retained.

        Note:
            - Samples with NaN embeddings are filtered out before processing
            - HDBSCAN assigns label -1 to noise samples (outliers)
            - Noise samples are automatically removed from the final dataset
        z*
[Speaker Clustering] Starting pipeline...z#[Speaker Clustering] Dataset size: z sampleszO[Speaker Clustering] (NaN filtering done upstream in SpeakerEmbeddingProcessor)z1[Speaker Clustering] Extracting embeddings from 'z' column...z'[Speaker Clustering] Embeddings shape: z>[Speaker Clustering] Applying UMAP dimensionality reduction...z/[Speaker Clustering] Reduced embeddings shape: z;[Speaker Clustering] Normalizing reduced embeddings (L2)...l2)normz2[Speaker Clustering] Running HDBSCAN clustering...z[Speaker Clustering] Adding 'z' column to dataset...)namecolumnz-[Speaker Clustering] Removing noise samples (z
 == -1)...zRemoving noise samplesc                    s   |  j  dkS )Nr   )r
   )ir   r   r   <lambda>   s    z3SpeakerMakerUnsupervised.__call__.<locals>.<lambda>
   )num_procdescz)[Speaker Clustering] Final dataset size: z6[Speaker Clustering] Pipeline completed successfully!
)r   r   r	   r   arrayshaper   fit_transformr   r   fit_predictr
   
add_columnr(   filter)r   r)   embembeddings_5demb_normalizedr   r3   r   r/   r   __call__`   s,   
z!SpeakerMakerUnsupervised.__call__)r   r   )__name__
__module____qualname____doc__strr   r   ndarrayr(   r   r=   r   r   r   r   r      s    
(r   )numpyr   umapr   hdbscanr   sklearn.preprocessingr   datasetsr   r   r   r   r   r   <module>   s    