o
    oi                     @   sf   d dl Z d dlmZ d dlZd dlmZmZ ddlm	Z	m
Z
 ddlmZ ddlmZ G dd	 d	ZdS )
    N)List)DatasetAudio   )EmbedderConfigWavLMEmbedder)SpeakerMakerUnsupervised)SpeakerEmbeddingGrouperc                   @   s@   e Zd ZdZdd ZdededefddZdee fd	d
Z	dS )SpeakerEmbeddingProcessora=  
    Orchestrates speaker embedding generation and optional downstream steps
    (unsupervised clustering, per-speaker averaged embeddings) for a
    HuggingFace dataset.

    Receives a dataset (already loaded at 22050 Hz by DatasetProcessor),
    generates WavLM speaker embeddings using a separate 16 kHz audio channel,
    then optionally runs:
      - UMAP + HDBSCAN clustering to assign cluster-based speaker IDs
      - Per-speaker embedding averaging (group_sp_emb)

    Returns the dataset with all produced columns added, ready for the
    tokenization step.
    c                 C   s4   || _ t|j|j|j|j|j|jd}t|| _	dS )zc
        Args:
            settings: SpeakerEmbeddingSettings instance from ConfigManager.
        )
model_name	target_srmax_audio_sec
batch_sizeembedding_column_nameuse_multiprocessingN)
settingsr   r   r   r   r   embedding_columnr   r   embedder)selfr   embedder_config r   F/home/ubuntu/kanitts-2-dataset-pipeline/utils/speaker_emb/processor.py__init__   s   z"SpeakerEmbeddingProcessor.__init__datasetaudio_columnreturnc                    sT  dt jd< |j  j}| tdd}|j fddddd	}|d
t| jjdd}| t|dd}|j	dd ddd	}| j
j|d
dd}|d
}| jjt|}|j	fddddd	}|t| }|rvtd| dt|  | jjrt| jj| jjd| jj| jjd}||}| jjr| jjjrt| jj| jjd}||}|S )u  
        Add embedding column and optionally speaker/cluster column and grouped
        embedding column to the dataset.

        Audio resampling strategy:
          1. Switch audio column to raw-bytes mode (decode=False) — avoids
             double decode/resample quality loss.
          2. Copy raw bytes to a temporary "audio_spk" column.
          3. Decode "audio_spk" at target_sr (16 kHz) for embedding.
          4. Decode original audio column back at 22050 Hz for tokenization.
          5. Filter samples shorter than 0.7 s (too short for reliable embedding).
          6. Generate embeddings; remove "audio_spk".
          7. Filter samples with NaN embeddings (mandatory, before all downstream steps).
          8. Optionally run UMAP + HDBSCAN clustering.
          9. Optionally run per-speaker embedding averaging (group_sp_emb).

        Args:
            dataset:      HuggingFace Dataset loaded at 22050 Hz.
            audio_column: Name of the audio column (from DatasetConfig).

        Returns:
            Dataset with added embedding (and optional clustering / grouping)
            columns. The audio column is still at 22050 Hz — ready for
            tokenization.
        
torchaudioDATASETS_AUDIO_BACKENDF)decodec                    s   d|   iS )N	audio_spkr   i)r   r   r   <lambda>N   s    z3SpeakerEmbeddingProcessor.process.<locals>.<lambda>
   z#Copying audio for speaker embedding)num_procdescr   T)sampling_rater   c                 S   s    t | d d | d d  dkS )Nr   arrayr&   gffffff?)lenr    r   r   r   r"   a   s       zFiltering short audiospeaker_embedding)r   r   
split_namec                    s   t t |    dkS )Nr   )npisnanr'   sumr    )emb_colr   r   r"   v   s    zFiltering NaN embeddingsz$[SpeakerEmbeddingProcessor] Removed z) samples with NaN embeddings. Remaining: )UMAPHDBSCAN)cluster_configr   speaker_column)r   r   )osenvironfeaturesr&   cast_columnr   mapr   r   filterr   embed_datasetremove_columnsr   r(   printdo_clustersr   umap_paramshdbscan_paramsclustering_speaker_column	group_embdo_thisr	   group)r   r   r   original_srinitial_sizeremovedspeaker_makergrouperr   )r   r/   r   process,   sr   


	


z!SpeakerEmbeddingProcessor.processc                 C   sF   | j jg}| j jr|| j j | j jr!| j jjr!|| j jj |S )z
        Return the list of column names added by this processor.
        These must be passed through the tokenization step (preserve_columns).
        )r   r   r=   appendr@   rA   rB   grouped_embedding_column)r   colsr   r   r   get_preserved_columns   s   
z/SpeakerEmbeddingProcessor.get_preserved_columnsN)
__name__
__module____qualname____doc__r   r   strrI   r   rM   r   r   r   r   r
      s
    kr
   )r4   typingr   numpyr,   datasetsr   r   r   r   r   speaker_clusteringr   group_embeddingsr	   r
   r   r   r   r   <module>   s    