o
    oi                     @   sJ   d dl mZ d dlmZ d dlZd dlmZ d dlmZ G dd dZ	dS )    )defaultdict)ListN)Dataset)tqdmc                   @   sB   e Zd ZdZdefddZdedefddZdee fd	d
Z	dS )SpeakerEmbeddingGroupera  
    Computes per-speaker averaged embeddings and adds them as a new column.

    Groups all embedding vectors by a speaker column, averages them, then
    maps the averaged vector back to every sample of that speaker.

    Can be used:
    - Inside SpeakerEmbeddingProcessor (after embedding + clustering)
    - Independently when embeddings already exist in the dataset
      (add_speaker_emb: false, group_sp_emb.do_this: true)
    embedding_columnc                 C   s   || _ || _dS )z
        Args:
            settings:         GroupEmbeddingSettings instance.
            embedding_column: Name of the source embedding column to average
                              (e.g. 'wavlm_embedding').
        N)settingsr   )selfr   r    r
   M/home/ubuntu/kanitts-2-dataset-pipeline/utils/speaker_emb/group_embeddings.py__init__   s   
z SpeakerEmbeddingGrouper.__init__datasetreturnc                    sZ  | j j| j}| j jtd td d td| d td d tdt| d tt}t|dd	D ]'}| }|| }|d
u rJq;t	j
|t	jd}t	t	|r[q;|| | q;t|}td|  i  t| dd	D ]\}}	t	jt	j|	dddd}
|
  |< qx fdd}|j|dd dd}td d |S )a  
        Add a grouped (per-speaker averaged) embedding column to the dataset.

        Steps:
        1. Collect all embeddings per unique speaker value, skipping None/NaN.
        2. Average embeddings per speaker.
        3. Map the averaged vector back to every sample via dataset.map().

        Samples whose speaker has no valid embeddings at all receive None
        in the output column (they are not filtered out).

        Args:
            dataset: HuggingFace Dataset containing both the embedding column
                     and the group-by column.

        Returns:
            Dataset with the new grouped embedding column added.
        z(
[Group Embeddings] Starting grouping...z![Group Embeddings] Group by:    ''z![Group Embeddings] Source emb:  'z![Group Embeddings] Output col:  'z![Group Embeddings] Dataset size: z samplesz)[Group Embeddings] Collecting per speaker)descN)dtypez:[Group Embeddings] Unique speakers with valid embeddings: z[Group Embeddings] Averagingr   )axisc                    s   |  }  |d iS )N)get)samplespeakeraveraged_embeddings	group_colout_colr
   r   _add_grouped_embZ   s   z7SpeakerEmbeddingGrouper.group.<locals>._add_grouped_emb
   z[Group Embeddings] Adding 'z' column)num_procr   u   [Group Embeddings] Done — 'z' column added.
)r   group_by_column_namer   grouped_embedding_columnprintlenr   listr   nparrayfloat32anyisnanappenditemsmeanstacktolistmap)r	   r   emb_colspeaker_embeddingsr   r   	embeddingemb_arr
n_speakers
embeddingsavg_embr   r
   r   r   group    sD   
zSpeakerEmbeddingGrouper.groupc                 C   s
   | j jgS )z6Return the list of columns produced by this processor.)r   r   )r	   r
   r
   r   get_preserved_columnsg   s   
z-SpeakerEmbeddingGrouper.get_preserved_columnsN)
__name__
__module____qualname____doc__strr   r   r4   r   r5   r
   r
   r
   r   r   	   s
    
Gr   )
collectionsr   typingr   numpyr"   datasetsr   r   r   r
   r
   r
   r   <module>   s    