o
    oi                     @   s   d dl Z d dlmZmZmZmZ d dlmZmZ eG dd dZ	eG dd dZ
eG dd	 d	ZeG d
d dZeG dd dZG dd dZdS )    N)DictListAnyOptional)	dataclassfieldc                   @   s   e Zd ZU dZeed< eed< eed< ee ed< dZeee	eef   ed< dZ
eed	< dZee ed
< edefddZde	eef fddZdS )DatasetConfigz"Configuration for a single datasetnametext_column_nameaudio_column_namespeaker_column_nameNadd_constanttrainsplitsub_namereturnc                 C   s   | j dd S )z/Extract dataset prefix from name (part after /)/)r	   r   self r   ?/home/ubuntu/kanitts-2-dataset-pipeline/utils/config_manager.pydataset_prefix   s   zDatasetConfig.dataset_prefixc                 C   s   | j si S dd | j D S )z$Get constant columns as a dictionaryc                 S   s   i | ]	}|d  |d qS )keyvaluer   ).0itemr   r   r   
<dictcomp>       z6DatasetConfig.get_constant_columns.<locals>.<dictcomp>)r   r   r   r   r   get_constant_columns   s   z"DatasetConfig.get_constant_columns)__name__
__module____qualname____doc__str__annotations__r   r   r   r   r   r   propertyr   r   r   r   r   r   r      s   
 r   c                   @   sV   e Zd ZU dZeed< eed< eed< eed< eed< eed< eed< d	Zeed
< dS )BaseSettingszBase pipeline settingsaudio_codecnum_readersqsizeOUT_DIR
gzip_levelbuffer_sizelines_per_file   load_dataset_num_procN)r    r!   r"   r#   r$   r%   intr0   r   r   r   r   r'      s   
 r'   c                   @   s*   e Zd ZU dZee ed< ee ed< dS )SaveSettingsz&Settings for saving/uploading datasetslocal	hf_uploadN)r    r!   r"   r#   r   r$   r%   r   r   r   r   r2   *   s   
 r2   c                   @   s*   e Zd ZU dZeed< eed< eed< dS )GroupEmbeddingSettingsz5Settings for the per-speaker averaged embedding step.do_thisgroup_by_column_namegrouped_embedding_columnN)r    r!   r"   r#   boolr%   r$   r   r   r   r   r5   1   s
   
 r5   c                   @   s~   e Zd ZU dZeed< eed< eed< eed< eed< eed< eed< eed	< e	ed
< e	ed< dZ
eed< dZee ed< dS )SpeakerEmbeddingSettingsz1Settings for the speaker embedding pipeline step.add_speaker_emb
model_nameembedding_column	target_srmax_audio_sec
batch_sizeuse_multiprocessingdo_clustersumap_paramshdbscan_params
speaker_idclustering_speaker_columnN	group_emb)r    r!   r"   r#   r9   r%   r$   r1   floatdictrF   rG   r   r5   r   r   r   r   r:   9   s   
 r:   c                   @   sf   e Zd ZdZddefddZddd	Zdee fd
dZ	de
fddZdefddZdefddZdS )ConfigManagerz,Manages configuration loading and validationconfig.yamlconfig_pathc                 C   s  t |d}t|| _W d    n1 sw   Y  td&i | jd | _td&i | jd | _dd | jd D | _| j	di }|	di }t
|	d	d
|	dd|	ddd}t|	dd
|	di 	dd|	di 	dd|	di 	dd|	di 	dd|	di 	dd|	di 	dd|	d i 	d!d
|	d i 	d"i |	d i 	d#i |	d i 	d$d|d%| _d S )'Nrbase_settingssave_settingsc                 S   s   g | ]	}t d i |qS )r   )r   )r   dsr   r   r   
<listcomp>T   r   z*ConfigManager.__init__.<locals>.<listcomp>hf_datasetsspeaker_embedding_settingsgroup_sp_embr6   Fr7   rE   r8   grouped_sp_emb)r6   r7   r8   r;   modelr	    r=   wavlm_embeddingaudiotarget_sample_ratei>  max_duration_secg      >@
processingr@      rA   T
clusteringrB   UMAPHDBSCANspeaker_column)r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rF   rG   r   )openyaml	safe_loadconfigr'   rN   r2   rO   datasetsgetr5   r:   rS   )r   rL   fspgrprG   r   r   r   __init__N   s6   



zConfigManager.__init__r   Nc                 C   s   | j stdt }g }| j D ]}t|  }||j|f || q|rA|D ]\}}|| }|r@td| d| dq+td dS )z
        Validate that all datasets have matching additional columns.
        This prevents conflicts when merging the final dataset.
        z&No datasets specified in configurationz	Dataset 'z' is missing constant columns: zF. All datasets must have the same constant columns for proper merging.uL   ✅ Dataset validation passed: All datasets have matching additional columnsN)	rf   
ValueErrorsetr   keysappendr	   updateprint)r   all_constant_keysdataset_constant_keysrP   constant_keysds_namern   missing_keysr   r   r   validate_datasetsl   s"   
zConfigManager.validate_datasetsc                 C      | j S )z"Get list of dataset configurations)rf   r   r   r   r   get_datasets      zConfigManager.get_datasetsc                 C   rx   )zGet base pipeline settings)rN   r   r   r   r   get_base_settings   rz   zConfigManager.get_base_settingsc                 C   rx   )zGet save/upload settings)rO   r   r   r   r   get_save_settings   rz   zConfigManager.get_save_settingsc                 C   rx   )zGet speaker embedding settings)rS   r   r   r   r   get_speaker_embedding_settings   rz   z,ConfigManager.get_speaker_embedding_settings)rK   )r   N)r    r!   r"   r#   r$   rk   rw   r   r   ry   r'   r{   r2   r|   r:   r}   r   r   r   r   rJ   K   s    
rJ   )rc   typingr   r   r   r   dataclassesr   r   r   r'   r2   r5   r:   rJ   r   r   r   r   <module>   s    