o
    ziw                     @   sN   d dl Z d dlmZmZmZ d dlmZmZmZ ddl	m
Z
 G dd dZdS )    N)load_datasetload_from_diskAudio)DictAnyList   )DatasetConfigc                   @   sb   e Zd ZdZddedefddZdded	d
fddZdd Zde	e
ef d	e	e
ef fddZd
S )DatasetProcessorz9Handles loading and preprocessing of HuggingFace datasets"V  dataset_configsample_ratec                 C   s   || _ || _d | _g | _d S )N)configr   datasetpreserve_columns)selfr   r    r   L/home/ubuntu/kanitts-2-dataset-pipeline/utils/nanocodec/dataset_processor.py__init__
   s   
zDatasetProcessor.__init__   num_procreturnNc                 C   s   | j j }| j jr|d| j j d7 }|d| j j d7 }td|  tj| j jrIt| j j| _	| j j
| j	jv rH| j	| j j
t| j| _	nt| j j| j j|| j jddd| j j
t| j| _	td	t| j	 d
|  dS )z+Load dataset from HuggingFace or local diskz ()z []u   📦 Loading dataset: 	no_checksT)r   splitverification_modetrust_remote_codeu     ✅ Loaded z samples from N)r   namesub_namer   printospathisdirr   r   audio_column_namecolumn_namescast_columnr   r   r   len)r   r   dataset_descr   r   r   r      s.   
	zDatasetProcessor.load_datasetc                 C   s   | j du r	td| j S )zGet the loaded datasetNz.Dataset not loaded. Call load_dataset() first.)r   
ValueError)r   r   r   r   get_dataset+   s   
zDatasetProcessor.get_datasetitemc                 C   sn   || j j || j j d d}| j jr|| j j |d< | j  }|| | jD ]}||v r4|| ||< q(|S )z
        Prepare a single item for processing.
        Extracts text, audio, speaker (if specified), and adds constant fields.
        array)textwavespeaker)r   text_column_namer$   speaker_column_nameget_constant_columnsupdater   )r   r+   preparedconstant_colscolr   r   r   prepare_item1   s   



zDatasetProcessor.prepare_item)r   )r   )__name__
__module____qualname____doc__r	   intr   r   r*   r   strr   r7   r   r   r   r   r
      s    &r
   )r!   datasetsr   r   r   typingr   r   r   config_managerr	   r
   r   r   r   r   <module>   s
    