o
    i;                   	   @   s   d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlZddlZddlZddlZddlmZ ddlmZ ddlmZ d	d
 Zdd eejdd dd dd dd dd dZG dd deZdS )zIterable dataset module.    N)StringIO)Path)Callable
CollectionDictIteratorTupleUnion)IterableDataset)check_argument_types)ESPnetDatasetc                 C   s   t | }t|trRt|dksJ t|t|d tr+t|d tjr+|\}}|S t|d tr@t|d tjr@|\}}|S tdt	|d  dt	|d  t|tjs^J t	||}|S )N   r      zUnexpected type: z, )
kaldiioload_mat
isinstancetuplelenintnpndarrayRuntimeErrortype)inputretvalratearray r   R/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/train/iterable_dataset.py
load_kaldi   s   

$r   c                 C   s   t | d S )Nr   )	soundfilereadxr   r   r   <lambda>)   s    r$   c                 C      t jt| dt jddS Nr    ndmindtype	delimiterr   loadtxtr   longr"   r   r   r   r$   ,       c                 C   r%   Nr   ,r(   r,   r"   r   r   r   r$   /   s    c                 C   r%   r&   r   r-   r   float32r"   r   r   r   r$   0   r/   c                 C   r%   r0   r2   r"   r   r   r   r$   3   r/   c                 C   s   | S Nr   r"   r   r   r   r$   6   s    )sound	kaldi_arknpytext_intcsv_int
text_float	csv_floattextc                   @   s   e Zd ZdZ				ddeeeeef  deeeee	j
f geee	j
f f deded	ef
d
dZdefddZdeedf fddZdd Zdeeeeef eee	j
f f  fddZdS )IterableESPnetDatasetav  Pytorch Dataset class for ESPNet.

    Examples:
        >>> dataset = IterableESPnetDataset([('wav.scp', 'input', 'sound'),
        ...                                  ('token_int', 'output', 'text_int')],
        ...                                )
        >>> for uid, data in dataset:
        ...     data
        {'input': per_utt_array, 'output': per_utt_array}
    Nr3   r.   path_name_type_list
preprocessfloat_dtype	int_dtypekey_filec           
      C   s
  t  sJ t|dkrtdt|}|| _|| _|| _|| _i | _	g }g | _
|D ]/\}}}	|| j	v r<td| d||	f| j	|< |	tvrP||||	f q*| j
|||	f q*t|dkrjt||||d| _nd | _tt|d d jd rd| _d S d| _d S )	Nr   z91 or more elements are required for "path_name_type_list""z" is duplicated for data-key)r>   r?   r@   rA   utt2categoryTF)r   r   
ValueErrorcopydeepcopyr?   r@   rA   rB   
debug_infor>   r   
DATA_TYPESappendr   non_iterable_datasetr   parentexistsapply_utt2category)
selfr>   r?   r@   rA   rB   non_iterable_listpathname_typer   r   r   __init__F   s>   






zIterableESPnetDataset.__init__returnc                 C   s
   || j v S r4   )rH   )rO   rR   r   r   r   has_namez      
zIterableESPnetDataset.has_name.c                 C   s
   t | jS r4   )r   rH   )rO   r   r   r   names}   rW   zIterableESPnetDataset.namesc              	   C   sZ   | j j}|d7 }| j D ]\}\}}|d| d| d| d7 }q|d| j d7 }|S )N(z
  z: {"path": "z", "type": "z"}z
  preprocess: ))	__class____name__rH   itemsr?   )rO   _mesrR   rQ   rS   r   r   r   __repr__   s   zIterableESPnetDataset.__repr__c              
   c   s   | j d urdd t| j ddD }nt| jdkr+dd t| jd d ddD }nt| j}dd | jD }tjj	 }d}d}t
|d	D ]\}}|d ur[|d	 |j |jkr[qG	 g }g }|D ]E}	|d	7 }zt|	}
W n ty|   t| dw |
 jd	d}t|dkrtd|	 d| d|
 d|\}}|| || qbt
|D ]\}}||d krtd| d| qt|dks|d |krnq\i }t|| jD ]\}\}}}t| }||}|||< q| jd ur| j| \}}|| | jd ur| ||}|D ]C}|| }t|tjs$td| dt| d|jjdkr2|| j}n|jjdkr@|| j}nt d|j |||< q
||fV  qG|dkr]tdd S )Nc                 s   $    | ]}|  jd dd V  qdS r   maxsplitr   Nrstripsplit.0liner   r   r   	<genexpr>   
    
z1IterableESPnetDataset.__iter__.<locals>.<genexpr>utf-8encodingr   c                 s   r`   ra   rd   rg   r   r   r   rj      rk   c                 S   s   g | ]
}t |d  ddqS )r   rl   rm   )open)rh   lisr   r   r   
<listcomp>   s    z2IterableESPnetDataset.__iter__.<locals>.<listcomp>r   Tz is not found in the filesrb   r   z#This line doesn't include a space: z:Lz: rZ   z%Keys are mismatched. Text files (idx=z,) is not sorted or not having same keys at LzIAll values must be converted to np.ndarray object by preprocessing, but "z" is still .fizNot supported dtype: zNo iteration)!rB   ro   r   r>   iterrK   torchutilsdataget_worker_info	enumeratenum_workersidnextStopIterationr   re   rf   rJ   ziprI   updater?   r   r   r   r   r*   kindastyper@   rA   NotImplementedError)rO   uid_iterfilesworker_infolinenumcountuidkeysvaluesrs   ri   spskeyvaluek_idxkrx   rQ   rR   rS   funcr   _from_non_iterabler   r   r   __iter__   s   






zIterableESPnetDataset.__iter__)Nr3   r.   N)r\   
__module____qualname____doc__r   r   strr   r   r   r   rT   boolrV   rX   r_   r   r	   r   r   r   r   r   r   r=   :   s.    
40r=   )r   rF   ior   pathlibr   typingr   r   r   r   r   r	   r   numpyr   r    rv   torch.utils.data.datasetr
   	typeguardr   espnet2.train.datasetr   r   loadrI   r=   r   r   r   r   <module>   s.     