o
    Si                     @   s   d dl mZmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ G dd	 d	ejjjZd
e
ddfddZdS )    )CallableDictListSequenceUnionN)validate)CutSet)collate_audio)BatchIOPrecomputedFeatures)ifnonec                       s   e Zd ZdZde dddddfdeeegef  dede	e
e ef ded	ed
ededdf fddZdedeeejf fddZ  ZS )SpeechSynthesisDatasetab  
    The PyTorch Dataset for the speech synthesis task.
    Each item in this dataset is a dict of:

    .. code-block::

        {
            'audio': (B x NumSamples) float tensor
            'features': (B x NumFrames x NumFeatures) float tensor
            'audio_lens': (B, ) int tensor
            'features_lens': (B, ) int tensor
            'text': List[str] of len B  # when return_text=True
            'tokens': List[List[str]]  # when return_tokens=True
            'speakers': List[str] of len B  # when return_spk_ids=True
            'cut': List of Cuts  # when return_cuts=True
        }
    NTFcut_transformsfeature_input_strategyfeature_transformsreturn_textreturn_tokensreturn_spk_idsreturn_cutsreturnc                    sv   t    t|g | _|| _|| _|| _|| _|| _|d u r!g }nt	|t
s)|g}tdd |D s6J d|| _d S )Nc                 s   s    | ]}t |tV  qd S )N)
isinstancer   ).0	transform r   S/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/dataset/speech_synthesis.py	<genexpr>8   s    

z2SpeechSynthesisDataset.__init__.<locals>.<genexpr>z#Feature transforms must be Callable)super__init__r   r   r   r   r   r   r   r   r   allr   )selfr   r   r   r   r   r   r   	__class__r   r   r      s"   




zSpeechSynthesisDataset.__init__cutsc           
      C   s   t | | jD ]}||}qt|\}}| |\}}| jD ]}||}q||||d}| jr:dd |D }||d< | jrHdd |D }	|	|d< | jrTdd |D |d< | jr`d	d |D |d
< |S )N)audiofeatures
audio_lensfeatures_lensc                 S      g | ]}|j d  jqS r   )supervisionsnormalized_textr   cutr   r   r   
<listcomp>R       z6SpeechSynthesisDataset.__getitem__.<locals>.<listcomp>textc                 S   s   g | ]}|j qS r   )tokensr+   r   r   r   r-   V   s    r0   c                 S   r'   r(   )r)   speakerr+   r   r   r   r-   Z   r.   speakersc                 S   s   g | ]}|qS r   r   r+   r   r   r   r-   ]   s    r,   )	validate_for_ttsr   r	   r   r   r   r   r   r   )
r   r"   r   r#   r%   r$   r&   batchr/   r0   r   r   r   __getitem__=   s.   



z"SpeechSynthesisDataset.__getitem__)__name__
__module____qualname____doc__r   r   r   r   r
   r   r   boolr   r   strtorchTensorr5   __classcell__r   r   r    r   r      s6    	$r   r"   r   c                 C   s,   t |  | D ]}t|jdksJ dqd S )N   z4Only the Cuts with single supervision are supported.)r   lenr)   )r"   r,   r   r   r   r3   b   s   r3   )typingr   r   r   r   r   r<   lhotser   
lhotse.cutr   lhotse.dataset.collationr	   lhotse.dataset.input_strategiesr
   r   lhotse.utilsr   utilsdataDatasetr   r3   r   r   r   r   <module>   s    V