o
    i                     @   sJ   d Z ddlZddlZddlmZmZ G dd deZdd ZdddZdS )z-Dataset for EnCodec token-based TTS training.    N)Dataset
DataLoaderc                   @   s&   e Zd Zd
ddZdd Zdd Zd	S )TokenTTSDataset     c                    sj   t |ddd}t|| _W d    n1 sw   Y   fdd| jD | _tdt| j d d S )Nrutf-8)encodingc                    s   g | ]
}|d   kr|qS )n_tokens ).0e
max_tokensr   #/home/ubuntu/lewm-tts/dataset_v6.py
<listcomp>   s    z,TokenTTSDataset.__init__.<locals>.<listcomp>z	Dataset: z samples loaded)openjsonloadmanifestprintlen)selfmanifest_pathr   max_text_lenfr   r   r   __init__	   s
   zTokenTTSDataset.__init__c                 C   s
   t | jS )N)r   r   )r   r   r   r   __len__   s   
zTokenTTSDataset.__len__c                 C   sb   | j | }tj|d dd }t|d dd d }tj|tjd}|||jd t|d	S )
N
token_pathT)weights_onlytextr   r   dtyper   )tokenstext_tokensr
   text_len)	r   torchr   longlistencodetensorshaper   )r   idxentryr#   r    r$   r   r   r   __getitem__   s   
zTokenTTSDataset.__getitem__N)r   r   )__name__
__module____qualname__r   r   r.   r   r   r   r   r      s    
r   c                 C   s   t dd | D }t dd | D }t| }tj||tjd}tj||tjd}tj||tjd}tj||tjd}t| D ]0\}}	|	d }
|	d }|	d ||d |
f< |	d ||d |f< d	||d |
f< d	||d |f< q>||||d
S )Nc                 s       | ]}|d  V  qdS )r
   Nr   r   sr   r   r   	<genexpr>        zcollate_fn.<locals>.<genexpr>c                 s   r2   )r%   Nr   r3   r   r   r   r5   !   r6   r!   r
   r%   r#   r$   F)r#   r$   
token_mask	text_mask)maxr   r&   zerosr'   onesbool	enumerate)batchmax_tokmax_textB
tok_paddedtext_paddedtok_maskr8   ir4   ttxr   r   r   
collate_fn   s&   rH         Tc              	   C   s&   t | }t||||tddd}||fS )NT)
batch_sizeshufflenum_workersrH   
pin_memory	drop_last)r   r   rH   )r   rK   rM   rL   datasetloaderr   r   r   build_dataloader9   s   rR   )rI   rJ   T)	__doc__r   r&   torch.utils.datar   r   r   rH   rR   r   r   r   r   <module>   s    