o
    wi^E                     @   sx  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	 d dl
Z
d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZmZ d d
lmZm Z  d dl!m"Z" d dl!m#Z$ d dl%m&Z& d dl'm(Z( d)g ddd e* D  Z+eG dd dZ,eG dd dZ-dee. fddZ/de0de,de1de1fddZ2e&G d d! d!eZ3G d"d# d#e Z4dS )$    N)	dataclass)Path)DictListOptional)expand_sharded_filepaths)available_formats)read_manifest)FeatureProcessor)filter_dataset_by_durationget_weighted_sampler
load_audiosample_audiostack_tensors)DatasetIterableDataset)logging)
webdataset)experimental)webdataset_split_by_workers;)wavmp3flacopusc                 C   s   g | ]}|  qS  )lower).0fmtr   r   f/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/data/vocoder_dataset.py
<listcomp>*   s    r    c                   @   s>   e Zd ZU eed< eed< dZeed< dZee	e
  ed< dS )DatasetMetamanifest_path	audio_dirg      ?sample_weightNaudio_tar_filepaths)__name__
__module____qualname__r   __annotations__r$   floatr%   r   r   strr   r   r   r   r!   -   s
   
 r!   c                   @   s&   e Zd ZU eed< eed< eed< dS )DatasetSampledataset_namemanifest_entryr#   N)r&   r'   r(   r+   r)   dictr   r   r   r   r   r,   5   s   
 r,   batchc           
      C   s   g }g }g }g }| D ]}| |d  | |d  | |d  | |d  q
t|}t|  }t||gd}||||d}	|	S )Nr-   audio_filepathaudio	audio_len)max_lens)dataset_namesaudio_filepathsr2   
audio_lens)appendtorch	IntTensorintmaxitemr   )
r0   dataset_name_listaudio_filepath_list
audio_listaudio_len_listexamplebatch_audio_lenaudio_max_lenbatch_audio
batch_dictr   r   r   audio_collate_fn<   s$   
rG   r-   datasetmin_durationmax_durationc                 C   s   t |j}t|||d\}}}t|  tdt|  tdt|  td|dd td|dd g }g }	|D ]}
t| |
t|jd}|	| |		|j
 qB||	fS )	N)entriesrI   rJ   zOriginal # of files: zFiltered # of files: zOriginal duration: z.2fz hourszFiltered duration: )r-   r.   r#   )r	   r"   r   r   infolenr,   r   r#   r8   r$   )r-   rH   rI   rJ   rK   filtered_entriestotal_hoursfiltered_hourssamplessample_weightsentrysampler   r   r   preprocess_manifestW   s    


rU   c                       s   e Zd ZdZ							ddededee dee deeeef  d	ee	 d
ee	 dee	 de
f fddZdededeejjj fddZdd Zdd Zdd Z  ZS )VocoderDataseta  
    Class for processing and loading Vocoder training examples.

    Args:
        dataset_meta: Dict of dataset names (string) to dataset metadata.
        sample_rate: Sample rate to load audio as. If the audio is stored at a different sample rate, then it will
            be resampled.
        n_samples: Optional int, if provided then n_samples samples will be randomly sampled from the full
            audio file.
        weighted_sampling_steps_per_epoch: Optional int, If provided, then data will be sampled (with replacement) based on
            the sample weights provided in the dataset metadata. If None, then sample weights will be ignored.
        feature_processors: Optional, list of feature processors to run on training examples.
        min_duration: Optional float, if provided audio files in the training manifest shorter than 'min_duration'
            will be ignored.
        max_duration: Optional float, if provided audio files in the training manifest longer than 'max_duration'
            will be ignored.
        trunc_duration: Optional int, if provided audio will be truncated to at most 'trunc_duration' seconds.
        volume_norm: Whether to apply volume normalization to loaded audio.
    NFdataset_metasample_rate	n_samples!weighted_sampling_steps_per_epochfeature_processorsrI   rJ   trunc_durationvolume_normc
                    s   t    || _|| _|| _|	| _|| _d| _|r+t	d|
   t| | _ng | _g | _g | _| D ]#\}
}tdi |}t|
|||d\}}|  j|7  _|  j|7  _q8d S )NFFound feature processors r-   rH   rI   rJ   r   )super__init__rX   rY   r\   r]   rZ   load_precomputed_melr   rL   keyslistvaluesr[   data_samplesrR   itemsr!   rU   )selfrW   rX   rY   rZ   r[   rI   rJ   r\   r]   r-   dataset_inforH   rQ   weights	__class__r   r   ra      s0   

zVocoderDataset.__init__
batch_size
world_sizereturnc                 C   s"   | j sd S t| j||| j d}|S )N)rR   rm   rn   	num_steps)rZ   r   rR   )rh   rm   rn   samplerr   r   r   get_sampler   s   zVocoderDataset.get_samplerc                 C   
   t | jS N)rM   rf   rh   r   r   r   __len__      
zVocoderDataset.__len__c           
      C   s   | j | }| jrt|j|j| j| j| jd\}}}nt|j|j| j| j| jd\}}}t	j
|t	jd}|jd }|j|||d}| jD ]}	|	| qC|S )N)r.   r#   rX   rY   r]   )r.   r#   rX   rJ   r]   )dtyper   r-   r1   r2   r3   )rf   rY   r   r.   r#   rX   r]   r   r\   r9   tensorfloat32shaper-   r[   process)
rh   indexdataaudio_array_audio_filepath_relr2   r3   rB   	processorr   r   r   __getitem__   s4   


zVocoderDataset.__getitem__c                 C      t |S rt   rG   rh   r0   r   r   r   
collate_fn      zVocoderDataset.collate_fn)NNNNNNF)r&   r'   r(   __doc__r   r;   r   r+   r
   r*   boolra   r9   utilsr   Samplerrr   rv   r   r   __classcell__r   r   rk   r   rV   r   s@    	
 ("rV   c                       s   e Zd ZdZ									d!deded	ee d
ededee dee deeee	f  dededef fddZ
dd Zdd ZdedefddZdd Zdd Zdd  Z  ZS )"TarredVocoderDatasetaH  
    A similar Dataset to the VocoderDataset, but loads tarred audio files.

    Accepts a single comma-separated JSON manifest file (in the same style as for the VocoderDataset),
    as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
    contain the information for one audio file, and duration of audio.

    Valid formats for the audio_tar_filepaths argument include:
    (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
    (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].

    See the WebDataset documentation for more information about accepted data and input formats.

    If using multiple processes the number of shards should be divisible by the number of workers to ensure an
    even split among workers. If it is not divisible, logging will give a warning but training will proceed.
    In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
    is applied. We currently do not check for this, but your program may hang if the shards are uneven!

    Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
    after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.

    Args:
        dataset_meta: Dict of dataset names (string) to dataset metadata.
        audio_tar_filepaths: Either a list of audio tarball filepaths, or a
            string (can be brace-expandable).
        sample_rate: Sample rate to load audio as. If the audio is stored at a different sample rate, then it will
            be resampled.
        n_samples: Optional int, if provided then n_samples samples will be randomly sampled from the full
            audio file.
        shuffle_n (int): How many samples to look ahead and load to be shuffled.
            See WebDataset documentation for more details.
            Defaults to 0.
        min_duration: Optional float, if provided audio files in the training manifest shorter than 'min_duration'
            will be ignored.
        max_duration: Optional float, if provided audio files in the training manifest longer than 'max_duration'
            will be ignored.
        trunc_duration: Optional int, if provided audio will be truncated to at most 'trunc_duration' seconds.
        feature_processors: Optional, list of feature processors to run on training examples.
        shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
            -   `scatter`: The default shard strategy applied by WebDataset, where each node gets
                a unique set of shards, which are permanently pre-allocated and never changed at runtime.
            -   `replicate`: Optional shard strategy, where each node gets all of the set of shards
                available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
                The benefit of replication is that it allows each node to sample data points from the entire
                dataset independently of other nodes, and reduces dependence on value of `shuffle_n`.

                .. warning::
                    Replicated strategy allows every node to sample the entire set of available tarfiles,
                    and therefore more than one node may sample the same tarfile, and even sample the same
                    data points! As such, there is no assured guarantee that all samples in the dataset will be
                    sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
                    occasions (when the number of shards is not divisible with ``world_size``), will not sample
                    the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
                    or test datasets.
        global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
        world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
    Nr   皙?scatter   rW   rX   rY   	shuffle_nrI   rJ   r\   r[   shard_strategyglobal_rankrn   c                    s  t    t|dkrtd|  d || _|| _|r't|| j | _	nd | _	|r>t
d|   t| | _ng | _g | _g | _| D ]'\}}|j}|  j|g7  _tdi |}t||||d\}}|  j|7  _qKi | _| jD ](}tjtj|jd d }|| jvr|| j|< qytd| d|jd  t
d	|  t||
||	d
}ttj|dtt|t  tj!t"ddt#dd| j$t%| j&| _'d S )Nr   z
Arguments zA does not support for TarredVocoderDataset, they will be ignored.r^   r_   r1   zDuplicate file_id z found in manifest zworld size: )sharded_filepathsr   rn   r   )urls__key__)r2   keyr2   r   r   )(r`   ra   rM   r   warningrc   rX   rY   r;   trunc_samplesrL   rd   re   r[   rf   r%   rg   r!   rU   file_id_to_sample_mapospathsplitextbasenamer.   
ValueErrorr   wdsDataPipelineSimpleShardListr   shuffletarfile_to_samplesrenameVALID_FILE_FORMATSto_tuple_filtermap_build_sample_dataset)rh   rW   rX   rY   r   rI   rJ   r\   r[   r   r   rn   kwargsr-   ri   r%   rH   rQ   r   rT   file_idrk   r   r   ra      sh   







zTarredVocoderDataset.__init__c                    s   G  fddd}|| j S )Nc                       s(   e Zd Z fddZdd Zdd ZdS )z6TarredVocoderDataset._filter.<locals>.FilteredIteratorc                    s    | _ || _d S rt   )iteratorr   )rh   r   r   r   r   ra   q  s   
z?TarredVocoderDataset._filter.<locals>.FilteredIterator.__init__c                 S   s   | S rt   r   ru   r   r   r   __iter__u  s   z?TarredVocoderDataset._filter.<locals>.FilteredIterator.__iter__c                 S   s<   	 t | j\}}tjtj|d }|| jv r||fS q)NTr   )nextr   r   r   r   r   r   )rh   audio_bytesaudio_filenamer   r   r   r   __next__x  s   
z?TarredVocoderDataset._filter.<locals>.FilteredIterator.__next__N)r&   r'   r(   ra   r   r   r   r   r   r   FilteredIteratorp  s    r   )r   )rh   r   r   r   r   r   r   o  s   
zTarredVocoderDataset._filterc                 C   s"  |\}}t jt j|d }| j| }tjt|dd\}}|| j	kr=t
d| d| j	 d tjj||| j	d}t|}| jrj|jd }|| jkrctd|| j d}	||	|	| j  }n|d | j }| jrt|d | j }t|jd }
|j|||
d	}| jD ]}|| q|S )
Nr   r{   )filerx   zSample rate of z& does not match target sample rate of z. Resampling audio.)orig_sr	target_sr)   ry   )r   r   r   r   r   sfreadioBytesIOrX   r   r   librosacoreresampler9   
from_numpyrY   r|   randintr   rz   r-   r[   r}   )rh   tupr   r   r   r   r   sr	len_audiostartr3   rB   r   r   r   r   r     s6   





z"TarredVocoderDataset._build_samplerm   c                 C   s   dS )zH
        Currently sampler is not supported for tarred dataset.
        Nr   )rh   rm   rn   r   r   r   rr     s   z TarredVocoderDataset.get_samplerc                 C   r   rt   r   r   r   r   r   r     r   zTarredVocoderDataset.collate_fnc                 C   s
   | j  S rt   )r   r   ru   r   r   r   r     rw   zTarredVocoderDataset.__iter__c                 C   rs   rt   )rM   r   ru   r   r   r   rv     rw   zTarredVocoderDataset.__len__)	Nr   r   NNNr   r   r   )r&   r'   r(   r   r   r;   r   r*   r+   r
   ra   r   r   rr   r   r   rv   r   r   r   rk   r   r      sP    >	
O&r   )5r   r   dataclassesr   pathlibr   typingr   r   r   r   	soundfiler   torch.utils.datar9   'nemo.collections.asr.data.audio_to_textr   0nemo.collections.asr.parts.preprocessing.segmentr   valid_sf_formats/nemo.collections.asr.parts.utils.manifest_utilsr	   ;nemo.collections.tts.parts.preprocessing.feature_processorsr
   2nemo.collections.tts.parts.utils.tts_dataset_utilsr   r   r   r   r   nemo.core.classesr   r   
nemo.utilsr   r   r   nemo.utils.decoratorsr   nemo.utils.distributedr   joinrc   r   r!   r,   r/   rG   r+   r*   rU   rV   r   r   r   r   r   <module>   sH    
r