o
    }oin                     @   s  d dl Z d dlZd dlmZmZmZmZ d dlZd dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZ d d	lmZ d d
lmZ d dlm Z  d!g ddd e" D  Z#dej$de%de%dej$fddZ&dd Z'dd Z(dd Z)dd Z*dd Z+G d d! d!eZ,G d"d# d#e,Z-G d$d% d%e,Z.G d&d' d'eZ/G d(d) d)e/Z0G d*d+ d+e/Z1G d,d- d-eZ2G d.d/ d/eZ3G d0d1 d1e.Z4dS )2    N)DictListOptionalUnion)cache_datastore_manifestsexpand_sharded_filepaths)WaveformFeaturizer)available_formats)collections)DatasetIterableDataset)AudioSignal
LabelsTypeLengthsType
NeuralTypeRegressionValuesType)logging)
webdataset)webdataset_split_by_workers;)wavmp3flacopusc                 C   s   g | ]}|  qS  )lower).0fmtr   r   \/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/data/audio_to_label.py
<listcomp>       r   signalsig_lenrequired_lengthreturnc                 C   sl   t g }t|| }t|| }t g }t || g }|dkr2| | d }t ||f} | S |} | S )a6  repeat signal to make short signal to have required_length
    Args:
        signal (Tensor): input signal
        sig_len (int): length of input signal
        required_length (int): length of generated signal
    Returns:
        signal (Tensor): generated signal of required_length by repeating itself.
    r   N)torchtensorintcat)r!   r"   r#   subrepeatremrep_sigr   r   r   repeat_signal"   s   
	
r-   c                 C   s   | |    }||   S )zUnormalize signal
    Args:
        signal(FloatTensor): signal to be normalized.
    )meanabsmax)r!   signal_minusmeanr   r   r   	normalize8   s   r2   c                 C   s6   t  }| D ]}|dd }||dd ||< q|S )a   Count number of wav files in Dict manifest_file_id. Use for _TarredAudioToLabelDataset.
    Args:
        manifest_file_id (Dict): Dict of files and their corresponding id. {'A-sub0' : 1, ..., 'S-sub10':100}
    Returns:
        count (Dict): Dict of wav files {'A' : 2, ..., 'S':10}
    -subr      )dictsplitget)manifest_file_idcountiaudio_filenamer   r   r   count_occurenceA   s
   r<   c                 C   s  t |  \}}}}d}|d du}|rt| }t| }g g }}	| D ]@\}
}}}|rH| }||k rCd|| f}tjj|
|}
||
 | }||k r`d|| f}tjjj|||d}|	| q%|rst|}t|}nd\}}t|	}	t|}|||	|fS )ar  collate batch of audio sig, audio len, tokens, tokens len
    Args:
        batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
               LongTensor):  A tuple of tuples of signal, signal lengths,
               encoded tokens, and encoded tokens length.  This collate func
               assumes the signals are 1d torch tensors (i.e. mono audio).
    r   N)valueNN)	zipr0   itemr%   nn
functionalpadappendstack)batchpad_id_audio_lengthstokens_lengthsmax_audio_len	has_audiomax_tokens_lenaudio_signaltokenssigr"   tokens_itokens_i_lenrC   r   r   r   _speech_collate_fnO   s4   




rS   c                 C   s  t | \}}}}|d du}tt|}g g g }}}	|D ]M\}
}}}|rf| }|| }|dk rY|| }|| }|dkrE|
| d ntg }t||
g }t||f}
|	t| ||
 || q|ryt|}t|	}nd\}}t|}t|}||||fS )ai  collate batch of audio sig, audio len, tokens, tokens len
    Args:
        batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
            LongTensor):  A tuple of tuples of signal, signal lengths,
            encoded tokens, and encoded tokens length.  This collate func
            assumes the signals are 1d torch tensors (i.e. mono audio).
    r   Nr>   )	r?   r'   r0   r@   r%   r&   r(   rD   rE   )selfrF   rH   rI   rJ   rL   fixed_lengthrN   rO   new_audio_lengthsrP   r"   rQ   
chunck_lenr*   r+   r)   r,   r   r   r   _fixed_seq_collate_fnw   s0    



rX   c                 C   s~  t | jj| j }t| \}}}}t t|t|}t | jj| j }|d du}g g g g f\}}	}
}|d }||d  }|D ]^\}}}}| jrLt	|}t
|}t
|}t
|||f}||7 }|rt
j|| |dd}t|D ]}|| }|| }||| }|| qr|	| |
|g|  ||g|  q?|rt
|}t
|}nd\}}t
|
}
t
|	}|||
|fS )a  collate batch of audio sig, audio len, tokens, tokens len
    Args:
        batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
            LongTensor):  A tuple of tuples of signal, signal lengths,
            encoded tokens, and encoded tokens length.  This collate func
            assumes the signals are 1d torch tensors (i.e. mono audio).
            batch size equals to 1.
    r   N   trunc)rounding_moder>   )r'   
featurizersample_ratewindow_length_in_secr?   minr0   shift_length_in_secnormalize_audior2   r%   zerosr(   divrangerD   extendrE   r&   )rT   rF   slice_lengthrH   rI   rJ   shiftrL   rN   
num_slicesrO   append_len_startappend_len_endrP   r"   rQ   startendslicesslice_id	start_idxend_idxr!   r   r   r   _vad_frame_seq_collate_fn   sB   	





rq   c                       s   e Zd ZdZedeeeef  fddZ	dddddddd	e
eee f d
ee dee dee dede
eeee f dedee f fddZdd Zdd Z  ZS )_AudioLabelDataseta  
    Dataset that loads tensors via a json file containing paths to audio files,
    labels, and durations and offsets(in seconds). Each new line is a
    different sample. Example below:
    and their target labels. JSON files should be of the following format::
        {"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label": target_label_0, "offset": offset_in_sec_0}
        ...
        {"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label": target_label_n, "offset": offset_in_sec_n}
    Args:
        manifest_filepath (Union[str, List[str]]): Dataset parameter. Path to JSON containing data.
        labels (list): Dataset parameter. List of target classes that can be output by the speaker recognition model.
        featurizer
        min_duration (float): Dataset parameter. All training files which have a duration less than min_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to 0.1.
        max_duration (float): Dataset parameter.
            All training files which have a duration more than max_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to None.
        trim (bool): Whether to use trim silence from beginning and end of audio signal using librosa.effects.trim().
            Defaults to False.
        channel selector (Union[str, int, List[int]]): string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
            of integers denoting a subset of channels. Channel selector is using zero-based indexing.
            If set to `None`, the original signal will be used.
    r$   c                 C   s   t d| durt| drt| jdnt t tdt d}| jr7|t tdt t tdt d |S |t tdt	 t tdt d |S )	+Returns definitions of module output ports.BTN_sample_ratefreqru   rN   a_sig_lengthtargetstargets_lengthlabellabel_length
r   hasattrr   rw   tupler   is_regression_taskupdater   r   rT   output_typesr   r   r   r      s*   z_AudioLabelDataset.output_types皙?NF)min_durationmax_durationtrimchannel_selectorr   cal_labels_occurrencemanifest_filepathlabelsr   r   r   r   r   r   c       	            sH  t    t|tr|d}t|dd tj|||||	d _| _	| _
| _| _|s|r2|n jj _ jd urAt jnd _i i  _ _i g  _ _t jD ]\}
}|
 j|< | j|
< |	rp jj|  j|
< qW|	r fddt jD  _tt jd d D ]}td	| j|  qd S g  _d _d S )
N,T)manifest_filepathscache_audio)manifests_filesr   r   r   r   r4   c                       g | ]} j | qS r   )id2occurrence)r   krT   r   r   r   =      z/_AudioLabelDataset.__init__.<locals>.<listcomp>   $ label id {} and its mapped label {})super__init__
isinstancestrr6   r   r
   ASRSpeechLabel
collectionr\   r   r   r   uniq_labelsr   lennum_classeslabel2idid2labelr   labels_occurrence	enumeratesortedrd   r   debugformat)rT   r   r   r\   r   r   r   r   r   r   label_idr   idx	__class__r   r   r     sB   





z_AudioLabelDataset.__init__c                 C   
   t | jS Nr   r   r   r   r   r   __len__F     
z_AudioLabelDataset.__len__c           	      C   s   | j | }|j}|d u rd}| jj|j||j| j| jd}|t	|j
d  }}| js9t	| j|j  }nt	|j }t	d }||||fS )Nr   )offsetdurationr   r   r4   )r   r   r\   process
audio_filer   r   r   r%   r&   shapelongr   r   r   float	rT   indexsampler   featuresfflttlr   r   r   __getitem__I  s"   
z_AudioLabelDataset.__getitem__)__name__
__module____qualname____doc__propertyr   r   r   r   r   r   r   r   boolr'   r   r   r   __classcell__r   r   r   r   rr      s:    '	
4rr   c                   @      e Zd ZdZdd ZdS )!AudioToClassificationLabelDatasetaZ  
    Dataset that loads tensors via a json file containing paths to audio
    files, command class, and durations (in seconds). Each new line is a
    different sample. Example below:
    {"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label":         target_label_0, "offset": offset_in_sec_0}
    ...
    {"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label":         target_label_n, "offset": offset_in_sec_n}
    Args:
        manifest_filepath (Union[str, List[str]]): Path to manifest json as described above. Can
            be comma-separated paths.
        labels (Optional[list]): String containing all the possible labels to map to
            if None then automatically picks from ASRSpeechLabel collection.
        featurizer: Initialized featurizer class that converts paths of
            audio to feature tensors
        max_duration: If audio exceeds this length, do not include in dataset
        min_duration: If audio is less than this length, do not include
            in dataset
        trim: Boolean flag whether to trim the audio
    c                 C      t |ddS Nr   )rG   rS   rT   rF   r   r   r   _collate_fn|     z-AudioToClassificationLabelDataset._collate_fnNr   r   r   r   r   r   r   r   r   r   e  s    r   c                       s   e Zd ZdZdddddddddd	deeee f d	ee d
ee dee de	deeee
ee
 f  dee dee de	de	dee	 f fddZdd Zdd Z  ZS )AudioToSpeechLabelDataseta4	  
    Dataset that loads tensors via a json file containing paths to audio
    files, command class, and durations (in seconds). Each new line is a
    different sample. Example below:
    {"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label":         target_label_0, "offset": offset_in_sec_0}
    ...
    {"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label":         target_label_n, "offset": offset_in_sec_n}
    Args:
        manifest_filepath (Union[str, List[str]]): Path to manifest json as described above. Can
            be comma-separated paths.
        labels (Optional[list]): String containing all the possible labels to map to
            if None then automatically picks from ASRSpeechLabel collection.
        min_duration (float): Dataset parameter.
            All training files which have a duration less than min_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to 0.1.
        max_duration (float): Dataset parameter.
            All training files which have a duration more than max_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to None.
        trim (bool): Whether to use trim silence from beginning and end
            of audio signal using librosa.effects.trim().
            Defaults to False.
        channel selector (Union[str, int, List[int]]): string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
            of integers denoting a subset of channels. Channel selector is using zero-based indexing.
            If set to `None`, the original signal will be used.
        window_length_in_sec (float): length of window/slice (in seconds)
            Use this for speaker recognition and VAD tasks.
        shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task in a batch
            Use this for VAD task during inference.
        normalize_audio (bool): Whether to normalize audio signal.
            Defaults to False.
        is_regression_task (bool): Whether the dataset is for a regression task instead of classification.
            Defaults to False.
        cal_labels_occurrence (bool): Whether to calculate occurrence of labels
            Defaults to False.
    r   NF   r4   )	r   r   r   r   r^   r`   ra   r   r   r   r   r   r   r   r   r^   r`   ra   r   r   c                   sX   || _ |	| _|
| _td| j  td| j t j|||||||||d	 d S )N5Window/slice length considered for collate func is {}.Shift length considered for collate func is {})	r   r   r\   r   r   r   r   r   r   )r^   r`   ra   r   r   r   r   r   )rT   r   r   r\   r   r   r   r   r^   r`   ra   r   r   r   r   r   r     s    
z"AudioToSpeechLabelDataset.__init__c                 C   
   t | |S r   rX   r   r   r   r   fixed_seq_collate_fn  r   z.AudioToSpeechLabelDataset.fixed_seq_collate_fnc                 C   r   r   rq   r   r   r   r   vad_frame_seq_collate_fn  r   z2AudioToSpeechLabelDataset.vad_frame_seq_collate_fn)r   r   r   r   r   r   r   r   r   r   r'   r   r   r   r   r   r   r   r   r     sH    .	
#r   c                   @   s   e Zd ZdZddddddddddeeee f d	eeee f d
ee dedee	 dee	 de
dededede
fddZdd Zdd Zdd Zdd ZdS )_TarredAudioLabelDataseta  
    A similar Dataset to the AudioLabelDataSet, but which loads tarred audio files.

    Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToSpeechLabelDataset),
    as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
    contain the information for one audio file, including at least the label and name of the audio
    file within the tarball.

    Valid formats for the audio_tar_filepaths argument include:
    (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
    (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].

    Note: For brace expansion in (1), there may be cases where `{x..y}` syntax cannot be used due to shell interference.
    This occurs most commonly inside SLURM scripts. Therefore we provide a few equivalent replacements.
    Supported opening braces - { <=> (, [, < and the special tag _OP_.
    Supported closing braces - } <=> ), ], > and the special tag _CL_.
    For SLURM based tasks, we suggest the use of the special tags for ease of use.

    See the documentation for more information about accepted data and input formats.

    If using multiple processes the number of shards should be divisible by the number of workers to ensure an
    even split among workers. If it is not divisible, logging will give a warning but training will proceed.
    In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
    is applied. We currently do not check for this, but your program may hang if the shards are uneven!

    Notice that a few arguments are different from the AudioLabelDataSet; for example, shuffle (bool) has been
    replaced by shuffle_n (int).

    Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
    after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.

    Args:
        audio_tar_filepaths: Either a list of audio tarball filepaths, or a
            string (can be brace-expandable).
        manifest_filepath (str): Path to the manifest.
        labels (list): Dataset parameter.
            List of target classes that can be output by the speaker recognition model.
        featurizer
        shuffle_n (int): How many samples to look ahead and load to be shuffled.
            See WebDataset documentation for more details.
            Defaults to 0.
        min_duration (float): Dataset parameter.
            All training files which have a duration less than min_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to 0.1.
        max_duration (float): Dataset parameter.
            All training files which have a duration more than max_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to None.
        trim(bool): Whether to use trim silence from beginning and end
            of audio signal using librosa.effects.trim().
            Defaults to False.
        window_length_in_sec (float): length of slice/window (in seconds) # Pass this only for speaker recognition and VAD task
        shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task. in a batch # Pass this only for VAD task during inference.
        normalize_audio (bool): Whether to normalize audio signal. Defaults to False.
        shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
            -   `scatter`: The default shard strategy applied by WebDataset, where each node gets
                a unique set of shards, which are permanently pre-allocated and never changed at runtime.
            -   `replicate`: Optional shard strategy, where each node gets all of the set of shards
                available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
                The benefit of replication is that it allows each node to sample data points from the entire
                dataset independently of other nodes, and reduces dependence on the value of `shuffle_n`.

                .. warning::
                    Replicated strategy allows every node to sample the entire set of available tarfiles,
                    and therefore more than one node may sample the same tarfile, and even sample the same
                    data points! As such, there is no assured guarantee that all samples in the dataset will be
                    sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
                    occasions (when the number of shards is not divisible with ``world_size``), will not sample
                    the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
                    or test datasets.
        global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
        world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
        is_regression_task (bool): Whether it is a regression task. Defualts to False.
    r   r   NFscatter)	shuffle_nr   r   r   shard_strategyglobal_rank
world_sizer   audio_tar_filepathsr   r   r   r   r   r   r   r   r   r   c                C   s  t |d tj|||dd| _t| jj| _|| _|| _|r |n| jj	| _
t| j
| _i i | _| _t| j
D ]\}}|| j|< || j|< q7tt| j
d d D ]}td|| j|  qQt||	||
d}ttj|dtt|t tjtdd	td
d| jt| j | _!d S )N)r   T)r   r   r   index_by_file_idr   r   sharded_filepathsr   r   r   urls__key__audiokeyr   r   )"r   r
   r   r   r<   mappingfile_occurencer\   r   r   r   r   r   r   r   r   rd   r   r   r   r   wdsDataPipelineSimpleShardListr   shuffletarfile_to_samplesrenameVALID_FILE_FORMATSto_tuple_filtermap_build_sample_dataset)rT   r   r   r   r\   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r      sD   





z!_TarredAudioLabelDataset.__init__c                        G  fddd}|| j | jS )  This function is used to remove samples that have been filtered out by ASRSpeechLabel already.
        Otherwise, we would get a KeyError as _build_sample attempts to find the manifest entry for a sample
        that was filtered out (e.g. for duration).
        Note that if using multi-GPU training, filtering may lead to an imbalance in samples in each shard,
        which may make your code hang as one process will finish before the other.
        c                       0   e Zd Z fddZdd Zdd Zdd Zd	S )
z;_TarredAudioLabelDataset._filter.<locals>.TarredAudioFilterc                         | _ || _|| _|  | _d S r   iteratorr   r   _internal_generator	_iterablerT   r   r   r  r   r   r   d     zD_TarredAudioLabelDataset._filter.<locals>.TarredAudioFilter.__init__c                 S      |   | _| S r   r  r  r   r   r   r   __iter__j     
zD_TarredAudioLabelDataset._filter.<locals>.TarredAudioFilter.__iter__c                 S   :   zt | j}W |S  ty   |  | _t | j}Y |S w r   nextr  StopIterationr  rT   valuesr   r   r   __next__n     
zD_TarredAudioLabelDataset._filter.<locals>.TarredAudioFilter.__next__c                 s       t | jD ]8\}}|\}}tjtj|\}}|| jv r>td| j| D ]}|dkr0|}n|d t| }||fV  q'qdS a>  
                WebDataset requires an Iterator, but we require an iterable that yields 1-or-more
                values per value inside self.iterator.

                Therefore wrap the iterator with a generator function that will yield 1-or-more
                values per sample in the iterator.
                r   r3   N	r   r  ospathsplitextbasenamer   rd   r   rT   rH   tupaudio_bytesr;   file_idjr   r   r   r  x     
zO_TarredAudioLabelDataset._filter.<locals>.TarredAudioFilter._internal_generatorNr   r   r   r   r	  r  r  r   r  r   r   TarredAudioFilterc  
    
r!  r   r   rT   r  r!  r   r  r   r   [     )z _TarredAudioLabelDataset._filterc                 C   s   |\}}t jt j|\}}| jj| }| j| }|j}|du r$d}t|}	| j	j
|	||j| jd}
|	  |
t|
jd  }}| j|j }d}||t| t| fS )\Builds the training sample by combining the data from the WebDataset with the manifest info.Nr   r   r   r   r4   )r  r  r  r  r   r   r   ioBytesIOr\   r   r   r   closer%   r&   r   r   r   r   rT   r  r  r;   r  rH   manifest_idxmanifest_entryr   audio_filestreamr   r   r   r   r   r   r   r   r     s&   

 z&_TarredAudioLabelDataset._build_samplec                 C   
   | j  S r   r   r	  r   r   r   r   r	    r   z!_TarredAudioLabelDataset.__iter__c                 C   r   r   r   r   r   r   r   r     r   z _TarredAudioLabelDataset.__len__)r   r   r   r   r   r   r   r'   r   r   r   r   r   r   r	  r   r   r   r   r   r     sJ    S	

;3 r   c                   @   r   )'TarredAudioToClassificationLabelDataseta  
    A similar Dataset to the AudioToClassificationLabelDataset, but which loads tarred audio files.

    Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToClassificationLabelDataset),
    as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
    contain the information for one audio file, including at least the transcript and name of the audio
    file within the tarball.

    Valid formats for the audio_tar_filepaths argument include:
    (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
    (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].

    See the WebDataset documentation for more information about accepted data and input formats.

    If using multiple processes the number of shards should be divisible by the number of workers to ensure an
    even split among workers. If it is not divisible, logging will give a warning but training will proceed.
    In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
    is applied. We currently do not check for this, but your program may hang if the shards are uneven!

    Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been
    replaced by shuffle_n (int).

    Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
    after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.

    Args:
        audio_tar_filepaths: Either a list of audio tarball filepaths, or a
            string (can be brace-expandable).
        manifest_filepath (str): Path to the manifest.
        labels (list): Dataset parameter.
            List of target classes that can be output by the speaker recognition model.
        featurizer
        shuffle_n (int): How many samples to look ahead and load to be shuffled.
            See WebDataset documentation for more details.
            Defaults to 0.
        min_duration (float): Dataset parameter.
            All training files which have a duration less than min_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to 0.1.
        max_duration (float): Dataset parameter.
            All training files which have a duration more than max_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to None.
        trim(bool): Whether to use trim silence from beginning and end
            of audio signal using librosa.effects.trim().
            Defaults to False.
        shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
            -   `scatter`: The default shard strategy applied by WebDataset, where each node gets
                a unique set of shards, which are permanently pre-allocated and never changed at runtime.
            -   `replicate`: Optional shard strategy, where each node gets all of the set of shards
                available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
                The benefit of replication is that it allows each node to sample data points from the entire
                dataset independently of other nodes, and reduces dependence on value of `shuffle_n`.

                .. warning::
                    Replicated strategy allows every node to sample the entire set of available tarfiles,
                    and therefore more than one node may sample the same tarfile, and even sample the same
                    data points! As such, there is no assured guarantee that all samples in the dataset will be
                    sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
                    occasions (when the number of shards is not divisible with ``world_size``), will not sample
                    the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
                    or test datasets.
        global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
        world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
        is_regression_task (bool): Whether it is a regression task. Defualts to False.
    c                 C   r   r   r   r   r   r   r   r     r   z3TarredAudioToClassificationLabelDataset._collate_fnNr   r   r   r   r   r1    s    Cr1  c                       s   e Zd ZdZddddddddddd	
d
eeee f deeee f dee dedee	 dee	 de
dee	 dee	 de
dededef fddZdd Zdd Zdd Z  ZS )TarredAudioToSpeechLabelDataseta  
    A similar Dataset to the AudioToSpeechLabelDataset, but which loads tarred audio files.

    Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToSpeechLabelDataset),
    as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
    contain the information for one audio file, including at least the transcript and name of the audio
    file within the tarball.

    Valid formats for the audio_tar_filepaths argument include:
    (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
    (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].

    See the WebDataset documentation for more information about accepted data and input formats.

    If using multiple processes the number of shards should be divisible by the number of workers to ensure an
    even split among workers. If it is not divisible, logging will give a warning but training will proceed.
    In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
    is applied. We currently do not check for this, but your program may hang if the shards are uneven!

    Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been
    replaced by shuffle_n (int).

    Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
    after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.

    Args:
        audio_tar_filepaths: Either a list of audio tarball filepaths, or a
            string (can be brace-expandable).
        manifest_filepath (str): Path to the manifest.
        labels (list): Dataset parameter.
            List of target classes that can be output by the speaker recognition model.
        featurizer
        shuffle_n (int): How many samples to look ahead and load to be shuffled.
            See WebDataset documentation for more details.
            Defaults to 0.
        min_duration (float): Dataset parameter.
            All training files which have a duration less than min_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to 0.1.
        max_duration (float): Dataset parameter.
            All training files which have a duration more than max_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to None.
        trim(bool): Whether to use trim silence from beginning and end
            of audio signal using librosa.effects.trim().
            Defaults to False.
        window_length_in_sec (float): time length of window/slice (in seconds) # Pass this only for speaker recognition and VAD task
        shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task. in a batch # Pass this only for VAD task during inference.
        normalize_audio (bool): Whether to normalize audio signal. Defaults to False.
        shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
            -   `scatter`: The default shard strategy applied by WebDataset, where each node gets
                a unique set of shards, which are permanently pre-allocated and never changed at runtime.
            -   `replicate`: Optional shard strategy, where each node gets all of the set of shards
                available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
                The benefit of replication is that it allows each node to sample data points from the entire
                dataset independently of other nodes, and reduces dependence on value of `shuffle_n`.

                .. warning::
                    Replicated strategy allows every node to sample the entire set of available tarfiles,
                    and therefore more than one node may sample the same tarfile, and even sample the same
                    data points! As such, there is no assured guarantee that all samples in the dataset will be
                    sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
                    occasions (when the number of shards is not divisible with ``world_size``), will not sample
                    the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
                    or test datasets.
        global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
        world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
    r   r   NFr   r4   r   )
r   r   r   r   r^   r`   ra   r   r   r   r   r   r   r   r   r   r   r^   r`   ra   r   r   r   c                   sX   t d|	 t d|
 |	| _|
| _|| _t j|||||||||||d d S )Nr   r   )r   r   r   r\   r   r   r   r   r   r   r   )r   infor   r^   r`   ra   r   r   )rT   r   r   r   r\   r   r   r   r   r^   r`   ra   r   r   r   r   r   r   r   C  s$   
z(TarredAudioToSpeechLabelDataset.__init__c                 C   r   r   r   r   r   r   r   r   i  r   z4TarredAudioToSpeechLabelDataset.fixed_seq_collate_fnc                 C   s   t r   )NotImplementedErrorr   r   r   r   sliced_seq_collate_fnl  s   z5TarredAudioToSpeechLabelDataset.sliced_seq_collate_fnc                 C   r   r   r   r   r   r   r   r   o  r   z8TarredAudioToSpeechLabelDataset.vad_frame_seq_collate_fn)r   r   r   r   r   r   r   r'   r   r   r   r   r   r5  r   r   r   r   r   r   r2    sT    L	
&r2  c                       s   e Zd ZdZedeeeef  fddZ	ddddddddddddd	e
eee f d
edeee  dedddee dee dedee
eeee f  dedee dee dee f fddZdd ZdefddZdd Zd d! Zd"d# Z  ZS )$AudioToMultiLabelDataseta	  
    Dataset that loads a json file containing paths to audio files, durations (in seconds), and a sequence of labels.
    Each new line is a different sample. Example below:
    {"audio_filepath": "/path/to/audio_wav_0.wav", "duration": time_in_sec_0, "label":         "0 1 1 0 1", "offset": offset_in_sec_0}
    ...
    {"audio_filepath": "/path/to/audio_wav_n.wav", "duration": time_in_sec_n, "label":         "0 1 0 0 1", "offset": offset_in_sec_n}
    Args:
        manifest_filepath (Union[str, List[str]]): Path to manifest json as described above. Can
            be comma-separated paths.
        labels (Optional[list]): String containing all the possible labels to map to
            if None then automatically picks from ASRSpeechLabel collection.
        min_duration (float): Dataset parameter.
            All training files which have a duration less than min_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to 0.1.
        max_duration (float): Dataset parameter.
            All training files which have a duration more than max_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to None.
        trim_silence (bool): Whether to use trim silence from beginning and end
            of audio signal using librosa.effects.trim().
            Defaults to False.
        channel selector (Union[str, int, List[int]]): string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
            of integers denoting a subset of channels. Channel selector is using zero-based indexing.
            If set to `None`, the original signal will be used.
        window_length_in_sec (float): length of window/slice (in seconds)
            Use this for speaker recognition and VAD tasks.
        shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task in a batch
            Use this for VAD task during inference.
        normalize_audio (bool): Whether to normalize audio signal.
            Defaults to False.
        is_regression_task (bool): Whether the dataset is for a regression task instead of classification.
            Defaults to False.
        cal_labels_occurrence (bool): Whether to calculate occurrence of labels
            Defaults to False.
        delimiter (Optional[str]): Delimiter to use when splitting the label string, default to None.
        normalize_audio_db (Optional[float]):  normalize audio signal to a target db, default to None.
    r$   c                 C   s   t d| durt| drt| jdnt t tdt d}| jr7|t tdt t tdt d |S |t dt	 t tdt d	 |S )
rs   rt   Nrw   rx   ru   rz   zB, Tr|   r   r   r   r   r   r   r     s*   
z%AudioToMultiLabelDataset.output_typesNFr   )r   
int_values	augmentorr   r   trim_silencer   r   r   	delimiternormalize_audio_dbr   r]   r   r7  r8  1nemo.collections.asr.parts.perturb.AudioAugmentorr   r   r9  r   r   r   r:  r;  c                   sF  t    t|tr|d}|| _|| _tj||||
||d| _	t
|||d| _|| _|	| _|
| _i | _d | _|
s|r>|n|  | _| jd urMt| jnd| _i i | _| _t| jD ]"\}}|| j|< || j|< |r~| j	j| | j|< | j| j|  q\tt| jd d D ]}td|| j|  qd S g | _d| _d S )Nr   )r   r   r   r   r   r:  r]   r7  r8  r4   r   r   )r   r   r   r   r6   r:  r;  r
   r   r   r   r\   r   r   r   r   r   _get_label_setr   r   r   r   r   r   rD   rd   r   r   r   )rT   r   r]   r   r7  r8  r   r   r9  r   r   r   r:  r;  r   r   r   r   r   r   r     sF   


	


z!AudioToMultiLabelDataset.__init__c                 C   J   g }| j D ]}|j}|r| jr|| jn| }|| qtt|S r   r   r   r:  r6   re   r   setrT   r   r   	label_strlabel_str_listr   r   r   r>       

z'AudioToMultiLabelDataset._get_label_setrC  c                    d    j r	| j n| } jr dd |D }t| }|S  fdd|D }t| }|S )Nc                 S      g | ]}t |qS r   r   r   sr   r   r   r     r    zAAudioToMultiLabelDataset._label_str_to_tensor.<locals>.<listcomp>c                    r   r   r   rI  r   r   r   r     r   r:  r6   r   r%   r&   r   r   rT   rC  r   r   r   r   _label_str_to_tensor      z-AudioToMultiLabelDataset._label_str_to_tensorc                 C   r   r   r   r   r   r   r   r     r   z AudioToMultiLabelDataset.__len__c           	      C   s   | j | }|j}|d u rd}| jj|j||j| j| j| jd}|t	
|d }}| |j}t	
|d }||||fS )Nr   )r   r   r   r   normalize_db)r   r   r\   r   r   r   r   r   r;  r%   r&   sizer   rN  r   r   r   r   r   r     s    
	z$AudioToMultiLabelDataset.__getitem__c                 C   r   r   r   r   r   r   r   r   '  r   z$AudioToMultiLabelDataset._collate_fn)r   r   r   r   r   r   r   r   r   r   r   r   r'   r   r   r   r>  rN  r   r   r   r   r   r   r   r   r6  s  s^    )%
	
9	r6  c                "       s   e Zd ZdZdddddddddddddddeeee f d	eeee f d
edeee  dede	dddee
 dee
 de	de	dedededee dee
 f  fddZdd ZdefddZd d! Zd"d# Zd$d% Zd&d' Zd(d) Z  ZS )*TarredAudioToMultiLabelDatasetaS  
    A similar Dataset to the AudioToMultiLabelDataset, but which loads tarred audio files.

    Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToSpeechLabelDataset),
    as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
    contain the information for one audio file, including at least the transcript and name of the audio
    file within the tarball.

    Valid formats for the audio_tar_filepaths argument include:
    (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
    (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].

    See the WebDataset documentation for more information about accepted data and input formats.

    If using multiple processes the number of shards should be divisible by the number of workers to ensure an
    even split among workers. If it is not divisible, logging will give a warning but training will proceed.
    In addition, if using mutiprocessing, each shard MUST HAVE THE SAME NUMBER OF ENTRIES after filtering
    is applied. We currently do not check for this, but your program may hang if the shards are uneven!

    Notice that a few arguments are different from the AudioToBPEDataset; for example, shuffle (bool) has been
    replaced by shuffle_n (int).

    Additionally, please note that the len() of this DataLayer is assumed to be the length of the manifest
    after filtering. An incorrect manifest length may lead to some DataLoader issues down the line.

    Args:
        audio_tar_filepaths: Either a list of audio tarball filepaths, or a
            string (can be brace-expandable).
        manifest_filepath (str): Path to the manifest.
        labels (list): Dataset parameter.
            List of target classes that can be output by the speaker recognition model.
        shuffle_n (int): How many samples to look ahead and load to be shuffled.
            See WebDataset documentation for more details.
            Defaults to 0.
        min_duration (float): Dataset parameter.
            All training files which have a duration less than min_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to 0.1.
        max_duration (float): Dataset parameter.
            All training files which have a duration more than max_duration
            are dropped. Note: Duration is read from the manifest JSON.
            Defaults to None.
        trim(bool): Whether to use trim silence from beginning and end
            of audio signal using librosa.effects.trim().
            Defaults to False.
        window_length_in_sec (float): time length of window/slice (in seconds) # Pass this only for speaker recognition and VAD task
        shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task. in a batch # Pass this only for VAD task during inference.
        normalize_audio (bool): Whether to normalize audio signal. Defaults to False.
        shard_strategy (str): Tarred dataset shard distribution strategy chosen as a str value during ddp.
            -   `scatter`: The default shard strategy applied by WebDataset, where each node gets
                a unique set of shards, which are permanently pre-allocated and never changed at runtime.
            -   `replicate`: Optional shard strategy, where each node gets all of the set of shards
                available in the tarred dataset, which are permanently pre-allocated and never changed at runtime.
                The benefit of replication is that it allows each node to sample data points from the entire
                dataset independently of other nodes, and reduces dependence on value of `shuffle_n`.

                .. warning::
                    Replicated strategy allows every node to sample the entire set of available tarfiles,
                    and therefore more than one node may sample the same tarfile, and even sample the same
                    data points! As such, there is no assured guarantee that all samples in the dataset will be
                    sampled at least once during 1 epoch. Scattered strategy, on the other hand, on specific
                    occasions (when the number of shards is not divisible with ``world_size``), will not sample
                    the entire dataset. For these reasons it is not advisable to use tarred datasets as validation
                    or test datasets.
        global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
        world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
        delimiter (Optional[str]): Delimiter to use when splitting the label string, default to None.
        normalize_audio_db (Optional[float]):  normalize audio signal to a target db, default to None.
    Nr   Fr   r   )r   r   r7  r8  r   r   r9  r   r   r   r   r:  r;  r   r   r]   r   r   r7  r8  r<  r   r   r9  r   r   r   r   r:  r;  c                   sn  t    t|tr|d}|
| _|| _|| _|| _t	j
|||	|dd| _t| jj| _t|||d| _|s|r;|n|  | _| jd urJt| jnd| _i i | _| _t| jD ]\}}|| j|< || j|< qYtt| jd d D ]}td|| j|  qsng | _d| _t||||d}ttj|d	t t!|t" tj#t$d
dt%dd| j&t'| j(| _)d S )Nr   T)r   r   r   r   r   r=  r4   r   r   r   r   r   r   r   r   )*r   r   r   r   r6   r   r   r:  r;  r
   r   r   r<   r   r   r   r\   r>  r   r   r   r   r   r   rd   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )rT   r   r   r]   r   r   r7  r8  r   r   r9  r   r   r   r   r:  r;  r   r   r   r   r   r   r   r  sX   







z'TarredAudioToMultiLabelDataset.__init__c                 C   r?  r   r@  rB  r   r   r   r>    rE  z-TarredAudioToMultiLabelDataset._get_label_setrC  c                    rF  )Nc                 S   rG  r   rH  rI  r   r   r   r     r    zGTarredAudioToMultiLabelDataset._label_str_to_tensor.<locals>.<listcomp>c                    r   r   rK  rI  r   r   r   r     r   rL  rM  r   r   r   rN    rO  z3TarredAudioToMultiLabelDataset._label_str_to_tensorc                    r   )r   c                       r   )
zATarredAudioToMultiLabelDataset._filter.<locals>.TarredAudioFilterc                    r   r   r   r  r  r   r   r     r  zJTarredAudioToMultiLabelDataset._filter.<locals>.TarredAudioFilter.__init__c                 S   r  r   r  r   r   r   r   r	    r
  zJTarredAudioToMultiLabelDataset._filter.<locals>.TarredAudioFilter.__iter__c                 S   r  r   r  r  r   r   r   r    r  zJTarredAudioToMultiLabelDataset._filter.<locals>.TarredAudioFilter.__next__c                 s   r  r  r  r  r   r   r   r    r  zUTarredAudioToMultiLabelDataset._filter.<locals>.TarredAudioFilter._internal_generatorNr   r   r  r   r   r!    r"  r!  r#  r$  r   r  r   r     r%  z&TarredAudioToMultiLabelDataset._filterc                 C   s   |\}}t jt j|\}}| jj| }| j| }|j}|du r$d}t|}	| j	j
|	||j| j| jd}
|	  |
t|
jd  }}| |j}t|d }||||fS )r&  Nr   )r   r   r   rP  )r  r  r  r  r   r   r   r(  r)  r\   r   r   r   r;  r*  r%   r&   r   r   rN  r   rQ  r+  r   r   r   r     s(   

z,TarredAudioToMultiLabelDataset._build_samplec                 C   r/  r   r0  r   r   r   r   r	  #  r   z'TarredAudioToMultiLabelDataset.__iter__c                 C   r   r   r   r   r   r   r   r   &  r   z&TarredAudioToMultiLabelDataset.__len__c                 C   r   r   r   r   r   r   r   r   )  r   z*TarredAudioToMultiLabelDataset._collate_fn)r   r   r   r   r   r   r   r'   r   r   r   r   r>  rN  r   r   r	  r   r   r   r   r   r   r   rR  +  sn    L
	
H	3"rR  c                       s   e Zd ZdZedeeeef  fddZ	ddddd	dd
dee
e B de
e dedB dedB dededB dedB def fddZdd Zdd Z  ZS )AudioPairToLabelDataseta  
    Dataset class for audio pairs classification tasks, such as calculating EER for speaker verification.
    The input manifest file should contain pairs of audio files and a label. It's format is almost the same as
    `AudioToSpeechLabelDataset` except that the `audio_filepath` field should be a list of two audio file paths
    instead of one, and that `offset` and `duration` are not used as the dataset class will load the whole audio.

    Example of a line in the manifest file:
    {
        "audio_filepath": ["/path/to/audio_wav_0.wav", "/path/to/audio_wav_1.wav"],
        "duration": null,  # not used, will load the whole audio
        "offset": 0.0,  # not used, will load the whole audio
        "label": "0"  # label for the pair, can be a string or an integer
    }

    r$   c                 C   s   t d| durt| drt| jdnt t tdt t d| dur,t| dr,t| jdnt t tdt t tdt t tdt d}|S )rs   rt   Nrw   rx   ru   )rN   r{   audio_signal_2a_sig_length_2r   r   )r   r   r   rw   r   r   r   r   r   r   r   r   >  s$   z$AudioPairToLabelDataset.output_typesr   NFr   r4   )r   r   r   r^   r`   ra   r   r   r   r   r   r^   r`   ra   c       	            s&   t  j|||||||||	ddd d S )NF)r   r   r\   r   r   r   r^   r`   ra   r   r   )r   r   )rT   r   r   r\   r   r   r   r^   r`   ra   kwargsr   r   r   r   [  s   
z AudioPairToLabelDataset.__init__c                 C   s   | j | }|j}| jj|d dd | jd}|t|jd  }}| jj|d dd | jd}|t|jd  }}	t| j	|j
  }
td }||||	|
|fS )Nr   r'  r4   )r   r   r\   r   r   r%   r&   r   r   r   r   )rT   r   r   
audio_pairr   r   r   	features2f2fl2r   r   r   r   r   r   w  s   
z#AudioPairToLabelDataset.__getitem__c                 C   sl   t | \}}}}}}tt ||||}t| |\}	}
}}tt ||||}t| |\}}}}|	|
||||fS r   )r?   listrX   )rT   rF   audio1
audio_len1audio2
audio_len2r   	label_lenbatch1a_sig1
a_sig_len1
pair_labelpair_label_lenbatch2a_sig2
a_sig_len2rH   r   r   r   r     s   z,AudioPairToLabelDataset.fixed_seq_collate_fn)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rS  -  s:    "
	
rS  )5r(  r  typingr   r   r   r   r%   'nemo.collections.asr.data.audio_to_textr   r   1nemo.collections.asr.parts.preprocessing.featuresr   0nemo.collections.asr.parts.preprocessing.segmentr	   valid_sf_formats+nemo.collections.common.parts.preprocessingr
   nemo.core.classesr   r   nemo.core.neural_typesr   r   r   r   r   
nemo.utilsr   r   r   nemo.utils.distributedr   joinkeysr   Tensorr'   r-   r2   r<   rS   rX   rq   rr   r   r   r   r1  r2  r6  rR  rS  r   r   r   r   <module>   sD    	(*2 S cHv 9  