o
    }oi                     @   st  d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZmZ d dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZm Z  d d
l!m"Z" g dZ#dee$ deej% fddZ&eG dd dZ'G dd dZ(G dd deZ)edddZ*G dd de)Z+edddZ,G dd de)Z-edd dZ.G d!d" d"e)Z/dS )#    N)OrderedDict
namedtuple)	dataclass)CallableDictListOptionalTupleTypeUnion)AudioSegmentChannelSelectorType)collectionsflatten)Dataset)AudioSignalEncodedRepresentationLengthsType
NeuralType)logging)AudioToTargetDataset!AudioToTargetWithReferenceDataset!AudioToTargetWithEmbeddingDatasetbatchreturnc           	         s   | d   }t }|D ]k  fdd| D }t|}g }t|| D ]E\}}||k r_|  jdkr7d|| f}n|  jdkrGd|| ddf}ntd  d j dtjj	
|  || < ||   q!|t|tj|tjd	f7 }q|S )
a"  Collate a batch of items returned by __getitem__.
    Examples for each signal are zero padded to the same length
    (batch_length), which is determined by the longest example.
    Lengths of the original signals are returned in the output.

    Args:
        batch: List of dictionaries. Each element of the list
            has the following format
            ```
            {
                'signal_0': 1D or 2D tensor,
                'signal_1': 1D or 2D tensor,
                ...
                'signal_N': 1D or 2D tensor,
            }
            ```
            1D tensors have shape (num_samples,) and 2D tensors
            have shape (num_channels, num_samples)

    Returns:
        A tuple containing signal tensor and signal length tensor (in samples)
        for each signal.
        The output has the following format:
        ```
        (signal_0, signal_0_length, signal_1, signal_1_length, ..., signal_N, signal_N_length)
        ```
        Note that the output format is obtained by interleaving signals and their length.
    r   c                    s   g | ]	}|  j d  qS ))shape).0bsignal ^/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/audio/data/audio_to_audio.py
<listcomp>J   s    z%_audio_collate_fn.<locals>.<listcomp>      zSignal z has unsuported dimensions z1. Currently, only 1D and 2D arrays are supported.)dtype)keystuplemaxzipndimRuntimeErrorr   torchnn
functionalpadappendstacktensorint32)	r   signalsbatchedsignal_lengthbatch_lengthb_signals_lenr   r1   r"   r    r#   _audio_collate_fn(   s&    r<   c                   @   sF   e Zd ZU ee ed< dZeee	e
f  ed< dZeee  ed< dS )SignalSetupr6   Ndurationchannel_selectors)__name__
__module____qualname__r   str__annotations__r>   r   r   floatlistr?   r   r"   r"   r"   r#   r=   i   s   
 r=   c                   @   s  e Zd ZdZ		dCdededee defdd	Ze	d
efddZ
e
jdefddZ
e	d
efddZejdefddZe	d
efddZejdee fddZe	d
efddZejdee fddZe	d
efddZejdefddZdejjd
eeejf fddZdejjd
eeejf fddZd eeejf d
eeejf fd!d"Zdejjd
eeejf fd#d$Zdejjd
eeejf fd%d&Ze			'	(dDd)eded*ee d+ed,eded
ej fd-d.Z!e			'	(dDd/e"e ded*ee d0ee"e  d,eded
e"ej  fd1d2Z#e		dEd)e$ee"e f ded3ed4ee d+ee d
ej fd5d6Z%e&		dEd)eded3ed4ee d+ee d
ej fd7d8Z'e&d9e$ej e"ej  f d
ej fd:d;Z(e&d/e"e d
e"e fd<d=Z)dejjd
eeejf fd>d?Z*e&d@ed
ej fdAdBZ+dS )FASRAudioProcessora  Class that processes an example from Audio collection and returns
    a dictionary with prepared signals.

    For example, the output dictionary may be the following
    ```
    {
        'input_signal': input_signal_tensor,
        'target_signal': target_signal_tensor,
        'reference_signal': reference_signal_tensor,
        'embedding_vector': embedding_vector
    }
    ```
    Keys in the output dictionary are ordered with synchronous signals given first,
    followed by asynchronous signals and embedding.

    Args:
        sample_rate: sample rate used for all audio signals
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
        normalization_signal: Normalize all audio with a factor that ensures the signal
                    `example[normalization_signal]` in `process` is in range [-1, 1].
                    All other audio signals are scaled by the same factor. Default is
                    `None`, corresponding to no normalization.
    N:0yE>sample_raterandom_offsetnormalization_signalepsc                 C   s.   || _ || _|| _|| _d | _d | _d | _d S N)rI   rJ   rK   rL   
sync_setupasync_setupembedding_setup)selfrI   rJ   rK   rL   r"   r"   r#   __init__   s   
zASRAudioProcessor.__init__r   c                 C      | j S rM   )_sample_raterQ   r"   r"   r#   rI         zASRAudioProcessor.sample_ratevaluec                 C   s    |dkrt d| || _d S )Nr   z'Sample rate must be positive, received )
ValueErrorrT   rQ   rW   r"   r"   r#   rI      s   
c                 C   rS   rM   _random_offsetrU   r"   r"   r#   rJ      rV   zASRAudioProcessor.random_offsetc                 C   s
   || _ d S rM   rZ   rY   r"   r"   r#   rJ         
c                 C   rS   )zReturn the current setup for synchronous signals.

        Returns:
            A dataclass containing the list of signals, their
            duration and channel selectors.
        )_sync_setuprU   r"   r"   r#   rN         zASRAudioProcessor.sync_setupc                 C   6   |du s	t |tr|| _dS tdt| d| d)a  Setup signals to be loaded synchronously.

        Args:
            value: An instance of SignalSetup with the following fields
                - signals: list of signals (keys of example.audio_signals) which will be loaded
                           synchronously with the same start time and duration.
                - duration: Duration for each signal to be loaded.
                            If duration is set to None, the whole file will be loaded.
                - channel_selectors: A list of channel selector for each signal. If channel selector
                                     is None, all channels in the audio file will be loaded.
        NUnexpected type  for value .)
isinstancer=   r]   rX   typerY   r"   r"   r#   rN      s   
c                 C   rS   )zReturn the current setup for asynchronous signals.

        Returns:
            A dataclass containing the list of signals, their
            duration and channel selectors.
        )_async_setuprU   r"   r"   r#   rO      r^   zASRAudioProcessor.async_setupc                 C   r_   )a  Setup signals to be loaded asynchronously.

        Args:
        Args:
            value: An instance of SignalSetup with the following fields
                - signals: list of signals (keys of example.audio_signals) which will be loaded
                           asynchronously with signals possibly having different start and duration
                - duration: Duration for each signal to be loaded.
                            If duration is set to None, the whole file will be loaded.
                - channel_selectors: A list of channel selector for each signal. If channel selector
                                     is None, all channels in the audio file will be loaded.
        Nr`   ra   rb   )rc   r=   re   rX   rd   rY   r"   r"   r#   rO      s   
c                 C   rS   )z3Setup signals corresponding to an embedding vector.)_embedding_setuprU   r"   r"   r#   rP      s   z!ASRAudioProcessor.embedding_setupc                 C   r_   )a$  Setup signals corresponding to an embedding vector.

        Args:
            value: An instance of SignalSetup with the following fields
                - signals: list of signals (keys of example.audio_signals) which will be loaded
                           as embedding vectors.
        Nr`   ra   rb   )rc   r=   rf   rX   rd   rY   r"   r"   r#   rP      s   	
examplec                 C   s   | j |d}| j|d}|S )a  Process an example from a collection of audio examples.

        Args:
            example: an example from Audio collection.

        Returns:
            An ordered dictionary of signals and their tensors.
            For example, the output dictionary may be the following
            ```
            {
                'input_signal': input_signal_tensor,
                'target_signal': target_signal_tensor,
                'reference_signal': reference_signal_tensor,
                'embedding_vector': embedding_vector
            }
            ```
            Keys in the output dictionary are ordered with synchronous signals given first,
            followed by asynchronous signals and embedding.
        rg   )audio)
load_audioprocess_audio)rQ   rg   ri   r"   r"   r#   process   s   zASRAudioProcessor.processc                 C   sp   t  }| jdur| |}|| | jdur!| |}|| | jdur0| |}|| |s6td|S )a  Given an example, load audio from `example.audio_files` and prepare
        the output dictionary.

        Args:
            example: An example from an audio collection

        Returns:
            An ordered dictionary of signals and their tensors.
            For example, the output dictionary may be the following
            ```
            {
                'input_signal': input_signal_tensor,
                'target_signal': target_signal_tensor,
                'reference_signal': reference_signal_tensor,
                'embedding_vector': embedding_vector
            }
            ```
            Keys in the output dictionary are ordered with synchronous signals given first,
            followed by asynchronous signals and embedding.
        NzUOutput dictionary is empty. Please use `_setup` methods to setup signals to be loaded)	r   rN   load_sync_signalsupdaterO   load_async_signalsrP   load_embeddingr-   )rQ   rg   outputsync_signalsasync_signals	embeddingr"   r"   r#   rj     s   








zASRAudioProcessor.load_audiori   c                 C   sZ   | j r+|| j    }| jdur| jjng }|D ]}||vr*|| || j  ||< q|S )zProcess audio signals available in the input dictionary.

        Args:
            audio: A dictionary containing loaded signals `signal: tensor`

        Returns:
            An ordered dictionary of signals and their tensors.
        N)rK   absr*   rP   r6   rL   )rQ   ri   
norm_scaleskip_signalsr!   r"   r"   r#   rk   <  s   	zASRAudioProcessor.process_audioc                    sj   t  } fdd| jjD }| j|| jj| j| jj j| jd}t	| jj|D ]\}}t
|||< q'|S )zLoad signals with the same start and duration.

        Args:
            example: an example from audio collection

        Returns:
            An ordered dictionary of signals and their tensors.
        c                    s   g | ]} j | qS r"   )audio_files)r   srh   r"   r#   r$   _  s    z7ASRAudioProcessor.load_sync_signals.<locals>.<listcomp>)rx   r?   rI   r>   fixed_offsetrJ   )r   rN   r6   get_samples_synchronizedr?   rI   r>   offsetrJ   r+   r.   r4   )rQ   rg   rq   sync_audio_filessync_samplesr!   samplesr"   rh   r#   rm   U  s   		z#ASRAudioProcessor.load_sync_signalsc              	   C   s`   t  }t| jjD ]$\}}| j|j| | j| jj| | jj| |j	| j
d}t|||< q	|S )zLoad each async signal independently, no constraints on starting
        from the same time.

        Args:
            example: an example from audio collection

        Returns:
            An ordered dictionary of signals and their tensors.
        )
audio_filerI   r>   channel_selectorrz   rJ   )r   	enumeraterO   r6   get_samplesrx   rI   r>   r?   r|   rJ   r.   r4   )rQ   rg   rq   idxr!   r   r"   r"   r#   ro   o  s   


z$ASRAudioProcessor.load_async_signalsr   Fr   r>   r   rz   c                 C   s"   | j |g|||g||d}|d S )a  Get samples from an audio file.
        For a single-channel signal, the output is shape (num_samples,).
        For a multi-channel signal, the output is shape (num_samples, num_channels).

        Args:
            audio_file: path to an audio file
            sample_rate: desired sample rate for output samples
            duration: Optional desired duration of output samples.
                    If `None`, the complete file will be loaded.
                    If set, a segment of `duration` seconds will be loaded.
            channel_selector: Optional channel selector, for selecting a subset of channels.
            fixed_offset: Optional fixed offset when loading samples.
            random_offset: If `True`, offset will be randomized when loading a short segment
                        from a file. The value is randomized between fixed_offset and
                        max_offset (set depending on the duration and fixed_offset).

        Returns:
            Numpy array with samples from audio file.
            The array has shape (num_samples,) for a single-channel signal
            or (num_channels, num_samples) for a multi-channel signal.
        )rx   rI   r>   r?   rz   rJ   r   )r{   )clsr   rI   r>   r   rz   rJ   rq   r"   r"   r#   r     s   	zASRAudioProcessor.get_samplesrx   r?   c                 C   s  |du rdgt | }|du r|}d}nO| |}	t|	}
|
| }|dkr0td| d|
 d|| |
krMtd|
 d| d| d	| d
	 |}|}n|rZ|
| }t||}n|}t	|| }g }t
|D ]\}}| j|||||| d}|| qi|S )a  Get samples from multiple files with the same start and end point.

        Args:
            audio_files: list of paths to audio files
            sample_rate: desired sample rate for output samples
            duration: Optional desired duration of output samples.
                    If `None`, the complete files will be loaded.
                    If set, a segment of `duration` seconds will be loaded from
                    all files. Segment is synchronized across files, so that
                    start and end points are the same.
            channel_selectors: Optional channel selector for each signal, for selecting
                            a subset of channels.
            fixed_offset: Optional fixed offset when loading samples.
            random_offset: If `True`, offset will be randomized when loading a short segment
                        from a file. The value is randomized between fixed_offset and
                        max_offset (set depending on the duration and fixed_offset).

        Returns:
            List with the same size as `audio_files` but containing numpy arrays
            with samples from each audio file.
            Each array has shape (num_samples,) or (num_channels, num_samples), for single-
            or multi-channel signal, respectively.
            For example, if `audio_files = [path/to/file_1.wav, path/to/file_2.wav]`,
            the output will be a list `output = [samples_file_1, samples_file_2]`.
        Nr   zFixed offset zs is larger than shortest file zs.zShortest file (z%s) is less than the desired duration zs + fixed offset z)s. Returned signals will be shortened to z	 seconds.r   rI   r|   num_samplesr   )lenget_durationminrX   r   debugrandomuniformmathfloorr   get_samples_from_filer2   )r   rx   rI   r>   r?   rz   rJ   r|   r   audio_durationsmin_audio_durationavailable_duration
max_offsetrq   r   r   segment_samplesr"   r"   r#   r{     s@   #
z*ASRAudioProcessor.get_samples_synchronizedr|   r   c           	      C   s   t |tr| j|||||d}|S t |tr3g }|D ]}| j|||||d}|| q| |}|S |du r;g }|S tdt| )al  Get samples from a single or multiple files.
        If loading samples from multiple files, they will
        be concatenated along the channel dimension.

        Args:
            audio_file: path or a list of paths.
            sample_rate: sample rate of the loaded samples
            offset: fixed offset in seconds
            num_samples: Optional, number of samples to load.
                         If `None`, all available samples will be loaded.
            channel_selector: Select a subset of available channels.

        Returns:
            An array with shape (samples,) or (channels, samples)
        r   NzUnexpected audio_file type )rc   rC   get_segment_from_filerF   r2   list_to_multichannelr-   rd   )	r   r   rI   r|   r   r   r   a_filea_file_samplesr"   r"   r#   r     s4   


z'ASRAudioProcessor.get_samples_from_filec                 C   sh   |du rt j| |||d}n
t j| ||||d}|jjdkr!|jS |jjdkr+|jjS td|jj )a  Get a segment of samples from a single audio file.

        Args:
            audio_file: path to an audio file
            sample_rate: sample rate of the loaded samples
            offset: fixed offset in seconds
            num_samples: Optional, number of samples to load.
                         If `None`, all available samples will be loaded.
            channel_selector: Select a subset of available channels.

        Returns:
           An array with shape (samples,) or (channels, samples)
        N)r   	target_srr|   r   )r   r   
n_segmentsr|   r   r%   r&   zUnexpected samples shape: )r   	from_filesegment_from_filer   r,   Tr-   r   )r   rI   r|   r   r   segmentr"   r"   r#   r   :  s&   z'ASRAudioProcessor.get_segment_from_filer!   c                 C   s   t | ts| S t| dkr| S t| dkr| d S | d jdkr)tj| dd}|S | d jdkr9tj| dd}|S td| d j d)a  Convert a list of signals into a multi-channel signal by concatenating
        the elements of the list along the channel dimension.

        If input is not a list, it is returned unmodified.

        Args:
            signal: list of arrays

        Returns:
            Numpy array obtained by concatenating the elements of the list
            along the channel dimension (axis=0).
        r   r%   )axisr&   zUnexpected target with z dimensions.)rc   rF   r   r,   npr3   concatenater-   )r!   	mc_signalr"   r"   r#   r   i  s   
z&ASRAudioProcessor.list_to_multichannelc                 C   s   dd t | D }|S )zGet duration for each audio file in `audio_files`.

        Args:
            audio_files: list of paths to audio files

        Returns:
            List of durations in seconds.
        c                 S   s   g | ]}t j|d qS ))path)librosar   )r   fr"   r"   r#   r$     s    z2ASRAudioProcessor.get_duration.<locals>.<listcomp>r   )rx   r>   r"   r"   r#   r     s   
zASRAudioProcessor.get_durationc                 C   sB   t  }t| jjD ]\}}|j| }| |}t|||< q	|S )a  Given an example, load embedding from `example.audio_files[embedding]`
        and return it in a dictionary.

        Args:
            example: An example from audio collection

        Returns:
            An dictionary of embedding keys and their tensors.
        )r   r   rP   r6   rx   load_embedding_vectorr.   r4   )rQ   rg   rq   r   r!   embedding_filert   r"   r"   r#   rp     s   


z ASRAudioProcessor.load_embeddingfilepathc                 C   sR   |  dr"t| d}t|}W d   |S 1 sw   Y  |S td|  )zLoad an embedding vector from a file.

        Args:
            filepath: path to a file storing a vector.
                    Currently, it is assumed the file is a npy file.

        Returns:
            Array loaded from filepath.
        z.npyrbNz'Unknown embedding file format in file: )endswithopenr   loadr-   )r   r   rt   r"   r"   r#   r     s   

z'ASRAudioProcessor.load_embedding_vector)NrH   )NNr   F)NN),r@   rA   rB   __doc__rE   boolr   rC   rR   propertyrI   setterrJ   r=   rN   rO   rP   r   AudioOUTPUT_TYPEr   r.   Tensorrl   rj   rk   rm   ro   classmethodintr   r   ndarrayr   r   r{   r   r   staticmethodr   r   r   rp   r   r"   r"   r"   r#   rG   p   s    
		  &+  )
T4.&# rG   c                       s   e Zd ZdZeejdeee	e
f  fddZdejdedee f fdd	Zdefd
dZdedee	ejf fddZdefddZdeej fddZ  ZS )BaseAudioDataseta  Base class of audio datasets, providing common functionality
    for other audio datasets.

    Args:
        collection: Collection of audio examples prepared from manifest files.
        audio_processor: Used to process every example from the collection.
                         A callable with `process` method. For reference,
                         please check ASRAudioProcessor.
    r   c                 C   s   dS )z+Returns definitions of module output ports.Nr"   rU   r"   r"   r#   output_types  s    zBaseAudioDataset.output_types
collectionaudio_processoroutput_typec                    s    t    || _|| _|| _dS )zInstantiates an audio dataset.N)superrR   r   r   r   )rQ   r   r   r   	__class__r"   r#   rR     s   

zBaseAudioDataset.__init__c                 C   sR   |  d}|| jdkrdS || jdkr|| jd S td| d|| j )a  Returns the number of channels for a particular signal in
        items prepared by this dictionary.

        More specifically, this will get the tensor from the first
        item in the dataset, check if it's a one- or two-dimensional
        tensor, and return the number of channels based on the size
        of the first axis (shape[0]).

        NOTE:
        This assumes that all examples have the same number of channels.

        Args:
            signal_key: string, used to select a signal from the dictionary
                        output by __getitem__

        Returns:
            Number of channels for the selected signal.
        r   r%   r&   z*Unexpected number of dimension for signal z with shape )__getitem__r,   r   r-   )rQ   
signal_keyitemr"   r"   r#   num_channels  s   
zBaseAudioDataset.num_channelsindexc                 C   s   | j | }| jj|d}|S )a  Return a single example from the dataset.

        Args:
            index: integer index of an example in the collection

        Returns:
            Dictionary providing mapping from signal to its tensor.
            For example:
            ```
            {
                'input_signal': input_signal_tensor,
                'target_signal': target_signal_tensor,
            }
            ```
        rh   )r   r   rl   )rQ   r   rg   rq   r"   r"   r#   r     s   
zBaseAudioDataset.__getitem__c                 C   s
   t | jS )z-Return the number of examples in the dataset.)r   r   rU   r"   r"   r#   __len__  r\   zBaseAudioDataset.__len__c                 C   s   | j t| S )zCollate items in a batch.)r   r<   )rQ   r   r"   r"   r#   _collate_fn  s   zBaseAudioDataset._collate_fn)r@   rA   rB   r   r   abcabstractmethodr   r   rC   r   r   r   r   r   r
   r   rR   r   r   r.   r   r   r   r	   r   __classcell__r"   r"   r   r#   r     s    
 r   AudioToTargetExamplez5input_signal input_length target_signal target_length)typenamefield_namesc                       s   e Zd ZdZ								ddededededee d	ed
ee dee dee dee dee dee f fddZ	e
deeeef  fddZ  ZS )r   a	  A dataset for audio-to-audio tasks where the goal is to use
    an input signal to recover the corresponding target signal.

    Each line of the manifest file is expected to have the following format:

    .. code-block:: json

        {"input_key": "path/to/input.wav", "target_key": "path/to/target.wav", "duration": "duration_in_seconds"}

    Additionally, multiple audio files may be provided for each key in the manifest, for example,

    .. code-block:: json

        {"input_key": "path/to/input.wav", "target_key": ["path/to/path_to_target_ch0.wav", "path/to/path_to_target_ch1.wav"], "duration": "duration_in_seconds"}

    Keys for input and target signals can be configured in the constructor (`input_key` and `target_key`).

    Args:
        manifest_filepath: Path to manifest file in a format described above.
        sample_rate: Sample rate for loaded audio signals.
        input_key: Key pointing to input audio files in the manifest
        target_key: Key pointing to target audio files in manifest
        audio_duration: Optional duration of each item returned by __getitem__.
                        If `None`, complete audio will be loaded.
                        If set, a random subsegment will be loaded synchronously from
                        target and audio, i.e., with the same start and end point.
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
        max_duration: If audio exceeds this length, do not include in dataset.
        min_duration: If audio is less than this length, do not include in dataset.
        max_utts: Limit number of utterances.
        input_channel_selector: Optional, select subset of channels from each input audio file.
                                If `None`, all channels will be loaded.
        target_channel_selector: Optional, select subset of channels from each input audio file.
                                 If `None`, all channels will be loaded.
        normalization_signal: Normalize audio signals with a scale that ensures the normalization signal is in range [-1, 1].
                              All audio signals are scaled by the same factor. Supported values are `None` (no normalization),
                              'input_signal', 'target_signal'.
    NFmanifest_filepathrI   	input_key
target_keyaudio_durationrJ   max_durationmin_durationmax_uttsinput_channel_selectortarget_channel_selectorrK   c                    sZ   ||d}t j|||||	d}t|||d}tddg||
|gd|_t j||td d S )N)input_signaltarget_signalmanifest_filesaudio_to_manifest_keyr   r   
max_numberrI   rJ   rK   r   r   r6   r>   r?   r   r   r   )r   AudioCollectionrG   r=   rN   r   rR   r   )rQ   r   rI   r   r   r   rJ   r   r   r   r   r   rK   r   r   r   r   r"   r#   rR   B  s*   zAudioToTargetDataset.__init__r   c                 C   s\   t dt }t dt }t| ddkr|n|t dt | ddkr%|n|t dt dS )a  Returns definitions of module output ports.

        Returns:
            OrderedDict: Dictionary containing the following items:
                input_signal:
                    Batched single- or multi-channel input audio signal
                input_length:
                    Batched original length of each input signal
                target_signal:
                    Batched single- or multi-channel target audio signal
                target_length:
                    Batched original length of each target signal
        Br   r   Cr   r   r%   r   r   )r   input_lengthr   target_lengthr   r   r   r   r   rQ   sc_audio_typemc_audio_typer"   r"   r#   r   k  s   

z!AudioToTargetDataset.output_typesNFNNNNNNr@   rA   rB   r   rC   r   r   rE   r   rR   r   r   r   r   r   r"   r"   r   r#   r     sJ    .	
)$r   !AudioToTargetWithReferenceExamplezWinput_signal input_length target_signal target_length reference_signal reference_lengthc                "       s   e Zd ZdZ											ddedededed	ed
ee dedee dee dee dee dee dee dedee dee f  fddZ	e
deeeef  fddZ  ZS )r   aq  A dataset for audio-to-audio tasks where the goal is to use
    an input signal to recover the corresponding target signal and an
    additional reference signal is available.

    This can be used, for example, when a reference signal is
    available from
    - enrollment utterance for the target signal
    - echo reference from playback
    - reference from another sensor that correlates with the target signal

    Each line of the manifest file is expected to have the following format

    .. code-block:: json

        {"input_key": "path/to/input.wav", "target_key": "path/to/path_to_target.wav", "reference_key": "path/to/path_to_reference.wav", "duration": "duration_in_seconds"}

    Keys for input, target and reference signals can be configured in the constructor.

    Args:
        manifest_filepath: Path to manifest file in a format described above.
        sample_rate: Sample rate for loaded audio signals.
        input_key: Key pointing to input audio files in the manifest
        target_key: Key pointing to target audio files in manifest
        reference_key: Key pointing to reference audio files in manifest
        audio_duration: Optional duration of each item returned by __getitem__.
                        If `None`, complete audio will be loaded.
                        If set, a random subsegment will be loaded synchronously from
                        target and audio, i.e., with the same start and end point.
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
        max_duration: If audio exceeds this length, do not include in dataset.
        min_duration: If audio is less than this length, do not include in dataset.
        max_utts: Limit number of utterances.
        input_channel_selector: Optional, select subset of channels from each input audio file.
                                If `None`, all channels will be loaded.
        target_channel_selector: Optional, select subset of channels from each input audio file.
                                 If `None`, all channels will be loaded.
        reference_channel_selector: Optional, select subset of channels from each input audio file.
                                    If `None`, all channels will be loaded.
        reference_is_synchronized: If True, it is assumed that the reference signal is synchronized
                                   with the input signal, so the same subsegment will be loaded as for
                                   input and target. If False, reference signal will be loaded independently
                                   from input and target.
        reference_duration: Optional, can be used to set a fixed duration of the reference utterance. If `None`,
                            complete audio file will be loaded.
        normalization_signal: Normalize audio signals with a scale that ensures the normalization signal is in range [-1, 1].
                              All audio signals are scaled by the same factor. Supported values are `None` (no normalization),
                              'input_signal', 'target_signal', 'reference_signal'.
    NFTr   rI   r   r   reference_keyr   rJ   r   r   r   r   r   reference_channel_selectorreference_is_synchronizedreference_durationrK   c                    s   |||d}t j|||	||
d}t|||d}|r'tg d||||gd|_ntddg|||gd|_tdg|g|gd|_t j||td d S )	N)r   r   reference_signalr   r   r   r   r   r   r   )	r   r   rG   r=   rN   rO   r   rR   r   )rQ   r   rI   r   r   r   r   rJ   r   r   r   r   r   r   r   r   rK   r   r   r   r   r"   r#   rR     sF   

z*AudioToTargetWithReferenceDataset.__init__r   c              	   C   sz   t dt }t dt }t| ddkr|n|t dt | ddkr%|n|t dt | ddkr4|n|t dt dS )	a  Returns definitions of module output ports.

        Returns:
            OrderedDict: Dictionary containing the following items:
                input_signal:
                    Batched single- or multi-channel input audio signal
                input_length:
                    Batched original length of each input signal
                target_signal:
                    Batched single- or multi-channel target audio signal
                target_length:
                    Batched original length of each target signal
                reference_signal:
                    Batched single- or multi-channel reference audio signal
                reference_length:
                    Batched original length of each reference signal
        r   r   r   r%   r   r   r   )r   r   r   r   r   reference_lengthr   r   r"   r"   r#   r     s   


z.AudioToTargetWithReferenceDataset.output_types)NFNNNNNNTNNr   r"   r"   r   r#   r     s`    9	
=$r   !AudioToTargetWithEmbeddingExamplezWinput_signal input_length target_signal target_length embedding_vector embedding_lengthc                       s   e Zd ZdZ								ddededededed	ee d
edee dee dee dee dee dee f fddZ	e
deeeef  fddZ  ZS )r   a  A dataset for audio-to-audio tasks where the goal is to use
    an input signal to recover the corresponding target signal and an
    additional embedding signal. It is assumed that the embedding
    is in a form of a vector.

    Each line of the manifest file is expected to have the following format

    .. code-block:: json

        {"input_key": "path/to/input.wav", "target_key": "path/to/path_to_target.wav", "embedding_key": "path/to/path_to_reference.npy", "duration": "duration_in_seconds"}

    Keys for input, target and embedding signals can be configured in the constructor.

    Args:
        manifest_filepath: Path to manifest file in a format described above.
        sample_rate: Sample rate for loaded audio signals.
        input_key: Key pointing to input audio files in the manifest
        target_key: Key pointing to target audio files in manifest
        embedding_key: Key pointing to embedding files in manifest
        audio_duration: Optional duration of each item returned by __getitem__.
                        If `None`, complete audio will be loaded.
                        If set, a random subsegment will be loaded synchronously from
                        target and audio, i.e., with the same start and end point.
        random_offset: If `True`, offset will be randomized when loading a subsegment
                       from a file.
        max_duration: If audio exceeds this length, do not include in dataset.
        min_duration: If audio is less than this length, do not include in dataset.
        max_utts: Limit number of utterances.
        input_channel_selector: Optional, select subset of channels from each input audio file.
                                If `None`, all channels will be loaded.
        target_channel_selector: Optional, select subset of channels from each input audio file.
                                 If `None`, all channels will be loaded.
        normalization_signal: Normalize audio signals with a scale that ensures the normalization signal is in range [-1, 1].
                              All audio signals are scaled by the same factor. Supported values are `None` (no normalization),
                              'input_signal', 'target_signal'.
    NFr   rI   r   r   embedding_keyr   rJ   r   r   r   r   r   rK   c                    sj   |||d}t j|||	||
d}t|||d}tddg|||gd|_tdgd|_t j||td	 d S )
N)r   r   embedding_vectorr   r   r   r   r   r   )r6   r   )	r   r   rG   r=   rN   rP   r   rR   r   )rQ   r   rI   r   r   r   r   rJ   r   r   r   r   r   rK   r   r   r   r   r"   r#   rR   G  s2   
z*AudioToTargetWithEmbeddingDataset.__init__r   c              	   C   sp   t dt }t dt }t| ddkr|n|t dt | ddkr%|n|t dt t dt t dt dS )	a  Returns definitions of module output ports.

        Returns:
            OrderedDict: Dictionary containing the following items:
                input_signal:
                    Batched single- or multi-channel input audio signal
                input_length:
                    Batched original length of each input signal
                target_signal:
                    Batched single- or multi-channel target audio signal
                target_length:
                    Batched original length of each target signal
                embedding_vector:
                    Batched embedded vector format
                embedding_length:
                    Batched original length of each embedding vector
        r   r   r   r%   r   r   )r   D)r   r   r   r   r   embedding_length)r   r   r   r   r   r   r   r"   r"   r#   r   u  s   



z.AudioToTargetWithEmbeddingDataset.output_typesr   r   r"   r"   r   r#   r   !  sN    ,	
.$r   )0r   r   r   r   r   r   dataclassesr   typingr   r   r   r   r	   r
   r   r   numpyr   r.   0nemo.collections.asr.parts.preprocessing.segmentr   r   +nemo.collections.common.parts.preprocessing#nemo.collections.common.parts.utilsr   nemo.core.classesr   nemo.core.neural_typesr   r   r   r   
nemo.utilsr   __all__dictr   r<   r=   rG   r   r   r   r   r   r   r   r"   r"   r"   r#   <module>   sP   $A    SUl 