o
    }oi                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZmZmZ d dlmZmZ d dlmZ d dlmZ G d	d
 d
ZdS )    N)	OmegaConf)write_manifest)audio_to_audio_dataset)ASRAudioProcessorAudioToTargetDataset!AudioToTargetWithEmbeddingDataset!AudioToTargetWithReferenceDataset_audio_collate_fn)LhotseAudioToTargetDatasetconvert_manifest_nemo_to_lhotse)get_segment_start)!get_lhotse_dataloader_from_configc                   @   s   e Zd Zejjejdddgejdddgdd Zejjejdddgdd	 Zejjd
d Z	ejjdd Z
ejjdd Zejjdd Zejjdd Zejjdd ZdS )TestAudioDatasetsnum_channels      num_targets   c                    sr   d}d}t jj|d}|j| |fd  fddt|D }t  k s,J t| k s7J dS )z(Test conversion of a list of arrays into*     seedsizec                    s,   g | ]} | |d   ddf qS )r   N .0ngolden_targetr   r   _/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/audio/test_audio_datasets.py
<listcomp>8      , z?TestAudioDatasets.test_list_to_multichannel.<locals>.<listcomp>N)nprandomdefault_rngnormalranger   list_to_multichannelall)selfr   r   random_seednum_samples_rngtarget_listr   r   r    test_list_to_multichannel)   s   z+TestAudioDatasets.test_list_to_multichannelc                    s   dd}g d}dg| D ]L}t dd|d}t|D ]>} fdd	|D }||}|r;d
||   |j  }	nd
}	|D ]}
t||
 ||
 |	 sWJ d| d|
 q?qqdS )z+Test signal normalization in process_audio.r      input_signaltarget_signalreference_signalN>  F)sample_raterandom_offsetnormalization_signalc                    s   i | ]	}|t  qS r   )torchrandn)r   signalr   r,   r   r    
<dictcomp>P   s    zBTestAudioDatasets.test_processor_process_audio.<locals>.<dictcomp>g      ?zFailed example  signal )r   r'   process_audioabsmaxepsr9   allclose)r*   r   num_examplessignalsr8   	processorr   exampleprocessed_examplescaler;   r   r<   r    test_processor_process_audio?   s,   
z.TestAudioDatasets.test_processor_process_audioc                 C   s  d}d}d}t jj|d}dddd}|jdd	|d
|jdd	|d
|jdd	|d
d}g }t|D ]+}t }	| D ]\}
}|j|||
 | fd}t |}t	
||	|
< q:||	 q1t|}|d    |d    |d    d}|d    |d    |d    d}| D ][\}
}t|D ]R}||
 | }||
 | }||ksJ d| d|
 d| d| d	|| d|df }|| |
 d|df    }t j|||dsJ d| d|
 dqqdS )zTest `_audio_collate_fn`   r   gh㈵>r   r   r   r1         lowhighr   r   r      r   zExample r>   z length mismatch: batched (z) != golden ()N.atolz value mismatch.)r#   r$   r%   integersr'   dictitemsr&   squeezer9   tensorappendr	   cpudetachnumpyrC   )r*   
batch_sizer+   rT   r-   signal_to_channelssignal_to_lengthbatchr   itemr;   r   random_signalbatchedbatched_signalsbatched_lengthsb_signal
uut_lengthgolden_length
uut_signalgolden_signalr   r   r    test_audio_collate_fn_   sZ   

$z'TestAudioDatasets.test_audio_collate_fnc           /         s  d}d}d}ddd}d}d}d	d
d}d}t jj|d}	t |	j|||dd}
t |
| t}t }|	 D ]1\}}g ||< t
|D ]$}|dkrW|	jdd|| d}n|	jdd||| fd}|| | qFq:t }g }t
|D ]6}t }|D ]#}| d|dd}ttj|||| | j|d |||| < q|
| |d< || qxtj|d}t|| t||d |d |d||d |d |d}t|}|dd}t|||d |d d |d|dd}tt|d dt d!}d"d# |D }|D ]%}|| |ksJ d$| || ||ks)J d$| qt
|D ]|}|D ]v}|| | }d%D ]j}|rK|| | d n || }| || }|j!|j!ksrJ d&| d'| d(|j! d)|j! t j"|||d*sJ d&| d+| d,| d-| d.	t j"|||d*sJ d&| d/| d,| d-| d.	q=q3q/d0d1t||d |d |d2|d|dd3}tt|d dt d!}d4d# |D }fd5d#t#|
D } t
t$D ]]}d%D ]W}|D ]Q}|r|| | d n || }|| | |  }|j!|j!ks*J d6| d'| d(|j! d)|j! t j"|||d*sDJ d6| d+| d,| d-| d.	qqqd dgdd}!t||d |d |!d |!d |d7t
t$D ]S} |}"|D ]H}|!| }#|"| % & ' }|| | |#d8f }|j!|j!ksJ d9| d:|j! d)|j! t j"|||d*sJ d;| d,| d-| d.qrqid< tt  | }$ fd=d#t#|
D } d>D ]}%t||d |d |  |%d?|d  |%rd@ndA|ddB}tt|d dt d!}dCd# |D }t
t$D ]}d%D ]}|r|| n |}"dD }&}'|D ]}|r2|"| d n|"| }|| | |  }(|&dDu rit(|(d dDdDf |d dDdDf dE}&|%se|&d kseJ dF| dG|&|$ }'|(d8|&|'f }|j!dH |$ksJ dF| dI|j!dH  dJ|$ d.|j!|j!ksJ dF| d'| d(|j! d)|j! t j"|||d*sJ dF| d+| d,| d-| d.	q&qqqdK})d%D ]}|r|)|dL< tt|d dt d!}t)t*|}*nfdMd#t
|)D }++|+}*t#|, D ]f\}}|dNdO},t-|*tr|*| j!}-|*|, }.n|*d|  j!}-|*d| d  }.|-|)|| |$fks;J dP| dQ| dR|- t$|.|)ksOJ dP| dSt$|. d.t.|.|$ks`J dP| dT|. qqW dD   dDS 1 spw   Y  dDS )UaV  Test AudioWithTargetDataset in different configurations.

        Test below cover the following:
        1) no constraints
        2) filtering based on signal duration
        3) use with channel selector
        4) use with fixed audio duration and random subsegments
        5) collate a batch of items

        In this use case, each line of the manifest file has the following format:
        ```
        {
            'input_filepath': 'path/to/input.wav',
            'target_filepath': 'path/to/path_to_target.wav',
            'duration': duration_of_input,
        }
        ```
        r   r5   rM   rQ   r   r2   r3          @       @input_filepathtarget_filepathư>r   rN   r   r               ?_02d.wavfloatdurationmanifest.jsonr2   r3   manifest_filepath	input_key
target_keyr6   .json_cuts.jsonlinput_manifestoutput_manifestr}   r~   T	cuts_path
use_lhotser6   r^   r   global_rank
world_sizedatasetc                 S      g | ]}|qS r   r   r   rb   r   r   r    r!         zBTestAudioDatasets.test_audio_to_target_dataset.<locals>.<listcomp>z$Num channels not correct for signal FTTest 1, use_lhotse=	: Signal  item shape  not matching reference shape rS   : Failed for example 	, signal  (random seed rR   : Failed for factory example g      @g      @)r|   r}   r~   min_durationmax_durationr6   )r   r   r   r   r6   r^   c                 S   r   r   r   r   r   r   r    r!   O  r   c                    s,   g | ]\}}|  kr krn n|qS r   r   r   r   val)r   r   r   r    r!   Q  r"   zTest 2, use_lhotse=)r|   r}   r~   input_channel_selectortarget_channel_selectorr6   .Signal : item shape Test 3: Failed for example       @c                       g | ]
\}}| kr|qS r   r   r   audio_durationr   r    r!         )TF)r|   r}   r~   r6   r   r   r7   r$   start)r   r   r   truncate_durationtruncate_offset_typer6   r^   c                 S   r   r   r   r   r   r   r    r!     r   Nr;   segmentzTest 4, use_lhotse=z@: Expecting the signal to start at 0 when random_offset is Falsez: Signal length ($) not matching the expected length (rK   r^   c                       g | ]}  |qS r   __getitem__r   r   r   r    r!         _signal_lengthzTest 5, use_lhotse=z: Unexpected signal z shape z#: Unexpected length of signal_len (z: Unexpected signal_len )/r#   r$   r%   rounduniformfloorastypeintrV   rW   r'   rZ   tempfileTemporaryDirectorysfwriteospathjoinTr   r   r   get_audio_to_target_datasetreplacer   r   r   creater
   r   rX   r   shaperC   	enumeratelenr[   r\   r]   r   nextiter
collate_fnkeys
isinstancer)   )/r*   r+   r6   rD   data_num_channelsdata_min_durationdata_max_durationdata_keyrT   r-   data_durationdata_duration_samplesdatar;   r   r   rc   test_dirmetadatametasignal_filenamer|   configdataset_factoryr   config_lhotse	dl_lhotsedataset_lhotserk   r   item_signalitem_factory_signalfiltered_exampleschannel_selectorrb   csaudio_duration_samplesr7   golden_start
golden_endfull_golden_signalr^   rd   ra   lengthsignal_shape
signal_lenr   )r   r   r   r   r    test_audio_to_target_dataset   s   $
	
	



$

$
	

	


%



 $z.TestAudioDatasets.test_audio_to_target_datasetc           #      C   s  d}d}d}ddd}d}d}d	d
d}d}t jj|d}	t |	j|||dd}
t |
| t}t }|	 D ]1\}}g ||< t
|D ]$}|dkrW|	jdd|| d}n|	jdd||| fd}|| | qFq:t }g }t
|D ]n}t }|D ][}|dkrg }t
|| D ]*}|| d|dd| d ttj||d || | |ddf |d qn| d|dd}ttj|||| | j|d |||| < q|
| |d< || qxtj|d}t|| t||d |d |d}||d |d |d}t|}|dd}t|||d |d d  |d!|dd"}tt|d#dt d$}d%d& |D }t
|D ]}d'D ]}}|rL|| n||}||} |D ]f}|rd|| d#n|| }!|| | }"|!j |"j ksJ d(| d)| d*|!j  d+|"j  t j!|!|"|d,sJ d(| d-| d.| d/| d0	t j!| | |"|d,sJ d(| d1| d.| d/| d0	qXqCq?t||d |d |d gd#|d2}t
|D ]c}||}|D ]X}|| " # $ }!|| | }"|dkrt j%|d | d#dd3f |"gd#d4}"|!j |"j ks!J d5| d6|!j  d+|"j  t j!|!|"|d,s8J d7| d.| d/| d0qqW d   dS 1 sHw   Y  dS )8a  Test AudioWithTargetDataset when the input manifest has a list
        of audio files in the target key.

        In this use case, each line of the manifest file has the following format:
        ```
        {
            'input_filepath': 'path/to/input.wav',
            'target_filepath': ['path/to/path_to_target_ch0.wav', 'path/to/path_to_target_ch1.wav'],
            'duration': duration_of_input,
        }
        ```
        r   r5   rM   rQ   r   rm   rn   ro   rp   rq   rr   r   rN   r   r   rs   rt   r3   ru   rv   _ch_rw   r   Nrx   ry   rz   r2   r{   r   r   r   Tr   r   r   c                 S   r   r   r   r   r   r   r    r!   n  r   zSTestAudioDatasets.test_audio_to_target_dataset_with_target_list.<locals>.<listcomp>r   r   r   r   r   rS   r   r   r   rR   r   )r|   r}   r~   r   r6   .)axisr   r   Test 2: Failed for example )&r#   r$   r%   r   r   r   r   r   rV   rW   r'   rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r
   r   rX   r   rC   r[   r\   r]   concatenate)#r*   r+   r6   rD   r   r   r   r   rT   r-   r   r   r   r;   r   r   rc   r   r   r   r   chr|   r   r   r   r   r   r   r   r   rb   item_factoryr   rk   r   r   r    -test_audio_to_target_dataset_with_target_list  s   $

	





&
$z?TestAudioDatasets.test_audio_to_target_dataset_with_target_listc           !      C   sH  d}d}d}ddi}d}d}ddi}d	}t jj|d
}	t |	j|||dd}
t |
| t}t }|	 D ]1\}}g ||< t
|D ]$}|dkrU|	jdd|| d}n|	jdd||| fd}|| | qDq8t *}g }t
|D ]6}t }|D ]#}| d|dd}ttj|||| | j|d |||| < q}|
| |d< || qvtj|d}t|| t||d d|d}||d d|d}t|}|dd}t|||d dd |d|dd}tt|ddt d}dd  |D }t
|D ]}d!D ]}|d"kr||}n|d#kr||}n|d$kr%|| }ntd%| d&|v rA|d&   dksAJ | d'|D ]J}|d$krQ|| !dn|| }|| | } |j"| j"kssJ | d(| d)|j" d*| j" t j#|| |d+sJ | d,| d-| d.| d/qCqqW d   dS 1 sw   Y  dS )0a  Test AudioWithTargetDataset when target_key is
        not set, i.e., it is `None`. This is the case, e.g., when
        running inference, and a target is not available.

        In this use case, each line of the manifest file has the following format:
        ```
        {
            'input_filepath': 'path/to/input.wav',
            'duration': duration_of_input,
        }
        ```
        r   r5   rM   r2   rQ   rn   ro   rp   rr   r   rN   r   r   rs   rt   ru   rv   rw   rx   ry   rz   Nr{   r   r   r   Tr   r   r   c                 S   r   r   r   r   r   r   r    r!     r   zPTestAudioDatasets.test_audio_to_target_dataset_for_inference.<locals>.<listcomp>)originalfactorylhotser   r   r   zUnknown label r3   z(: target_signal is expected to be empty.z -- Signal r   r   rS   z -- Test 1: Failed for example r   r   rR   )$r#   r$   r%   r   r   r   r   r   rV   rW   r'   rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r
   r   
ValueErrornumelrX   r   rC   )!r*   r+   r6   rD   r   r   r   r   rT   r-   r   r   r   r;   r   r   rc   r   r   r   r   r|   r   r   r   r   r   r   r   labelrb   r   rk   r   r   r    *test_audio_to_target_dataset_for_inference  s   $
	
	




  
$z<TestAudioDatasets.test_audio_to_target_dataset_for_inferencec           %         s  d}d}d}dddd}d}d	}d
ddd}d}t jj|d}	t |	j|||dd}
t |
| t}t }|	 D ]1\}}g ||< t
|D ]$}|dkrY|	jdd|| d}n|	jdd||| fd}|| | qHq<t h}g }t
|D ]6}t }|D ]#}| d|dd}ttj|||| | j|d |||| < q|
| |d< || qztj|d}t|| t||d |d |d d|d||d |d |d d|d}t|}t
|D ]o}|}||}|D ]`}||    }|| | }|j|jksJ d| d|j d |j t j|||d!s2J d"| d#| d$| d%||    }t j|||d!sSJ d&| d#| d$| d%qqd' tt  | }t||d |d |d d(|  d(d)	 fd*d+t|
D }t
tD ]w}|}d, }} |D ]h}||    }|| ||  }!|d,u rt |!d-d,d,f |d-d,d,f d.}|| } |!d/|| f }|jd0 |ksJ d1| d2|jd0  d3| d%t j|||d!sJ d4| d#| d$| d%qqd' tt  | }t||d |d |d d|  d(d)	 fd5d+t|
D }t
tD ]}|}d, }} |D ]}||    }|| ||  }!|dkrT|!}n<|d,u rot |!d-d,d,f |d-d,d,f d.}|| } |!d/|| f }|jd0 |ksJ d6| d2|jd0  d3| d%|j|jksJ d| d|j d |j t j|||d!sJ d7| d#| d$| d%q8q+d8}"fd9d+t
|"D }#!|#}$W d,   d,S 1 sw   Y  d,S ):a  Test AudioWithTargetWithReferenceDataset in different configurations.

        1) reference synchronized with input and target
        2) reference not synchronized

        In this use case, each line of the manifest file has the following format:
        ```
        {
            'input_filepath': 'path/to/input.wav',
            'target_filepath': 'path/to/path_to_target.wav',
            'reference_filepath': 'path/to/path_to_reference.wav',
            'duration': duration_of_input,
        }
        ```
        r   r5   rM   rQ   r   r   r1   rn   ro   rp   rq   reference_filepathrr   r   rN   r   rs   rt   ru   rv   rw   rx   ry   rz   r2   r3   r4   F)r|   r}   r~   reference_keyreference_is_synchronizedr6   r   r   r   rS   Test 1: Failed for example r   r   rR   #Test 1: Failed for factory example r   T)	r|   r}   r~   r   r   r6   r   r   r7   c                    r   r   r   r   r   r   r    r!     r   zQTestAudioDatasets.test_audio_to_target_with_reference_dataset.<locals>.<listcomp>Nr   r   .r   zTest 2: Signal z	 length (r   r   c                    r   r   r   r   r   r   r    r!     r   zTest 3: Signal r   rK   c                    r   r   r   r   r   r   r    r!     r   )"r#   r$   r%   r   r   r   r   r   rV   rW   r'   rZ   r   r   r   r   r   r   r   r   r   r   r   *get_audio_to_target_with_reference_datasetr   r[   r\   r]   r   rC   r   r   r   r   )%r*   r+   r6   rD   r   r   r   r   rT   r-   r   r   r   r;   r   r   rc   r   r   r   r   r|   r   r   rb   r   r   rk   r   r   r   r   r   r   r^   ra   ru   r   )r   r   r    +test_audio_to_target_with_reference_dataset$  s2  $







$




 $z=TestAudioDatasets.test_audio_to_target_with_reference_datasetc           "         s>  d}d}d}dddd}d}d	}d
}dddd}d}	t jj|d}
t |
j|||dd}t || t}t }|	 D ]7\}}g ||< t
|D ]*}|dkrR|n|| }|dkrc|
jdd|d}n
|
jdd||fd}|| | qJq>t }g }t
|D ]S}t }|D ]@}|dkr| d|dd}t tj|||| |  n| d|dd}ttj|||| | j|d |||| < q|| |d< || qtj|d}t|| t||d |d |d |d ||d |d |d |d}t|}t
|D ]q} |}||}|D ]a}||    }|| | }|j|jks>J d| d |j d!|j t j|||	d"sUJ d#| d$| d%| d&||    }t j|||	d"svJ d'| d$| d%| d&qqd(} fd)d*t
|D }  | }!W d+   d+S 1 sw   Y  d+S ),a  Test AudioWithTargetWithEmbeddingDataset.

        In this use case, each line of the manifest file has the following format:
        ```
        {
            'input_filepath': 'path/to/input.wav',
            'target_filepath': 'path/to/path_to_target.wav',
            'embedding_filepath': 'path/to/path_to_embedding.npy',
            'duration': duration_of_input,
        }
        ```
        r   r5   rM   rQ   r   r   )r2   r3   embedding_vectorrn   ro   @   rp   rq   embedding_filepathrr   r   rN   r   r   rs   rt   ru   rv   z.npyrw   rx   ry   rz   r2   r3   )r|   r}   r~   embedding_keyr6   r   r   r   rS   r   r   r   rR   r   rK   c                    r   r   r   r   r   r   r    r!     r   zQTestAudioDatasets.test_audio_to_target_with_embedding_dataset.<locals>.<listcomp>N) r#   r$   r%   r   r   r   r   r   rV   rW   r'   rZ   r   r   saver   r   r   r   r   r   r   r   r   *get_audio_to_target_with_embedding_datasetr   r[   r\   r]   r   rC   r   )"r*   r+   r6   rD   r   r   r   embedding_lengthr   rT   r-   r   r   r   r;   r   r   data_lengthrc   r   r   r   r   r|   r   r   rb   r   r   rk   r   r^   ra   ru   r   r   r    +test_audio_to_target_with_embedding_dataset  s   	 $






$z=TestAudioDatasets.test_audio_to_target_with_embedding_datasetN)__name__
__module____qualname__pytestmarkunitparametrizer/   rJ   rl   r   r   r   r   r  r   r   r   r    r   (   s2    
>
  T
 -
 
 `r   )r   r   r]   r#   r	  	soundfiler   
torch.cudar9   	omegaconfr   /nemo.collections.asr.parts.utils.manifest_utilsr   nemo.collections.audio.datar   *nemo.collections.audio.data.audio_to_audior   r   r   r   r	   1nemo.collections.audio.data.audio_to_audio_lhotser
   r   (nemo.collections.audio.parts.utils.audior   #nemo.collections.common.data.lhotser   r   r   r   r   r    <module>   s   