o
    }oiTj                    @   s   d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZ d dlZd dlZd dlZd dlZd dlmZmZmZmZmZ d dlmZ d d	lmZmZmZ d d
lmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z-m.Z. ej/dddefddZ0ej/dddedefddZ1ej/dddedefddZ2ej/dddefddZ3ej/dddedefddZ4ej/dddefdd Z5ej/dddedee6e6f fd!d"Z7ej/ddd"edee6e6f fd#d$Z8ej/ddd"e9e6e6f dee6e6f fd%d&Z:ej/ddd"ee6e6f dee6e6f fd'd(Z;G d)d* d*ej<j=j>Z?defd+d,Z@defd-d.ZAdefd/d0ZBdefd1d2ZCd efd3d4ZDdefd5d6ZEdefd7d8ZFdefd9d:ZGdefd;d<ZHG d=d> d>ZIdefd?d@ZJd"e9e6e6f fdAdBZKd"e9e6e6f fdCdDZLd&e9e6e6f fdEdFZMd&e9e6e6f fdGdHZNd"e9e6e6f fdIdJZOdedefdKdLZPdedefdMdNZQG dOdP dPej<j=j>ZRejSTdQdRgdSdTgdedQe6dRe6fdUdVZUG dWdX dXej<j=j>ZVejSTdYdZgd[d\gdedYe6dZe6fd]d^ZWd_efd`daZXd_efdbdcZYd_efdddeZZG dfdg dgej<j=j>Z[dhdi Z\djdk Z]dldm Z^ej/dddndo Z_ej/dddpdq Z`ej/ddde6fdrdsZadtdu Zbdvdw Zcej/dddxdy Zddye9e6e6f fdzd{Zeej/ddde-fd|d}Zfdye9e6e6f d&e9e6e6f d}e-dse6fd~dZgdye9e6e6f d&e9e6e6f d}e-dse6fddZhdye9e6e6f d&e9e6e6f d}e-dse6fddZidye9e6e6f d&e9e6e6f d}e-dse6fddZjdedefddZkdedefddZkdefddZlded&efddZmdefddZnd&e9e6e6f fddZod&e9e6e6f fddZpej/dddefddsZadedsefddZqd"e9dsefddZrd"e9dsefddZsej/dddee6e6f fddZtde9e6e6f fddZudefddZvdefddZwd(e9e6e6f fddZxdefddZyd$e9ee6f fddZzdd Z{dS )    )Counter)BytesIO)islice)Path)DictListTupleN)CutSetMonoCutNumpyFilesWriter	Recordingcompute_num_samples)AudioLoadingError)CutMixedCut
PaddingCut)RoundRobinSampler
ZipSamplerJsonlShardWriter)dummy_recording)deterministic_rng)	OmegaConf)!get_lhotse_dataloader_from_config)SourceTargetTextExampleTextExample)SentencePieceTokenizercreate_spt_modelsession)scopereturnc                 C   sx   ddl m} ddlm} ||dddd}|D ]}d|_d|_d|jd _q| d}|d	 }|d
 }||	| |S )z.10 utterances of length 1s as a Lhotse CutSet.r   r	   DummyManifest
   Tbegin_idend_id	with_dataNdatacuts.jsonl.gzaudio)
lhotser	   lhotse.testing.dummiesr#   featurescustomsupervisionsmktempsave_audiosto_file)tmp_path_factoryr	   r#   cutsctmp_pathppa r:   d/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/common/test_lhotse_dataloading.pycutset_path&   s   
r<   c                 C   sD   ddl m} || }| jd }|jdd |j|ddidd	 |S )
z<10 utterances of length 1s as a Lhotse Shar (tarred) CutSet.r   r!   sharTexist_ok	recordingwav   fields
shard_size)r,   r	   	from_fileparentmkdirto_sharr<   r	   r5   r8   r:   r:   r;   cutset_shar_path9   s   

rK   c                 C   sN   ddl m} || dd }| jd }|jdd |j|dd	id
d |S )zT10 utterances of length 1s as a Lhotse Shar (tarred) CutSet, but with different IDs.r   r!   c                 S   s
   d|  S )Nzother-r:   idr:   r:   r;   <lambda>J   s   
 z(cutset_shar_path_other.<locals>.<lambda>z
shar-otherTr>   r@   rA   rB   rC   )r,   r	   rF   
modify_idsrG   rH   rI   rJ   r:   r:   r;   cutset_shar_path_otherE   s   
rP   c                 C   sj   ddl m} ddlm} g }|| D ]}||jjd jdd|j	dddd q| j
d	 }||| |S )
z.10 utterances of length 1s as a NeMo manifest.r   r!   )save_to_jsonl
irrelevantnot relevantenpl)audio_filepathtext
text-otherdurationmy-custom-fieldlangcustom-langznemo_manifest.json)r,   r	   lhotse.serializationrQ   rF   appendr@   sourcessourcerY   rG   )r<   r	   rQ   nemor6   r8   r:   r:   r;   nemo_manifest_pathQ   s"   

rb   c                 C   sN   ddl m}m} t|| }|dd D ]}d|d< q| jd }||| |S )zRCreate a nemo manifest with last 2 utterances out of 10 with `_skipme` key enabledr   
load_jsonlrQ   NT_skipmeznemo_manifest_with_skipme.jsonr]   rd   rQ   listrG   )rb   rd   rQ   	all_itemsitemr8   r:   r:   r;   nemo_manifest_with_skipme_pathi   s   


rk   c              	   C   s  ddl m}m} ddlm} d}d}||d|| dd}g }t|D ]L}g }	t|D ]}
||| |
  jjd }|
g|_|		| q*t
|	d|d	|d j|d j|d jd
}||d|d	ddtt|d}|	| q"||}| d}|d }|d }||| |S )z:10 two-channel utterances of length 1s as a Lhotse CutSet.r   )r	   MultiCutr"   r$      Tr%   zmc-dummy-recording-02d)r_   rM   num_samplesrY   sampling_ratezmc-dummy-cut-      ?)r@   rM   startrY   channelr)   zmc_cuts.jsonl.gzmc_audio)r,   r	   rl   r-   r#   ranger@   r_   channelsr^   r   ro   rY   rp   rh   	from_cutsr1   r2   r3   )r4   r	   rl   r#   num_examplesnum_channelssc_cutsmc_cutsn
mc_sourcesrs   r`   reccutr7   r8   r9   r:   r:   r;   mc_cutset_pathx   s:   
	

r   c                 C   s&  ddl m}m} ddlm} | jd }|jdd || ddd	a}||d
 K}t|| D ]<\}}|d }	t|	j	}
t
|	d}||
t|  W d   n1 sWw   Y  |i ||
t|dkd q0W d   n1 sww   Y  W d   n1 sw   Y  |j| dfS )510 utterances of length 1s as a NeMo tarred manifest.r   )SequentialJsonlWriterrd   	TarWriternemo_tarTr>   z/audios_%01d.tarrB   rE   tarred_audio_filepaths.jsonlrV   rbN   )rV   shard_idz/audios__OP_0..1_CL_.tar)r]   r   rd   lhotse.shar.writersr   rG   rH   	enumerater   nameopenwriter   readintpath)rb   r   rd   r   root
tar_writer
mft_writeridxdr8   r   fr:   r:   r;   nemo_tarred_manifest_path   s,   


" 
r   c                 C   sZ   ddl m}m} | \}}t||}|dd D ]}d|d< q|jd }||| ||fS )zZCreate a nemo tarred manifest with last 2 utterances out of 10 with `_skipme` key enabled.r   rc   re   NTrf   z(tarred_audio_filepaths_with_skipme.jsonlrg   )r   rd   rQ   json_ptar_pri   rj   r8   r:   r:   r;   %nemo_tarred_manifest_with_skipme_path   s   


r   c                 C   s   ddl m} ddlm} | \}}|jd }|jdd || ddd	}||D ]}|| q)W d
   n1 s;w   Y  | d|fS )zW10 utterances of length 1s as a NeMo tarred manifest. Stored in one manifest per shard.r   rd   r   shard_manifestsTr>   /manifest_%d.jsonlrB   r   N/manifest__OP_0..1_CL_.jsonl)r]   rd   r   r   rG   rH   r   )r   rd   r   r   r   json_dirr   rj   r:   r:   r;   nemo_tarred_manifest_path_multi   s   
r   c                 C   s   ddl m} ddlm} | \}}|jd }|jdd t||}|dd }|dd }|d	d |d	d  }	|| d
dd}
|	D ]}|
| qEW d   n1 sWw   Y  | d||	fS )zMCreate a shard manifests with randomly chosen 50% percent of tarred contents.r   r   r   r   Tr>   NrB   r      r   r   )r]   rd   r   r   rG   rH   rh   r   )r   rd   r   r   r   r   ri   tarr_0_datatarr_1_datasubset_itemsr   rj   r:   r:   r;    nemo_tarred_manifest_subset_path   s   
r   c                   @   s*   e Zd Zdejdeeejf fddZ	dS )UnsupervisedAudioDatasetr5   r    c                 C   s(   t jj|\}}||dd |D dS )Nc                 S   s   g | ]}|j qS r:   rL   .0r6   r:   r:   r;   
<listcomp>   s    z8UnsupervisedAudioDataset.__getitem__.<locals>.<listcomp>)r+   
audio_lensids)r,   dataset	collationcollate_audio)selfr5   r+   r   r:   r:   r;   __getitem__   s   z$UnsupervisedAudioDataset.__getitem__N)
__name__
__module____qualname__r,   r	   r   strtorchTensorr   r:   r:   r:   r;   r      s    "r   c                 C   s  t | dddddddddddd	dd
}t|ddt d}dd |D }t|dks,J |d }t| h dks<J |d jd |d jd   krQdksTJ  J |d }t| h dksdJ |d jd |d jd   krydks|J  J |d }t| h dksJ |d jd |d jd   krdksJ  J |d }t| h dksJ |d jd |d jd   krdksJ  J d S )N>  Tr   Frm         @      .@r$   d   )	cuts_pathsample_rateshuffle
use_lhotsenum_workersuse_bucketingconcurrent_bucketingnum_buckets	drop_lastbatch_durationquadratic_durationshuffle_buffer_sizebucket_buffer_sizeseed   configglobal_rank
world_sizer   c                 S      g | ]}|qS r:   r:   r   batchr:   r:   r;   r         z4test_dataloader_from_lhotse_cuts.<locals>.<listcomp>r   >   r   r+   r   r+   r   r   r   creater   r   lensetkeysshape)r<   r   dlbatchesbr:   r:   r;    test_dataloader_from_lhotse_cuts   sD   
0004r   c                 C   s   t | dddddddd}t|ddt d}d	d
 |D }t|dks&J |d d jdks1J |d d jdks<J |d d jdksGJ d S )N      ?r   Tr   r   )r   truncate_durationr   r   r   r   
batch_sizer   r   r   c                 S   r   r:   r:   r   r   r:   r:   r;   r   6  r   z=test_dataloader_from_lhotse_cuts_truncate.<locals>.<listcomp>r   r+   r   @  rm   )rm   r   r   r   r   r   r   r   r<   r   r   r   r:   r:   r;   )test_dataloader_from_lhotse_cuts_truncate$  s&   
r   c                 C   s   t | dddddddd}t|ddt d}d	d
 |D }t|dks&J |d d jdks1J |d d jdks<J |d d jdksGJ |d d jdksRJ |d d jdks]J d S )Nr   r   Tr   r   )r   cut_into_windows_durationr   r   r   r   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r   Q  r   zEtest_dataloader_from_lhotse_cuts_cut_into_windows.<locals>.<listcomp>rB   r+   r   rm   r   r   r   r:   r:   r;   1test_dataloader_from_lhotse_cuts_cut_into_windows?  s*   
r   c                 C   s   t | ddddddddd	}t|ddt d}tt|}|\}|jdks(J t|ts/J t	|j
d	ks8J t|j
d jtsCJ t|j
d jtsNJ d S )
Ng      5@leftr   Tr   r   )	r   pad_min_durationpad_directionr   r   r   r   r   r   r   rm   )r   r   r   IdentitynextiterrY   
isinstancer   r   tracksr   r   r
   r<   r   r   r   r   r:   r:   r;   1test_dataloader_from_lhotse_cuts_pad_min_duration\  s(   r   c           	      C   s.  t | ddddddd}t|ddt d}dd	 |D }t|d
ks%J |d d jdks0J |d d jdks;J |d d jdksFJ dD ]L}t | |ddddddd}t|ddt d}t|D ].\}}|d u r|t|d || d s{J qet|d || d d d |d d f sJ qeqHd S )Nr   Tr   r   )r   r   r   r   r   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r     r   zEtest_dataloader_from_lhotse_cuts_channel_selector.<locals>.<listcomp>r   r+   )r   rm   r   rm   )rm   rm   r   )Nr   r   )r   channel_selectorr   r   r   r   r   r   )	r   r   r   r   r   r   r   r   equal)	r   r   r   r   r   	config_csdl_csr|   b_csr:   r:   r;   1test_dataloader_from_lhotse_cuts_channel_selectorv  sN   

0r   c                 C   s  t | dddddddddddd	ddd
}t|ddt d}dd t|dD }t|dks0J |d }t| h dks@J |d jd |d jd   krUdksXJ  J |d }t| h dkshJ |d jd |d jd   kr}dksJ  J |d }t| h dksJ |d jd |d jd   krdksJ  J |d }t| h dksJ |d jd |d jd   krdksJ  J d S )Nr   Tr   Frm   r   r   r$   r   	shar_pathr   r   r   r   r   r   r   r   r   r   r   r   r   
shard_seedr   r   c                 S   r   r:   r:   r   r:   r:   r;   r     r   z9test_dataloader_from_lhotse_shar_cuts.<locals>.<listcomp>r   >   r   r+   r   r+   r   r   	r   r   r   r   r   r   r   r   r   )rK   r   r   r   r   r:   r:   r;   %test_dataloader_from_lhotse_shar_cuts  sF   
0004r   c              
   C   sz   t |  d|  ddddddddd}t|dd	t d
}tt|}t|dks-J |d  }t|t	j
s;J d S )N%/cuts._OP_000000..000001_CL_.jsonl.gz%/recording._OP_000000..000001_CL_.tar)r5   r@   r   r   Fr   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   
load_audior   npndarray)rK   r   r   r   r+   r:   r:   r;   0test_dataloader_from_lhotse_shar_cuts_via_fields  s"   r  c              
   C   s   |  d}t| ddd}tdD ]}|d| dd qW d    n1 s*w   Y  t| d	| d
| ddddddddd}t|ddt d}tt	|}t
|dks`J |d jdksiJ d S )Nwer_dirz/wer.%06d.jsonl.gzrB   r   r$   zdummy-mono-cut-%04dr   )cut_idwerr   r   z$/wer._OP_000000..000001_CL_.jsonl.gz)r5   r@   r  r   r   Fr   r  r   r   )r1   r   ru   r   r   r   r   r   r   r   r   r  )r4   rK   r  writerir   r   r   r:   r:   r;   3test_dataloader_from_lhotse_shar_cuts_add_new_field  s.   
r  c                 C   s  t | dddddddddddd	ddd
}t|ddt d}dd |D }t|dks-J |d }t| h dks=J |d jd |d jd   krRdksUJ  J |d }t| h dkseJ |d jd |d jd   krzdks}J  J |d }t| h dksJ |d jd |d jd   krdksJ  J |d }t| h dksJ |d jd |d jd   krdksJ  J d S )Nr   Tr   Frm   r   r   r$   r   )manifest_filepathr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r   0  r   z6test_dataloader_from_nemo_manifest.<locals>.<listcomp>r   >   r   r+   r   r+   r   r   r   )rb   r   r   r   r   r:   r:   r;   "test_dataloader_from_nemo_manifest  sF   
0004r  c                   @   s   e Zd Zdd ZdS )	_Identityc                 C      |S Nr:   r   r5   r:   r:   r;   r   E     z_Identity.__getitem__N)r   r   r   r   r:   r:   r:   r;   r  D  s    r  c                 C   sj   t | dddddddddd
}t|ddt d	}tt|}|D ]}t|jts+J d
|jv s2J q!d S )Nr   Tr   Fr   r$   )
r  r   r   r   r   r   r   r   r   r   r   r   rZ   )	r   r   r   r  r   r   r   r/   dict)rb   r   r   r   r   r:   r:   r;   4test_dataloader_from_nemo_manifest_has_custom_fieldsI  s&   r  c                 C     | \}}t i d|d|dddddddd	d
ddddddddddddddddd	dd	}t|d	dt d}dd t|dD }t|dksTJ |d	 }t| h dksdJ |d  jd	 |d! jd	   kryd"ks|J  J |d }t| h dksJ |d  jd	 |d! jd	   krd"ksJ  J |d }t| h dksJ |d  jd	 |d! jd	   krd"ksJ  J |d" }t| h dksJ |d  jd	 |d! jd	   krd"ksJ  J d S )#Nr  tarred_audio_filepathsr   r   r   Tr   r   r   r   r   Fr   rm   r   r   r   r   r   r   r$   r   r   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r   ~  r   z=test_dataloader_from_tarred_nemo_manifest.<locals>.<listcomp>r   >   r   r+   r   r+   r   r   r   )r   json_mfttar_mftr   r   r   r   r:   r:   r;   )test_dataloader_from_tarred_nemo_manifestb  j   	

0004r  c                 C   s   | \}}t i d|dg|dggd|g|ggddddd	dd
ddddddddddddddddddddd}t|ddt d}tt|}t| h dks^J |d jd |d  jd   krsd!ksvJ  J d S )"Nr  皙?皙?r  r   r   r   Tr   r   r   r   r   Fr   rm   r   r   r   r   r   r   r$   r   r   r   r   r   r   >   r   r+   r   r+   r   r   )	r   r   r   r   r   r   r   r   r   )r   r  r  r   r   r   r:   r:   r;   >test_dataloader_from_tarred_nemo_manifest_weighted_combination  sT   	

4r  c                 C   r  )#Nr  r  r   r   r   Tr   r   r   r   r   Fr   rm   r   r   r   r   r   r   r$   r   r   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r     r   zCtest_dataloader_from_tarred_nemo_manifest_multi.<locals>.<listcomp>r   >   r   r+   r   r+   r   r   r   )r   r  r  r   r   r   r   r:   r:   r;   /test_dataloader_from_tarred_nemo_manifest_multi  r  r  c                 C   s   | \}}t i d|g|ggd|g|ggdddddddd	d
ddddddddddddddddddd	dd	}t|d	dt d}tt|}d S )Nr  r  r   r   r   Tr   r   r   r   r   Fr   rm   max_open_streamsr   r   r   r   r   r   r   r$   r   r   r   r   r   )r   r   r   r   r   r   )r   r  r  r   r   _r:   r:   r;   @test_dataloader_from_tarred_nemo_manifest_multi_max_open_streams  sT   	

r!  c                 C   s  | \}}t ||dddddddddddddd	}t|dd
t d}dd t|dD }t|dks4J tjddgtjd}|d }t	|
 h dksNJ |d jd |d jd   krcdksfJ  J tj|d | |d
 }t	|
 h dksJ |d jd |d jd   krdksJ  J tj|d | |d }t	|
 h dksJ |d jd |d jd   krdksJ  J tj|d | |d }t	|
 h dksJ |d jd |d jd   krdksJ  J tj|d | d S )Nr   Tr         @r   r   Fr$   )r  r  r   r   r   r   concatenate_samplesconcatenate_duration_factorr   r   r   r   r   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r     r   zDtest_dataloader_from_tarred_nemo_manifest_concat.<locals>.<listcomp>r   i@  )dtype>   r   r+   r   r+   r   rm   r   )r   r   r   r   r   r   r   tensorint32r   r   r   testingassert_close)r   r  r  r   r   r   expected_audio_lensr   r:   r:   r;   0test_dataloader_from_tarred_nemo_manifest_concat  sR   
0000r+  c                 C   sx  t | |gdddddddddddd	ddd
}t|ddt d}dd t|dD }t|dks2J |d }tdd |d D dksEJ tdd |d D dksTJ |d }tdd |d D dksgJ tdd |d D dksvJ |d }tdd |d D dksJ tdd |d D dksJ |d }tdd |d D dksJ tdd |d D dksJ dS )z
    Note: if we iterated more mini-batches in this test, in the expectation there
    will be 50-50 % mini-batch occupancy of examples from both datasets.
    r   Tr   Frm   r   r   r$   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r   [  r   zUtest_dataloader_from_lhotse_shar_cuts_combine_datasets_unweighted.<locals>.<listcomp>r   c                 S      g | ]	}| d r|qS dummy
startswithr   cidr:   r:   r;   r   _      r   c                 S   r,  otherr/  r1  r:   r:   r;   r   `  r3  c                 S   r,  r-  r/  r1  r:   r:   r;   r   c  r3  c                 S   r,  r4  r/  r1  r:   r:   r;   r   d  r3  r   c                 S   r,  r-  r/  r1  r:   r:   r;   r   g  r3  c                 S   r,  r4  r/  r1  r:   r:   r;   r   h  r3  c                 S   r,  r-  r/  r1  r:   r:   r;   r   k  r3  c                 S   r,  r4  r/  r1  r:   r:   r;   r   l  r3  Nr   r   r   r   r   r   rK   rP   r   r   r   r   r:   r:   r;   Atest_dataloader_from_lhotse_shar_cuts_combine_datasets_unweighted:  sF   
"r8  c                 C   s
  t | dg|dggdddddddddd	dd
ddd}t|ddt d}dd t|dD }t|dks6J |d }tdd |d D dksIJ tdd |d D dksXJ |d }tdd |d D dkskJ tdd |d D dkszJ |d }tdd |d D dksJ tdd |d D dksJ |d }tdd |d D dksJ tdd |d D dksJ |d }tdd |d D dksJ tdd |d D dksJ |d }tdd |d D dksJ td d |d D dksJ d!S )"z
    Note: if we iterated more mini-batches in this test, in the expectation there
    will be 90-10 % mini-batch occupancy of examples from both datasets.
    Z   r$   r   Tr   Frm   r   r   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r     r   zStest_dataloader_from_lhotse_shar_cuts_combine_datasets_weighted.<locals>.<listcomp>   c                 S   r,  r-  r/  r1  r:   r:   r;   r     r3  r   r   c                 S   r,  r4  r/  r1  r:   r:   r;   r     r3  c                 S   r,  r-  r/  r1  r:   r:   r;   r     r3  c                 S   r,  r4  r/  r1  r:   r:   r;   r     r3  c                 S   r,  r-  r/  r1  r:   r:   r;   r     r3  c                 S   r,  r4  r/  r1  r:   r:   r;   r     r3  c                 S   r,  r-  r/  r1  r:   r:   r;   r     r3  c                 S   r,  r4  r/  r1  r:   r:   r;   r     r3  r   c                 S   r,  r-  r/  r1  r:   r:   r;   r     r3  c                 S   r,  r4  r/  r1  r:   r:   r;   r     r3  rB   c                 S   r,  r-  r/  r1  r:   r:   r;   r     r3  c                 S   r,  r4  r/  r1  r:   r:   r;   r     r3  Nr6  r7  r:   r:   r;   ?test_dataloader_from_lhotse_shar_cuts_combine_datasets_weightedo  sR   
$r;  c                   @   $   e Zd Zdejdee fddZdS )TextDatasetr5   r    c                 C      dd |D S )Nc                 S      g | ]}|j d  jqS r   )r0   rW   r   r:   r:   r;   r         z+TextDataset.__getitem__.<locals>.<listcomp>r:   r  r:   r:   r;   r        zTextDataset.__getitem__Nr   r   r   r,   r	   r   r   r   r:   r:   r:   r;   r=        r=  
text_field
text_value)NrR   )rX   rS   c              
   C   f   |d urd|ini }t | ddddddd|}t|ddt d	}tt|}||gd ks1J d S )
NrE  r   Tr   rm   Fr  r   r   r   r   r   r   r   r   )r   r   r   r=  r   r   )rb   rE  rF  kwargr   r   r   r:   r:   r;   2test_dataloader_from_nemo_manifest_with_text_field      	rJ  c                   @   r<  )LangDatasetr5   r    c                 C   r>  )Nc                 S   r?  r@  )r0   languager   r:   r:   r;   r     rA  z+LangDataset.__getitem__.<locals>.<listcomp>r:   r  r:   r:   r;   r     rB  zLangDataset.__getitem__NrC  r:   r:   r:   r;   rL    rD  rL  
lang_field
lang_value)NrT   )r\   rU   c              
   C   rG  )
NrN  r   Tr   rm   FrH  r   r   )r   r   r   rL  r   r   )rb   rN  rO  rI  r   r   r   r:   r:   r;   2test_dataloader_from_nemo_manifest_with_lang_field  rK  rP  r7   c                 C   s  dd l }dd l}ddlm} d}|jj| d |dd|j| }t| d }|	||d t| d	 }t
j|d
ddd|ddddg| t
||}|d }	t|	t
js[J |	jd
ksbJ |	jdksiJ |	jdkspJ |	jdkswJ |	jd jdksJ |	 }
|
jdksJ |j|
d |d d  |d }	t|	t
jsJ |	jdksJ |	jdksJ |	jdksJ |	jdksJ |	jd jdksJ |	 }
|
jdksJ |jj|
d |dd  dd |d j|d jksJ d S )Nr   LazyNeMoIterator   r   r   lowhighsize	dummy.wavr   manifest.json        r   rR   rV   offsetrY   rW   r   r   r   g-C6
?)atol)numpy	soundfile1nemo.collections.common.data.lhotse.nemo_adaptersrR  randomrandintastypefloat32r   r   r,   serializationrQ   r	   r   r
   rr   rY   rp   ro   r0   rW   r  r   r(  assert_equalassert_allcloserM   r7   r  sfrR  INT16MAXexpected_audio
audio_pathmanifest_pathr5   r   r+   r:   r:   r;   )test_lazy_nemo_iterator_with_offset_field  sH   $rp  c                 C   s*  dd l }dd l}ddlm} d}|jj| d |dd|j| }t| d }|	||d t| d	 }t
jdd
dddg| t
||}|d }	|	 }
t|	t
jsYJ |	jd
ks`J |	jdksgJ |	jdksnJ |	jdksuJ |	jd jdksJ |
jdksJ |j|
d |d d  d S )Nr   rQ  rS  r   rT  rU  rY  r   rZ  r[  r   rR   r\  r   r^  )r`  ra  rb  rR  rc  rd  re  rf  r   r   r,   rg  rQ   r	   r  r   r
   rr   rY   rp   ro   r0   rW   r   r(  rh  rj  r:   r:   r;   +test_lazy_nemo_iterator_with_relative_paths  s0   $rq  c                 C   s~  | d }| d }t j|tjdd d t| }t	|j
|jjd _|j|_t| }||jtjd|_d|j_W d    n1 sJw   Y  tt |  |  W d    n1 sgw   Y  t|g| t|dddd	d
}t|ddt d}dd |D }t |dksJ |d D ]!}|j!sJ |  |"dsJ |  |"dsJ |#  qd S )Nr*   z_relative_test_audio_.wavr   r   r        Trm   )r   r   r   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r   R  r   z;test_lhotse_cuts_resolve_relative_paths.<locals>.<listcomp>target_recording
some_array)$r,   r+   
save_audior  rc  randr   rF   to_cutr   r   r@   r_   r`   rt  r   store_arrayrM   randnru  storage_pathpytestraisesr   r  load_target_recordingr	   r3   r   r   r   r  r   has_recording
has_customload_some_array)r7   r   rn  r   wr   r   r   r:   r:   r;   'test_lhotse_cuts_resolve_relative_paths5  sD   





r  c                   @   s"   e Zd ZdejdejfddZdS )r   r5   r    c                 C   r  r  r:   r  r:   r:   r;   r   _  r  zIdentity.__getitem__N)r   r   r   r,   r	   r   r:   r:   r:   r;   r   ^  s    r   c              
   C   sf  t d|d |d dddddd	d
| ddddddgddddddd}t|ddt d}dd t|dD }|d }t|tjsEJ tdd |D sPJ tdd |D s[J t	dd |D dkshJ t	dd |D dksuJ |d }t|tjsJ tdd |D sJ tdd |D sJ t	dd |D dksJ t	dd |D dksJ d S )Nnemo_tarredr   r   r   rT   r+   D1rM  modalitydataset_nametyper  r  weighttagslhotse_sharD2r  r   r  r  r   Tr   	input_cfgr   r   r   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r     r   z0test_extended_data_input_cfg.<locals>.<listcomp>rm   c                 s       | ]
}|j d  dkV  qdS rM  rT   Nr/   r   r:   r:   r;   	<genexpr>      z/test_extended_data_input_cfg.<locals>.<genexpr>c                 s   r  r  r+   Nr  r   r:   r:   r;   r    r  c                 s   r  r  r  Nr  r   r:   r:   r;   r    r  c                 s   r  r  r  Nr  r   r:   r:   r;   r    r  c                 s   r  r  r  r   r:   r:   r;   r    r  c                 s   r  r  r  r   r:   r:   r;   r    r  c                 s   r  r  r  r   r:   r:   r;   r    r  c                 s   r  r  r  r   r:   r:   r;   r    r  r   )
r   r   r   r   r   r   r,   r	   allsum)rK   r   r   r   r   r   r:   r:   r;   test_extended_data_input_cfgc  sR   "r  c                 C   s  t dd|d |d ddddd	d
d| ddddd	dgdddidddd|d |d ddddd	d
d| ddddd	dgddidgddddddd}t|ddt d}t }t }t|dD ]}|D ]}||j  d7  < ||j  d7  < qgqct|	 }|D ]
}	||	  |  < q|D ]
}	||	  |  < qdd }
|d |
dksJ |d |
dksJ |d |
dksJ |d |
dksJ |d |
d ksJ |d |
d ksJ d S )!Ngroupr  r   r   r   rT   r+   r  r  r  r  r  r  r  
group_nameG1)r  r  r  r  r  D3D4G2)r  r  r  r  r   Trr  r  r   r   c                 S   s   t j| ddS )Ng{Gz?)abs)r|  approx)numberr:   r:   r;   almost  rB  z5test_extended_data_input_cfg_subgroup.<locals>.almost皙?g?)
r   r   r   r   r   r   r  r  r  values)rK   r   r   r   group_occurrencesdataset_occurrencesr   r   totkr  r:   r:   r;   %test_extended_data_input_cfg_subgroup  s   >Ir  c           	   
   C   s   dt |d t |d dddddd	d
t |ddddddg}| d }tj|| t|ddddddd}t|ddt d}tt	|}t
|tjsPJ |D ]	}|jdv s[J qRd S )Nr  r   r   r   rT   r+   r  r  r  r  r  r  zinput_cfg.yamlr   Trr  r  r   )r  r  )r   r,   rg  save_to_yamlr   r   r   r   r   r   r   r	   r  )	r7   rK   r   r  	yaml_pathr   r   r   r   r:   r:   r;   &test_extended_data_input_cfg_yaml_path  sH   

r  c                 C       |  d}|d }|d |S )N	text_dataztext.enz3Example text in English.
Another sentence.
        r1   
write_text)r4   r7   en_pathr:   r:   r;   txt_en_path/  s   
r  c                 C   r  )Nr  ztext.esz!Otro texto en ingles.
Otra frase.r  )r4   r7   es_pathr:   r:   r;   txt_es_path;  s   
r  c                 C   s$   |  d}|d }|d t|S )N	questionsquestions.txtz"translate the following to spanish)r1   r  r   )r4   tmpdirqpr:   r:   r;   questions_pathF  s   

r  c              	   C   s   t d| ddgdddddd}t|ddt d	}d
d t|dD }|d }t|tjs0J tdd |D s;J tdd |D sFJ |d }t|tjsRJ tdd |D s]J tdd |D shJ d S )NtxtrT   )r  pathsrM  Tr   r   r  r   r   r   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r   d  r   z(test_text_file_input.<locals>.<listcomp>rm   c                 s       | ]}t |tV  qd S r  r   r   r   r:   r:   r;   r  h      z'test_text_file_input.<locals>.<genexpr>c                 s       | ]}|j d kV  qdS rT   NrM  r   r:   r:   r;   r  i  r  c                 s   r  r  r  r   r:   r:   r;   r  m  r  c                 s   r  r  r  r   r:   r:   r;   r  n  r  	r   r   r   r   r   r   r,   r	   r  )r  r  r   r   r   r   r:   r:   r;   test_text_file_inputN  s.   r  c              
   C   s  t d| ||ddddgdddddd}t|dd	t d
}dd t|dD }|d }t|tjs4J tdd |D s?J tdd |D sJJ tdd |D sUJ |d	 }t|tjsaJ tdd |D slJ tdd |D swJ tdd |D sJ d S )Ntxt_pairrT   esr  source_pathstarget_pathsr  source_languagetarget_languagequestions_languageTr   r   r  r   r   c                 S   r   r:   r:   r   r:   r:   r;   r     r   z.test_text_file_pairs_input.<locals>.<listcomp>rm   c                 s   r  r  r   r   r   r:   r:   r;   r    r  z-test_text_file_pairs_input.<locals>.<genexpr>c                 s       | ]	}|j jd kV  qdS r  r`   rM  r   r:   r:   r;   r        c                 s   r  r  NtargetrM  r   r:   r:   r;   r    r  c                 s   r  r  r  r   r:   r:   r;   r    r  c                 s   r  r  r  r   r:   r:   r;   r    r  c                 s   r  r  r  r   r:   r:   r;   r    r  r  )r  r  r  r   r   r   r   r:   r:   r;   test_text_file_pairs_inputq  s:   r  c                 C   s   |  d}|  }|d d|d d  |d d|dd   |  }|d d|d d  |d d|dd   | d| d	fS )
Ntext_data_shardszen_0.txt
rB   zen_1.txtzes_0.txtzes_1.txtz/en__OP_0..1_CL_.txtz/es__OP_0..1_CL_.txt)r1   	read_text
splitlinesr  join)r4   r  r  r7   en_textes_textr:   r:   r;   txt_pair_paths_shards  s   
r  c              
   C   s  | \}}t d|||ddddgdddddd}t|dd	t d
}dd t|dD }|d }t|tjs8J tdd |D sCJ tdd |D sNJ tdd |D sYJ |d	 }t|tjseJ tdd |D spJ tdd |D s{J tdd |D sJ d S )Nr  rT   r  r  Tr   r   r  r   r   c                 S   r   r:   r:   r   r:   r:   r;   r     r   z5test_text_file_pairs_shards_input.<locals>.<listcomp>rm   c                 s   r  r  r  r   r:   r:   r;   r    r  z4test_text_file_pairs_shards_input.<locals>.<genexpr>c                 s   r  r  r  r   r:   r:   r;   r    r  c                 s   r  r  r  r   r:   r:   r;   r    r  c                 s   r  r  r  r   r:   r:   r;   r    r  c                 s   r  r  r  r   r:   r:   r;   r    r  c                 s   r  r  r  r   r:   r:   r;   r    r  r  )r  r  en_pathses_pathsr   r   r   r   r:   r:   r;   !test_text_file_pairs_shards_input  s<   r  c                 C   sR   |  d}|d }|| d |   t|dddt|d tt|d S )	Nen_es_tokenizerztext.txtr     F)
vocab_sizesample_sizedo_lower_case
output_dirztokenizer.model)r1   r  r  r   r   r   )r4   r  r  r  	text_pathr:   r:   r;   r    s
   
r  c                    s  | \}}|\}}d\ }t d||dd|dddidd||dd	id
gdddd|d ddd
}	t|	ddt |d}
tt|
}t|tjsHJ t	|sNJ t
dd |D sYJ t
dd |D sdJ t fdd|D |kssJ |D ]z}t|tr|jd	ksJ t| tjsJ t|jd jtsJ t|tr|jdksJ |jjdksJ |jjdksJ t|jjtsJ t|jjtsJ t|jjtsJ t|jsJ t|jsJ t|jsJ t|jsJ qud S )N)2   i   r  rT   r  r  rW   r  r  r  r  r  r  r  r  r  r+   r  r  r  r  Tr   plainr  )
r  r   r   use_multimodal_samplingprompt_formatbatch_tokenstoken_equivalent_durationquadratic_factorr   r   r   r   r   r   r   	tokenizerc                 s   r  r  r   r   r   exr:   r:   r;   r    r  z9test_multimodal_text_audio_dataloading.<locals>.<genexpr>c                 s   r  r  r  r  r:   r:   r;   r    r  c                 3       | ]
}|j d    V  qdS rm   N
num_tokensr  QFr:   r;   r    r  )r   r   r   r   r   r   r   r,   r	   r   anyr  r   r  r  r  r  r0   rW   r   r   r`   rM  r  questionr   	is_tensor	input_idscontext_ids
answer_idsmask)r  r   r  r  r  r  r  r  BTr   r   r   r  r:   r  r;   &test_multimodal_text_audio_dataloading  s~   *

r  c                    sF  | \}}|\}}d\ }t ddddddd||ddidgd	d|d
 dd||dd|dddidgdd	dd
ddd}	t|	ddt |d}
t|
jjtsOJ dd t|
dD }|d }t|t	j
seJ t|skJ tdd |D svJ tdd |D sJ t fdd|D |d ksJ |D ]_}t|tr|jdksJ t| tjsJ t|jd jtsJ t|tr|jdksJ |jjdksJ |jjdksJ t|jsJ t|jsJ t|jsJ t|jsJ q|d }t|t	j
sJ t|sJ tdd |D sJ tdd |D s J t fd d|D |d ks2J |D ]l}t|tr[|jdksDJ t| tjsOJ t|jd jts[J t|tr|jdksiJ |jjdksrJ |jjdks{J t|jsJ t|jsJ t|jsJ t|jsJ q4d S )!Nr  @   Tzipr   r  r  r+   r  r  r  r  r  r  r  r  r  r  rT   r  rW   r  r
  r  r  r  r  r  r  r  multi_configsampler_fusionr   r   r   r   r+   rW   r   r  c                 S   r   r:   r:   r   r:   r:   r;   r     r   zGtest_multimodal_text_audio_dataloading_zip_strategy.<locals>.<listcomp>rm   c                 s   r  r  r  r  r:   r:   r;   r    r  zFtest_multimodal_text_audio_dataloading_zip_strategy.<locals>.<genexpr>c                 s   r  r  r  r  r:   r:   r;   r    r  c                 3   r  r  r  r  r  r:   r;   r    r  c                 s   r  r  r  r  r:   r:   r;   r    r  c                 s   r  r  r  r  r:   r:   r;   r    r  c                 3   r  r  r  r  r  r:   r;   r    r  ) r   r   r   r   r   r   samplerr   r   r,   r	   r   r   r  r   r  r  r  r  r0   rW   r   r   r`   rM  r  r   r  r  r  r  r  r  r   r  r  r  r  r  r  r  r   r   r   r   r  r:   r  r;   3test_multimodal_text_audio_dataloading_zip_strategy1  s   
="

$r  c                    s$  | \}}|\}}d\ }t ddddddd||ddidgdd	|d
 dd||dd|dddidgd	d|d
 dd}	t|	ddt |d}
t|
jjtsOJ dd t|
dD }|d }t|t	j
seJ t|skJ tdd |D svJ t fdd|D |ksJ |D ]}|jdksJ t| tjsJ t|jd jtsJ q|d }t|t	j
sJ t|sJ tdd |D sJ t fdd|D |ksJ |D ];}|jdksJ |jjdksJ |jjdksJ t|jsJ t|jsJ t|jsJ t|jsJ qd S )Nr	  Tround_robinr   r  r  r+   r  r  r  r  r  rT   r  rW   r  r  r  r   r  c                 S   r   r:   r:   r   r:   r:   r;   r     r   zOtest_multimodal_text_audio_dataloading_round_robin_strategy.<locals>.<listcomp>rm   c                 s   r  r  r  r  r:   r:   r;   r    r  zNtest_multimodal_text_audio_dataloading_round_robin_strategy.<locals>.<genexpr>c                 3   r  r  r  r  r  r:   r;   r  
  r  c                 s   r  r  r  r  r:   r:   r;   r    r  c                 3   r  r  r  r  r  r:   r;   r    r  r   r   r   r   r   r   r  r   r   r,   r	   r   r  r  r  r  r  r  r0   rW   r   r`   rM  r  r   r  r  r  r  r  r  r:   r  r;   ;test_multimodal_text_audio_dataloading_round_robin_strategy  s   
=r  c                    s.  |\}}|\}}d\ }	t dddddddddd||dd	id
gdd|	d dd||dd|dddidgdd|	d dd	}
t|
ddt |d}t|jjtsSJ dd t|dD }|d }t|t	j
siJ t|soJ tdd |D szJ t fdd|D |	ksJ |D ]}|jd	ksJ t| tjsJ t|jd jtsJ q|d }t|t	j
sJ t|sJ tdd |D sJ t fdd|D |	ksJ |D ]<}|jdksJ |jjdksJ |jjdksJ t|jsJ t|jsJ t|jsJ t|jsJ qd S )Nr	  Trandomized_round_robinr   )r+   rW   r   r  r  r+   r  r  r  r  r  rT   r  rW   r  r  )	r  r  sampler_weightsr   r   r   r   r+   rW   r   r  c                 S   r   r:   r:   r   r:   r:   r;   r   w  r   zZtest_multimodal_text_audio_dataloading_randomized_round_robin_strategy.<locals>.<listcomp>rm   c                 s   r  r  r  r  r:   r:   r;   r  }  r  zYtest_multimodal_text_audio_dataloading_randomized_round_robin_strategy.<locals>.<genexpr>c                 3   r  r  r  r  r  r:   r;   r    r  c                 s   r  r  r  r  r:   r:   r;   r    r  c                 3   r  r  r  r  r  r:   r;   r    r  r  )r   r  r   r  r  r  r  r  r  r  r   r   r   r   r  r:   r  r;   Ftest_multimodal_text_audio_dataloading_randomized_round_robin_strategy!  s   
Ar  c              
   C      t t| t|dddgdddd}t|ddt d}tt|}t|ts)J t	|dks1J |d }t|t
s<J d|jd j  k rKdk sNJ  J |d }t|t
sYJ d|jd j  k rhdk skJ  J d S 	Nrq               @rm   r   r   
noise_pathnoise_mix_prob	noise_snrr   r   r   r   r   r   r   r   r   r   r   r   r   r	   r   r   r   snrr<   rb   r   r   r   r   r:   r:   r;   $test_dataloader_with_noise_nemo_json  2   $(r%  c              
   C   r  r  r"  r$  r:   r:   r;   r%    r&  c              
   C   s   t t| t| dddgdddd}t|ddt d}tt|}t|ts)J t	|dks1J |d }t|t
s<J d|jd j  k rKdk sNJ  J |d }t|t
sYJ d|jd j  k rhdk skJ  J d S r  r"  r   r:   r:   r;   'test_dataloader_with_noise_lhotse_jsonl  r&  r'  c              
   C   s   |\}}t t| ||ddddgdddd}t|ddt d	}tt|}t|ts.J t	|dks6J |d }t|t
sAJ d|jd j  k rPdk sSJ  J |d }t|t
s^J d|jd j  k rmdk spJ  J d S )
N)r  r  rq   r  r  rm   r   r  r   r   r"  )r<   r   
noise_json	noise_tarr   r   r   r   r:   r:   r;   #test_dataloader_with_noise_nemo_tar  s8   $(r*  c              	   C   s  ddl m} tt| dddddd}t|ddt d}tt|}t	|t
s*J t|dks2J |d }t	|ts=J |jjd u sEJ |d }t	|tsPJ |jjd u sXJ |d	 }t	|tscJ t	|jjtrrt|jjdkstJ |jjd }t	|tr|d
 dksJ nt	||sJ |d }t	|tsJ t	|jjtrt|jjdksJ |jjd }t	|tr|d
 dksJ d S t	||sJ d S )Nr   )ReverbWithImpulseResponseTr   r   )r   rir_enabledrir_probr   r   r   r   r   rm   r   r+  r   )lhotse.augmentationr+  r   r   r   r   r   r   r   r   r	   r   r
   r@   
transformsrh   r  )r<   r+  r   r   r   r   tfnmr:   r:   r;   test_dataloader_with_synth_rir  sN   
"
"
r1  c                 C   sp   | \}}t ||ddddddddgddgdd	d
ddd}t|ddt d}t|d	D ]
}t|dks5J q+d S )Nr   Tr   F       @r   rm   r   r$   r   r  r  r   r   r   r   r   r   bucket_duration_binsbucket_batch_sizer   r   r   r   r   r   r   r   r   r   r   r   )r   r  r  r   r   r   r:   r:   r;   !test_dataloader_bucket_batch_size<  s.   r7  c                 C   s   | \}}t ||ddddddddgddgdd	gdd
gddgddggg ddddddd}t|ddt |d}t|dD ]
}t|dksFJ q<d S )Nr   Tr   Fr   r   rm   r2  rB      r   r$      )   r:  rB   r   r   rm   r   r3  r  r   r6  )r   r  r  r  r   r   r   r:   r:   r;   test_dataloader_2d_bucketingZ  s2   &r;  c                 C   s4   |  d}|d }|ddd tdD  |S )z4A text file with 10 lines containing question valuesr  r  r  c                 s   s    | ]}d | V  qdS )zsome question number Nr:   )r   r
  r:   r:   r;   r    r  z!questions_path.<locals>.<genexpr>r$   )r1   r  r  ru   )r4   qdirr   r:   r:   r;   r  {  s   
c              
   C   s   t | ddd|dgdgdddd	d
dd}t|d	dt d}tt|}|d	 }t|ts1J t|ds8J |j	dks?J |d }t|tsJJ t|dsQJ |j	dksXJ d S )Nra   	text_iterr  r  r   r   )r  r  extra_fieldsr   FTr   rm   r  r   r   r   r   r   r   r   r   some question number 0some question number 1
r   r   r   r   r   r   r   r
   hasattrr  )rb   r  r   r   r   r6   r:   r:   r;   Ltest_dataloader_from_nemo_nontarred_manifest_with_extra_questions_field_iter  s8   rE  c              
   C   s   t | d | d ddd|dgdgdd	d
ddd	d}t|ddt d}tt|}|d }t|ts6J t|ds=J |j	dksDJ |d }t|tsOJ t|dsVJ |j	dks]J d S )Nr   r   r  r=  r  r>  r  r  r  r?  r   FTrm   r@  r   rA  rB  rC  r   r  r   r   r   r6   r:   r:   r;   Btest_dataloader_from_nemo_manifest_with_extra_questions_field_iter  s:   rH  c                 C   sX  t | d | d ddd|dgdgdd	d
ddddd	d	}t|ddt d}tt|}|d }t|ts8J t|ds?J |j	dksFJ |d }t|tsQJ t|dsXJ |j	dks_J |d }t|tsjJ t|dsqJ |j	dksxJ |d }t|tsJ t|dsJ |j	dksJ |d }t|tsJ t|dsJ |j	dksJ d S )Nr   r   r  text_sampler  r>  rF  r   FTrB   )	r  r   r   r   r   r   r   r   r   r   zsome question number 6rm   rA  r   zsome question number 4r   zsome question number 8rC  rG  r:   r:   r;   Dtest_dataloader_from_nemo_manifest_with_extra_questions_field_sample  sV   rJ  c                    s*  ddl m} ddlm} | d}|jdd tdddd || d	d
d^}||d H}ddtf fdd}|| t	 j
d j || dddddd ||ddddddd ||ddddddd W d
   n1 syw   Y  W d
   n1 sw   Y  |j|jd fS )r   r   )r   r   nemo_tar_offsetTr>         $@)rY   r(   z/audios_0.tarNr   r   r|   c                    s&    j | d u rd d S d|   d S )Nrs  z-subz.wavrL   )r|   r@   r:   r;   rn  	  s   &z9nemo_tarred_manifest_path_with_offset.<locals>.audio_pathr[  r"  rR   rT   )rV   r]  rY   rW   r[   r   r   r   r  irrelevant-2rm   irrelevant irrelevant-2r  )r]   r   r   r   r1   rH   r   r   r   r   r_   r`   r   output_paths)r4   r   r   r   r   r   rn  r:   rM  r;   %nemo_tarred_manifest_path_with_offset	  sX   



 'rQ  c                 C   s  | \}}t ||dddddddd	}t|ddt d}d	d
 |D }t|dks+J |\}t|dks6J |d }|jdksAJ |jdksHJ |jd jdksRJ |jd j	dks\J |
 }|jd |j  krodksrJ  J |d }	|	jdks}J |	jdksJ |	jd jdksJ |	jd j	dksJ |	
 }
|
jd |	jksJ tj|
|d d d t|	j|	jf  |d }	|	jdksJ |	jdksJ |	jd jdksJ |	jd j	dksJ |	
 }
|
jd |	jksJ tj|
|d d td|	jtd|	jf  d S )Nr   Fr   r   T)	r  r  r   r   r   r   r   r   force_finiter   r   c                 S   r   r:   r:   r   r:   r:   r;   r   R	  r   zItest_dataloader_from_tarred_nemo_manifest_with_offset.<locals>.<listcomp>r[  rL  rO  rT   i q rm   r"  rR   r  rN  r   g      "@)r   r   r   r   r   rr   rY   r0   rW   rM  r  r   ro   r  r(  rh  r   rp   )rQ  r  r  r   r   r   r   full_cut
full_audior   r+   r:   r:   r;   5test_dataloader_from_tarred_nemo_manifest_with_offset?	  sV   $($rU  c                 C   s   t | ddd}t|ddt d}dd |D }t | dddd	}t|ddt d}d
d |D }t|dt| ks?J tdd |D tdd |D ksSJ d S )Nrm   )r   r   r   r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r   	  r   z/test_force_iterable_dataset.<locals>.<listcomp>T)r   r   r   force_iterable_datasetc                 S   r   r:   r:   r   r:   r:   r;   r   	  r   c                 s        | ]}|D ]}|j V  qqd S r  rL   r   r   r6   r:   r:   r;   r  	      z.test_force_iterable_dataset.<locals>.<genexpr>r   r   r   r   r   r   )r<   r   r   batches_mapbatches_iterr:   r:   r;   test_force_iterable_dataset|	  s   ,r]  c                 C   s   t | dddd}t|ddt d}dd |D }t | ddddd	}t|ddt d}d
d |D }t|dt| ksAJ tdd |D tdd |D ksUJ d S )Nrm   T)r   r   r   rR  r   r   r   c                 S   r   r:   r:   r   r:   r:   r;   r   	  r   z*test_force_map_dataset.<locals>.<listcomp>)r   r   r   force_map_datasetrR  c                 S   r   r:   r:   r   r:   r:   r;   r   	  r   c                 s   rW  r  rL   rX  r:   r:   r;   r  	  rY  z)test_force_map_dataset.<locals>.<genexpr>rZ  )rK   r   r   r\  r[  r:   r:   r;   test_force_map_dataset	  s   	,r_  c                 C   s   | \}}}t i d|d|dddddddd	d
ddddddddddddddddd	dd	ddddi}t|d	dt d}t }|D ]
}|d }||7 }qOtdd  |D }	t|}
t|
t|kssJ d!|
|	ks{J d"d S )#Nr  r  r   r   r   Tr   r   r   r   r   Fr   rm   r   r   r   r   r   r   r$   r   r   r   r   tarred_random_accessrR  r   r   r   c                 S   s   g | ]}|d  qS )rV   r:   )r   r)   r:   r:   r;   r   	  s    zDtest_dataloader_from_tarred_nemo_subset_manifest.<locals>.<listcomp>z!Duplicate IDs found in the batch.zFThe set of IDs in the batches does not match the input JSON manifests.)r   r   r   r   rh   r   r   )r   r  r  r   r   r   seen_idsr   current_idsexpected_idsseen_ids_setr:   r:   r;   0test_dataloader_from_tarred_nemo_subset_manifest	  sf   
	


re  c              
   C   sh   t | ddddddd}t|ddt d}dd	 |D }d
d	 |D }t|dks,J t|r2J d S )Nr   Tr   r   FrH  r   c                 S   r   r:   r:   r   r:   r:   r;   r   	  r   zBtest_dataloader_from_nemo_manifest_with_skipme.<locals>.<listcomp>c                 S   $   g | ]}|D ]	}|j d dqqS rf   r   r/   getr   r   r   r:   r:   r;   r   	     $    r   r   r   r  r   r   )rk   r   r   r   skipme_sr:   r:   r;   .test_dataloader_from_nemo_manifest_with_skipme	  s   ro  c                 C   st   | \}}t ||dddddddd	}t|ddt d}dd	 |D }d
d	 |D }t|dks2J t|r8J d S )Nr   Tr   r   F)	r  r  r   r   r   r   r   r   rR  r   c                 S   r   r:   r:   r   r:   r:   r;   r   	  r   zItest_dataloader_from_tarred_nemo_manifest_with_skipme.<locals>.<listcomp>c                 S   rf  rg  rh  rj  r:   r:   r;   r   	  rk  rl  rm  )r   r  r  r   r   r   rn  r:   r:   r;   5test_dataloader_from_tarred_nemo_manifest_with_skipme	  s$   rp  c                 C   s   t d|d |d dddddd	d
| ddddddgdddddddd}t|ddt d}dd |D }dd |D }t|rDJ d S )Nr  r   r   r   rT   r+   r  r  r  r  r  r  r   Tr   )r  r   r   r   r   r   r   rR  r   c                 S   r   r:   r:   r   r:   r:   r;   r   
  r   zMtest_dataloader_from_data_input_cfg_yaml_path_with_skipme.<locals>.<listcomp>c                 S   rf  rg  rh  rj  r:   r:   r;   r    
  rk  )r   r   r   r   r   )rK   r   r   r   r   rn  r:   r:   r;   9test_dataloader_from_data_input_cfg_yaml_path_with_skipme	  s@   #rq  )|collectionsr   ior   	itertoolsr   pathlibr   typingr   r   r   r,   r`  r  r|  r   r	   r
   r   r   r   lhotse.audior   
lhotse.cutr   r   r   lhotse.datasetr   r   lhotse.sharr   r-   r   lhotse.testing.randomr   	omegaconfr   #nemo.collections.common.data.lhotser   1nemo.collections.common.data.lhotse.text_adaptersr   r   :nemo.collections.common.tokenizers.sentencepiece_tokenizerr   r   fixturer<   rK   rP   rb   rk   r   r   r   r   tupler   r   utilsr)   Datasetr   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r!  r+  r8  r;  r=  markparametrizerJ  rL  rP  rp  rq  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r%  r'  r*  r1  r7  r;  rE  rH  rJ  rQ  rU  r]  r_  re  ro  rp  rq  r:   r:   r:   r;   <module>   sF  






,

$
$-7/".0!0 7
5
=0#)7e
0




#
)

+


T


 


p


u!.
!
(
(

73='