o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlm  m	Z
 ddlmZ dZdZdZd	Zd
ZG dd dejjZdS )z#The audio part of VoxCeleb dataset.    )absolute_import)division)print_functionNz
@InProceedings{Nagrani17,
	author       = "Nagrani, A. and Chung, J.~S. and Zisserman, A.",
	title        = "VoxCeleb: a large-scale speaker identification dataset",
	booktitle    = "INTERSPEECH",
	year         = "2017",
}
z
An large scale dataset for speaker identification. This data is collected from
over 1,251 speakers, with over 150k samples in total.
This release contains the audio part of the voxceleb1.1 dataset.
z7http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.htmlzAhttp://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/iden_split.txti  c                   @   sF   e Zd ZdZejdZde	Z
dd Zdd Zdd	 Zd
d ZdS )Voxcelebz0The VoxCeleb dataset for speaker identification.z1.1.1z
  manual_dir should contain the file vox_dev_wav.zip. The instructions for
  downloading this file are found in {}. This dataset requires registration.
  c              	   C   s<   t jj| tt jt jjdddt jjtdddt	t
dS )Nwavi>  )file_formatsample_rate)num_classesaudiolabel)builderdescriptionfeaturessupervised_keyshomepagecitation)tfdscoreDatasetInfo_DESCRIPTIONr   FeaturesDictAudio
ClassLabelNUM_CLASSES	_HOMEPAGE	_CITATION)self r   V/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/audio/voxceleb.py_info;   s   zVoxceleb._infoc                 C   s   t j|jd}tjj|std	||
|}|dti}| |d }tjjtjj||d ddtjjtjj||d ddtjjtjj||d ddgS )	zReturns SplitGenerators.zvox_dev_wav.zipzcVoxCeleb requires manual download of the data. Please download the audio data and place it into: {}
iden_splittrain)extract_path
file_names)name
gen_kwargs
validationtest)ospathjoin
manual_dirtfiogfileexistsAssertionErrorformatextractdownloadIDEN_SPLITS_URL_calculate_splitsr   r   SplitGeneratorSplitTRAIN
VALIDATIONTEST)r   
dl_managerzip_pathr#   iden_splits_pathiden_splitsr   r   r   _split_generatorsH   s8   
zVoxceleb._split_generatorsc           	      c   sr    |D ]3}t j|d|}tjj|sq|dtd  d\}}}t	|dd }||d}||fV  qdS )zYields examples.r   Nz.wav/   r
   )
r)   r*   r+   r-   r.   r/   r0   lensplitint)	r   r#   r$   	file_name	full_namespeaker_
speaker_idexampler   r   r   _generate_exampleso   s   
zVoxceleb._generate_examplesc                 C   s~   t t}tjj|)}|D ]}|  \}}ddddt	| }|| 
|  qW d   |S 1 s8w   Y  |S )zCRead the train/dev/test splits from VoxCeleb's iden_split.txt file.r"   r'   r(   )      rB   N)collectionsdefaultdictsetr-   r.   r/   GFilestriprD   rE   add)r   r>   data_splitsflinegroupr*   
split_namer   r   r   r6   z   s   

zVoxceleb._calculate_splitsN)__name__
__module____qualname____doc__r   r   VersionVERSIONr2   r   MANUAL_DOWNLOAD_INSTRUCTIONSr    r@   rL   r6   r   r   r   r   r   1   s    'r   )r]   
__future__r   r   r   rO   r)   tensorflow.compat.v2compatv2r-   tensorflow_datasets.public_api
public_apir   r   r   r   r5   r   r   GeneratorBasedBuilderr   r   r   r   r   <module>   s   	