o
    NiJ                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlZddl	Z
ddlm  mZ ddlmZ dZedddZd	e Zd
ZdZg dZdd Zdd ZG dd dejjZdS )zVoxForge dataset.    )absolute_import)division)print_functionNz
@article{maclean2018voxforge,
  title={Voxforge},
  author={MacLean, Ken},
  journal={Ken MacLean.[Online]. Available: http://www.voxforge.org/home.[Acedido em 2012]},
  year={2018}
}
i     a  
VoxForge is a language classification dataset. It consists of user submitted
audio clips submitted to the website. In this release, data from 6 languages
is collected - English, Spanish, French, German, Russian, and Italian.
Since the website is constantly updated, and for the sake of reproducibility,
this release contains only recordings submitted prior to {}.
The samples are splitted between train, validation and testing so that samples
from each speaker belongs to exactly one split.
zhttp://www.voxforge.org/zMhttps://storage.googleapis.com/tfds-data/downloads/voxforge/voxforge_urls.txt)deenesfritruc                 C   s   t | |krtdjt | |dtdd | D }td| dkr)td| g }d}| D ]\}}|}||7 }||t|| t|| f q/|d	 d
 |d	 d |f|d	< |S )a  Computes boundary indices for each of the splits in split_probs.

  Args:
    split_probs: List of (split_name, prob), e.g. [('train', 0.6), ('dev', 0.2),
      ('test', 0.2)]
    n_items: Number of items we want to split.

  Returns:
    The item indices of boundaries between different splits. For the above
    example and n_items=100, these will be
    [('train', 0, 60), ('dev', 60, 80), ('test', 80, 100)].
  z]Not enough items for the splits. There are {splits} splits while there are only {items} items)splitsitemsc                 s       | ]\}}|V  qd S N ).0namepr   r   V/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/audio/voxforge.py	<genexpr>M       z,_compute_split_boundaries.<locals>.<genexpr>r   g:0yE>z"Probs should sum up to 1. probs={}g        r   )len
ValueErrorformatsumabsappendint)split_probsn_itemstotal_probssplit_boundariessum_pr   r   prevr   r   r   _compute_split_boundaries<   s$   "r%   c                 C   s   t tdd | D }tj|}|| t|t|}i }|D ]\}}}	t||	D ]}
||||
 < q+q!t	
t}| D ]\}}|| }|| | q<|S )aD  Split items to train/dev/test, so all items in group go into same split.

  Each group contains all the samples from the same speaker ID. The samples are
  splitted between train, validation and testing so that samples from each
  speaker belongs to exactly one split.

  Args:
    items_and_groups: Sequence of (item_id, group_id) pairs.
    split_probs: List of (split_name, prob), e.g. [('train', 0.6), ('dev', 0.2),
      ('test', 0.2)]
    split_number: Generated splits should change with split_number.

  Returns:
    Dictionary that looks like {split name -> set(ids)}.
  c                 s   r   r   r   )r   item_idgroup_idr   r   r   r   n   r   z-_get_inter_splits_by_group.<locals>.<genexpr>)sortedsetnprandomRandomStateshuffler%   r   rangecollectionsdefaultdictadd)items_and_groupsr   split_numbergroupsrngr"   group_id_to_split
split_namei_starti_endisplit_to_idsr&   r'   splitr   r   r   _get_inter_splits_by_group^   s   

r=   c                   @   sD   e Zd ZdZejdZe	d
eZdd Zdd Zdd	 Zd
S )Voxforgez@A Language classification dataset based on the VoxForge website.z1.0.0aD  
  VoxForge requires manual download of the audio archives. The complete list of
  archives can be found in {}. It can be downloaded using the following command:
  wget -i voxforge_urls.txt -x
  Note that downloading and building the dataset locally requires ~100GB disk
  space (but only ~60GB will be used permanently).
  c              	   C   s@   t jj| tt jt jjdddt jjtdt	j
ddttdS )Nwavi>  )file_formatsample_rate)namesaudiolabel
speaker_id)rD   rE   )builderdescriptionfeaturessupervised_keyshomepagecitation)tfdscoreDatasetInfo_DESCRIPTIONrI   FeaturesDictAudio
ClassLabelLABELStfstring	_HOMEPAGE	_CITATION)selfr   r   r   _info   s   zVoxforge._infoc                 C   sP  | dti}g }tjj|d 4}|D ])}| dddd}tj	
|j|}tjj|s9td||| qW d   n1 sIw   Y  |}g }	t||D ]\}
}tj	|\}}|dd }|	|
|f qWg d	}t|	|d}tjjtjj|d
 |ddtjjtjj|d |ddtjjtjj|d |ddgS )zReturns SplitGenerators.	urls_list" 'z5VoxForge requires manual download. Path {} is missingN-r   ))traingffffff?)
validationg?)testg?r`   )
file_names
dl_manager)r   
gen_kwargsra   rb   )download_URLS_LIST_FILErU   iogfileGFilestripreplaceospathjoin
manual_direxistsAssertionErrorr   r   zipr<   r=   rM   rN   SplitGeneratorSplitTRAIN
VALIDATIONTEST)rY   rd   r[   archive_urlsflinearchive_urlarchive_patharchivesarchives_and_speaker_idsarchive_archive_namerF   r   r   r   r   r   _split_generators   sX   
zVoxforge._split_generatorsc              
   c   s    |D ]Q}t j|\}}|dd }|dd }|||d  }||}	|	D ])\}
}|
ds4q*t j|
\}}d|||dtd  }||||dfV  q*qdS )	zYields examples.r_   r   z/Trunk   z.wavz{}_{}_{}NrC   )rm   rn   r<   indexiter_archiveendswithr   r   )rY   rc   rd   fnamefolderr   rF   	label_idxrE   r   wav_pathwav_objr   wav_namekeyr   r   r   _generate_examples   s   

zVoxforge._generate_examplesN)__name__
__module____qualname____doc__rM   rN   VersionVERSIONtextwrapdedentr   rg   MANUAL_DOWNLOAD_INSTRUCTIONSrZ   r   r   r   r   r   r   r>      s    ,r>   ) r   
__future__r   r   r   r/   datetimerm   r   numpyr*   tensorflow.compat.v2compatv2rU   tensorflow_datasets.public_api
public_apirM   rX   date
_LAST_DATEr   	isoformatrP   rW   rg   rT   r%   r=   rN   GeneratorBasedBuilderr>   r   r   r   r   <module>   s,   	

""