o
    Ni<                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlm  mZ	 ddl
mZ dZdZdZd	Zed
 ed ed ed ed ed ed dZG dd dejjZdd ZG dd dejjZdd ZdS )Librispeech dataset.    )absolute_import)division)print_functionNan  @inproceedings{panayotov2015librispeech,
  title={Librispeech: an ASR corpus based on public domain audio books},
  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},
  pages={5206--5210},
  year={2015},
  organization={IEEE}
}
a  LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read
audiobooks from the LibriVox project, and has been carefully segmented and aligned.87
zhttp://www.openslr.org/12z$http://www.openslr.org/resources/12/zdev-clean.tar.gzzdev-other.tar.gzztest-clean.tar.gzztest-other.tar.gzztrain-clean-100.tar.gzztrain-clean-360.tar.gzztrain-other-500.tar.gz)	dev_clean	dev_other
test_clean
test_othertrain_clean100train_clean360train_other500c                       s*   e Zd ZdZejjd fdd	Z  ZS )LibrispeechConfigzBuilderConfig for Librispeech.Nc                    sx   | d}|du r|r|jnd}||d< | d}|du r(|r&d|jj }nd}||d< tt| jdi | || _dS )a  Constructs a LibrispeechConfig.

    Args:
      text_encoder_config: `tfds.features.text.TextEncoderConfig`, configuration
        for the `tfds.features.text.TextEncoder` used for the text feature.
      **kwargs: keyword arguments forwarded to super.
    nameN
plain_textdescriptionzTranscriptions use the %sz!Transcriptions are in plain text. )getr   encoder_cls__name__superr   __init__text_encoder_config)selfr   kwargsr   r   	__class__r   Y/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/audio/librispeech.pyr   >   s   
	

zLibrispeechConfig.__init__)N)	r   
__module____qualname____doc__tfdscoredisallow_positional_argsr   __classcell__r   r   r   r   r   ;   s    r   c                  C   sf   dt jjjdt jjjddt jjjdt jjjddg} g }| D ]}tt jd|d}|| q|S )	zMake built-in Librispeech BuilderConfigs.

  Uses 3 text encodings (plain_text, subwords with 8k vocab, subwords with 32k
  vocab).

  Returns:
    `list<tfds.audio.LibrispeechConfig>`
  N
subwords8ki    )r   r   
vocab_sizesubwords32ki   z1.1.0)versionr   )	r    featurestextTextEncoderConfigSubwordTextEncoderr   r!   Versionappend)text_encoder_configsconfigsr   configr   r   r   _make_builder_configsY   s(   

r1   c                   @   sF   e Zd ZdZe Zdd Zdd Zdd Zdd	 Z	d
d Z
dd ZdS )Librispeechr   c                 C   sV   t jj| tt jt jjddt jj| jj	dt
jt
jt
jddttt jjdddS )Ni>  )sample_rate)encoder_config)speechr)   
speaker_id
chapter_idid)r5   r)   )builderr   r(   supervised_keyshomepagecitationmetadata)r    r!   DatasetInfo_DESCRIPTIONr(   FeaturesDictAudioTextbuilder_configr   tfint64string_URL	_CITATIONMetadataDict)r   r   r   r   _info{   s"   zLibrispeech._infoc                 c   s,    |D ]}t |D ]	\}}|d V  q	qd S )Nr)   )_generate_librispeech_examples)r   dirs	directory_exampler   r   r   _vocab_text_gen   s   zLibrispeech._vocab_text_genc                 C   sX   t | d }| tj|dg d| jjd< | tj|dg d| jjd< d S )Nr   zLibriSpeech/SPEAKERS.TXT)r6   gendersubsetminutesr   speakerszLibriSpeech/CHAPTERS.TXT)r7   r6   rS   rR   
project_idbook_idchapter_titleproject_titlechapters)listvalues_read_metadata_fileospathjoininfor=   )r   rL   rM   r   r   r   _populate_metadata   s   zLibrispeech._populate_metadatac                 C   s   i }t jj|4}|D ](}|drq|dt|}dd t|dd  |dd  D |t|d < qW d    |S 1 s@w   Y  |S )N;|c                 S   s   i | ]	\}}||  qS r   )strip.0kvr   r   r   
<dictcomp>   s    z3Librispeech._read_metadata_file.<locals>.<dictcomp>   r   )	rD   iogfileGFile
startswithsplitlenzipint)r   r^   field_namesr=   flinefieldsr   r   r   r\      s   

zLibrispeech._read_metadata_filec                 C   sT   | t}dd | D }| jjd | | | | dd | D }|S )Nc                 S   s   g | ]\}}| d r|qS )train)rn   re   r   r   r   
<listcomp>   s
    z1Librispeech._split_generators.<locals>.<listcomp>r)   c                 S   s$   g | ]\}}t jj|d |idqS )rM   )r   
gen_kwargs)r    r!   SplitGeneratorre   r   r   r   rx      s    )download_and_extract_DL_URLSitemsr`   r(   maybe_build_from_corpusrP   ra   )r   
dl_managerextracted_dirsall_train_dirssplitsr   r   r   _split_generators   s   

zLibrispeech._split_generatorsc                 C   s,   t jjj}|||gB |tB | B S )zGenerates examples as dicts.)r    r!   lazy_importsapache_beamCreateFlatMaprK   	Reshuffle)r   pipelinerM   beamr   r   r   _build_pcollection   s   

zLibrispeech._build_pcollectionN)r   r   r   r   r1   BUILDER_CONFIGSrJ   rP   ra   r\   r   r   r   r   r   r   r2   v   s    r2   c              
   c   s    t j| dd}tjj|D ][}t j|}tjjt j||?}|D ]4}|	 }|
dd\}}d| }dd |
dd	d
 D \}	}
||	|
t j|||d}||fV  q'W d	   n1 sfw   Y  qd	S )z/Generate examples from a Librispeech directory.LibriSpeechz*/*/*/*.txt rj   z%s.flacc                 S   s   g | ]}t |qS r   )rr   )rf   elr   r   r   rx      s    z2_generate_librispeech_examples.<locals>.<listcomp>-N   )r8   r6   r7   r5   r)   )r]   r^   r_   rD   rk   rl   globdirnamerm   rd   ro   )rM   transcripts_globtranscript_filer^   rt   ru   key
transcript
audio_filer6   r7   rO   r   r   r   rK      s*    rK   )r   
__future__r   r   r   r]   tensorflow.compat.v2compatv2rD   tensorflow_datasets.public_api
public_apir    rH   r?   rG   _DL_URLr|   r!   BuilderConfigr   r1   BeamBasedBuilderr2   rK   r   r   r   r   <module>   s.   O