o
    Sik                     @   sH  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddl m!Z!m"Z" dZ#de!de$de$de$deeeef  f
ddZ%	dde$de!de&deeef fddZ'		d de!dee! de&de	e$eeef f fddZ(dS )!u  
The People’s Speech Dataset is among the world’s largest English speech recognition corpus today
that is licensed for academic and commercial usage under CC-BY-SA and CC-BY 4.0.
It includes 30,000+ hours of transcribed speech in English languages with a diverse set of speakers.
This open dataset is large enough to train speech-to-text systems and crucially is available with
a permissive license.
Just as ImageNet catalyzed machine learning for vision, the People’s Speech will unleash innovation
in speech research and products that are available to users across the globe.

Source: https://mlcommons.org/en/peoples-speech/
Full paper: https://openreview.net/pdf?id=R8CwidgJ0yT
    N)defaultdict)ThreadPoolExecutor)Path)DictListOptionalSequenceTupleUnion)tqdm)AudioSource	RecordingRecordingSetinfo)fix_manifests$validate_recordings_and_supervisions)manifests_exist)
load_jsonl)SupervisionSegmentSupervisionSet)Pathlikecompute_num_samples)ztrain/dirty_saztrain/dirtyztrain/clean_saztrain/cleanzvalidation/validationz	test/test	audio_dirtext
audio_path
identifierreturnc              
   C   sB   | | }t j||jd}t|j|jd|jd|dd|id}||fS )N)pathrecording_idg        r   English
session_id)idr   startdurationchannelr   languagecustom)r   	from_filestemr   r!   r#   )r   r   r   r   	full_path	recordingsegment r,   Q/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/peoples_speech.py_parse_utterance'   s    r.      subset
corpus_dirnum_jobsc                 C   s.  t |}|| dd  }| dd }||  }t|o}g }g }g }	tt|| d ddD ]}
t|
d   D ]\}}}||t	||||
d  q:q0t|d	dD ]}|
 }|\}}|| |	| qUt|}t|	}t||\}}t||d
 W d   ||fS 1 sw   Y  ||fS )z
    Returns the RecodingSet and SupervisionSet given a dataset part.
    :param subset: str, the name of the subset.
    :param corpus_dir: Pathlike, the path of the data dir.
    :return: the RecodingSet and SupervisionSet for train and valid.
    /r   r/   z.jsonzDistributing tasksdesctraining_datar   
Processing
recordingssupervisionsN)r   splitr   r   r   zipvaluesappendsubmitr.   resultr   from_recordingsr   from_segmentsr   r   )r0   r1   r2   part_dir	part_namer   exfuturesr9   r:   item_r   r   futurer@   r*   r+   recording_setsupervision_setr,   r,   r-   _prepare_subsetA   sL   





""rL   
output_dirc                 C   s   t | } |  sJ d|  td t}|dur%t |}|jddd tt}t|ddD ]^}td|	d	d
   t
|	d	d
 |dddrZtd|	d	d
  d q/t|| |\}}|dur||d|	d	d
  d  ||d|	d	d
  d  ||d||< q/|S )a_  
    Prepare :class:`~lhotse.RecordingSet` and :class:`~lhotse.SupervisionSet` manifests
    for The People's Speech.

    :param corpus_dir: Pathlike, the path of the main data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "recordings" and "supervisions" with lazily opened manifests.
    zNo such directory: zPreparing People's Speech...NT)parentsexist_okzDataset partsr4   z#Processing People's Speech subset: r3   r/   peoples_speechzjsonl.gz)partrM   prefixsuffixzPeople's Speech subset: z already prepared - skipping.peoples_speech_supervisions_z	.jsonl.gzpeoples_speech_recordings_r8   )r   is_dirloggingr   PEOPLES_SPEECHmkdirr   dictr   r;   r   rL   to_file)r1   rM   r2   subsets	manifestsrQ   rJ   rK   r,   r,   r-   prepare_peoples_speechv   s@   
r^   )r/   )Nr/   ))__doc__rW   collectionsr   concurrent.futures.threadr   pathlibr   typingr   r   r   r   r	   r
   	tqdm.autor   lhotse.audior   r   r   r   	lhotse.qar   r   lhotse.recipes.utilsr   lhotse.serializationr   lhotse.supervisionr   r   lhotse.utilsr   r   rX   strr.   intrL   r^   r,   r,   r,   r-   <module>   s\     



7