o
    Si                     @   s@  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZmZmZmZmZ ddlmZ dd	lmZmZmZ dd
lmZmZmZ ddlmZmZ ddl m!Z!m"Z" dZ#			dde!dee$ee$ f dee! de%dee$ee$eeef f f f
ddZ&dede
dedeeee$ee f f fddZ'dS )aJ  
Description taken from the official website of wenetspeech
(https://wenet-e2e.github.io/WenetSpeech/)

We release a 10000+ hours multi-domain transcribed Mandarin Speech Corpus
collected from YouTube and Podcast. Optical character recognition (OCR) and
automatic speech recognition (ASR) techniques are adopted to label each YouTube
and Podcast recording, respectively. To improve the quality of the corpus,
we use a novel end-to-end label error detection method to further validate and
filter the data.

See https://github.com/wenet-e2e/WenetSpeech for more details about WenetSpeech
    N)defaultdict)ProcessPoolExecutor)repeat)Path)AnyDictListOptionalSequenceTupleUnion)tqdm)compute_num_samplesfix_manifests$validate_recordings_and_supervisions)AudioSource	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeadd_durations)LMSDEVTEST_NETTEST_MEETINGall   
corpus_dirdataset_parts
output_dirnum_jobsreturnc              	   C   s  t | } |  sJ d|  |durt |}|jddd d|v r$tn|}tt}|D ]}|tvr9td| g g d||< q,| d }| sPJ d	| t	d
|  t
t|ddd}t|8}	t|	t|d t| t|ddD ]\}
}|D ]}|| d |
 || d ||  qqyW d   n1 sw   Y  |D ]?}tt|| d t|| d d\}}t||d |dur||d| d  ||d| d  ||d||< q|S )a  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: Which parts of dataset to prepare, all for all the
                          parts.
    :param output_dir: Pathlike, the path where to write the manifests.
    :num_jobs Number of workers to extract manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
             the keys 'recordings' and 'supervisions'.
    zNo such directory: NT)parentsexist_okr   z)No such part of dataset in WenetSpeech : )
recordingssupervisionszWenetSpeech.jsonzNo such file : zLoading raw manifests from : rutf8)encodingaudiosz#Processing WenetSpeech JSON entries)descr'   r(   wenetspeech_supervisions_z	.jsonl.gzwenetspeech_recordings_)r   is_dirmkdirWETNET_SPEECH_PARTSr   dict
ValueErroris_filelogginginfojsonloadopenr   r   mapparse_utterancer   appendextendr   r   from_recordingsr   from_segmentsr   to_file)r    r!   r"   r#   subsets	manifestssubraw_manifests_pathraw_manifestsex	recordingsegmentspartr'   r(    rK   O/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/wenet_speech.pyprepare_wenet_speech%   s`   
	
rM   audio	root_pathrB   c           	   
   C   s   d}t | d tddgt|| d  dgt| d |d|| d d	}tt}|D ]}g ||< q(| d
 D ]2}t|d | d |d t|d |d  |dd|d  d}|d D ]}||v rd|| 	| qWq3||fS )Ni>  aidfiler   path)typechannelssourceduration)rV   sampling_rate)idsourcesnum_samplesrW   rV   rI   sid
begin_timeend_time)rW   Chinesetext)rX   recording_idstartrV   languager_   rB   )
r   r   strr   r   r3   r   r   stripr=   )	rN   rO   rB   rW   rH   rI   rD   segsegmentrK   rK   rL   r<   m   sF   


r<   )r   Nr   )(__doc__r8   r6   collectionsr   concurrent.futuresr   	itertoolsr   pathlibr   typingr   r   r   r	   r
   r   r   	tqdm.autor   lhotser   r   r   lhotse.audior   r   r   lhotse.supervisionr   r   lhotse.utilsr   r   r2   rc   intrM   r<   rK   rK   rK   rL   <module>   sJ    $
H