o
    Sie                     @   s4  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddl m!Z!m"Z" dZ#de$de$fddZ%		d!de!dee! dee$ee$ f de&fddZ'dede$de$de$de$deeeef  fdd Z(dS )"a  
The KeSpeech is an open source speech dataset, KeSpeech, which involves 1,542 hours of speech
signals recorded by 27,237 speakers in 34 cities in China, and the pronunciation includes
standard Mandarin and its 8 subdialects. The new dataset possesses several properties.
The dataset provides multiple labels including content transcription, speaker identity and
subdialect, hence supporting a variety of speech processing tasks, such as speech recognition,
speaker recognition, and subdialect identification, as well as other advanced techniques
like multi-task learning and conditional learning.

Full paper: https://openreview.net/forum?id=b3Zoeq2sCLq
    N)defaultdict)ThreadPoolExecutor)Path)DictListOptionalSequenceTupleUnion)tqdm)AudioSource	RecordingRecordingSetinfo)fix_manifests$validate_recordings_and_supervisions)manifests_exist)
load_jsonl)SupervisionSegmentSupervisionSet)Pathlikecompute_num_samples)train_phase1train_phase2
dev_phase1
dev_phase2testlinereturnc                 C   s   |  dd} | S )Nz<SPOKEN_NOISE> )replace)r    r!   K/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/kespeech.pytext_normalize   s   r#   all   
corpus_dir
output_dirdataset_partsnum_jobsc                 C   s  t | } | d d }|  sJ d|  | s J d| |d ur/t |}|jddd d|v r5tn|}tt}|D ]}|tvrJtd| g g d||< q=t|}t|d	d
dD ]}	t	
d|	  t|	|dryt	
d|	 d q_g }
g }||	 }g }t|d dh}t|d dQ}t|d d:}t|d d#}t||||D ]\}}}}||t| |||| qW d    n1 sw   Y  W d    n1 sw   Y  W d    n1 sw   Y  W d    n1 sw   Y  t|dddD ]}| }|d u rq|\}}|
| || qt|
}t|}t||\}}t|| |d urO||d|	 d  ||d|	 d  ||d||	< q_W d    |S 1 scw   Y  |S )NTasksASRzNo such directory: T)parentsexist_okr$   z&No such part of dataset in KeSpeech : )
recordingssupervisionszProcessing KeSpeechsubset)descunitzProcessing KeSpeech subset: )partr'   zKeSpeech subset: z already prepared - skipping.zwav.scp)filetextutt2subdialectutt2spk
ProcessingF)r1   leavezkespeech-asr_supervisions_z	.jsonl.gzzkespeech-asr_recordings_)r   is_dirmkdirKE_SPEECH_PARTSr   dict
ValueErrorr   r   loggingr   r   openzipappendsubmitparse_utteranceresultr   from_recordingsr   from_segmentsr   r   to_file)r&   r'   r(   r)   	tasks_dirsubsets	manifestssubexr3   r.   r/   	part_pathfutureswav_scpr5   r6   r7   wav_line	text_linedialect_linespk_linefuturerE   	recordingsegmentrecording_setsupervision_setr!   r!   r"   prepare_kespeech$   s      






<<rZ   rQ   rR   rS   rT   c              	   C   s   |  jdd\}}|  jdd\}}|  jdd\}	}
|  jdd\}}||kr4||	kr4|	|ks6J d}t| | }t|tddgt| | dg|t|j|d|jd}t||d	|jt	|  |
|d
}||fS )Nr%   )maxspliti>  r4   r   )typechannelssource)durationsampling_rate)idsourcesr`   num_samplesr_   g        )ra   recording_idstartr_   r5   languagespeaker)
stripsplitr   r   r   strr   r_   r   r#   )r&   rQ   rR   rS   rT   wav_idwav_patht_wav_id
transcriptd_wav_iddialects_wav_idrg   r`   recording_inforV   supervisionr!   r!   r"   rD   z   s@   

	rD   )r$   r%   ))__doc__r?   collectionsr   concurrent.futures.threadr   pathlibr   typingr   r   r   r   r	   r
   	tqdm.autor   lhotse.audior   r   r   r   	lhotse.qar   r   lhotse.recipes.utilsr   lhotse.serializationr   lhotse.supervisionr   r   lhotse.utilsr   r   r<   rj   r#   intrZ   rD   r!   r!   r!   r"   <module>   sN     
V