o
    Si!                     @   sJ  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZmZ dZdd dD Z d!dee! fddZ			d"de
e de
e dee dee! dee! de	e"eeef f fddZ#	d!de
e dee! de
e fdd Z$dS )#aH  
The GALE Mandarin Broadcast news corpus consists of the following LDC corpora:

Audio: LDC2013S08, LDC2013S04, LDC2014S09, LDC2015S06, LDC2015S13, LDC2016S03
Text: LDC2013T20, LDC2013T08, LDC2014T28, LDC2015T09, LDC2015T25, LDC2016T12

# Training:  Testing:

The `S` corpora contain speech data and the `T` corpora contain the corresponding
transcriptions. This recipe prepares any subset of these corpora provided as
arguments, but pairs of speech and transcript corpora must be present. E.g.
to only prepare phase 3 news speech, the arguments
`audio_dirs = ["/export/data/LDC2013S08","/export/data/LDC2014S09"]` and
`transcript_dirs = ["/export/data/LDC2013T20","/export/data/LDC2014T28"]` must
be provided to the `prepare_gale_mandarin` method.

This data is not available for free - your institution needs to have an LDC subscription.
    N)defaultdict)chain)Path)DictListOptionalUnion)urlopen)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)check_dependencies)SupervisionSegmentSupervisionSet)Pathlikecheck_and_rglobis_module_availablezOhttps://github.com/kaldi-asr/kaldi/blob/master/egs/gale_mandarin/s5/local/test.c                 C   s   g | ]}t | qS  )KALDI_BASE_URL).0namer   r   P/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/gale_mandarin.py
<listcomp>&       r   )
LDC2013S04
LDC2013S08
LDC2014S09
LDC2015S06
LDC2015S13
LDC2016S03Fsegment_wordsc                 C   s,   t dstd| rt dstdd S d S )NpandaszGALE Mandarin data preparation requires the 'pandas' package to be installed. Please install it with 'pip install pandas' and try again.jiebazThe '--segment-words' option requires the 'jieba' package to be installed. Please install it with 'pip install jieba' and try again.)r   ImportErrorr!   r   r   r   r   3   s   r   T
audio_dirstranscript_dirs
output_dirabsolute_pathsreturnc           
         s  t | t |ksJ dtd ttdd tdd | D D tdd |D }td tfd	d
	 D }td t
t||dfdd}t||\}}t|| dd tD  tt}| fdd| fddd|d< | fdd| fddd|d< |durtd t|}|jddd dD ] }	||	 d |d|	 d  ||	 d |d |	 d  q|S )!a9  
    Prepare manifests for GALE Mandarin Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcripts_dirs: List of paths to transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Wheter to write absolute paths to audio sources (default = False)
    :param segment_words: Use `jieba` package to perform word segmentation (default = False)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    z@Paths to the same speech and transcript corpora must be providedz5Reading audio and transcript paths from provided dirsc                 S   s   i | ]}|j |qS r   )stemr   pr   r   r   
<dictcomp>\   r   z)prepare_gale_mandarin.<locals>.<dictcomp>c                 S   s$   g | ]}d D ]	}t ||ddqqS ))z*.wavz*.flacF)strictr   )r   dirextr   r   r   r   _   s    z)prepare_gale_mandarin.<locals>.<listcomp>c                 S   s   g | ]}t |d qS )z*.tdfr0   )r   r1   r   r   r   r   h       zPreparing recordings manifestc                 3   s&    | ]}t j| rd nddV  qd S )N   )relative_path_depth)r   	from_filer,   )r)   r   r   	<genexpr>m   s
    
z(prepare_gale_mandarin.<locals>.<genexpr>zPreparing supervisions manifestr%   c                    
   | j  v S Nrecording_ids)audio_pathsr   r   <lambda>u      
 z'prepare_gale_mandarin.<locals>.<lambda>c                 S   s(   g | ]}t |D ]	}|d  qqS )zutf-8)r	   decodestrip)r   urlliner   r   r   r   {   s
    c                    r8   r9   idrTESTr   r   r?      r@   c                    r8   r9   r:   r<   rI   r   r   r?      r@   )
recordingssupervisionsdevc                    
   | j  vS r9   rE   rG   rI   r   r   r?      r@   c                    rN   r9   r:   r<   rI   r   r   r?      r@   trainNz Writing manifests to JSONL filesT)parentsexist_ok)rO   rM   rK   zgale-mandarin_recordings_z	.jsonl.gzrL   zgale-mandarin_supervisions_)lenlogginginfor   r   r   from_iterabler   from_recordingsvaluesr   from_segmentsparse_transcriptsfilterr   r
   TEST_FILE_URLSdictmkdirto_file)
r&   r'   r(   r)   r!   transcript_pathsrK   rL   	manifestspartr   )rJ   r)   r>   r   prepare_gale_mandarinA   sj   







rb   r_   c                    s  t | dd l |rdd l}g }t }| D ]} j|ddtdg dttttttddddd		}||j	d
k }|d 
dd |d< |d 
 fdd|d< |d 
 fdd|d< | D ]h\}}|d  d|d  d| }	t|d |d  dd}
|	|v s|
dkrq`||	 |t|	|d |d |
|d |d d|s|d n	d||d |d |d |d |d |d |d |d  d!d"
 q`q|S )#Nr   	r4      )reco_idchannelstartendspeakergenderdialecttextsectionturnsegmentsection_typesu_type)re   rf   rg   rh   ri   rl   TF)	delimiterskiprowsusecolsnamesdtypeskipinitialspaceerror_bad_lineswarn_bad_linesnontransre   c                 S   s   |   ddddS )Nz(1) z.sph)rB   replacexr   r   r   r?      s    z#parse_transcripts.<locals>.<lambda>ri   c                    s     | s| dd S | S )N#r{   )isnullr|   rB   r}   pdr   r   r?      s    rl   c                    s     | s	|  S | S r9   )r   rB   r}   r   r   r   r?      r3   -rh   rg      )ndigitsrj   Mandarin rf   rk   rm   rn   ro   rp   rq   )rk   rm   rn   ro   rp   rq   )
rF   r;   rg   durationri   rj   languagerl   rf   custom)r   r"   r#   setread_csvrangestrintfloatrp   applyiterrowsroundaddappendr   joincut)r_   r!   r#   rL   supervision_idsfiledfidxrowsupervision_idr   r   r   r   rY      s|   !


rY   )F)NTF)%__doc__rS   collectionsr   	itertoolsr   pathlibr   typingr   r   r   r   urllib.requestr	   lhotser
   lhotse.audior   r   	lhotse.qar   lhotse.recipes.nscr   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   r[   boolr   rb   rY   r   r   r   r   <module>   sV    
X