o
    2wi0                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZmZmZm Z  dZ!dgZ"dZ#dddddddddZ$		d9dedeee%ee% f  defddZ&			d:d ed!ee deee%ee% f  d"e'dee%ee%ee%eeef f f f f
d#d$Z(		%	d;d ed!ee d%e%d"e'dee%ee%eeef f f f
d&d'Z)d(ed%e%fd)d*Z*d+d, Z+d-d. Z,d/d0 Z-d1d2 Z.d3d4 Z/d5d6 Z0d7d8 Z1dS )<a  
MTEDx is a collection of transcribed and translated speech corpora:
 https://openslr.org/100

It has 8 languages:
es - 189h
fr - 189h
pt - 164h
it - 107h
ru - 53h
el - 30h
ar - 18h
de - 15h

A subset of this audio is translated and split into the following partitions:
     - train
     - dev
     - test
     - iwslt2021 sets

This recipe only prepares the ASR portion of the data.
    N)defaultdict)ThreadPoolExecutor)partial)Path)DictOptionalSequenceUnion)tqdm)	RecordingRecordingSetSupervisionSegmentSupervisionSet$validate_recordings_and_supervisions)fix_manifests)Pathlikeis_module_availableresumable_downloadsafe_extract)	McMnLlLmLoLtLuNdZsu   ’)esfrptitruelardeFrenchSpanish
PortugueseItalianRussianGreekArabicGerman)r   r   r    r!   r"   r#   r$   r%   .all
target_dir	languagesreturnc              	   C   s  t | d } | jddd tt }t|tr |dkr |g}nt|ts*t|tr,|}t|dD ]N}| | d| d }| d| d	 }|	 rVt
d
| d| d q1td| d|d t|}t|| d W d   n1 svw   Y  |  q1| S )a  
    Download and untar the dataset.

    :param: target_dir: Pathlike, the path of the directory where the
        mtdex_corpus directory will be created and to which data will
        be downloaded.
    :param: languages: A str or sequence of strings specifying which
        languages to download. The default 'all', downloads all available
        languages.
    :return: the path to downloaded and extracted directory with data.
    mtedx_corpusTparentsexist_okr/   zDownloading MTEDx languages-z.tgzr.   z
.completedz	Skipping z	 because z exists.z+http://www.openslr.org/resources/100/mtedx_)filename)pathN)r   mkdirlistISOCODE2LANGkeys
isinstancestrtupler
   is_filelogginginfor   tarfileopenr   touch)r0   r1   
langs_listlangtar_pathcompleted_detectortar rL   Q/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/mtedx.pydownload_mtedxL   s*   
rN      
corpus_dir
output_dirnum_jobsc           	      C   s   t | tr	t| } t |trt|}tt }t |tr%|dkr%|g}nt |ts/t |tr7|d dkr7|}tt}|D ]}| | d|  }||  }|	 r[t
||||d||< q=t|S )a  
    Prepares manifest of all MTEDx languages requested.

    :param corpus_dir: Path to the root where MTEDx data was downloaded.
                       It should be called mtedx_corpus.
    :param output_dir: Root directory where .json manifests are stored.
    :param languages: str or str sequence specifying the languages to prepare.
        The str 'all' prepares all languages.
    :return:
    r/   r   r7   )languagerR   )r>   r?   r   r;   r<   r=   r@   r   dictis_dirprepare_single_mtedx_language)	rP   rQ   r1   rR   rG   	manifestsrH   corpus_dir_langoutput_dir_langrL   rL   rM   prepare_mtedxx   s.   



rZ   rS   c              
   C   s  t | tr	t| } tt}t|}dD ]}| d| d }tdd |dD }t	|dkr9t
d|  g }	| d| d	 }
g }|
d
D ]}||t|| qJt|dddD ]}| }|du riq^|D ]}|	| qkq^t	|	dkrt
d|
  t|	}	t||	\}}	t||	 ||	d||< |durt |trt|}|jddd |dkrdn|}||d| d| d  |	|d| d| d  qW d   t|S 1 sw   Y  t|S )a  
    Prepares manifests using a single MTEDx language.

    This function works as follows:

        - First it looks for the audio directory in the data/wav where the .flac
            files are stored.
        - Then, it looks for the vtt directory in data/{train,dev,test}/vtt
            which contains the segmentation and transcripts for the audio.
        - The transcripts undergo some basic text normalization

    :param corpus_dir: Path to the root of the MTEDx download
    :param output_dir: Path where the manifests are stored as .json files
    :param language: The two-letter language code.
    :param num_jobs: Number of threads to use when preparing data.
    :return:
    )trainvalidtestzdata/z/wavc                 s   s    | ]}t |V  qd S N)r   	from_file).0prL   rL   rM   	<genexpr>   s    

z0prepare_single_mtedx_language.<locals>.<genexpr>z*.flacr   zNo .flac files found in z/vtt*
ProcessingF)descleaveNzNo supervisions found in )
recordingssupervisionsTr4   r\   devzmtedx-_recordings_z	.jsonl.gz_supervisions_)r>   r?   r   r   rT   r   r   from_recordingsgloblenrB   warningappendsubmit_filename_to_supervisionsr
   resultr   from_segmentsr   r   r:   to_file)rP   rQ   rS   rR   rW   exsplit	audio_dirrg   rh   text_dirfuturesra   futurers   sup
save_splitrL   rL   rM   rV      s`   







..rV   r8   c                 C   s   |   }| jdd }g }tt}t|dD ]d\}}}g }	| D ]!}
|
 }td|r4|		| q"||
r>|		| q"|		d q"d
|	}td|tjrV| }nd|v s^d	|v r_q| }|	tt||||t|| d
dd|||d q|S )Nr.   r   z<noise>z^(\([^)]*\) *)+$z<unk> z^\w+ *(<[^>]*> *)+$<>   )ndigits)idrecording_idstartdurationchanneltextrS   speaker)	read_textstemrw   r   _filter_word
_parse_vttstriprematchrp   joinUNICODEr   _format_uttidround)r8   rS   linesrecoidrh   	filterfunr   endline	line_listww_line_line_newrL   rL   rM   rr      s@   

rr   c                 C   s$   d tt|d }d| |gS )Nz{0:08d}d   _)formatintfloatr   )r   r   rL   rL   rM   r   "  s   r   c                 C   s*   | D ]}t |tvr|tvr dS qdS )NFTunicodedatacategoryVALID_CATEGORIES	KEEP_LIST)scrL   rL   rM   r   +  s
   r   c                 C   s   t | tv p
| tv S r^   r   )r   rL   rL   rM   _filter3  s   r   c                 C   s0   |  d\}}}t|d t|d  t| S )N:g      @g      N@)rw   r   r   )timehrmnsecrL   rL   rM   	_time2sec8  s    r   c                 C   s&   |  d\}}t|}t|}||fS )Nz --> )rw   r   )lr   r   rL   rL   rM   _parse_time_segment>  s   r   c                 C   s   t | dkr	dS | S )Nr   r~   )r   r   )r   rL   rL   rM   _normalize_spaceF  s   r   c                 c   s^   t ds	tddd l}|d|j}|d}|d}| d}t|dD ]\}}|dkr| d	kr|d
}	t|	d \}
}d	|	dd  }|}|dd	kr|
||}|
d|}|
d	|}g }||D ]}|d		dd tt| ddD  qnd| d }|	|}|
ddd |}|
dd|  }|
||fV  q(d S )Nregexz>regex package not found. Please install... (pip install regex)r   z	\([^)]*\)z	(\w)'(\w)z(&[^ ;]*;)|(</?[iu]>)z

 
r~   rO   z- z
\1\u2019\2c                 S   s   g | ]}|qS rL   rL   )r`   irL   rL   rM   
<listcomp>j  s    z_parse_vtt.<locals>.<listcomp>r7   z\p{Zs}c                 S   s   t | dS )Nr   )r   group)mrL   rL   rM   <lambda>p  s    z_parse_vtt.<locals>.<lambda>z +)r   ImportErrorr   compiler   rw   	enumerater   r   r   subrp   filterr   replacelower)r   noisere2noise_patternapostrophe_pattern	html_tagsblocksr   bb_linesr   r   r   r   
line_partsline_parts_newlpjoinerrL   rL   rM   r   N  sH   





r   )r.   r/   )Nr/   rO   )NrS   rO   )2__doc__rB   r   rD   r   collectionsr   concurrent.futures.threadr   	functoolsr   pathlibr   typingr   r   r   r	   	tqdm.autor
   lhotser   r   r   r   r   	lhotse.qar   lhotse.utilsr   r   r   r   r   r   ASRr<   r?   rN   r   rZ   rV   rr   r   r   r   r   r   r   r   rL   rL   rL   rM   <module>   s    
."
7
N'	