o
    Si+                     @   s  d Z ddlmZ ddlmZ ddlmZmZ ddlm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZ ddlmZ ddlmZmZ ddlm Z m!Z!m"Z"m#Z# 	d9de ddfddZ$				d:de de de%de%de&de&dee'ee'eeef f f fddZ(d Z)d!Z*d"d# e+e)e*D Z,d$e'de'fd%d&Z-d'e'de'fd(d)Z.d'e'de'fd*d+Z/d'e'de'fd,d-Z0d'e'de'fd.d/Z1d'e'de'fd0d1Z2d'e'de'fd2d3Z3d'e'de'fd4d5Z4d6e'de&ddfd7d8Z5dS );a  
Description taken from official website: https://arabicspeech.org/mgb2/
The Multi-Dialect Broadcast News Arabic Speech Recognition (MGB-2):
The second edition of the Multi-Genre Broadcast (MGB-2) Challenge is
an evaluation of speech recognition and lightly supervised alignment
using TV recordings in Arabic. The speech data is broad and multi-genre,
spanning the whole range of TV output, and represents a challenging task for
speech technology. In 2016, the challenge featured two new Arabic tracks based
on TV data from Aljazeera. It was an official challenge at the 2016 IEEE
Workshop on Spoken Language Technology. The 1,200 hours MGB-2: from Aljazeera
TV programs have been manually captioned with no timing information.
QCRI Arabic ASR system has been used to recognize all programs. The ASR output
was used to align the manual captioning and produce speech segments for
training speech recognition. More than 20 hours from 2015 programs have been
transcribed verbatim and manually segmented. This data is split into a
development set of 10 hours, and a similar evaluation set of 10 hours.
Both the development and evaluation data have been released in the 2016 MGB
challenge
    )chaininfo)pathsystem)Path)matchsub)copy)punctuation)DictUnion)RecordingSetSupervisionSegmentSupervisionSetfix_manifests$validate_recordings_and_supervisions)load_kaldi_data_dir)manifests_existread_manifests_if_cached)Pathlikecheck_and_rglobis_module_availablerecursion_limit.
target_dirreturnNc                 C   s   t d dS )z
    Download and untar the dataset.

    NOTE: This function just returns with a message since MGB2 is not available
    for direct download.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    z{MGB2 is not available for direct download. Please fill out the format https://arabicspeech.org/mgb2 to download the corpus.Nr   )r    r   G/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/mgb2.pydownload_mgb2)   s   r   TF   P   
corpus_dir
output_dirtext_cleaningbuck_walternum_jobs
mer_threshc                    s  t | } |  sJ d|  g d}i }|dur-t |}|jddd t||dddd}|D ]T}td	|  t||ddd
rJtd| d q/t |}t | } |dksZ|dkr|| jddd t| | d || d  t| | d || d  t| | d d5}	t|| d d}
|	D ]}|
|	d|  d| d qW d   n1 sw   Y  W d   n1 sw   Y  t
|| d\}}}|du r|t}|dkrt|dksJ dt| nq|dkrt|dksJ dt| n]|dkr[tj| | d  d!|d"}tt| |d#d$}td% tt fd&d'|D }W d   n	1 s7w   Y  t|}t|d(ksQJ d)t| |du r[|t}t||\}}t|| ||d*| d+  ||d,| d+  ||d-||< q/|S ).a  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param text_cleaning: Bool, if True, basic text cleaning is performed (similar to ESPNet recipe).
    :param buck_walter: Bool, use BuckWalter transliteration
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :param mer_thresh: int, filter out segments based on mer (Match Error Rate)
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `text_cleaning` option removes all punctuation and diacritics.
    zNo such directory: )devtraintestNT)parentsexist_okmgb2zjsonl.gz)dataset_partsr#   prefixsuffixlazyzProcessing MGB2 subset: )partr#   r/   r0   zMGB2 subset: z already prepared - skipping.r*   r(   ztext.non_overlap_speechtextzsegments.non_overlap_speechsegmentszwav.scprwzwav//z/wav/i>  Fi  z+Expected 5365 supervisions for test, found i  z*Expected 5002 supervisions for dev, found r)   wavz*.wav)patternr&   zxml/utf8z*.xmli  c                    s   g | ]}t | qS r   )make_supervisions).0pr'   r   r   
<listcomp>   s    z prepare_mgb2.<locals>.<listcomp>i? z.Expected 375103 supervisions for train, found mgb2_recordings_z	.jsonl.gzmgb2_supervisions_)
recordingssupervisions)r   is_dirmkdirr   r   r   r
   openwritereplacer   transform_textfrom_buck_walterlenr   from_dirr   r   joinr   listr   from_iterabler   from_segmentscleaningr   r   to_file)r"   r#   r$   r%   r&   r'   r.   	manifestsr2   f_inf_outlinerA   rB   _	xml_pathssupervisions_listr   r=   r   prepare_mgb2:   s   




  







rY   uf   آؤئبتجگخذزشضظغـقلنويٌَِْٰپچءأإڤاةثحدرسصطعفكمهىًٍُّٱz3|&}btjGx*z$DZg_qlnwyNaio`PJ'><VApvHdrsSTEfkmhYFKu~{c                 C   s   i | ]	\}}t ||qS r   )ord)r;   abr   r   r   
<dictcomp>   s    r]   sc                 C   s
   |  tS N)	translate_backward_map)r^   r   r   r   rI      s   
rI   r3   c                 C   s   t dd| S )Nz/[\u064B-\u0652\u06D4\u0670\u0674\u06D5-\u06ED]+ r	   r3   r   r   r   remove_diacritics   s   re   c                 C   s6   d}t }t|| }|D ]}|| v r| |d} q| S )z;This function  removes all punctuations except the verbatimu:   ﴿﴾`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ )r   setrG   )r3   arabic_punctuationsenglish_punctuationsall_punctuationsr<   r   r   r   remove_punctuations   s   rk   c                 C   s   |   } tdd| S )Nz[^\u0600-\u06FF\s\da-z]+rb   )lowerr	   rd   r   r   r   remove_non_alphanumeric   s   rm   c                 C   s    |   }dd |D }d|S )z
    Remove single character word from text
    Example: I am in a a home for two years => am in home for two years
    Args:
            text (str): text
    Returns:
            (str): text with single char removed
    c                 S   s$   g | ]}t |d ks| r|qS )r    )rJ   	isnumeric)r;   wordr   r   r   r>      s   $ z+remove_single_char_word.<locals>.<listcomp>rf   )splitrL   )r3   wordsfilter_wordsr   r   r   remove_single_char_word   s   	
rs   c                 C   s6   ddddddddd	d
ddddd}t |}| |S )N0123456789%rf   u   ف)u   ٠u   ١u   ٢u   ٣u   ٤u   ٥u   ٦u   ٧u   ٨u   ٩u   ٪rV   u   ڤ|)str	maketransr`   )r3   eastern_to_westerntrans_stringr   r   r   east_to_west_num   s"   

r   c                 C   s   t dd| } t dd| } | S )Nz\s+rf   z\s+\.\s+r   rc   rd   r   r   r   remove_extra_space  s   r   c                 C   s4   t | } t| } t| } t| } t| } t| } | S r_   )rk   r   re   rm   rs   r   rd   r   r   r   rP     s   rP   xml_pathc                    sH   t dstdddlm} t| d}||d} fdd|d	D S )
Nbs4z@To prepare MGB2 data, please 'pip install beautifulsoup4' first.r   )BeautifulSoupr5   xmlc                    s   g | ]Z} d u st |d  krt|d d |d  d |d  |d dd dd	t |d tt |d t |d  d
ddddd |dD dttd|d 	ddqS )NWMERidrV   	starttime:endtime_uttr   -   )ndigitsrf   c                 S   s   g | ]
}|j d ur|j qS r_   )string)r;   elementr   r   r   r>   )  s
    
z0make_supervisions.<locals>.<listcomp>.<listcomp>r   Arabicz\w+speaker(\d+)\w+whor    )r   recording_idstartdurationchannelr3   languagespeaker)
floatr   rp   rG   roundrL   find_allintr   group)r;   segmentr=   r   r   r>     s(    
z%make_supervisions.<locals>.<listcomp>r   )r   
ValueErrorr   r   rE   r   )r   r'   r   
xml_handlesoupr   r=   r   r:     s   


r:   )r   )TFr    r!   )6__doc__	itertoolsr   loggingr   osr   r   pathlibr   rer   r	   shutilr
   r   r   typingr   r   lhotser   r   r   r   r   lhotse.kaldir   lhotse.recipes.utilsr   r   lhotse.utilsr   r   r   r   r   boolr   r   rY   _unicode_buckwalterzipra   rI   re   rk   rm   rs   r   r   rP   r:   r   r   r   r   <module>   sh    

 
