o
    Si2                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZ ddlZddlmZmZmZmZmZmZ ddlmZ ddlmZ 		d7d
eddfddZddddgdfdededee dedeee  dede	ee	eeeef f f fddZdededededef
ddZ dd Z!d d! Z"d"e
e dee fd#d$Z#d%Z$d&Z%d'd( e&e%e$D Z'd)d* Z(d+d, Z)d-d. Z*d/d0 Z+d1d2 Z,d3d4 Z-d5d6 Z.dS )8a  
The IWSLT Tunisian dataset is a 3-way parallel dataset consisting of approximately 160 hours
and 200,000 lines of aligned audio, Tunisian transcripts, and English translations. This dataset
comprises conversational telephone speech recorded at a sampling rate of 8kHz. The train, dev,
and test1 splits of the iwslt2022 shared task correspond to catalog number LDC2022E01. Please
note that access to this data requires an LDC subscription from your institution.To obtain this
dataset, you should download the predefined splits by running the following command:
git clone https://github.com/kevinduh/iwslt22-dialect.git. For more detailed information about
the shared task, please refer to the task paper available at this link:
https://aclanthology.org/2022.iwslt-1.10/.
    N)ThreadPoolExecutor)Path)DictIterableListOptionalUnion)AudioSource	RecordingRecordingSetSupervisionSegmentSupervisionSet$validate_recordings_and_supervisions)fix_manifests)Pathlike.
target_dirreturnc                 C   s   t d dS )z
    Download and untar the dataset.

    NOTE: This function just returns with a message since IWSLT22 Tunisian-English is not available
    for direct download.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    z
        To obtain this data your institution needs to have an LDC subscription.
        You also should download the pre-defined splits with
        git clone https://github.com/kevinduh/iwslt22-dialect.git
    N)logginginfo)r    r   M/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/iwslt22_ta.pydownload_iwslt22_ta6   s   r   Ftaeng   
corpus_dirsplits
output_dirnormalize_textlangsnum_jobsc                    s  ddl }i }tt|t| } | d }g }	ttt|d )}
|
D ]}|  \}}}t|}|	| dt	d| d q#W d   n1 sLw   Y  i }g }| d }g }t
|}|d	D ]t}|jd
s|jjjd d |j }||jdd d |j }| r|dj}|t|| d }||vrt|tddgt|| d dg|j|j|j|j d||< ||t||||	| qft|j d qftj|dddD ]}| }|du rq|D ]}|| qqt|}t !|}t"#|$ }t%||\}}t&|| dD ]D |' fdd}|' fdd}||d| < |durZt|}|j(ddd |)|d  d   |)|d!  d   qW d   |S 1 shw   Y  |S )"a  
    Prepares manifests for the train dev and test1 splits.

    :param corpus_dir: Path to ``LDC2022E01`` the path of the data dir.
    :param splits: Path to splits from https://github.com/kevinduh/iwslt22-dialect
    :param normalize_text: Bool, if True, Arabic text normalization is performed
        from https://aclanthology.org/2022.iwslt-1.29.pdf.
    :param output_dir: Directory where the manifests should be written. Can be omitted
        to avoid writing.
    :param langs: str, list of language abbreviations for source and target languages.
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.

    r   Nzdata/audio/tazexclude-utterance.txt_d   06zdata/transcripts/taz*.tsvz._translationsr   r   z.eng z.sphfile)typechannelssource)idsourcessampling_ratenum_samplesdurationzI does not exist, please make sure number of translations = transcriptionszProcessing textF)descleavetraindevtest1c                       | j   v S N)recording_idssplitsplit_filesr   r   <lambda>       z$prepare_iwslt22_ta.<locals>.<lambda>c                    r6   r7   r+   )rr;   r   r   r>      r?   )
recordingssupervisionsT)parentsexist_okziwslt22-ta_recordings_z	.jsonl.gzziwslt22-ta_supervisions_)*	soundfileload_splitsr   openstrstripr<   floatappendintr   globstem
startswithparentname	with_namesuffixexistswith_suffix	SoundFiler
   r	   
samplerateframessubmit_filename_to_supervisionsr   warningtqdmresultdeduplicate_supervisionsr   from_segmentsr   from_recordingsvaluesr   r   filtermkdirto_file)r   r   r   r   r    r!   sf	manifests	audio_direxcludefl	excludeidstartendrB   rC   text_dirfuturesexptranslations_pathfilenameaudio_sffuturer^   supsups_recs_r   r;   r   prepare_iwslt22_taJ   s    






@@rz   rr   rs   ri   c                 C   s  g }|  djd\}}}}	|  djd\}
}}}|  }|   }t|dd d}t|dd d}t||D ]\}}| d\}}}}| d\}}}}t|}t|}||d}||d	}| d| d| d|	 dt	d
| d	}|
 d| d| d| dt	d
| d	}||ksJ d| d| |rt
|}| dkrtd| j d| d| d| d	 q=||v rq=|t| d|d  d|d  d| |  dj|t|| ddd||d |d|d |iid	 q=|S )Nr&   r"   c                 S      |  dd S N	r   r<   liner   r   r   r>      r?   z+_filename_to_supervisions.<locals>.<lambda>keyc                 S   r{   r|   r~   r   r   r   r   r>      r?   r}   
transcripttranslationr#   r$   z<The loaded source and target files are not sorted properly:  z	Skipping z" with empty cleaned transcript ...r   r      )ndigitstranslated_text)	r+   r8   rm   r/   channeltextlanguagespeakercustom)rV   rO   r<   	read_text
splitlinessortedziprstriprK   rM   text_cleaningrJ   r   r\   rL   r   round)rr   rs   r   ri   r    rC   datetimesomeidr   date_tgttime_tgt
someid_tgtchannel_tgtr%   transcriptssorted_translationssorted_transcriptssrctgtrm   rn   sidr   r"   text_tgtutt_id
utt_id_tgtr   r   r   r[      sV   

*(
 
r[   c                 C   s\   t d}t d}|dkrt |d| d S |dkr&t |d| d  S td| d)	Nz[OUM]+/*|\u061F|\?|\!|\.z#\(|\)|\#|\+|\=|\?|\!|\;|\.|\,|\"|\:r   r&   r   r   zText normalization for z is not supported)recompilesubnlower
ValueError)	utterancer   arabic_filterenglish_filterr   r   r   r      s   

c              	   C   sp   i }dD ]1}g }| | d }t t|}|D ]	}||  qW d    n1 s,w   Y  |||< q|S )Nr2   z.file_id.txt)rH   rI   rL   rJ   )pathr   r<   	file_list	split_scprj   rk   r   r   r   rG      s   
rG   rC   c                 C   s|   ddl m} |dd t| dd d}g }| D ]"\}}t|dkr4tdt| d|d j d	 ||d  q|S )
Nr   )groupbyc                 S   s   | j S r7   r@   r9   r   r   r   r>     s    z*deduplicate_supervisions.<locals>.<lambda>r   r   zFound z$ supervisions with conflicting IDs (z) - keeping only the first one.)	cytoolzr   r   itemslenr   r\   r+   rL   )rC   r   
duplicatesfilteredkvr   r   r   r_   
  s   r_   u    ةىأإآu    هياااc                 C   s   i | ]	\}}t ||qS r   )ord).0abr   r   r   
<dictcomp>  s    r   c                 C   s
   |  tS r7   )	translate_toNormalizer9   r   r   r   normalize_text_   s   
r   c                 C   sX   t dd| } t dd| } t dd| } t dd| } t dd| } t dd	| } | S )
Nu   [إأٱآا]u   اu   (أ){2,}u   (ا){2,}u   (آ){2,}u   (ص){2,}u   صu   (و){2,}u   وr   subr   r   r   r   normalize_arabic$  s   r   c                 C   s   t dd| S )Nz/[\u064B-\u0652\u06D4\u0670\u0674\u06D5-\u06ED]+r&   r   r   r   r   r   remove_diacritics.  s   r   c                 C   s8   d}t j}t|| }|D ]}|| v r| |d} q| S )z;This function  removes all punctuations except the verbatimu4   `÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـr   )stringpunctuationsetreplace)r   arabic_punctuationsenglish_punctuationsall_punctuationsrr   r   r   r   remove_punctuations3  s   r   c                 C   s    t dd| } t dd| } | S )Nz\s+r   z\s+\.\s+r   r   r   r   r   r   remove_extra_spaceB  s   r   c                 C   s6   ddddddddd	d
ddddd}t |}| |S )N0123456789%r   u   ف)u   ٠u   ١u   ٢u   ٣u   ٤u   ٥u   ٦u   ٧u   ٨u   ٩u   ٪r"   u   ڤ|)rI   	maketransr   )r   eastern_to_westerntrans_stringr   r   r   east_to_west_numH  s"   

r   c                 C   s4   t | } t| } t| } t| } t| } t| } | S r7   )r   r   r   r   r   r   r   r   r   r   r   ]  s   r   )r   )/__doc__r   r   r   concurrent.futures.threadr   pathlibr   typingr   r   r   r   r   r]   lhotser	   r
   r   r   r   r   	lhotse.qar   lhotse.utilsr   r   boolrI   rM   rz   listr[   r   rG   r_   _preNormalize_postNormalizer   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s    


j
?

