o
    2wi                     @   s   d dl mZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 ejedddejd	ejddd
dejdejddd
dejde dejddedddejddddejddddd	e
de
de
dededefddZdS )    )OptionalSequenceUnionN)prepare)prepare_iwslt22_ta)PathlikeT)show_default)context_settings
corpus_dir)existsdir_okay)typesplits
output_dirz-jz
--num-jobs   zBHow many threads to use (can give good speed-ups with slow disks).)r   defaulthelpz--normalize-textFzpWhether to perform additional text cleaning and normalization from https://aclanthology.org/2022.iwslt-1.29.pdf.)r   r   z--langs zNComma-separated list of language abbreviations for source and target languagesnormalize_textlangsnum_jobsc                 C   s"   | d}t| |||||d dS )a  
    IWSLT_2022 data preparation.
    
    This is conversational telephone speech collected as 8kHz-sampled data.
    The catalog number LDC2022E01 corresponds to the train, dev, and test1
    splits of the iwslt2022 shared task.
    To obtaining this data your institution needs to have an LDC subscription.
    You also should download the predined splits with
    git clone https://github.com/kevinduh/iwslt22-dialect.git
    ,)r   r   cleanr   N)splitr   )r
   r   r   r   r   r   
langs_list r   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/bin/modes/recipes/iwslt22_ta.py
iwslt22_ta
   s   
'
r   )typingr   r   r   clicklhotse.bin.modesr   lhotse.recipes.iwslt22_tar   lhotse.utilsr   commanddictargumentPathoptionintboolstrr   r   r   r   r   <module>   sN    