o
    5ti                     @   s4   d dl Z ddlmZ ddlmZ G dd deZdS )    N   )
smart_open   )Datasetc                   @   s&   e Zd ZdZedd ZdddZdS )
TSVDatasetzM
    The format used by the MTNT datasets. Data is in a single TSV file.
    c                 C   sh   |  d}t|dkr(zt|d }W n ty!   td|  w ||d fS |dkr.dnd}|| fS )a  
        Splits the index and filename from a metadata string.

        e.g. meta="3:en-de.tsv", filed=[Any value] -> (3, "en-de.tsv")
             "en-de.tsv", filed="src" -> (1, "en-de.tsv")
             "en-de.tsv", filed="tgt" -> (2, "en-de.tsv")
        :r   r   zInvalid meta for TSVDataset: r   src)splitlenint
ValueError	Exception)metafieldarrindex r   I/home/ubuntu/.local/lib/python3.10/site-packages/sacrebleu/dataset/tsv.py_split_index_and_filename   s   
	z$TSVDataset._split_index_and_filenameNc                    s        |}|D ]r} |} fdd|| D }t|||| D ]W\}}} ||\}}tj j|} 	||}	t
|0}
t
|	d}|
D ]}t|dd| |d qMW d   n1 shw   Y  W d   n1 sww   Y  q%qdS )zProcesses raw files to plain text files.

        :param langpair: The language pair to process. e.g. "en-de". If None, all files will be processed.
        c                    s   g | ]
}t j j|qS r   )ospathjoin_rawdir).0r   selfr   r   
<listcomp>,   s    z.TSVDataset.process_to_text.<locals>.<listcomp>wt
	)fileN)maybe_download_get_langpair_metadata
fieldnameszipr   r   r   r   r   _get_txt_file_pathr   printrstripr	   )r   langpair	langpairsr#   origin_filesr   origin_filer   r   output_filefinfoutliner   r   r   process_to_text!   s0   




zTSVDataset.process_to_text)N)__name__
__module____qualname____doc__staticmethodr   r0   r   r   r   r   r      s
    
r   )r   utilsr   baser   r   r   r   r   r   <module>   s    