o
    5ti                     @   s4   d dl Z ddlmZ ddlmZ G dd deZdS )    N   )
smart_open   )Datasetc                   @   s   e Zd ZdZdddZdS )PlainTextDatasetz~
    The plain text format. Data is separated into source and reference files.
    Each line of the two files is aligned.
    Nc                    s        |}|D ]`} |} fdd|| D }t||D ]H\}}tj j|} ||}t	|*}t	|d}	|D ]
}
t
|
 |	d qAW d   n1 sVw   Y  W d   n1 sew   Y  q"qdS )zProcesses raw files to plain text files.

        :param langpair: The language pair to process. e.g. "en-de". If None, all files will be processed.
        c                    s   g | ]
}t j j|qS  )ospathjoin_rawdir).0r	   selfr   P/home/ubuntu/.local/lib/python3.10/site-packages/sacrebleu/dataset/plain_text.py
<listcomp>   s    z4PlainTextDataset.process_to_text.<locals>.<listcomp>wt)fileN)maybe_download_get_langpair_metadata
fieldnameszipr   r	   r
   r   _get_txt_file_pathr   printrstrip)r   langpair	langpairsr   origin_filesfieldorigin_fileoutput_filefinfoutliner   r   r   process_to_text   s*   



z PlainTextDataset.process_to_text)N)__name__
__module____qualname____doc__r#   r   r   r   r   r      s    r   )r   utilsr   baser   r   r   r   r   r   <module>   s    