o
    5ti                     @   sL   d dl Z d dlZddlmZ ddlmZ G dd deZG dd	 d	eZdS )
    N   )
smart_open   )Datasetc                   @   s2   e Zd ZdZdd Zdd ZdddZd	d
 ZdS )FakeSGMLDatasetz
    The fake SGML format used by WMT prior to 2021. Can't be properly parsed.
    Source and reference(s) in separate files.
    c              
   C   s   t |@}t |d"}|D ]}|dr$| tdd|}t||d qW d   n1 s/w   Y  W d   dS W d   dS 1 sGw   Y  dS )zK
        Extract data from raw file and convert to raw txt format.
        wt<seg z<seg.*?>(.*)</seg>.*?z\1fileN)r   
startswith_cleanresubprintselfinput_file_pathoutput_filep_pathfinfoutline r   O/home/ubuntu/.local/lib/python3.10/site-packages/sacrebleu/dataset/fake_sgml.py_convert_format   s   
PzFakeSGMLDataset._convert_formatc           	   	   C   s   t |P}t |d2}d}|D ]%}|dr)t| d|}|dur(|d}q|dr4t||d qW d   n1 s?w   Y  W d   dS W d   dS 1 sWw   Y  dS )	zP
        Extract metadata from document tags, projects across segments.
        r    z<doc z="(.*?)"Nr   r   r	   )r   r   r   searchgroupr   )	r   r   fieldr   r   r   valuer   matchr   r   r   _convert_meta   s    


PzFakeSGMLDataset._convert_metaNc                    s       D ]Q  }fdd  D }| fddtdD 7 }t||D ]*\}}tjj|}	 |}|
dsM|
drT|| q1||| q1qdS )zProcesses raw files to plain text files.

        :param langpair: The language pair to process. e.g. "en-de". If None, all files will be processed.
        c                    s   g | ]
}t j j|qS r   ospathjoin_rawdir).0r#   )r   r   r   
<listcomp>7   s    z3FakeSGMLDataset.process_to_text.<locals>.<listcomp>c                    s$   g | ]}t jj  d  qS )r   r!   )r&   _langpair	langpairsr   r   r   r'   <   s       srcrefN)maybe_download_get_langpair_metadata
fieldnamesrangezipr"   r#   r$   r%   _get_txt_file_pathr   r   r    )r   r*   r1   origin_filesr   origin_fileoutput_filer   r)   r   process_to_text,   s$   


zFakeSGMLDataset.process_to_textc                 C   s   |  |}t|| }|dksJ d| j ddg}|dkr%|d nt|| dd D ]\}}|d|  q/| jd	sH|g d
7 }|S )a)  
        Return a list of all the field names. For most source, this is just
        the source and the reference. For others, it might include the document
        ID for each line, or the original language (origLang).

        get_files() should return the same number of items as this.
        r   zEach language pair in z must have at least 2 fields.r-   r.   r   Nzref:wmt08)docidgenreoriglang)r0   lennameappend	enumerater   )r   r*   metalengthfieldsir(   r   r   r   r1   K   s   

zFakeSGMLDataset.fieldnames)N)__name__
__module____qualname____doc__r   r    r8   r1   r   r   r   r   r      s    
r   c                       s    e Zd ZdZ fddZ  ZS )WMTAdditionDatasetz=
    Handle special case of WMT Google addition dataset.
    c              	      s   | drt ||S t|3}t|d}|D ]
}t| |d qW d    n1 s.w   Y  W d    d S W d    d S 1 sFw   Y  d S )Nz.sgmr   r	   )endswithsuperr   r   r   rstripr   	__class__r   r   r   m   s   

"z"WMTAdditionDataset._convert_format)rE   rF   rG   rH   r   __classcell__r   r   rM   r   rI   h   s    rI   )r"   r   utilsr   baser   r   rI   r   r   r   r   <module>   s    `