o
    5ti                     @   sT   d dl Z d dlmZ ddlmZ ddlmZ d dlm	Z	 dd Z
G d	d
 d
eZdS )    N   )
smart_open   )Dataset)defaultdictc                 C   s   | sdS d|  S )Nrefzref: 
translatorr   r   M/home/ubuntu/.local/lib/python3.10/site-packages/sacrebleu/dataset/wmt_xml.py_get_field_by_translator   s   
r   c                   @   sF   e Zd ZdZedd Zdd ZdddZd	d
 Zdd Z	dd Z
dS )WMTXMLDatasetzy
    The 2021+ WMT dataset format. Everything is contained in a single file.
    Can be parsed with the lxml parser.
    c              
      sh  t | }t t t }}}| dD ]
}||d q| dD ]}||d |d}|| q)t|dksKJ d|  t|dks]J dt| d|  g }g }	g }
g }d	d
 |D }tt	}d\}}}| dD ]}d|j
v rq||d7 }dd
 |dD }dd  |d} fdd
|D }|d} fdd
|D }t| D ]dtfdd| D sq|D ]}|t| |||i id q||  | D ]}|| ||   q|	|j
d  |
|j
d  ||d |ddu}|d7 }qq|d|i||	|
d|}|r2||d< |S )a  
        Unwraps the XML file from wmt21 or later.
        This script is adapted from https://github.com/wmt-conference/wmt-format-tools

        :param raw_file: The raw xml file to unwrap.
        :return: Dictionary which contains the following fields
            (each a list with values for each sentence):
            - `src`: The source sentences.
            - `docid`: ID indicating which document the sentences belong to.
            - `origlang`: The original language of the document.
            - `domain`: Domain of the document.
            - `ref:{translator}`: The references produced by each translator.
            - `ref`: An alias for the references from the first translator.
        z.//srclangz.//refr
   r   z-Multiple source languages found in the file: zFound z( reference languages found in the file: c                 S   s   i | ]}t |g qS r   r   ).0r
   r   r   r   
<dictcomp>=   s    z8WMTXMLDataset._unwrap_wmt21_or_later.<locals>.<dictcomp>)r   r   Fz.//doc	testsuitec                 S   s   i | ]}t |d |jqS )idintgettextr   segr   r   r   r   H   s    z.//src//segc                 S   s   dd |  dD S )Nc                 S   s(   i | ]}t |d |jr|jndqS )r    r   r   r   r   r   r   M   s    zKWMTXMLDataset._unwrap_wmt21_or_later.<locals>.get_sents.<locals>.<dictcomp>z.//seg)findall)docr   r   r   	get_sentsL   s   z7WMTXMLDataset._unwrap_wmt21_or_later.<locals>.get_sentsc                       i | ]
}| d  |qS r	   r   )r   ref_docr   r   r   r   T       z.//hypc                    r   )systemr   )r   hyp_docr!   r   r   r   Y   r"   c                    s   g | ]}|  d qS )r   r   )r   value)seg_idr   r   
<listcomp>_   s    z8WMTXMLDataset._unwrap_wmt21_or_later.<locals>.<listcomp>r   r   origlangdomainNsrc)docidr(   )ETparsesetgetrootr   addr   lenr   listattribsortedkeysanyvaluesr   append)raw_filetree	src_langs	ref_langstranslatorssrc_docr    r
   r*   docids
orig_langsdomainsrefssystemssrc_sent_count	doc_countseen_domainr   	src_sentsref_docstrans_to_refhyp_docshypssystem_namefieldsr   )r   r&   r   _unwrap_wmt21_or_later   sr   








z$WMTXMLDataset._unwrap_wmt21_or_laterc                 C   s8   |  || }t|tr|d n|d }tj| j|S )z
        Returns the path for this language pair.
        This is useful because in WMT22, the language-pair data structure can be a dict,
        in order to allow for overriding which test set to use.
        pathr   )_get_langpair_metadata
isinstancedictosrO   join_rawdir)selflangpairlangpair_datarel_pathr   r   r   _get_langpair_patht   s   z WMTXMLDataset._get_langpair_pathNc           	   
   C   s   |    t| | D ]_}| |}t|}| |}W d   n1 s(w   Y  |D ]<}| ||}tj	
|rFtj	|dkrFq/t|d}|| D ]}t| ||d qPW d   n1 sfw   Y  q/qdS )zProcesses raw files to plain text files.

        :param langpair: The language pair to process. e.g. "en-de". If None, all files will be processed.
        Nr   w)file)maybe_downloadr4   rP   r5   rZ   r   rN   _get_txt_file_pathrS   rO   existsgetsizeprint_clean)	rV   rW   rawfilefinrM   	fieldnametextfilefoutliner   r   r   process_to_text~   s$   

zWMTXMLDataset.process_to_textc                 C   sJ   | j dg }| || }t|tr|d|}n|}dd |D }|S )a  
        Returns the preferred references for this language pair.
        This can be set in the language pair block (as in WMT22), and backs off to the
        test-set-level default, or nothing.

        There is one exception. In the metadata, sometimes there is no translator field
        listed (e.g., wmt22:liv-en). In this case, the reference is set to "", and the
        field "ref" is returned.
        rB   c                 S   s   g | ]}t |qS r   r   )r   r   r   r   r   r'      s    z<WMTXMLDataset._get_langpair_allowed_refs.<locals>.<listcomp>)kwargsr   rP   rQ   rR   )rV   rW   defaultsrX   allowed_refsr   r   r   _get_langpair_allowed_refs   s   

z(WMTXMLDataset._get_langpair_allowed_refsc                    s:   |  | | |}| |} fddt||D }|S )z
        Returns the requested reference files.
        This is defined as a default at the test-set level, and can be overridden per language.
        c                    s   g | ]
\}}| v r|qS r   r   )r   ffieldrl   r   r   r'      s    z5WMTXMLDataset.get_reference_files.<locals>.<listcomp>)rm   	get_files
fieldnameszip)rV   rW   	all_files
all_fields	ref_filesr   rp   r   get_reference_files   s   



z!WMTXMLDataset.get_reference_filesc                 C   sP   |    | |}t|}| |}W d   n1 sw   Y  t| S )a  
        Return a list of all the field names. For most source, this is just
        the source and the reference. For others, it might include the document
        ID for each line, or the original language (origLang).

        get_files() should return the same number of items as this.

        :param langpair: The language pair (e.g., "de-en")
        :return: a list of field names
        N)r]   rZ   r   rN   r2   r5   )rV   rW   rc   rd   rM   r   r   r   rr      s   

zWMTXMLDataset.fieldnames)N)__name__
__module____qualname____doc__staticmethodrN   rZ   ri   rm   rw   rr   r   r   r   r   r      s    
]

r   )rS   
lxml.etreeetreer,   utilsr   baser   collectionsr   r   r   r   r   r   r   <module>   s    