o
    5tiD                     @   sb   d Z ddlZddlZddlmZmZ ddlmZmZm	Z	 ddl
mZmZmZ G dd ded	ZdS )
z+
The base class for all types of datasets.
    N)ABCMetaabstractmethod)DictListOptional   )SACREBLEU_DIRdownload_file
smart_openc                   @   s   e Zd Zddddeeee f fdedeee  dee dee deee  f
ddZd	d
 Ze	dd Z
dd Zdd Zdd Zed$d%ddZdee fddZdd Zdd Zdd Zdd Zd d! Zd"d# ZdS )&DatasetNnamedatadescriptioncitationmd5c                 K   sR   || _ || _|| _|| _|| _|| _|| _tj	t
| j | _tj	| jd| _dS )av  
        Params come from the values in DATASETS.

        :param name: Name of the dataset.
        :param data: URL of the raw data of the dataset.
        :param description: Description of the dataset.
        :param citation: Citation for the dataset.
        :param md5: MD5 checksum of the dataset.
        :param langpairs: List of available language pairs.
        rawN)r   r   r   r   r   	langpairskwargsospathjoinr   _outdir_rawdir)selfr   r   r   r   r   r   r    r   J/home/ubuntu/.local/lib/python3.10/site-packages/sacrebleu/dataset/base.py__init__   s   zDataset.__init__c                 C   sn   t j| jdd | jr| jndgt| j }t| j|D ]\}}t j| j| 	|}t
||| j|d qdS )z
        If the dataset isn't downloaded, use utils/download_file()
        This can be implemented here in the base class. It should write
        to ~/.sacreleu/DATASET/raw exactly as it does now.
        T)exist_okN)
extract_toexpected_md5)r   makedirsr   r   lenr   zipr   r   _get_tarball_filenamer	   )r   expected_checksumsurlr   tarballr   r   r   maybe_download0   s   
zDataset.maybe_downloadc                 C   s   t dd|  S )z
        Removes trailing and leading spaces and collapses multiple consecutive internal spaces to a single one.

        :param s: The string.
        :return: A cleaned-up string.
        z\s+ )resubstrip)sr   r   r   _cleanA   s   zDataset._cleanc                 C   s   | j ddd tj| S )z
        Produces a local filename for tarball.
        :param url: The url to download.
        :return: A name produced from the dataset identifier and the URL basename.
        /_.)r   replacer   r   basename)r   r%   r   r   r   r#   K   s   zDataset._get_tarball_filenamec                 C   s:   | j dd}|dd}tj| j| d| d| S )a#  
        Given the language pair and fieldname, return the path to the text file.
        The format is: ~/.sacrebleu/DATASET/DATASET.LANGPAIR.FIELDNAME

        :param langpair: The language pair.
        :param fieldname: The fieldname.
        :return: The path to the text file.
        r.   r/   :-r0   )r   r1   r   r   r   r   )r   langpair	fieldnamer   r   r   r   _get_txt_file_pathS   s   
 zDataset._get_txt_file_pathc                 C   sD   |du r	| j }|S || j vrtd| j d| || j | i}|S )a  
        Given a language pair, return the metadata for that language pair.
        Deal with errors if the language pair is not available.

        :param langpair: The language pair. e.g. "en-de"
        :return: Dict format which is same as self.langpairs.
        NzNo such language pair r.   )r   	Exceptionr   )r   r5   r   r   r   r   _get_langpair_metadatab   s   
zDataset._get_langpair_metadatareturnc                 C   s   dS )zProcesses raw files to plain text files.

        :param langpair: The language pair to process. e.g. "en-de". If None, all files will be processed.
        Nr   r   r5   r   r   r   process_to_texts   s   zDataset.process_to_textc                 C   s   ddgS )a  
        Return a list of all the field names. For most source, this is just
        the source and the reference. For others, it might include the document
        ID for each line, or the original language (origLang).

        get_files() should return the same number of items as this.

        :param langpair: The language pair (e.g., "de-en")
        :return: a list of field names
        srcrefr   r;   r   r   r   
fieldnames{   s   zDataset.fieldnamesc                 c   2    |  |}dd |D }t| D ]}|V  qdS )zs
        Iterates over all fields (source, references, and other metadata) defined
        by the dataset.
        c                 S      g | ]}t |qS r   r
   .0fr   r   r   
<listcomp>       z$Dataset.__iter__.<locals>.<listcomp>N)	get_filesr"   )r   r5   	all_filesall_finsitemr   r   r   __iter__   s   
zDataset.__iter__c                 c   sN    |  |}t|}|D ]}| V  qW d   dS 1 s w   Y  dS )z;
        Return an iterable over the source lines.
        N)get_source_filer
   r+   )r   r5   source_filefinliner   r   r   source   s   

"zDataset.sourcec                 c   r@   )z9
        Return an iterable over the references.
        c                 S   rA   r   rB   rC   r   r   r   rF      rG   z&Dataset.references.<locals>.<listcomp>N)get_reference_filesr"   )r   r5   	ref_filesref_finsrK   r   r   r   
references   s   
zDataset.referencesc                 C   s&   |  |}| |}|d}|| S )Nr=   )rH   r?   index)r   r5   rI   
all_fieldsrV   r   r   r   rM      s   


zDataset.get_source_filec                 C   s,   |  |}| |}dd t||D }|S )Nc                 S   s   g | ]\}}| d r|qS )r>   )
startswith)rD   rE   fieldr   r   r   rF      s
    z/Dataset.get_reference_files.<locals>.<listcomp>)rH   r?   r"   )r   r5   rI   rW   rS   r   r   r   rR      s   

zDataset.get_reference_filesc                    sB     } fdd|D }|D ]}tj|s  q|S )aE  
        Returns the path of the source file and all reference files for
        the provided test set / language pair.
        Downloads the references first if they are not already local.

        :param langpair: The language pair (e.g., "de-en")
        :return: a list of the source file and all reference files
        c                    s   g | ]}  |qS r   )r7   )rD   rY   r5   r   r   r   rF      s    z%Dataset.get_files.<locals>.<listcomp>)r?   r   r   existsr<   )r   r5   fieldsfilesfiler   rZ   r   rH      s   
	
zDataset.get_files)N)r:   N)__name__
__module____qualname__r   strr   r   r   r'   staticmethodr-   r#   r7   r9   r   r<   r?   rL   rQ   rU   rM   rR   rH   r   r   r   r   r      s@    


#
		
r   )	metaclass)__doc__r   r)   abcr   r   typingr   r   r   utilsr   r	   r
   r   r   r   r   r   <module>   s    