o
    wiM                     @   s4   d dl mZ d dlZddlmZ G dd deZdS )    )	lru_cacheN   )BaseTokenizerc                   @   s<   e Zd ZdZdd Zdd Zeddded	efd
dZdS )TokenizerV14InternationalaE  Tokenizes a string following the official BLEU implementation.

    See github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983

    In our case, the input string is expected to be just one line.
    We just tokenize on punctuation and symbols,
    except when a punctuation is preceded and followed by a digit
    (e.g. a comma/dot as a thousand/decimal separator).
    We do not recover escaped forms of punctuations such as &apos; or &gt;
    as these should never appear in MT system outputs (see issue #138)

    Note that a number (e.g., a year) followed by a dot at the end of
    sentence is NOT tokenized, i.e. the dot stays with the number because
    `s/(\p{P})(\P{N})/ $1 $2/g` does not match this case (unless we add a
    space after each sentence). However, this error is already in the
    original mteval-v14.pl and we want to be consistent with it.
    The error is not present in the non-international version,
    which uses `$norm_text = " $norm_text "`.

    :param line: the input string to tokenize.
    :return: The tokenized string.
    c                 C   s   dS )Nintl selfr   r   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/sacrebleu/tokenizers/tokenizer_intl.py	signature    s   z#TokenizerV14International.signaturec                 C   s.   t ddft ddft ddfg| _d S )Nz(\P{N})(\p{P})z\1 \2 z(\p{P})(\P{N})z \1 \2z(\p{S})z \1 )regexcompile_rer   r   r   r
   __init__#   s   
z"TokenizerV14International.__init__i   )maxsizelinereturnc                 C   s*   | j D ]
\}}|||}qd| S )N )r   subjoinsplit)r	   r   r   replr   r   r
   __call__-   s   z"TokenizerV14International.__call__N)	__name__
__module____qualname____doc__r   r   r   strr   r   r   r   r
   r      s    
r   )	functoolsr   r   tokenizer_baser   r   r   r   r   r
   <module>   s    