o
    5t¾iM  ã                   @   s4   d dl mZ d dlZddlmZ G dd„ deƒZdS )é    )Ú	lru_cacheNé   )ÚBaseTokenizerc                   @   s<   e Zd ZdZdd„ Zdd„ Zeddded	efd
d„ƒZdS )ÚTokenizerV14InternationalaE  Tokenizes a string following the official BLEU implementation.

    See github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983

    In our case, the input string is expected to be just one line.
    We just tokenize on punctuation and symbols,
    except when a punctuation is preceded and followed by a digit
    (e.g. a comma/dot as a thousand/decimal separator).
    We do not recover escaped forms of punctuations such as &apos; or &gt;
    as these should never appear in MT system outputs (see issue #138)

    Note that a number (e.g., a year) followed by a dot at the end of
    sentence is NOT tokenized, i.e. the dot stays with the number because
    `s/(\p{P})(\P{N})/ $1 $2/g` does not match this case (unless we add a
    space after each sentence). However, this error is already in the
    original mteval-v14.pl and we want to be consistent with it.
    The error is not present in the non-international version,
    which uses `$norm_text = " $norm_text "`.

    :param line: the input string to tokenize.
    :return: The tokenized string.
    c                 C   s   dS )NÚintl© ©Úselfr   r   úW/home/ubuntu/.local/lib/python3.10/site-packages/sacrebleu/tokenizers/tokenizer_intl.pyÚ	signature    s   z#TokenizerV14International.signaturec                 C   s.   t  d¡dft  d¡dft  d¡dfg| _d S )Nz(\P{N})(\p{P})z\1 \2 z(\p{P})(\P{N})z \1 \2z(\p{S})z \1 )ÚregexÚcompileÚ_rer   r   r   r
   Ú__init__#   s   
úz"TokenizerV14International.__init__i   )ÚmaxsizeÚlineÚreturnc                 C   s*   | j D ]
\}}| ||¡}qd | ¡ ¡S )Nú )r   ÚsubÚjoinÚsplit)r	   r   r   Úreplr   r   r
   Ú__call__-   s   z"TokenizerV14International.__call__N)	Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   Ústrr   r   r   r   r
   r      s    
r   )Ú	functoolsr   r   Útokenizer_baser   r   r   r   r   r
   Ú<module>   s    