o
    yiU3                     @   s   d dl Z d dlmZ d dlmZmZ d dlZd dlmZmZ d dl	m
Z
 d dlmZmZ d dlmZ dZd	ZG d
d dZ					ddee deee  dedede
d dedeee  defddZdS )    N)partial)OptionalSequence)Tensortensor)Literal)_bleu_score_compute_bleu_score_update)_REGEX_AVAILABLEnone13azhintlchar))u   㐀u   䶵)u   一u   龥)u   龦u   龻)u   豈u   鶴)u   侮u   頻)u   並u   龎)u    0u   ⩭6)u   ⾀0u   ⾡d)u   ＀u   ￯)u   ⺀u   ⻿)u   　u   〿)u   ㇀u   ㇯)u   ⼀u   ⿟)u   ⿰u   ⿿)u   ㄀u   ㄯ)u   ㆠu   ㆿ)u   ︐u   ︟)u   ︰u   ﹏)u   ☀u   ⛿)u   ✀u   ➿)u   ㈀u   ㋿)u   ㌀u   ㏿c                   @   s  e Zd ZdZeddfeddfeddfeddffZer:d	d
lZeddfeddfeddffZ	ddddddZ
d0ded dedd
fddZdedee fddZe	d0deded dedee fddZededefdd Zed!edefd"d#Zededefd$d%Zededefd&d'Zededefd(d)Zededefd*d+Zededefd,d-Zedededefd.d/Zd
S )1_SacreBLEUTokenizerzTokenizer used for SacreBLEU calculation.

    Source: https://github.com/mjpost/sacrebleu/tree/master/sacrebleu/tokenizers
    z([\{-\~\[-\` -\&\(-\+\:-\@\/])z \1 z([^0-9])([\.,])z\1 \2 z([\.,])([^0-9])z \1 \2z
([0-9])(-)r   Nz(\P{N})(\p{P})z(\p{P})(\P{N})z(\p{S})_tokenize_base_tokenize_13a_tokenize_zh_tokenize_international_tokenize_charr   Ftokenize	lowercasereturnc                 C   s   t | | j| | _|| _d S N)getattr_TOKENIZE_FNtokenize_fnr   )selfr   r    r   [/home/ubuntu/.local/lib/python3.10/site-packages/torchmetrics/functional/text/sacre_bleu.py__init__x   s   
z_SacreBLEUTokenizer.__init__linec                 C   s   |  |}| || j S r   )r   _lowerr   split)r   r"   tokenized_liner   r   r    __call__|   s   
z_SacreBLEUTokenizer.__call__c                 C   s(   t | | j| }||}| || S r   )r   r   r#   r$   )clsr"   r   r   r   r%   r   r   r    r      s   z_SacreBLEUTokenizer.tokenizec                 C   *   | j D ]
\}}|||}qd| S )zCommon post-processing tokenizer for `13a` and `zh` tokenizers.

        Args:
            line: a segment to tokenize

        Return:
            the tokenized line
         )_REGEXsubjoinr$   r'   r"   _rereplr   r   r    _tokenize_regex   s   
z#_SacreBLEUTokenizer._tokenize_regexucharc                 C   s.   t D ]\}}||   kr|kr dS  qqdS )z
        Args:
            uchar: input char in unicode

        Return:
            whether the input char is a Chinese character.
        TF)_UCODE_RANGES)r1   startendr   r   r    _is_chinese_char   s
   	z$_SacreBLEUTokenizer._is_chinese_charc                 C   s   |S )zTokenizes an input line with the tokenizer.

        Args:
            line: a segment to tokenize

        Return:
            the tokenized line
        r   r'   r"   r   r   r    r      s   
z"_SacreBLEUTokenizer._tokenize_basec                 C   sf   | dd}| dd}| dd}d|v r.| dd}| d	d}| d
d}| dd}| |S )zTokenizes an input line using a relatively minimal tokenization that is however equivalent to
        mteval-v13a, used by WMT.

        Args:
            line: input sentence

        Return:
            tokenized sentence
        z	<skipped> z-

r)   &z&quot;"z&amp;z&lt;<z&gt;>)replacer0   r6   r   r   r    r      s   
z!_SacreBLEUTokenizer._tokenize_13ac                 C   sL   |  }d}|D ]}| |r|d7 }||7 }|d7 }q||7 }q| |S )aj  The tokenization of Chinese text in this script contains two
        steps: separate each Chinese characters (by utf-8 encoding); tokenize
        the Chinese part (following the `13a` i.e. mteval tokenizer).
        Author: Shujian Huang huangsj@nju.edu.cn

        Args:
            line: input sentence

        Return:
            tokenized sentence
        r7   r)   )stripr5   r0   )r'   r"   line_in_charsr   r   r   r    r      s   



z _SacreBLEUTokenizer._tokenize_zhc                 C   r(   )a  Tokenizes a string following the official BLEU implementation.

        See github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983

        In our case, the input string is expected to be just one line.
        We just tokenize on punctuation and symbols,
        except when a punctuation is preceded and followed by a digit
        (e.g. a comma/dot as a thousand/decimal separator).
        We do not recover escaped forms of punctuations such as &apos; or &gt;
        as these should never appear in MT system outputs (see issue #138)

        Note that a number (e.g., a year) followed by a dot at the end of
        sentence is NOT tokenized, i.e. the dot stays with the number because
        `s/(\p{P})(\P{N})/ $1 $2/g` does not match this case (unless we add a
        space after each sentence). However, this error is already in the
        original mteval-v14.pl and we want to be consistent with it.
        The error is not present in the non-international version,
        which uses `$norm_text = " $norm_text "`.

        Args:
            line: the input string to tokenize.

        Return:
            The tokenized string.
        r)   )
_INT_REGEXr+   r,   r$   r-   r   r   r    r      s   z+_SacreBLEUTokenizer._tokenize_internationalc                 C   s   d dd |D S )zTokenizes all the characters in the input line.

        Args:
            line: a segment to tokenize

        Return:
            the tokenized line
        r)   c                 s   s    | ]}|V  qd S r   r   ).0r   r   r   r    	<genexpr>  s    z5_SacreBLEUTokenizer._tokenize_char.<locals>.<genexpr>)r,   r6   r   r   r    r     s   
z"_SacreBLEUTokenizer._tokenize_charc                 C   s   |r|   S | S r   )lower)r"   r   r   r   r    r#     s   z_SacreBLEUTokenizer._lower)F)__name__
__module____qualname____doc__recompiler*   r
   regexr@   r   r   boolr!   strr   r&   classmethodr   r0   staticmethodr5   r   r   r   r   r   r#   r   r   r   r    r   P   s`    
r      Fr   predstargetn_gramsmoothr   r   weightsr   c              	   C   s(  |t vrtdt  d| d|tj vr"tdttj  t| t|kr8tdt|  dt| |dkrBtsBtd|d	urXt||krXtd
t| d| |d	u rcd| g| }t	
|}t	
|}td}	td}
ttj||d}t| ||||	|
||\}	}
t|	|
|||||S )u  Calculate `BLEU score`_ [1] of machine translated text with one or more references. This implementation
    follows the behaviour of SacreBLEU [2] implementation from https://github.com/mjpost/sacrebleu.

    Args:
        preds: An iterable of machine translated corpus
        target: An iterable of iterables of reference corpus
        n_gram: Gram value ranged from 1 to 4
        smooth: Whether to apply smoothing – see [2]
        tokenize: Tokenization technique to be used.
            Supported tokenization: ['none', '13a', 'zh', 'intl', 'char']
        lowercase: If ``True``, BLEU score over lowercased text is calculated.
        weights:
            Weights used for unigrams, bigrams, etc. to calculate BLEU score.
            If not provided, uniform weights are used.

    Return:
        Tensor with BLEU Score

    Raises:
        ValueError: If ``preds`` and ``target`` corpus have different lengths.
        ValueError: If a length of a list of weights is not ``None`` and not equal to ``n_gram``.

    Example:
        >>> from torchmetrics.functional import sacre_bleu_score
        >>> preds = ['the cat is on the mat']
        >>> target = [['there is a cat on the mat', 'a cat is on the mat']]
        >>> sacre_bleu_score(preds, target)
        tensor(0.7598)

    References:
        [1] BLEU: a Method for Automatic Evaluation of Machine Translation by Papineni,
        Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu `BLEU`_

        [2] A Call for Clarity in Reporting BLEU Scores by Matt Post.

        [3] Automatic Evaluation of Machine Translation Quality Using Longest Common Subsequence
        and Skip-Bigram Statistics by Chin-Yew Lin and Franz Josef Och `Machine Translation Evolution`_
    z*Argument `tokenize` expected to be one of z	 but got .z6Unsupported tokenizer selected. Please, choose one of zCorpus has different size z != r   zv`'intl'` tokenization requires that `regex` is installed. Use `pip install regex` or `pip install torchmetrics[text]`.Nz5List of weights has different weights than `n_gram`: g      ?g        )r   r   )AVAILABLE_TOKENIZERS
ValueErrorr   r   keyslistlenr
   ModuleNotFoundErrortorchzerosr   r   r   r	   r   )rP   rQ   rR   rS   r   r   rT   	numeratordenominator	preds_len
target_lenr   r   r   r    sacre_bleu_score  s@   0

rb   )rO   Fr   FN)rH   	functoolsr   typingr   r   r\   r   r   typing_extensionsr   !torchmetrics.functional.text.bleur   r	   torchmetrics.utilities.importsr
   rV   r2   r   rL   intrK   floatrb   r   r   r   r    <module>   sD   ' K

