o
    .wiN                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlmZmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZ d	Zed	 Zd
Ze je dZ dddZ!G dd dZ"					ddee# deee#  de$de%dede%de
ee&  defddZ'dS )    N)Sequence)partial)AnyClassVarOptional)Tensortensor)Literal)_bleu_score_compute_bleu_score_update)_IPADIC_AVAILABLE_MECAB_AVAILABLE_MECAB_KO_AVAILABLE_MECAB_KO_DIC_AVAILABLE_REGEX_AVAILABLE_SENTENCEPIECE_AVAILABLE	none13azhintlcharja-mecabko-mecab	flores101	flores200))u   㐀u   䶵)u   一u   龥)u   龦u   龻)u   豈u   鶴)u   侮u   頻)u   並u   龎)u    0u   ⩭6)u   ⾀0u   ⾡d)u   ＀u   ￯)u   ⺀u   ⻿)u   　u   〿)u   ㇀u   ㇯)u   ⼀u   ⿟)u   ⿰u   ⿿)u   ㄀u   ㄯ)u   ㆠu   ㆿ)u   ︐u   ︟)u   ︰u   ﹏)u   ☀u   ⛿)u   ✀u   ➿)u   ㈀u   ㋿)u   ㌀u   ㏿ztorchmetrics-floreszRhttps://dl.fbaipublicfiles.com/fairseq/models/flores/sacrebleu_tokenizer_spm.modelz)https://tinyurl.com/flores200sacrebleuspmr   r   c                   @   s  e Zd ZU dZeddfeddfeddfeddffZer;d	d
lZeddfeddfeddffZ	dddddddddd	Z
ee ed< d
d
dZeeeee f  ed< dGdededd
fdd Zd!edee fd"d#Ze	dGd$ed  d!edededee f
d%d&Zed$ed  d!edefd'd(Zed)edefd*d+Zed$ed  d!edefd,d-Zed$ed  d!edefd.d/Zed$ed  d!edefd0d1Zed$ed  d!edefd2d3Z ed$ed  d!edefd4d5Z!ed$ed  d!edefd6d7Z"ed$ed  d!edefd8d9Z#ed$ed  d!ede$d defd:d;Z%ed$ed  d!edefd<d=Z&ed$ed  d!edefd>d?Z'ed!ededefd@dAZ(ed$ed  dedd
fdBdCZ)edDe$d dd
fdEdFZ*d
S )H_SacreBLEUTokenizerzTokenizer used for SacreBLEU calculation.

    Source: https://github.com/mjpost/sacrebleu/tree/master/sacrebleu/tokenizers

    z([\{-\~\[-\` -\&\(-\+\:-\@\/])z \1 z([^0-9])([\.,])z\1 \2 z([\.,])([^0-9])z \1 \2z
([0-9])(-)r   Nz(\P{N})(\p{P})z(\p{P})(\P{N})z(\p{S})_tokenize_base_tokenize_13a_tokenize_zh_tokenize_international_tokenize_char_tokenize_ja_mecab_tokenize_ko_mecab_tokenize_flores_101_tokenize_flores_200r   _TOKENIZE_FNr   sentencepiece_processorsFtokenize	lowercasereturnc                 C   s&   |  | t| | j| | _|| _d S N)_check_tokenizers_validitygetattrr'   tokenize_fnr*   )selfr)   r*    r1   d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/torchmetrics/functional/text/sacre_bleu.py__init__   s   

z_SacreBLEUTokenizer.__init__linec                 C   s   |  |}| || j S r,   )r/   _lowerr*   split)r0   r4   tokenized_liner1   r1   r2   __call__   s   
z_SacreBLEUTokenizer.__call__clsc                 C   s2   |  | t| | j| }||}| || S r,   )r-   r.   r'   r5   r6   )r9   r4   r)   r*   r/   r7   r1   r1   r2   r)      s   
z_SacreBLEUTokenizer.tokenizec                 C   *   | j D ]
\}}|||}qd| S )zPost-processing tokenizer for `13a` and `zh` tokenizers.

        Args:
            line: a segment to tokenize

        Return:
            the tokenized line

         )_REGEXsubjoinr6   r9   r4   _rereplr1   r1   r2   _tokenize_regex   s   z#_SacreBLEUTokenizer._tokenize_regexucharc                    s   t  fddtD S )zCheck if character is chinese.

        Args:
            uchar: input char in unicode.

        Return:
            whether the input char is a Chinese character.

        c                 3   s,    | ]\}}|   ko|kn  V  qd S r,   r1   ).0startendrC   r1   r2   	<genexpr>   s   * z7_SacreBLEUTokenizer._is_chinese_char.<locals>.<genexpr>)any_UCODE_RANGESrG   r1   rG   r2   _is_chinese_char   s   z$_SacreBLEUTokenizer._is_chinese_charc                 C   s   |S )zTokenizes an input line with the tokenizer.

        Args:
            line: a segment to tokenize

        Return:
            the tokenized line

        r1   r9   r4   r1   r1   r2   r      s   z"_SacreBLEUTokenizer._tokenize_basec                 C   sn   | dd}| dd}| dd}d|v r.| dd}| d	d}| d
d}| dd}| d| dS )zTokenizes a line using a relatively minimal tokenization that is equivalent to mteval-v13a, used by WMT.

        Args:
            line: input sentence

        Return:
            tokenized sentence

        z	<skipped> z-

r;   &z&quot;"z&amp;z&lt;<z&gt;>)replacerB   rL   r1   r1   r2   r      s   z!_SacreBLEUTokenizer._tokenize_13ac                 C   sL   |  }d}|D ]}| |r|d7 }||7 }|d7 }q||7 }q| |S )ap  Tokenization of Chinese text.

        This is done in two steps: separate each Chinese characters (by utf-8 encoding) and afterwards tokenize the
        Chinese part (following the `13a` i.e. mteval tokenizer).
        Author: Shujian Huang huangsj@nju.edu.cn.

        Args:
            line: input sentence

        Return:
            tokenized sentence

        rM   r;   )striprK   rB   )r9   r4   line_in_charsr   r1   r1   r2   r       s   



z _SacreBLEUTokenizer._tokenize_zhc                 C   r:   )a  Tokenizes a string following the official BLEU implementation.

        See github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983

        In our case, the input string is expected to be just one line.
        We just tokenize on punctuation and symbols,
        except when a punctuation is preceded and followed by a digit
        (e.g. a comma/dot as a thousand/decimal separator).
        We do not recover escaped forms of punctuation such as &apos; or &gt;
        as these should never appear in MT system outputs (see issue #138)

        Note that a number (e.g., a year) followed by a dot at the end of
        sentence is NOT tokenized, i.e. the dot stays with the number because
        `s/(\\p{P})(\\P{N})/ $1 $2/g` does not match this case (unless we add a
        space after each sentence). However, this error is already in the
        original mteval-v14.pl and we want to be consistent with it.
        The error is not present in the non-international version,
        which uses `$norm_text = " $norm_text "`.

        Args:
            line: the input string to tokenize.

        Return:
            The tokenized string.

        r;   )
_INT_REGEXr=   r>   r6   r?   r1   r1   r2   r!     s   z+_SacreBLEUTokenizer._tokenize_internationalc                 C   s   d dd |D S )zTokenizes all the characters in the input line.

        Args:
            line: a segment to tokenize

        Return:
            the tokenized line

        r;   c                 s   s    | ]}|V  qd S r,   r1   )rD   r   r1   r1   r2   rH   4  s    z5_SacreBLEUTokenizer._tokenize_char.<locals>.<genexpr>)r>   rL   r1   r1   r2   r"   )  s   z"_SacreBLEUTokenizer._tokenize_charc                 C   s6   ddl }ddl}||jd }| }|| S )zTokenizes a Japanese string line using MeCab morphological analyzer.

        Args:
            line: the input string to tokenize.

        Return:
            The tokenized string.

        r   N	 -Owakati)ipadicMeCabTagger
MECAB_ARGSrT   parse)r9   r4   rX   rY   taggerr1   r1   r2   r#   6  
   z&_SacreBLEUTokenizer._tokenize_ja_mecabc                 C   s6   ddl }ddl}||jd }| }|| S )zTokenizes a Korean string line using MeCab-korean morphological analyzer.

        Args:
            line: the input string to tokenize.

        Return:
            The tokenized string.

        r   NrW   )mecab_komecab_ko_dicrZ   r[   rT   r\   )r9   r4   r_   r`   r]   r1   r1   r2   r$   I  r^   z&_SacreBLEUTokenizer._tokenize_ko_mecabc                 C   s|   ddl }| j| du r3| | j|< tjtt| dd }tj	|s+| 
| | j| | d| j| |S )zTokenizes a string line using sentencepiece tokenizer.

        Args:
            line: the input string to tokenize.
            tokenize: Tokenization technique to be used.

        Return:
            The tokenized string.

        r   N/r;   )sentencepiecer(   SentencePieceProcessorospathr>   _FLORES_LOCAL_DIR_FLORES_MODELS_URLr6   existsdownload_flores_fileLoadEncodeAsPieces)r9   r4   r)   rc   	file_pathr1   r1   r2   _tokenize_flores\  s   
z$_SacreBLEUTokenizer._tokenize_floresc                 C      |  |dS )zTokenizes a string line using sentencepiece tokenizer according to `FLORES-101`_ dataset.

        Args:
            line: the input string to tokenize.

        Return:
            The tokenized string.

        r   rn   rL   r1   r1   r2   r%   w     z(_SacreBLEUTokenizer._tokenize_flores_101c                 C   ro   )zTokenizes a string line using sentencepiece tokenizer according to `FLORES-200`_ dataset.

        Args:
            line: the input string to tokenize.

        Return:
            The tokenized string.

        r   rp   rL   r1   r1   r2   r&     rq   z(_SacreBLEUTokenizer._tokenize_flores_200c                 C   s   |r|   S | S r,   )lower)r4   r*   r1   r1   r2   r5     s   z_SacreBLEUTokenizer._lowerc                 C   s   || j vrtdt| j   |dkrtstd|dkr'tr#ts'td|dkr3tr/t	s3tdd|v r=t
s?td	d
S d
S )z}Check if a supported tokenizer is chosen.

        Also check all dependencies of a given tokenizers are installed.

        z6Unsupported tokenizer selected. Please, choose one of r   zv`'intl'` tokenization requires that `regex` is installed. Use `pip install regex` or `pip install torchmetrics[text]`.r   z`'ja-mecab'` tokenization requires that `MeCab` and `ipadic` are installed. Use `pip install mecab-python3 ipadic` or `pip install torchmetrics[text]`.r   z`'ko-mecab'` tokenization requires that `mecab_ko` and `mecab_ko_dic` are installed. Use `pip install mecab_ko mecab_ko_dic` or `pip install torchmetrics[text]`.floresz`'flores101' and 'flores200'` tokenizations require that `sentencepiece` is installed. Use `pip install sentencepiece` or `pip install torchmetrics[text]`.N)r'   
ValueErrorlistkeysr   ModuleNotFoundErrorr   r   r   r   r   )r9   r)   r1   r1   r2   r-     s&   
z._SacreBLEUTokenizer._check_tokenizers_validity
model_namec              
   C   s   ddl }ddl}tjtdd t|  }tjt|dd }z@t	|d0}|j
|}||  W d   n1 s>w   Y  W d   W dS W d   W dS 1 sXw   Y  W dS  |jyu } z	td|  d	|d}~ww )
zGDownload necessary files for `flores` tokenization via `sentencepiece`.r   NT)exist_okra   rb   wbzFailed to download z model.)sslurllib.requestre   makedirsrg   rh   rf   r>   r6   openrequesturlopenwritereadSSLErrorOSError)rx   r{   urllib	model_urlrm   out_fileremote_fileer1   r1   r2   rj     s   Vz(_SacreBLEUTokenizer.download_flores_file)F)+__name__
__module____qualname____doc__recompiler<   r   regexrV   r'   r   dict__annotations__r(   strr   r   _TokenizersLiteralboolr3   r   r8   classmethodtyper)   rB   staticmethodrK   r   r   r    r!   r"   r#   r$   r	   rn   r%   r&   r5   r-   rj   r1   r1   r1   r2   r   c   s   
 
" !r      Fr   predstargetn_gramsmoothr)   r*   weightsr+   c              	   C   s   t | t |krtdt |  dt | |dur,t ||kr,tdt | d| |du r7d| g| }t|}t|}td}	td}
ttj||d}t| ||||	|
||\}	}
t	|	|
|||||S )a^  Calculate `BLEU score`_ [1] of machine translated text with one or more references.

    This implementation follows the behaviour of SacreBLEU [2] implementation from https://github.com/mjpost/sacrebleu.

    .. note::
        In the original SacreBLEU, references are passed as a list of reference sets (grouped by reference index).
        In TorchMetrics, references are passed grouped per prediction (each prediction has its own list of references).

        For example::

            # Predictions
            preds = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']

            # Original SacreBLEU:
            refs = [
                ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'], # First set
                ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.'], # Second set
            ]

            # TorchMetrics SacreBLEU:
            target = [
                ['The dog bit the man.', 'The dog had bit the man.'], # References for first prediction
                ['It was not unexpected.', 'No one was surprised.'], # References for second prediction
                ['The man bit him first.', 'The man had bitten the dog.'], # References for third prediction
            ]

    Args:
        preds: An iterable of machine translated corpus
        target: An iterable of iterables of reference corpus
        n_gram: Gram value ranged from 1 to 4
        smooth: Whether to apply smoothing - see [2]
        tokenize: Tokenization technique to be used. Choose between ``'none'``, ``'13a'``, ``'zh'``, ``'intl'``,
            ``'char'``, ``'ja-mecab'``, ``'ko-mecab'``, ``'flores101'`` and ``'flores200'``.
        lowercase: If ``True``, BLEU score over lowercased text is calculated.
        weights:
            Weights used for unigrams, bigrams, etc. to calculate BLEU score.
            If not provided, uniform weights are used.

    Return:
        Tensor with BLEU Score

    Raises:
        ValueError: If ``preds`` and ``target`` corpus have different lengths.
        ValueError: If a length of a list of weights is not ``None`` and not equal to ``n_gram``.

    Example:
        >>> from torchmetrics.functional.text import sacre_bleu_score
        >>> preds = ['the cat is on the mat']
        >>> target = [['there is a cat on the mat', 'a cat is on the mat']]
        >>> sacre_bleu_score(preds, target)
        tensor(0.7598)

    References:
        [1] BLEU: a Method for Automatic Evaluation of Machine Translation by Papineni,
        Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu `BLEU`_

        [2] A Call for Clarity in Reporting BLEU Scores by Matt Post.

        [3] Automatic Evaluation of Machine Translation Quality Using Longest Common Subsequence
        and Skip-Bigram Statistics by Chin-Yew Lin and Franz Josef Och `Machine Translation Evolution`_

    zCorpus has different size z != Nz5List of weights has different weights than `n_gram`: g      ?g        )r)   r*   )
lenrt   torchzerosr   r   r   r)   r   r
   )r   r   r   r   r)   r*   r   	numeratordenominator	preds_len
target_lenr/   r1   r1   r2   sacre_bleu_score  s,   G

r   )r   Fr   FN)(re   r   tempfilecollections.abcr   	functoolsr   typingr   r   r   r   r   r   typing_extensionsr	   !torchmetrics.functional.text.bleur
   r   torchmetrics.utilities.importsr   r   r   r   r   r   AVAILABLE_TOKENIZERSr   rJ   rf   r>   
gettempdirrg   rh   r   r   intr   floatr   r1   r1   r1   r2   <module>   sV   ' 	  m

