o
    yiZ                     @   s  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
 d dlmZmZ d dlmZmZmZmZ dZdZdZG d	d
 d
ZdededefddZdee dee dee	eeef  fddZdeeef dee dee dedededefddZdee dedededee f
dd Zdee dee d!ed"ede	eee ef f
d#d$Zdee dee defd%d&Zdee deee  de	eef fd'd(Z d)ed*edefd+d,Z!	d>d-e
eee f dee
eee f  ded.ed/ed0eee  de	eeeee  f fd1d2Z"d.ed/edefd3d4Z#	5	5	6	5	5d?d-e
eee f dee
eee f  d7ed8ed9ed:ed;ede
ee	eee f f fd<d=Z$dS )@    N)	lru_cache)DictIteratorListOptionalSequenceTupleUnion)Tensortensor)_flip_trace_LevenshteinEditDistance_trace_to_alignment_validate_inputs
   2   i  c                   @   s   e Zd ZdZdZdZ				ddededed	ed
df
ddZeddde	d
e	fddZ
ede	d
e	fddZede	d
e	fddZede	d
e	fddZede	d
e	fddZdS )_TercomTokenizera  Re-implementation of Tercom Tokenizer in Python 3.

    See src/ter/core/Normalizer.java in https://github.com/jhclark/tercom Note that Python doesn't support named Unicode
    blocks so the mapping for relevant blocks was taken from here: https://unicode-table.com/en/blocks/

    This implementation follows the implemenation from
    https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_ter.py.
    z=([\u3001\u3002\u3008-\u3011\u3014-\u301f\uff61-\uff65\u30fb])z:([\uff0e\uff0c\uff1f\uff1a\uff1b\uff01\uff02\uff08\uff09])FT	normalizeno_punctuation	lowercaseasian_supportreturnNc                 C   s   || _ || _|| _|| _dS )a  Initialize the tokenizer.

        Args:
            normalize: An indication whether a general tokenization to be applied.
            no_punctuation: An indication whteher a punctuation to be removed from the sentences.
            lowercase: An indication whether to enable case-insesitivity.
            asian_support: An indication whether asian characters to be processed.
        N)r   r   r   r   )selfr   r   r   r    r   T/home/ubuntu/.local/lib/python3.10/site-packages/torchmetrics/functional/text/ter.py__init__F   s   
z_TercomTokenizer.__init__i   )maxsizesentencec                 C   sd   |sdS | j r| }| jr| |}| jr| |}| jr+| |}| jr+| |}d	|
 S )zApply a different tokenization techniques according.

        Args:
            sentence: An input sentence to pre-process and tokenize.

        Return:
            A tokenized and pre-processed sentence.
          )r   lowerr   _normalize_general_and_westernr   _normalize_asianr   _remove_punct_remove_asian_punctjoinsplit)r   r   r   r   r   __call__Z   s   




z_TercomTokenizer.__call__c                 C   s4   d|  d} g d}|D ]\}}t ||| } q| S )z4Apply a language-independent (general) tokenization.r   ))z\n-r   )z\nr   )z&quot;")z&amp;&)z&lt;<)z&gt;>)z([{-~[-` -&(-+:-@/]) \1 )z's z 's )z's$z 's)z([^0-9])([\.,])\1 \2 )z([\.,])([^0-9])z \1 \2)z
([0-9])(-)r-   resub)r   rulespatternreplacementr   r   r   r!   w   s
   z/_TercomTokenizer._normalize_general_and_westernc                 C   s   t dd|}t dd|}t dd|}t dd|}t dd|}t dd|}t d	d|}t | jd|}t | jd|}|S )
z?Split Chinese chars and Japanese kanji down to character level.z([\u4e00-\u9fff\u3400-\u4dbf])r,   z([\u31c0-\u31ef\u2e80-\u2eff])z+([\u3300-\u33ff\uf900-\ufaff\ufe30-\ufe4f])z([\u3200-\u3f22])z<(^|^[\u3040-\u309f])([\u3040-\u309f]+)(?=$|^[\u3040-\u309f])r-   z<(^|^[\u30a0-\u30ff])([\u30a0-\u30ff]+)(?=$|^[\u30a0-\u30ff])z<(^|^[\u31f0-\u31ff])([\u31f0-\u31ff]+)(?=$|^[\u31f0-\u31ff])r/   r0   _ASIAN_PUNCTUATION_FULL_WIDTH_PUNCTUATIONclsr   r   r   r   r"      s   z!_TercomTokenizer._normalize_asianc                 C   s   t dd| S )z1Remove punctuation from an input sentence string.z[\.,\?:;!\"\(\)]r   r.   )r   r   r   r   r#      s   z_TercomTokenizer._remove_punctc                 C   s$   t | jd|}t | jd|}|S )z7Remove asian punctuation from an input sentence string.r   r4   r7   r   r   r   r$      s   z$_TercomTokenizer._remove_asian_punct)FFTF)__name__
__module____qualname____doc__r5   r6   boolr   r   strr'   staticmethodr!   classmethodr"   r#   r$   r   r   r   r   r   9   s:    	
r   r   	tokenizerr   c                 C   s   ||   S )zGiven a sentence, apply tokenization.

    Args:
        sentence: The input sentence string.
        tokenizer: An instance of ``_TercomTokenizer`` handling a sentence tokenization.

    Return:
        The pre-processed output sentence string.
    )rstrip)r   rA   r   r   r   _preprocess_sentence   s   
rC   
pred_wordstarget_wordsc                 c   s    t t| D ]J}t t|D ]A}t|| tkrqt dtD ]0}| || d  ||| d  kr3 n|||fV  t| || k}t||| k}|sM|rO nqqqdS )a  Find matching word sub-sequences in two lists of words. Ignores sub-sequences starting at the same position.

    Args:
        pred_words: A list of a tokenized hypothesis sentence.
        target_words: A list of a tokenized reference sentence.

    Return:
        Yields tuples of ``target_start, pred_start, length`` such that:
        ``target_words[target_start : target_start + length] == pred_words[pred_start : pred_start + length]``

        pred_start:
            A list of hypothesis start indices.
        target_start:
            A list of reference start indices.
        length:
            A length of a word span to be considered.
       N)rangelenabs_MAX_SHIFT_DIST_MAX_SHIFT_SIZE)rD   rE   
pred_starttarget_startlength_hyp_refr   r   r   _find_shifted_pairs   s"    rQ   
alignmentspred_errorstarget_errorsrL   rM   rN   c                 C   s`   t ||||  dkrdS t ||||  dkrdS || |   kr+|| k r.dS  dS dS )aJ  A helper function which returns ``True`` if any of corner cases has been met. Otherwise, ``False`` is
    returned.

    Args:
        alignments: A dictionary mapping aligned positions between a reference and a hypothesis.
        pred_errors: A list of error positions in a hypothesis.
        target_errors: A list of error positions in a reference.
        pred_start: A hypothesis start index.
        target_start: A reference start index.
        length: A length of a word span to be considered.

    Return:
        An indication whether any of conrner cases has been met.
    r   TF)sum)rR   rS   rT   rL   rM   rN   r   r   r   $_handle_corner_cases_during_shifting   s   rV   wordsstarttargetc                 C   s   dt t dtdtdtdt t f
dd}dt t dtdtdtdt t f
dd	}dt t dtdtdtdt t f
d
d}||k rD|| |||S ||| krQ|| |||S || |||S )a<  Perform a shift in ``words`` from ``start`` to ``target``.

    Args:
        words: A words to shift.
        start: An index where to start shifting from.
        length: A number of how many words to be considered.
        target: An index where to end shifting.

    Return:
        A list of shifted words.
    rW   rX   rY   rN   r   c                 S   s8   | d | | |||   | ||  | || d   S Nr   rW   rX   rY   rN   r   r   r   $_shift_word_before_previous_position#     8z<_perform_shift.<locals>._shift_word_before_previous_positionc                 S   s8   | d | | || |  | |||   | |d   S rZ   r   r[   r   r   r   #_shift_word_after_previous_position&  r]   z;_perform_shift.<locals>._shift_word_after_previous_positionc                 S   sP   | d | }|| || ||  7 }|| |||  7 }|| || d  7 }|S rZ   r   )rW   rX   rY   rN   shifted_wordsr   r   r   !_shift_word_within_shifted_string)  s
   z9_perform_shift.<locals>._shift_word_within_shifted_string)r   r>   int)rW   rX   rN   rY   r\   r^   r`   r   r   r   _perform_shift  s   &&&rb   cached_edit_distancechecked_candidatesc                 C   s  || \}}t |}t|\}}}	d}
t| |D ]b\}}}t||	||||r'qd}td|D ]E}|| dkr9d}n|| |v rH|||  d }n n*||krOq.|}t| |||}|||d  || | |f}|d7 }|
rq||
krs|}
q.|tkrz nq|
sd| |fS |
\}}}}}|||fS )a  Attempt to shift words to match a hypothesis with a reference. It returns the lowest number of required
    edits between a hypothesis and a provided reference, a list of shifted words and number of checked candidates.

    Note that the filtering of possible shifts and shift selection are heavily based on somewhat arbitrary heuristics.
    The code here follows as closely as possible the logic in Tercom, not always justifying the particular design
    choices. (The paragraph copied from https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/lib_ter.py)

    Args:
        pred_words: A list of tokenized hypothesis sentence.
        target_words: A list of lists of tokenized reference sentences.
        cached_edit_distance: A pre-computed edit distance between a hypothesis and a reference.
        checked_candidates: A number of checked hypothesis candidates to match a provided reference.

    Return:
        best_score:
            The best (lowest) number of required edits to match hypothesis and reference sentences.
        shifted_words:
            A list of shifted words in hypothesis sentences.
        checked_candidates:
            A number of checked hypothesis candidates to match a provided reference.
    Nr   rF   )r   r   rQ   rV   rG   rb   _MAX_SHIFT_CANDIDATES)rD   rE   rc   rd   edit_distanceinverted_tracetracerR   rT   rS   bestrL   rM   rN   prev_idxoffsetidxr_   	candidate
best_score_r   r   r   _shift_words7  sJ   

rq   c                 C   sz   t |dkr
tdS t|}d}d}| }	 t||||\}}}|tks'|dkr(n|d7 }|}q||\}}	|| }
t|
S )a=  Compute translation edit rate between hypothesis and reference sentences.

    Args:
        pred_words: A list of a tokenized hypothesis sentence.
        target_words: A list of lists of tokenized reference sentences.

    Return:
        A number of required edits to match hypothesis and reference sentences.
    r           TrF   )rH   r   r   rq   rf   )rD   rE   rc   
num_shiftsrd   input_wordsdeltanew_input_wordsrg   rp   total_editsr   r   r   _translation_edit_rate  s$   


rx   c                 C   sP   t d}t d}|D ]}t|| }|t|7 }||k r|}q
|t| }||fS )a  Compute sentence TER statistics between hypothesis and provided references.

    Args:
        pred_words: A list of tokenized hypothesis sentence.
        target_words: A list of lists of tokenized reference sentences.

    Return:
        best_num_edits:
            The best (lowest) number of required edits to match hypothesis and reference sentences.
        avg_tgt_len:
            Average length of tokenized reference sentences.
    rr   g 7yQC)r   rx   rH   )rD   rE   tgt_lengthsbest_num_edits	tgt_words	num_editsavg_tgt_lenr   r   r   _compute_sentence_statistics  s   
r~   r|   
tgt_lengthc                 C   sD   |dkr| dkr| | }|S |dkr| dkrt d}|S t d}|S )aP  Compute TER score based on pre-computed a number of edits and an average reference length.

    Args:
        num_edits: A number of required edits to match hypothesis and reference sentences.
        tgt_length: An average length of reference sentences.

    Return:
        A corpus-level TER score or 1 if reference_length == 0.
    r   g      ?rr   )r   )r|   r   scorer   r   r   "_compute_ter_score_from_statistics  s   
r   predstotal_num_editstotal_tgt_lengthsentence_terc                    s   t || \}} t| |D ]2\}} fdd|D }t|  }	t|	|\}
}||
7 }||7 }|dur>|t|
|d q|||fS )a  Update TER statistics.

    Args:
        preds: An iterable of hypothesis corpus.
        target: An iterable of iterables of reference corpus.
        tokenizer:
        total_num_edits: A total number of required edits to match hypothesis and reference sentences.
        total_tgt_length: A total average length of reference sentences.

    Return:
        total_num_edits:
            A total number of required edits to match hypothesis and reference sentences.
        total_tgt_length:
            A total average length of reference sentences.
        sentence_ter:
            (Optionally) A list of sentence-level TER.

    Raises:
        ValueError:
            If length of ``preds`` and ``target`` differs.
    c                    s   g | ]	}t |  qS r   )rC   r&   ).0_tgtrA   r   r   
<listcomp>  s    z_ter_update.<locals>.<listcomp>Nr   )r   ziprC   r&   r~   appendr   	unsqueeze)r   rY   rA   r   r   r   predtgt
tgt_words_pred_words_r|   r   r   r   r   _ter_update  s   
r   c                 C   s
   t | |S )aM  Compute TER based on pre-computed a total number of edits and a total average reference length.
    Args:
        total_num_edits: A total number of required edits to match hypothesis and reference sentences.
        total_tgt_length: A total average length of reference sentences.

    Return:
        A corpus-level TER score.
    )r   )r   r   r   r   r   _ter_compute  s   
	r   FTr   r   r   r   return_sentence_level_scorec                 C   s   t |tstd| dt |tstd| dt |ts'td| dt |ts4td| dt||||}td}td}	|rGg nd}
t| ||||	|
\}}	}
t||	}|
r`||
fS |S )a  Calculate Translation edit rate (`TER`_)  of machine translated text with one or more references. This
    implementation follows the implmenetaions from
    https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/ter.py. The `sacrebleu` implmenetation is a
    near-exact reimplementation of the Tercom algorithm, produces identical results on all "sane" outputs.

    Args:
        preds: An iterable of hypothesis corpus.
        target: An iterable of iterables of reference corpus.
        normalize: An indication whether a general tokenization to be applied.
        no_punctuation: An indication whteher a punctuation to be removed from the sentences.
        lowercase: An indication whether to enable case-insesitivity.
        asian_support: An indication whether asian characters to be processed.
        return_sentence_level_score: An indication whether a sentence-level TER to be returned.

    Return:
        A corpus-level translation edit rate (TER).
        (Optionally) A list of sentence-level translation_edit_rate (TER) if `return_sentence_level_score=True`.

    Example:
        >>> preds = ['the cat is on the mat']
        >>> target = [['there is a cat on the mat', 'a cat is on the mat']]
        >>> translation_edit_rate(preds, target)
        tensor(0.1538)

    References:
        [1] A Study of Translation Edit Rate with Targeted Human Annotation
        by Mathew Snover, Bonnie Dorr, Richard Schwartz, Linnea Micciulla and John Makhoul `TER`_
    z<Expected argument `normalize` to be of type boolean but got .zAExpected argument `no_punctuation` to be of type boolean but got z<Expected argument `lowercase` to be of type boolean but got z@Expected argument `asian_support` to be of type boolean but got rr   N)
isinstancer=   
ValueErrorr   r   r   r   )r   rY   r   r   r   r   r   rA   r   r   r   	ter_scorer   r   r   translation_edit_rate  s0   
%




r   rZ   )FFTFF)%r/   	functoolsr   typingr   r   r   r   r   r   r	   torchr
   r   #torchmetrics.functional.text.helperr   r   r   r   rK   rJ   rf   r   r>   rC   ra   rQ   r=   rV   rb   rq   rx   r~   r   r   r   r   r   r   r   r   <module>   s   #$ ,&

&%!
O*"

*