"""The implementation of the TER metric (Snover et al., 2006)."""

from typing import List, Dict, Sequence, Optional, Any

from ..tokenizers.tokenizer_ter import TercomTokenizer
from ..utils import sum_of_lists
from .base import Score, Signature, Metric
from .lib_ter import translation_edit_rate


class TERSignature(Signature):
    """A convenience class to represent the reproducibility signature for TER.

    :param args: key-value dictionary passed from the actual metric instance.
    """
    def __init__(self, args: dict):
        """`TERSignature` initializer."""
        super().__init__(args)
        self._abbr.update({
            'case': 'c',
            'tok': 't',
            'norm': 'nr',
            'punct': 'pn',
            'asian': 'as',
        })

        self.info.update({
            'case': 'mixed' if args['case_sensitive'] else 'lc',
            'tok': args['tokenizer_signature'],
            'norm': args['normalized'],
            'punct': not args['no_punct'],
            'asian': args['asian_support'],
        })


class TERScore(Score):
    """A convenience class to represent TER scores.

    :param score: The TER score.
    :param num_edits: The cumulative number of edits.
    :param ref_length: The cumulative average reference length.
    """
    def __init__(self, score: float, num_edits: float, ref_length: float):
        """`TERScore` initializer."""
        super().__init__('TER', score)
        self.num_edits = int(num_edits)
        self.ref_length = ref_length


class TER(Metric):
    """Translation edit rate (TER). A near-exact reimplementation of the Tercom
    algorithm, produces identical results on all "sane" outputs.

    Tercom original implementation: https://github.com/jhclark/tercom

    The beam edit distance algorithm uses a slightly different approach (we stay
    around the diagonal which is faster, at least in Python) so in some
    (extreme) corner cases, the output could differ.

    Caching in the edit distance is based partly on the PyTer package by Hiroyuki
    Tanaka (MIT license). (https://github.com/aflc/pyter)

    :param normalized: Enable character normalization. By default, normalizes a couple of things such as
        newlines being stripped, retrieving XML encoded characters, and fixing tokenization for punctuation. When
        'asian_support' is enabled, also normalizes specific Asian (CJK) character sequences, i.e.
        split them down to the character level.
    :param no_punct: Remove punctuation. Can be used in conjunction with 'asian_support' to also remove typical
        punctuation markers in Asian languages (CJK).
    :param asian_support: Enable special treatment of Asian characters. This option only has an effect when
        'normalized' and/or 'no_punct' is enabled. If 'normalized' is also enabled, then Asian (CJK)
        characters are split down to the character level. If 'no_punct' is enabled alongside 'asian_support',
        specific unicode ranges for CJK and full-width punctuations are also removed.
    :param case_sensitive: If `True`, does not lowercase sentences.
    :param references: A sequence of reference documents with document being
        defined as a sequence of reference strings. If given, the reference info
        will be pre-computed and cached for faster re-computation across many systems.
    """

    _SIGNATURE_TYPE = TERSignature

    def __init__(self, normalized: bool = False,
                 no_punct: bool = False,
                 asian_support: bool = False,
                 case_sensitive: bool = False,
                 references: Optional[Sequence[Sequence[str]]] = None):
        """`TER` initializer."""
        super().__init__()

        self.no_punct = no_punct
        self.normalized = normalized
        self.asian_support = asian_support
        self.case_sensitive = case_sensitive

        self.tokenizer = TercomTokenizer(
            normalized=self.normalized,
            no_punct=self.no_punct,
            asian_support=self.asian_support,
            case_sensitive=self.case_sensitive,
        )
        self.tokenizer_signature = self.tokenizer.signature()

        if references is not None:
            # Pre-compute and cache the tokenized reference information.
            self._ref_cache = self._cache_references(references)

    def _preprocess_segment(self, sent: str) -> str:
        """Given a sentence, apply tokenization if enabled.

        :param sent: The input sentence string.
        :return: The pre-processed output string.
        """
        return self.tokenizer(sent.rstrip())

    def _compute_score_from_stats(self, stats: List[float]) -> TERScore:
        """Computes the final score from already aggregated statistics.

        :param stats: A list or numpy array of segment-level statistics.
        :return: A `TERScore` object.
        """
        total_edits, sum_ref_lengths = stats[0], stats[1]

        if sum_ref_lengths > 0:
            score = total_edits / sum_ref_lengths
        elif total_edits > 0:
            score = 1.0
        else:
            score = 0.0

        return TERScore(100 * score, total_edits, sum_ref_lengths)

    def _aggregate_and_compute(self, stats: List[List[float]]) -> TERScore:
        """Computes the final TER score given the pre-computed corpus statistics.

        :param stats: A list of segment-level statistics
        :return: A `TERScore` instance.
        """
        return self._compute_score_from_stats(sum_of_lists(stats))

    def _compute_segment_statistics(
            self, hypothesis: str, ref_kwargs: Dict) -> List[float]:
        """Given a (pre-processed) hypothesis sentence and already computed
        reference words, returns the segment statistics required to compute
        the full TER score.

        :param hypothesis: Hypothesis sentence.
        :param ref_kwargs: A dictionary with `ref_words` key which is a list
        where each sublist contains reference words.
        :return: A two-element list that contains the 'minimum number of edits'
        and 'the average reference length'.
        """
        ref_lengths = 0
        best_num_edits = int(1e16)

        words_hyp = hypothesis.split()

        # Keep the lowest edit count across references and accumulate
        # reference lengths to compute their average.
        ref_words = ref_kwargs['ref_words']
        for words_ref in ref_words:
            num_edits, ref_len = translation_edit_rate(words_hyp, words_ref)
            ref_lengths += ref_len
            if num_edits < best_num_edits:
                best_num_edits = num_edits

        avg_ref_len = ref_lengths / len(ref_words)
        return [best_num_edits, avg_ref_len]

    def _extract_reference_info(self, refs: Sequence[str]) -> Dict[str, Any]:
        """Given a list of reference segments, applies pre-processing & tokenization
        and returns list of tokens for each reference.

        :param refs: A sequence of strings.
        :return: A dictionary that will be passed to `_compute_segment_statistics()`
        through keyword arguments.
        """
        ref_words = []
        for ref in refs:
            ref_words.append(self._preprocess_segment(ref).split())

        return {'ref_words': ref_words}
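
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). It relies on
# the public `corpus_score()` and `get_signature()` helpers that `Metric`
# subclasses inherit from `sacrebleu.metrics.base`. For `corpus_score()`, the
# references argument is a sequence of reference *streams*: one inner list per
# reference set, each parallel to the list of hypotheses. Run it with
# `python -m sacrebleu.metrics.ter` so the relative imports resolve.
if __name__ == '__main__':
    hypotheses = ['the cat sat on the mat', 'hello world']
    references = [
        ['the cat sat on the mat', 'hello world'],  # reference stream 1
        ['there is a cat on the mat', 'hi world'],  # reference stream 2
    ]

    ter = TER()
    score = ter.corpus_score(hypotheses, references)  # a `TERScore` instance
    print(score)                # e.g. "TER = ..."
    print(ter.get_signature())  # reproducibility signature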