o
    5ti]@                     @   s<  d Z ddlZddlmZmZmZ dZdZdZdZ	dZ
dZdZd	Zed
ZdZdZdZdZdZeee ee Zdee dee deeef fddZdee dee dedeeee ef fddZdee dedededee f
ddZdee dee fd d!Zd"d# Zd$edeeeef fd%d&ZG d'd( d(ZdS ))zDThis module implements various utility functions for the TER metric.    N)ListTupleDict   
   2      i'  i  g 7yACid sx	words_hyp	words_refreturnc                 C   s   t |}t | }|dkrt| }|dfS t|}d}| }d}	 t||||\}	}
}|tkr.n|	dkr3n|d7 }|
}q||\}}|| }||fS )zCalculate the translation edit rate.

    :param words_hyp: Tokenized translation hypothesis.
    :param words_ref: Tokenized reference translation.
    :return: tuple (number of edits, length)
    r   Tr   )len_OP_DELBeamEditDistance_shift_MAX_SHIFT_CANDIDATES)r   r   n_words_refn_words_hyptrace	cached_edshiftsinput_wordschecked_candidatesdeltanew_input_wordsedit_distancetotal_edits r!   M/home/ubuntu/.local/lib/python3.10/site-packages/sacrebleu/metrics/lib_ter.pytranslation_edit_rate-   s.   
r#   words_hwords_rr   c                 C   sr  || \}}t |}t|\}}}	d}
t| |D ]\}}}t|	|||  dkr*qt||||  dkr7q|||   krE|| k rHn nqd}td|D ]O}|| dkrZd}n|| |v ri|||  d }n n4||krpqO|}t| |||}t|t| ksJ |||d  || | |f}|d7 }|
r||
kr|}
qO|tkr nq|
sd| |fS |
\}}}}}|||fS )a  Attempt to shift words in hypothesis to match reference.

    Returns the shift that reduces the edit distance the most.

    Note that the filtering of possible shifts and shift selection are heavily
    based on somewhat arbitrary heuristics. The code here follows as closely
    as possible the logic in Tercom, not always justifying the particular design
    choices.

    :param words_h: Hypothesis.
    :param words_r: Reference.
    :param cached_ed: Cached edit distance.
    :param checked_candidates: Number of shift candidates that were already
                               evaluated.
    :return: (score, shifted_words, checked_candidates). Best shift and updated
             number of evaluated shift candidates.
    Nr   r   )_flip_tracetrace_to_alignment_find_shifted_pairssumrange_perform_shiftr   r   )r$   r%   r   r   	pre_score	inv_tracer   alignref_errhyp_errbeststart_hstart_rlengthprev_idxoffsetidxshifted_words	candidate
best_score_r!   r!   r"   r   T   sP    

r   wordsstartr5   targetc                 C   s   ||k r | d| | |||   | ||  | || d  S ||| krB| d| | || |  | |||   | |d  S | d| | || ||   | |||   | || d  S )zPerform a shift in `words` from `start` to `target`.

    :param words: Words to shift.
    :param start: Where from.
    :param length: How many words.
    :param target: Where to.
    :return: Shifted words.
    Nr!   )r=   r>   r5   r?   r!   r!   r"   r,      s"   	

r,   c                 c   s    t | }t |}t|D ]K}t|D ]D}t|| tkrqd}| ||  |||  krW|tk rW|d7 }|||fV  ||| ksF||| krGn| ||  |||  krW|tk s0qqdS )aQ  Find matching word sub-sequences in two lists of words.

    Ignores sub-sequences starting at the same position.

    :param words_h: First word list.
    :param words_r: Second word list.
    :return: Yields tuples of (h_start, r_start, length) such that:
         words_h[h_start:h_start+length] = words_r[r_start:r_start+length]
    r   r   N)r   r+   abs_MAX_SHIFT_DIST_MAX_SHIFT_SIZE)r$   r%   	n_words_h	n_words_rr3   r4   r5   r!   r!   r"   r)      s"   
  r)   c                 C   s
   |  tS )zFlip the trace of edit operations.

    Instead of rewriting a->b, get a recipe for rewriting b->a.

    Simply flips insertions and deletions.
    )	translate	_FLIP_OPS)r   r!   r!   r"   r'      s   
r'   r   c                 C   s   d}d}g }g }i }| D ]^}|t kr)|d7 }|d7 }|||< |d |d q|tkrD|d7 }|d7 }|||< |d |d q|tkrR|d7 }|d q|tkrd|d7 }|||< |d qtd||||fS )zTransform trace of edit operations into an alignment of the sequences.

    :param trace: Trace of edit operations (' '=no change or 's'/'i'/'d').
    :return: Alignment, error positions in reference, error positions in hypothesis.
    r&   r   r   unknown operation )_OP_NOPappend_OP_SUB_OP_INSr   	Exception)r   pos_hyppos_refr1   r0   r/   opr!   r!   r"   r(      s6   


r(   c                   @   s   e Zd ZdZdee fddZdee deeef fddZ	d	ee d
edeeeeef   deeeef fddZ
dee deee  fddZdee deeee f fddZdS )r   a  Edit distance with several features required for TER calculation.

        * internal cache
        * "beam" search
        * tracking of edit operations

    The internal self._cache works like this:

    Keys are words of the hypothesis. Values are tuples (next_node, row) where:

        * next_node is the cache for the next word in the sequence
        * row is the stored row of the edit distance matrix

    Effectively, caching allows to skip several rows in the edit distance
    matrix calculation and instead, to initialize the computation with the last
    matching matrix row.

    Beam search, as implemented here, only explores a fixed-size sub-row of
    candidates around the matrix diagonal (more precisely, it's a
    "pseudo"-diagonal since we take the ratio of sequence lengths into account).

    Tracking allows to reconstruct the optimal sequence of edit operations.

    :param words_ref: A list of reference tokens.
    r   c                 C   sR   || _ t| j | _dd t| jd D | _i | _d| _ttfg| jd  | _	dS )z`BeamEditDistance` initializer.c                 S   s   g | ]}|t  tfqS r!   )	_COST_INSrK   ).0r	   r!   r!   r"   
<listcomp>0  s    z-BeamEditDistance.__init__.<locals>.<listcomp>r   r   N)

_words_refr   _n_words_refr+   _initial_row_cache_cache_size_INT_INFINITY	_OP_UNDEF
_empty_row)selfr   r!   r!   r"   __init__)  s   zBeamEditDistance.__init__r   r   c                 C   s6   |  |\}}| |||\}}}| || ||fS )zCalculate edit distance between self._words_ref and the hypothesis.

        Uses cache to skip some of the computation.

        :param words_hyp: Words in translation hypothesis.
        :return: Edit distance score.
        )_find_cache_edit_distance
_add_cache)r[   r   start_positiondistr   newly_created_matrixr   r!   r!   r"   __call__:  s   
zBeamEditDistance.__call__r$   r3   cachec                    s  t |} fddt|| D }|| }t ||d ksJ |r& j| nd}t|d k r8t|d t }nt}t|d |d D ]}	t|	| }
td|
| }t jd |
| }|	|krf jd }t||D ]s}|dkr||	d  | d t	 t
f||	 |< qk||	d   j|d  krd}t}nt}t}||	d  |d  d | |f||	d  | d t	 t
f||	 |d  d t tff}|D ]\}}||	 | d |kr||f||	 |< qqkqCd}|}	 j}|	dks|dkr/||	 | d }|| }|ttfv r|	d8 }	|d8 }n|tkr|d8 }n|t
kr |	d8 }	ntd||	dks|dks|d d d |t |d	 |fS )
aF  Actual edit distance calculation.

        Can be initialized with the last cached row and a start position in
        the hypothesis that it corresponds to.

        :param words_h: Words in translation hypothesis.
        :param start_h: Position from which to start the calculation.
                        (This is zero if no cache match was found.)
        :param cache: Precomputed rows corresponding to edit distance matrix
                      before `start_h`.
        :return: Edit distance value, newly computed rows to update the
                 cache, trace.
        c                    s   g | ]}t  jqS r!   )listrZ   )rQ   r<   r[   r!   r"   rR   c  s    z3BeamEditDistance._edit_distance.<locals>.<listcomp>r      r    rG   r&   N)r   r+   rT   _BEAM_WIDTHmathceilfloormaxmin	_COST_DELr   rS   rH   	_COST_SUBrJ   rP   rK   rL   )r[   r$   r3   rd   rC   
rest_emptyra   length_ratio
beam_widthr	   pseudo_diagmin_jmax_jjcost_subop_subopsop_costop_namer   rO   r!   rf   r"   r^   P  sd   


&




"zBeamEditDistance._edit_distancematc           
      C   s   | j tkrdS | j}t|}t|| }t|D ]
}|||  d }qt||d |ks/J t||d |D ]\}}||vrOi t|f||< |  j d7  _ || }	|	d }q8dS )a{  Add newly computed rows to cache.

        Since edit distance is only calculated on the hypothesis suffix that
        was not in cache, the number of rows in `mat` may be shorter than
        hypothesis length. In that case, we skip over these initial words.

        :param words_hyp: Hypothesis words.
        :param mat: Edit distance matrix rows for each position.
        Nr   r   )rW   _MAX_CACHE_SIZErV   r   r+   ziptuple)
r[   r   r}   noden_matskip_numr	   wordrowvaluer!   r!   r"   r_     s   


zBeamEditDistance._add_cachec                 C   sT   | j }d}| jg}|D ]}||v r!|d7 }|| \}}|| q ||fS ||fS )zFind the already computed rows of the edit distance matrix in cache.

        Returns a partially computed edit distance matrix.

        :param words_hyp: Translation hypothesis.
        :return: Tuple (start position, dist).
        r   r   )rV   rU   rI   )r[   r   r   r`   ra   r   r   r!   r!   r"   r]     s   zBeamEditDistance._find_cacheN)__name__
__module____qualname____doc__r   strr\   r   intrc   r^   r_   r]   r!   r!   r!   r"   r     s    
Y&"r   )r   rj   typingr   r   r   rP   ro   rp   rB   rA   ri   r~   r   r   rX   rK   r   rH   rJ   rY   r   	maketransrF   r#   r   r,   r)   r'   r(   r   r!   r!   r!   r"   <module>   s:    &'
&U
'