o
    5t¾i]@  ã                   @   s<  d Z ddlZddlmZmZmZ dZdZdZdZ	dZ
dZdZd	Zed
ƒZdZdZdZdZdZe ee ee ¡Zdee dee deeef fdd„Zdee dee dedeeee ef fdd„Zdee dedededee f
dd„Zdee dee fd d!„Zd"d#„ Zd$edeeeef fd%d&„ZG d'd(„ d(ƒZdS ))zDThis module implements various utility functions for the TER metric.é    N)ÚListÚTupleÚDicté   é
   é2   é   i'  iè  g €à7yÃACÚiÚdú ÚsÚxÚ	words_hypÚ	words_refÚreturnc                 C   s   t |ƒ}t | ƒ}|dkrt| }|dfS t|ƒ}d}| }d}	 t||||ƒ\}	}
}|tkr.n|	dkr3n|d7 }|
}q||ƒ\}}|| }||fS )zÆCalculate the translation edit rate.

    :param words_hyp: Tokenized translation hypothesis.
    :param words_ref: Tokenized reference translation.
    :return: tuple (number of edits, length)
    r   Tr   )ÚlenÚ_OP_DELÚBeamEditDistanceÚ_shiftÚ_MAX_SHIFT_CANDIDATES)r   r   Ún_words_refÚn_words_hypÚtraceÚ	cached_edÚshiftsÚinput_wordsÚchecked_candidatesÚdeltaÚnew_input_wordsÚedit_distanceÚtotal_edits© r!   úM/home/ubuntu/.local/lib/python3.10/site-packages/sacrebleu/metrics/lib_ter.pyÚtranslation_edit_rate-   s.   
ÿõr#   Úwords_hÚwords_rr   c                 C   sr  || ƒ\}}t |ƒ}t|ƒ\}}}	d}
t| |ƒD ]\}}}t|	||| … ƒdkr*qt|||| … ƒdkr7q|||   krE|| k rHn nqd}td|ƒD ]O}|| dkrZd}n|| |v ri|||  d }n n4||krpqO|}t| |||ƒ}t|ƒt| ƒksƒJ ‚|||ƒd  || | |f}|d7 }|
rœ||
krž|}
qO|tkr¥ nq|
s­d| |fS |
\}}}}}|||fS )aÝ  Attempt to shift words in hypothesis to match reference.

    Returns the shift that reduces the edit distance the most.

    Note that the filtering of possible shifts and shift selection are heavily
    based on somewhat arbitrary heuristics. The code here follows as closely
    as possible the logic in Tercom, not always justifying the particular design
    choices.

    :param words_h: Hypothesis.
    :param words_r: Reference.
    :param cached_ed: Cached edit distance.
    :param checked_candidates: Number of shift candidates that were already
                               evaluated.
    :return: (score, shifted_words, checked_candidates). Best shift and updated
             number of evaluated shift candidates.
    Nr   éÿÿÿÿr   )Ú_flip_traceÚtrace_to_alignmentÚ_find_shifted_pairsÚsumÚrangeÚ_perform_shiftr   r   )r$   r%   r   r   Ú	pre_scoreÚ	inv_tracer   ÚalignÚref_errÚhyp_errÚbestÚstart_hÚstart_rÚlengthÚprev_idxÚoffsetÚidxÚshifted_wordsÚ	candidateÚ
best_scoreÚ_r!   r!   r"   r   T   sP    û€ÿ

r   ÚwordsÚstartr5   Útargetc                 C   sÄ   ||k r | d|… | ||| …  | ||…  | || d…  S ||| krB| d|… | || |…  | ||| …  | |d…  S | d|… | || || …  | ||| …  | || d…  S )zÖPerform a shift in `words` from `start` to `target`.

    :param words: Words to shift.
    :param start: Where from.
    :param length: How many words.
    :param target: Where to.
    :return: Shifted words.
    Nr!   )r=   r>   r5   r?   r!   r!   r"   r,   ©   s"   	
ÿÿÿ
ÿÿÿr,   c                 c   s¶    t | ƒ}t |ƒ}t|ƒD ]K}t|ƒD ]D}t|| ƒtkrqd}| ||  |||  krW|tk rW|d7 }|||fV  ||| ksF||| krGn| ||  |||  krW|tk s0qqdS )aQ  Find matching word sub-sequences in two lists of words.

    Ignores sub-sequences starting at the same position.

    :param words_h: First word list.
    :param words_r: Second word list.
    :return: Yields tuples of (h_start, r_start, length) such that:
         words_h[h_start:h_start+length] = words_r[r_start:r_start+length]
    r   r   N)r   r+   ÚabsÚ_MAX_SHIFT_DISTÚ_MAX_SHIFT_SIZE)r$   r%   Ú	n_words_hÚ	n_words_rr3   r4   r5   r!   r!   r"   r)   À   s"   €
  ù€ùÿr)   c                 C   s
   |   t¡S )z”Flip the trace of edit operations.

    Instead of rewriting a->b, get a recipe for rewriting b->a.

    Simply flips insertions and deletions.
    )Ú	translateÚ	_FLIP_OPS)r   r!   r!   r"   r'   Þ   s   
r'   r   c                 C   sà   d}d}g }g }i }| D ]^}|t kr)|d7 }|d7 }|||< | d¡ | d¡ q|tkrD|d7 }|d7 }|||< | d¡ | d¡ q|tkrR|d7 }| d¡ q|tkrd|d7 }|||< | d¡ qtd|›ƒ‚|||fS )zìTransform trace of edit operations into an alignment of the sequences.

    :param trace: Trace of edit operations (' '=no change or 's'/'i'/'d').
    :return: Alignment, error positions in reference, error positions in hypothesis.
    r&   r   r   úunknown operation )Ú_OP_NOPÚappendÚ_OP_SUBÚ_OP_INSr   Ú	Exception)r   Úpos_hypÚpos_refr1   r0   r/   Úopr!   r!   r"   r(   è   s6   


r(   c                   @   s¸   e Zd ZdZdee fdd„Zdee deeef fdd„Z	d	ee d
edeeeeef   deeeef fdd„Z
dee deee  fdd„Zdee deeee f fdd„ZdS )r   a²  Edit distance with several features required for TER calculation.

        * internal cache
        * "beam" search
        * tracking of edit operations

    The internal self._cache works like this:

    Keys are words of the hypothesis. Values are tuples (next_node, row) where:

        * next_node is the cache for the next word in the sequence
        * row is the stored row of the edit distance matrix

    Effectively, caching allows to skip several rows in the edit distance
    matrix calculation and instead, to initialize the computation with the last
    matching matrix row.

    Beam search, as implemented here, only explores a fixed-size sub-row of
    candidates around the matrix diagonal (more precisely, it's a
    "pseudo"-diagonal since we take the ratio of sequence lengths into account).

    Tracking allows to reconstruct the optimal sequence of edit operations.

    :param words_ref: A list of reference tokens.
    r   c                 C   sR   || _ t| j ƒ| _dd„ t| jd ƒD ƒ| _i | _d| _ttfg| jd  | _	dS )z`BeamEditDistance` initializer.c                 S   s   g | ]}|t  tf‘qS r!   )Ú	_COST_INSrK   )Ú.0r	   r!   r!   r"   Ú
<listcomp>0  s    ÿz-BeamEditDistance.__init__.<locals>.<listcomp>r   r   N)
Ú
_words_refr   Ú_n_words_refr+   Ú_initial_rowÚ_cacheÚ_cache_sizeÚ_INT_INFINITYÚ	_OP_UNDEFÚ
_empty_row)Úselfr   r!   r!   r"   Ú__init__)  s   ÿzBeamEditDistance.__init__r   r   c                 C   s6   |   |¡\}}|  |||¡\}}}|  ||¡ ||fS )zãCalculate edit distance between self._words_ref and the hypothesis.

        Uses cache to skip some of the computation.

        :param words_hyp: Words in translation hypothesis.
        :return: Edit distance score.
        )Ú_find_cacheÚ_edit_distanceÚ
_add_cache)r[   r   Ústart_positionÚdistr   Únewly_created_matrixr   r!   r!   r"   Ú__call__:  s   
ÿzBeamEditDistance.__call__r$   r3   Úcachec                    s€  t |ƒ}‡ fdd„t|| ƒD ƒ}|| }t |ƒ|d ksJ ‚|r&ˆ j| nd}t|d k r8t |d t ¡}nt}t|d |d ƒD ]œ}	t |	| ¡}
td|
| ƒ}tˆ jd |
| ƒ}|	|krfˆ jd }t||ƒD ]s}|dkr„||	d  | d t	 t
f||	 |< qk||	d  ˆ j|d  kr–d}t}nt}t}||	d  |d  d | |f||	d  | d t	 t
f||	 |d  d t tff}|D ]\}}||	 | d |krÝ||f||	 |< qÇqkqCd}|}	ˆ j}|	dksð|dkr/||	 | d }|| }|ttfv r|	d8 }	|d8 }n|tkr|d8 }n|t
kr |	d8 }	ntd|›ƒ‚|	dksð|dksð|d d d |t |ƒd	… |fS )
aF  Actual edit distance calculation.

        Can be initialized with the last cached row and a start position in
        the hypothesis that it corresponds to.

        :param words_h: Words in translation hypothesis.
        :param start_h: Position from which to start the calculation.
                        (This is zero if no cache match was found.)
        :param cache: Precomputed rows corresponding to edit distance matrix
                      before `start_h`.
        :return: Edit distance value, newly computed rows to update the
                 cache, trace.
        c                    s   g | ]}t ˆ jƒ‘qS r!   )ÚlistrZ   )rQ   r<   ©r[   r!   r"   rR   c  s    ÿz3BeamEditDistance._edit_distance.<locals>.<listcomp>r   é   r   Ú rG   r&   N)r   r+   rT   Ú_BEAM_WIDTHÚmathÚceilÚfloorÚmaxÚminÚ	_COST_DELr   rS   rH   Ú	_COST_SUBrJ   rP   rK   rL   )r[   r$   r3   rd   rC   Ú
rest_emptyra   Úlength_ratioÚ
beam_widthr	   Úpseudo_diagÚmin_jÚmax_jÚjÚcost_subÚop_subÚopsÚop_costÚop_namer   rO   r!   rf   r"   r^   P  sd   

ÿ
&ý€þë


õ"zBeamEditDistance._edit_distanceÚmatc           
      C   s´   | j tkrdS | j}t|ƒ}t|ƒ| }t|ƒD ]
}|||  d }qt||d… ƒ|ks/J ‚t||d… |ƒD ]\}}||vrOi t|ƒf||< |  j d7  _ || }	|	d }q8dS )a{  Add newly computed rows to cache.

        Since edit distance is only calculated on the hypothesis suffix that
        was not in cache, the number of rows in `mat` may be shorter than
        hypothesis length. In that case, we skip over these initial words.

        :param words_hyp: Hypothesis words.
        :param mat: Edit distance matrix rows for each position.
        Nr   r   )rW   Ú_MAX_CACHE_SIZErV   r   r+   ÚzipÚtuple)
r[   r   r}   ÚnodeÚn_matÚskip_numr	   ÚwordÚrowÚvaluer!   r!   r"   r_   ©  s   


ûzBeamEditDistance._add_cachec                 C   sT   | j }d}| jg}|D ]}||v r!|d7 }|| \}}| |¡ q ||fS ||fS )zëFind the already computed rows of the edit distance matrix in cache.

        Returns a partially computed edit distance matrix.

        :param words_hyp: Translation hypothesis.
        :return: Tuple (start position, dist).
        r   r   )rV   rU   rI   )r[   r   r   r`   ra   r„   r…   r!   r!   r"   r]   Ë  s   zBeamEditDistance._find_cacheN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   Ústrr\   r   Úintrc   r^   r_   r]   r!   r!   r!   r"   r     s    ÿ
ÿY&"r   )rŠ   rj   Útypingr   r   r   rP   ro   rp   rB   rA   ri   r~   r   rŒ   rX   rK   r   rH   rJ   rY   r‹   Ú	maketransrF   r#   r   r,   r)   r'   r(   r   r!   r!   r!   r"   Ú<module>   s:    &'ÿ
ÿ&U
'