o
    yiC                  	   @   s>  d dl Z d dlmZmZmZ d dlmZmZmZm	Z	m
Z
 dZdZedZeG dd deeZG d	d
 d
eZG dd dZde
ee eee  f de
eee f de	eee  ee f fddZdee dee defddZde	edf de	edf fddZde	edf de	eeef ee ee f fddZdS )    N)EnumIntEnumunique)DictListSequenceTupleUnion   i'  g 7yACc                   @   s$   e Zd ZdZdZdZdZdZdZdS )_EDIT_OPERATIONSz1Enumerations for the Levenhstein edit operations.insertdelete
substitutenothing	undefinedN)	__name__
__module____qualname____doc__	OP_INSERT	OP_DELETEOP_SUBSTITUTE
OP_NOTHINGOP_UNDEFINED r   r   W/home/ubuntu/.local/lib/python3.10/site-packages/torchmetrics/functional/text/helper.pyr   +   s    r   c                   @   s$   e Zd ZdZdZdZdZdZeZ	dS )_EDIT_OPERATIONS_COSTz6Enumerations for the Levenhstein edit operation costs.   r   N)
r   r   r   r   r   r   r   r   _INT_INFINITYr   r   r   r   r   r   6   s    r   c                   @   sf  e Zd ZdZdee ddfddZdee deeee	df f fd	d
Z
dee dedeeeee	f   deeeeeee	f   ee	df f fddZdedeeeee	f   dee	df fddZdee deeeee	f   ddfddZdee deeeeeee	f   f fddZededeeee	f  fddZededeeee	f  fddZdS )_LevenshteinEditDistancea`  A convenience class for calculating the Levenshtein edit distance, which also caches some intermediate
    values to hasten the calculation.

    The implementation follows the implemenation from
    https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/lib_ter.py, where the most of this implementation
    is adapted and copied from.
    reference_tokensreturnNc                 C   s    || _ t|| _i | _d| _dS )zInitialize _LevenshteinEditDistance object.

        Args:
            reference_tokens:
                A tokenized reference sentence.
        r   N)r    lenreference_lencache
cache_size)selfr    r   r   r   __init__I   s   

z!_LevenshteinEditDistance.__init__prediction_tokens.c                 C   s6   |  |\}}| |||\}}}| || ||fS )a5  Calculate edit distance between self._words_ref and the hypothesis. Uses cache to skip some of the
        computation.

        Args:
            prediction_tokens: A tokenized predicted sentence.

        Return:
            A tuple of a calculated edit distance and a trace of executed operations.
        )_find_cache_levenshtein_edit_distance
_add_cache)r&   r(   start_positioncached_edit_distanceedit_distance_intedit_distancetracer   r   r   __call__V   s   
z!_LevenshteinEditDistance.__call__prediction_startr$   c                    s  t |} fddt|| D }|| }|r j| nd}t|d k r-t|d t nt}t|d |d D ]}	t|	| }
td|
| }|	|krQ jd n	t jd |
| }t||D ]}}|dkr{||	d  | d t	j
 tj
f||	 |< q`||	d   j|d  krt	j}tj}nt	j}tj}||	d  |d  d | |f||	d  | d t	j
 tj
f||	 |d  d t	j tjff}|D ]\}}||	 | d |kr||f||	 |< qq`q8 ||}|d d d |t |d |fS )	a  A dynamic programming algorithm to compute the Levenhstein edit distance.

        Args:
            prediction_tokens: A tokenized predicted sentence.
            prediction_start: An index where a predicted sentence to be considered from.
            cache: A cached Levenshtein edit distance.

        Returns:
            Edit distance between the predicted sentence and the reference sentence
        c                    s   g | ]
}t   jqS r   )list_get_empty_rowr#   .0_r&   r   r   
<listcomp>~   s    zG_LevenshteinEditDistance._levenshtein_edit_distance.<locals>.<listcomp>g      ?   r   r   N)r"   ranger#   _BEAM_WIDTHmathceilfloormaxminr   r   r   r    r   r   r   
_get_trace)r&   r(   r2   r$   prediction_len
empty_rowsr/   length_ratio
beam_widthipseudo_diagmin_jmax_jjcost_substituteoperation_substitute
operationsoperation_costoperation_namer0   r   r8   r   r*   l   sD   

"$"z3_LevenshteinEditDistance._levenshtein_edit_distancerD   r/   c                 C   s   d}|}| j }|dks|dkrP|| | d }|f| }|tjtjfv r-|d8 }|d8 }n|tjkr7|d8 }n|tjkrA|d8 }ntd||dks|dks|S )a?  Get a trace of executed operations from the edit distance matrix.

        Args:
            prediction_len: A length of a tokenized predicted sentence.
            edit_distance:
                A matrix of the Levenshtedin edit distance. The element part of the matrix is a tuple of an edit
                operation cost and an edit operation itself.

        Return:
            A trace of executed operations returned as a tuple of `_EDIT_OPERATIONS` enumerates.

        Raises:
            ValueError:
                If an unknown operation has been applied.
        r   r   r   Unknown operation )r#   r   r   r   r   r   
ValueError)r&   rD   r/   r0   rH   rL   	operationr   r   r   rC      s    





z#_LevenshteinEditDistance._get_tracec           	      C   s   | j tkrdS | j}t|t| }t|D ]
}|||  d }qt||d |D ]\}}||vrAi t|f||< |  j d7  _ || }|d }q*dS )a*  Add newly computed rows to cache. Since edit distance is only calculated on the hypothesis suffix that
        was not in cache, the number of rows in `edit_distance` matrx may be shorter than hypothesis length. In
        that case we skip over these initial words.

        Args:
            prediction_tokens: A tokenized predicted sentence.
            edit_distance:
                A matrix of the Levenshtedin edit distance. The element part of the matrix is a tuple of an edit
                operation cost and an edit operation itself.
        Nr   r   )r%   _MAX_CACHE_SIZEr$   r"   r<   ziptuple)	r&   r(   r/   nodeskip_numrH   wordrowvaluer   r   r   r+      s   

z#_LevenshteinEditDistance._add_cachec                 C   sZ   | j }d}| | jg}|D ]}||v r$|d7 }|| \}}|| q ||fS ||fS )a*  Find the already calculated rows of the Levenshtein edit distance matric.

        Args:
            prediction_tokens: A tokenized predicted sentence.

        Return:
            A tuple of a start hypothesis position and `edit_distance` matrix.

            prediction_start: An index where a predicted sentence to be considered from.
            edit_distance:
                A matrix of the cached Levenshtedin edit distance. The element part of the matrix is a tuple of an edit
                operation cost and an edit operation itself.
        r   r   )r$   _get_initial_rowr#   append)r&   r(   rX   r,   r/   rZ   r[   r   r   r   r)      s   z$_LevenshteinEditDistance._find_cachelengthc                 C   s   t tjtjfg| d  }|S )a  Precomputed empty matrix row for Levenhstein edit distance.

        Args:
            length: A length of a tokenized sentence.

        Return:
            A list of tuples containing infinite edit operation costs and yet undefined edit operations.
        r   )intr   r   r   )r_   	empty_rowr   r   r   r4     s   
z'_LevenshteinEditDistance._get_empty_rowc                 C   s   dd t | d D }|S )a7  First row corresponds to insertion operations of the reference, so we do 1 edit operation per reference
        word.

        Args:
            length: A length of a tokenized sentence.

        Return:
            A list of tuples containing edit operation costs of insert and insert edit operations.
        c                 S   s   g | ]
}|t j tjfqS r   )r   r   r   )r6   rH   r   r   r   r9   &  s    z=_LevenshteinEditDistance._get_initial_row.<locals>.<listcomp>r   )r<   )r_   initial_rowr   r   r   r]     s   z)_LevenshteinEditDistance._get_initial_row)r   r   r   r   r   strr'   r   r`   r   r1   r*   rC   r+   r)   staticmethodr4   r]   r   r   r   r   r   @   s4    &$
C

*%. $r   reference_corpushypothesis_corpusr!   c                 C   s   t |tr|g}tdd | D r"t|dkr| g} ndd | D } |rCtdd | D rCt| t|krCtdt|  dt| | |fS )	a	  Check and update (if needed) the format of reference and hypothesis corpora for various text evaluation
    metrics.

    Args:
        reference_corpus: An iterable of iterables of reference corpus.
        hypothesis_corpus: An iterable of hypothesis corpus.

    Return:
        reference_corpus: An iterable of iterables of reference corpus.
        hypothesis_corpus: An iterable of hypothesis corpus.

    Raises:
        ValueError:
            If length of `reference_corpus` and `hypothesis_corpus` differs.
    c                 s   s    | ]}t |tV  qd S N)
isinstancerc   r6   refr   r   r   	<genexpr>A      z#_validate_inputs.<locals>.<genexpr>r   c                 S   s   g | ]}|gqS r   r   ri   r   r   r   r9   E  s    z$_validate_inputs.<locals>.<listcomp>c                 s   s    | ]}|V  qd S rg   r   ri   r   r   r   rk   G  s    zCorpus has different size z != )rh   rc   allr"   rS   )re   rf   r   r   r   _validate_inputs*  s   
&rn   r(   r    c                    s   fddt t| d D }t t| d D ]}||| d< qt t d D ]}||d |< q(t dt| d D ]I}t dt d D ]=}| |d   |d  krb||d  |d  || |< qEt||d  | || |d  ||d  |d  d || |< qEq:|d d S )a$  Standard dynamic programming algorithm to compute the edit distance.

    Args:
        prediction_tokens: A tokenized predicted sentence
        reference_tokens: A tokenized reference sentence
    Returns:
        Edit distance between the predicted sentence and the reference sentence
    c                    s   g | ]}d gt  d  qS )r   r   )r"   r5   r    r   r   r9   V  s    z"_edit_distance.<locals>.<listcomp>r   r   r;   )r<   r"   rB   )r(   r    dprH   rL   r   ro   r   _edit_distanceM  s   	Brq   r0   .c                    sN   t jt jt jt ji dt dtt t f dt fddt fdd| D }|S )a  Flip the trace of edit operations. Instead of rewriting a->b, get a recipe for rewriting b->a. Simply flips
    insertions and deletions.

    Args:
        trace: A tuple of edit operations.

    Return:
        inverted_trace:
            A tuple of inverted edit operations.
    rT   _flip_operationsr!   c                 S   s   | |v r	| | S | S rg   )get)rT   rr   r   r   r   _replace_operation_or_retaint  s   
z1_flip_trace.<locals>._replace_operation_or_retainc                 3   s    | ]}| V  qd S rg   r   )r6   rT   rr   rt   r   r   rk   {  rl   z_flip_trace.<locals>.<genexpr>)r   r   r   r   rW   )r0   inverted_tracer   ru   r   _flip_traced  s   

rw   c                 C   s   d }}g }g }i }| D ]c}|t jkr*|d7 }|d7 }|||< |d |d q|t jkrF|d7 }|d7 }|||< |d |d q|t jkrU|d7 }|d q|t jkrh|d7 }|||< |d qtd|d|||fS )a  Transform trace of edit operations into an alignment of the sequences.

    Args:
        trace: A trace of edit operations as a tuple of `_EDIT_OPERATIONS` enumerates.

    Return:
        alignments: A dictionary mapping aligned positions between a reference and a hypothesis.
        reference_errors: A list of error positions in a reference.
        hypothesis_errors: A list of error positions in a hypothesis.

    Raises:
        ValueError:
            If an unknown operation is
    r;   r   r   rR   .)r   r   r^   r   r   r   rS   )r0   reference_positionhypothesis_positionreference_errorshypothesis_errors
alignmentsrT   r   r   r   _trace_to_alignment  s4   






r~   )r>   enumr   r   r   typingr   r   r   r   r	   r=   rU   r`   r   rc   r   r   r   rn   rq   rw   r~   r   r   r   r   <module>   s*   

 k
#"8