o
    .wilB                  	   @   s&  d dl Z d dlmZ d dlmZmZ d dlmZ dZdZ	e
dZeG dd	 d	eeZG d
d dZdeee eee  f deeee f deeee  ee f fddZdee dee de
fddZdeedf deedf fddZdeedf deee
e
f ee
 ee
 f fddZdS )    N)Sequence)Enumunique)Union   i'  g 7yACc                   @   s$   e Zd ZdZdZdZdZdZdZdS )_EditOperationsz1Enumerations for the Levenhstein edit operations.insertdelete
substitutenothing	undefinedN)	__name__
__module____qualname____doc__	OP_INSERT	OP_DELETEOP_SUBSTITUTE
OP_NOTHINGOP_UNDEFINED r   r   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/torchmetrics/functional/text/helper.pyr   ,   s    r   c                   @   sn  e Zd ZdZ	d dee dedededdf
d	d
Zdee deeee	df f fddZ
dee dedeeeee	f   deeeeeee	f   ee	df f fddZdedeeeee	f   dee	df fddZdee deeeee	f   ddfddZdee deeeeeee	f   f fddZdedeeee	f  fddZdedeeee	f  fddZdS )!_LevenshteinEditDistancea.  A convenience class for calculating the Levenshtein edit distance.

    Class will cache some intermediate values to hasten the calculation. The implementation follows the implementation
    from https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/lib_ter.py,
    where the most of this implementation is adapted and copied from.

    Args:
        reference_tokens: list of reference tokens
        op_insert: cost of insertion operation
        op_delete: cost of deletion operation
        op_substitute: cost of substitution operation

       reference_tokens	op_insert	op_deleteop_substitutereturnNc                 C   s>   || _ t|| _i | _d| _|| _|| _|| _d| _t	| _
d S )Nr   )r   lenreference_lencache
cache_sizer   r   r   
op_nothing_INT_INFINITYop_undefined)selfr   r   r   r   r   r   r   __init__F   s   

z!_LevenshteinEditDistance.__init__prediction_tokens.c                 C   s6   |  |\}}| |||\}}}| || ||fS )a(  Calculate edit distance between self._words_ref and the hypothesis. Uses cache to skip some computations.

        Args:
            prediction_tokens: A tokenized predicted sentence.

        Return:
            A tuple of a calculated edit distance and a trace of executed operations.

        )_find_cache_levenshtein_edit_distance
_add_cache)r&   r(   start_positioncached_edit_distanceedit_distance_intedit_distancetracer   r   r   __call__U   s   
z!_LevenshteinEditDistance.__call__prediction_startr!   c                    s  t |} fddt|| D }|| }|r j| nd}|d tkr-t|d t nt}t|d |d D ]}	t|	| }
td|
| }|	|krQ jd n	t jd |
| }t||D ]}}|dkr{||	d  | d  j	 t
jf||	 |< q`||	d   j|d  kr j}t
j}n j}t
j}||	d  |d  d | |f||	d  | d  j	 t
jf||	 |d  d  j t
jff}|D ]\}}||	 | d |kr||f||	 |< qq`q8 ||}|d d d |t |d |fS )	a  Dynamic programming algorithm to compute the Levenhstein edit distance.

        Args:
            prediction_tokens: A tokenized predicted sentence.
            prediction_start: An index where a predicted sentence to be considered from.
            cache: A cached Levenshtein edit distance.

        Returns:
            Edit distance between the predicted sentence and the reference sentence

        c                    s   g | ]
}t   jqS r   )list_get_empty_rowr    .0_r&   r   r   
<listcomp>}   s    zG_LevenshteinEditDistance._levenshtein_edit_distance.<locals>.<listcomp>g      ?   r   r   N)r   ranger    _BEAM_WIDTHmathceilfloormaxminr   r   r   r   r#   r   r   r   r   r   
_get_trace)r&   r(   r2   r!   prediction_len
empty_rowsr/   length_ratio
beam_widthipseudo_diagmin_jmax_jjcost_substituteoperation_substitute
operationsoperation_costoperation_namer0   r   r8   r   r*   j   sD   

"$"z3_LevenshteinEditDistance._levenshtein_edit_distancerD   r/   c                 C   s   d}|}| j }|dks|dkrQ|| | d }|g|R }|tjtjfv r.|d8 }|d8 }n|tjkr8|d8 }n|tjkrB|d8 }ntd||dks|dks|S )a@  Get a trace of executed operations from the edit distance matrix.

        Args:
            prediction_len: A length of a tokenized predicted sentence.
            edit_distance:
                A matrix of the Levenshtedin edit distance. The element part of the matrix is a tuple of an edit
                operation cost and an edit operation itself.

        Return:
            A trace of executed operations returned as a tuple of `_EDIT_OPERATIONS` enumerates.

        Raises:
            ValueError:
                If an unknown operation has been applied.

        r   r   r   Unknown operation )r    r   r   r   r   r   
ValueError)r&   rD   r/   r0   rH   rL   	operationr   r   r   rC      s    




z#_LevenshteinEditDistance._get_tracec           	      C   s   | j tkrdS | j}t|t| }t|D ]
}|||  d }qt||d |D ]\}}||vrAi t|f||< |  j d7  _ || }|d }q*dS )a,  Add newly computed rows to cache.

        Since edit distance is only calculated on the hypothesis suffix that was not in cache, the number of rows in
        `edit_distance` matrx may be shorter than hypothesis length. In that case we skip over these initial words.

        Args:
            prediction_tokens: A tokenized predicted sentence.
            edit_distance:
                A matrix of the Levenshtedin edit distance. The element part of the matrix is a tuple of an edit
                operation cost and an edit operation itself.

        Nr   r   )r"   _MAX_CACHE_SIZEr!   r   r<   ziptuple)	r&   r(   r/   nodeskip_numrH   wordrowvaluer   r   r   r+      s   

z#_LevenshteinEditDistance._add_cachec                 C   sZ   | j }d}| | jg}|D ]}||v r$|d7 }|| \}}|| q ||fS ||fS )a+  Find the already calculated rows of the Levenshtein edit distance metric.

        Args:
            prediction_tokens: A tokenized predicted sentence.

        Return:
            A tuple of a start hypothesis position and `edit_distance` matrix.

            prediction_start: An index where a predicted sentence to be considered from.
            edit_distance:
                A matrix of the cached Levenshtedin edit distance. The element part of the matrix is a tuple of an edit
                operation cost and an edit operation itself.

        r   r   )r!   _get_initial_rowr    append)r&   r(   rX   r,   r/   rZ   r[   r   r   r   r)      s   z$_LevenshteinEditDistance._find_cachelengthc                 C   s   t | jtjfg|d  S )a  Precomputed empty matrix row for Levenhstein edit distance.

        Args:
            length: A length of a tokenized sentence.

        Return:
            A list of tuples containing infinite edit operation costs and yet undefined edit operations.

        r   )intr%   r   r   r&   r_   r   r   r   r4        
z'_LevenshteinEditDistance._get_empty_rowc                    s    fddt |d D S )a*  First row corresponds to insertion operations of the reference, so 1 edit operation per reference word.

        Args:
            length: A length of a tokenized sentence.

        Return:
            A list of tuples containing edit operation costs of insert and insert edit operations.

        c                    s   g | ]
}| j  tjfqS r   )r   r   r   )r6   rH   r8   r   r   r9   '  s    z=_LevenshteinEditDistance._get_initial_row.<locals>.<listcomp>r   )r<   ra   r   r8   r   r]     rb   z)_LevenshteinEditDistance._get_initial_row)r   r   r   )r   r   r   r   r3   strr`   r'   rW   r   r1   r*   rC   r+   r)   r4   r]   r   r   r   r   r   7   sF    
&$
D

*&.!"r   
ref_corpushypothesis_corpusr   c                 C   s   t |tr|g}tdd | D r!t|dkr| gndd | D } |rBtdd | D rBt| t|krBtdt|  dt| | |fS )	a  Check and update (if needed) the format of reference and hypothesis corpora for various text evaluation metrics.

    Args:
        ref_corpus: An iterable of iterables of reference corpus.
        hypothesis_corpus: An iterable of hypothesis corpus.

    Return:
        ref_corpus: An iterable of iterables of reference corpus.
        hypothesis_corpus: An iterable of hypothesis corpus.

    Raises:
        ValueError:
            If length of `ref_corpus` and `hypothesis_corpus` differs.

    c                 s   s    | ]}t |tV  qd S N)
isinstancerc   r6   refr   r   r   	<genexpr>A      z#_validate_inputs.<locals>.<genexpr>r   c                 S   s   g | ]}|gqS r   r   rh   r   r   r   r9   B  s    z$_validate_inputs.<locals>.<listcomp>c                 s   s    | ]}|V  qd S rf   r   rh   r   r   r   rj   D  s    zCorpus has different size z != )rg   rc   allr   rS   )rd   re   r   r   r   _validate_inputs*  s   
 &rm   r(   r   c                    s   fddt t| d D }t t| d D ]}||| d< qt t d D ]}||d |< q(t dt| d D ]I}t dt d D ]=}| |d   |d  krb||d  |d  || |< qEt||d  | || |d  ||d  |d  d || |< qEq:|d d S )a  Dynamic programming algorithm to compute the edit distance.

    Args:
        prediction_tokens: A tokenized predicted sentence
        reference_tokens: A tokenized reference sentence
    Returns:
        Edit distance between the predicted sentence and the reference sentence

    c                    s   g | ]}d gt  d  qS )r   r   )r   r5   r   r   r   r9   T  s    z"_edit_distance.<locals>.<listcomp>r   r   r;   )r<   r   rB   )r(   r   dprH   rL   r   rn   r   _edit_distanceJ  s   
Brp   r0   .c                    sJ   t jt jt jt ji dt dtt t f dt fddt fdd| D S )a  Flip the trace of edit operations.

    Instead of rewriting a->b, get a recipe for rewriting b->a. Simply flips insertions and deletions.

    Args:
        trace: A tuple of edit operations.

    Return:
        inverted_trace:
            A tuple of inverted edit operations.

    rT   _flip_operationsr   c                 S   s   | |v r	| | S | S rf   )get)rT   rq   r   r   r   _replace_operation_or_retaint  s   
z1_flip_trace.<locals>._replace_operation_or_retainc                 3   s    | ]}| V  qd S rf   r   )r6   rT   rq   rs   r   r   rj   {  rk   z_flip_trace.<locals>.<genexpr>)r   r   r   dictrW   )r0   r   rt   r   _flip_traceb  s   

rv   c                 C   s   d }}g }g }i }| D ]c}|t jkr*|d7 }|d7 }|||< |d |d q|t jkrF|d7 }|d7 }|||< |d |d q|t jkrU|d7 }|d q|t jkrh|d7 }|||< |d qtd|d|||fS )a  Transform trace of edit operations into an alignment of the sequences.

    Args:
        trace: A trace of edit operations as a tuple of `_EDIT_OPERATIONS` enumerates.

    Return:
        alignments: A dictionary mapping aligned positions between a reference and a hypothesis.
        reference_errors: A list of error positions in a reference.
        hypothesis_errors: A list of error positions in a hypothesis.

    Raises:
        ValueError:
            If an unknown operation is

    r;   r   r   rR   .)r   r   r^   r   r   r   rS   )r0   reference_positionhypothesis_positionreference_errorshypothesis_errors
alignmentsrT   r   r   r   _trace_to_alignment~  s4   






r}   )r>   collections.abcr   enumr   r   typingr   r=   rU   r`   r$   rc   r   r   rW   rm   r3   rp   rv   ru   r}   r   r   r   r   <module>   s*   
 t
 "8