o
    .wiD                     @   sZ  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	 d dl
mZmZmZ d dlmZ d dlmZ ded	ed
efddZ				d2dedededededed
efddZded
efddZded
efddZdee d
efdd Zd!e	eee f d"ee	eee f  d#ed$ d
ee	eee f ee	eee f  f fd%d&Z				d2ded'e	eee f dedededed
efd(d)Z	*					d3d!e	eee f d"ee	eee f  d#ed$ dedededed+eee  d
ee fd,d-Z	*	.				d4d!e	eee f d"ee	eee f  d#ed$ d/ededededed
e	eeeef f fd0d1ZdS )5    N)Sequence)inf)ListOptionalUnion)Tensorstacktensor)Literal)_validate_inputs
preds_wordtarget_wordreturnc                 C   s   t | |kS )a.  Distance measure used for substitutions/identity operation.

    Code adapted from https://github.com/rwth-i6/ExtendedEditDistance/blob/master/EED.py.

    Args:
        preds_word: hypothesis word string
        target_word: reference word string

    Return:
        0 for match, 1 for no match

    )int)r   r    r   ]/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/torchmetrics/functional/text/eed.py_distance_between_wordse   s   r          @333333?皙?      ?hyprefalpharhodeletion	insertionc              
      sf  dgt | d  }dgt | d  }d|d< tgt | d  }tdt |d D ]n}	tt | d D ]2}
|
dkr\t||
d  | ||
d  t| |
d  ||	d   ||
 | ||
< q2||
 d ||
< q2|t|}||  d7  < ||	d  dkr|||    fdd|D }|}tgt | d  }q(|td	d
 |D  }td|d | tt ||  S )a8  Compute extended edit distance score for two lists of strings: hyp and ref.

    Code adapted from: https://github.com/rwth-i6/ExtendedEditDistance/blob/master/EED.py.

    Args:
        hyp: A hypothesis string
        ref: A reference string
        alpha: optimal jump penalty, penalty for jumps between characters
        rho: coverage cost, penalty for repetition of characters
        deletion: penalty for deletion of character
        insertion: penalty for insertion or substitution of character

    Return:
        Extended edit distance score as float
       r           r    c                    s   g | ]}t | qS r   )min.0xjumpr   r   
<listcomp>   s    z!_eed_function.<locals>.<listcomp>c                 s   s     | ]}|d kr
|ndV  qdS )r   r   Nr   r"   r   r   r   	<genexpr>   s    z _eed_function.<locals>.<genexpr>)lenr   ranger!   r   indexsumfloat)r   r   r   r   r   r   number_of_visitsrownext_rowwi	min_indexcoverager   r%   r   _eed_functionu   s,   $

"r5   sentencec                 C   s   t | tstdt|  d|  } g d}|D ]
\}}| ||} qg d}|D ]\}}t||| } q*g d}|D ]
\}}| ||} q<d|  d S )zPreprocess english sentences.

    Copied from https://github.com/rwth-i6/ExtendedEditDistance/blob/master/util.py.

    Raises:
        ValueError: If input sentence is not of a type `str`.

    6Only strings allowed during preprocessing step, found  instead)).z .)!z !)?z ?),z ,))z\s+r    )z(\d) ([.,]) (\d)z\1\2\3)z#(Dr|Jr|Prof|Rev|Gen|Mr|Mt|Mrs|Ms) .z\1.))ze . g .ze.g.)zi . e .zi.e.)zU . S .zU.S.r    )
isinstancestr
ValueErrortyperstripreplaceresub)r6   rules_interpunctionpatternreplacementrules_rer   r   r   _preprocess_en   s   
	rI   c                 C   s2   t | tstdt|  d|  } td| S )zPreprocess japanese sentences.

    Copy from https://github.com/rwth-i6/ExtendedEditDistance/blob/master/util.py.

    Raises:
        ValueError: If input sentence is not of a type `str`.

    r7   r8   NFKC)r=   r>   r?   r@   rA   unicodedata	normalize)r6   r   r   r   _preprocess_ja   s   
	rM   sentence_level_scoresc                 C   s(   t | dkr
tdS t| tt |  S )zReduction for extended edit distance.

    Args:
        sentence_level_scores: list of sentence-level scores as floats

    Return:
        average of scores as a tensor

    r   r   )r)   r	   r,   )rN   r   r   r   _eed_compute   s   
rO   predstargetlanguage)enjac                    sf   t | |d\}} |dkrt n|dkrt ntd|  fdd| D }  fdd|D }| |fS )au  Preprocess strings according to language requirements.

    Args:
        preds: An iterable of hypothesis corpus.
        target: An iterable of iterables of reference corpus.
        language: Language used in sentences. Only supports English (en) and Japanese (ja) for now. Defaults to en

    Return:
        Tuple of lists that contain the cleaned strings for target and preds

    Raises:
        ValueError: If a different language than ``'en'`` or ``'ja'`` is used
        ValueError: If length of target not equal to length of preds
        ValueError: If objects in reference and hypothesis corpus are not strings

    )hypothesis_corpus
ref_corpusrS   rT   z?Expected argument `language` to either be `en` or `ja` but got c                       g | ]} |qS r   r   )r#   predpreprocess_functionr   r   r'         z)_preprocess_sentences.<locals>.<listcomp>c                    s   g | ]} fd d|D qS )c                    rW   r   r   )r#   r   rY   r   r   r'     r[   z4_preprocess_sentences.<locals>.<listcomp>.<listcomp>r   )r#   	referencerY   r   r   r'     s    )r   rI   rM   r?   )rP   rQ   rR   r   rY   r   _preprocess_sentences   s   r]   target_wordsc           	      C   s4   t }|D ]}t| |||||}||k r|}qt|S )a  Compute scores for ExtendedEditDistance.

    Args:
        target_words: An iterable of reference words
        preds_word: A hypothesis word
        alpha: An optimal jump penalty, penalty for jumps between characters
        rho: coverage cost, penalty for repetition of characters
        deletion: penalty for deletion of character
        insertion: penalty for insertion or substitution of character

    Return:
        best_score: best (lowest) sentence-level score as a Tensor

    )r   r5   r	   )	r   r^   r   r   r   r   
best_scorer\   scorer   r   r   _compute_sentence_statistics#  s   ra   rS   sentence_eedc                 C   sl   t | ||\} }|du rg }dt| t|d fv r|S t| |D ]\}}	t||	||||}
||
 q!|S )a  Compute scores for ExtendedEditDistance.

    Args:
        preds: An iterable of hypothesis corpus
        target: An iterable of iterables of reference corpus
        language: Language used in sentences. Only supports English (en) and Japanese (ja) for now. Defaults to en
        alpha: optimal jump penalty, penalty for jumps between characters
        rho: coverage cost, penalty for repetition of characters
        deletion: penalty for deletion of character
        insertion: penalty for insertion or substitution of character
        sentence_eed: list of sentence-level scores

    Return:
        individual sentence scores as a list of Tensors

    Nr   )r]   r)   zipra   append)rP   rQ   rR   r   r   r   r   rb   
hypothesisr^   r`   r   r   r   _eed_updateC  s   rf   Freturn_sentence_level_scorec                 C   s|   t g d||||gD ]\}}	t|	trt|	tr%|	dk r%td| dqt| ||||||}
t|
}|r<|t|
fS |S )uX  Compute extended edit distance score (`ExtendedEditDistance`_) [1] for strings or list of strings.

    The metric utilises the Levenshtein distance and extends it by adding a jump operation.

    Args:
        preds: An iterable of hypothesis corpus.
        target: An iterable of iterables of reference corpus.
        language: Language used in sentences. Only supports English (en) and Japanese (ja) for now. Defaults to en
        return_sentence_level_score: An indication of whether sentence-level EED score is to be returned.
        alpha: optimal jump penalty, penalty for jumps between characters
        rho: coverage cost, penalty for repetition of characters
        deletion: penalty for deletion of character
        insertion: penalty for insertion or substitution of character

    Return:
        Extended edit distance score as a tensor

    Example:
        >>> from torchmetrics.functional.text import extended_edit_distance
        >>> preds = ["this is the prediction", "here is an other sample"]
        >>> target = ["this is the reference", "here is another one"]
        >>> extended_edit_distance(preds=preds, target=target)
        tensor(0.3078)

    References:
        [1] P. Stanchev, W. Wang, and H. Ney, “EED: Extended Edit Distance Measure for Machine Translation”,
        submitted to WMT 2019. `ExtendedEditDistance`_

    )r   r   r   r   r   zParameter `z)` is expected to be a non-negative float.)rc   r=   r-   r?   rf   rO   r   )rP   rQ   rR   rg   r   r   r   r   
param_nameparamrN   averager   r   r   extended_edit_distancem  s   (rk   )r   r   r   r   )rS   r   r   r   r   N)rS   Fr   r   r   r   ) rC   rK   collections.abcr   mathr   typingr   r   r   torchr   r   r	   typing_extensionsr
   #torchmetrics.functional.text.helperr   r>   r   r   r-   r5   rI   rM   rO   tupler]   ra   rf   boolrk   r   r   r   r   <module>   s   X
:-&
)
#
	
-	