o
    yiD                     @   sb  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	 d dl
mZmZmZ d dlmZ d dlmZ deded	efd
dZ				d2dedededededed	efddZded	efddZded	efddZdee d	efddZd e	eee f d!ee	eee f  d"e	ed# ed$ f d	ee	eee f ee	eee f  f fd%d&Z				d2ded'e	eee f dedededed	efd(d)Z	#					d3d e	eee f d!ee	eee f  d"ed* dedededed+eee  d	ee fd,d-Z	#	.				d4d e	eee f d!ee	eee f  d"ed* d/ededededed	e	eeeef f fd0d1ZdS )5    N)inf)ListOptionalSequenceTupleUnion)Tensorstacktensor)Literal)_validate_inputs
preds_wordtarget_wordreturnc                 C   s   t | |kS )a,  Distance measure used for substitutions/identity operation. Code adapted from
    https://github.com/rwth-i6/ExtendedEditDistance/blob/master/EED.py.

    Args:
        preds_word: hypothesis word string
        target_word: reference word string

    Return:
        0 for match, 1 for no match
    )int)r   r    r   T/home/ubuntu/.local/lib/python3.10/site-packages/torchmetrics/functional/text/eed.py_distance_between_wordsd   s   r          @333333?皙?      ?hyprefalpharhodeletion	insertionc              
      sh  dgt | d  }dgt | d  }d|d< tgt | d  }tdt |d D ]o}	tdt | d D ]2}
|
dkr]t||
d  | ||
d  t| |
d  ||	d   ||
 | ||
< q3||
 d ||
< q3|t|}||  d7  < ||	d  dkr|||    fdd|D }|}tgt | d  }q(|td	d
 |D  }td|d | tt ||  S )a9  Computes extended edit distance score for two lists of strings: hyp and ref.

    Code adapted from: https://github.com/rwth-i6/ExtendedEditDistance/blob/master/EED.py.

    Args:
        hyp: A hypothesis string
        ref: A reference string
        alpha: optimal jump penalty, penalty for jumps between characters
        rho: coverage cost, penalty for repetition of characters
        deletion: penalty for deletion of character
        insertion: penalty for insertion or substitution of character

    Return:
        Extended edit distance score as float
       r           r    c                    s   g | ]}t | qS r   )min.0xjumpr   r   
<listcomp>   s    z!_eed_function.<locals>.<listcomp>c                 s   s     | ]}|d kr
|ndV  qdS )r   r   Nr   r#   r   r   r   	<genexpr>   s    z _eed_function.<locals>.<genexpr>)lenr   ranger"   r   indexsumfloat)r   r   r   r   r   r   number_of_visitsrownext_rowwi	min_indexcoverager   r&   r   _eed_functionr   s,   $

"r6   sentencec                 C   s   t | tstdt|  d|  } g d}|D ]
\}}| ||} qg d}|D ]\}}t||| } q*g d}|D ]
\}}| ||} q<d|  d } | S )Copied from https://github.com/rwth-i6/ExtendedEditDistance/blob/master/util.py.

    Raises:
        ValueError: If input sentence is not of a type `str`.
    6Only strings allowed during preprocessing step, found  instead)).z .)!z !)?z ?),z ,))z\s+r!   )z(\d) ([.,]) (\d)z\1\2\3)z#(Dr|Jr|Prof|Rev|Gen|Mr|Mt|Mrs|Ms) .z\1.))ze . g .ze.g.)zi . e .zi.e.)zU . S .zU.S.r!   )
isinstancestr
ValueErrortyperstripreplaceresub)r7   rules_interpunctionpatternreplacementrules_rer   r   r   _preprocess_en   s   
rK   c                 C   s6   t | tstdt|  d|  } td| } | S )r8   r9   r:   NFKC)r?   r@   rA   rB   rC   unicodedata	normalize)r7   r   r   r   _preprocess_ja   s
   
rO   sentence_level_scoresc                 C   s,   t | dkr
tdS t| tt |  }|S )zFinal step in extended edit distance.

    Args:
        sentence_level_scores: list of sentence-level scores as floats

    Return:
        average of scores as a tensor
    r   r    )r*   r
   r-   )rP   averager   r   r   _eed_compute   s   	rR   predstargetlanguageenjac                    sf   t | |d\}} |dkrt n|dkrt ntd|  fdd| D }  fdd|D }| |fS )at  Preprocess strings according to language requirements.

    Args:
        preds: An iterable of hypothesis corpus.
        target: An iterable of iterables of reference corpus.
        language: Language used in sentences. Only supports English (en) and Japanese (ja) for now. Defaults to en

    Return:
        Tuple of lists that contain the cleaned strings for target and preds

    Raises:
        ValueError: If a different language than ``'en'`` or ``'ja'`` is used
        ValueError: If length of target not equal to length of preds
        ValueError: If objects in reference and hypothesis corpus are not strings
    )hypothesis_corpusreference_corpusrV   rW   z?Expected argument `language` to either be `en` or `ja` but got c                       g | ]} |qS r   r   )r$   predpreprocess_functionr   r   r(         z)_preprocess_sentences.<locals>.<listcomp>c                    s   g | ]} fd d|D qS )c                    rZ   r   r   )r$   r   r\   r   r   r(     r^   z4_preprocess_sentences.<locals>.<listcomp>.<listcomp>r   )r$   	referencer\   r   r   r(     s    )r   rK   rO   rA   )rS   rT   rU   r   r\   r   _preprocess_sentences   s   r`   target_wordsc           	      C   s4   t }|D ]}t| |||||}||k r|}qt|S )a  Compute scores for ExtendedEditDistance.

    Args:
        target_words: An iterable of reference words
        preds_word: A hypothesis word
        alpha: An optimal jump penalty, penalty for jumps between characters
        rho: coverage cost, penalty for repetition of characters
        deletion: penalty for deletion of character
        insertion: penalty for insertion or substitution of character

    Return:
        best_score: best (lowest) sentence-level score as a Tensor
    )r   r6   r
   )	r   ra   r   r   r   r   
best_scorer_   scorer   r   r   _compute_sentence_statistics  s   rd   )rV   rW   sentence_eedc                 C   sl   t | ||\} }|du rg }dt| t|d fv r|S t| |D ]\}}	t||	||||}
||
 q!|S )a  Compute scores for ExtendedEditDistance.

    Args:
        preds: An iterable of hypothesis corpus
        target: An iterable of iterables of reference corpus
        language: Language used in sentences. Only supports English (en) and Japanese (ja) for now. Defaults to en
        alpha: optimal jump penalty, penalty for jumps between characters
        rho: coverage cost, penalty for repetition of characters
        deletion: penalty for deletion of character
        insertion: penalty for insertion or substitution of character
        sentence_eed: list of sentence-level scores

    Return:
        individual sentence scores as a list of Tensors
    Nr   )r`   r*   ziprd   append)rS   rT   rU   r   r   r   r   re   
hypothesisra   rc   r   r   r   _eed_update<  s   ri   Freturn_sentence_level_scorec                 C   s|   t g d||||gD ]\}}	t|	trt|	tr%|	dk r%td| dqt| ||||||}
t|
}|r<|t|
fS |S )uR  Computes extended edit distance score (`ExtendedEditDistance`_) [1] for strings or list of strings. The
    metric utilises the Levenshtein distance and extends it by adding a jump operation.

    Args:
        preds: An iterable of hypothesis corpus.
        target: An iterable of iterables of reference corpus.
        language: Language used in sentences. Only supports English (en) and Japanese (ja) for now. Defaults to en
        return_sentence_level_score: An indication of whether sentence-level EED score is to be returned.
        alpha: optimal jump penalty, penalty for jumps between characters
        rho: coverage cost, penalty for repetition of characters
        deletion: penalty for deletion of character
        insertion: penalty for insertion or substitution of character

    Return:
        Extended edit distance score as a tensor

    Example:
        >>> from torchmetrics.functional import extended_edit_distance
        >>> preds = ["this is the prediction", "here is an other sample"]
        >>> target = ["this is the reference", "here is another one"]
        >>> extended_edit_distance(preds=preds, target=target)
        tensor(0.3078)

    References:
        [1] P. Stanchev, W. Wang, and H. Ney, “EED: Extended Edit Distance Measure for Machine Translation”,
        submitted to WMT 2019. `ExtendedEditDistance`_
    )r   r   r   r   r   zParameter `z)` is expected to be a non-negative float.)rf   r?   r.   rA   ri   rR   r	   )rS   rT   rU   rj   r   r   r   r   
param_nameparamrP   rQ   r   r   r   extended_edit_distancee  s   &rm   )r   r   r   r   )rV   r   r   r   r   N)rV   Fr   r   r   r   )rE   rM   mathr   typingr   r   r   r   r   torchr   r	   r
   typing_extensionsr   #torchmetrics.functional.text.helperr   r@   r   r   r.   r6   rK   rO   rR   r`   rd   ri   boolrm   r   r   r   r   <module>   s   X
;,&
(
"
	
,	