o
    yipe                  *   @   s  d dl mZ d dlmZmZmZmZmZmZ d dl	Z	d dl	m
Z
mZ d dlmZ edZedZded	ed
eeee
f eee
f eee
f eee
f eee
f eee
f f fddZdeded
ee fddZded
ee fddZded
ee fddZdee ded
eeeeedf e
f f fddZdeded	ededed
eeeeeedf e
f f eeeeedf e
f f eee
f eee
f f fddZdeeeeedf e
f f deeeeedf e
f f d
eee
f fd d!Zd"eee
f d#eee
f d
eee
f fd$d%Zd&eee
f d'eee
f d(eee
f d)eee
f d*eee
f d+eee
f d,ed-ed
e
fd.d/Zd0ee d1eeeeedf e
f f d2eeeeedf e
f f d3eee
f d4eee
f ded	ed,ed-ededed
ee
eee
f eee
f eee
f eee
f f fd5d6Z	dKd7eeee f d8eee eee  f d9eee
f d:eee
f d;eee
f d<eee
f d=eee
f d>eee
f ded	ed,ed-ededed?eee
  d
eeee
f eee
f eee
f eee
f eee
f eee
f eee
  f f d@dAZd9eee
f d:eee
f d;eee
f d<eee
f d=eee
f d>eee
f d,ed-ed
e
fdBdCZ 	D	E	F	G	G	GdLd7eeee f d8eeeee f  ded	ed-edededHed
ee
ee
e
f f fdIdJZ!dS )M    defaultdict)DictListOptionalSequenceTupleUnionN)Tensortensor)_validate_inputsgؗҜ<z !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~n_char_ordern_word_orderreturnc                 C   s|   dd t | D }dd t |D }dd t | D }dd t |D }dd t | D }dd t |D }||||||fS )a~  Prepare dictionaries dictionaries with default zero values for total reference, hypothesis and matching
    character and word n-grams.

    Args:
        n_char_order: A character n-gram order.
        n_word_order: A word n-gram order.

    Return:
        Dictionaries with default zero values for total reference, hypothesis and matching character and word
        n-grams.
    c                 S      i | ]	}|d  t dqS            r   .0n r   U/home/ubuntu/.local/lib/python3.10/site-packages/torchmetrics/functional/text/chrf.py
<dictcomp>@       z*_prepare_n_grams_dicts.<locals>.<dictcomp>c                 S   r   r   r   r   r   r   r   r   A   r   c                 S   r   r   r   r   r   r   r   r   B   r   c                 S   r   r   r   r   r   r   r   r   C   r   c                 S   r   r   r   r   r   r   r   r   D   r   c                 S   r   r   r   r   r   r   r   r   E   r   )range)r   r   total_preds_char_n_gramstotal_preds_word_n_gramstotal_target_char_n_gramstotal_target_word_n_gramstotal_matching_char_n_gramstotal_matching_word_n_gramsr   r   r   _prepare_n_grams_dicts0   s   r#   sentence
whitespacec                 C   s    |rt | S t |  ddS )zSplit sentence into individual characters.

    Args:
        sentence: An input sentence to split.
        whitespace: An indication whether to keep whitespaces during character n-gram extraction.

    Return:
        A list of separated characters.
      )liststripreplace)r$   r%   r   r   r   _get_charactersQ   s   
r+   wordc                 C   sX   t | dkr	| gS | d tv r| dd | d gS | d tv r)| d | dd gS | gS )aw  
    Separates out punctuations from beginning and end of words for chrF. Adapted from https://github.com/m-popovic/chrF
    and https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/chrf.py.
    Args:
        word: An input word to be separated from a punctuation if present.

    Return:
        A list of a single word or a separated word and punctuation.
    r   Nr   )len_PUNCTUATIONS)r,   r   r   r   _separate_word_and_punctiation`   s   
r0   c                 C   s   t dd |   D g S )zSeparates out punctuations from beginning and end of words for chrF for all words in the sentence.

    Args:
        sentence: An input sentence to split

    Return:
        An aggregated list of separated words and punctuations.
    c                 s   s    | ]}t |V  qd S N)r0   )r   r,   r   r   r   	<genexpr>}   s    z-_get_words_and_punctiation.<locals>.<genexpr>)sumr)   split)r$   r   r   r   _get_words_and_punctiationt   s   	r5   char_or_word_listn_gram_order.c                    sf   t dd }td|d D ]# fddtt  d D D ]}| |  td7  < q!q|S )z
    Args:
        char_or_word_list: A list of characters of words
        n_gram_order: The largest number of n-gram.

    Return:
        A dictionary of dictionaries with a counts of given n-grams.
    c                   S   s   t dd S )Nc                   S      t dS Nr   r   r   r   r   r   <lambda>       z1_ngram_counts.<locals>.<lambda>.<locals>.<lambda>r   r   r   r   r   r:      s    z_ngram_counts.<locals>.<lambda>r   c                 3   s$    | ]}t  ||  V  qd S r1   )tuple)r   ir6   r   r   r   r2      s   " z _ngram_counts.<locals>.<genexpr>)r   r   r.   r   )r6   r7   ngramsngramr   r>   r   _ngram_counts   s   	(rA   	lowercasec                    s   dt dtdtdtdtttttt df tf f ttttt df tf f f f
 fdd}d	ttttt df tf f dtttf fd
d}|| |||\}}||}	||}
|||	|
fS )a  
    Args:
        sentence: An input sentence
        n_char_order: A character n-gram order.
        n_word_order: A word n-gram order.
        lowercase: An indication whether to enable case-insensitivity.
        whitespace: An indication whether to keep whitespaces during character n-gram extraction.

    Return:
        char_n_grams_counts: A dictionary of dictionaries with sentence character n-grams.
        word_n_grams_counts: A dictionary of dictionaries with sentence word n-grams.
        total_char_n_grams: A dictionary containing a total number of sentence character n-grams.
        total_word_n_grams: A dictionary containing a total number of sentence word n-grams.
    r$   r   r   rB   r   .c                    s2   |r|   } tt|  |}tt| |}||fS )z@Get a dictionary of dictionaries with a counts of given n-grams.)lowerrA   r+   r5   )r$   r   r   rB   char_n_grams_countsword_n_grams_countsr%   r   r   _char_and_word_ngrams_counts   s
   zJ_get_n_grams_counts_and_total_ngrams.<locals>._char_and_word_ngrams_countsn_grams_countsc                 S   s2   t dd }| D ]}tt| |  ||< q|S )z.Get total sum of n-grams over n-grams w.r.t n.c                   S   r8   r9   r   r   r   r   r   r:      r;   zQ_get_n_grams_counts_and_total_ngrams.<locals>._get_total_ngrams.<locals>.<lambda>)r   r   r3   values)rH   total_n_gramsr   r   r   r   _get_total_ngrams   s   z?_get_n_grams_counts_and_total_ngrams.<locals>._get_total_ngrams)strintboolr   r   r
   )r$   r   r   rB   r%   rG   rK   rD   rE   total_char_n_gramstotal_word_n_gramsr   rF   r   $_get_n_grams_counts_and_total_ngrams   s$   :2
rQ   hyp_n_grams_countsref_n_grams_countsc                    s@   t dd } D ]tt fdd  D |< q|S )zGet a number of n-gram matches between reference and hypothesis n-grams.

    Args:
        hyp_n_grams_counts:
        ref_n_grams_counts:

    Return:
        matching_n_grams
    c                   S   r8   r9   r   r   r   r   r   r:      r;   z$_get_ngram_matches.<locals>.<lambda>c                 3   s,    | ]}t  |   | V  qd S r1   )torchmin)r   n_gramrR   r   rS   r   r   r2      s
    
z%_get_ngram_matches.<locals>.<genexpr>)r   r   r3   )rR   rS   matching_n_gramsr   rW   r   _get_ngram_matches   s   
rY   rJ   n_gramsc                 C   s"   |D ]}| |  || 7  < q| S )aA  Aggregate total n-grams to keep corpus-level statistics.

    Args:
        total_n_grams: A dictionary containing a total corpus-level number of n-grams.
        n_grams: A dictionary containing a sentence-level number of n-grams.

    Return:
        A dictionary containing a total corpus-level number of n-grams.
    r   )rJ   rZ   r   r   r   r   _sum_over_dicts   s   
r[   matching_char_n_gramsmatching_word_n_gramshyp_char_n_gramshyp_word_n_gramsref_char_n_gramsref_word_n_gramsn_orderbetac                 C   s~   dt ttf dt ttf dt ttf dtdt ttf f
dd}|| |||}	|||||}
t|	 t|
  t| }|S )a  Calculate sentence-level chrF/chrF++ score.

    For given hypothesis and reference statistics (either sentence-level or corpus-level)
    the chrF/chrF++ score is returned.

    Args:
        matching_char_n_grams:
            A total number of matching character n-grams between the best matching reference and hypothesis.
        matching_word_n_grams:
            A total number of matching word n-grams between the best matching reference and hypothesis.
        hyp_char_n_grams: A total number of hypothesis character n-grams.
        hyp_word_n_grams: A total number of hypothesis word n-grams.
        ref_char_n_grams: A total number of reference character n-grams.
        ref_word_n_grams: A total number of reference word n-grams.
        n_order: A sum of character and word n-gram order.
        beta: A parameter determining an importance of recall w.r.t. precision. If `beta=1`, their importance is equal.

    Return:
        A chrF/chrF++ score. This function is universal both for sentence-level and corpus-level calucation.
    rX   ref_n_gramshyp_n_gramsrc   r   c                    sZ   fddD fddD  fddD  fddD }|S )zGet n-gram level f-score.c                    s2   i | ]}| | d kr|  |  nt dqS r   r   r   r   )re   rX   r   r   r         &zA_calculate_fscore.<locals>._get_n_gram_fscore.<locals>.<dictcomp>c                    s2   i | ]}|| d kr | |  nt dqS rf   r   r   )rX   rd   r   r   r     rg   c                    s.   i | ]}|t  d  |  |  tqS )   )rT   max_EPS_SMOOTHINGr   )rc   	precisionrecallr   r   r     s    "c                    s2   i | ]}|d  d  |  |  |  qS )r   rh   r   r   )rc   denominatorrk   rl   r   r   r     rg   r   )rX   rd   re   rc   f_scorer   )rc   rm   re   rX   rk   rl   rd   r   _get_n_gram_fscore  s   z-_calculate_fscore.<locals>._get_n_gram_fscore)r   rM   r
   floatr3   rI   r   )r\   r]   r^   r_   r`   ra   rb   rc   ro   char_n_gram_f_scoreword_n_gram_f_scorern   r   r   r   _calculate_fscore   s   




 rs   targetspred_char_n_grams_countspred_word_n_grams_countspreds_char_n_gramspreds_word_n_gramsc              
   C   s   t d}tdd }tdd }tdd }tdd }| D ]1}t||||	|
\}}}}t||}t||}t||||||||}||krO|}|}|}|}|}q|||||fS )aT  Calculate the best sentence-level chrF/chrF++ score.

    For a given pre-processed hypothesis, all references are evaluated and score and statistics
    for the best matching reference is returned.

    Args:
        targets: An iterable of references.
        preds_char_n_grams_counts: A dictionary of dictionaries with hypothesis character n-grams.
        preds_word_n_grams_counts: A dictionary of dictionaries with hypothesis word n-grams.
        pred_char_n_grams: A total number of hypothesis character n-grams.
        pred_word_n_grams: A total number of hypothesis word n-grams.
        n_char_order: A character n-gram order.
        n_word_order: A word n-gram order.
        n_order: A sum of character and word n-gram order.
        beta: A parameter determining an importance of recall w.r.t. precision. If `beta=1`, their importance is equal.
        lowercase: An indication whether to enable case-insensitivity.
        whitespace: An indication whether to keep whitespaces during character n-gram extraction.

    Return:
        Return chrF/chrF++ score and statistics for the best matching hypothesis and reference.

        f_score: A sentence-level chrF/chrF++ score.
        matching_char_n_grams:
            A total number of matching character n-grams between the best matching reference and hypothesis.
        matching_word_n_grams:
            A total number of matching word n-grams between the best matching reference and hypothesis.
        target_char_n_grams: A total number of reference character n-grams.
        target_word_n_grams: A total number of reference word n-grams.
    r   c                   S   r8   r9   r   r   r   r   r   r:   M  r;   z6_calculate_sentence_level_chrf_score.<locals>.<lambda>c                   S   r8   r9   r   r   r   r   r   r:   N  r;   c                   S   r8   r9   r   r   r   r   r   r:   O  r;   c                   S   r8   r9   r   r   r   r   r   r:   P  r;   )r   r   rQ   rY   rs   )rt   ru   rv   rw   rx   r   r   rb   rc   rB   r%   best_f_scorebest_matching_char_n_gramsbest_matching_word_n_gramsbest_target_char_n_gramsbest_target_word_n_gramstargettarget_char_n_grams_countstarget_word_n_grams_countstarget_char_n_gramstarget_word_n_gramsr\   r]   rn   r   r   r   $_calculate_sentence_level_chrf_score!  sJ   +

r   predsr~   r   r   r   r    r!   r"   sentence_chrf_scorec                 C   s   t || \}} t| |D ]M\}}t|||	||\}}}}t||}t||}t|||||||	|
|||\}}}}}|durE||d t||}t||}t||}t||}q|||||||fS )a  
    Args:
        preds: An iterable of hypothesis corpus.
        target: An iterable of iterables of reference corpus.
        total_preds_char_n_grams: A dictionary containing a total number of hypothesis character n-grams.
        total_preds_word_n_grams: A dictionary containing a total number of hypothesis word n-grams.
        total_target_char_n_grams: A dictionary containing a total number of reference character n-grams.
        total_target_word_n_grams: A dictionary containing a total number of reference word n-grams.
        total_matching_char_n_grams:
            A dictionary containing a total number of matching character n-grams between references and hypotheses.
        total_matching_word_n_grams:
            A dictionary containing a total number of total matching word n-grams between references and hypotheses.
        n_char_order: A character n-gram order.
        n_word_order: A word n-gram order.
        n_order: Sum of character and word n-gram order.
        beta: A parameter determining an importance of recall w.r.t. precision. If `beta=1`, their importance is equal.
        lowercase: An indication whether to enable case-insensitivity.
        whitespace: An indication whether to keep whitespaces during character n-gram extraction.
        sentence_chrf_score: A list of sentence-level chrF/chrF++ scores.

    Return:
        total_target_char_n_grams: number of reference character n-grams.
        total_target_word_n_grams: number of reference word n-grams.
        total_preds_char_n_grams: number of hypothesis character n-grams.
        total_preds_word_n_grams: number of hypothesis word n-grams.
        total_matching_char_n_grams: number of matching character n-grams between references and hypotheses.
        total_matching_word_n_grams: number of total matching word n-grams between references and hypotheses.
        sentence_chrf_score: A list of sentence-level chrF/chrF++ scores.

    Raises:
        ValueError:
            If length of ``preds`` and ``target`` differs.
    Nr   )r   ziprQ   r[   r   append	unsqueeze)r   r~   r   r   r   r    r!   r"   r   r   rb   rc   rB   r%   r   target_corpuspredrt   ru   rv   pred_char_n_gramspred_word_n_gramssentence_level_f_scorer\   r]   r   r   r   r   r   _chrf_score_updatew  sV   :




r   c           	   	   C   s   t ||| |||||}|S )aj  Compute chrF/chrF++ score based on pre-computed target, prediction and matching character and word n-grams.

    Args:
        total_preds_char_n_grams: number of hypothesis character n-grams.
        total_preds_word_n_grams: number of hypothesis word n-grams.
        total_target_char_n_grams: number of reference character n-grams.
        total_target_word_n_grams: number of reference word n-grams.
        total_matching_char_n_grams: number of matching character n-grams between references and hypotheses.
        total_matching_word_n_grams: number of total matching word n-grams between references and hypotheses.
        n_order: A sum of character and word n-gram order.
        beta:
            A parameter determining an importance of recall w.r.t. precision. If `beta=1`, their importance is equal.

    Return:
        A corpus-level chrF/chrF++ score.
    )rs   )	r   r   r   r    r!   r"   rb   rc   chrf_f_scorer   r   r   _chrf_score_compute  s   
r      rh          @Freturn_sentence_level_scorec                 C   s   t |tr	|dk rtdt |tr|dk rtd|dk r"tdt|| }t||\}	}
}}}}|r7g nd}t| ||	|
|||||||||||\}	}
}}}}}t|	|
||||||}|rf|t|fS |S )uX  Calculate `chrF score`_  of machine translated text with one or more references. This implementation
    supports both chrF score computation introduced in [1] and chrF++ score introduced in `chrF++ score`_. This
    implementation follows the implmenetaions from https://github.com/m-popovic/chrF and
    https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/chrf.py.

    Args:
        preds: An iterable of hypothesis corpus.
        target: An iterable of iterables of reference corpus.
        n_char_order:
            A character n-gram order. If `n_char_order=6`, the metrics refers to the official chrF/chrF++.
        n_word_order:
            A word n-gram order. If `n_word_order=2`, the metric refers to the official chrF++. If `n_word_order=0`, the
            metric is equivalent to the original chrF.
        beta:
            A parameter determining an importance of recall w.r.t. precision. If `beta=1`, their importance is equal.
        lowercase: An indication whether to enable case-insesitivity.
        whitespace: An indication whether to keep whitespaces during character n-gram extraction.
        return_sentence_level_score: An indication whether a sentence-level chrF/chrF++ score to be returned.

    Return:
        A corpus-level chrF/chrF++ score.
        (Optionally) A list of sentence-level chrF/chrF++ scores if `return_sentence_level_score=True`.

    Raises:
        ValueError:
            If ``n_char_order`` is not an integer greater than or equal to 1.
        ValueError:
            If ``n_word_order`` is not an integer greater than or equal to 0.
        ValueError:
            If ``beta`` is smaller than 0.

    Example:
        >>> from torchmetrics.functional import chrf_score
        >>> preds = ['the cat is on the mat']
        >>> target = [['there is a cat on the mat', 'a cat is on the mat']]
        >>> chrf_score(preds, target)
        tensor(0.8640)

    References:
        [1] chrF: character n-gram F-score for automatic MT evaluation by Maja Popović `chrF score`_

        [2] chrF++: words helping character n-grams by Maja Popović `chrF++ score`_
    r   zMExpected argument `n_char_order` to be an integer greater than or equal to 1.r   zMExpected argument `n_word_order` to be an integer greater than or equal to 0.z.Expected argument `beta` to be greater than 0.N)	
isinstancerM   
ValueErrorrp   r#   r   r   rT   cat)r   r~   r   r   rc   rB   r%   r   rb   r   r   r   r    r!   r"   r   r   r   r   r   
chrf_score  sl   5	
r   r1   )r   rh   r   FFF)"collectionsr   typingr   r   r   r   r   r	   rT   r
   r   #torchmetrics.functional.text.helperr   rj   setr/   rM   r#   rL   rN   r+   r0   r5   rA   rQ   rY   r[   rp   rs   r   r   r   r   r   r   r   r   <module>   sd  " >
!2


1

.





	
9

	
0
e





	









m





	
*	