o
    !wiE                     @   s  d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
 g dZG dd deZG dd deZG d	d
 d
eZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZeddd ZG dd deZG dd deZG d d! d!eZG d"d# d#eZG d$d% d%eZG d&d' d'eZG d(d) d)eZG d*d+ d+eZ dS ),a@  
This file implements the building blocks for transforming a collection
of input strings to the desired format in order to calculate the WER of CER.

In principle, for word error rate calculations, every string of a sentence needs to be
collapsed into a list of strings, where each string is a *single* word.
This is done with [transforms.ReduceToListOfListOfWords][].
A composition of multiple transformations must therefore *always* end with
[transforms.ReduceToListOfListOfWords][].

For the character error rate, every string of a sentence also needs to be collapsed into
a list of strings, but here each string is a *single* character.
This is done with [transforms.ReduceToListOfListOfChars][]. Similarly, a
composition of multiple transformations must therefore also always end with
[transforms.ReduceToListOfListOfChars][].
    N)IterableUnionListMapping)AbstractTransformComposeExpandCommonEnglishContractionsRemoveEmptyStringsReduceToListOfListOfWordsReduceToListOfListOfCharsReduceToSingleSentenceRemoveKaldiNonWordsRemoveMultipleSpacesRemovePunctuationRemoveSpecificWordsRemoveWhiteSpaceStripSubstituteRegexesSubstituteWordsToLowerCaseToUpperCasec                   @   sJ   e Zd ZdZdeeee f fddZdefddZdee fd	d
Z	dS )r   z(
    The base class of a Transform.
    	sentencesc                 C   s6   t |tr
| |S t |tr| |S td|)z
        Transforms one or more strings.

        Args:
            sentences: The strings to transform.

        Returns:
            (Union[str, List[str]]): The transformed strings.

        z7input {} was expected to be a string or list of strings)
isinstancestrprocess_stringlistprocess_list
ValueErrorformat)selfr    r    M/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/jiwer/transforms.py__call__G   s   



zAbstractTransform.__call__sc                 C   s   t  N)NotImplementedErrorr   r#   r    r    r!   r   ]   s   z AbstractTransform.process_stringinpc                        fdd|D S )Nc                       g | ]}  |qS r    r   .0r#   r   r    r!   
<listcomp>a       z2AbstractTransform.process_list.<locals>.<listcomp>r    r   r'   r    r-   r!   r   `      zAbstractTransform.process_listN)
__name__
__module____qualname____doc__r   r   r   r"   r   r   r    r    r    r!   r   B   s
    r   c                   @   s*   e Zd ZdZdee fddZdd ZdS )r   a  
    Chain multiple transformations back-to-back to create a pipeline combining multiple
    transformations.

    Note that each transformation needs to end with either `ReduceToListOfListOfWords`
    or `ReduceToListOfListOfChars`, depending on whether word error rate,
    or character error rate is desired.

    Example:
        ```python3
        import jiwer

        jiwer.Compose([
            jiwer.RemoveMultipleSpaces(),
            jiwer.ReduceToListOfListOfWords()
        ])
        ```
    
transformsc                 C   
   || _ dS )zV

        Args:
            transforms: The list of transformations to chain.
        Nr6   )r   r6   r    r    r!   __init__x      
zCompose.__init__c                 C   s   | j D ]}||}q|S r$   r8   )r   texttrr    r    r!   r"      s   

zCompose.__call__N)r2   r3   r4   r5   r   r   r9   r"   r    r    r    r!   r   d   s    r   c                   @   s@   e Zd Zddee fddZdefddZdee fd	d
ZdS )BaseRemoveTransform tokens_to_removec                 C   s   || _ || _d S r$   )r?   replace_token)r   r?   r@   r    r    r!   r9      s   
zBaseRemoveTransform.__init__r#   c                 C   s   | j D ]	}||| j}q|S r$   )r?   replacer@   )r   r#   wr    r    r!   r      s   
z"BaseRemoveTransform.process_stringr'   c                    r(   )Nc                    r)   r    r*   r+   r-   r    r!   r.      r/   z4BaseRemoveTransform.process_list.<locals>.<listcomp>r    r0   r    r-   r!   r      r1   z BaseRemoveTransform.process_listNr>   )	r2   r3   r4   r   r   r9   r   r   r   r    r    r    r!   r=      s    r=   c                   @   @   e Zd ZdZddefddZdefddZd	ee fd
dZdS )r
   a  
    Transforms a single input sentence, or a list of input sentences, into
    a list with lists of words, which is the expected format for calculating the
    edit operations between two input sentences on a word-level.

    A sentence is assumed to be a string, where words are delimited by a token
    (such as ` `, space). Each string is expected to contain only a single sentence.
    Empty strings (no output) are removed for the list.

    Example:
        ```python
        import jiwer

        sentences = ["hi", "this is an example"]

        print(jiwer.ReduceToListOfListOfWords()(sentences))
        # prints: [['hi'], ['this', 'is', 'an, 'example']]
        ```
     word_delimiterc                 C   r7   )zo
        Args:
            word_delimiter: the character which delimits words. Default is ` ` (space).
        NrF   r   rF   r    r    r!   r9         
z"ReduceToListOfListOfWords.__init__r#   c                 C   s   dd | | jD gS )Nc                 S      g | ]
}t |d kr|qS    lenr,   rB   r    r    r!   r.          z<ReduceToListOfListOfWords.process_string.<locals>.<listcomp>)splitrF   r&   r    r    r!   r      s   z(ReduceToListOfListOfWords.process_stringr'   c                 C   <   g }|D ]}|  |d }|| qt|dkrg gS |S Nr   r   appendrN   r   r'   sentence_collectionsentencelist_of_wordsr    r    r!   r         z&ReduceToListOfListOfWords.process_listNrE   	r2   r3   r4   r5   r   r9   r   r   r   r    r    r    r!   r
      s
    r
   c                   @   0   e Zd ZdZdefddZdee fddZdS )	r   a  
    Transforms a single input sentence, or a list of input sentences, into
    a list with lists of characters, which is the expected format for calculating the
    edit operations between two input sentences on a character-level.

    A sentence is assumed to be a string. Each string is expected to contain only a
    single sentence.

    Example:
        ```python
        import jiwer

        sentences = ["hi", "this is an example"]

        print(jiwer.ReduceToListOfListOfChars()(sentences))
        # prints: [['h', 'i'], ['t', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', 'n', ' ', 'e', 'x', 'a', 'm', 'p', 'l', 'e']]
        ```
    r#   c                 C   s   dd |D gS )Nc                 S      g | ]}|qS r    r    rO   r    r    r!   r.          z<ReduceToListOfListOfChars.process_string.<locals>.<listcomp>r    r&   r    r    r!   r      s   z(ReduceToListOfListOfChars.process_stringr'   c                 C   rR   rS   rT   rV   r    r    r!   r      rZ   z&ReduceToListOfListOfChars.process_listNr2   r3   r4   r5   r   r   r   r   r    r    r    r!   r      s    r   c                   @   rD   )r   a&  
    Transforms multiple sentences into a single sentence.
    This operation can be useful when the number of reference and hypothesis sentences
    differ, and you want to do a minimal alignment over these lists.
    Note that this creates an invariance: `wer([a, b], [a, b])` might not be equal to
    `wer([b, a], [b, a])`.

    Example:
        ```python3
        import jiwer

        sentences = ["hi", "this is an example"]

        print(jiwer.ReduceToSingleSentence()(sentences))
        # prints: ['hi this is an example']
        ```
    rE   rF   c                 C   r7   )zd
        :param word_delimiter: the character which delimits words. Default is ` ` (space).
        NrG   rH   r    r    r!   r9      s   
zReduceToSingleSentence.__init__r#   c                 C   s   |S r$   r    r&   r    r    r!   r     s   z%ReduceToSingleSentence.process_stringr'   c                 C   s2   dd |D }t |dkrg S d| j|gS )Nc                 S   rJ   rK   rM   r,   ir    r    r!   r.     rP   z7ReduceToSingleSentence.process_list.<locals>.<listcomp>r   z{})rN   r   rF   join)r   r'   filtered_inpr    r    r!   r     s   z#ReduceToSingleSentence.process_listNr[   r\   r    r    r    r!   r      s
    r   c                   @   4   e Zd ZdZdeeef fddZdefddZdS )	r   am  
    Transform strings by substituting substrings matching regex expressions into
    another substring.

    Example:
        ```python
        import jiwer

        sentences = ["is the world doomed or loved?", "edibles are allegedly cultivated"]

        # note: the regex string "\b(\w+)ed\b", matches every word ending in 'ed',
        # and "\1" stands for the first group ('\w+). It therefore removes 'ed' in every match.
        print(jiwer.SubstituteRegexes({r"doom": r"sacr", r"\b(\w+)ed\b": r"\1"})(sentences))

        # prints: ["is the world sacr or lov?", "edibles are allegedly cultivat"]
        ```
    substitutionsc                 C   r7   )zj

        Args:
            substitutions: a mapping of regex expressions to replacement strings.
        Nrf   r   rf   r    r    r!   r9      r:   zSubstituteRegexes.__init__r#   c                 C   s&   | j  D ]\}}t|||}q|S r$   )rf   itemsresubr   r#   keyvaluer    r    r!   r   (  s   z SubstituteRegexes.process_stringNr2   r3   r4   r5   r   r   r9   r   r    r    r    r!   r     s    r   c                   @   re   )	r   a|  
    This transform can be used to replace a word into another word.
    Note that the whole word is matched. If the word you're attempting to substitute
    is a substring of another word it will not be affected.
    For example, if you're substituting `foo` into `bar`, the word `foobar` will NOT
    be substituted into `barbar`.

    Example:
        ```python
        import jiwer

        sentences = ["you're pretty", "your book", "foobar"]

        print(jiwer.SubstituteWords({"pretty": "awesome", "you": "i", "'re": " am", 'foo': 'bar'})(sentences))

        # prints: ["i am awesome", "your book", "foobar"]
        ```

    rf   c                 C   r7   )z[
        Args:
            substitutions: A mapping of words to replacement words.
        Nrg   rh   r    r    r!   r9   D  rI   zSubstituteWords.__init__r#   c                 C   s2   | j  D ]\}}tdt|||}q|S )Nz\b{}\b)rf   ri   rj   rk   r   escaperl   r    r    r!   r   K  s   zSubstituteWords.process_stringNro   r    r    r    r!   r   /  s    r   c                       s*   e Zd ZdZdee f fddZ  ZS )r   a%  
    Can be used to filter out certain words.
    As words are replaced with a ` ` character, make sure to that
    `RemoveMultipleSpaces`, `Strip()` and `RemoveEmptyStrings` are present
    in the composition _after_ `RemoveSpecificWords`.

    Example:
        ```python
        import jiwer

        sentences = ["yhe awesome", "the apple is not a pear", "yhe"]

        print(jiwer.RemoveSpecificWords(["yhe", "the", "a"])(sentences))
        # prints: ['  awesome', '  apple is not   pear', ' ']
        # note the extra spaces
        ```
    words_to_removec                    s   dd |D }t  | dS )zM
        Args:
            words_to_remove: List of words to remove.
        c                 S   s   i | ]}|d qS r[   r    )r,   wordr    r    r!   
<dictcomp>j  s    z0RemoveSpecificWords.__init__.<locals>.<dictcomp>N)superr9   )r   rq   mapping	__class__r    r!   r9   e  s   zRemoveSpecificWords.__init__)r2   r3   r4   r5   r   r   r9   __classcell__r    r    rv   r!   r   R  s    r   c                       s(   e Zd ZdZddef fddZ  ZS )r   a|  
    This transform filters out white space characters.
    Note that by default space (` `) is also removed, which will make it impossible to
    split a sentence into a list of words by using `ReduceToListOfListOfWords` or
    `ReduceToSingleSentence`.
    This can be prevented by replacing all whitespace with the space character.
    If so, make sure that `jiwer.RemoveMultipleSpaces`,
    `Strip()` and `RemoveEmptyStrings` are present in the composition _after_
    `RemoveWhiteSpace`.

    Example:
        ```python
        import jiwer

        sentences = ["this is an example", "hello world	"]

        print(jiwer.RemoveWhiteSpace()(sentences))
        # prints: ["thisisanexample", "helloworld"]

        print(jiwer.RemoveWhiteSpace(replace_by_space=True)(sentences))
        # prints: ["this is an example", "hello world  "]
        # note the trailing spaces
        ```
    Freplace_by_spacec                    s2   dd t jD }|rd}nd}t j||d dS )zq

        Args:
            replace_by_space: every white space character is replaced with a space (` `)
        c                 S   r^   r    r    )r,   cr    r    r!   r.     r_   z-RemoveWhiteSpace.__init__.<locals>.<listcomp>rE   r>   )r@   N)string
whitespacert   r9   )r   ry   
charactersr@   rv   r    r!   r9     s
   zRemoveWhiteSpace.__init__)F)r2   r3   r4   r5   boolr9   rx   r    r    rv   r!   r   o  s    r   rL   c                  C   s$   t tjd } tdd | D }|S )z9Compute the punctuation characters only once and memoize.rL   c                 s   s,    | ]}t t|d rt|V  qdS )PN)unicodedatacategorychr
startswithra   r    r    r!   	<genexpr>  s    
z._get_punctuation_characters.<locals>.<genexpr>)rangesys
maxunicodeset)
codepointspunctuationr    r    r!   _get_punctuation_characters  s
   r   c                       s    e Zd ZdZ fddZ  ZS )r   a  
    This transform filters out punctuation. The punctuation characters are defined as
    all unicode characters whose category name starts with `P`.
    See [here](https://www.unicode.org/reports/tr44/#General_Category_Values) for more
    information.
    Example:
        ```python
        import jiwer

        sentences = ["this is an example!", "hello. goodbye"]

        print(jiwer.RemovePunctuation()(sentences))
        # prints: ['this is an example', "hello goodbye"]
        ```
    c                    s   t  }t | d S r$   )r   rt   r9   )r   punctuation_charactersrv   r    r!   r9     s   zRemovePunctuation.__init__)r2   r3   r4   r5   r9   rx   r    r    rv   r!   r     s    r   c                   @   r]   )	r   ao  
    Filter out multiple spaces between words.

    Example:
        ```python
        import jiwer

        sentences = ["this is   an   example ", "  hello goodbye  ", "  "]

        print(jiwer.RemoveMultipleSpaces()(sentences))
        # prints: ['this is an example ', " hello goodbye ", " "]
        # note that there are still trailing spaces
        ```

    r#   c                 C      t dd|S )Nz\s\s+rE   rj   rk   r&   r    r    r!   r        z#RemoveMultipleSpaces.process_stringr'   c                    r(   )Nc                    r)   r    r*   r+   r-   r    r!   r.     r/   z5RemoveMultipleSpaces.process_list.<locals>.<listcomp>r    r0   r    r-   r!   r     r1   z!RemoveMultipleSpaces.process_listNr`   r    r    r    r!   r     s    r   c                   @      e Zd ZdZdefddZdS )r   a~  
    Removes all leading and trailing spaces.

    Example:
        ```python
        import jiwer

        sentences = [" this is an example ", "  hello goodbye  ", "  "]

        print(jiwer.Strip()(sentences))
        # prints: ['this is an example', "hello goodbye", ""]
        # note that there is an empty string left behind which might need to be cleaned up
        ```
    r#   c                 C      |  S r$   stripr&   r    r    r!   r        zStrip.process_stringNr2   r3   r4   r5   r   r   r    r    r    r!   r         r   c                   @   r]   )	r	   a   
    Remove empty strings from a list of strings.

    Example:
        ```python
        import jiwer

        sentences = ["", "this is an example", " ",  "                "]

        print(jiwer.RemoveEmptyStrings()(sentences))
        # prints: ['this is an example']
        ```
    r#   c                 C   r   r$   r   r&   r    r    r!   r     r   z!RemoveEmptyStrings.process_stringr'   c                    r(   )Nc                    s   g | ]}  |d kr|qS rC   r*   r+   r-   r    r!   r.     s    z3RemoveEmptyStrings.process_list.<locals>.<listcomp>r    r0   r    r-   r!   r     r1   zRemoveEmptyStrings.process_listNr`   r    r    r    r!   r	     s    r	   c                   @   r   )r   u6  
    Replace common contractions such as `let's` to `let us`.

    Currently, this method will perform the following replacements. Note that `␣` is
     used to indicate a space (` `) to get around markdown rendering constrains.

    | Contraction   | transformed into |
    | ------------- |:----------------:|
    | `won't`       | `␣will not`      |
    | `can't`       | `␣can not`       |
    | `let's`       | `␣let us`        |
    | `n't`         | `␣not`           |
    | `'re`         | `␣are`           |
    | `'s`          | `␣is`            |
    | `'d`          | `␣would`         |
    | `'ll`         | `␣will`          |
    | `'t`          | `␣not`           |
    | `'ve`         | `␣have`          |
    | `'m`          | `␣am`            |

    Example:
        ```python
        import jiwer

        sentences = ["she'll make sure you can't make it", "let's party!"]

        print(jiwer.ExpandCommonEnglishContractions()(sentences))
        # prints: ["she will make sure you can not make it", "let us party!"]
        ```

    r#   c                 C   s   t dd|}t dd|}t dd|}t dd|}t d	d
|}t dd|}t dd|}t dd|}t dd|}t dd|}t dd|}|S )Nzwon'tzwill notzcan\'tzcan notzlet\'szlet uszn\'tz notz\'rez arez\'sz isz\'dz wouldz\'llz willz\'tz\'vez havez\'mz amr   r&   r    r    r!   r     s   z.ExpandCommonEnglishContractions.process_stringNr   r    r    r    r!   r     s     r   c                   @   r   )r   z
    Convert every character into lowercase.
    Example:
        ```python
        import jiwer

        sentences = ["You're PRETTY"]

        print(jiwer.ToLowerCase()(sentences))

        # prints: ["you're pretty"]
        ```
    r#   c                 C   r   r$   )lowerr&   r    r    r!   r   @  r   zToLowerCase.process_stringNr   r    r    r    r!   r   1  s    r   c                   @   r   )r   z
    Convert every character to uppercase.

    Example:
        ```python
        import jiwer

        sentences = ["You're amazing"]

        print(jiwer.ToUpperCase()(sentences))

        # prints: ["YOU'RE AMAZING"]
        ```
    r#   c                 C   r   r$   )upperr&   r    r    r!   r   T  r   zToUpperCase.process_stringNr   r    r    r    r!   r   D  r   r   c                   @   r   )r   a  
    Remove any word between `[]` and `<>`. This can be useful when working
    with hypotheses from the Kaldi project, which can output non-words such as
    `[laugh]` and `<unk>`.

    Example:
        ```python
        import jiwer

        sentences = ["you <unk> like [laugh]"]

        print(jiwer.RemoveKaldiNonWords()(sentences))

        # prints: ["you  like "]
        # note the extra spaces
        ```
    r#   c                 C   r   )Nz[<\[][^>\]]*[>\]]r>   r   r&   r    r    r!   r   k  r   z"RemoveKaldiNonWords.process_stringNr   r    r    r    r!   r   X  s    r   )!r5   r   	functoolsrj   r{   r   typingr   r   r   r   __all__objectr   r   r=   r
   r   r   r   r   r   r   	lru_cacher   r   r   r   r	   r   r   r   r   r    r    r    r!   <module>   s8   "#-%%"#*
	6