o
    i"                     @   s:   d Z ddlZejdkrddlZnddlZG dd dZdS )z"English Normalizer class for CLVP.    N)      c                   @   s   e Zd Zdd ZdedefddZdedefdd	Zd
edefddZd
edefddZ	d
edefddZ
dedefddZd
edefddZdedefddZdedefddZdedefddZdd ZdS )EnglishNormalizerc                 C   s2   dd dD | _ g d| _g d| _g d| _d S )Nc                 S   s*   g | ]}t d |d  t j|d fqS )z\b%s\.r      )recompile
IGNORECASE).0x r   ^/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/clvp/number_normalizer.py
<listcomp>   s    z.EnglishNormalizer.__init__.<locals>.<listcomp>))mrsmisess)mrmister)drdoctor)stsaint)cocompany)jrjunior)majmajor)gengeneral)drsdoctors)revreverend)lt
lieutenant)hon	honorable)sgtsergeant)captcaptain)esqesquire)ltdlimited)colcolonel)ftfort)
 onetwothreefourfivesixseveneightnine)
teneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteen)
r2   r2   twentythirtyfortyfiftysixtyseventyeightyninety)_abbreviationsonesteenstens)selfr   r   r   __init__   s   

zEnglishNormalizer.__init__numreturnc                 C   s   |dkrdS |dk rd|  t| S |dk r| j| S |dk r'| j|d  S |dk rD| j|d  |d dkrAd|  |d   S d S |d	k rc| j|d  d
 |d dkr`d|  |d   S d S |dk r|  |d	 d |d	 dkrd|  |d	   S d S |dk r|  |d d |d dkrd|  |d   S d S |dk r|  |d d |d dkrd|  |d   S d S |dk r|  |d d |d dkrd|  |d   S d S |dk r|  |d d |d dkrd|  |d   S d S dS )ax  
        Converts numbers(`int`) to words(`str`).

        Please note that it only supports upto - "'nine hundred ninety-nine quadrillion, nine hundred ninety-nine
        trillion, nine hundred ninety-nine billion, nine hundred ninety-nine million, nine hundred ninety-nine
        thousand, nine hundred ninety-nine'" or `number_to_words(999_999_999_999_999_999)`.
        r   zerozminus 
      d   -r2      hundred i@B z	 thousandz, i ʚ;z millionl    J)z billionl     I5 z	 trillionl     NZoz quadrillionznumber out of range)number_to_wordsabsrO   rP   rQ   )rR   rT   r   r   r   r^   F   s   
2.
z!EnglishNormalizer.number_to_wordstextc                 C   s   | dddS )z+
        Converts unicode to ascii
        asciiignorezutf-8)encodedecoderR   r`   r   r   r   convert_to_ascii      z"EnglishNormalizer.convert_to_asciimc                 C   s   | d}|d}t|dkr|d S |d rt|d nd}t|dkr0|d r0t|d nd}|rN|rN|dkr<dnd}|dkrDdnd	}d
||||f S |r^|dkrVdnd}d||f S |rn|dkrfdnd	}d||f S dS )zZ
        This method is used to expand numerical dollar values into spoken words.
        r   .   z dollarsr   dollardollarscentcentsz%s %s, %s %sz%s %szzero dollars)groupsplitlenint)rR   rh   matchpartsrl   rn   dollar_unit	cent_unitr   r   r   _expand_dollars   s"   

$z!EnglishNormalizer._expand_dollarsc                 C      | dddS )zF
        This method is used to remove commas from sentences.
        r   ,r2   ro   replacerR   rh   r   r   r   _remove_commas   rg   z EnglishNormalizer._remove_commasc                 C   rx   )zO
        This method is used to expand '.' into spoken word ' point '.
        r   ri   z point rz   r|   r   r   r   _expand_decimal_point   rg   z'EnglishNormalizer._expand_decimal_pointc                 C   s^   dddd}t |ddd }d|d	 kr |d	 d
kr d}n||d d}| || S )z`
        This method is used to expand ordinals such as '1st', '2nd' into spoken words.
        r   ndrd)r   rj   r   r   NrW   rY   rX   th)rr   ro   getr^   )rR   rT   ordinal_suffixessuffixr   r   r   _expand_ordinal   s   z!EnglishNormalizer._expand_ordinalc                 C   s~   t |d}|dkr:|dk r:|dkrdS |dkr&|dk r&d| |d  S |d dkr5| |d d	 S | |S | |S )
a  
        This method acts as a preprocessing step for numbers between 1000 and 3000 (same as the original repository,
        link :
        https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/utils/tokenizer.py#L86)
        r   r[   i  i  ztwo thousandi  ztwo thousand rY   r\   )rr   ro   r^   )rR   rh   rT   r   r   r   _expand_number   s   

z EnglishNormalizer._expand_numberc                 C   sb   t d| j|}t dd|}t d| j|}t d| j|}t d| j|}t d| j|}|S )z
        This method is used to normalize numbers within a text such as converting the numbers to words, removing
        commas, etc.
        z([0-9][0-9,]+[0-9])u   £([0-9,]*[0-9])z	\1 poundsz\$([0-9.,]*[0-9])z([0-9]++\.[0-9]+)z[0-9]++(st|nd|rd|th)z[0-9]+)r   subr}   rw   r~   r   r   re   r   r   r   normalize_numbers   s   z#EnglishNormalizer.normalize_numbersc                 C   s"   | j D ]\}}t|||}q|S )z/
        Expands the abbreviate words.
        )rN   r   r   )rR   r`   regexreplacementr   r   r   expand_abbreviations   s   z&EnglishNormalizer.expand_abbreviationsc                 C   s   t t dd|S )z.
        Removes multiple whitespaces
        z\s+r]   )r   r   r   re   r   r   r   collapse_whitespace   s   z%EnglishNormalizer.collapse_whitespacec                 C   s@   |  |}| }| |}| |}| |}|dd}|S )z
        Converts text to ascii, numbers / number-like quantities to their spelt-out counterparts and expands
        abbreviations
        "r2   )rf   lowerr   r   r   r{   re   r   r   r   __call__   s   



zEnglishNormalizer.__call__N)__name__
__module____qualname__rS   rr   strr^   rf   rw   r}   r~   r   r   r   r   r   r   r   r   r   r   r      s    );r   )__doc__sysversion_infor   r   r   r   r   r   r   <module>   s   

