o
    i9                     @   sd  d Z ddlZddlZddlmZ e Zdd dD Zdd dD Zd	d
 Ze	dZ
e	dejZe	dejZe	dZe	dZe	dZe	dZe	dZe	dZe	dZe	dZe	dZe	dZe	dZe	dZe	dZe	dZdd Zdd Zd d! Zd"d# Zd$d% Z d&d' Z!d(d) Z"d*d+ Z#d,d- Z$d.d/ Z%d0d1 Z&d2d3 Z'd4d5 Z(d6d7 Z)d8d9 Z*d:d; Z+d<d= Z,d>d d?D Z-d@d dAD Z.e	dBZ/e	dCZ0e	dDejZ1e	dEZ2e	dFZ3dGdH Z4dIdJ Z5dKdL Z6dMdN Z7dOdP Z8dQdR Z9dSdT Z:dUdV Z;dWdX Z<dYdZ Z=d[d\ Z>d]d^ Z?d_d` Z@dadb ZAdcdd ZBdedf ZCdgdh ZDeEdikreFeDdj eFeDdk eFeDdl eFeDdm eFeDdn eFeDdo eFeDdp eFeDdq eFeDdr eFeDds eFeDdt eFeDdu eFeDdv eGdwdxD ]ZHeFeDeIeH qeFeDdy eFeDdz eFeDd{ eFeDd| eFeDd} eFeDd~ eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd eFeDd dS dS )z
Normalize input text to a format that Soprano recognizes.
Adapted from https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/utils/tokenizer.py
    N	unidecodec                 C   s*   g | ]}t d |d  t j|d fqS )z\b%s\.r      )recompile
IGNORECASE.0x r   Q/home/ubuntu/.local/lib/python3.10/site-packages/soprano/utils/text_normalizer.py
<listcomp>   s   * r   ))mrsmisess)msmiss)mrmister)drdoctor)stsaint)cocompany)jrjunior)majmajor)gengeneral)drsdoctors)revreverend)lt
lieutenant)hon	honorable)sgtsergeant)captcaptain)esqesquire)ltdlimited)colcolonel)ftfortc                 C   s&   g | ]}t d |d  |d fqS )z\b%s\br   r   r   r   r   r   r   r   r   %   s   & )$)Hzhertz)kHz	kilohertz)KBs	kilobytes)KBkilobyte)MBs	megabytes)MBmegabyte)GBs	gigabytes)GBgigabyte)TBs	terabytes)TBterabyte)APIsza p i's)APIza p i)CLIszc l i's)CLIzc l i)CPUszc p u's)CPUzc p u)GPUszg p u's)GPUzg p u)Aveavenue)etcz	et cetera)Monmonday)Tuestuesday)Wed	wednesday)Thursthursday)Frifriday)Satsaturday)Janjanuary)Febfebruary)Marmarch)Aprapril)Augaugust)Sept	september)Octoctober)Novnovember)Decdecember)and/orzand orc                 C   s$   t t D ]\}}t||| } q| S N)_abbreviations_cased_abbreviationsr   subtextregexreplacementr   r   r   expand_abbreviationsL   s   r{   z#\dz\b\d+(K|M|B|T)\bz(\d[a-z]|[a-z]\d)z(\d[\d\,]+\d)z1(^|[^/])(\d\d?[/-]\d\d?[/-]\d\d(?:\d\d)?)($|[^/])z$(\(?\d{3}\)?[-.\s]\d{3}[-.\s]?\d{4})z(\d\d?:\d\d(?::\d\d)?)u   £([\d\,]*\d+)z\$([\d\.\,]*\d+)z(\d+(?:\.\d+)+)z(\d\s?\*\s?\d)z(\d\s?/\s?\d)z(\d\s?\+\s?\d)z(\d?\s?-\s?\d)z(\d+(?:/\d+)+)z\d+(st|nd|rd|th)z\d+c                 C   s   |  d}d|d  S )Nr   znumber r   groupmmatchr   r   r   _expand_num_prefixg   s   
r   c                 C   s   |  d}|d  dkr|d d  dS |d  dkr'|d d  dS |d  dkr8|d d  dS |d  d	krI|d d  d
S |S )Nr   Kz	 thousandMz millionBz billionTz	 trillion)r}   upperr~   r   r   r   _expand_num_suffixk   s   
""""r   c                 C   s    |  d}|d  d|d  S )Nr   r    r|   r~   r   r   r   _split_alphanumerics      
r   c                 C   s   |  dddS )Nr   , )r}   replacer   r   r   r   _remove_commasw   s   r   c                 C   s4   |  d}td|}|  dd| |  d S )N   z[./-]r   z dash    )r}   r   splitjoinr~   r   r   r   _expand_datez   s   
r   c              
   C   sv   |  d}tdd|}t|dksJ dt|d d  ddt|dd  ddt|dd   }|S )	Nr   z\Dr   
   r   r   ,    )r}   r   rv   lenr   listr~   r   r   r   _expand_phone_number   s
   
Jr   c                 C   s@  |  d}|d}t|dkrE|\}}|dkr0t|dkr dS t|dkr+| dS | d	S |dr>d
|dd   }| d| S |\}}}t|dkr| d|dkrYdn|drcd
| n|h d|dkrod S |dr{d
|  S | S |dkr| d|dkrd S |drd
|  S | S |S )Nr   :r   00r   0   z minutesz o'clockzoh r   zoh ohr   )r}   r   r   int
startswith)r   r   hoursminutessecondsr   r   r   _expand_time   s$   




^6r   c                 C   s   |  d}|d}t|dkr|d S |d rt|d nd}t|dkr0|d r0t|d nd}|rN|rN|dkr<dnd}|dkrDdnd	}d
||||f S |r^|dkrVdnd}d||f S |rn|dkrfdnd	}d||f S dS )Nr   .r   z dollarsr   dollardollarscentcentsz%s %s, %s %sz%s %szzero dollars)r}   r   r   r   )r   r   partsr   r   dollar_unit	cent_unitr   r   r   _expand_dollars   s"   

$r   c                    sB   |  d  d  d d d fddtdt D  S )Nr   r   r   z point c                 3   s"    | ]}d  t | V  qdS )r   N)r   r   )r	   ir   r   r   	<genexpr>   s     z(_expand_decimal_point.<locals>.<genexpr>)r}   r   r   ranger   r   r   r   r   _expand_decimal_point   s   

.r   c                 C   s4   |  d}|d}t|dkrd|S d|S )Nr   /r    over  slash )r}   r   r   r   r~   r   r   r   _expand_fraction   s   

 r   c                 C      d | ddS )Nz times r   *r   r}   r   r   r   r   r   _expand_multiply      r   c                 C   r   )Nr   r   r   r   r   r   r   r   _expand_divide   r   r   c                 C   r   )N plus r   +r   r   r   r   r   _expand_add   r   r   c                 C   r   )Nz minus r   -r   r   r   r   r   _expand_subtract   r   r   c                 C   s   t j| dddS )Nr   r   andword)_inflectnumber_to_wordsr}   r   r   r   r   _expand_ordinal   s   r   c                 C   s   t | d}|dkrB|dk rB|dkrdS |dkr&|dk r&dt|d  S |d dkr5t|d d	 S tj|d
dddddS tj|d
dS )Nr   i  i  i  ztwo thousandi  ztwo thousand d   z hundredr   ohr   )r   zeror}   r   r   r   )r   r}   r   r   r   )r   numr   r   r   _expand_number   s   r   c                 C   s   t tt| } t tt| } t tt| } t tt	| } t t
t| } t tt| } t td| } t tt| } t tt| } t tt| } t tt| } t tt| } t tt| } t tt| } t tt| } tdD ]	}t t t!| } qmt t"t#| } | S )Nz	\1 poundsr   )$r   rv   _num_prefix_rer   _num_suffix_rer   _comma_number_rer   _date_rer   _phone_number_rer   _time_rer   
_pounds_re_dollars_rer   _decimal_number_rer   _multiply_rer   
_divide_rer   _add_rer   _subtract_rer   _fraction_rer   _ordinal_rer   r   _num_letter_split_rer   
_number_rer   )rx   _r   r   r   normalize_numbers   s&   r   c                 C   "   g | ]}t |d  |d fqS r   r   r4   r   r   r   r   r         " ))u   —z - c                 C   r   r   r4   r   r   r   r   r      r   ))@z at )&z and )%z	 percent )r   r   );r   )z\+r   )z\\z backslash )~z about )z(^| )<3z heart )z<=z less than or equal to )z>=z greater than or equal to )<z less than )>z greater than )=z equals )r   r   )r   r   )z\*r   z(https?://)z(. - .)z([A-Z]\.[A-Z])z[\(\[\{].*[\)\]\}](.|$)z\b([A-Z][a-z]*)+\bc                 C       t D ]\}}t||| } q| S rs   )_preunicode_special_charactersr   rv   rw   r   r   r   $expand_preunicode_special_characters     r   c                 C   r   rs   )_special_charactersr   rv   rw   r   r   r   expand_special_characters  r   r   c                 C   s   dS )Nzh t t p s colon slash slash r   r   r   r   r   _expand_link_header  s   r   c                 C       |  d}|d  d|d  S )Nr   r      r|   r~   r   r   r   _expand_dash  r   r   c                 C   r   )Nr   z dot r   r|   r~   r   r   r   _expand_dot  r   r   c                 C   s8   |  d}tdd|}tdd|}tdd|}|S )Nr   z[\(\[\{]r   z[\)\]\}][^$.!?,]z[\)\]\}]r   )r}   r   rv   r~   r   r   r   _expand_parantheses!  s
   
r   c                 C   sv   |  d}td|}t|dkr|S t|t|kr|S t|t|d kr6|d dkr6|d d  dS d|S )Nr   z[A-Z][a-z]*r   r   sz'sr   )r}   r   findallr   r   )r   r   matchesr   r   r   _split_mixedcase(  s   
 
r   c                 C   s<   t tt| } t tt| } t tt| } t tt	| } | S rs   )
r   rv   _link_header_rer   _dash_rer   _dot_rer   _parentheses_rer   rx   r   r   r   normalize_special3  s
   r   c                 C   s   t tt| } | S rs   )r   rv   _camelcase_rer   r   r   r   r   normalize_mixedcase:  s   r  c                 C   s   |   S rs   )lowerr   r   r   r   	lowercaseA     r  c                 C   s   t | S rs   r   r   r   r   r   convert_to_asciiD  r  r  c                 C   sb   |  d} tt| D ] }| |  | |< | | sq| | d dvr+| |  d| |< qd| S )N
r   z.!?r   r   )r   r   r   stripr   )rx   r   r   r   r   normalize_newlinesG  s   


r	  c                 C   s    t dd| } t dd| } | S )Nz'[^A-Za-z !\$%&'\*\+,-./0123456789<>\?_]r   z[<>/_+]r   rv   r   r   r   r   remove_unknown_charactersP  s   r  c                 C   s(   t dd| } t ddd | } |  S )Nz\s+r   z [.\?!,]c                 S   s   |  dd S )Nr   r   r|   r   r   r   r   <lambda>W  s    z%collapse_whitespace.<locals>.<lambda>)r   rv   r  r   r   r   r   collapse_whitespaceU  s   r  c                 C   sX   t dd| } t dd| } t dd| } t dd| } t d	d
| } t dd| } | S )Nz\.\.\.+z
[ELLIPSIS]z,+r   z[\.,]*\.[\.,]*r   z[\.,!]*![\.,!]*!z[\.,!\?]*\?[\.,!\?]*?z\[ELLIPSIS\]z...r
  r   r   r   r   dedup_punctuationZ  s   r  c                 C   s   t ddd | } | S )Nz
(\w)\1{2,}c                 S   s   |  dd d S )Nr   r   r|   r   r   r   r   r  d  s    z)collapse_triple_letters.<locals>.<lambda>r
  r   r   r   r   collapse_triple_lettersc  s   r  c                 C   sl   t | } t| } t| } t| } t| } t| } t| } t| } t| } t	| } t
| } t| } t| } | S rs   )r   r  r	  r   r   r{   r  r   r  r  r  r  r  r   r   r   r   
clean_textg  s   r  __main__z1,2,3,456,176z123,456,789z123,456,789thz123-456-7890z111-111-1111z(111) 111-1111zA(111) 111-1111zA (111) 111-1111z$2.47z$247z$0.27z$1.00u   £20i  i  26561024z2.47023z20.47023z1.17.1.1z111.111.1111z1/1/2025z1-1-2025z1-1-25z
A 1/1/11 AzA 1/1 Az1/1z1/10z1/1/10z	11/1/1/10z0:00z12:00z13:00z8:00z8:05z8:15z0:00:00z00:01:10z00:10:01z01:01:01z00:01:00z01:00:00z-1 + 2 * 3 - 4 / 5z-1+2*3-5/4/25100x1100k100m100b100tz#1z11:59z01:000100z1st 2nd 3rd 4thz1K 1M 1B 1T 1K1M1B1Trr   LMDeployLMDeployDecoderModelTest	UPPERCASETPUs)J__doc__r   inflectr   enginer   rt   ru   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r  r  r  r	  r  r  r  r  r  __name__printr   r   strr   r   r   r   <module>   s   '


















		
