o
    }oi;                      @   s   d dl Z d dlmZ d dlmZ e dZe dZe dZe dZ	e dZ
e d	Zd
d dD Zdd dD Zdd dD Zd dlmZ edd Zd ddZdd Zdd Zd ddZdd ZG dd dZdS )!    N)	unidecode)loggingz1([$]?)(^|\s)(\S*[0-9]\S*)(?=(\s|$)((\S*)(\s|$))?)z([0-9]{1,2}):([0-9]{2})(am|pm)?z\$z([0-9]+)(st|nd|rd|th)z$([0-9]{3})([.,][0-9]{1,2})?([!.?])?$z([.,][0-9]{1,2})$c                 C   &   g | ]}t d |d  |d fqS z\b%s\.r      recompile.0x r   h/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/parts/preprocessing/cleaners.py
<listcomp>       r   )#msmissmrsmisessmrmister)messrsmesseursdrdoctordrsdoctorsstsaintcocompanyjrjuniorsrseniorrevreverendhon	honorablesgtsergeantcaptcaptainmajmajorcolcolonellt
lieutenantgengeneralprof	professorlbpoundsreprepresentativer#   streetaveavenue)etcz	et ceterajanjanuaryfebfebruarymarmarchaprapriljunjunejuljulyaugaugustsep	septemberoctoctobernovnovemberdecdecemberc                 C   r   r   r   r
   r   r   r   r   F   r   )ltdlimitedfigfigurefigsfiguresgent	gentlemenftfortesqesquirepreppreperationbrosbrothersindindependentmmemadameproprofessionalvsversus)incincludec                 C   r   r   r   r
   r   r   r   r   Y   r   )-r   r   r   r   r   r"   r%   r(   r+   r.   r1   r4   r7   r:   r=   r@   rC   rF   rI   rL   rO   rQ   rU   rX   r[   r^   ra   rd   rg   rj   rm   rp   rs   rv   ry   r|   r   r   r   r   r   r   r   r   r   )cachec                  C   s   dd l } |  S )Nr   )inflectengine)r   r   r   r   inflect_engine   s   r   c                 C   s\   t |  t| } |  } tdd| } t| } t| |d} t| ||} tdd|  } | S )Nz\s+ )version)	warn_common_charsr   lowerr   subclean_numbersclean_abbreviationsclean_punctuationsstrip)stringtablepunctuation_to_replaceabbreviation_versionr   r   r   
clean_text   s   r   c                 C   s   t d| rtd d S d S )Nu   [£€]uN   Your transcript contains one of '£' or '€' which we do not currently handle)r   searchr   warning)r   r   r   r   r      s   r   c                 C   s   t  }t|j| } | S N)NumberCleaner	NUM_CHECKr   clean)r   cleanerr   r   r   r      s   r   c                 C   s@   t }|dkr	t}n|dkrt|_|D ]\}}t||| } q| S )N	fastpitchexpanded)ABBREVIATIONS_COMMONABBREVIATIONS_TTS_FASTPITCHABBREVIATIONS_EXPANDEDextendr   r   )r   r   abbbreviationsregexreplacementr   r   r   r      s   r   c                 C   s>   |  D ]\}}td|d|| } q|r| |} | S )Nz\{}z {} )itemsr   r   format	translate)r   r   r   puncr   r   r   r   r      s
   
r   c                       s4   e Zd Z fddZdd Zdd Zdd Z  ZS )	r   c                    s   t    |   d S r   )super__init__resetself	__class__r   r   r      s   
zNumberCleaner.__init__c                 C   s   g | _ d | _d S r   )curr_numcurrencyr   r   r   r   r      s   
zNumberCleaner.resetc                 C   s   t  }| jr1||}||dkrdnd7 }|r+|dt  | 7 }|||kr(dnd7 }|   |S |   |rB|d| 7 }||S dd	 }td
||S )Nr   z dollarz dollarsz and z centz cents.c                 S   s   dt  | d d S )Nr   r   )r   number_to_wordsgroup)matchr   r   r   convert_to_word   s   z:NumberCleaner.format_final_number.<locals>.convert_to_wordz[0-9,]+)r   r   r   r   r   r   )r   	whole_numdecimalr   return_stringr   r   r   r   format_final_number   s   

z!NumberCleaner.format_final_numberc                 C   st  t  }|d}|d}|d}t|}|rP|||d d }t|d}d}	|dkr<d||d }	d}
|drJd|d }
||	|
S t|}t|rd|||d S | jd u rt|dprt	|| _t
|d	}|r| j| dS d| j| }d }t|}|r|ddd  }|d t| d  }td
d|}|| || S )N         r   z{}{} r   r      z\.)r   r   
TIME_CHECKr   r   intr   	ORD_CHECKr   CURRENCY_CHECKTHREE_CHECKr   appendjoinDECIMAL_CHECKr   lenr   r   r   )r   r   r   wsnumber_proceeding_symbol
time_matchr   mins
min_stringampm_string	ord_matchthree_matchr   r   decimal_matchr   r   r   r      s@   








zNumberCleaner.clean)__name__
__module____qualname__r   r   r   r   __classcell__r   r   r   r   r      s
    r   r   )r   text_unidecoder   
nemo.utilsr   r	   r   r   r   r   r   r   r   r   r   	functoolsr   r   r   r   r   r   r   r   r   r   r   r   <module>   s6   





)4


