o
    iIY                     @   s
  d dl Z d dlZd dlmZ d dlmZ d dl mZ d dlmZm	Z	 d dl
Z
i dddd	d
ddddddddddddddddddddddddd d!d"Zd1d$efd%d&Zd$efd'd(ZG d)d* d*ZG d+d, d,ZG d-d. d.ZG d/d0 d0ZdS )2    N)Iterator)Fraction)Match)OptionalUnionu   œoeu   ŒOE   øo   ØO   æae   ÆAE   ßssu   ẞSSu   đdu   ĐD   ð   Ð   þth   Þu   łlu   ŁL sc                    s,    fddd fddtd| D S )z
    Replace any other markers, symbols, and punctuations with a space, and drop any diacritics (category 'Mn' and some
    manual mappings)
    c                    sH   |  v r| S | t v rt |  S t| dkrdS t| d dv r"dS | S )NMnr   r   MSP )ADDITIONAL_DIACRITICSunicodedatacategory)charkeep b/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/whisper/english_normalizer.pyreplace_character5   s   z8remove_symbols_and_diacritics.<locals>.replace_characterr   c                 3   s    | ]} |V  qd S Nr(   .0c)r*   r(   r)   	<genexpr>C   s    z0remove_symbols_and_diacritics.<locals>.<genexpr>NFKDjoinr#   	normalize)r   r'   r(   )r'   r*   r)   remove_symbols_and_diacritics/   s    r4   c                 C   s   d dd td| D S )z[
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    r   c                 s   s*    | ]}t |d  dv rdn|V  qdS )r   r    r!   N)r#   r$   r,   r(   r(   r)   r/   J   s   ( z!remove_symbols.<locals>.<genexpr>NFKCr1   r   r(   r(   r)   remove_symbolsF   s   r7   c                   @   s.   e Zd Zd
dedefddZdefddZd	S )BasicTextNormalizerFremove_diacriticssplit_lettersc                 C   s   |rt nt| _|| _d S r+   )r4   r7   cleanr:   )selfr9   r:   r(   r(   r)   __init__N   s   
zBasicTextNormalizer.__init__r   c                 C   s`   |  }tdd|}tdd|}| |  }| jr'dtd|tj}tdd|}|S )N[<\[][^>\]]*[>\]]r   \(([^)]+?)\)r!   z\X\s+)	lowerresubr;   r:   r2   regexfindallUr<   r   r(   r(   r)   __call__R   s   zBasicTextNormalizer.__call__N)FF)__name__
__module____qualname__boolr=   strrH   r(   r(   r(   r)   r8   M   s    r8   c                       sd   e Zd ZdZ fddZdee dee fddZdefd	d
Z	defddZ
defddZ  ZS )EnglishNumberNormalizerav  
    Convert any spelled-out numbers into arabic numbers, while handling:

    - remove any commas
    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
    - spell out `one` and `ones`
    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
    c                    s  t    h d| _dd tg dddD | _dd | j D | _dd	d
dddddd | j D | _i | j| j| _ddddddddd| _	dd | j	 D | _
dd | j	 D | _i | j
| j| _dddddd d!d"d#d$d%d&d'| _d(d | j D | _d)d | j D | _i | j| j| _h | j| j	| j| _d*d*d+d+d,| _d-d-d.d.d/d/d0d0d1| _tt| j t| j  | _d2d3id3d4| _h d5| _d6d7 | j| j| j| j	| j| j| j| j| j| j| jfD | _d8d9h| _d S ):N>   r
   ohzeroc                 S   s   i | ]\}}||qS r(   r(   )r-   inamer(   r(   r)   
<dictcomp>p   s    z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>)onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteen   )startc                 S   s*   i | ]\}}|d krdn|d |dfqS )rY   sixesr   r(   r-   rR   valuer(   r(   r)   rS   x   s    )r   r   )rg   st)   nd)   rd)   r   )   r   )zerothfirstsecondthirdfifthtwelfthc                 S   sD   i | ]\}}|d kr|dkr|dkr|| drdnd |dfqS )ro   rq   rr   thr   )endswithrj   r(   r(   r)   rS      s
          (   2   <   F   P   Z   )twentythirtyfortyfiftysixtyseventyeightyninetyc                 S   "   i | ]\}}| d d|dfqS )yiesr   replacerj   r(   r(   r)   rS         " c                 S   r   )r   iethr   r   rj   r(   r(   r)   rS      r   d     i@B i ʚ;l    J)l     I5 l     NZol     @=7M.cl      B3v^!< l      P ~cegl       73Me'l       (l
F3YHqS )hundredthousandmillionbilliontrillionquadrillionquintillion
sextillion
septillion	octillion	nonillion	decillionc                 S      i | ]\}}|d  |d fqS r6   r(   rj   r(   r(   r)   rS          c                 S   r   )r   r(   rj   r(   r(   r)   rS      r   -+)minusnegativepluspositive   £u   €$   ¢)poundpoundseuroeurosdollardollarscentcentsr   %)perpercent>   andpointdoubletriplec                 S   s   h | ]	}|D ]}|qqS r(   r(   )r-   mappingkeyr(   r(   r)   	<setcomp>   s    z3EnglishNumberNormalizer.__init__.<locals>.<setcomp>rT   ones)superr=   zeros	enumerater   itemsones_pluralones_ordinalones_suffixedtenstens_pluraltens_ordinaltens_suffixedmultipliersmultipliers_pluralmultipliers_ordinalmultipliers_suffixeddecimalspreceding_prefixersfollowing_prefixerssetlistvaluesprefixes	suffixersspecialswordsliteral_wordsr<   	__class__r(   r)   r=   k   s   


"

z EnglishNumberNormalizer.__init__r   returnc                 #   s   d  d d}dt fdd}dtt tf f fdd}t|dkr$d S t|D ] \}}|dkr7||d	  nd }|t|d	 krG||d	  nd }|rNd}q(|d uoWtd
|}	|d | jv }
|
rg|d	d  n|}td
|r||}|d u r{tdd urt	t r
drt t | q(|V  |
r|d n  |jd	kr|jq(|q(|| jvrd ur|V  ||V  q(|| jv rt pdd q(|| jv r6| j| }d u r|q(t	t s|| jv r|| jv r|dk rd d t | q(t t | q(|dk r!d dkr|7 q(t t | q(d dkr-|7 q(t t | q(|| jv r| j| \}}d u rR|t || V  n{t	t s^|| jv r|| jv ry|dk ry|d d t | | V  nT|t t | | V  nF|dk rd dkr|t | | V  n.|t t | | V  n d dkr|t | | V  n|t t | | V  d q(|| jv r| j| }d u r|q(t	t rt t | q(d dkr|7 q(t t | q(|| jv rX| j| \}}d u r#|t || V  q(t	t r7|t t | | V  q(d dkrJ|t | | V  q(|t t | | V  q(|| jv r| j| }d u rk|q(t	t svdkr|}|d ur|| nd }|d ur|jd	kr|jq(|V  |q(d d }d }|||  q(|| jv r| j| \}}d u r|t || V  nSt	t r|}|d ur|| nd }|d ur|jd	kr|t |j| V  n(|V  |t || V  nd d }d }|||  |t | V  d q(|| jv rDd ur/|V  || jv s8|	r>| j|  q(||V  q(|| jv r`d urZ| j|  |V  q(||V  q(|| jv rd ur| j| }t	|tr||v r|t ||  V  d}q(|V  ||V  q(|t | V  q(||V  q(|| jv rC|| jvr|	sÈd ur|V  ||V  q(|dkr|| jvr݈d ur|V  ||V  q(|dks|dkr$|| jv s|| jv r|dkrdnd}| j|d}t p	dt ||  d}q(d ur|V  ||V  q(|dkr<|| jv s2|	r;t p7dd q(td| td| d urV|V  d S d S )NFr   c                 S   s    zt | W S  ty   Y d S w r+   )r   
ValueErrorr6   r(   r(   r)   to_fraction   s
   
z:EnglishNumberNormalizer.process_words.<locals>.to_fractionresultc                    s$   t | }  d ur |  } d d  | S r+   )rM   )r   prefixrk   r(   r)   output   s   z5EnglishNumberNormalizer.process_words.<locals>.outputr   rg   z^\d+(\.\d+)?$zConverting the fraction failed.r   0
   r   r   Tr   r   r   rm   ro   r   zUnexpected token: )rM   r   intlenr   rB   matchr   r   
isinstancer{   denominator	numeratorr   r   r   r   r   r   r   r   r   r   r   dictr   getr   )r<   r   skipr   r   rQ   currentprevnextnext_is_numeric
has_prefixcurrent_without_prefixfr   suffixr   
multiplierpbeforeresidualrepeatsr(   r   r)   process_words   sL  	 









 



























z%EnglishNumberNormalizer.process_wordsr   c                 C   s   g }t d|}t|D ]=\}}t| dkrq|t|d kr'|| q|| |jddd }|| jv s>|| jv rD|d q|d qd		|}t 
d
d|}t 
dd|}t 
dd|}|S )Nz\band\s+a\s+half\br   rg   rm   )maxsplitr   z
point fivez
and a halfr!   z([a-z])([0-9])z\1 \2z([0-9])([a-z])z([0-9])\s+(st|nd|rd|th|s)\b\1\2)rB   splitr   r   stripappendrsplitr   r   r2   rC   )r<   r   resultssegmentsrQ   segment	last_wordr(   r(   r)   
preprocess  s"   

z"EnglishNumberNormalizer.preprocessc                 C   sJ   dt fdd}dt fdd}td||}td||}tdd	|}|S )
Nmc                 S   sR   z|  d}|  d}t|  d}| | d|dW S  ty(   | j Y S w )Nrg   rm   ro   r   02d)groupr   r   string)r	  currencyintegerr   r(   r(   r)   combine_cents  s   


z:EnglishNumberNormalizer.postprocess.<locals>.combine_centsc                 S   s0   zdt | d W S  ty   | j Y S w )Nr   rg   )r   r  r   r  )r	  r(   r(   r)   extract_cents  s
   
z:EnglishNumberNormalizer.postprocess.<locals>.extract_centsu,   ([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\bu   [€£$]0.([0-9]{1,2})\bz	\b1(s?)\bzone\1)r   rB   rC   )r<   r   r  r  r(   r(   r)   postprocess  s   	z#EnglishNumberNormalizer.postprocessc                 C   s6   |  |}ddd | | D }| |}|S )Nr!   c                 s   s    | ]	}|d ur|V  qd S r+   r(   r-   wordr(   r(   r)   r/     s    z3EnglishNumberNormalizer.__call__.<locals>.<genexpr>)r  r2   r   r   r  rG   r(   r(   r)   rH     s   

z EnglishNumberNormalizer.__call__)rI   rJ   rK   __doc__r=   r   rM   r   r   r  r  rH   __classcell__r(   r(   r   r)   rN   `   s    
j `rN   c                   @   s&   e Zd ZdZdd ZdefddZdS )EnglishSpellingNormalizerz~
    Applies British-American spelling mappings as listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    c                 C   s
   || _ d S r+   )r   r<   english_spelling_mappingr(   r(   r)   r=     s   
z"EnglishSpellingNormalizer.__init__r   c                    s   d  fdd| D S )Nr!   c                 3   s    | ]
} j ||V  qd S r+   )r   r   r  r   r(   r)   r/     s    z5EnglishSpellingNormalizer.__call__.<locals>.<genexpr>)r2   r   rG   r(   r   r)   rH     s   z"EnglishSpellingNormalizer.__call__N)rI   rJ   rK   r  r=   rM   rH   r(   r(   r(   r)   r    s    r  c                   @   s"   e Zd Zdd ZdefddZdS )EnglishTextNormalizerc                 C   s  d| _ i dddddddd	d
ddddddddddddddddddddddd d!d"i d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdLdQdRdS| _t | _t|| _d S )TNz\b(hmm|mm|mhm|mmm|uh|um)\bz	\bwon't\bzwill notz	\bcan't\bzcan notz	\blet's\bzlet usz	\bain't\baintz	\by'all\bzyou allz	\bwanna\bzwant toz	\bgotta\bzgot toz	\bgonna\bzgoing toz\bi'ma\bzi am going toz\bimma\bz
\bwoulda\bz
would havez
\bcoulda\bz
could havez\bshoulda\bzshould havez	\bma'am\bmadamz\bmr\bzmister z\bmrs\bzmissus z\bst\bzsaint z\bdr\bzdoctor z\bprof\bz
professor z\bcapt\bzcaptain z\bgov\bz	governor z\bald\bz	alderman z\bgen\bzgeneral z\bsen\bzsenator z\brep\bzrepresentative z\bpres\bz
president z\brev\bz	reverend z\bhon\bz
honorable z\basst\bz
assistant z	\bassoc\bz
associate z\blt\bzlieutenant z\bcol\bzcolonel z\bjr\bzjunior z\bsr\bzsenior zesquire z	 had beenz	 has beenz	 had gonez	 has gonez	 had donez has gotz notz arez isz wouldz willz havez am)z\besq\bz	'd been\bz	's been\bz	'd gone\bz	's gone\bz	'd done\bz's got\bzn't\bz're\bz's\bz'd\bz'll\bz't\bz've\bz'm\b)ignore_patterns	replacersrN   standardize_numbersr  standardize_spellingsr  r(   r(   r)   r=     s   	
 !"#$%
7zEnglishTextNormalizer.__init__r   c                 C   s   |  }tdd|}tdd|}t| jd|}tdd|}| j D ]\}}t|||}q&tdd|}tdd	|}t|d
d}| |}| |}tdd	|}tdd|}tdd|}|S )Nr>   r   r?   z\s+''z	(\d),(\d)r   z\.([^0-9]|$)z \1u
   .%$¢€£r&   u   [.$¢€£]([^0-9])z	([^0-9])%z\1 r@   r!   )	rA   rB   rC   r  r  r   r4   r  r  )r<   r   patternreplacementr(   r(   r)   rH   =  s    

zEnglishTextNormalizer.__call__N)rI   rJ   rK   r=   rM   rH   r(   r(   r(   r)   r     s    <r  )r   )rB   r#   collections.abcr   	fractionsr   r   typingr   r   rD   r"   rM   r4   r7   r8   rN   r  r  r(   r(   r(   r)   <module>   sb   	
   