o
    Xi9Q                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
 d dlmZ ddlmZ G dd dZG d	d
 d
ZG dd dZdS )    N)Fraction)IteratorListMatchOptionalUnion)windowed   )remove_symbols_and_diacriticsc                       sd   e Zd ZdZ fddZdee dee fddZdefd	d
Z	defddZ
defddZ  ZS )EnglishNumberNormalizerav  
    Convert any spelled-out numbers into arabic numbers, while handling:

    - remove any commas
    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
    - spell out `one` and `ones`
    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
    c                    s  t    h d| _dd tg dddD | _dd | j D | _dd	d
dddddd | j D | _i | j| j| _ddddddddd| _	dd | j	 D | _
dd | j	 D | _i | j
| j| _dddddd d!d"d#d$d%d&d'| _d(d | j D | _d)d | j D | _i | j| j| _h | j| j	| j| _d*d*d+d+d,| _d-d-d.d.d/d/d0d0d1| _tt| j t| j  | _d2d3id3d4| _h d5| _td6d7 | j| j| j| j	| j| j| j| j| j| j| jfD | _d8d9h| _d S ):N>   oohzeroc                 S   s   i | ]\}}||qS  r   ).0inamer   r   O/home/ubuntu/.local/lib/python3.10/site-packages/whisper/normalizers/english.py
<dictcomp>   s    z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>)onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteenr	   )startc                 S   s*   i | ]\}}|d krdn|d |dfqS )r   sixessr   r   r   valuer   r   r   r   6   s    )r   th)r	   st)   nd)   rd)   r-   )   r-   )zerothfirstsecondthirdfifthtwelfthc                 S   sD   i | ]\}}|d kr|dkr|dkr|| drdnd |dfqS )r1   r3   r4   thr-   )endswithr+   r   r   r   r   A   s
          (   2   <   F   P   Z   )twentythirtyfortyfiftysixtyseventyeightyninetyc                 S   "   i | ]\}}| d d|dfqS )yiesr*   replacer+   r   r   r   r   S       c                 S   rN   )rO   iethr-   rQ   r+   r   r   r   r   V   rS   d     i@B i ʚ;l    J)l     I5 l     NZol     @=7M.cl      B3v^!< l      P ~cegl       73Me'l       (l
F3YHqS )hundredthousandmillionbilliontrillionquadrillionquintillion
sextillion
septillion	octillion	nonillion	decillionc                 S      i | ]\}}|d  |d fqS r*   r   r+   r   r   r   r   i       c                 S   rc   )r-   r   r+   r   r   r   r   l   re   -+)minusnegativepluspositive   £u   €$   ¢)poundpoundseuroeurosdollardollarscentcentsru   %)perpercent>   andpointdoubletriplec                 S   s   g | ]	}|D ]}|qqS r   r   )r   mappingkeyr   r   r   
<listcomp>   s    z4EnglishNumberNormalizer.__init__.<locals>.<listcomp>r   ones)super__init__zeros	enumerater   itemsones_pluralones_ordinalones_suffixedtenstens_pluraltens_ordinaltens_suffixedmultipliersmultipliers_pluralmultipliers_ordinalmultipliers_suffixeddecimalspreceding_prefixersfollowing_prefixerssetlistvaluesprefixes	suffixersspecialswordsliteral_wordsself	__class__r   r   r      s   




z EnglishNumberNormalizer.__init__r   returnc                 #   s   d  d d}dt fdd}dtt tf f fdd}t|dkr$d S td g| d g d	D ]\}}}|r:d}q/|d uoCtd
|}|d | jv }	|	rS|dd  n|}
td
|
r||
}|d useJ d urtt r|	dr|t t | q/|V  |	r|d n  |j
dkr|jq/|
q/|| jvrd ur|V  ||V  q/|| jv rt pdd q/|| jv r'| j| }d u r|q/tt s|| jv r|| jv r|dk rd dksJ d d t | q/t t | q/|dk rd dkr	|7 q/t t | q/d dkr|7 q/t t | q/|| jv r| j| \}}d u rC|t || V  ntt sO|| jv r|| jv rs|dk rsd dkscJ |d d t | | V  nT|t t | | V  nF|dk rd dkr|t | | V  n.|t t | | V  n d dkr|t | | V  n|t t | | V  d q/|| jv r| j| }d u r|q/tt rt t | q/d dkr|7 q/t t | q/|| jv rR| j| \}}d u r|t || V  q/tt r1|t t | | V  q/d dkrD|t | | V  q/|t t | | V  q/|| jv r| j| }d u re|q/tt spdkr|}|d ur}|| nd }|d ur|j
dkr|jq/|V  |q/d d }d }|||  q/|| jv r| j| \}}d u r|t || V  nStt r|}|d ur|| nd }|d ur|j
dkr|t |j| V  n(|V  |t || V  nd d }d }|||  |t | V  d q/|| jv r>d ur)|V  || jv s2|r8| j|  q/||V  q/|| jv rZd urT| j|  |V  q/||V  q/|| jv rd ur| j| }t|tr||v r|t ||  V  d}q/|V  ||V  q/|t | V  q/||V  q/|| jv r=|| jvr|sd ur|V  ||V  q/|dkr|| jvr׈d ur|V  ||V  q/|dks|dkr|| jv s|| jv r|dkrdnd	}| j|d}t pdt ||  d}q/d ur|V  ||V  q/|dkr6|| jv s,|r5t p1dd q/td| td| d urP|V  d S d S )NFr*   c                 S   s    zt | W S  ty   Y d S w N)r   
ValueErrorrd   r   r   r   to_fraction   s
   
z:EnglishNumberNormalizer.process_words.<locals>.to_fractionresultc                    s$   t | }  d ur |  } d d  | S r   )str)r   prefixr,   r   r   output   s   z5EnglishNumberNormalizer.process_words.<locals>.outputr   r1   z^\d+(\.\d+)?$r	   . 0
   rU   rV   Trz   r|   r}   r/   r{   zUnexpected token: )r   r   intlenr   rematchr   
isinstancer=   denominator	numeratorr   r   r   r   r   r   r   r   r   r   r   dictr   getr   r   )r   r   skipr   r   prevcurrentnextnext_is_numeric
has_prefixcurrent_without_prefixfr   suffixr   
multiplierpbeforeresidualrepeatsr   r   r   process_words   sJ  	"









 



























z%EnglishNumberNormalizer.process_wordsr*   c                 C   s   g }t d|}t|D ]=\}}t| dkrq|t|d kr'|| q|| |jddd }|| jv s>|| jv rD|d q|d qd		|}t 
d
d|}t 
dd|}t 
dd|}|S )Nz\band\s+a\s+half\br   r	   r/   )maxsplitr   z
point fivez
and a half z([a-z])([0-9])z\1 \2z([0-9])([a-z])z([0-9])\s+(st|nd|rd|th|s)\b\1\2)r   splitr   r   stripappendrsplitr   r   joinsub)r   r*   resultssegmentsr   segment	last_wordr   r   r   
preprocess}  s"   

z"EnglishNumberNormalizer.preprocessc                 C   sJ   dt fdd}dt fdd}td||}td||}tdd	|}|S )
Nmc                 S   sR   z|  d}|  d}t|  d}| | d|dW S  ty(   | j Y S w )Nr	   r/   r1   r   02d)groupr   r   string)r   currencyintegerrv   r   r   r   combine_cents  s   


z:EnglishNumberNormalizer.postprocess.<locals>.combine_centsc                 S   s0   zdt | d W S  ty   | j Y S w )Nrn   r	   )r   r   r   r   )r   r   r   r   extract_cents  s
   
z:EnglishNumberNormalizer.postprocess.<locals>.extract_centsu,   ([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\bu   [€£$]0.([0-9]{1,2})\bz	\b1(s?)\bzone\1)r   r   r   )r   r*   r   r   r   r   r   postprocess  s   	z#EnglishNumberNormalizer.postprocessc                 C   s6   |  |}ddd | | D }| |}|S )Nr   c                 s   s    | ]	}|d ur|V  qd S r   r   r   wordr   r   r   	<genexpr>  s    z3EnglishNumberNormalizer.__call__.<locals>.<genexpr>)r   r   r   r   r   r   r*   r   r   r   __call__  s   

z EnglishNumberNormalizer.__call__)__name__
__module____qualname____doc__r   r   r   r   r   r   r   r   __classcell__r   r   r   r   r      s    
 
 ^r   c                   @   s&   e Zd ZdZdd ZdefddZdS )EnglishSpellingNormalizerz~
    Applies British-American spelling mappings as listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    c                 C   s*   t jt jtd}tt|| _d S )Nzenglish.json)	ospathr   dirname__file__jsonloadopenr~   )r   mapping_pathr   r   r   r     s   z"EnglishSpellingNormalizer.__init__r*   c                    s   d  fdd| D S )Nr   c                 3   s    | ]
} j ||V  qd S r   )r~   r   r   r   r   r   r     s    z5EnglishSpellingNormalizer.__call__.<locals>.<genexpr>)r   r   r   r   r   r   r     s   z"EnglishSpellingNormalizer.__call__N)r   r   r   r   r   r   r   r   r   r   r   r     s    r   c                   @   s"   e Zd Zdd ZdefddZdS )EnglishTextNormalizerc                 C   s  d| _ i dddddddd	d
ddddddddddddddddddddddd d!d"i d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdLdQdRdS| _t | _t | _d S )TNz\b(hmm|mm|mhm|mmm|uh|um)\bz	\bwon't\bzwill notz	\bcan't\bzcan notz	\blet's\bzlet usz	\bain't\baintz	\by'all\bzyou allz	\bwanna\bzwant toz	\bgotta\bzgot toz	\bgonna\bzgoing toz\bi'ma\bzi am going toz\bimma\bz
\bwoulda\bz
would havez
\bcoulda\bz
could havez\bshoulda\bzshould havez	\bma'am\bmadamz\bmr\bzmister z\bmrs\bzmissus z\bst\bzsaint z\bdr\bzdoctor z\bprof\bz
professor z\bcapt\bzcaptain z\bgov\bz	governor z\bald\bz	alderman z\bgen\bzgeneral z\bsen\bzsenator z\brep\bzrepresentative z\bpres\bz
president z\brev\bz	reverend z\bhon\bz
honorable z\basst\bz
assistant z	\bassoc\bz
associate z\blt\bzlieutenant z\bcol\bzcolonel z\bjr\bzjunior z\bsr\bzsenior zesquire z	 had beenz	 has beenz	 had gonez	 has gonez	 had donez has gotz notz arez isz wouldz willz havez am)z\besq\bz	'd been\bz	's been\bz	'd gone\bz	's gone\bz	'd done\bz's got\bzn't\bz're\bz's\bz'd\bz'll\bz't\bz've\bz'm\b)ignore_patterns	replacersr   standardize_numbersr   standardize_spellingsr   r   r   r   r     s   	
 !"#$%
7zEnglishTextNormalizer.__init__r*   c                 C   s   |  }tdd|}tdd|}t| jd|}tdd|}| j D ]\}}t|||}q&tdd|}tdd	|}t|d
d}| |}| |}tdd	|}tdd|}tdd|}|S )Nz[<\[][^>\]]*[>\]]r   z\(([^)]+?)\)z\s+''z	(\d),(\d)r   z\.([^0-9]|$)z \1u
   .%$¢€£)keepu   [.$¢€£]([^0-9])z	([^0-9])%z\1 z\s+r   )	lowerr   r   r   r   r   r
   r   r   )r   r*   patternreplacementr   r   r   r     s    

zEnglishTextNormalizer.__call__N)r   r   r   r   r   r   r   r   r   r   r     s    <r   )r   r   r   	fractionsr   typingr   r   r   r   r   more_itertoolsr   basicr
   r   r   r   r   r   r   r   <module>   s       2