o
    wiZ                     @   s   d Z g dZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZ ddlmZ dd	lmZ G d
d dZG dd dZG dd dZdS )z,OpenAI's English text standardisation module)EnglishNumberNormalizerEnglishSpellingNormalizerEnglishTextNormalizer    N)Fraction)IteratorListMatchOptionalUnion)files)windowed   )remove_symbols_and_diacriticsc                       sd   e Zd ZdZ fddZdee dee fddZdefd	d
Z	defddZ
defddZ  ZS )r   av  
    Convert any spelled-out numbers into arabic numbers, while handling:

    - remove any commas
    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
    - spell out `one` and `ones`
    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
    c                    s  t    h d| _dd tg dddD | _dd | j D | _dd	d
dddddd | j D | _i | j| j| _ddddddddd| _	dd | j	 D | _
dd | j	 D | _i | j
| j| _dddddd d!d"d#d$d%d&d'| _d(d | j D | _d)d | j D | _i | j| j| _h | j| j	| j| _d*d*d+d+d,| _d-d-d.d.d/d/d0d0d1| _tt| j t| j  | _d2d3id3d4| _h d5| _td6d7 | j| j| j| j	| j| j| j| j| j| j| jfD | _d8d9h| _d S ):N>   oohzeroc                 S   s   i | ]\}}||qS  r   ).0inamer   r   W/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/whisper_normalizer/english.py
<dictcomp>%   s    z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>)onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteenr   )startc                 S   s*   i | ]\}}|d krdn|d |dfqS )r   sixessr   r   r   valuer   r   r   r   @   s    )r   th)r   st)   nd)   rd)   r0   )   r0   )zerothfirstsecondthirdfifthtwelfthc                 S   sD   i | ]\}}|d kr|dkr|dkr|| drdnd |dfqS )r4   r6   r7   thr0   )endswithr.   r   r   r   r   K   s
          (   2   <   F   P   Z   )twentythirtyfortyfiftysixtyseventyeightyninetyc                 S   "   i | ]\}}| d d|dfqS )yiesr-   replacer.   r   r   r   r   ]   s    c                 S   rQ   )rR   iethr0   rT   r.   r   r   r   r   `   s    d     i@B i ʚ;l    J)l     I5 l     NZol     @=7M.cl      B3v^!< l      P ~cegl       73Me'l       (l
F3YHqS )hundredthousandmillionbilliontrillionquadrillionquintillion
sextillion
septillion	octillion	nonillion	decillionc                 S      i | ]\}}|d  |d fqS r-   r   r.   r   r   r   r   t       c                 S   re   )r0   r   r.   r   r   r   r   w   rg   -+)minusnegativepluspositive   £u   €$   ¢)poundpoundseuroeurosdollardollarscentcentsrw   %)perpercent>   andpointdoubletriplec                 S   s   g | ]	}|D ]}|qqS r   r   )r   mappingkeyr   r   r   
<listcomp>   s    z4EnglishNumberNormalizer.__init__.<locals>.<listcomp>r   ones)super__init__zeros	enumerater   itemsones_pluralones_ordinalones_suffixedtenstens_pluraltens_ordinaltens_suffixedmultipliersmultipliers_pluralmultipliers_ordinalmultipliers_suffixeddecimalspreceding_prefixersfollowing_prefixerssetlistvaluesprefixes	suffixersspecialswordsliteral_wordsself	__class__r   r   r   !   s   




z EnglishNumberNormalizer.__init__r   returnc                 #   s   d  d d}dt fdd}dtt tf f fdd}t|dkr$d S td g| d g d	D ]\}}}|r:d}q/|d uoCtd
|}|d | jv }	|	rS|dd  n|}
td
|
r||
}|d useJ d urtt r|	dr|t t | q/|V  |	r|d n  |j
dkr|jq/|
q/|| jvrd ur|V  ||V  q/|| jv rt pdd q/|| jv r'| j| }d u r|q/tt s|| jv r|| jv r|dk rd dksJ d d t | q/t t | q/|dk rd dkr	|7 q/t t | q/d dkr|7 q/t t | q/|| jv r| j| \}}d u rC|t || V  ntt sO|| jv r|| jv rs|dk rsd dkscJ |d d t | | V  nT|t t | | V  nF|dk rd dkr|t | | V  n.|t t | | V  n d dkr|t | | V  n|t t | | V  d q/|| jv r| j| }d u r|q/tt rt t | q/d dkr|7 q/t t | q/|| jv rR| j| \}}d u r|t || V  q/tt r1|t t | | V  q/d dkrD|t | | V  q/|t t | | V  q/|| jv r| j| }d u re|q/tt spdkr|}|d ur}|| nd }|d ur|j
dkr|jq/|V  |q/d d }d }|||  q/|| jv r| j| \}}d u r|t || V  nStt r|}|d ur|| nd }|d ur|j
dkr|t |j| V  n(|V  |t || V  nd d }d }|||  |t | V  d q/|| jv r>d ur)|V  || jv s2|r8| j|  q/||V  q/|| jv rZd urT| j|  |V  q/||V  q/|| jv rd ur| j| }t|tr||v r|t ||  V  d}q/|V  ||V  q/|t | V  q/||V  q/|| jv r=|| jvr|sd ur|V  ||V  q/|dkr|| jvr׈d ur|V  ||V  q/|dks|dkr|| jv s|| jv r|dkrdnd	}| j|d}t pdt ||  d}q/d ur|V  ||V  q/|dkr6|| jv s,|r5t p1dd q/td| td| d urP|V  d S d S )NFr-   c                 S   s    zt | W S  ty   Y d S w N)r   
ValueErrorrf   r   r   r   to_fraction   s
   
z:EnglishNumberNormalizer.process_words.<locals>.to_fractionresultc                    s$   t | }  d ur |  } d d  | S r   )str)r   prefixr/   r   r   output   s   z5EnglishNumberNormalizer.process_words.<locals>.outputr   r4   z^\d+(\.\d+)?$r   . 0
   rW   rX   Tr|   r~   r   r2   r}   zUnexpected token: )r   r
   intlenr   rematchr   
isinstancer@   denominator	numeratorr   r   r   r   r   r   r   r   r   r   r   dictr   getr   r   )r   r   skipr   r   prevcurrentnextnext_is_numeric
has_prefixcurrent_without_prefixfr   suffixr   
multiplierpbeforeresidualrepeatsr   r   r   process_words   sJ  	"









 



























z%EnglishNumberNormalizer.process_wordsr-   c                 C   s   g }t d|}t|D ]=\}}t| dkrq|t|d kr'|| q|| |jddd }|| jv s>|| jv rD|d q|d qd		|}t 
d
d|}t 
dd|}t 
dd|}|S )Nz\band\s+a\s+half\br   r   r2   )maxsplitr   z
point fivez
and a half z([a-z])([0-9])z\1 \2z([0-9])([a-z])z([0-9])\s+(st|nd|rd|th|s)\b\1\2)r   splitr   r   stripappendrsplitr   r   joinsub)r   r-   resultssegmentsr   segment	last_wordr   r   r   
preprocess  s"   

z"EnglishNumberNormalizer.preprocessc                 C   sJ   dt fdd}dt fdd}td||}td||}tdd	|}|S )
Nmc                 S   sR   z|  d}|  d}t|  d}| | d|dW S  ty(   | j Y S w )Nr   r2   r4   r   02d)groupr   r   string)r   currencyintegerrx   r   r   r   combine_cents  s   


z:EnglishNumberNormalizer.postprocess.<locals>.combine_centsc                 S   s0   zdt | d W S  ty   | j Y S w )Nrp   r   )r   r   r   r   )r   r   r   r   extract_cents  s
   
z:EnglishNumberNormalizer.postprocess.<locals>.extract_centsu,   ([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\bu   [€£$]0.([0-9]{1,2})\bz	\b1(s?)\bzone\1)r   r   r   )r   r-   r   r   r   r   r   postprocess  s   	z#EnglishNumberNormalizer.postprocessc                 C   s6   |  |}ddd | | D }| |}|S )Nr   c                 s   s    | ]	}|d ur|V  qd S r   r   r   wordr   r   r   	<genexpr>  s    z3EnglishNumberNormalizer.__call__.<locals>.<genexpr>)r   r   r   r   r   r   r-   r   r   r   __call__  s   

z EnglishNumberNormalizer.__call__)__name__
__module____qualname____doc__r   r   r   r   r   r   r   r   __classcell__r   r   r   r   r      s    
  `r   c                   @   &   e Zd ZdZdd ZdefddZdS )r   z
    Applies British-American spelling mappings as listed in [1].

    [1] https://web.archive.org/web/20230326222449/https://www.tysto.com/uk-us-spelling-list.html
    c                 C   sJ   t dd}t|d}t|| _W d    d S 1 sw   Y  d S )Nwhisper_normalizerznormalizers/english.jsonr)r   joinpathopenjsonloadr   )r   english_json_pathenglish_normalization_dictr   r   r   r     s   "z"EnglishSpellingNormalizer.__init__r-   c                    s   d  fdd| D S )Nr   c                 3   s    | ]
} j ||V  qd S r   )r   r   r   r   r   r   r     s    z5EnglishSpellingNormalizer.__call__.<locals>.<genexpr>)r   r   r   r   r   r   r     s   z"EnglishSpellingNormalizer.__call__Nr   r   r   r   r   r   r   r   r   r   r   r     s    r   c                   @   r   )r   us  Applies all the rules for normalizing English text as mentioned in OpenAI whisper paper. As per the text normalization/standardization approach  Appendix Section C pp.21 the paper [Robust Speech Recognition via Large-Scale  Weak Supervision](https://cdn.openai.com/papers/whisper.pdf). The `EnglishTextNormalizer` does the following functionality:

    1. Remove any phrases between matching brackets ([, ]).
    2. Remove any phrases between matching parentheses ((, )).
    3. Remove any of the following words: hmm, mm, mhm, mmm, uh, um
    4. Remove whitespace characters that comes before an apostrophe ’
    5. Convert standard or informal contracted forms of English into the original form.
    6. Remove commas (,) between digits
    7. Remove periods (.) not followed by numbers
    8. Remove symbols as well as diacritics from the text, where symbols are the characters with the Unicode category
    starting with M, S, or P, except period, percent, and currency symbols that may be detected in the next step.
    9. Detect any numeric expressions of numbers and currencies and replace with a form using Arabic numbers, e.g. “Ten
    thousand dollars” → “$10000”.
    10. Convert British spellings into American spellings.
    11. Remove remaining symbols that are not part of any numeric expressions.
    12. Replace any successive whitespace characters with a space.
    c                 C   sd  d| _ i dddddddd	d
ddddddddddddddddddddddd d!d"i d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDi dEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcddded\dfdgdh| _t | _t | _d S )iNz\b(hmm|mm|mhm|mmm|uh|um)\bz	\bwon't\bzwill notz	\bcan't\bzcan notz	\blet's\bzlet usz	\bain't\baintz	\by'all\bzyou allz	\bwanna\bzwant toz	\bkinda\bzkind ofz	\bsorta\bzsort ofz	\bdunno\bzdo not knowz	\bgotta\bzgot toz	\bgonna\bzgoing toz\bi'ma\bzi am going toz\bimma\bz
\bwoulda\bz
would havez
\bcoulda\bz
could havez\bshoulda\bzshould havez	\bcause\bbecausez	\bma'am\bmadamz\bmr\bzmister z\bmrs\bzmissus z\bst\bzsaint z\bdr\bzdoctor z\bprof\bz
professor z\bcapt\bzcaptain z\bgov\bz	governor z\bald\bz	alderman z\bgen\bzgeneral z\bsen\bzsenator z\brep\bzrepresentative z\bpres\bz
president z\brev\bz	reverend z\bhon\bz
honorable z\basst\bz
assistant z	\bassoc\bz
associate z\blt\bzlieutenant z\bcol\bzcolonel z\bjr\bzjunior z\bsr\bzsenior z\besq\bzesquire z	'd been\bz	 had beenz	's been\bz	 has beenz	'd gone\bz	 had gonez	's gone\bz	 has gonez	'd done\bz	 had donez's got\bz has gotzn't\bz notz're\bz arez's\bz isz'd\bz wouldz'll\bz willz't\bz havez am)z've\bz'm\b)ignore_patterns	replacersr   standardize_numbersr   standardize_spellingsr   r   r   r   r     s   	
 !"#$%&'()+,-./02345678
;zEnglishTextNormalizer.__init__r-   c                 C   s   |  }tdd|}tdd|}t| jd|}tdd|}| j D ]\}}t|||}q&tdd|}tdd	|}t|d
d}| |}| |}tdd	|}tdd|}tdd|}|S )Nz[<\[][^>\]]*[>\]]r   z\(([^)]+?)\)z\s+''z	(\d),(\d)r   z\.([^0-9]|$)z \1u
   .%$¢€£)keepu   [.$¢€£]([^0-9])z	([^0-9])%z\1 z\s+r   )	lowerr   r   r   r   r   r   r   r   )r   r-   patternreplacementr   r   r   r   1  s    

zEnglishTextNormalizer.__call__Nr   r   r   r   r   r     s    @r   )r   __all__r   osr   	fractionsr   typingr   r   r   r	   r
   importlib.resourcesr   more_itertoolsr   basicr   r   r   r   r   r   r   r   <module>   s        9