o
    wi                    @   s   g d Z ddlZddlmZ ddlmZ G dd deZG dd	 d	eZG d
d deZ	G dd deZ
G dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZdS ))NormalizerIBaseNormalizerDevanagariNormalizerHindiNormalizerPunjabiNormalizerTeluguNormalizerGujaratiNormalizerOdiaNormalizerBengaliNormalizerTamilNormalizerKannadaNormalizerMalayalamNormalizer    N	num2words   )langinfoc                   @   s@   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
d Zdd ZdS )r   a  
    The normalizer classes do the following:
    * Some characters have multiple Unicode codepoints. The normalizer chooses a single standard representation
    * Some control characters are deleted
    * While typing using the Latin keyboard, certain typical mistakes occur which are corrected by the module
    Base class for normalizer. Performs some common normalization, which includes:
    * Byte order mark, word joiner, etc. removal
    * ZERO_WIDTH_NON_JOINER and ZERO_WIDTH_JOINER removal
    * ZERO_WIDTH_SPACE and NO_BREAK_SPACE replaced by spaces
    Script specific normalizers should derive from this class and override the normalize() method.
    They can call the super class 'normalize() method to avail of the common normalization
    u   ﻿u   ￾u   ⁠   ­u   ​    u   ‌u   ‍c                 C   s   | tjd}| dd}| dd}| dd}| dd}| dd	}| d
d}| dd}| dd}| dd}| dd}| dd}| dd}|S )z
        Normalize punctuations.
        Applied many of the punctuation normalizations that are part of MosesNormalizer
        from sacremoses
         u   „"u   “u   ”u   –-u   —z -    ´'u   ‘u   ‚u   ’z''u   ´´u   …z...)replacer   BYTE_ORDER_MARKselftext r   U/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/whisper_normalizer/indic.py_normalize_punctuations6   s   z#NormalizerI._normalize_punctuationsc                 C   s   d S Nr   r   r   r   r   	normalizeL   s   zNormalizerI.normalizeN)__name__
__module____qualname____doc__r   BYTE_ORDER_MARK_2WORD_JOINERSOFT_HYPHENZERO_WIDTH_SPACENO_BREAK_SPACEZERO_WIDTH_NON_JOINERZERO_WIDTH_JOINERr    r"   r   r   r   r   r      s    r   c                   @   s   e Zd ZdZ				d)ddZdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(S )*r   zDCommon class used in most of indic languages inherit from this code.F
do_nothingc                 C   s:   || _ || _|| _|| _|| _|   |   |   d S r!   )langremove_nuktasnasals_modedo_normalize_chandrasdo_normalize_vowel_ending_init_normalize_chandras_init_normalize_nasals_init_normalize_vowel_endingr   r/   r0   r1   r2   r3   r   r   r   __init__S   s   zBaseNormalizer.__init__c                 C   s>   | j tjv r| j| _d S | j tjv r| j| _d S dd | _d S )Nc                 S   s   | S r!   r   )xr   r   r   <lambda>l   s    z=BaseNormalizer._init_normalize_vowel_ending.<locals>.<lambda>)r/   r   IE_LANGUAGES_normalize_word_vowel_ending_iefn_vowel_endingDRAVIDIAN_LANGUAGES&_normalize_word_vowel_ending_dravidianr   r   r   r   r6   f   s
   z+BaseNormalizer._init_normalize_vowel_endingc                    s@   ddgddgddgddgd	d
gdd
gg} fdd|D  _ d S )N            E   G   I   K   r      r   c                    s0   g | ]}t |d   jt |d  jfqS )r   r   r   offset_to_charr/   .0r9   r@   r   r   
<listcomp>y   s    z;BaseNormalizer._init_normalize_chandras.<locals>.<listcomp>)chandra_substitutions)r   substitution_offsetsr   r@   r   r4   n   s   

z'BaseNormalizer._init_normalize_chandrasc                 C   s    | j D ]
\}}|||}q|S r!   )rO   r   )r   r   matchreplr   r   r   _normalize_chandras   s   z"BaseNormalizer._normalize_chandrasc                 C   s   g dg dg dg dg dg dg}d}d}g }|D ]-}t d	jt|d
 | jt|| jt|d | jt|d | jd}|| qdjt|| jd}||f| _dS )I
        `r1_nasal=re.compile(r'\u0919\u094D([\u0915-\u0918])')`
                          #      "   (   $   '   )   rc   rd   .   *   -   M   rI   z${nasal}{halant}([{start_r}-{end_r}])r   r   )nasalhalantstart_rend_rz{anusvaara}\1	anusvaaraN)recompileformatr   rK   r/   append
pats_repls)r   pat_signatureshalant_offsetanusvaara_offsetpatspat_signaturepatrepl_stringr   r   r   _init_to_anusvaara_strict   s0   	z(BaseNormalizer._init_to_anusvaara_strictc                 C   s$   | j \}}|D ]}|||}q|S r!   rv   sub)r   r   rz   r}   r|   r   r   r   _to_anusvaara_strict   s   
z#BaseNormalizer._to_anusvaara_strictc                    sj   g d}d  fdd|D }d}d}tdj|t| jd}d	jt| jd
}||f _dS )rT   )rV   rZ   r^   rb   rf   rh   ,c                    s   g | ]	}t | jqS r   rJ   rL   r@   r   r   rN      s    z=BaseNormalizer._init_to_anusvaara_relaxed.<locals>.<listcomp>rk   rI   z[{nasals_list_str}]{halant})nasals_list_strrm   z{anusvaara}rp   N)joinrr   rs   rt   r   rK   r/   rv   )r   nasals_listr   rx   ry   r|   r}   r   r@   r   _init_to_anusvaara_relaxed   s    z)BaseNormalizer._init_to_anusvaara_relaxedc                 C   s   | j \}}|||S r!   r   )r   r   r|   r}   r   r   r   _to_anusvaara_relaxed   s   
z$BaseNormalizer._to_anusvaara_relaxedc           	   
   C   s   g dg dg dg dg dg dg}d}d}g }g }|D ]=}t d	jt|| jt|d
 | jt|d | jd}|| djt|d | jt|| jd}|| qtt||| _	dS )rT   rU   rY   r]   ra   re   rg   rk   rI   z {anusvaara}([{start_r}-{end_r}])r   )rq   rn   ro   z{nasal}{halant}\1r   )rl   rm   N)
rr   rs   rt   r   rK   r/   ru   listziprv   )	r   rw   rx   ry   rz   repl_stringsr{   r|   r}   r   r   r   _init_to_nasal_consonants   s4   	
z(BaseNormalizer._init_to_nasal_consonantsc                 C   s    | j D ]
\}}|||}q|S r!   r   )r   r   r|   rR   r   r   r   _to_nasal_consonants   s   z#BaseNormalizer._to_nasal_consonantsc                 C   sF   | j dkr|   d S | j dkr|   d S | j dkr!|   d S d S Nto_anusvaara_strictto_anusvaara_relaxedto_nasal_consonants)r1   r~   r   r   r@   r   r   r   r5      s   


z%BaseNormalizer._init_normalize_nasalsc                 C   s@   | j dkr
| |S | j dkr| |S | j dkr| |S |S r   )r1   r   r   r   r   r   r   r   _normalize_nasals  s   





z BaseNormalizer._normalize_nasalsc                 C   s4   t |dkrt|d | jr|td| j S |S )z
        for Dravidian
        - consonant ending: add 'a' ki maatra
        - halant ending: no change
        - 'a' ki maatra: no change
        r   >   )lenr   is_consonantr/   rK   r   wordr   r   r   r?     s   z5BaseNormalizer._normalize_word_vowel_ending_dravidianc                 C   s6   t |dkrt|d | jr|ttj| j S |S )z
        for IE
        - consonant ending: add halant
        - halant ending: no change
        - 'a' ki maatra: no change
        r   r   )r   r   r   r/   rK   HALANTA_OFFSETr   r   r   r   r<     s   z.BaseNormalizer._normalize_word_vowel_ending_iec                    s   d  fdd|dD S )N c                    s   g | ]}  |qS r   )r=   )rM   wr@   r   r   rN   %  s    z:BaseNormalizer._normalize_vowel_ending.<locals>.<listcomp>)r   splitr   r   r@   r   _normalize_vowel_ending$  s   z&BaseNormalizer._normalize_vowel_endingc                 C   s   | tjd}| tjd}| tjd}| tjd}| tjd}| tjd}| tjd}| tj	d}| 
|}| jrE| |}| |}| jrR| |}|S )zL
        Method to be implemented for normalization for each script
        r   r   )r   r   r   r'   r(   r)   r*   r+   r,   r-   r    r2   rS   r   r3   r   r   r   r   r   r"   '  s   



zBaseNormalizer.normalizec                 C   s   t tttj| t tttj| t tttj| t tttj| t tttj	| t tttj
| t tttj| t tttj| d S r!   )printr   rr   findallr   r   r'   r(   r)   r*   r+   r,   r-   r   r   r   r   get_char_stats@  s   zBaseNormalizer.get_char_statsc                 C   s   t dd|}d S )N([\u0900-\u097f]):   \1ः)rr   r   )r   r   visarga_char
char_ranger   r   r   correct_visargaP  s   zBaseNormalizer.correct_visargaN)Fr.   FF)r#   r$   r%   r&   r8   r6   r4   rS   r~   r   r   r   r   r   r5   r   r?   r<   r   r"   r   r   r   r   r   r   r   P   s0    
$%
r   c                       sH   e Zd ZdZdZ					d fdd	Z fdd	Z fd
dZ  ZS )r   aH  
    Normalizer for the Devanagari script. In addition to basic normalization by the super class,
    * Replaces the composite characters containing nuktas by their decomposed form
    * replace pipe character '|' by poorna virama character
    * replace colon ':' by visarga if the colon follows a charcter in this script

       ़hiFr.   c                    s   t t| ||||| d S r!   )superr   r8   r7   	__class__r   r   r8   _  s   
zDevanagariNormalizer.__init__c                    sT  t t| |}|dd}|ddtj }|ddtj }|ddtj }|d	d
tj }|ddtj }|ddtj }|ddtj }|ddtj }|ddtj }|ddtj }|ddtj }| jr{|tjd}|dd}tdd|}tdd |D }|rt	d |}|D ]}||t
|d!d"}q|S )#N   ॲ   ए   ऩ   न   ऱ   र   ऴ   ळ   क़   क   ख़   ख   ग़   ग   ज़   ज   ड़   ड   ढ़   ढ   फ़   फ   य़   यr   |   ।r   r   c                 s       | ]}|  V  qd S r!   isdigitrM   charr   r   r   	<genexpr>      z0DevanagariNormalizer.__call__.<locals>.<genexpr>\d+r   r/   )r   r   r"   r   NUKTAr0   rr   r   anyr   r   )r   r   
has_digitsdigit_partspartr   r   r   __call__o  s.   zDevanagariNormalizer.__call__c                    s   t t| | tttd| tttd| tttd| tttd| tttd| tttd| tttd| tttd| tttd	| tttd
| tttd| d S )Nr   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   rr   r   r   r   r   r   r     s   z#DevanagariNormalizer.get_char_stats)r   Fr.   FF)	r#   r$   r%   r&   r   r8   r   r   __classcell__r   r   r   r   r   T  s    &r   c                       s>   e Zd ZdZdZ						d
 fdd	Z fdd	Z  ZS )r   zNFork of Devanagiri normalizer. With additional changes for Hindi and tts_mode.r   r   Fr.   c                    "   t t| ||||| || _d S r!   )r   r   r8   tts_moder   r/   r0   r1   r2   r3   r   r   r   r   r8        
	
zHindiNormalizer.__init__c              	      sr  t t| |}|dd}|ddtj }|ddtj }|ddtj }|d	d
tj }|ddtj }|ddtj }|ddtj }|ddtj }|ddtj }|ddtj }|ddtj }| jr{|tjd}|dd}tdd|}| jrtdd|}td d|}td!d|}td"d#|}td$d#|}td%d&|}td'd&|}d(d) }td*||}d+d, }td-||}d.d/ }d0}tj|||tj	d1}td2d3d4 |}d5d6d7d8d9d:d;d<d=}|
 D ]
\}}|||}qd>d? }	td@|	|}| }tdAdB |D }
|
r7tdC|}|D ]}||t|dDdE}q)|S )FNr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   INR\s+(\d+)u   रुपये \1Rs\.\s+(\d+)   ₹\s*(\d+)USD\s+(\d+)u   डॉलर \1
\$\s*(\d+)KRW\s+(\d+)u"   कोरियाई वॉन \1   ₩\s*(\d+)c                 S   sD   |  d|  d}}t|dd}ddd |D }| d| S )	Nr   rI   r   r   r   c                 s   s    | ]	}t |d dV  qdS )r   r   Nr   )rM   digitr   r   r   r     s    zDHindiNormalizer.__call__.<locals>.replace_decimal.<locals>.<genexpr>u    पॉइंट )groupr   r   )rQ   wholefracwhole_words
frac_wordsr   r   r   replace_decimal  s   z1HindiNormalizer.__call__.<locals>.replace_decimal(\d+)\.(\d+)c           
      S      |  d}d}|drd}|tdd  }n|dr&d}|tdd  }|dd}|d }t|dkr:|d nd}|d	}d
dd |D }d}|rf|d}	ddd |	D }|rdd| nd}| | |  S )Nr   r   https://uP   एच टी टी पी एस कोलन स्लैश स्लैश http://uI   एच टी टी पी कोलन स्लैश स्लैश /r   .    डॉट c                 s       | ]
}d  | V  qdS r   Nr   upperrM   r   r   r   r   r         
zBHindiNormalizer.__call__.<locals>.normalize_url.<locals>.<genexpr>u    स्लैश c                 s   "    | ]}|rd  | V  qdS r   r   rM   pr   r   r   r         
r   
startswithr   r   r   strip
rQ   urlschemepartsdomainpathdomain_partsspoken_domainspoken_path
path_partsr   r   r   normalize_url  .   






z/HindiNormalizer.__call__.<locals>.normalize_urlhttps?://[^\s]+c                 S   $   |  dd}ddd |D S )Nr   r   r   c                 s   r   r   r   r   r   r   r   r         zGHindiNormalizer.__call__.<locals>.normalize_bare_url.<locals>.<genexpr>r   r   r   rQ   r   r   r   r   normalize_bare_url     z4HindiNormalizer.__call__.<locals>.normalize_bare_urlY\b(?:www\.)?[\w-]+\.(?:com|in|org|net|edu|gov|ai|co|io|info|biz|nic\.in|ac\.in|gov\.in)\bflags \b([\w\.-]+)@([\w\.-]+)\.(\w+)\bc                 S   &   |  d d|  d d|  d S )Nr   u    एट rI   r      r   mr   r   r   r:        & z*HindiNormalizer.__call__.<locals>.<lambda> or  at 	 percent  plus  minus  equals  star  hash &@%+r   =*#c                 S      d | dS Nr   r   r   r   rQ   r   r   r   expand_acronyms*     z1HindiNormalizer.__call__.<locals>.expand_acronyms\b[A-Z]{2,}\bc                 s   r   r!   r   r   r   r   r   r   0  r   z+HindiNormalizer.__call__.<locals>.<genexpr>\d+\.\d+|\d+r   r   )r   r   r"   r   r   r0   rr   r   r   
IGNORECASEitemslowerr   r   r   r   r   r   r  r  bare_url_tld_pattern
symbol_mapsymr   r+  r   r   r9   r   r   r   r     sv   

zHindiNormalizer.__call__)r   Fr.   FFF)r#   r$   r%   r&   r   r8   r   r   r   r   r   r   r     s    r   c                
       sd   e Zd ZdZdZddddddd	d
dd	Z									d fdd	Zdd Z fddZ  Z	S )r   a  
    Normalizer for the Gurmukhi script. In addition to basic normalization by the super class,
    * Replaces the composite characters containing nuktas by their decomposed form
    * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
    * replace pipe character '|' by poorna virama character
    * replace colon ':' by visarga if the colon follows a charcter in this script
    u   ਼u   ਆ   ਇu   ਈ   ਉu   ਊu   ਏu   ਐu   ਓu   ਔ)	u   ਅਾu   ੲਿu   ੲੀu   ੳੁu   ੳੂu   ੲੇu   ਅੈu   ੳੋu   ਅੌpaFr.   c
           
         s4   t t| ||||| || _|| _|| _|	| _d S r!   )r   r   r8   do_canonicalize_addakdo_canonicalize_tippido_replace_vowel_basesr   )
r   r/   r0   r1   r2   r3   r9  r:  r;  r   r   r   r   r8   R  s   

zPunjabiNormalizer.__init__c                 C   sB   t j D ]
\}}|||}q| jr|dd}|dd}|S )r   u   ੲr6  u   ੳr7  )r   VOWEL_NORM_MAPSr0  r   r;  )r   r   kvr   r   r   _normalize_vowelsj  s   	z#PunjabiNormalizer._normalize_vowelsc              	      sL  | j r
tdd|}| jr|dd}| |}tt| |}|ddtj	 }|ddtj	 }|d	d
tj	 }|ddtj	 }|ddtj	 }|ddtj	 }| j
r`|tj	d}|dd}|dd}|dd}tdd|}| jrtdd|}tdd|}tdd|}tdd|}tdd|}td d!|}td"d!|}d#d$ }td%||}d&d' }td(||}d)d* }d+}tj|||tjd,}td-d.d/ |}d0d1d2d3d4d5d6d7d8}| D ]
\}}|||}qd9d: }	td;|	|}td<d= |D }
|
r$td>|}|D ]}||t|d?d@}q|S )ANz	\u0a71(.)u   ੍u   ੰu   ਂu   ਲ਼u   ਲu   ਸ਼u   ਸu   ਖ਼u   ਖu   ਗ਼u   ਗu   ਜ਼u   ਜu   ਫ਼u   ਫr   u   ੤r   u   ੥   ॥r   z([\u0a00-\u0a7f]):u   ਃr   u   ਰੁਪਏ \1r   r   r   u   ਡਾਲਰ \1	$\s*(\d+)r   u   ਕੋਰੀਆਈ ਵੌਨ \1r   c                 S   $   |  d|  d}}| d| S )Nr   rI   u    ਪੌਇੰਟ r  rQ   r   r   r   r   r   r        z3PunjabiNormalizer.__call__.<locals>.replace_decimalr   c           
      S   r   )Nr   r   r   uS   ਐਚ ਟੀ ਟੀ ਪੀ ਐੱਸ ਕੋਲਨ ਸਲੈਸ਼ ਸਲੈਸ਼ r   uI   ਐਚ ਟੀ ਟੀ ਪੀ ਕੋਲਨ ਸਲੈਸ਼ ਸਲੈਸ਼ r   r   r       ਡਾਟ c                 s   r   r   r   r   r   r   r   r     r   zDPunjabiNormalizer.__call__.<locals>.normalize_url.<locals>.<genexpr>u    ਸਲੈਸ਼ c                 s   r   r   r   r   r   r   r   r     r   r   r   r   r   r   r    r  z1PunjabiNormalizer.__call__.<locals>.normalize_urlr  c                 S   r  )Nr   r   rE  c                 s   r   r   r   r   r   r   r   r     r  zIPunjabiNormalizer.__call__.<locals>.normalize_bare_url.<locals>.<genexpr>r	  r
  r   r   r   r    r  z6PunjabiNormalizer.__call__.<locals>.normalize_bare_urlr  r  r  c                 S   r  )Nr   u    ਐਟ rI   rE  r  r  r  r   r   r   r:     r  z,PunjabiNormalizer.__call__.<locals>.<lambda>r  r  r  r  r  r  r  r  r  c                 S   r'  r(  r)  r*  r   r   r   r+    r,  z3PunjabiNormalizer.__call__.<locals>.expand_acronymsr-  c                 s   r   r!   r   r   r   r   r   r     r   z-PunjabiNormalizer.__call__.<locals>.<genexpr>r.  r8  r   )r9  rr   r   r:  r   r?  r   r   r"   r   r0   r   r/  r0  r   r   r   r2  r   r   r   r     sv   


zPunjabiNormalizer.__call__)	r8  Fr.   FFFFFF)
r#   r$   r%   r&   r   r<  r8   r?  r   r   r   r   r   r   r   9  s2    r   c                       @   e Zd ZdZ						d
 fdd	Zdef fdd	Z  ZS )r   a_  
    Normalizer for the Teluguscript. In addition to basic normalization by the super class,
    * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
    * canonicalize two-part dependent vowel signs
    * replace colon ':' by visarga if the colon follows a charcter in this script
    teFr.   c                    r   r!   )r   r   r8   r   r   r   r   r   r8     r   zTeluguNormalizer.__init__r   c              	      s  t t| |}|dd}|dd}|dd}tdd|}| jrtd	d
|}tdd
|}tdd
|}tdd|}tdd|}tdd|}tdd|}dd }td||}dd }td||}dd }d}tj|||tjd}tddd |}d d!d"d#d$d%d&d'd(}| D ]
\}}|||}qd)d* }	td+|	|}t	d,d- |D }
|
rt
d.|}|D ]}||t|d/d0}q| S )1Nu   ౤r   u   ౥r@  u   ైu   ైz([\u0c00-\u0c7f]):u   ఃr   u   రూపాయలు \1r   r   r   u   డాలర్లు \1rA  r   u%   కొరియన్ వాన్ \1r   c                 S   rB  )Nr   rI   u    పాయింట్ r  rC  r   r   r   r   :  rD  z2TeluguNormalizer.__call__.<locals>.replace_decimalr   c           
      S   r   )Nr   r   r   ub   హెచ్ టి టి పి ఎస్ కొలన్ స్లాష్ స్లాష్ r   uX   హెచ్ టి టి పి కొలన్ స్లాష్ స్లాష్ r   r   r       డాట్ c                 s   r   r   r   r   r   r   r   r   P  r   zCTeluguNormalizer.__call__.<locals>.normalize_url.<locals>.<genexpr>u    స్లాష్ c                 s   r   r   r   r   r   r   r   r   W  r   r   r   r   r   r   r  A  r  z0TeluguNormalizer.__call__.<locals>.normalize_urlr  c                 S   r  )Nr   r   rH  c                 s   r   r   r   r   r   r   r   r   c  r  zHTeluguNormalizer.__call__.<locals>.normalize_bare_url.<locals>.<genexpr>r	  r
  r   r   r   r  a  r  z5TeluguNormalizer.__call__.<locals>.normalize_bare_urlr  r  r  c                 S   r  )Nr   u    అట్ rI   rH  r  r  r  r   r   r   r:   m  r  z+TeluguNormalizer.__call__.<locals>.<lambda>r  r  r  r  r  r  r  r  r  c                 S   r'  r(  r)  r*  r   r   r   r+  ~  r,  z2TeluguNormalizer.__call__.<locals>.expand_acronymsr-  c                 s   r   r!   r   r   r   r   r   r     r   z,TeluguNormalizer.__call__.<locals>.<genexpr>r   rG  r   )r   r   r"   r   rr   r   r   r/  r0  r   r   r   r1  r   r   r   r  r  r3  r4  r5  r   r+  r   r   r   r   r   r   r      s\   

zTeluguNormalizer.__call__)rG  Fr.   FFFr#   r$   r%   r&   r8   strr   r   r   r   r   r   r         	r   c                       sD   e Zd ZdZdZ						d fdd	Zdef fd	d
Z  ZS )r   a0  
    Normalizer for the Gujarati script. In addition to basic normalization by the super class,
    * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
    * replace colon ':' by visarga if the colon follows a charcter in this script
    u   ઼guFr.   c                    r   r!   )r   r   r8   r   r   r   r   r   r8     r   zGujaratiNormalizer.__init__r   c              	      s  t t| |}| jr|tjd}|dd}|dd}tdd|}| jrtdd	|}td
d	|}tdd	|}tdd|}tdd|}tdd|}tdd|}dd }td||}dd }td||}dd }d}tj|||tj	d}tddd |}dd d!d"d#d$d%d&d'}|
 D ]
\}}|||}qd(d) }	td*|	|}td+d, |D }
|
rtd-|}|D ]}||t|d.d/}q| S )0Nr   u   ૤r   u   ૥r@  z([\u0a80-\u0aff]):u   \1ઃr   u   રૂપિયા \1r   r   r   u   ડોલર \1r   r   u   કોરિયન વોન \1r   c                 S   rB  )Nr   rI   u    પોઈન્ટ r  rC  r   r   r   r     rD  z4GujaratiNormalizer.__call__.<locals>.replace_decimalr   c           
      S   r   )Nr   r   r   uO   એચ ટી ટી પીએસ કોલન સ્લેશ સ્લેશ r   uI   એચ ટી ટી પી કોલન સ્લેશ સ્લેશ r   r   r       ડોટ c                 s   r   r   r   r   r   r   r   r     r   zEGujaratiNormalizer.__call__.<locals>.normalize_url.<locals>.<genexpr>u    સ્લેશ c                 s   r   r   r   r   r   r   r   r     r   r   r   r   r   r   r    r  z2GujaratiNormalizer.__call__.<locals>.normalize_urlr  c                 S   r  )Nr   r   rN  c                 s   r   r   r   r   r   r   r   r     r  zJGujaratiNormalizer.__call__.<locals>.normalize_bare_url.<locals>.<genexpr>r	  r
  r   r   r   r    r  z7GujaratiNormalizer.__call__.<locals>.normalize_bare_urlr  r  r  c                 S   r  )Nr   u    એટ rI   rN  r  r  r  r   r   r   r:     r  z-GujaratiNormalizer.__call__.<locals>.<lambda>r  r  r  r  r  r  r  r  r  c                 S   r'  r(  r)  r*  r   r   r   r+    r,  z4GujaratiNormalizer.__call__.<locals>.expand_acronymsr-  c                 s   r   r!   r   r   r   r   r   r     r   z.GujaratiNormalizer.__call__.<locals>.<genexpr>r   rM  r   )r   r   r"   r0   r   r   rr   r   r   r/  r0  r   r   r   r1  rI  r   r   r   r     s^   

zGujaratiNormalizer.__call__)rM  Fr.   FFF	r#   r$   r%   r&   r   r8   rK  r   r   r   r   r   r   r     s    r   c                       sR   e Zd ZdZdZddddZ								d fd
d	Zdef fddZ  Z	S )r   a  
    Normalizer for the Oriya script. In addition to basic normalization by the super class,
    * Replaces the composite characters containing nuktas by their decomposed form
    * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
    * Canonicalize two part dependent vowels
    * Replace 'va' with 'ba'
    * replace pipe character '|' by poorna virama character
    * replace colon ':' by visarga if the colon follows a charcter in this script
    u   ଼u   ଆu   ଐu   ଔ)u   ଅାu   ଏୗu   ଓୗorFr.   c                    s(   t t| ||||| || _|| _d S r!   )r   r   r8   do_remap_war   )r   r/   r0   r1   r2   r3   rQ  r   r   r   r   r8   (     


zOdiaNormalizer.__init__r   c              	      s8  t t| |}tj D ]
\}}|||}q|ddtj }|ddtj }| jr4|tjd}|dd}|dd	}|d
d}| jrO|dd}|dd}|dd}|dd}|dd}t	
dd|}| jrt	
dd|}t	
dd|}t	
dd|}t	
dd|}t	
dd|}t	
dd|}t	
dd|}d d! }t	
d"||}d#d$ }t	
d%||}d&d' }d(}t	j
|||t	jd)}t	
d*d+d, |}d-d.d/d0d1d2d3d4d5}| D ]
\}	}
||	|
}qd6d7 }t	
d8||}td9d: |D }|rt	d;|}|D ]}||t|d<d=}q
| S )>Nu   ଡ଼u   ଡu   ଢ଼u   ଢr   u   ୤r   u   ୥r@  u   ୼u   ୱu   ବu   ଵu   ୈu   ୘u   ୋu   ୋu   ୌu   ୌz([\u0b00-\u0b7f]):u   \1ଃr   u   ଟଙ୍କା \1r   r   r   u   ଡଲାର \1r   r   u"   କୋରିଆନ୍ ୱନ୍ \1r   c                 S   rB  )Nr   rI   u    ପଏଣ୍ଟ r  rC  r   r   r   r   t  rD  z0OdiaNormalizer.__call__.<locals>.replace_decimalr   c           
      S   r   )Nr   r   r   u_   ଏଚ୍ ଟି ଟି ପି ଏସ୍ କୋଲନ୍ ସ୍ଲାଶ୍ ସ୍ଲାଶ୍ r   uU   ଏଚ୍ ଟି ଟି ପି କୋଲନ୍ ସ୍ଲାଶ୍ ସ୍ଲାଶ୍ r   r   r       ଡଟ୍ c                 s   r   r   r   r   r   r   r   r     r   zAOdiaNormalizer.__call__.<locals>.normalize_url.<locals>.<genexpr>u    ସ୍ଲାଶ୍ c                 s   r   r   r   r   r   r   r   r     r   r   r   r   r   r   r  z  r  z.OdiaNormalizer.__call__.<locals>.normalize_urlr  c                 S   r  )Nr   r   rS  c                 s   r   r   r   r   r   r   r   r     r  zFOdiaNormalizer.__call__.<locals>.normalize_bare_url.<locals>.<genexpr>r	  r
  r   r   r   r    r  z3OdiaNormalizer.__call__.<locals>.normalize_bare_urlr  r  r  c                 S   r  )Nr   u    ଏଟ୍ rI   rS  r  r  r  r   r   r   r:     r  z)OdiaNormalizer.__call__.<locals>.<lambda>r  r  r  r  r  r  r  r  r  c                 S   r'  r(  r)  r*  r   r   r   r+    r,  z0OdiaNormalizer.__call__.<locals>.expand_acronymsr-  c                 s   r   r!   r   r   r   r   r   r     r   z*OdiaNormalizer.__call__.<locals>.<genexpr>r   rP  r   )r   r   r"   r<  r0  r   r   r0   rQ  rr   r   r   r/  r   r   r   r1  )r   r   r=  r>  r   r  r  r3  r4  r5  r   r+  r   r   r   r   r   r   r   <  st   
zOdiaNormalizer.__call__)rP  Fr.   FFFF)
r#   r$   r%   r&   r   r<  r8   rK  r   r   r   r   r   r   r     s     
	r   c                       sF   e Zd ZdZdZ							d fdd	Zdef fd	d
Z  ZS )r	   a  
    Normalizer for the Bengali script. In addition to basic normalization by the super class,
    * Replaces the composite characters containing nuktas by their decomposed form
    * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
    * Canonicalize two part dependent vowels
    * replace pipe character '|' by poorna virama character
    * replace colon ':' by visarga if the colon follows a charcter in this script
    u   ়bnFr.   c                    s(   t t| ||||| || _d| _d S )NT)r   r	   r8   do_remap_assamese_charsr   )r   r/   r0   r1   r2   r3   rU  r   r   r   r   r8     rR  zBengaliNormalizer.__init__r   c              	      s4  t t| |}|ddtj }|ddtj }|ddtj }| jr-|tjd}| jrA| jdkrA|d	d
}|dd}|dd}|dd}|dd}|dd}|dd}|dd}t	dd|}| j
rt	dd|}t	dd|}t	dd|}t	dd|}t	dd|}t	d d!|}t	d"d!|}d#d$ }t	d%||}d&d' }t	d(||}d)d* }d+}tj	|||tjd,}t	d-d.d/ |}d0d1d2d3d4d5d6d7d8}| D ]
\}}|||}qd9d: }	t	d;|	|}td<d= |D }
|
rtd>|}|D ]}||t|d?d@}q| S )ANu   ড়u   ডu   ঢ়u   ঢu   য়u   যr   asu   ৰu   রu   ৱu   বu   ৤r   u   ৥r@  r   u   ৷u   োu   োu   ৌu   ৌz([\u0980-\u09ff]):u   \1ঃr   u   রুপি \1r   r   r   u   ডলার \1r   r   u"   কোরিয়ান ওন \1r   c                 S   rB  )Nr   rI   u    পয়েন্ট r  rC  r   r   r   r     rD  z3BengaliNormalizer.__call__.<locals>.replace_decimalr   c           
      S   r   )Nr   r   r   u_   এইচ টি টি পি এস কোলন স্ল্যাশ স্ল্যাশ r   uX   এইচ টি টি পি কোলন স্ল্যাশ স্ল্যাশ r   r   r       ডট c                 s   r   r   r   r   r   r   r   r   &  r   zDBengaliNormalizer.__call__.<locals>.normalize_url.<locals>.<genexpr>u    স্ল্যাশ c                 s   r   r   r   r   r   r   r   r   -  r   r   r   r   r   r   r    s.   






z1BengaliNormalizer.__call__.<locals>.normalize_urlr  c                 S   r  )Nr   r   rW  c                 s   r   r   r   r   r   r   r   r   9  r  zIBengaliNormalizer.__call__.<locals>.normalize_bare_url.<locals>.<genexpr>r	  r
  r   r   r   r  7  r  z6BengaliNormalizer.__call__.<locals>.normalize_bare_urlr  r  r  c                 S   r  )Nr   u    অ্যাট rI   rW  r  r  r  r   r   r   r:   C  r  z,BengaliNormalizer.__call__.<locals>.<lambda>r  r  r  r  r  r  r  r  r  c                 S   r'  r(  r)  r*  r   r   r   r+  U  r,  z3BengaliNormalizer.__call__.<locals>.expand_acronymsr-  c                 s   r   r!   r   r   r   r   r   r   Z  r   z-BengaliNormalizer.__call__.<locals>.<genexpr>r   rT  r   )r   r	   r"   r   r   r0   rU  r/   rr   r   r   r/  r0  r   r   r   r1  rI  r   r   r   r     sr   
zBengaliNormalizer.__call__)rT  Fr.   FFFFrO  r   r   r   r   r	     s    	r	   c                       rF  )r
   a_  
    Normalizer for the Tamil script. In addition to basic normalization by the super class,
    * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
    * canonicalize two-part dependent vowel signs
    * replace colon ':' by visarga if the colon follows a charcter in this script
    taFr.   c                    r   r!   )r   r
   r8   r   r   r   r   r   r8   k  r   zTamilNormalizer.__init__r   c              	      s  t t| |}|dd}|dd}|dd}|dd}|d	d
}|dd}tdd|}| jrtdd|}tdd|}tdd|}tdd|}tdd|}tdd|}tdd|}dd }td||}dd }td||}dd  }d!}tj|||tjd"}td#d$d% |}d&d'd(d)d*d+d,d-d.}| D ]
\}}|||}qd/d0 }	td1|	|}t	d2d3 |D }
|
rt
d4|}|D ]}||t|d5d6}q| S )7Nu   ௤r   u   ௥r@  u   ஔu   ஔu   ொu   ொu   ோu   ோu   ௌu   ௌz([\u0b80-\u0bff]):u   \1ஃr   u   ரூபாய் \1r   r   r   u   டாலர் \1r   r   u   கொரிய வான் \1r   c                 S   rB  )Nr   rI   u    பாயிண்ட் r  rC  r   r   r   r     rD  z1TamilNormalizer.__call__.<locals>.replace_decimalr   c           
      S   r   )Nr   r   r   u_   எச் டி டி பி எஸ் கோலன் ஸ்லாஷ் ஸ்லாஷ் r   uU   எச் டி டி பி கோலன் ஸ்லாஷ் ஸ்லாஷ் r   r   r       டாட் c                 s   r   r   r   r   r   r   r   r     r   zBTamilNormalizer.__call__.<locals>.normalize_url.<locals>.<genexpr>u    ஸ்லாஷ் c                 s   r   r   r   r   r   r   r   r     r   r   r   r   r   r   r    r  z/TamilNormalizer.__call__.<locals>.normalize_urlr  c                 S   r  )Nr   r   rY  c                 s   r   r   r   r   r   r   r   r     r  zGTamilNormalizer.__call__.<locals>.normalize_bare_url.<locals>.<genexpr>r	  r
  r   r   r   r    r  z4TamilNormalizer.__call__.<locals>.normalize_bare_urlr  r  r  c                 S   r  )Nr   u    அட் rI   rY  r  r  r  r   r   r   r:     r  z*TamilNormalizer.__call__.<locals>.<lambda>r  r  r  r  r  r  r  r  r  c                 S   r'  r(  r)  r*  r   r   r   r+    r,  z1TamilNormalizer.__call__.<locals>.expand_acronymsr-  c                 s   r   r!   r   r   r   r   r   r     r   z+TamilNormalizer.__call__.<locals>.<genexpr>r   rX  r   )r   r
   r"   r   rr   r   r   r/  r0  r   r   r   r1  rI  r   r   r   r   }  sb   
zTamilNormalizer.__call__)rX  Fr.   FFFrJ  r   r   r   r   r
   c  rL  r
   c                       rF  )r   aa  
    Normalizer for the Kannada script. In addition to basic normalization by the super class,
    * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
    * canonicalize two-part dependent vowel signs
    * replace colon ':' by visarga if the colon follows a charcter in this script
    knFr.   c                    r   r!   )r   r   r8   r   r   r   r   r   r8     r   zKannadaNormalizer.__init__r   c              	      s  t t| |}|dd}|dd}|dd}|dd}|d	d
}|dd}|dd}tdd|}| jrtdd|}tdd|}tdd|}tdd|}tdd|}tdd|}tdd|}dd }td||}dd }td ||}d!d" }d#}tj|||tjd$}td%d&d' |}d(d)d*d+d,d-d.d/d0}| D ]
\}}|||}qd1d2 }	td3|	|}t	d4d5 |D }
|
rt
d6|}|D ]}||t|d7d8}q| S )9Nu   ೤r   u   ೥r@  u   ೀu   ೀu   ೇu   ೇu   ೈu   ೈu   ೊu   ೊu   ೋu   ೋz([\u0c80-\u0cff]):u   \1ಃr   u   ರೂಪಾಯಿ \1r   r   r   u   ಡಾಲರ್ \1r   r   u%   ಕೊರಿಯನ್ ವಾನ್ \1r   c                 S   rB  )Nr   rI   u    ಪಾಯಿಂಟ್ r  rC  r   r   r   r   &  rD  z3KannadaNormalizer.__call__.<locals>.replace_decimalr   c           
      S   r   )Nr   r   r   uk   ಎಚ್ ಟಿ ಟಿ ಪಿ ಎಸ್ ಕೋಲನ್ ಸ್ಲ್ಯಾಶ್ ಸ್ಲ್ಯಾಶ್ r   ua   ಎಚ್ ಟಿ ಟಿ ಪಿ ಕೋಲನ್ ಸ್ಲ್ಯಾಶ್ ಸ್ಲ್ಯಾಶ್ r   r   r       ಡಾಟ್ c                 s   r   r   r   r   r   r   r   r   <  r   zDKannadaNormalizer.__call__.<locals>.normalize_url.<locals>.<genexpr>u    ಸ್ಲ್ಯಾಶ್ c                 s   r   r   r   r   r   r   r   r   C  r   r   r   r   r   r   r  -  r  z1KannadaNormalizer.__call__.<locals>.normalize_urlr  c                 S   ,   |  d}|d}ddd |D }|S )Nr   r   r[  c                 s   r   r   r   r   r   r   r   r   P  r  zIKannadaNormalizer.__call__.<locals>.normalize_bare_url.<locals>.<genexpr>r	  rQ   r   r   spokenr   r   r   r  M     

z6KannadaNormalizer.__call__.<locals>.normalize_bare_urlr  r  r  c                 S   r  )Nr   u    ಅಟ್ rI   r[  r  r  r  r   r   r   r:   [  r  z,KannadaNormalizer.__call__.<locals>.<lambda>r  r  r  r  r  r  r  r  r  c                 S   r'  r(  r)  r*  r   r   r   r+  m  r,  z3KannadaNormalizer.__call__.<locals>.expand_acronymsr-  c                 s   r   r!   r   r   r   r   r   r   r  r   z-KannadaNormalizer.__call__.<locals>.<genexpr>r   rZ  r   )r   r   r"   r   rr   r   r   r/  r0  r   r   r   r1  rI  r   r   r   r     sd   

zKannadaNormalizer.__call__)rZ  Fr.   FFFrJ  r   r   r   r   r     rL  r   c                       sf   e Zd ZdZdddddddZd	d
 Zdd Z								d fdd	Zdef fddZ	  Z
S )r   a  
    Normalizer for the Malayalam script. In addition to basic normalization by the super class,
    * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
    * canonicalize two-part dependent vowel signs
    * Change from old encoding of chillus (till Unicode 5.0) to new encoding
    * replace colon ':' by visarga if the colon follows a charcter in this script
    u   ണu   നu   രu   ലu   ളu   ക)   ൺ   ൻ   ർ   ൽ   ൾ   ൿc                 C   s*   t j D ]\}}||d|}q|S )Nu   {}്)r   CHILLU_CHAR_MAPr0  r   rt   )r   r   chillur   r   r   r   _canonicalize_chillus  s   z)MalayalamNormalizer._canonicalize_chillusc                 C   s   | ddS )Nu	   റ്റu	   ട്ട)r   r   r   r   r   _correct_geminated_T  s   z(MalayalamNormalizer._correct_geminated_TmlFr.   c	           	         s.   t t| ||||| || _|| _|| _d S r!   )r   r   r8   do_canonicalize_chillusdo_correct_geminated_Tr   )	r   r/   r0   r1   r2   r3   rk  rl  r   r   r   r   r8     s   

zMalayalamNormalizer.__init__r   c              	      s6  | dd}| dd}| dd}| dd}| d	d
}| dd}| jr,| |}tt| |}| dd}| dd}| dd}| dd}| dd}| dd}| jr`| |}t	dd|}| dd}| j
rt	dd|}t	dd|}t	dd|}t	d d!|}t	d"d!|}t	d#d$|}t	d%d$|}d&d' }t	d(||}d)d* }t	d+||}d,d- }d.}tj	|||tjd/}t	d0d1d2 |}d3d4d5d6d7d8d9d:d;}| D ]
\}}| ||}qd<d= }	t	d>|	|}td?d@ |D }
|
rtdA|}|D ]}| |t|dBdC}q	| S )DNu	   ണ്‍r`  u	   ന്‍ra  u	   ര്‍rb  u	   ല്‍rc  u	   ള്‍rd  u	   ക്‍re  u   ൤r   u   ൥r@  u   ൊu   ൊu   ോu   ോu   ൌu   ൌu   ൗz([\u0d00-\u0d7f]):u   \1ഃu   ു്u   ്r   u   രൂപ \1r   r   r   u   ഡോളർ \1r   r   u   കൊറിയൻ വോൺ \1r   c                 S   rB  )Nr   rI   u    പോയിന്റ് r  rC  r   r   r   r     rD  z5MalayalamNormalizer.__call__.<locals>.replace_decimalr   c           
      S   r   )Nr   r   r   ub   എച്ച് ടി ടി പി എസ് കോളൺ സ്ലാഷ് സ്ലാഷ് r   uX   എച്ച് ടി ടി പി കോളൺ സ്ലാഷ് സ്ലാഷ് r   r   r       ഡോട്ട് c                 s   r   r   r   r   r   r   r   r     r   zFMalayalamNormalizer.__call__.<locals>.normalize_url.<locals>.<genexpr>u    സ്ലാഷ് c                 s   r   r   r   r   r   r   r   r     r   r   r   r   r   r   r    s.   






z3MalayalamNormalizer.__call__.<locals>.normalize_urlr  c                 S   r\  )Nr   r   rm  c                 s   r   r   r   r   r   r   r   r     r  zKMalayalamNormalizer.__call__.<locals>.normalize_bare_url.<locals>.<genexpr>r	  r]  r   r   r   r    r_  z8MalayalamNormalizer.__call__.<locals>.normalize_bare_urlr  r  r  c                 S   r  )Nr   u    അറ്റ് rI   rm  r  r  r  r   r   r   r:     r  z.MalayalamNormalizer.__call__.<locals>.<lambda>r  r  r  r  r  r  r  r  r  c                 S   r'  r(  r)  r*  r   r   r   r+    r,  z5MalayalamNormalizer.__call__.<locals>.expand_acronymsr-  c                 s   r   r!   r   r   r   r   r   r   $  r   z/MalayalamNormalizer.__call__.<locals>.<genexpr>r   rj  r   )r   rk  rh  r   r   r"   rl  ri  rr   r   r   r/  r0  r   r   r   r1  rI  r   r   r   r     sx   



zMalayalamNormalizer.__call__)rj  Fr.   FFFFF)r#   r$   r%   r&   rf  rh  ri  r8   rK  r   r   r   r   r   r   r   {  s*    		r   )__all__rr   indic_numtowordsr   r   r   objectr   r   r   r   r   r   r   r   r	   r
   r   r   r   r   r   r   <module>   s4   3  Q  N  	 3   