o
    KεiQ                     @   s4   d Z ddlmZ ddlmZmZ G dd deZdS )z
@author:Bengali.AI
    )print_function   )BaseNormalizer	languagesc                       s   e Zd Z			d% fdd	Zdd Zdd Zd	d
 Zdd Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Z  ZS )&
NormalizerFNc                    s   |dkr	t d j}t d j| _tt| jd|||d ddd| _| j| jd< dd	d
ddd| _	| j
| jd< 	 g d| _| j| jd< | j| jd< | j| jd< | j| jd< dS )u0  
            initialize a normalizer
            args:
                allow_english                   :   allow english letters numbers and punctuations [default:False]
                keep_legacy_symbols             :   legacy symbols will be considered as valid unicodes[default:False]
                                                    '৺':Isshar 
                                                    '৻':Ganda
                                                    'ঀ':Anji (not '৭')
                                                    'ঌ':li
                                                    'ৡ':dirgho li
                                                    'ঽ':Avagraha
                                                    'ৠ':Vocalic Rr (not 'ঋ')
                                                    '৲':rupi
                                                    '৴':currency numerator 1
                                                    '৵':currency numerator 2
                                                    '৶':currency numerator 3
                                                    '৷':currency numerator 4
                                                    '৸':currency numerator one less than the denominator
                                                    '৹':Currency Denominator Sixteen
                legacy_maps                     :   a dictionay for changing legacy symbols into a more used  unicode 
                                                    a default legacy map is included in the language class as well,
                                                    legacy_maps={'ঀ':'৭',
                                                                'ঌ':'৯',
                                                                'ৡ':'৯',
                                                                '৵':'৯',
                                                                '৻':'ৎ',
                                                                'ৠ':'ঋ',
                                                                'ঽ':'ই'}
                                            
                                                    pass-   
                                                    * legacy_maps=None; for keeping the legacy symbols as they are
                                                    * legacy_maps="default"; for using the default legacy map
                                                    * legacy_maps=custom dictionary(type-dict) ; which will map your desired legacy symbol to any of symbol you want
                                                        * the keys in the custiom dicts must belong to any of the legacy symbols
                                                        * the values in the custiom dicts must belong to either vowels,consonants,numbers or diacritics  
                                                        vowels         =   ['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ']
                                                        consonants     =   ['ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ','জ', 'ঝ', 'ঞ', 
                                                                            'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 
                                                                            'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 
                                                                            'স', 'হ','ড়', 'ঢ়', 'য়','ৎ']    
                                                        numbers        =    ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']
                                                        vowel_diacritics       =   ['া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ']
                                                        consonant_diacritics   =   ['ঁ', 'ং', 'ঃ']
    
                                                        > for example you may want to map 'ঽ':Avagraha as 'হ' based on visual similiarity 
                                                            (default:'ই')

                ** legacy contions: keep_legacy_symbols and legacy_maps operates as follows 
                    case-1) keep_legacy_symbols=True and legacy_maps=None
                        : all legacy symbols will be considered valid unicodes. None of them will be changed
                    case-2) keep_legacy_symbols=True and legacy_maps=valid dictionary example:{'ঀ':'ক'}
                        : all legacy symbols will be considered valid unicodes. Only 'ঀ' will be changed to 'ক' , others will be untouched
                    case-3) keep_legacy_symbols=False and legacy_maps=None
                        : all legacy symbols will be removed
                    case-4) keep_legacy_symbols=False and legacy_maps=valid dictionary example:{'ঽ':'ই','ৠ':'ঋ'}
                        : 'ঽ' will be changed to 'ই' and 'ৠ' will be changed to 'ঋ'. All other legacy symbols will be removed
        defaultbangla)languageallow_englishkeep_legacy_symbolslegacy_maps   র   ব)u   ৰu   ৱAssameseReplacement"u   ।-')r   u   ৷u   –r   r   PunctuationReplacement)   তu   থu   নr   u   ম   যr   base_bangla_composeToAndHosontoNormalizeNormalizeConjunctsDiacriticsComplexRootNormalizationN)r   r   complex_rootssuperr   __init__assamese_mapreplaceAssameseword_level_opspunctuations_mapreplacePunctuations%valid_consonants_after_to_and_hosontobaseComposedecomp_level_opsnormalizeToandHosontocleanInvalidConjunctDiacriticsconvertComplexRoots)selfr
   r   r   	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/bnunicodenormalizer/normalizer.pyr      s.   >

zNormalizer.__init__c                 C      |  | j d S N)replaceMapsr   r(   r+   r+   r,   r   {      zNormalizer.replaceAssamesec                 C   r-   r.   )r/   r    r0   r+   r+   r,   r!   ~   r1   zNormalizer.replacePunctuationsc                 C   s   t | jD ]l\}}|t| jd k rq|| jjv rq| j|d  | jjv rq|| j|d  kr2d | j|< q|dv rH| j|d  dkrH| ||d  q|dkr]| j|d  dkr]d | j|d < q|dkrq| j|d  dkrqd | j|d < qd S )Nr   )   ং   ঃu   ঁr2   r3   )	enumeratedecomplenlangconsonant_diacriticsswapIdxsr(   idxdr+   r+   r,   cleanConsonantDiacritics   s   "z#Normalizer.cleanConsonantDiacriticsc                 C   s  t | jD ]!\}}|dkr| j| dv rd | j|< q| j| dkr&d| j|< qdd | jD | _t | jD ]\}}|dkr| j| dkr|t| jd krSd | j|< q5| j|d  | jjkrkd | j|< d | j|d < q5| j|d  dkrzd | j|< q5|dkr| j|d	  | jjkrd | j|< q5|t| jd k r| j|d  | jjkrd | j|< q5|t| jd	 k r| j|d	  d
kr| j|d  | jjkrd | j|< q5| j|d   | j| 7  < d | j|< q5dd | jD | _d S )Nr   )   ‌   ‍r>   r?   c                 S      g | ]}|d ur|qS r.   r+   .0xr+   r+   r,   
<listcomp>       z-Normalizer.fixNoSpaceChar.<locals>.<listcomp>r   r      r   c                 S   r@   r.   r+   rA   r+   r+   r,   rD      rE   r4   r5   r6   r7   	connectorr:   r+   r+   r,   fixNoSpaceChar   s6   

(:
zNormalizer.fixNoSpaceCharc                 C   sZ  t | jD ]z\}}|t| jd k r|| jjkrE| j|d  dkrE| j|d  dvrE| j|d  | jjv s@| j|d  | jjv rEd | j|< || jjkrb| j|d  dkrb| j|d  dkrbd | j|< || jjkr| j|d  dkr| j|d  dvrd | j|< qdd | jD | _d| j}d	|v r|d	d
}d|v r|dd
}dd |D | _d S )Nr   r   )u   অ   এr   )u   জu   দu   ধr   r   r   u   লc                 S   r@   r.   r+   )rB   r<   r+   r+   r,   rD      rE   z4Normalizer.cleanInvalidConnector.<locals>.<listcomp> u   এ্যাu   অ্যাu	   অ্যc                 S   s   g | ]}|qS r+   r+   )rB   chr+   r+   r,   rD      s    )r4   r5   r6   r7   rH   invalid_connectorsjoinreplace)r(   r;   r<   wordr+   r+   r,   cleanInvalidConnector   s"   0,
0
0
z Normalizer.cleanInvalidConnectorc                 C   s0  t | jD ]\}}|t| jd k r|dkr| j|d  | jjkr|t| jd k r| j|d  | jvrAd| j|< d| j|d < q|t| jd k r| j|d  dkr| j|d  | jjkr|t| jd k r|| j|d  dvr|d| j|< d| j|d < |t| jd k r| j|d  d	krd| j|d < qdS )
u4  
            normalizes to+hosonto for ['ত','থ','ন','ব','ম','য','র'] 
            # Example-1:
            (a)বুত্পত্তি==(b)বুৎপত্তি-->False
                (a) breaks as ['ব', 'ু', 'ত', '্', 'প', 'ত', '্', 'ত', 'ি']
                (b) breaks as ['ব', 'ু', 'ৎ', 'প', 'ত', '্', 'ত', 'ি']
            # Example-2:
            (a)উত্স==(b)উৎস-->False
                (a) breaks as ['উ', 'ত', '্', 'স']
                (b) breaks as ['উ', 'ৎ', 'স']
        r   r   rF      ৎN      )r   r   r   r   )r4   r5   r6   r7   rH   r"   r:   r+   r+   r,   convertToAndHosonto   s$   
(
zNormalizer.convertToAndHosontoc                 C   sX   t | jD ]$\}}|t| jd k r)|dkr)| j|d  | jjv r)| ||d  qdS )z4
            puts diacritics in right place
        r   rR   N)r4   r5   r6   r7   
diacriticsr9   r:   r+   r+   r,   swapToAndHosontoDiacritics   s   z%Normalizer.swapToAndHosontoDiacriticsc                 C   s$   |  | j |  | j |   d S r.   )safeoprU   rW   r#   r0   r+   r+   r,   r%     s   z Normalizer.normalizeToandHosontoc                 C   sf   t | jD ]+\}}|| jjv r0| j|d  | jjv r0| j|d  dkr)d| j|< qd| j|d < qdS )ui  
            takes care of vowels and modifier followed by vowel diacritics
            # Example-1:
            (a)উুলু==(b)উলু-->False
                (a) breaks as ['উ', 'ু', 'ল', 'ু']
                (b) breaks as ['উ', 'ল', 'ু']
            # Example-2:
            (a)আর্কিওোলজি==(b)আর্কিওলজি-->False
                (a) breaks as ['আ', 'র', '্', 'ক', 'ি', 'ও', 'ো', 'ল', 'জ', 'ি']
                (b) breaks as ['আ', 'র', '্', 'ক', 'ি', 'ও', 'ল', 'জ', 'ি']
            

            Also Normalizes 'এ' and 'ত্র'
            # Example-1:
            (a)একএে==(b)একত্রে-->False
                (a) breaks as ['এ', 'ক', 'এ', 'ে']
                (b) breaks as ['এ', 'ক', 'ত', '্', 'র', 'ে']
                
        r   rJ   Nu	   ত্র)r4   r5   r7   vowel_diacriticsvowelsr:   r+   r+   r,   #cleanVowelDiacriticComingAfterVowel
  s   "z.Normalizer.cleanVowelDiacriticComingAfterVowelc                 C   sV   t | jD ]#\}}|t| jd k r(|| jjkr(| j|d  dkr(d| j|d < qd S )Nr   u   য়r   rG   r:   r+   r+   r,   fixTypoForJoFola*  s   zNormalizer.fixTypoForJoFolac                 C   s   t | jD ]N\}}|t| jd k rS|| jjkrS| j|d  | jjv rS| j|d  | jjkrS| j|d  | jjv rS| j|d  | j|d  krSd | j|< d | j|d < qd S )NrS   r   rF   )r4   r5   r6   r7   rH   
consonantsr:   r+   r+   r,   cleanDoubleCC0  s   ",
zNormalizer.cleanDoubleCCc                 C   s   t | jD ]<\}}|t| jd k rA|dkrA| j|d  | jjkrA| j|d  dkrA| j|d  | jjkrAd | j|< d | j|d < qd S )NrS   r   r   rF   rG   r:   r+   r+   r,   cleanDoubleRef<  s   (
zNormalizer.cleanDoubleRefc                 C   sl   t | jD ].\}}|t| jd k r3|| jjkr3| j|d  dkr3| j|d  | jjkr3d | j|d < qd S )NrF   r   r   rG   r:   r+   r+   r,   cleanConnectotForJoFolaD  s   4z"Normalizer.cleanConnectotForJoFolac                 C   s<   |  | j |  | j |  | j |  | j |   dS )u?  
            cleans repeated folas
            # Example-1:
            (a)গ্র্রামকে==(b)গ্রামকে-->False
                (a) breaks as ['গ', '্', 'র', '্', 'র', 'া', 'ম', 'ক', 'ে']
                (b) breaks as ['গ', '্', 'র', 'া', 'ম', 'ক', 'ে']
        N)rX   r\   r^   r_   r`   r#   r0   r+   r+   r,   r&   L  s
   z)Normalizer.cleanInvalidConjunctDiacriticsc           
      C   s   g }g }t |D ][\}}|dkrc||vrc|}|t|d kr$|| qt|d t|dD ]4}|| }|d | }	|	| jvrF||  n|t|d krV|	}|| q.|	}|| ||	 q.qd|S )Nu   ্r   rF   rK   )r4   r6   appendranger   rN   )
r(   rootformed
formed_idxicrjr<   kr+   r+   r,   checkComplexRoot^  s,   




zNormalizer.checkComplexRootc                 C   s`   |    dd | jD | _|   t| jD ]\}}|| jvr-| jj|v r-| || j|< qd S )Nc                 S   r@   r.   r+   rA   r+   r+   r,   rD   }  rE   z2Normalizer.convertComplexRoots.<locals>.<listcomp>)rI   r5   constructComplexDecompr4   r   r7   rH   rk   r:   r+   r+   r,   r'   {  s   zNormalizer.convertComplexRoots)FFN)__name__
__module____qualname__r   r   r!   r=   rI   rQ   rU   rW   r%   r[   r\   r^   r_   r`   r&   rk   r'   __classcell__r+   r+   r)   r,   r      s*    k+(	 r   N)__doc__
__future__r   baser   r   r   r+   r+   r+   r,   <module>   s   