o
    Kεi{6                     @   s<   d Z ddlmZ ddlmZ ddlmZ G dd deZdS )	z
@author:Bengali.AI
    )print_function   )	languages)langsc                   @   s   e Zd Z			d+ddZdd Zdd Zd	d
 Zdd Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* ZdS ),BaseNormalizerFNc                 C   s  t |tks
J d|t v sJ dt |tksJ dt |tks(J dt| | _|durjt |tks;J dt| dksGJ d| D ]\}}|| jj	v s\J | d	|| jj
v siJ | d
qK| jj| _| jj| _|| _|rttt| jtd j | _ttt| jtd j | _|rttt| j| jj	 | _ttt| j| jj	 | _| j| jd| _| j| j| j| j| jd| _dS )aa  
            initialize a normalizer
            args:
                language                        :   language identifier/name  type(str)
                allow_english                   :   allow english letters numbers and punctuations [default:False] type(bool)
                keep_legacy_symbols             :   legacy symbols will be considered as valid unicodes[default:False] type(bool)
                legacy_maps                     :   a dictionay for changing legacy symbols into a more used  unicode [default:None] type(dict/None)
                                                    
        zlanguage is not string type!!!zLanguage is not availablez1allow_english is not of type boolean [True/False]z7keep_legacy_symbols is not of type boolean [True/False]Nz'legacy_maps is not of type dict or Noner   zlegacy_maps is an empty dictzO is not a legacy symbol.See README.md initialization section for legacy symbolszR is not a valid legacy map.See README.md initialization section for legacy symbolsenglish)LegacySymbolsBrokenDiacritics)BrokenNuktaInvalidUnicodeInvalidConnectorFixDiacriticsVowelDiacriticAfterVowel)typestrr   keysboollangdictlenitemslegacy_symbolsusedvalidcomplex_rootsrootslegacy_mapssortedlistsetmapLegacySymbolsfixBrokenDiacriticsword_level_opsfixBrokenNuktacleanInvalidUnicodescleanInvalidConnectorcleanDiacritics#cleanVowelDiacriticComingAfterVoweldecomp_level_ops)selflanguageallow_englishkeep_legacy_symbolsr   kv r/   L/home/ubuntu/.local/lib/python3.10/site-packages/bnunicodenormalizer/base.py__init__   s:   


zBaseNormalizer.__init__c                 C   s   t | jdkr	dS dS )Nr   TF)r   decompr)   r/   r/   r0   checkDecompA   s   zBaseNormalizer.checkDecompc                 C   s&   |  D ]\}}| j||| _qd S N)r   wordreplace)r)   map_dictr-   r.   r/   r/   r0   replaceMapsE   s   zBaseNormalizer.replaceMapsc                 C   s(   | j | }| j | | j |< || j |< d S r5   )r2   )r)   idx1idx2tempr/   r/   r0   swapIdxsI   s   
zBaseNormalizer.swapIdxsc                 C   s^   dd | j D | _ dd d| j D | _ |  dd | j D | _ dd d| j D | _ d S )Nc                 S      g | ]}|d ur|qS r5   r/   .0xr/   r/   r0   
<listcomp>P       z)BaseNormalizer.safeop.<locals>.<listcomp>c                 S   r>   r5   r/   r?   r/   r/   r0   rB   Q   rC    c                 S   r>   r5   r/   r?   r/   r/   r0   rB   T   rC   c                 S   r>   r5   r/   r?   r/   r/   r0   rB   U   rC   )r2   join)r)   opr/   r/   r0   safeopN   s
   zBaseNormalizer.safeopc                    sT   j j jv r fddt jD }dd |D }g }t|dkrm|^}}t|}d}t||kr\t|}g }|D ]}t|t|dkrN|t|O }q:|| q:|}t||ks2|tt	| |}t|dks"g }	|D ]+}
d}|
D ]	}| j| 7 }qw|	| |
D ]}||
d kr| j|< qd j|< qqqdd  jD  _dS dS )	z9
            creates grapheme root based decomp 
        c                    s    g | ]\}}| j jkr|qS r/   )r   	connector)r@   irA   r3   r/   r0   rB   a   s     z9BaseNormalizer.constructComplexDecomp.<locals>.<listcomp>c                 S   s   g | ]}|d  ||d  gqS )r   r/   )r@   cidr/   r/   r0   rB   c   s    r   rD   Nc                 S   r>   r5   r/   )r@   dr/   r/   r0   rB      rC   )
r   rH   r2   	enumerater   r   intersectionappendr   r   )r)   c_idxscompsr_decompfirstrestlfrest2rcombsridxcombrI   r/   r3   r0   constructComplexDecomp[   sB   
z%BaseNormalizer.constructComplexDecompc                 C   s   | j d ur| | j  d S d S r5   )r   r9   r3   r/   r/   r0   r       s   
zBaseNormalizer.mapLegacySymbolsc                 C   s"   | j jd ur| | j j d S d S r5   )r   diacritic_mapr9   r3   r/   r/   r0   r!      s   z"BaseNormalizer.fixBrokenDiacriticsc                 C   s   | j jd ur?t| jD ]5\}}|| j jkr>t|d ddD ] }| j| | j j v r=| j j| j|  | j|< d | j|<  nqqd S d S )Nr   rK   )r   	nukta_maprM   r2   nuktaranger   )r)   idxrL   cidxr/   r/   r0   r#      s   
zBaseNormalizer.fixBrokenNuktac                 C   s   |   r"| jd | jjv r"| jdd  | _|   r"| jd | jjv s|   rD| jd | jjkrD| jd d | _|   rD| jd | jjks/|   r\t| jD ]\}}|| jvr[d | j|< qMd S d S )Nr   r   rK   )r4   r2   r   invalid_startsrH   rM   r   r)   r`   rL   r/   r/   r0   r$      s   

z#BaseNormalizer.cleanInvalidUnicodesc                 C   sl   t | jD ].\}}|t| jd k r3|| jjkr3| j|d  | jjv s.| j|d  | jjv r3d | j|< qd S Nr   )rM   r2   r   r   rH   invalid_connectorsrc   r/   r/   r0   r%      s   ,
z$BaseNormalizer.cleanInvalidConnectorc                 C   sx   t | jD ]4\}}|t| jd k r9|| jjv r9| j|d  | jjv r9|| j|d  kr2d | j|< qd | j|d < qd S rd   )rM   r2   r   r   vowel_diacriticsrc   r/   r/   r0   cleanVowelDiacritics   s   "z#BaseNormalizer.cleanVowelDiacriticsc                 C   sh   t | jD ],\}}|t| jd k r1|| jjv r1| j|d  | jjv r1|| j|d  kr1d | j|< qd S rd   )rM   r2   r   r   consonant_diacriticsrc   r/   r/   r0   cleanConsonantDiacritics   s   "
z'BaseNormalizer.cleanConsonantDiacriticsc                 C   s\   t | jD ]&\}}|t| jd k r+|| jjv r+| j|d  | jjv r+| ||d  qd S rd   )rM   r2   r   r   rh   rf   r=   rc   r/   r/   r0   fixDiacriticOrder   s   "z BaseNormalizer.fixDiacriticOrderc                 C   sL   t | jD ]\}}|dkr#|| jjv r#| j|d  | jjv r#d | j|< qd S )Nr   r   )rM   r2   r   
diacritics	non_charsrc   r/   r/   r0   cleanNonCharDiacs   s   "
z BaseNormalizer.cleanNonCharDiacsc                 C   s4   |  | j |  | j |  | j |  | j d S r5   )rG   rg   ri   rj   rm   r3   r/   r/   r0   r&      s   zBaseNormalizer.cleanDiacriticsc                 C   sD   t | jD ]\}}|| jjv r| j|d  | jjv rd | j|< qd S rd   )rM   r2   r   rf   vowelsrc   r/   r/   r0   r'      s
   "
z2BaseNormalizer.cleanVowelDiacriticComingAfterVowelc                 C   s   t | jD ]Z\}}|dkr| j| dv rd | j|< q|t| jd k rA| j| dv rA| j|d  | jjgddg v rAd | j|d < |dkr_| j| dv r_| j|d  d gddg v r_d | j|< qd S )Nr   )   ‌   ‍r   ro   rp   )rM   r2   r   r   rH   rc   r/   r/   r0   fixNoSpaceChar   s   @2
zBaseNormalizer.fixNoSpaceCharc                 C   s@   |  | j |  | j |  | j |  | j |  | j d S r5   )rG   r$   r%   r&   r'   rq   r3   r/   r/   r0   baseCompose   s
   zBaseNormalizer.baseComposec                 C   sT  g }|| _ t| j tstdt| j  ddkr'td| j dd || _	| j
 D ]"\}}| j	dd }|  | j	dd }||krQ||||d q/dd	 | j	D | _| j D ]/\}}d| j}| | |  s|d| j |d
  S d| j}||kr||||d q`| | j ddd	 | jD | _	| j	| j |d
S )a  
            normalizes a given word
            args:
                word    : the string to normalize
            returns: 
                a dictionary- 
                * "given" = provided text
                * "normalized = normalized text (gives None if during the operation length of the text becomes 0)
                * "ops" = list of operations (dictionary) that were executed in given text to create normalized text
                *  each dictionary in ops has:
                    * "operation": the name of the operation / problem in given text
                    * "before" : what the text looked like before the specific operation
                    * "after"  : what the text looks like after the specific operation   
        z+The provided argument/ word is not a string r   ziThe provided string has hultiple words.Make sure no space exists in the middle of the text.probable word:rD   N)	operationbeforeafterc                 S   s   g | ]}|qS r/   r/   )r@   chr/   r/   r0   rB     s    z+BaseNormalizer.__call__.<locals>.<listcomp>)
normalizedgivenopsc                 S   r>   r5   r/   r?   r/   r/   r0   rB   /  rC   )check
isinstancer   	TypeErrorr   stripsplit
ValueErrorr7   r6   r"   r   rO   r2   r(   rE   rG   r4   rr   )r)   r6   detailsop_idrF   word_before_opword_after_opr/   r/   r0   __call__   s6   
zBaseNormalizer.__call__)FFN)__name__
__module____qualname__r1   r4   r9   r=   rG   r[   r    r!   r#   r$   r%   rg   ri   rj   rm   r&   r'   rq   rr   r   r/   r/   r/   r0   r      s0    
3/
	
r   N)__doc__
__future__r   r   r   bnunicodenormalizerobjectr   r/   r/   r/   r0   <module>   s
   