o
    'NiR1                  	   @   s*  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlZi aeeai add ZG d	d
 d
eZG dd deZedkree jdk rged e d e jd dkre jd Ze jd Z e!e jd ddd>Z"e!e jd dddZ#e"$ D ]Z%e&e%ee Z'e#(e' qW d   n1 sw   Y  W d   dS W d   dS 1 sw   Y  dS e jd dkr0e jd Z)e!e jd dddDZ"e!e jd ddd Z#e"$ D ]Z%e*e%e)Z'e'+ddZ'e#(e' qW d   n1 sw   Y  W d   dS W d   dS 1 s)w   Y  dS e jd dkre jd Z)e!e jd ddd?Z"e!e jd dddZ#e"$ D ]Z%e,e%e)Z'e#(e' qWW d   n1 sqw   Y  W d   dS W d   dS 1 sw   Y  dS dS dS )    N)defaultdict)common)langinfo)indic_scripts)SinhalaDevanagariTransliteratorc                  C   s   t jt dd} tj| dd}| D ]B}|d d }t|d d dd	}|t	|< t
|r>t|d
d  |dg nt| | ddddddddddddddddaqd
S )zI
    To be called by library loader, do not call it in your program 
    transliteratezoffset_itrans_map.csvutf-8encoding   itrans
offset_hex   )baseNM   aaiiuuzR^izR^IzL^izL^IldvkShzj~nz.mAUM)AIURRiRRILLiLLILwxgjdnyz.nMOM)ospathjoinr   get_resources_pathpdread_csviterrowsintOFFSET_TO_ITRANSr   is_consonant_offsetITRANS_TO_OFFSETextendappend DUPLICATE_ITRANS_REPRESENTATIONS)itrans_map_fname	itrans_dfrr   o r9   `/home/ubuntu/.local/lib/python3.10/site-packages/indicnlp/transliterate/unicode_transliterate.pyinit   s4   
r;   c                   @   (   e Zd ZdZedd Zedd ZdS )UnicodeIndicTransliteratora   
    Base class for rule-based transliteration among Indian languages. 

    Script pair specific transliterators should derive from this class and override the transliterate() method. 
    They can call the super class 'transliterate()' method to avail of the common transliteration
    c                 C   sl   | dkr(| dkr(| dkr(| d d dks(| d d dks(| d d }dd|  } | dv r.d} | d	kr4d
} | S )N   (         r      )+   ,   -   *   6   7   r9   )offset
subst_charr9   r9   r:   _correct_tamil_mapping]   s    z1UnicodeIndicTransliterator._correct_tamil_mappingc                 C   s   |t jv ro|t jv ro|dkrt| } d}d}|dkrd}d}g }| D ]:}|}t|t j| d  }|t jkrX|t jkrX|dkrX|dkrX|dkrMt|}t	t j| d | }|
| q#|dkrjtd|S d|S | S )z
        convert the source language script (lang1) to target language script (lang2)

        text: text to transliterate
        lang1_code: language 1 code 
        lang1_code: language 2 code 
        sihi r   u   ।u   ॥ta)r   SCRIPT_RANGESsdtsinhala_to_devanagariord!COORDINATED_RANGE_START_INCLUSIVECOORDINATED_RANGE_END_INCLUSIVEr=   rK   chrr3   devanagari_to_sinhalar)   )text
lang1_code
lang2_codeorg_lang2_codetrans_lit_textcnewcrI   r9   r9   r:   r   t   s*   	
$

z(UnicodeIndicTransliterator.transliterateN)__name__
__module____qualname____doc__staticmethodrK   r   r9   r9   r9   r:   r=   U   s    
r=   c                   @   r<   )ItransTransliteratorz:
    Transliterator between Indian scripts and ITRANS
    c                    s    t jv rw dkr-| dd} | dd} | dd} | dd	} | d
d} | dd}  fdd| D }g }|D ]7}t|tt j  d | }t |r]d}t|dkr\|  nt 	|rlt|dkrl|  |
| q:d|S | S )Nmlu   ൺu   ണ്u   ൻu   ന്u   ർu   ര്u   ൽu   ല്u   ൾu   ള്u   ൿu   ക്c                       g | ]}t | qS r9   )isc
get_offset).0r]   	lang_coder9   r:   
<listcomp>       z2ItransTransliterator.to_itrans.<locals>.<listcomp>r   rN   )r   rP   replacer/   getrV   is_halanta_offsetlenpopis_vowel_sign_offsetr2   r)   )rX   rk   offsetsitrans_lr8   r   r9   rj   r:   	to_itrans   s,   


zItransTransliterator.to_itransc                    s  d}t  D ]\}}|| v r| ||} qd}d}g }|d }|t| kr| || }	|	tv rjt|	 }
t|
dkrYt|
d rYt|dkrTt|d  rT|
d g}
n|
d g}
d fdd	|
D }||f}n-t|	dkru||	f}n"|t| k r|| |d k r|d }q|	|d  |d }|}d}|d }|t| ks%|dur|	|d  t
d|}g }tt|d D ]/}t||  rt||d   st||d   s||d  td
 kr|| qt|D ]}|| qd|}|td
 d}|S )z
        TODO: Document this method properly
        TODO: A little hack is used to handle schwa: needs to be documented
        TODO: check for robustness
        rB   r   Nr      r   rN   c                    rf   r9   )r   offset_to_char)ri   r"   langr9   r:   rl      rm   z4ItransTransliterator.from_itrans.<locals>.<listcomp>   )r4   itemsrn   rq   r1   r   is_vowel_offset
is_halantar)   r2   listrangeis_vowel_signis_nuktarx   r3   reversedrr   )rX   rz   MAXCODEkr   startmatchsolutionir   offsr]   temp_outrem_indicesoutr9   ry   r:   from_itrans   sd   		


4

z ItransTransliterator.from_itransN)r_   r`   ra   rb   rc   rv   r   r9   r9   r9   r:   rd      s    
rd   __main__rB   zaUsage: python unicode_transliterate.py <command> <infile> <outfile> <src_language> <tgt_language>r   r   rA   rw   r7   r   r	      r!   romanizephfindicize)-sysstring	itertoolsrer'   collectionsr   indicnlpr   r   indicnlp.scriptr   rg   -indicnlp.transliterate.sinhala_transliteratorr   rQ   pandasr+   r/   r   r1   r4   r;   objectr=   rd   r_   rq   argvprintexitsrc_languagetgt_languageopenifileofile	readlinesliner   transliterated_linewritelanguagerv   rn   r   r9   r9   r9   r:   <module>   st   (8I 



"
	 $
 $-