o
    Ni                     @   s^  d dl mZ d dlmZ ddlmZmZmZmZ ddlm	Z	 d dl
Z
d dlZd dlZd dlZd dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZ d	d
 Zd)ddZd*ddZdd Zdd Zdg g dfddZd dlZejdddd Z dg g fddZ!dd Z"dd  Z#d!d" Z$d#d$ Z%ejddd%d& Z&ejddd'd( Z'dS )+    )
GeneralMap)Transliterator   )ConvertPostOptionsPostProcess
PreProcess)
ConvertFixN)Counter)
getmembers
isfunctionc                 C   s   |  ddkr| ddS d S )Nar    )countreplace)r    r   N/home/ubuntu/.local/lib/python3.10/site-packages/aksharamukha/transliterate.pyremoveA   s   r   c                 c   sh    t  }|j}|du rt|j| D ]	}|| |V  qdS | D ]}||}||vr1|| |V  q dS )zHList unique elements, preserving order. Remember all elements ever seen.N)setadd	itertoolsfilterfalse__contains__)iterablekeyseenseen_addelementkr   r   r   unique_everseen   s   r   Fc              	      s  g } D ]2}z't |dd  }|dkr|| n|t |dd   W q ty6   Y qw t|}g }| D ]\}}|t| d }	||	|f qA|sit|dkrft	|d d }n'd}n$t|dkrt	|d d dkrt	|d	 d }nt	|d d }nd}|d 
 |dd   }
g d
}|
dkrd v sd v rd}
|
S |
dkrt fdd|D rd}
|
S |
dkrd}
|
S |
dkrTd}
g d}t fdd|D rd}
dddd} dd  D ]9}z	t | }W n   Y d|v r|d  d7  < qd|v r|d   d7  < qd!|v r'|d"  d7  < qdd l}t	| |dd#}|d dksL|d  dksL|d" dkrR|d d }
|
S |
d$kr]d%}
|
S |
d&krfd'}
|
S |
d(krod)}
|
S |
d*krxd+}
|
S |
d,krd-}
|
S |
d.krd/}
|
S |
d0krd1}
|
S |
d2krd3}
|
S |
d4krd5}
|
S |
d6krd7}|
S |
d8krd9}
|
S |
d:krd;}
|
S |
d<krd=}
d>d}t fd?d|D rd@}
dAd}t fdBd|D rdC}
|
S |
dDkr1dE}
dFd}t fdGd|D r	dH}
dId}t fdJd|D rdK}
dLd}t fdMd|D r/dN}
|
S |
dOkrg dP}g dQ}g dR}g dS}dT v rOdU}
|
S t fdVd|D r_dW}
|
S t fdXd|D rodY}
|
S t fdZd|D rd[ v sd\ v sd] v rd^}
|
S d_}
|
S t fd`d|D rda}
|
S db}
|
S |
tjv s|
tjv s|
dcv r	 |
S dddlm} | }| }
|
S )eN r   oldr   d   r   latin)u   ຆu   ຉu   ຌu   ຎu   ຏu   ຐu   ຑu   ຒu   ຓu   ຘu   ຠu   ຨu   ຩu   ຬu   ຺Bengaliu   ৰu   ৱAssameseLaoc                 3       | ]}| v V  qd S Nr   .0chartextr   r   	<genexpr>X       zauto_detect.<locals>.<genexpr>LaoPaliBatak	BatakKaroMyanmarBurmese)u   ၚu   ၛu   ္ညu   ၞu   ၟu   ၠu   ဳu   ဨc                    s   g | ]}| v qS r   r   r+   r.   r   r   
<listcomp>`   s    zauto_detect.<locals>.<listcomp>Mon)ShanTaiLaing
KhamtiShanu   ႃshanr9   z	tai laingr:   khamtir;   )r   MeeteiMeeteiMayekDives
DivesAkuruPersian
OldPersianzPhags-paPhagsPaOlSantaliSoraSoraSompengSylotiSylotiNagriTaiTaiThamWarang
WarangCitiSiddhamsiddhamUnicodeCyrillicRussianCyrillic	ZanabazarZanabazarSquareSyriacSyreu   ܲ ܵ ܝܼ ܘܼ ܸ ܹ ܘܿc                 3   r)   r*   r   r+   r.   r   r   r0      r1   Syrnu   ܰ ܺ ܶ ّ ܽc                 3   r)   r*   r   r+   r.   r   r   r0      r1   SyrjArabicArabu   چ گ ژ پ هٔc                 3   r)   r*   r   r+   r.   r   r   r0      r1   zArab-Fau   ڈ ٹ ڑ ھc                 3   r)   r*   r   r+   r.   r   r   r0      r1   Urduu%   ݨ لؕ مھ نھ یھ رھ لھ وھc                 3   r)   r*   r   r+   r.   r   r   r0      r1   	ShahmukhiLatin)u   āu   īu   ūu   ṃu   ḥu   śu   ṣu   ṇu   ṛu   ṝu   ḷu   ḹu   ḻu   ṉu   ṟu   ṭu   ḍu   ṅ   ñ)
zR^izR^IzL^izL^Iz.Nz~Nz~nChshSh)   ʾu   ʿu   šw)   ´u   ˝u   ʻu   ʰTitusc                 3   r)   r*   r   r+   r.   r   r   r0      r1   Latnc                 3   r)   r*   r   r+   r.   r   r   r0      r1   BurmeseRomanLoCc                 3   r)   r*   r   r+   r.   r   r   r0      r1   u   ēu   ōu   r̥ISOIASTc                 3   r)   r*   r   r+   r.   r   r   r0      r1   ItransHKHiraganaKatakana)gimeltra)unicodedatanamesplitlowerappend
ValueErrorr
   itemslensortedupperanyr   operator
itemgetterr   IndicScriptsLatinScriptsr   ro   r   auto_script)r/   pluginscriptsucharscript_namecountsscript_percentscriptr   percentinputScriptlaoPalimoncountSubr-   r{   sorted_x
preOptionseastern_diawestern_diapersian_char	urdu_charshahmukh_char
diacriticsrj   semiticrg   ro   trr   r.   r   auto_detect*   s.  nki



*M
E
C
A
?
=
;
9
7
5
3
1
/


'






"
r   c                 C   s   g }|dkrD|  dd}d| v sd| v sd| v sd| v r!dg}|S d	|v r.d
| v r.dg}|S d	|v r7dg}|S d
| v s?d| v rBdg}|S |dksL|dkru|  dd}d|v r_d| v r_dg}|S d|v rhdg}|S d| v spd| v rsdg}|S |dkr|dg}|S )NThaiu   ห์r   u   ͜u   ̥u   งํu   ็ThaiPhoneticu   ์u   ะThaiSajjhayawithAThaiSajjhayaOrthographyu   ัThaiOrthographyr(   r2   u   ຫ໌u   ໌u   ະLaoSajhayaOrthographywithALaoSajhayaOrthographyu   ັLaoTranscriptionr[   UrduShortNotShown)r   )r/   r   
preoptionstextNewr   r   r   detect_preoptions   s:    r   c                 C   s	  d}|dkrg| t jv rg| t jv r| d } nd| v rd} | t jv r%| | }n| t jv r,d}| t jv rK||d g }|d g| }d}| d	v rJ|d
g }n| t j v rYt j|  | }| t j v rg|t j|   }|dkrz| t jvrz| t j	v rxd}nd}| dkr|t jv r|t jv r|d }nd|v rd}|t jv r||  } n|t jv rd} |t jv r| d g| }| d g| }d}n|t j
 v rt j
| | }|t j v r|t j|  }| dkr|t jvr|t j	v rd} nd} |t j v r$t j| | kr
t| t j| ||||}t j| } t j| | kr$|d g| }|d g| }d}d}| t j v rVt j|  |krE| d g| }| d g| }d} nt| t j|  ||||}t j|  } 	 |dksa|dkrc|S |g kry|g kry|dkry| |kry|S ddddd}|t j	v s|t j v r| | v r||  } | t j	v s| t j v r|| v r|| }| | v r|| v r||  } || }|s| dkrd} |s| dkrd} |s| dkrd} |s| dkrd} | dv r|t jv r|d7 }	 |r| t j	v r|t jv r|d7 }| |kr,| dkr,| dkr,| t j	vr,|}d}d|v re| t jv rCd|v rCt|| }n"t d | d! d"krT|d#g }n|t jvr`|d$g }n|d%g }t|| |||}d&|v rz|d'krzd(}d)|v r|d'krd(}d&|v r| d'krd(} d*|v r|d+krd,}d-|v r| d.krd/} d-|v r|d.krd/}d0|v r|d1krd}d2|v r|d3krd4}d}d5|v r|d3krd4}d}d6|v r|d7krd6}d}d8|v r|d9krd:| }d;|v r| d<krd=}d>|v r| d?krd@}|D ]
}tt||}qdA|v r/| dkr/|dBd}d}	| dkrA| dCkrA| t j	v sX| t jv rS|t j	v rS| t jvsX| dDv r]t|}| dksg| dkrrt| ||}dE}	d} |dks||dkrt| ||||}| dFkr|dGkrt|}| dHkrdI|v rd}
t|dID ]\}}|dJ dKkr|
t || |7 }
q|
|7 }
qnt || |}
|	dEkr|dkrd&|vrt |
dd}
| |kr|}t |
d|}
| t j	vr|dLkr|sdM|v r	ttdM|
}
|r)t!"|
| ||}
|dkr)|dNkr$t#|
}
nt$|
}
dO|v r:|dNkr:t%dPdQ |}|D ]
}tt||
}
q<| dNkra|dGkrat&'dR| dS }|j(|_)|j*}
| dFkrp|dGkrpt+|
}
	 tj|
dTdU}
t,| ||
|||}
|
S )VNr   RomanLoCLoCThamThamLoCrh   TargetF)KhmerLoCSchwaFinalKhmerLoCISO233Sourcerf   IgnoreHebrThaaArab-UrArab-Pa)HebrewThaanar[   r\   r   r[   r\   r   )r   r   u   ׍u   ׌rm   rn   
DevanagariindicDandas|SignMapr   u   ।RetainDevangariDanda
Dot2Dandas	Dot2PipessiddhammuktarO   SiddhamDevanagari	siddhamap	LaoNativer(   Lao2
egrantamilGranthaGranthaGrantamilnepaldevafontNewaranjanalantsaRanjanaTibetanranjanawartuBengaliRaBar&   SoyomboFinalsSoyomborb   BalineseSimplifiedBalineseBalineseSimpleRomanLoCJavaneseSimplifiedJavaneseJavaneseSimpleRomanLoCnovowelshebrewu   ַTyperl   JapaneseOriyaIPArj   z##   r   rZ   arabicRemoveAdditionsPhoneticTamilRemoveDiacriticsc                 S   s   | dkrdS | S )Nr   RemoveDiacriticsTamilr   xr   r   r   <lambda>  s    zconvert.<locals>.<lambda>z&http://anunaadam.appspot.com/api?text=z	&method=2Treverse)-r   
LoCScripts	LoCSrcMap	LoCTgtMap	LoCTgtISO
LocPostPreLoCTgtPostOptionskeysLoCTgtPreOptionsSemiticScriptsLoCSrcPostOptionsLoCSrcPreOptions
semiticISOconvertr}   pipeScriptsr   RetainPipeDanda
CrunchListTransliterationgetattrr   r~   retainLatinJapanesePreProcessr   JapanesePostProcessr	   OriyaIPAFixPre	enumeraterr   r   convertScriptr   ApplyScriptDefaultsr   r   maprequestsgetapparent_encodingencodingr/   OriyaIPAFixdefaultPost)srctgttxtnativizer   postoptionstgtOldIndicSemiticMappingoptionssrcOldtransliterationiwordrr   r   r   r      sn  
















(** *

 $



&





r   Tdefaultc                 C   sl   |dkrt | |||||S |dkrt| |||||S |dkr't| |||||S |dkr4t| |||||S d S )Nr  script_code	lang_code	lang_name)process_defaultprocess_script_tagprocess_lang_tagprocess_lang_name)r   r  r  r  post_optionspre_optionsparamr   r   r   process"  s   r  )maxsizec                 C   s^   dd l }|j|jt}t||  ddd}t|}W d    |S 1 s(w   Y  |S )Nr   r  utf8r   )ospathdirnamerealpath__file__openyaml	safe_load)	file_pathr  dir_pathstreamdata_loadedr   r   r   
_load_data1  s   
r(  c                    s^  t d}tjtj tj }ttdd ttt	ttdd }ttdd tt
t	 ttdd  }	 fdd|D }fd	d|D }|d
 | d }
||v r`d|| v r`t|
 | |vrod|  d }t| ||vr~d| d }t| |dkr| tjvrd|  d }t| | dkr|tjvrd| d }t| t| |||||S )N/yaml/aksharamukha-scripts.yamlc                 S      | d S Nr   r   r   r   r   r   r   >      z!convert_default.<locals>.<lambda>c                 S      |   S r*   rs   r   r   r   r   r   ?  r,  c                 S   r*  r+  r   r   r   r   r   r   A  r,  c                 S   r-  r*   r.  r   r   r   r   r   B  r,  c                    *   g | ]} D ]}|  |  kr|qqS r   r.  r,   option	option_id)postOptionListr   r   r7   D     * z#convert_default.<locals>.<listcomp>c                    r/  r   r.  r0  )preOptionListr   r   r7   E  r4  z uses an hacked font to display the script. In the absence of this font, you text may appear different. 
 See: https://www.aksharamukha.com/describe/z for the font used	font_hackzSource script: zQ not found in the list of scripts supported. The text will not be transliterated.zTarget script: r   zThe LoC romanization of z is not yet supported. The output text will be rendered using ISO 233 if Semitic else ISO 15919. See: https://www.aksharamukha.com/locz is not yet supported. The input text will be treated as if it was ISO 233 if Semitic else ISO 15919.. See: https://www.aksharamukha.com/loc)r(  r   r}   r~   r   listr   r   r   r   r   warningswarnr   r   )r   r  r  r  r  r  r'  
scriptListpreOptionListLowerpostOptionListLowerfont_hack_warningscript_not_foundr   )r3  r5  r   convert_default9  s0   




r?  c                    s   t jt j }ttdd |} dkrt| t| }n  |v r. fdd|D d   |v r?fdd|D d t ||||S )Nc                 S   r-  r*   r.  r   r   r   r   r   a  r,  z!process_default.<locals>.<lambda>
autodetectc                        g | ]}   |  kr|qS r   r.  r,   	script_id)r   r   r   r7   h       z#process_default.<locals>.<listcomp>r   c                    rA  r   r.  rB  )r  r   r   r7   k  rD  )	r   r}   r~   r7  r   r   r   rs   r?  )r   r  r  r  r  r  r:  scriptListLowerr   )r   r  r   r  _  s   r  c                 C   s  t d}t d}g }g }	| dkrd} td |dkr"d}td | D ]}
||
 d }d||
  v rR||
 d d	d
 }ttdd ||
 d d	}nd
}d}|| v rm|||  v rm|| | d }nd
}d| vr||
 d  |  kr|||
f d|vr||
 d  | kr|	||
f d|v r|dd
  }|dd  }| | krd||
  v r| |v r|	d
|
f d| v r| dd
  }| dd  }| | krd||
  v r| |v r|d
|
f q&|  dv rd}|d7 }t| dg}| dv r2d}|d7 }t| dg}	d| v rM| dd
  dv rMd
| dd fg}d|v rh|dd
  dv rhd
|dd fg}	| dkryd
t	|fg}t
||}t|d
krt|ddd
 d }n| tjv r| }ntd|  d t|	d
krt|	ddd
 d }n|tjv r|}ntd| d t|dkrdd	tdd | d | d }|d7 }t| t|	dkrd d!td"d |	 d# | d }|d7 }t| t||||||S )$Nr)  /yaml/wikitra2-data.yamlSyrcrV   zPlease specify the variety of Syriac script for the source: Estrangelo (Syre), Eastern (Syrn) or Wester (Syrj). Defaulting to SyrezPlease specify the variety of Syriac script for the target: Estrangelo (Syre), Eastern (Syrn) or Wester (Syrj). Defaulting to Syrer   lang,r   c                 S   r-  r*   r.  r   r   r   r   r     r,  z$process_script_tag.<locals>.<lambda>r   
population-r   )latnenengRLatin has multiple transcription schemes. 'ISO 15919' has been selected by defaultzW
 Please use a transcription format e.g. la-IAST or la-HK to select a particular schemer   rh   r@  Tr   zSource script code: 
 not foundzTarget script code: zMultiple orthographies, c                 S   r*  Nr   r   r   r   r   r   r     r,  z:, are associated with the input script. The most popular '' has been selectedzh
 Please use the format lang_code-script_code e.g. ur-Arab or pa-Arab to select a particular orthographyzMultiple orthographies: z, c                 S   r*  rR  r   r   r   r   r   r     r,  z: are associated with the target script. The most popular ')r(  r8  r9  r   rr   r7  r   rs   rt   r   r   rw   rx   r   r   	Exceptionjoinr  )src_tagtgt_tagr  r  r  r  r'  data_loaded_wikir   r  scrpt	scrpt_taglang_tagrH  rJ  	lang_partscript_partr9  src_poptgt_popr   r   r   r  p  s   

"  ,
2

""

$
$
r  c                 C   s`  t d}t d}g }g }	| D ]}
d||
  v r+ttdd ||
 d d}nd}||
 d }|| v r@t|| }nd	}|  |v rO|||
f | |v r\|	||
f d
|v r|d
d  }|d
d	  }| |kr||v r|	d|
f d
| v r| d
d  }| d
d	  }| |kr||v r|d|
f q|  dv rd|  d }|d7 }t	| dg}| dv rd| d }|d7 }t	| dg}	d
| v r| d
d  dv r| D ]}
||
 d }| | d
d	  krd|
fg}qd
|v r>|d
d  dv r>| D ]}
||
 d }| |d
d	  kr<d|
fg}	q!|  dv rSd}|d7 }t	| dg}| dv rhd}|d7 }t	| dg}	d
| v r| d
d  dv rd| d
d	 fg}d
|v r|d
d  dv rd|d
d	 fg}	| dkrdt
|fg}t||}t|dkrt|ddd d	 }ntd|  d t|	dkrt|	ddd d	 }ntd| d t|d	krddtdd | d | d }|d 7 }t	| t|	d	kr'ddtd!d |	 d" | d }|d 7 }t	| t||||||S )#Nr)  rF  rH  c                 S   r-  r*   r.  r   r   r   r   r     r,  z"process_lang_tag.<locals>.<lambda>rI  r   r   r   rK  r   )sasanpiplizThe input language: zk is script-agnostic and can be written in multiple scripts. The most popular 'Devanagari' has been selectedzc
 Please use the format lang_code-script_code e.g. sa-Gran or sa-Sidd to select a particular script)r   r   zThe ouptput language: )larM  rN  rO  z[
 Please use a transcription format e.g. Latn-IAST or Latn-HK to select a particular schemerP  r@  Tr   zSource language code: rQ  zTarget language code: zMultiple scripts c                 S   r*  rR  r   r   r   r   r   r   J  r,  z7 associated with the input language. The most popular 'rS  zc
 Please use the format lang_code-script_code e.g. pa-Guru or pa-Arab to select a particular scriptc                 S   r*  rR  r   r   r   r   r   r   N  r,  z8 associated with the target language. The most popular ')r(  r   r7  r   rr   rw   rs   rt   r8  r9  r   r   rx   rT  rU  r  )rV  rW  r  r  r  r  r'  rX  r   r  rY  rH  rZ  script_countr\  r]  r9  r^  r_  r   r   r   r    s   "

"
"


""

$
$
r  c                 C   sJ   | dkrt |}t||}ntt| }tt|}t||||||S )Nr@  )r   r   str	langcodesfindr  )src_nametgt_namer  r  r  r  r   r  r   r   r   r  T  s   r  c                  C   s^   ddl m}  | | tj}t| |dddd}t|}W d    |S 1 s(w   Y  |S )Nr   )Pathzjson/gimeltra_data.jsonr  zutf-8r  )pathlibrk  r   parentr!  jsonload)rk  cwdfdatar   r   r   get_semitic_json_  s   
rs  c                 C   s   d S r*   r   )r   r   r   r   
getOptionsh  s   rt  r*   )F)(aksharamukhar   aksharamukha.gimeltrar   r   r   r   r   r   r	   rn  r   htmlr   collectionsr
   rp   r"  r8  rg  r  inspectr   r   r   r   r   r   r   r  	functools	lru_cacher(  r?  r  r  r  r  rs  rt  r   r   r   r   <module>   sJ    

   C

&lx


