o
    ai                    @  s  d Z ddlmZ ddlZddlmZ ddlZddlmZ ddl	m
Z
 ddlZddlZddlZddlZddlmZ ddlZddlZddlZddlmZmZ ddlZd	Zd
Zeejv r]ddlZdZdZdZdd Z dWdXddZ!dYddZ"dZddZ#dZddZ$d[d"d#Z%d\d]d*d+Z&d,d- Z'd^d.d/Z(d_d4d5Z)		d`dad<d=Z*dbd>d?Z+dWdcdBdCZ,G dDdE dEZ-G dFdG dGe-Z.G dHdI dIe-Z/G dJdK dKeZ0G dLdM dMZ1G dNdO dOZ2G dPdQ dQe2Z3G dRdS dSZ4dTdU Z5e6dVkre5  dS dS )da9  
Written by Ulf Hermjakob, USC/ISI  March-June 2024
uroman is a universal romanizer. It converts text in any script to the Latin alphabet.
This script is a Python reimplementation of an earlier Perl script, with some improvements.
The tool has been tested on 250 languages, with 100 or more sentences each.
This script is still under development and large-scale testing. Feedback welcome.
This script provides token-size caching (for faster runtimes).
Output formats include
  (1) best romanization string
  (2) best romanization edges ("best path"; incl. start and end positions with respect to the original string)
  (3) best romanization with alternatives (as applicable for ambiguous romanization)
  (4) best romanization full lattice (all edges, including superseded sub-edges)
See below for 'sample calls' under main()
    )annotationsN)defaultdict)Enum)Fraction)Path)ListTuplei   z	--profilez1.3.1.1zJune 27, 2024z_uroman is a universal romanizer. It converts text in any script to the standard Latin alphabet.c                   s    fdd}|S )Nc                    sv   t j  }td j |   td|d  | i |}t j  }||  }td|d td| d |S )Nz	Calling: zStart time: z%A, %B %d, %Y at %H:%Mz
End time: z
Duration: z seconds)datetimenowprint__name__total_seconds)argskwargs
start_timeresultend_time	time_difffunc P/home/ubuntu/maya3_transcribe/venv/lib/python3.10/site-packages/uroman/uroman.pywrapper1   s   

ztimer.<locals>.wrapperr   )r   r   r   r   r   timer0   s   
r   linestrslotdefaultstr | list | Nonereturnc                 C  s*   t d| d| }|r|d S |S )zFor a given slot, e.g. 'cost', get its value from a line such as '::s1 of course ::s2 ::cost 0.3' -> 0.3
    The value can be an empty string, as for ::s2 in the example above.z(?:.*\s)?::z(|\s+\S.*?)(?:\s+::\S.*|\s*)$   )regexmatchgroupstrip)r   r   r   mr   r   r   #slot_value_in_double_colon_del_list>   s   r&   boolc                 C  s   t t| |tS N)
isinstancer&   r   )r   r   r   r   r   "has_value_in_double_colon_del_listE      r*   sc                 C  s@   t | trtd| }|r|d|d dv r|dS | S )Nu   \s*(['"“])(.*)(['"”])\s*$r       )z''z""u   “”   )r)   r   r!   r"   r#   )r,   r%   r   r   r   dequote_stringI   s
   

r/   c                 C  s   t | r| t | d  S 	 d S Nr    )lenr,   r   r   r   last_chrQ   s   r3   charint | float | Nonec              	   C  s<   zt | }| rt|W S |W S  ttfy   Y d S w r(   )udnumeric
is_integerint
ValueError	TypeError)r4   num_fr   r   r   
ud_numericX   s   
r=   Fnum_sfilenameline_number
int | Nonesilentc                 C  s   t | trIzd| v rt| W S t| W S  tyH   |sEtjd|  d |r1tjd|  |r<tjd|  tjd Y d S Y d S w t | tsSt | trU| S d S )N.zCannot convert "z" to a numberz line: z file: 
)r)   r   floatr9   r:   sysstderrwrite)r>   r?   r@   rB   r   r   r   robust_str_to_num`   s$   
	rI   c                  G  s   | D ]
}|d ur|  S qd S r(   r   r   argr   r   r   first_non_noner   s
   rL   c                  G  s   | D ]	}|d ur dS qdS NTFr   rJ   r   r   r   any_not_noney   s
   rN   ddictkeyNonec                 C  s   |d ur
|| |< d S d S r(   r   )rO   rQ   valuer   r   r   add_non_none_to_dict   s   rT   fraction_charfraction_valuefloat | NoneuromanUroman | NoneFraction | Nonec           
   	   C  s   d}d }t |  D ]}z|tt|d7 }W q ty%   ||7 }Y qw td| }rM|dd\}}zt	t|t|}W n tyL   d }Y nw |d u ru|ru|ru|
| }	ruzt	|	d |	d }W |S  tyt   d }Y |S w |S )N    u   <fraction>(\d+)⁄(\d+)$r    r.   r   )r6   decompositionsplitchrr9   r:   r!   r"   r#   r   unicode_float2fraction)
rU   rV   rX   r,   fractionud_decomp_elemr%   numerator_sdenominator_snumerator_denominatorr   r   r   fraction_char2fraction   s2   rf   c              	   C  s&   zt | W S  ttfy   Y dS w )zmrobust version of ud.name; see related Uroman.char_name() that includes names not included in UnicodeData.txtr[   )r6   namer:   r;   r4   r   r   r   chr_name   s
   ri   r   argparse.Namespace | Nonec                 C  s   |r| |v rt ||  S d S r(   )vars)rQ   r   r   r   r   args_get   s   rl   c                   @  s.   e Zd Zdd Zdd Zd
ddZdd	 ZdS )	DictClassc                 K  s:   |D ]}| dd}|| }|d g dfvr|| j|< qd S )N_-F)replace__dict__)selfkw_argskw_argkw_arg2rS   r   r   r   __init__   s   
zDictClass.__init__c                 C  s
   t | jS r(   )r   rq   rr   r   r   r   __repr__   s   
zDictClass.__repr__Nc                 C  s   || j v r
| j | S |S r(   )rq   )rr   rQ   r   r   r   r   __getitem__   s   zDictClass.__getitem__c                 C  s   t | jdkS Nr   )r1   rq   rw   r   r   r   __bool__   s   zDictClass.__bool__r(   )r   
__module____qualname__rv   rx   ry   r{   r   r   r   r   rm      s
    
rm   c                   @     e Zd ZdS )RomRuleNr   r|   r}   r   r   r   r   r      s    r   c                   @  r~   )ScriptNr   r   r   r   r   r      s    r   c                   @  s(   e Zd ZdZdZdZdZdZdd ZdS )		RomFormatzOutput format of romanizationr   edgesaltslatticec                 C  s   | j S r(   rS   rw   r   r   r   __str__   s   zRomFormat.__str__N)	r   r|   r}   __doc__STREDGESALTSLATTICEr   r   r   r   r   r      s    r   c                   @  s  e Zd ZdZdwdxddZedyd	d
ZefdzddZd{ddZ	d|d}ddZ
d~ddd Zdwdd#d$Zd~dd%d&Zd~dd'd(Zedd*d+Zdd,d-Zd~dd.d/Zedd5d6Zddd:d;Zdd=d>Z	?	?dddBdCZdddEdFZeddGdHZeddIdJZeddKdLZddMdNZddPdQZddSdTZdddZd[Zdd\d]Zd^d_ Zd`da Z 		dddgdhZ!edddmdnZ"eddodpZ#dddsdtZ$de%j&fddudvZ'dS )UromanaJ  This class loads and maintains uroman data independent of any specific text corpus.
    Typically, only a single instance will be used. (In contrast to multiple lattice instances, one per text.)
    Methods include some testing. And finally methods to romanize a string (romanize_string()) or an entire file
    (romanize_file()).Ndata_dirPath | Nonec              	   K  s  |p	| j d	i || _tt| _tt| _tt| _tt	| _
tt| _tdd | _tt| _tt| _i | _i | _i | _i | _t  i | _d| _|dd| _| jdk| _i | _tt| _i | _|  | j|dd|dd|dd t!  d| _"d| _#d S )
Nc                   S  s   d S r(   r   r   r   r   r   <lambda>   s    z!Uroman.__init__.<locals>.<lambda>r   
cache_sizeload_logFrebuild_ud_propsrebuild_num_propsr   )$default_data_dirr   r   list	rom_rulesr   scriptsr'   	dict_boolr   dict_strr9   dict_intdict_numrP   	num_propssetdict_setfraction_connectorsminus_signs
plus_signsfloat2fractiongcdisable	rom_cacherom_cache_sizegetrom_max_cache_sizecache_p
hangul_romstatsabugida_cacheload_resource_filesenablen_error_messages_outputn_non_utf8_characters)rr   r   r   r   r   r   rv      s8   










zUroman.__init__r   r   c                  K  s`   t tj}|d  }|d  }| dr.tjdt| d tjdt| d |S )Ndataz	mini-testverbosez
data_dir: rD   zmini_test_dir: )	r   __file__parentresolver   rF   rG   rH   r   )r   root_dirr   mini_test_dirr   r   r   r      s   

zUroman.default_data_dirr   r9   c                 C  s"   i | _ d| _|| _| jdk| _d S rz   )r   r   r   r   )rr   r   r   r   r   reset_cache  s   zUroman.reset_cachecr   romrg   
str | NoneTuple[str | None, str]c                 C  s   |r?d|v r?|du r|  |}d|v r"td| }r"|d|fS d|v r5td| }r5|d|fS td|r?||fS d|fS )	zYMuch of this code will eventually move the old Perl code to generate cleaner primary data NzMYANMAR VOWEL SIGN KAYAHzkayah\s+(\S+)\s*$r    zMENDE KIKAKUI SYLLABLEzm\d+\s+(\S+)\s*$z\S\s+\S)ri   r!   searchr#   )rr   r   r   rg   r%   r   r   r   second_rom_filter	  s   
zUroman.second_rom_filterTr?   
provenancefile_formatr   r'   c           3      C  s  d}z	t |ddd}W n ty   tjd| d Y dS w |" t|dD ]\}}|d	r4q)td
|r;q)t	dd|}|dkrd}	t
t|d}
zt|
d}t|}W n	 tyc   Y q)w t
t|d}t|d }ry|| jd|f< t|d }r|| jd|f< t|d }r|| jd|f< t|d }r|| jd|f< nt
t|d}t
t|d}t
t|d}	t|d }durt|}|du r|n|| j|< t|d}|rd| j|< t|d}|rd| j|< t|d}t|d}t|d}|rd| j|< t|d}t|d}t|d }|rtd!|ng }t|d"}t|d#}t|d$}t|d%} t|d&}!t|d}t|||d'd(}t|d)}"|"rMtd!|"ng }#ttt
|#}#| ||d\}$}%|$ro|$|kro||krm	 |$}|dur=d*D ]}&t|&d+d,}'|'rd| j|&|f< qvt|||||||||	r=| | |d7 }||||| |!g}(td-d. |(D })|}*|du r|dur|*d/krd}*t dOi d|d|d0|*d1|d2|#d|d3|d4|d5|d6| d7|!d8|	d9|)d:|d;|d<|d=|d>|d?|d@|}+| j!| },t|,dkr5|,d d0 dAv r5|s5|s5|s5|s5| s5|!s5|+g| j!|< q)| j!| "|+ q)W d   n	1 sIw   Y  dB}-t#dCdDD ]#}t|}.|.|- }t |ddEdF}+| j!| sw|+g| j!|< | | qUtttt#dCdG}/dHdIgtttt#dJdK }0|/D ]'}1|0D ]!}2|1|2 |- }t |ddLdF}+| j!| s|+g| j!|< | | qq|rtjdM| dN| d dS dS )Pa  Reads in and processes the 3 main romanization data files: (1) romanization-auto-table.txt
        which was automatically generated from UnicodeData.txt (2) UnicodeDataOverwrite.txt that "corrects"
        some entries in romanization-auto-table.txt and (3) romanization-table.txt which was largely manually
        created and allows complex romanization rules, some for specific languages, some for specific contexts.r   rutf-8encodingCannot open file rD   Nr    #^\s*$
\s{2,}#.*$r[   u2rur\   rg   pic	tone-marksyllable-infor,   tzt-end-of-syllablenumis-minus-signTis-plus-signis-decimal-pointis-large-powerzfraction-connectorzpercentage-markerzint-frac-connectorlcode[,;]\s*use-only-at-start-of-worddont-use-at-start-of-worduse-only-at-end-of-worddont-use-at-end-of-worduse-only-for-whole-wordF)rB   zt-alt)r   r   r   r   ro   rn   c                 S     g | ]}|r|qS r   r   ).0restrr   r   r   
<listcomp>m      z(Uroman.load_rom_file.<locals>.<listcomp>r   provlcodest_altsuse_only_at_start_of_worddont_use_at_start_of_worduse_only_at_end_of_worddont_use_at_end_of_worduse_only_for_whole_wordt_at_end_of_syllablen_restris_minus_signis_plus_signis_decimal_pointfraction_connectorpercentage_markerint_frac_connectoris_large_power)r6   owu   ์i  iL  zauto cancel letter)r,   r   r   i/  u   ัu   ็i3  i;  zauto cancel syllableLoaded z from r   )$openFileNotFoundErrorrF   rG   rH   	enumerate
startswithr!   r"   subr/   r&   r9   r_   r:   r   rI   r   r*   r   r   r   r^   r   mapr   evalrp   r   rN   register_s_prefixr1   r   r   appendrange)3rr   r?   r   r   r   	n_entriesfr@   r   r   r   cpr,   r   rg   r   	tone_marksyllable_infor>   r   r   r   r   r   r   r   r   lcode_sr   r   r   r   r   r   t_alt_sr   t_modname2bool_key
bool_valuerestrictionsn_restrictionsprovenance2new_rom_ruleold_rom_rulesthai_cancellation_markr   thai_consonantsthai_vowel_modifiersc1vr   r   r   load_rom_file  s@  






















,	

 f

zUroman.load_rom_filec                 C  s(  d\}}z	t |ddd}W n ty!   tjd| d Y dS w | t|dD ]\}}|d	r4q*td
|r;q*t	dd|}t
|d }r| }	|	| jv rctjd| d| d| d nw|d7 }t
|d}
t
|d}|rytd|ng }t
|d}|rtd|ng }t
|d}|rtd|ng }t||||
|d}|| j|	< |D ]}| jd|f | q|D ]!}| }|| jv rtjd| d| d| d q|| j|< qt| }||kr|}q*W d   n1 sw   Y  |r|| jd< |rtjd| d| d| d dS dS )zReads in (typically from Scripts.txt) information about various scripts such as Devanagari,
        incl. information such as the default abugida vowel letter (e.g. "a").r   r   r   r   r   r   rD   Nr    r   r   r   r[   script-namez** Ignoring duplicate script "z
" in line z of 	directionzabugida-default-vowelr   zalt-script-namelanguage)script_namealt_script_names	languagesr  abugida_default_vowelsr   z/** Ignoring duplicate alternative script name "max_n_script_name_componentsr    script descriptions from z! (max_n_scripts_name_components: z)
)r   r   rF   rG   rH   r   r   r!   r"   r   r&   lowerr   r^   r   r   addr1   r   )rr   r?   r   r   r  r   r@   r   r  lc_script_namer  abugida_default_vowel_sr  alt_script_name_sr  
language_sr  
new_scriptr  alt_script_namelc_alt_script_namen_script_name_componentsr   r   r   load_script_file  s   






'
zUroman.load_script_filescript_name_plusfull_char_namec                 C  sZ   |r||krdS |r+|  | jv r"| j|    }r"|d  }r"|S tdd|}|s
dS )zUsing info from Scripts.txt, this script selects the script name from a Unicode,
        e.g. given "OLD HUNGARIAN CAPITAL LETTER A", extract "Old Hungarian".Nr  z
\s*\S*\s*$r[   )r  r   r!   r   )rr   r*  r+  scriptr  r   r   r   extract_script_name  s   zUroman.extract_script_namec                 C  s  d\}}}}}z	t |ddd}W n ty$   tjd| d Y dS w | t|dD ]\}	}
|
d	r7q-td
|
r>q-t	dd|
}
t
|
d }r|d7 }t
|
dg D ]}|| jd|f< |d7 }qVt
|
dg D ]}|| jd|f< |d7 }qjt
|
dg D ]}d| jd|f< |d7 }q~t
|
dg D ]}d| jd|f< |d7 }qt
|
dg D ]}d| jd|f< |d7 }qq-W d   n1 sw   Y  |rtjd| d|dd| d|dkrdnd  |s|s|rtjd| d| d | d! tjd" dS dS )#Loads Unicode derived data from (1) UnicodeDataProps.txt, (2) UnicodeDataPropsHangul.txt
        and UnicodeDataPropsCJK.txt with a list of valid script-specific characters.)r   r   r   r   r   r   r   r   r   rD   Nr    r   r   r   r[   r  r4   r,  numeralz
vowel-signTis-vowel-signzmedial-consonant-signis-medial-consonant-signzsign-virama	is-viramazLoaded from z mappings of z,dz characters to z scriptr,   z, with a total of z vowel signs, z medial consonant signs and z viramasz.
)r   r   rF   rG   rH   r   r   r!   r"   r   r&   r   r   )rr   r?   r   n_scriptn_script_charn_script_vowel_signn_script_medial_consonant_signn_script_viramar   r@   r   r  r4   r   r   r   load_unicode_data_props  s`   






zUroman.load_unicode_data_propsc           
      C  sZ  d}z	t |ddd}W n ty   tjd| d Y dS w |p t|dD ]b\}}|d	r2q(td
|r9q(t	
|}t|trx|d }rT|| j|< |d7 }ntjd| d| d|  d dD ]}	||	rvd| j|	|f< qhq(tjd| d| d|  d q(W d   n1 sw   Y  |rtjd| d| d dS dS )r.  r   r   r   r   r   rD   Nr    r   r   txtzMissing txt in l.	 in file : )r   Tz
json in l.z not a dict: r   z entries from )r   r   rF   rG   rH   r   r   r!   r"   jsonloadsr)   rP   r   r   r$   r   )
rr   r?   r   r   r   r@   r   rO   r9  r  r   r   r   load_num_props  s<   




$
&zUroman.load_num_propsr,   c              	   C  s   d}| D ]K}t |  }rKzdd |D }dd |D }W n ty2   tjd| d Y qw t|dkr@||d 7 }qtjd| d	 q||7 }q|d
d}|S )uf   De-accents a string from "liú" to "liu" and "ü" to "u" (to help process file Chinese_to_Pinyin.txt).r[   c                 S  s   g | ]	}t t|d qS )r\   )r_   r9   r   xr   r   r   r   1  s    z+Uroman.de_accent_pinyin.<locals>.<listcomp>c                 S  s    g | ]}t |d r|qS )L)r6   categoryr   r?  r   r   r   r   2  s     zCannot decode rD   r    r   z (expected 1 letter)
   ür   )	r6   r]   r^   r:   rF   rG   rH   r1   rp   )r,   r   r4   decompdecomp_charslettersr   r   r   de_accent_pinyin*  s    
zUroman.de_accent_pinyinc                 C  s2   t dt|d D ]}d| jd|d | f< q	d S )Nr    Ts-prefix)r   r1   r   )rr   r,   
prefix_lenr   r   r   r   ?  s   zUroman.register_s_prefixc                 C  s>  d}z	t |ddd}W n ty   tjd| d Y dS w |b t|dD ]T\}}|d	r2q(td
|r9q(z|	 
 \}}| |}	W n ty`   tjd| d| d|  Y q(w |}
t|
|	dg d}| j| | | |
 |d7 }q(W d   n1 sw   Y  |rtjd| d| d dS dS )zSLoads file Chinese_to_Pinyin.txt which maps Chinese characters to their Latin form.r   r   r   r   r   rD   Nr    r   r   zCannot process line r:  r;  z
rom pinyin)r,   r   r   r   r   r  )r   r   rF   rG   rH   r   r   r!   r"   rstripr^   rG  r:   r   r   r   r   )rr   r?   r   r   r   r@   r   chinesepinyinr   r,   r  r   r   r   load_chinese_pinyin_fileC  s:   
"

zUroman.load_chinese_pinyin_filerO   rP   r  
prop_classr4   c                 C  s>   | d  | ||f}|| v r| | | d S |g| |< d S )Nscript-names)r   r   )rO   r  rN  r4   rQ   r   r   r   %add_char_to_rebuild_unicode_data_dict_  s
   z,Uroman.add_char_to_rebuild_unicode_data_dictout_filenamecjkhangulc                 C  s  dt  i}d}d}d}dh}|dk r|d7 }t|}	| |	 }
s"qdD ]B}|r0t|tr0|}n|f}|D ]0}|d  d	d
}||vrJ|| t	d| dd|
}| 
||
 }re| ||||	 q5q$t	dd|
}| 
||
 }r| ||d|	 |d7 }| |	}td|tjr||	7 }|dk st|}dd |||fD }|r|n|}|r|n|}|D ]}z	t|ddd}W n ty   tjd| d Y qw | t|d D ]a}|dkr||krqn|dkr||krqn||krqd| g}|D ]2}||f}||v r0d||  }r0|dv r%|d| d	t|  |d| d	|  q|d	| d q||krN|rN|d| d W d   n	1 sYw   Y  qtjd| d| d t|d  d! dS )"a  This functions rebuilds UnicodeDataProps*.txt This might be useful when a new UnicodeData.txt
        version is released, or additional information is extracted from Unicode to UnicodeDataProps.txt
        Regular users normally never have to call this function.rO  r[   r   r4      r    )z
VOWEL SIGN)zMEDIAL CONSONANT SIGNzCONSONANT SIGN MEDIALzCONSONANT SIGN SHAN MEDIALzCONSONANT SIGN MON MEDIAL)zSIGN VIRAMAz	SIGN ASATz	AL-LAKUNAz
SIGN COENGzSIGN PAMAAEHzCHARACTER PHINTHU)NUMERALNUMBERDIGITFRACTIONr   ro   z\s+z\b.*$z\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL|IDEOGRAPH|HIEROGLYPH|POINT|ACCENT|CHARACTER|TIPPI|ADDAK|IRI|URA|SYMBOL GENITIVE|SYMBOL COMPLETED|SYMBOL LOCATIVE|SYMBOL AFOREMENTIONED|AU LENGTH MARK)\b.*$z^[aeiou]*[aeiouy]$c                 S  r   r   r   r?  r   r   r   r     r   z5Uroman.rebuild_unicode_data_props.<locals>.<listcomp>wr   r   zCannot write to file rD   CJKHangulz::script-name rh   z::n-z::z	::vowels NzRebuilt z with z characters for z
 scripts.
)r   r_   ri   r)   tupler  rp   r   r!   r   r-  rP  romanize_stringr"   
IGNORECASEsortedr   OSErrorrF   rG   rH   joinr   r1   )rr   rQ  rR  rS  rO   vowel_sn_script_refs	codepointprop_classesr   	char_nameprop_name_comp2	prop_listprop_name_comprN  script_name_candr  r   out_filenamescjk2hangul2out_filef_outprop_componentsrQ   charsr   r   r   rebuild_unicode_data_propsh  s   


%


z!Uroman.rebuild_unicode_data_propserr_filenamec                 C  s  d\}}t |ddd}t |ddd}d}|dk r|d7 }t|}tt|| |}	|	d u r4qi }
|}d }d }d }d }d }| jd|f }| | }rS|}n|d	v rYd
}| |}d}dD ]}||v rvd| 	dd }d} nqbdD ]}||v r| 	dd} nqy|rqt
|	tr|	}d|	  krdkrn nd}|	}d|v rd}n<d}n9tdt|	 }rt|d}td|d }|dkrdnd}nd}nd|v rt||	|  }r|}d}nd}|d u rd nt|}|d u rd n|j d!|j }|d u rd n|j|jg}|r|rdnd }|| | p|}t|
d"| t|
d#| t|
d$| t|
d| t|
d%| |rEd|
d< t|
d| t|
d&| t|
d'| |d(rrt|
d)| |t|
d*  |d7 }n|s{t|
d)| |t|
d*  |d7 }|dk sW d    n	1 sw   Y  W d    n	1 sw   Y  tjd+| d,| d-| d.| d-| d* d S )/Nr  rZ  r   r   rT  rU  r    r   
0123456789zascii-digitF)	SUPERSCRIPT	SUBSCRIPTCIRCLEDPARENTHESIZED	SEGMENTEDMATHEMATICALzROMAN NUMERALz	FULL STOPCOMMA*r   ro   T)zVULGAR FRACTIONr   	   rX  digitz
digit-likez([0-9]+?)(0*)$1r.   basemultiz	other-intrY  ra   z	other-numr[   /r9  r   rS   typemultr,  otherrg   rD   z
Processed z codepoints,
  wrote z
 lines to z	
    and )r   r_   rL   r=   	num_valuer   chr_script_nameri   r  rp   r)   r9   r!   r"   r   r#   rf   	numeratordenominatorrT   r   rH   r<  dumpsrF   rG   )rr   rQ  rt  n_outn_errrp  f_errre  r4   r   result_dictorig_txtrS   ra   num_basebase_multiplierr,  r   r  rg   exclude_from_number_processingscrypt_typenum_typer%   value_s
fraction_sfraction_listdelimiter_sr   r   r   r   r     s   



 XzUroman.rebuild_num_propsFr   r   c                 C  sF  |}t |tstjdt| d dS | jtj	|ddd|d | jtj	|dd	d
|d | jtj	|ddd|d | j
tj	|d|d | jtj	|d|d | jtj	|d|d dD ]}| jtj	|||d qe|r| jtj	|dtj	|dtj	|dd |r| tj	|dtj	|d dS dS )z1Loads all resource files needed for romanization.zError: data_dir is of z5, not a Path.
       Cannot load any resource files.
Nzromanization-auto-table.txtr6   r   )r   r   zUnicodeDataOverwrite.txtr   r   zromanization-table.txtmanzChinese_to_Pinyin.txt)r   zScripts.txtzNumProps.jsonl)UnicodeDataProps.txtUnicodeDataPropsCJK.txtUnicodeDataPropsHangul.txtr  r  r  )rR  rS  zNumPropsRejects.jsonl)r)   r   rF   rG   rH   r  r  ospathrb  rM  r)  r>  r8  rs  r   )rr   r   r   r   r   	base_filer   r   r   r     s8   
zUroman.load_resource_filespass_through_pc                 C  s   | j |d }r|S d }d }d }d}|D ]J}t|}	d|	  kr+dkr_n n2|	d }
t|
d }t|
d	 d
 }|
d	 }|| ||  ||  }|dd}|| j |< ||7 }q|re||7 }q|S )zYSpecial algorithmic solution to convert (Korean) Hangul characters to the Latin alphabet.Nz*g gg n d dd r m b bb s ss - j jj c k t p hz=a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi izE- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p hr[        iL        ro   )r   r   r^   ordr9   rp   )rr   r,   r  
cached_romleadsvowelstailsr   r   r  code
lead_indexvowel_index
tail_indexr   r   r   r   unicode_hangul_romanization1  s*   

z"Uroman.unicode_hangul_romanizationc                 C     t | dkot| dkS )z] Checks whether a character is a nonspacing mark, e.g. combining accents, points, vowel signsr    Mnr1   r6   rB  r2   r   r   r   char_is_nonspacing_markH     zUroman.char_is_nonspacing_markc                 C  r  )zY Checks whether a character is a formatting character, e.g. a zero-with joiner/non-joinerr    Cfr  r2   r   r   r   char_is_format_charM  r  zUroman.char_is_format_charc                 C  r  )z Checks whether a character is a space,
            e.g. ' ', non-breakable space, en space, ideographic (Chinese) space, Ogham space mark
            but excluding 	, , 
r    Zsr  r2   r   r   r   char_is_space_separatorR  s   zUroman.char_is_space_separatorc              	   C  s@   zt |W S  ttfy   | jd|f  }r| Y S Y dS w )Nrg   r[   )r6   rg   r:   r;   r   )rr   r4   rg   r   r   r   ri   Y  s   zUroman.chr_nameint | float | Fraction | Nonec                 C  s,   | j | D ]}|d  }dur|  S qdS )zSrom_rules include numeric values beyond UnicodeData.txt, e.g. for Egyptian numeralsr   N)r   )rr   r,   rom_ruler   r   r   r   r  a  s
   zUroman.num_valuerQ   c                 C  s.   | j | D ]}|| }d ur|  S qd S r(   )r   r   )rr   r,   rQ   r  rS   r   r   r   rom_rule_valueh  s
   zUroman.rom_rule_valueư>r   rE   	precisionTuple[int, int] | Nonec                 C  s`   | j |d }r|S dD ] }dD ]}t|| | |k r,||f}|| j |< |    S qqdS )z!only for common unicode fractionsN)r    r.   r-                  r~  
      )r.   r-   r  r  r  r     r\          (   @   P      i@  )r   r   abs)rr   r   r  cached_valuer  r  r   r   r   r   r`   n  s   
zUroman.unicode_float2fractionc                 C  s   | j d|f S )z&For letters, diacritics, numerals etc.r,  )r   )rr   r4   r   r   r   r  z  s   zUroman.chr_script_namec                 C  s  d}dD ]}| j |  }|d| d| d7 }qdD ]}| j| }|d| d| d7 }qdD ]}|d	| d| | d7 }q0d
D ]p}| |}| j| }| jd|f }| jd|f }| jd|f }| jd|f }	|d| 7 }|r{|d| 7 }|r|d| dt|j	 d7 }|r|d| 7 }|r|d| 7 }|r|d| 7 }|	r|d|	 7 }|d7 }qCd}
d}d}d}t
d| |
 | | D ]\}}|dt||d ||  d7 }qdD ]}|d | j|  d7 }qt| d!S )"zJLow level test function that checks and displays romanization information.r[   )OriyaChinesezSCRIPT r   rD   )   ƿ   β   иu   μπu   ⠹u   亿u   ちょr     𓍧u   正u   分之u   ऽu   ศu   ด์u   ढ़u   ड़zDICT )r  r  u   नu   ुzSCRIPT-NAME )u   万u   u   𓍨u   𓂋u   ่u   เr   r   r   r   zPROPS z  name: z  num:  ()z  pic: z  tone-mark: z  syllable-info: z  is-large-power: u   𝋬r  u   𐍁u   u+   9九万萬百፲፱፻፸¾0²₂AⅫ⑫൵z
NUM-EDGE: r    )   ¼u   २zNUM-PROPS: N)r   r  r   r  ri   r   r   r   r  r   r   NumEdger   r   )rr   outputr,   rO   rg   r   r   r  r  r   mayan12egyptian600runic90klingon2offsetr   r   r   r   -test_output_of_selected_scripts_and_rom_rules~  sN   



" z4Uroman.test_output_of_selected_scripts_and_rom_rulesc                 K  s   g d}|D ]*}|d }t |dkr|d nd}| j|fd|i|}tjd| d| d	 qd}d
}|dk rs|d7 }t|}	| |	}td|rotd|ro| |	}
tjd|dd|	 d|
 d| d		 |d7 }|dk s9tj| d dS )z)A few full cases of romanization testing.))u   ألاسكاN)u8   यह एक अच्छा अनुवाद है.hin)!   ちょっとまってくださいkor)u   Μπανγκαλόρell)u   Зеленськийukr)u   കേരളംmalr   r.   r    Nr   zROM z -> rD   rT  rU  z\sz\SzU+04Xr   z  z alerts for roms with spaces
)	r1   r^  rF   rG   rH   r_   r!   r   ri   )rr   r   teststestr,   r   r   n_alertsre  r   rg   r   r   r   test_romanization  s$   

(zUroman.test_romanizationinput_filenameoutput_filenamer   direct_input	List[str]c                 K  sr  d\}}|r|du r|}nTt |trHzt|dddd}d}W nB ty3   tjd| d	 d}Y n. tyG   tjd
| d	 d}Y nw |du rPtj}ntjd| dt	| d d}t |trztt|ddd}	d}W n. ty   tjd| d	 d}	Y nw |du rtj
}	ntjd| dt	| d d}	d| _|r|	r|d}
d}zt|dD ]\}}td| }r#d}td||}t|}|  j|7  _d}| j|k r|dkrdnd}tjd| d| d| d| d| d |  d	 |  jd7  _n| j|kr!tjd! |  jd7  _|}td"| }r|dd#d$d%\}}}}| j||p?|fi |}|d&tjtjkrb| | | }|	|| d	  n6d'| d(}|g| j||pq|fi | }|	t|d	  n|	t| j|d	|fi |d	  |d)s|d* dkr|d+ dkrtjt| ntjd, d}tj  t  |
r||
kr nqW n/ ty } z"tjd-| d	 tjd. tjd/ tjd0 W Y d}~nd}~ww |rtjd	 tj  |r|  |r|	  |r5| jr7tjd1| d2| j d	 dS dS dS )3zyScript to apply romanization to an entire file. Input and output files needed.
        Language code (lcode) recommended.)FFNr   r   surrogateescaper   errorsTz)Error in romanize_file: Cannot open file rD   z+Error in romanize_file: File not in UTF-8: z2Error in romanize_file: argument 'input_filename' z is of wrong type: z (should be str)
rZ  r   z-Error in romanize_file: Cannot write to file z3Error in romanize_file: argument 'output_filename' r   	max_linesFr    z[\uDC80-\uDCFF]u   �r  r[   r,   zDetected encoding error: file z line z
 contains z non-UTF-8 characterz (replaced by z): z-Too many errors. No further errors reported.
z (::lcode\s+)([a-z]{3})(\s+)(.*)$r.   r-   r  
rom_formatz[0, 0, "", "lcode: z"]rB   d     rC   zUnicodeDecodeError: z<   Please make sure the input stream is in Unicode (UTF-8).
zR   Consider setting the encoding to UTF-8, e.g. set/export PYTHONIOENCODING=UTF-8
z|   Consider using uroman with -i/--input_filename option, which is more robust with respect to encoding errors than piping.
z(Total number of non-UTF-8 characters in r;  )r)   r   r   ra  rF   rG   rH   UnicodeDecodeErrorstdinr  stdoutr   r   r   r!   findallr   r1   r   rJ  r"   r#   r^  r   r   Edgejson_strflushr   collectclose)rr   r  r  r   r  r   f_in_to_be_closedf_out_to_be_closedf_inrp  r  progress_dots_outputr@   r   non_utf8_chars	repl_charline2n_non_utf8_charsmax_n_error_messagess_endingr%   lcode_kwlcode2spacesnt
rom_resultlcode_prefixprefixed_edgeserrorr   r   r   romanize_file  s   





 *

zUroman.romanize_filer   cached_rom_resultstr | List[Edge]r  c                   s,   t | tr| S  dkr| S  fdd| D S )Nr   c                   s*   g | ]}t |j  |j  |j|jqS r   )r  startendr9  r  )r   edger  r   r   r   )  s   * z@Uroman.apply_any_offset_to_cached_rom_result.<locals>.<listcomp>)r)   r   )r  r  r   r  r   %apply_any_offset_to_cached_rom_result!  s
   
z,Uroman.apply_any_offset_to_cached_rom_resultc                 C  s   t d| rOd}| }t d| }r@|ddd\}}}t|dd  d}|dkr2||t| 7 }n||| 7 }t d| }s||| d	rId	nd 7 }|S | S )
Nz\\[xuU][0-9A-Fa-f]{2}r[   zA(.*?)(\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})(.*)$r    r.   r-   r\      rD   )r!   r   r"   r#   r9   r_   endswith)r,   r   restr%   precorer  r   r   r   decode_unicode_escapes+  s   zUroman.decode_unicode_escapesr  r   c                 K  s  | j r| j|||f}|dur| ||S t|| |d}|jdi | |jdi | |jdi | |j| fi | |j	di | |j
di | |tjkr}|dt|}|| | j| jk ru|| j|||f< |  jd7  _| ||}	|	S |dt|}
|tjtjfv r|tjkr||
 | j| jk r|
| j|||f< |  jd7  _| |
|}	|	S ||
}~| j| jk r|| j|||f< |  jd7  _|}	|	S )zLScript to support token-by-token romanization with caching for higher speed.N)rX   r   r   r    r   )r   r   r   r  Latticepick_tibetan_vowel_edgeprep_brailleadd_romanizationadd_numbersadd_braille_numbersadd_rom_fall_back_singlesr   r   	all_edgesr1   add_alternativesr   r   best_rom_edge_pathr   r   edge_path_to_surf)rr   r,   r   r  r  r   r  latr&  r   
best_edgesr   r   r   r   romanize_string_core=  sF   




zUroman.romanize_string_corec                 K  s  |p| dd}| dr| |}| jru|d}}|tjkr!dng }td| }re|ddd	\}	}
}|| j|	|||fi |7 }|t	|	7 }|| j|
|||fi |7 }|t	|
7 }td| }s+|| j||||fi |7 }|S | j|||dfi |S )
zMain entry point for romanizing a string. Recommended argument: lcode (language code).
        recursive only used for development.
        Method returns a string or a list of edges (with start and end offsets).r   Ndecode_unicoder   r[   u#   (.*?)([.,; ]*[ 。་][.,; ]*)(.*)$r    r.   r-   )
r   r  r   r   r   r!   r"   r#   r,  r1   )rr   r,   r   r  r   r  r  r   m3r  	delimiterr   r   r   r^  d  s    


zUroman.romanize_stringr(   )r   r   )r   r   )r   r9   )r   r   r   r   rg   r   r   r   )NT)r?   r   r   r   r   r   r   r'   )T)r?   r   r   r'   )r*  r   r+  r   r   r   r,   r   r   r   )r,   r   )rO   rP   r  r   rN  r   r4   r   NN)rQ  r   rR  r   rS  r   )rQ  r   rt  r   )FFF)r   r   r   r'   r   r'   r   r'   F)r,   r   r  r'   r   r'   r4   r   r   r   )r,   r   r   r  )r,   r   rQ   r   )r  )r   rE   r  rE   r   r  NNNN)r  r   r  r   r   r   r  r  )r   )r  r  r  r9   r   r  )
r,   r   r   r   r  r   r  r9   r   r  )r,   r   r   r   r  r   r   r  )(r   r|   r}   r   rv   staticmethodr   DEFAULT_ROM_MAX_CACHE_SIZEr   r   r  r)  r-  r8  r>  rG  r   rM  rP  rs  r   r   r  r  r  r  ri   r  r  r`   r  r  r  r  r  r  r,  r   r   r^  r   r   r   r   r      s^    	
 6*

Q]



*c	'r   c                   @  sD   e Zd ZdZddd	d
Zdd Zdd ZdddZedddZ	dS )r  zThis class defines edges that span part of a sentence with a specific romanization.
    There might be multiple edges for a given span. The edges in turn are part of the
    romanization lattice.Nr  r9   r  r,   r   
annotationc                 C  s   || _ || _|| _|| _d S r(   r  r  r9  r  )rr   r  r  r,   r8  r   r   r   rv     s   
zEdge.__init__c              	   C  s&   d| j  d| j d| j d| j d	S )N[ro   ] r  r  r9  rw   r   r   r   r     s   &zEdge.__str__c                 C  s   t | S r(   )r   rw   r   r   r   rx     s   zEdge.__repr__r   c                 C  s   t | j| j| j| jgS r(   )r<  r  r  r  r9  r  rw   r   r   r   r<    s   z	Edge.jsonr  List[Edge] | strc                 C  sL   t | tr| S d}| D ]}t |tr|| 7 }q|t|7 }q|d7 }|S )Nr:  ])r)   r   r  r<  )r  r   r  r   r   r   r    s   

zEdge.json_strr(   )r  r9   r  r9   r,   r   r8  r   r   r   )r  r<  r   r   )
r   r|   r}   r   rv   r   rx   r<  r6  r  r   r   r   r   r  ~  s    
r  c                   @  s>   e Zd Zd d!ddZ									d"d#ddZdd ZdS )$r  Fr  r9   r  r,   r   rX   rY   activer'   c           	      C  s  t | ||| ||| _| _d\| _| _| _| _ddd|f\| _| _	| _
| _d| _d| _|d |kr|d }|j| }rd| _|d| _|d}|rXt|d |d nd| _|d	| _|d
| _|d| _|d| _	|d| _
|   dS dS dS )zHFor NumEdge, the s argument is in original language (not yet romanized).r5  NFr    r   TrS   ra   r  r  r  r,  r   )r  rv   r  r9  rS   ra   r  r  r  r,  r   r?  
n_decimalsr  r   r   r   update)	rr   r  r  r,   rX   r?  r4   rO   r  r   r   r   rv     s*   
zNumEdge.__init__NrS   r5   r  r   ra   rZ   r@  rA   r  r  r,  e_typer  r   c
                 C  s,  t || j| _t || j| _t || j| _t || j| _t || j| _t || j| _t || j| _t || j| _t |	| j	| _	| jd urH| j}n'| jd u rPd}nt
| jtrj| jd urjt | j| jd| j d}nt| j}| jd u rvdn
| jj d| jj }
|r|
rdnd}|| |
 p| j	| _| jS )Nr[   z0.r   r  r   )rL   rS   r  ra   r@  r  r  r,  r  r  r)   rE   r   r  r  r9  )rr   rS   r  ra   r@  r  r  r,  rB  r  r  r  r   r   r   rA    s(   



$zNumEdge.updatec                 C  s  | j d ur| jd ur| j d| j  }nt| j }nd }| jr!dndd| j d| j d| j d| j d| j 
 | j	r=d	nd |d urHd
| nd | j
d ur]t| j
| jkr]d| j
 nd | jd urp| j| jkrpd| j nd | jr|d| j dn  | jrd| j  S d S )Nr}  r[   z *r:  ro   r;  z R:z T:z LPz B:z V:z VS:z F:.r   z S:)r  r  r   r?  r  r  r  r9  r  r   rS   r  r@  r,  )rr   b_clauser   r   r   r     s,   

*($zNumEdge.__str__r2  )
r  r9   r  r9   r,   r   rX   rY   r?  r'   )	NNNNNNNNN)rS   r5   r  r   ra   rZ   r@  rA   r  rA   r  r5   r,  r   rB  r   r  r   r   r   )r   r|   r}   rv   rA  r   r   r   r   r   r    s     r  c                   @  s  e Zd ZdZd}d~dd	Zd
d ZdddZdd ZedddZ	dddZ
dddZdddZdddZddd Zdd#d$Zdd%d&Zdd(d)Zdd+d,Zdd1d2Zdd4d5Zdd6d7Zddd:d;Zdd>d?ZddAdBZdddEdFZddHdIZdJdK ZeddMdNZeddPdQZeddRdSZeddUdVZddXdYZ dZd[ Z!d\d] Z"d^d_ Z#eddgdhZ$ddidjZ%ddkdlZ&dddndoZ'dddpdqZ(dddrdsZ)dddtduZ*		CdddydzZ+edd{d|Z,dS )r  z8Lattice for a specific romanization instance. Has edges.Nr,   r   rX   r   r   c                 C  sH   || _ || _tt| _t|| _|| _i | _i | _	tt
| _|   d S r(   )r,   r   r   r   r   r1   
max_vertexrX   propssimple_top_rom_cacher'   contains_scriptcheck_for_scripts)rr   r,   rX   r   r   r   r   rv     s   


zLattice.__init__c                 C  s>   | j D ]}| j|}d| j|< td| j rd| jd< qd S )NTz[\u2800-\u28FF]Braille)r,   rX   r  rG  r!   r   )rr   r   r  r   r   r   rH    s   


zLattice.check_for_scriptsr  r  c                 C  sL   | j |j|jf | | j |jdf |j | j |jdf |j d S )Nrightleft)r   r  r  r   )rr   r  r   r   r   add_edge  s   zLattice.add_edgec                 C  sn   g }t | jD ]*}| j|df D ] }| j||f D ]}|d| d| d|j d|j d	 qqqd|S )NrJ  r:  ro   r;  r  r  r   )r   rD  r   r   r9  r  rb  )rr   r   r  r  r  r   r   r   r     s   *
zLattice.__str__r   r   r'   c                 C  s    dt | d   kodkS   S )Ni (  r   i(  )r  )r   r   r   r   char_is_braille  s    zLattice.char_is_braillec                 C     d| j |v S )NzSUBJOINED LETTERrX   ri   rr   r   r   r   r   char_is_subjoined_letter  r+   z Lattice.char_is_subjoined_letterc                 C  s   | j |}d|v od|vS )NLETTER	SUBJOINEDrO  )rr   r   rg  r   r   r   char_is_regular_letter  s   zLattice.char_is_regular_letterc                 C  rN  )NrR  rO  rP  r   r   r   char_is_letter  r+   zLattice.char_is_letterc                 C  s   | j jd|f S )Nr0  )rX   r   rP  r   r   r   char_is_vowel_sign  r+   zLattice.char_is_vowel_signc                 C  s   |  |p	| |S r(   )rU  rV  rP  r   r   r   char_is_letter_or_vowel_sign  s   z$Lattice.char_is_letter_or_vowel_signpositionr9   c           
      C  s   | j | }| |}|}| jd|fd  }dv r| S |r4| j |d | }| r4d| jd|f< dS | j|df D ]2}| j||f D ](}|jdkrMd n|jd }	t|jrl|	 sa|rl|	d	v rld| jd|f<   dS qDq;d| jd|f< dS )
Npreceded_by_alphaTFr    TFrK  r[   rT  )')r,   rM  rE  r   isalphar   r9  r1   )
rr   rX  
first_charfirst_char_is_brailler  rY  prev_orig_letterr  r  prev_letterr   r   r   is_at_start_of_word   s(   

zLattice.is_at_start_of_wordc           	      C  s`  | j d|fd  }dv r| S |}|| jk r-| j||d  }| r-d| j d|f< dS |d | jk rg| j| j| rgd| j| j| v rg|d7 }|d | jk rg| j| j| rgd| j| j| v sHt|d | jd D ]5}| j|| }| jj	d|f s n#| jj
| D ]}|d }|d	 std
|rd| j d|f<   dS qqqd| j d|f< dS )Nfollowed_by_alpharZ  r    TFNUKTArH  r   r   z\pL)rE  r   rD  r,   r\  rX   r  ri   r   r   r   r!   r   )	rr   rX  cached_followed_by_alphar  next_orig_letterr  r,   r  r   r   r   r   is_at_end_of_word5  s<   
zLattice.is_at_end_of_wordTuple[bool, str]c           	      C  s  |dkr| j |d  nd}|| jk r| j | nd}| jjd|f r2|d }|| jk r/| j | nd}n|}|d | jk rB| j |d  nd}|du rJdS td|sRdS | jjd|f d	kr^d
S | jjd|f d	krjdS || jkrqdS td|sydS |dkr| |d }|rtd|jrdS t	| j
||d dd| j
||d ddd}td| sdd| fS |dkr|durt	| j
|d |d ddd}td| rdS dS )zAt least initially for Thair.   Nr   r    )Fzstart-of-stringz(?:\pL|\pM)$)Fzstart-of-tokenr   +written-pre-consonant-spoken-post-consonant)Fzpre-post-vowel-on-left)Tzpre-post-vowel-on-right)Tzend-of-stringz(?:\pL|\pM))Tzend-of-tokenr   z[bcdfghjklmnpqrstvxz]$)Fzconsonant-to-the-leftT)simple_search?z[aeiou]znot-followed-by-vowel    อ)Tzo-ang-followed-by-vowel)Fznot-at-syllable-end-by-default)r,   rD  rX   r   r!   r   r"   best_left_neighbor_edger9  rL   *simple_top_romanization_candidate_for_spanr  )	rr   rX  	prev_char	next_charadj_position
next_char2	left_edgenext_char_romnext_char2_romr   r   r   is_at_end_of_syllableN  sZ    
zLattice.is_at_end_of_syllabler   c                 C  s,   z| j j| d d W S  ty   Y d S w )Nr   r   )rX   r   
IndexError)rr   r,   r   r   r   romanization_by_first_rule}  s
   z"Lattice.romanization_by_first_ruler   r  r   Tuple[str, int, int, str | None]c                 K  s  |}| j }| j}d}|dkr|||dfS |dkr||d  nd}	|| }
||d  }|t|k r4|| nd}|	dkrRtd|rR|d  |dd  |d |dfS |d |krn| rn| rn|dd}d	|vrn|	 }|	r|	d
v r|
|	|
|	krtd| }r|	dv r|ddd| }n|ddd| }|d }|| }
|dkr||d  nd}	|
|
dkr|d |kr td|r |jd|	f dkr dD ]H}||krdD ]>}|| t|kr||| | d ||||   }|j| r|j| d }|d }|| || || df    S qq|
|	dkrK|jd|	f dkrKtd|rK| |	 }rK|| |d |dfS |
dkr|| dkr|
|	}|
|}| jd|ddd}| |}|dkr|dkrtd|rtd|sd||dfS |r|dkr|
|dkr| ||d sd| }|d }||d  }|t|k r|| nd}d}d}|r|d v r|
||
|krtd!|r| | }r| ||d s| ||d s|dd" | }|d }||d  }|t|k r|| nd}d}t|}|d#kr:|
|d$v r:|d%v r:|| ||d dfS | j jd&|f rK|||d dfS |d'ra|dks[|	d'kra|dd }|d'r{|t|d ksu|d'kr{|dd" }||||fS )(zThis method contains a number of special romanization heuristics that typically modify
        an existing or preliminary edge based on context.Nr[   r       ⠠z[a-z]r   zrom expablationnocapu	   っッੱz(ch|[bcdfghjklmnpqrstwz])u   っッchr   r   Thai[bcdfghjklmnpqrstvwxyz]+$r   rh  r    )r-   r.   r    u   –z[bcdfghjklmnpqrstvwxyz]rk  T)
return_strz[bcdfghjklmnpqrstvwxz]+$zrom delu   ̀Copticeu   ゃゅょャュョz([bcdfghjklmnpqrstvwxyz]i$)rT  u   ー)HiraganaKatakanaaeiour2  r   )rX   r,   r1   r!   r"   upperisupperislowerr   
capitalizer  r#   rp   r   r   rw  find_rom_edge_path_backwardsrm  r   r3   r   r   r  )rr   r   r  r  r   
orig_startrX   full_stringannotrn  r]  	last_charro  rz  m_double_consonantvowel_prefix_lenvowel_suffix_lenpatternvowel_rom_rule	vowel_romprev_scriptnext_scriptprev_romnext_romy_romlast_rom_charr   r   r   expand_rom_with_special_chars  s   $
 





 


 (z%Lattice.expand_rom_with_special_charsrR   c                 K  sv   | j d r7d}d}t| jD ]*\}}|dkr&| j|d  |kr&||kr&d}q|r6|dv r/d}qd| jd|f< qd S d S )NrI  ry  Fr    Tu   ⠀is-upper)rG  r   r,   rE  )rr   _argsdots6all_capsir   r   r   r   r!    s   
"zLattice.prep_braillec                   s  | j d sd S t|d}| j}| j}g }g }t| jD ]!}|| }||dkr5| |r5|	| q|r>|	| g }q|rF|	| |D ]}d }	d}
g }g  |d }|D ]}|| }|
|7 }
t
| ||d d}d | jd|f< | |s|rtd|r|}	d	| jd|f< |d
gkrd	| jd|d f< nr| |rˈ 	| ||kr|dkr|d }	d	| jd|d f< n	d| jd|d f< tdd|}nA|dkrd| jd|f< ||kr|d }	d	| jd|d f< ||d krd	| jd|f< |r|d dvrd}n
d
}ntdd|}|	| qY|	d ur0|D ]}| jd|fd u r-d| jd|f< qqHtjd d d f\}}}}t|}|D ]}|| }d|d |d  d||d d  }}| jd|fdu r{d}||k rz||||f\}}}}n|dkrd}||k r||||f\}}}}nj|dkr|dkrdnd}||k r||||f\}}}}nLtd|}td|}t fdd||d d  D }|r|rt|d }n"|rt|}n|r|rt|d }n|rt|d }ntj}||k r||||f\}}}}qA|d ur.|D ]}| jd|fd u r,||k}|| jd|f< q|rZt|tr9|nt|d}tjd| d| d|
 d t|d d!| d"| d# qHd S )$NTibetanr   r[   r   r    rj  
edge-vowel	[aeiou]+$Tr[  edge-deleteu   ྰFz([bcdfghjklmnpqrstvwxyz].*)a$z\1u   འrT  r  za'r  r.   g?z(?:|[bcdfghjklmnpqrstvwxz]|bh|bs|ch|cs|dd|ddh|dh|dz|dzh|gh|gr|gs|kh|khs|kss|n|nn|nt|ms|ng|ngs|ns|ph|rm|sh|ss|th|ts|tsh|tt|tth|zh|zhs)'?$z'?(?:.|bd|br|brg|brgy|bs|bsh|bst|bt|bts|by|bz|bzh|ch|db|dby|dk|dm|dp|dpy|dr|gl|gn|gr|gs|gt|gy|gzh|kh|khr|khy|kr|ky|ld|lh|lt|mkh|mny|mth|mtsh|ny|ph|phr|phy|rgy|rk|el|rn|rny|rt|rts|sk|skr|sky|sl|sm|sn|sny|sp|spy|sr|st|th|ts|tsh)$c                   s   g | ]}| v qS r   r   r?  subjoined_letter_positionsr   r   r   Z  s    z3Lattice.pick_tibetan_vowel_edge.<locals>.<listcomp>g333333?g      ?zTib. best cost: "az"  o:z  c:z   p:r   rD   )rG  r'   r   r,   rX   r   rD  r  rW  r   rL   rm  rE  rV  r!   r"   rQ  r   mathinfr1   rb  allr)   r9   roundrF   rG   rH   )rr   r   r   r,   rX   tibetan_syllabletibetan_letter_positionsr  r   	vowel_posr  romsfirst_letter_positionr  r   	best_costbest_vowel_posbest_pre	best_post	n_lettersrel_posr  postcostgood_suffixgood_prefixsubjoined_suffixrS   r   r  r   r      s   








.







&
zLattice.pick_tibetan_vowel_edger[   r8  c                 C  s  | j }| j}z|| }||d  }||}	| j j|	  }
|
d  }s(|W S |
|f}||jv r<|j| \}}}|}nzd|}dtdd |}t	d| d| }rd|
d}||
d }n8t	d	| d| }r|
d}||
d }|d
r|d |kr|d  r|dd }n|}||d  }t	d|s|	dkr|dksd\}}|||f|j|< |du r|W S d|v r|W S |dkr||d  nd}t||kr|| nd}t||d kr||d  nd}|	dkr| jd|frW dS | jd|fr|W S |W S |r |dv s|dv r |dv r |W S | j jd|f r,|W S | j jd|f r8|W S | |rA|W S | j |rT| j jd|f rT|W S | j jd|f r`|W S | j |rs| j jd|f rs|W S | j jd|f r|W S | |rtd|s|W S | |r|	dv r| jdvr|W S | jdv r|W S |W S |||	kr|W S d | j |v r|W S |||	kr|W S W n ty   | Y S w 	 |S )!zZAdds an abugida vowel (e.g. "a") where needed. Important for many languages in South Asia.r    zabugida-default-vowels|c                 S  s   | d S )N+r   )r@  r   r   r   r     s    z3Lattice.add_default_abugida_vowel.<locals>.<lambda>z([cfghkmnqrstxy]?y)(z)-?$r.   z([bcdfghjklmnpqrstvwxyz]+)(ro   r   NrT  r~  r  r[  r1  tailr[   r  r  bcdfghklmnpqrstvwz)ngu   យr0  r1  r2  zr[aeiou])
Devanagari)san)asmbengujkaspanVOCALIC)rX   r,   r  r   r  r   rb  r   r!   r"   r#   r  r\  r1   rE  r   r   rQ  r  ra  r   rf  r   ri   	Exception)rr   r   r  r  r8  rX   r,   first_s_charlast_s_charr  r,  r  rQ   base_rombase_rom_plus_vowelmod_romvowels_regex1vowels_regex2r%   prev_s_charnext_s_charnext2_s_charr   r   r   add_default_abugida_vowelt  s   




" 

z!Lattice.add_default_abugida_vowelr  r   c                 C  s   |d u rdS |d r|  |rdS |d r|  |sdS |d r'| |r'dS |d r2| |s2dS |d rB|  |r@| |sBdS |d  }rO| j|vrOdS dS )	NFr   r   r   r   r   r   T)ra  rf  r   )rr   r  r  r  r   r   r   r   r   cand_is_valid  s&   zLattice.cand_is_validr  c                 C  s   | j || }| jjd|f sg S g }| jj| D ]}|d }| ||||r4||d p.d|d f q|jdd dd |D S )	NrH  r   n-restrr   Treversec                 S  s   g | ]}|d  qS r  r   r?  r   r   r   r     r   zJLattice.simple_sorted_romanization_candidates_for_span.<locals>.<listcomp>)r,   rX   r   r   r  r   sort)rr   r  r  r,   rom_rule_candidatesr  r   r   r   r   .simple_sorted_romanization_candidates_for_span  s   z6Lattice.simple_sorted_romanization_candidates_for_spanFri  c                 C  s   |dk s	|| j krd S ||f}| j| }d ur|S d\}}}| jj| j||  D ]$}	| |	|||	d rO|	d p<d}
|d u sE|
|krO|	d |
|	}}}q+|rT|S |ri|d }|d uri| |\}}|ri|}|| j|< |S )Nr   NNNr   r  t-at-end-of-syllable)rD  rF  r   rX   r   r,   r  ru  )rr   r  r  ri  
span_rangecached_result	best_candbest_n_restrbest_rom_ruler  r   r   ru  	rationaler   r   r   rm    s,   

z2Lattice.simple_top_romanization_candidate_for_spanchar_positionc              	   C  s   | j }|| }d}t| }rg }g }d}| D ])}	|	dr'||	 qz	tt|	d}
W n ty>   ||	 Y qw ||
7 }q|rX|d dvrX|sX|rX| j	
|| j}|rt|dr|dd}|d	krxt||d	  drxd
| }|d	 t|k rt||d	  dr|d
7 }|S )uI   Input: decomposable character such as ﻼ or ½
        Output: la or 1/2Nr[   <r\   r   )z<super>z<sub>z	<noBreak>z<compat>u   ⁄r  r    r   )r,   r6   r]   r^   r   r   r_   r9   r:   rX   r^  r   r7   rp   r1   )rr   r  r  r4   r   ud_decomp_sformat_compsother_compsdecomp_srb   	norm_charr   r   r   
decomp_rom
  s:   

$zLattice.decomp_romc              	   K  s  t | jD ]}t |d | jd D ]}| jjd| j|| f s" n| || }dur| jd rC|d |krC| jd|frC|	 }d}t
d|rT|dd d}}| j||||d	}||r||t|d }|r|t
d
|r||d| d| 7 }|}| j|||f||ddd|\}}}	}
|
p|}| t||	|| q|t| jk r| j| }t|}d|  krdkrn n| j| }r| t||d |d | | }r| t||d |d qdS )z5Adds a romanization edge to the romanization lattice.r    rH  NrI  r  r   \+(m|ng|n|h|r)zrom tail)r8  r  z c:z s:	recursiveF)r8  r  r  r  
rom decomp)r   rD  rX   r   r,   rm  rG  rE  r   r  r!   r"   r  r   r1   r  rL  r  r  r  r  )rr   r   r  r  r   edge_annotationnew_romsuffixstart2end2exp_edge_annotationr4   r  
rom_decompr   r   r   r"  +  sH   



zLattice.add_romanizationList[NumEdge]c                 C  sP   d}g }| D ]}||v rd|_ |r|| d}q|| q|r&|| |S rM   )r?  r   )r   new_edge	old_edgesnew_edge_not_yet_addedr   r  r   r   r   update_edge_listR  s   

zLattice.update_edge_listEdge | Nonec                 C  sT   t | to)| jd uo)t | jto)| jdko)d| j  kodkn  o)| j| j dkS )Nr  r   r~  r    )r)   r  rS   r9   r  r  r  r  r   r   r   edge_is_digitb  s   

zLattice.edge_is_digitc                 C  s   t | to	| jdv S )N)u   零u   〇)r)   r  r  r  r   r   r   is_gap_null_edgek  s   zLattice.is_gap_null_edger4   c                 C  s   d | }|dkrt|S d S )Nu   ⠚⠁⠃⠉⠙⠑⠋⠛⠓⠊r   )findr   )r4   rX  r   r   r   braille_digito  s   
zLattice.braille_digitr9  c                 K  s$   t |||| j}d|_| | d S )Nnumber)r  rX   r  rL  )rr   r  r  r9  r  r  r   r   r   add_braille_numbert  s   zLattice.add_braille_numberc                 K  s   | j d ru| j}d\}}tt|D ]O}|| }|dkr#|d u r"|}q|d ur3| | }r3||7 }q|d ur@|dkr@|d7 }q|d urM|dkrM|d7 }qt|tra|dkra| ||| d\}}q|d urw|dkry| |t|| d S d S d S d S )	NrI  )r[   Nu   ⠼u   ⠲rC   u   ⠂,r[   )rG  r,   r   r1   r   r)   r9   r  )rr   r  r,   r>   r  r  r4   digit_sr   r   r   r$  y  s.   



zLattice.add_braille_numbersc               	   K  sr  t |d}| j}g }tt|D ]%}|| }|j| r7t||d ||}|| |r2td| | 	| q|D ]}	| 
|	r|	jrd}
d}t|	j}|	g}|	}	 | |j}| 
|rw|| |t|j7 }|durt|d7 }|}nU|jt|k r||j dkr|
dkr| |jd  }r| 
|r|du rt|j|jd ||j d}| 	| || || |dt|j 7 }|
d7 }
d}|}nnqUt|d	krd|v rt|nt|}t|d j|d
 jt||dd}|j|||dd|d
 jd | 	| | |||}|rt|j| q:|D ]}	t|	tr|	jr|	jdkrt|	jtr|	jdkr| j|	jdd}|rt|tr|jrt|jtr|jdkr|js|	j|j }t|	j|jt||dd}|j||jd|	j|j |jd | 	| | |||	|g}|rt|j| q|D ]}	t|	tr|	jrt|	jtr|	js|	g}|	}|	}d\}}|r9| j|jdd }r9t|tr9|jr9t|jtr9|js9| |s|j|jkr9|j|jkr9|| |}| |s|}|r9| j|jdd }r9t|tr9|jr9t|jtr9|js9| |s|j|jkr9|j|jkst|d	krtdd |D }t|d j|d
 jt||dd}|j||d
 jdddd |D |d
 jd | 	| | |||}d|_|rt|j| q|D ]}	t|	tr|	jr|	jst|	jtst|	jtr| j|	jdd}|rt|tr|jrt|jtr|jdkr|jrt|	j|j d}t|tr|  rt|}t|	j|jt||dd}|j||jd|	j|j |jd | 	| | |||	|g}|rt|j| q|D ]}	t|	tr5|	jr5t|	jtr5|	g}|d
  }r| j|jdd }rt|tr|jrt|jtr|j|jkr|j|jkr|jdkr|jdkrd|jvrt!"dt|jrd|j  krdkrn n|jd |jkr|jd }||j }||_||_d |_|| |d
  }r| j|jdd }rt|tr|jrt|jtr|j|jkr|j|jksjt|d	kr5td!d |D }t|d j|d
 jt||dd}|j||d
 jd"dd#d |D |d
 jd | 	| | |||}|r5t|j| q"|D ]}	|	jdu rCq9t|	jtsLq9| j#j$D ]}|	jt| }| j|	j| |krfqP| |}|jdu rsqP|	jd$krt|jtst|jtr|	jdkrt|	j|j|j d%d&}| 	| | |||	|g}qPt|jtr|	jdkrt|	j|j|j d'|	j | j#d}t%|j|	j|_&d(|_| 	| | |||	|g}qPq9|D ]m}	| '|	j}| j#j(D ]-}| j|	jt| |	j |kr t|	jt| |	jd)|	j) |	j d*}| 	| q| j#j*D ]-}| j|	jt| |	j |krRt|	jt| |	jd+|	j) |	j d,}| 	| q&q|D ]1}	t|	trt!"d-|	j)r| '|	j}|rt!+d.|j)r|	j&rd/}nd0}||	j) |	_)qX|D ]1}	t|	tr|	jr|	jdur|	jdkr|	jd |	jks|	jd1v s|	jd2v rd|	_q|r|rtd3 |D ]}t| qtt|D ]^}|| }| ,||d  }rt|trqt-| }dur5| j#.|}d4|v r)t|tr)d|  krdkr)n n| 	t||d t|d5 q|j/d6||f  d7  < qdS )7zZAdds a numerical romanization edge to the romanization lattice, currently just for digits.r   r    r  r   NTrC   zdecimal periodr.   rT  )r?  D1)rS   r  r@  r  rB  r,  Fskip_num_edgeG1)rS   r  rB  r  r,  r1  c                 S     g | ]}|j qS r   r   r   r  r   r   r   r         z'Lattice.add_numbers.<locals>.<listcomp>G2r[   c                 S  r	  r   r  r
  r   r   r   r     r  r  G3r[  r  tagz10+$r~  r  G4tagc                 S  r	  r   r   r
  r   r   r   r   '  r  G4c                 S  r	  r   r  r
  r   r   r   r   *  r  r  %
percentager  ra   ro   z -r  z +z\dz\d$r      ·u   兩參参伍陆陸什)u   京兆zactives:rX  r   z*NUM)0r'   r   r,   r   r1   r   r  r   r   rL  r  r?  r   rS   best_right_neighbor_edger  r  rE   r9   r  rA  r,  r  r  r)   r  r   r  r  sumrb  r  r8   r!   r"   rX   r   r   ra   rl  r   r9  r   r   best_edge_in_spanr=   ri   r   ) rr   rX   r   r   r,   	num_edgesr  r4   r  r  n_decimal_pointsr@  new_value_s	sub_edges	prev_edge
right_edgeright_edge2	new_valueprev_non_edgenew_num_baser   fraction_connector_end
_left_edge
minus_sign	plus_signrr  sepnum_edge
start_char	best_edger   rg   r   r   r   r#    sB  






$


"

 



*


"






"




"

(

 *
 *
 "2 zLattice.add_numbersc                 K  s   t | jD ]d}|d }| j| }| j||f si|d}}| j|r'd\}}n8| j|r2d\}}n-t|dkr>d\}}n!|dkrEd}n| 	|| }dur_|}t
d	|r]|dd }d
}| t|||| qdS )zFor characters in the original string not covered by romanizations and numbers,
        add a fallback edge based on type, romanization of single char, or original char.r    orig)r[   r  )r[   r  Co)r[   r+  r   Nr  z
rom single)r   rD  r,   r   rX   r  r  r6   rB  rm  r!   r"   rL  r  )rr   r  r  r  	orig_charr   r  rom2r   r   r   r%  |  s*   




z!Lattice.add_rom_fall_back_singlesr  
List[Edge]r  new_typerA   old_edge_dictrP   c                 C  sV   |||f|vr)t ||||}|d u r| | n| |d | |||||f< d S d S r0   )r  r   insert)r  r  r  r  r/  rX  r0  r  r   r   r   add_new_edge  s   zLattice.add_new_edgec                 C  sR  i }|D ]}|||j |j|jf< qt|D ]\}}|jdr q|j |j}}| j|| }|j}td|j }	rC|		dd\}
}nd\}
}| j
j| D ]X}|d }| ||||r|d }|d }|||
fv r|r|D ]}|ry||
kry||7 }| ||||d|| qm||kr|r| ||||d	|| ||kr| ||||d
|| qMqd S )Nzrom-altz\bc:([a-z]+)\s+s:([a-z]+)\br    r.   r1  r   zt-altsr  zrom-alt2zrom-alt3)r  r  r9  r   r  r   r,   r!   r   r#   rX   r   r  r2  )rr   r  r0  old_edgerX  r  r  orig_sold_romr%   old_rom_coreold_rom_suffixr  rom_trom_altsrom_end_of_syllablerom_altr   r   r   r'    s>   zLattice.add_alternativesc                 C  sV   g }t ||D ]!}tt| j|df ddD ]}||kr'|| j||f  q q|S )NrJ  Tr  )r   r`  r   r   extend)rr   r  r  r   r  r  r   r   r   r&    s   zLattice.all_edgesr  c           	      C  s   | j ||f }d\}}}|D ]2}t|tr|rq|jr|  S |jdr,|d u r+|}qtd|jr:|d u r9|}q|d u r@|}q|pF|pF|S )Nr  r  z(?:rom|num))r   r)   r  r?  r  r   r!   r"   )	rr   r  r  r  r   decomp_edge
other_edgerom_edger  r   r   r   r    s(   

zLattice.best_edge_in_spanc                 C  s@   t t| j|df ddD ]}| j|||d }r|  S qd S )NrJ  Tr  r  r`  r   r   r  )rr   r  r  r  r)  r   r   r   r    s
   z Lattice.best_right_neighbor_edgec                 C  s<   t t| j|df D ]}| j|||d }r|  S qd S )NrK  r  r@  )rr   r  r  r  r)  r   r   r   rl    s
   zLattice.best_left_neighbor_edgec                 C  sH   g }|}||k r"| j ||d }r|| |j}n|d7 }||k s|S )zFinds the best romanization edge path through the romanization lattice, including
        non-romanized pieces such as ASCII and non-ASCII punctuation.r  r    )r  r   r  )rr   r  r  r  r   r  r)  r   r   r   r(    s   
zLattice.best_rom_edge_pathmin_charr  r<  c                 C  sz   g }d}|}||k r7|}	| j ||d }
r"|
g| }|
j| }|
j}|r+t||kr+n|	|kr3|d8 }||k s
|r;|S |S )zFinds a partial best path on the left from a start position to provide left contexts for
        romanization rules. Can return a string or a list of edges. Is typically used for a short context,
        as specified by min_char.r[   r  r    )rl  r9  r  r1   )rr   r  r  rA  r  r  result_edgesr   r  old_end2r  r   r   r   r    s"   


z$Lattice.find_rom_edge_path_backwardsc                 C  s   d}| D ]}||j 7 }q|S )Nr[   )r9  )r   r   r  r   r   r   r)  	  s   zLattice.edge_path_to_surfr(   )r,   r   rX   r   r   r   )r  r  )r   r   r   r'   )rX  r9   r   r'   )rX  r9   r   rg  )r   r   )r   r   r  r9   r  r9   r   rx  )r   rR   )r[   )
r   r   r  r9   r  r9   r8  r   r   r   )
r  r   r  r9   r  r9   r   r   r   r'   )r   r  r2  )ri  r'   r   r   )r  r9   r   r   )r   r  )r  r  r   r'   )r  r  r   r'   )r4   r   r   r   )r  r9   r  r9   r9  r   r   rR   )r  r.  r  r9   r  r9   r  r   r/  r   rX  rA   r0  rP   r   rR   )r  r.  r   rR   )r  r9   r  r9   r   r.  )r  r9   r  r9   r  r'   r   r  )r  r9   r  r'   r   r  )r  r9   r  r'   r   r  )r  r9   r  r9   r  r'   r   r.  )NFF)r  r9   r  r9   rA  rA   r  r'   r  r'   r   r<  r>  )-r   r|   r}   r   rv   rH  rL  r   r6  rM  rQ  rT  rU  rV  rW  ra  rf  ru  rw  r  r!  r   r  r  r  rm  r  r"  r  r  r  r   r  r$  r#  r%  r2  r'  r&  r  r  rl  r(  r  r)  r   r   r   r   r    sh    









/

p
t
[

!'
 n

 
r  c                    s8  t  } | jddtd | jdtddd | jdd	td
d | jddtdd | jddtddd | jddttjttdd | jdtddd | jddddd | jddddd | jdd ddd!d | jd"d#ddd$ | jd%ddd&d | jd'ddd(d | jd)d*tt	d+d | jd,ddd-d | jd.d/td0d1d | jd2ddd3d | jd4ddd5d | jt
t jd6d7d8d9dd:d;d< | jd=d>d?t d@t dA |  }|j|j|j|j|j|j|j|j|j|j|j|jdB}d}|jr	t  ttj ttj  t!" }|  	 |j#r]t$|j% dC\}}}}t&| '| t&| j'|dDdE t&|t() j'|tj*dF t&|t() j'|tj*dF t&|t() j'|tj+dF  j,dGdHdI nt$|j%fi | |j-p|j.p|j/p|jp|j#p|jp|j }|j/D ],} j'|0dJfdK|j1i|}	t()|	}
|r|j-rt2j34|
dJ  qt&|
 q|rƈ j,|j-|j.f|j1|j/dL| |jr҈ 5   6   jr|jr fdMdNtt7 jddO D }t2j34dP| dQ |jr|r|8  t9j:||jdR;t9j<j=}|>  t&t?  dS dS )Sam  This function provides a user interface, either using argparse for a command line interface,
    or providing direct function calls.
    First, a uroman object will have to created, loading uroman data (directory must be provided,
    listed as default). This only needs to be done once.
    After that you can romanize from file to file, or just romanize a string.r  r}  )nargsr  z
--data_dirNzuroman resource dir)r  r   helpz-iz--input_filenamezdefault: sys.stdin)r  rE  z-oz--output_filenamezdefault: sys.stdoutz-lz--lcodez!ISO 639-3 language code, e.g. engz-fz--rom_formatz7Output format of romanization. 'edges' provides offsets)r  r   choicesrE  z--max_lineszlimit uroman to first n linesz
--load_logcountr   zreport load stats (boolean))actionr   rE  z--testzperform/display a few testsz-dz--decode_unicodeu2   decodes Unicode escape notation, e.g. \u03B4 to δz-vz	--verbose)rH  r   z--rebuild_ud_propsz:rebuild UnicodeDataProps files (for development mode only)z--rebuild_num_propsz1rebuild NumProps file (for development mode only)z-cz--cache_sizez	for speedz--silentzsuppress ... progressz-az
--ablationr[   zfor development mode: nocapz--statszfor development mode: numbersz--ignore_argszfor usage illustration onlyrZ  r   ignorer  zPROFILE-FILENAMEz*(optional output for performance analysis))r  r   metavarrE  z	--versionversionzuroman z   last modified: )rH  rK  )r  r   r  r   r   r  r   r   r   rz  rB   r-  )u
   Игорьr  u   ka‍n‍neu'   महात्मा गांधी  r  )r   )r  z../test/multi-script.txtz../test/multi-script-out3.txt)r  r  rD   r   )r   r  c                   s   i | ]}| j | qS r   )r   )r   krX   r   r   
<dictcomp>	  s    zmain.<locals>.<dictcomp>r  zStats: z ...
)stream)@argparseArgumentParseradd_argumentr   r   r   r   r   r9   r7  PROFILE_FLAGFileType__version____last_mod_date__
parse_argsr  r   r  r   r   r  r   r   r   rz  rB   r-  profiler   r   	set_debugDEBUG_STATS
DEBUG_LEAKcProfileProfileignore_argsr   r   r   r^  r  r  r   r   r  r  r  r  rJ  r   rF   rG   rH   r  r  rP   r   pstatsStats
sort_statsSortKeyTIMEprint_stats	get_stats)parserr   	args_dictprr,   s2s3s4romanize_file_pr   result_jsonstats100psr   rM  r   main	  s   




$rp  __main__r(   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r'   r0  )r4   r   r   r5   )NNF)
r>   r   r?   r   r@   rA   rB   r'   r   r5   r3  )rO   rP   rQ   r   r   rR   r1  )rU   r   rV   rW   rX   rY   r   rZ   r4  )rQ   r   r   rj   )7r   
__future__r   rP  collectionsr   r	   enumr   	fractionsr   r   r<  r  r  pathlibr   r_  r!   rF   typingr   r   unicodedatar6   r7  rS  argvr\  rU  rV  __description__r   r&   r*   r/   r3   r=   rI   rL   rN   rT   rf   ri   rl   rm   r   r   r   r   r  r  r  rp  r   r   r   r   r   <module>   s   







       1"J        8z
