o
    Xεi8x                     @   s*  d Z ddlZddlZddlZddlZddlmZ ddlZddl	m
Z
mZmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZmZmZ ed
Z						d[dedejejej eef   dejej eef  deje de!de!de!defddZ"G dd dZ#d\defddZ$d\defddZ%e&dZ'e&dZ(e&dZ)e&dZ*e&d ej+ej,B Z-d!ede!fd"d#Z.d!edeje/ fd$d%Z0d!edeje fd&d'Z1d(edeje fd)d*Z2d\defd+d,Z3d\defd-d.Z4d\defd/d0Z5G d1d2 d2Z6d3ed4ed5efd6d7Z7d\defd8d9Z8d3ed4ed5efd:d;Z9d<ed=ede!fd>d?Z:d@ede!fdAdBZ;d\defdCdDZ<d\defdEdFZ=d\defdGdHZ>d\defdIdJZ?d\defdKdLZ@d\defdMdNZAd\defdOdPZBd\defdQdRZCd\defdSdTZDG dUdV dVZEG dWdX dXZFG dYdZ dZZGdS )]zLanguage-specific settings    N)Path)PHONEMES_TYPE	GraphTypeSentenceNodeTime)GraphemesToPhonemes)SqlitePhonemizer)PartOfSpeechTagger)InterpretAsFormatTextProcessorSettings)find_lang_dirremove_non_word_charsresolve_langgruutTlangsearch_dirslang_dirmodel_prefixload_pos_taggerload_phoneme_lexiconload_g2p_guesserreturnc                 K   s  |pd}|r|}| }	nd| v r| j ddd\}	}n| }	d}t|	}	|du r+t|	|d}|durt|}|rQd|vrQ|d d	 }
|
 rJt|
|d< ntd
| |
 |r}d|vr}|| d }| rvtdd gt	j
d}t|fi ||d< ntd| | |rd|vr|| d d	 }| rt|t	j
d|d< ntd| | |	dkrt|fi |S |	dkrt|fi |S |	dv rt|fi |S |	dkrt|fi |S |	dv rt|fi |S |	dkrt|fi |S |	dkrt|fi |S |	dkrt|fi |S |	dkrt|fi |S |	dkrt|fi |S |	dkr(t|fi |S |	d kr5t|fi |S |	d!krBt|fi |S |	d"krOt|fi |S |	d#kr\t|fi |S td%d$| i|S )&z$Get settings for a specific language /   )maxsplitN)r   get_parts_of_speechposz	model.crfz)(%s) no part of speech tagger found at %slookup_phonemesz
lexicon.dbc                 S   s   t |  S N)r   lower)s r"   >/home/ubuntu/.local/lib/python3.10/site-packages/gruut/lang.py<lambda>P   s    zget_settings.<locals>.<lambda>)word_transform_funcscasing_funcz,(%s) no phoneme lexicon database found at %sguess_phonemesg2p)transform_funcz1(%s) no grapheme to phoneme CRF model found at %sarzcs-cz>   en-gben-uszde-de>   es-eses-mxfazfr-frzit-itlbnlptzru-ruzsv-seswzzh-cnr   r"   )splitr   r   r   is_fileDelayedPartOfSpeechTagger_LOGGERdebugr   strr    DelayedSqlitePhonemizerDelayedGraphemesToPhonemesget_ar_settingsget_cs_settingsget_en_us_settingsget_de_settingsget_es_settingsget_fa_settingsget_fr_settingsget_it_settingsget_lb_settingsget_nl_settingsget_pt_settingsget_ru_settingsget_sv_settingsget_sw_settingsget_zh_settingsr   )r   r   r   r   r   r   r   settings_argslang_model_prefix	lang_onlypos_model_pathlexicon_db_pathphonemizer_argsg2p_model_pathr"   r"   r#   get_settings   s   









rR   c                   @   s"   e Zd ZdZdedefddZdS )ArabicPreProcessTextz Pre-processes text using mishkaltextr   c                 C   sz   z(dd l }t| ds|j }t| d| nt| d}|d us!J ||}W |S  ty<   td td Y |S w )Nr   	vocalizerz/mishkal is highly recommended for language 'ar'zpip install 'mishkal>=0.4.0')	mishkal.tashkeelhasattrtashkeelTashkeelClasssetattrgetattrImportErrorr7   warning)selfrT   mishkalrU   r"   r"   r#   __call__   s   



zArabicPreProcessText.__call__N)__name__
__module____qualname____doc__r9   r`   r"   r"   r"   r#   rS      s    rS   c              	   K   sF   h dh dddhh dh dt jdgt d|}tdd	d
i|S )zCreate settings for Arabic      ؟.!>      ،:;-_      «   “   „"(<[      »   ”rr   )>]   ’')major_breaksminor_breaksword_breaksbegin_punctuationsend_punctuationsdefault_date_formatreplacementspre_process_textr   r*   Nr"   )r
   DATE_DMYrS   r   r   rK   r"   r"   r#   r<      s   	r<   c              	   K   D   h dh dddhh dh ddt jdgd	|}tdd
di|S )zCreate settings for Czech   rg   rh   ?   ,rj   rk   rl   rm      ro   r}   rp   rq   rr   rs   rt   ru      rw   r}   rx   rr   ry   rz   r{   EURr|   r   r   r   r   r   default_currencyr   r   r   cs_CZNr"   r
   r   r   r   r"   r"   r#   r=         	r=   z^\s*[A-Z]{2,}\s*$z^(?:\s*[a-zA-Z]\.){1,}\s*$z	^(\W|_)+$z#^(-?[0-9][0-9,]*)(?:st|nd|rd|th).*$z^((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3]))  # hours
         (?::
         ([0-5][0-9]))?                          # minutes
         \s*(a\\.m\\.|am|pm|p\\.m\\.|a\\.m|p\\.m)? # am/pm
         $rT   c                 C   s   t | dupt| duS )z)True if text is of the form TTS or T.T.S.N)EN_INITIALISM_PATTERNmatchEN_INITIALISM_DOTS_PATTERNrT   r"   r"   r#   en_is_initialism  s   r   c                 C   s.   t | }|durttdd|dS dS )z-Parse English ordinal string (e.g., 1st -> 1)Nz[^0-9]r   r   )EN_ORDINAL_PATTERNr   intresubgroup)rT   r   r"   r"   r#   en_get_ordinal  s   
r   c                 C   s   t |   }|du rdS t|d}|d}|du r!dnt|}|d}|dur8d|v r5d}n	d}nd	| vr>dS t|||d
S )z&Parse English clock time (e.g. 4:01pm)Nr      r      aA.M.P.M.rj   )hoursminutesperiod)EN_TIME_PATTERNr   stripr    r   r   r   )rT   r   r   maybe_minutesr   r   r"   r"   r#   en_parse_time  s   

r   timec                 c   s    | j }|dk}|dkr|d8 }n|dkrd}d}t|V  | j}|dkr1|dk r,dV  t|V  | jdu rB|r=dV  dS dV  dS | jV  dS )	zConvert time into words   r   T
   ohNr   r   )r   r9   r   r   )r   hour	past_noonminuter"   r"   r#   en_verbalize_time5  s&   





r   c                 K   s   i dh ddh ddddhdh d	d
h ddddddt ddd ddd dtdtdtddgdddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1|}td5d2d3i|S )6zCreate settings for Englishr   r   r      ...r   rj   rk   r   rl   rm   r   >	   *rm   ro   rp   rr   r~   rs   rt   ru   r   >	   r   rm   rw   rx   rr   r~   ry   rz   r{   r   USDr   z{m} {o}, {y}is_initialismsplit_initialismc                 S   s   t | ddS )Nrg   r   )listreplacer   r"   r"   r#   r$   ^  s    z$get_en_us_settings.<locals>.<lambda>is_non_wordc                 S   s   t | d uS r   )EN_NON_WORD_PATTERNr   r   r"   r"   r#   r$   _  s    get_ordinal
parse_timeverbalize_timer   r|   abbreviationsz\1ompanyz\1octorz\1octorsz	\1unior\2z\1imitedz\1isterz\1issz\1issusz\1treetz\1ersusz
\1 percentzand\1z\1ount)z
^([cC])o\.z
^([dD])r\.z^([dD])rs\.z^([jJ])r\.('s)?z^([lL])td\.z
^([mM])r\.z
^([mM])s\.z^([mM])rs\.z
^([sS])t\.z^([vV])s\.?z(.*\d)%z^&(\s*)$z
^([mM])t\.spell_out_wordsdotdashatstarplusslash)rg   rl   @r   +r   r   en_USNr"   )r   r   r   r   r   r   r"   r"   r#   r>   S  sr   	
&)r>   c              	   K   r   )zCreate settings for Germanr   r   rl   rm   r   r   r   r|   r   r   de_DENr"   )r
   DATE_DMY_ORDINALr   r   r"   r"   r#   r?     r   r?   c              	   K   r   )zCreate settings for Spanishr   r   rl   rm   >      ¡ro      ¿rp   rr   rs   rt   ru   rv   r   r|   r   r   es_ESNr"   r   r   r"   r"   r#   r@     r   r@   c                   @   s<   e Zd ZdZdefddZdeje deje fddZ	d	S )
FarsiPartOfSpeechTaggerzAdd POS tags with hazmr   c                 C   s
   || _ d S r   )r   )r^   r   r"   r"   r#   __init__  s   
z FarsiPartOfSpeechTagger.__init__wordsr   c                 C   s   g }z\dd l }t| dd }|d u r| }t| d| t| dd }|d u r:| jd d }|jt|d}t| d| d|}||	|D ]}|
||D ]	\}	}
||
 qQqGW |S  tyr   td td	 Y |S w )
Nr   
normalizertaggerr   zpostagger.model)model z,hazm is highly recommended for language 'fa'zpip install 'hazm>=0.7.0')hazmr[   
NormalizerrZ   r   	POSTaggerr9   joinsent_tokenize	normalizetagword_tokenizeappendr\   r7   r]   )r^   r   pos_tagsr   r   r   
model_pathrT   sentence_wordr   r"   r"   r#   r`     s0   

z FarsiPartOfSpeechTagger.__call__N)
ra   rb   rc   rd   r   r   typingSequencer9   r`   r"   r"   r"   r#   r     s    "r   graph	sent_nodesettingsc                 C   s   ddl m}m} t| |jD ]:}| |dksq| j| | }t||rIt	
||}|jrI|jdkrIt|jtr@|jd qt|jdg |_qdS )u   Add e̞ for genitive caser   	DATA_PROPWordNodeNeu   e̞N)gruut.text_processorr   r   nxdfs_preorder_nodesnode
out_degreenodes
isinstancer   castphonemesr   r   r   )r   r   r   r   r   dfs_noder   wordr"   r"   r#   fa_post_process_sentence  s   
r   c              	   K   s`   h dh dddhh dh dt jdgtd|}| d	ur'd
|vr't| |d
< tdddi|S )zCreate settings for Farsire   r   rl   rm   r   r   r|   )r   r   r   r   r   r   r   post_process_sentenceNr   r   r/   r"   )r
   r   r   r   r   r   r"   r"   r#   rA     s   	rA   c                 C   s  ddl m}m} ddlm} g }t| |jD ]!}| |dks!q| j	| | }t
||r8t||}	||	 q||dD ]\}
}|du rGq>|
jrS|
jrS|jrS|jsTq>d}|
jd }t||
jd }t|jd }|r|r|
jdkrtn6|
jd	v r|d
}n.|
jdkr|jdv rd
}n!|
jdks|
jdkrd
}n|
jdkr|jdv rd
}n|
jdv rd
}|r|
j}|dv r|d q>|dkr|d q>|dv r|| q>dS )zAdd liasons to phonemesr   r   )sliding_window   NFet>   DETNUMTPRON>   AUXVERBADPu   trèsADJ>   NOUNPROPN   r!   xzr  dt>   npr  )r   r   r   gruut.utilsr   r   r   r   r   r   r   r   r   r   rT   r   fr_has_silent_consonantfr_is_vowelr   )r   r   r   r   r   r   r   r   r   	word_nodeword1word2liason
last_char1ends_silent_consonantstarts_vowelliason_pronr"   r"   r#   fr_post_process_sentence  sV   






r  	last_charlast_phonemec                 C   sD   | dv r|| kS | dkr|dkS | dv r|dvS | dkr |dvS dS )	z*True if last consonant is silent in French>   r  r  r  ru   ʁr	  >   r!   r  r  >      ŋr  Fr"   )r  r  r"   r"   r#   r  M  s   r  phonemec                 C   s   | dv S )z!True if phoneme is a French vowel>      œ   ɔ   ə   ɛ   œ̃   ɑ̃   ɔ̃   ɛ̃r   eiouy   ør"   )r   r"   r"   r#   r  ]  s   r  c              
   K   F   h dh dddhh dh ddt jdgtd		|}tdd
di|S )zCreate settings for Frenchr   r   rl   rm   rn   rv   r   r|   	r   r   r   r   r   r   r   r   r   r   fr_FRNr"   )r
   r   r  r   r   r"   r"   r#   rB   r     
rB   c              
   K   r/  )zCreate settings for Italianr   r   rl   rm   rn   rv   r   r|   r0  r   it_ITNr"   )r
   r   r  r   r   r"   r"   r#   rC     r2  rC   c              
   K   sF   h dh dddhh dh ddt jdgd	d
	|}tddd	i|S )z!Create settings for Luxembourgishr   r   rl   rm   rn   rv   r   r|   r0   )	r   r   r   r   r   r   r   r   babel_localer   Nr"   r   r   r"   r"   r#   rD     r2  rD   c              	   K   r   )zCreate settings for Dutchr   r   rl   rm   rn   rv   r   r|   r   r   r1   Nr"   r   r   r"   r"   r#   rE     r   rE   c              	   K   r   )z&Create default settings for Portugueser   r   rl   rm   rn   rv   r   r|   r   r   r2   Nr"   r   r   r"   r"   r#   rF     r   rF   c              	   K   r   )zCreate settings for Russianr   r   rl   rm   rn   rv   RUBr|   r   r   ru_RUNr"   r   r   r"   r"   r#   rG     r   rG   c                 K   B   h dh dddhh dh dt jdgd|}tdd	d
i|S )zCreate settings for Swedishr   r   rl   rm   rn   rv   r|   r   r   r   r   r   r   r   r   sv_SENr"   r   r   r"   r"   r#   rH        
rH   c                 K   r7  )zCreate settings for Swahilir   r   rl   rm   rn   rv   r|   r8  r   r3   Nr"   r   r   r"   r"   r#   rI     r:  rI   c                 K   s<   h dh dh dh ddht dd|}tddd	i|S )zCreate settings for Chinese>      。   ！   ？>      、   ，   ：   ；   ……>      〈   《   「   【   ﹁   （   ［rr   >      〉   》   」   ﹂   ）   ］    】rr   u   ‧r   )r   r   r   r   r   split_wordsjoin_strr   zh_CNNr"   )r   r   r   r"   r"   r#   rJ     s   
rJ   c                   @   sh   e Zd ZdZ	ddejeef dejej	egef  fddZ
	ddedeje d	eje fd
dZdS )r;   z3Grapheme to phoneme guesser that loads on first useNr   r)   c                 K   s   || _ d | _|| _|| _d S r   )r   r(   r)   g2p_args)r^   r   r)   rT  r"   r"   r#   r   2  s   
z#DelayedGraphemesToPhonemes.__init__r   roler   c                 C   sZ   | j d u rtd| j t| jfi | j| _ | j d usJ | jd ur(| |}|  |S )Nz-Loading grapheme to phoneme CRF model from %s)r(   r7   r8   r   r   rT  r)   )r^   r   rU  r"   r"   r#   r`   =  s   



z#DelayedGraphemesToPhonemes.__call__r   )ra   rb   rc   rd   r   Unionr9   r   OptionalCallabler   r   r`   r"   r"   r"   r#   r;   /  s     
r;   c                   @   sF   e Zd ZdZdejeef fddZdej	e dej	e fddZ
d	S )
r6   z"POS tagger that loads on first user   c                 K      t || _d | _|| _d S r   )r   r   r   tagger_args)r^   r   rZ  r"   r"   r#   r   Q     

z"DelayedPartOfSpeechTagger.__init__r   r   c                 C   sF   | j d u rtd| j t| jfi | j| _ | j d usJ |  |S )Nz%Loading part of speech tagger from %s)r   r7   r8   r   r	   rZ  )r^   r   r"   r"   r#   r`   W  s
   

z"DelayedPartOfSpeechTagger.__call__N)ra   rb   rc   rd   r   rV  r9   r   r   r   r`   r"   r"   r"   r#   r6   N  s    "r6   c                
   @   sR   e Zd ZdZdejeef fddZ	ddedej	e d	e
d
ej	e fddZdS )r:   z"Phonemizer that loads on first usedb_pathc                 K   rY  r   )r   r\  
phonemizerrP   )r^   r\  rP   r"   r"   r#   r   c  r[  z DelayedSqlitePhonemizer.__init__NTr   rU  do_transformsr   c                 C   s\   | j d u rtd| j tt| j}tdd|i| j| _ | j d us&J | j |||dS )Nz$Connecting to lexicon database at %sdb_conn)rU  r^  r"   )	r]  r7   r8   r\  sqlite3connectr9   r   rP   )r^   r   rU  r^  r_  r"   r"   r#   r`   i  s   
z DelayedSqlitePhonemizer.__call__)NT)ra   rb   rc   rd   r   rV  r9   r   r   rW  boolr   r`   r"   r"   r"   r#   r:   `  s    r:   )NNNTTTr   )Hrd   loggingr   r`  r   pathlibr   networkxr   gruut.constr   r   r   r   	gruut.g2pr   gruut.phonemizer   	gruut.posr	   r   r
   r   r  r   r   r   	getLoggerr7   r9   rW  IterablerV  rb  rR   rS   r<   r=   compiler   r   r   r   
IGNORECASEXr   r   r   r   r   r   r>   r?   r@   r   r   rA   r  r  r  rB   rC   rD   rE   rF   rG   rH   rI   rJ   r;   r6   r:   r"   r"   r"   r#   <module>   s   
	
 !





	3%

J