o
    Xεi]                    @   sJ  d Z ddlZddlZddlZddlZddlm  mZ ddlm	Z	 ddl
mZ ddlZddlZddlZddlZddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ dd	l0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 e:d
Z;dZ<G dd dZ=dS )3Tokenizes, verbalizes, and phonemizes text and SSML    NDecimalPath)IPA)	num2words)	DATA_PROPPHONEMES_TYPEREGEX_PATTERN	BreakNode	BreakTypeBreakWordNode
EndElement	GraphType
IgnoreNodeInlineLexiconInterpretAsInterpretAsFormatLexemeMarkNodeNodeParagraphNodePunctuationWordNodeSentenceSentenceNode	SpeakNodeSSMLParsingStateTextProcessorSettingsWordWordNodeWordRole)get_settings)	attrib_no_namespaceleavesload_lexiconmaybe_split_ipapipeline_splitpipeline_transformresolve_langtag_no_namespacetext_and_elementszgruut.text_processor c                %   @   sz  e Zd ZdZ					dedededejejeejee	f f  dejej
ejee	f   d	ejejeef  f
d
dZ							dfdedededededededededej
e fddZdededej
e fddZdgdeje defddZdd Z		 													!dhd"edeje d#eded$ed%ed&ed'ed(ed)ed*ed+ed,ed-ed.ed/edejeef f"d0d1Zdedefd2d3Zdededefd4d5Zded6efd7d8Zded6efd9d:Zded6efd;d<Zded6efd=d>Zd?d@ Z 	dgdAejee!f deje deje" fdBdCZ#			didDejej$ej$e   dEejej%eeje gef  fdFdGZ&ded6efdHdIZ'ded6efdJdKZ(ded6efdLdMZ)ded6efdNdOZ*ded6efdPdQZ+ded6edefdRdSZ,ded6edefdTdUZ-ded6efdVdWZ.ded6efdXdYZ/dZed	edeje fd[d\Z0ded6efd]d^Z1ded6efd_d`Z2ded6efdadbZ3ded6efdcddZ4dS )jTextProcessorr   en_USr,   Ndefault_langmodel_prefix	lang_dirssearch_dirssettingsc                 K   sN   || _ || _|| _|| _|d u ri }dd | D | _|d u r"i }|| _d S )Nc                 S   s   i | ]	\}}|t |qS  r   ).0dir_langdir_pathr4   r4   H/home/ubuntu/.local/lib/python3.10/site-packages/gruut/text_processor.py
<dictcomp>\   s    z*TextProcessor.__init__.<locals>.<dictcomp>)r/   default_settings_kwargsr0   r2   itemsr1   r3   )selfr/   r0   r1   r2   r3   kwargsr4   r4   r8   __init__G   s   
zTextProcessor.__init__Tgraphrootmajor_breaksminor_breakspunctuationsexplicit_langphonemesbreak_phonemesposreturnc
           +         s  dt dt f fdd}
d}d}d}d}g }d}g }g }t||jD ]}|j| t }t|tr9|d7 }d}q#t|tr_|	t
||d	d	d	|j|
|j||rQ|ndd
	 d}g }|d7 }q#||dkrt|tr|srJ d|d }tt|}|j	tt|j|j|j|j|j|r|jnd|	r|jnd|
|j|j||r|ndd d}g }q#t|tr|sJ d|d }tt|}|jtjk}|jtjk}|r|s|r|r|j	tt|j|j|j|j|j|r|rj|j|jdnd|||
|j|j||r|ndd d}g }q#|rNt|t rN|sJ d|d }tt |}|j	tt|j|j|j|j|jd|
|j||rD|ndd	 d}g }q#t|t!rtt!|}"||ttt#f}|dur|$ }t%|&|j}|'|j|jf}|t|d k}t|tr|sJ |d }|r| j(|7  _(q#|jr|jd  j(|7  _(q#||7 }q#t|tr|r|d j|kr|d  j(|7  _(q#||7 }q#t|t#r|r|d  j(|7  _(q#||7 }q#t|t)rtt)|} | j*}!"||ttt#f}"|"durt%|&|"j}#|#'|"j| jf}$|$t|#d k}t|"trd|s.J |d }|rE|j+du r>g |_+|j+	|! q#|jr^|jd }%|%j+du rWg |%_+|%j+	|! q#|	|! q#t|"tr|r|d j|kr|d }&|&j+du rg |&_+|&j+	|! q#|	|! q#t|"t#r|r|d }&|&j+du rg |&_+|&j+	|! q#|	|! q#|D ]},|j}'|'j-rd	.dd |jD |_n1g }(|jD ]%})|)j/r|(r|(	|'j0 |)j  q|(	|)j q|(	|)j qd	.|(|_|'1|j|_|'j0.dd |jD |_2|j}*|jD ]})|)jr.|*r+|*|)jkr+d	}* n|)j}*q|*r@|*|_|jD ]})|*|)_q9q|S )z(Processes text and returns each sentencelangrH   c                    s    s| j kr	| S dS )Nr,   )r/   rI   rD   r<   r4   r8   get_langs   s   z)TextProcessor.sentences.<locals>.get_langNr      r,   )	idxpar_idxtexttext_with_wstext_spokenvoicerI   pause_before_msmarks_beforezNo sentence)rO   sent_idxrP   rQ   rR   rE   rG   rI   rT   rU   rV   rJ   )rO   rW   rP   rQ   rR   rE   is_minor_breakis_major_breakrI   rT   rU   rV   T)	rO   rW   rP   rQ   rR   is_punctuationrI   rU   rV   c                 s   s    | ]}|j V  qd S N)rR   r5   wr4   r4   r8   	<genexpr>\  s    z*TextProcessor.sentences.<locals>.<genexpr>c                 s   s    | ]	}|j r|jV  qd S r[   )	is_spokenrQ   r\   r4   r4   r8   r^   n  s    
)3strnxdfs_preorder_nodesnodenodesr	   
isinstancer   r   appendr   rT   rI   
out_degreer    typingcastwordsr   lenrO   rP   rQ   rR   rE   rG   r   
break_typer   MINORMAJOR_phonemes_for_breakr   r   _find_parentr   get_millisecondslist	out_edgesindexpause_after_msr   namemarks_afterr"   keep_whitespacejoinr_   join_strnormalize_whitespacerS   )+r<   r?   r@   rA   rB   rC   rD   rE   rF   rG   rL   sentencerP   rW   sent_pause_before_mssent_marks_beforeword_pause_before_msword_marks_before	sentencesdfs_noderc   	word_nodebreak_word_noderX   rY   punct_word_node
break_nodebreak_parentbreak_msbreak_parent_edgesbreak_edge_idxis_last_edge	mark_node	mark_namemark_parentmark_parent_edgesmark_edge_idx	last_wordlast_sentencer3   
word_textsword
sent_voicer4   rK   r8   r   e   s  




















zTextProcessor.sentencesc                 k   s0    | j ||fi |D ]
}|D ]}|V  qqdS )z$Processes text and returns each wordN)r   )r<   r?   r@   r=   sentr   r4   r4   r8   rj     s   zTextProcessor.wordsrI   c                 C   s   |p| j }| j|}|dur|S t|}| j|}|dur)| j| | j|< |S td|| | j|}t|f|| j| j	d| j
}|| j|< || j|< |S )z'Gets or creates settings for a languageNzCNo custom settings for language %s (%s). Creating default settings.)lang_dirr0   r2   )r/   r3   getr)   _LOGGERdebugr1   r"   r0   r2   r:   )r<   rI   lang_settingsresolved_langr   r4   r4   r8   r"     s6   


zTextProcessor.get_settingsc                 O   s   | j |i |S )zProcesses text or SSML)process)r<   argsr=   r4   r4   r8   __call__  s   zTextProcessor.__call__F   rQ   ssml	phonemizepost_processadd_speak_tagdetect_numbersdetect_currencydetect_datesdetect_timesverbalize_numbersverbalize_currencyverbalize_datesverbalize_times
max_passesc           D   
      sR  |r6zt 	W n% ty. } z|rt d	 dntd |W Y d}~nd}~ww fdd}n	fdd}ttt	 }d}d}d}d}t
j}g 
g g }|pYj g d}d}i d}dd}d}d} 
fd	d
} 	dGdtdtjt dtffdd}!| D ]}"t|"tr|rqtt|"	|t
jkr|dusJ 	 |_q|t
jkr|dusJ 	 	t	|_q|dur|	|du rtt|dd}|j|j|d |du r|}|dusJ |du rtdHt|dd| t}#|j|#j|#d ||j|#j |#}|dusJ |du r;tdHt|dd| t}$|j|$j|$d ||j|$j |$}|dusBJ |t
jkr	}% }&|&j rb|sb|%!|&j"sb|%|&j"7 }%| t#}'|ro|$ |'d< |&%|%}(t#dHt||(|%|!|(p&|(|&d|'})|j|)j|)d ||j|)j qj'||	|| t#|!d qt|"t(rtt(|"}*t)|*j*j+}+|+dkrʈ
rɈ
$  q|+dkr׈rֈ$  q|+dkrr$  q|+dkrt
j}d}q|+dkr|t
jkrt
j,}q|+dkr|t
jkrt
j,}q|+dkrp|t
j,krp|dus!J d|jdus.J d| |dus7J d-|},|,dusHJ d | |,j.-|ji }-|j/ra|j/D ]}.|j|-|.< qWn|j|-t0j< |-|,j.|j< d}q|r|d! d" |+kr|$  |r|d! d#  nj |+d$v rt
j}d}dq|+d%krd}q|+d&krd}q|+d'kr|}q|+d(krd}q|+d)v rd}q|+dkrd}q|rq|"\}/}0tt j1|/}/ttjtj2ttj3f  |0}0t)|/j+}1|1d'kr#t4|/d*}2|2r|5|1|2f |2 tdHt||/d+| t}3|du r|3}|j|3j|d |}q|1dkr3t4|/d,}4
5|4 q|1d&kr|du rTtt|dd}|j|j|d |du rT|}|dus[J t4|/d*}2|2rl|5|1|2f |2 tdHt||/d+| t}#|j|#j|#d ||j|#j |#}d}q|1d%kr|du rtt|dd}|j|j|d |du r|}|dusJ |du rtdHd-t|i| t}#|j|#j|#d ||j|#j |#}t4|/d*}2|2r|5|1|2f |2 tdHt||/d+| t}$|j|$j|$d ||j|$j |$}q|1d$v r7t
j}|0r|0-d.dnd}t4|/d*}2|2r1|5|1|2f |2 t4|/d/q|1d0kri|pC|pC|}5|5dusKJ t6t||/t4|/d1d2d3}6|j|6j|6d ||5j|6j q|1d4kr|pu|pu|}5|5dus}J t7t||/t4|/d,d2d5}7|j|7j|7d ||5j|7j q|1dkr5t4|/d6d2t4|/d7d2f q|1d(krt4|/d8d2}q|1d)v rd}q|1dkr|t
j,krt4|/d9d28 }8|8rd:d; |8D }qd}q|1d*krt4|/d*d2}2|2r|5|1|2f |2 q|1dkrt4|/d<}9|9dusJ d=|/ d>5|9 q|1dkrYt4|/d?t9}|dus,J t4|/d@d2 : }:t;||:dA|< t4|/dBd2};|;rUt<dC|; t=|;|  qt
j,}q|1dkr~|t
j,kr~|du rlt> }t4|/d/}<|<r}t?|< 8 |_/q|1dkr|t
j,krt
j}|du rt> }q|1dkr|t
j,krt
j}|du rt> }q|dusJ |}=|=d"krd}>t@jA||rd}>t@jB||rd}>t@jC||rd}>t@jD||rd}>t@jE||rd}>t@jF||rd}>G||rd}>t@jH||rd}>|
rtIjJ||rd}>|	r%tIjK||r%d}>|r2tIjL||r2d}>|r?tIjM||r?d}>|rLtIjN||rLd}>|rYtIjO||rYd}>|rftIjP||rfd}>|rstIjQ||rsd}>t@jR||r}d}>t@jS||rd}>|>sn	|=d#8 }=|=d"ksdDtjTt# ffdEdF}?g }@tU||jD ]4}A|jV|A tW ttr|@r|?|@ g }@q|X|Ad"krtt#rtt#})|@5|) q|@r|?|@ g }@|r%tU||jD ])}A|jV|A tW ttrtt}B|BjY}C|CjZdur|CZ||B|C q[|| ||fS )Ia5  
        Processes text or SSML

        Args:
            text: input text or SSML (ssml=True)
            lang: default language of input text
            ssml: True if input text is SSML
            pos: False if part of speech tagging should be disabled
            phonemize: False if phonemization should be disabled
            post_process: False if sentence/graph post-processing should be disabled
            add_speak_tag: True if <speak> should be automatically added to input text when ssml=True
            detect_numbers: True if numbers should be annotated in text (interpret_as="number")
            detect_currency: True if currency amounts should be annotated in text (interpret_as="currency")
            detect_dates: True if dates should be annotated in text (interpret_as="date")
            detect_times: True if clock times should be annotated in text (interpret_as="time")
            verbalize_numbers: True if annotated numbers should be expanded into words
            verbalize_currency: True if annotated currency amounts should be expanded into words
            verbalize_dates: True if annotated dates should be expanded into words
            verbalize_times: True if annotated clock times should be expanded into words

        Returns:
            graph, root: text graph and root node

        z<speak>z</speak>TextProcessor.processNc                   3   s    t  E d H  d S r[   )r+   r4   )root_elementr4   r8   iter_elements  s   z,TextProcessor.process.<locals>.iter_elementsc                   3   s     V  d S r[   r4   r4   rQ   r4   r8   r     s   
Fc                    sd   i }r
d |d<  |d< | t u r0rd \|d< |d< d ur&|d< r0tt|d< |S )NrM   rT   rI   interpret_asformatrolelexicon_ids)r    rr   reversed)target_classscope)current_langlookup_stacksay_as_stackvoice_stack	word_roler4   r8   scope_kwargs$  s   z+TextProcessor.process.<locals>.scope_kwargs	word_textr   rH   c                    sl    r4t tgD ]*} |}|d u rq	|j| }|d u r q	|d ur+||v r+ dS tj|v r3 dS q	dS )NTF)	itertoolschainDEFAULT_LEXICON_IDr   rj   r!   DEFAULT)r   r   inline_lexicon_idmaybe_lexiconmaybe_role_phonemes)inline_lexiconsr   r4   r8   in_inline_lexicon8  s    

z0TextProcessor.process.<locals>.in_inline_lexiconTrc   implicitdatarE   )rc   rQ   rR   
in_lexicon)word_phonemesr   r   rT   zsay-aslookuplexicongraphemephonemelexemez	No lexemezNo phoneme for lexeme: zNo lexicon idzNo lexicon for id rM   r   rN   >   r]   tokenspspeaksub>   metametadatarI   )rc   elementrv   rc   is_lastr   breaktimer,   )rc   r   r   mark)rc   r   rv   zinterpret-asr   aliasphc                 S      g | ]}t |qS r4   )r&   )r5   phoneme_strr4   r4   r8   
<listcomp>  s    z)TextProcessor.process.<locals>.<listcomp>refzLookup id required ()idalphabet)
lexicon_idr   uriz%Loading pronunciation lexicon from %srj   c                    s^  r- j}|jd ur-|dd | D }t| |D ]\}}||_|js,d| |_qr| D ]{}|jr7q1g }|jrB||j |	t
 |D ]7} |}|d u rUqI|j|j}|d u raqI||j}	|	d u rw|jtjkrw|tj}	|	d ur|	|_ nqI|jrq1 |j}
|
jd ur|
|j|j|_|js|
jd ur|
|j|j|_q1d S d S )Nc                 S   s   g | ]}|j qS r4   r   )r5   r   r4   r4   r8   r   L  s    zCTextProcessor.process.<locals>.process_sentence.<locals>.<listcomp>zgruut:)r"   rI   get_parts_of_speechziprG   r   rE   r   extendrf   r   r   rj   rQ   r!   r   lookup_phonemesguess_phonemes)rj   pos_settingspos_tagsr   pos_tagr   r   r   r   maybe_phonemesphonemize_settings)r   rc   r   rG   r<   r4   r8   process_sentenceG  s`   




z/TextProcessor.process.<locals>.process_sentencer[   r4   )\etree
fromstring	Exceptionr   	exceptionrh   ri   r   ra   DiGraphr   r   r/   r`   Optionalboolre   IN_LEXICON_GRAPHEMEstripr   IN_LEXICON_PHONEMEr&   rE   r   rk   add_noderc   r   add_edger   IN_WORDr"   rx   endswithrz   r    popr{   _is_word_in_lexicon_pipeline_tokenizer   r*   r   tag
IN_LEXICONr   rj   rolesr!   ElementDictAnyr#   rf   r   r   splitr   lowerr   r   r%   r   setr'   _split_replacements_split_punctuations_split_minor_breaks_split_abbreviations_split_initialism_split_major_breaks_break_sentences_split_spell_outr(   _transform_date_transform_currency_transform_number_transform_time_verbalize_date_verbalize_time_verbalize_number_verbalize_currency_break_words_split_ignore_non_wordsListrb   rd   r	   rg   rI   post_process_sentencepost_process_graph)Dr<   rQ   rI   r   rG   r   r   r   r   r   r   r   r   r   r   r   r   er   r?   last_paragraphr   
last_speakr@   parsing_state
lang_stackr   r   is_last_word
last_aliasskip_elementsr   r   r   elem_or_textp_nodes_noder   r3   word_kwargsword_text_normr   end_elemend_tagr   role_phonemesr   elemelem_metadataelem_tag
maybe_lang
speak_node
voice_namelast_targetr   r   word_phonemes_strs	lookup_idlexicon_alphabetlexicon_urirole_strnum_passes_leftwas_changedr   sentence_wordsr   	sent_nodesent_settingsr4   )r   r   r   rc   r   rG   r   r   r<   rQ   r   r   r8   r     sX  +
	












	

























































 [B
r   c                 C   s   dS )z,User-defined post-processing of entire graphNr4   )r<   r?   r@   r4   r4   r8   r$    s   z TextProcessor.post_process_graphc                    s  d}t t||D ]}t|tsq	tt|}|jtjkrq	t	t
||j}|j| t }|g}t|tsQt	t
||}|j| t }|| t|tr7t|dksYJ |d }	t|	tsdJ |	jshq	|d }
t ||	j}||	j|
jf}||d d }|sq	| ||	t}|dusJ |j|	jf}t ||j}||}|| tt|dd |j j d	 ||d |j jf || || | fd
d|D  d}q	|S )zABreak sentences apart at BreakWordNode(break_type="major") nodes.F   rM   rN   NTr   r   c                    s   g | ]	\}} j |fqS r4   )rc   )r5   uv
new_s_noder4   r8   r     s    z2TextProcessor._break_sentences.<locals>.<listcomp>)rr   r$   re   r   rh   ri   rl   r   rn   nextiterpredecessorsrc   rd   r	   r   rf   rk   r   rs   rt   rp   r   remove_edges_fromr   insertadd_edges_from)r<   r?   r@   rB  	leaf_noder   parent_nodeparents_pathr/  below_s_nodes_edgesr   edges_to_mover.  p_s_edgep_edges
s_edge_idxr4   rJ  r8   r    sP   







zTextProcessor._break_sentencesrc   c                 c   s   t |tsdS tt|}|js|js|jsdS | |j}|j	du r&dS |j	
|j}t|dk r5dS ||j\}}t|d }t|D ]8\}	}
||
}|sSqG|jrl|	dkr^||
 }
|	|krg|
|7 }
n|
|j7 }
t||
d|j|j| ||ddfV  qGdS )z2Break apart words according to work breaks patternNrF  rN   r   T)rQ   rR   r   rI   rT   r   is_from_broken_word)re   r    rh   ri   r   r   r   r"   rI   word_breaks_patternr  rQ   rk   get_whitespacerR   	enumerater{   rx   rz   rT   r  )r<   r?   rc   r   r3   partsfirst_wslast_wslast_part_idxpart_idx	part_textpart_text_normr4   r4   r8   r     sD   





zTextProcessor._break_wordsc              
   c   s.   t |tsd S tt|}|js|jrd S | |j}|jd u r(|j	d u r(d S |j
}||j\}}d}|jd urttd |jj|dd}	d}
|rt|	dkr|	\}}|
r]|| }d}
||}d}t||d|j|jdfV  ttd |jj|dd}	|rt|	dksQg }|j	d urttd |j	j|dd}	|rt|	dkr|	\}}d}|| ttd |j	j|dd}	|rt|	dks|sd S |jr|s|| }||}|rt||d|j|j| ||dfV  t|d }tt|D ] \}}|jr||kr||7 }t| |d|j|jdfV  qd S )NFrN   maxsplitTrF  )rQ   rR   r   rI   rT   rQ   rR   r   rI   rT   r   )re   r    rh   ri   r   r   r"   rI   begin_punctuations_patternend_punctuations_patternrQ   r^  rR   rr   filterr  rk   r{   r   rT   rf   rx   r  r_  r   r   )r<   r?   rc   r   r3   r   ra  rb  has_punctuationr`  
first_word
punct_textpunct_text_normend_punctuationsr1  last_punct_idx	punct_idxr4   r4   r8   r  1  s   











	z!TextProcessor._split_punctuationsc           	   
   c   s    t |tsd S tt|}|js|jrd S | |j}|jd u r#d S |j	|j
}t|dk r2d S |d }|d }| rV||}t||d|j|j| ||dfV  n|| }ttj|||d|j|jdfV  d S )NrF  r   rN   Tri  rl   rQ   rR   r   rI   rT   )re   r    rh   ri   r   r   r"   rI   major_breaks_patternr  rR   rk   r   r{   rT   r  r   r   rn   )	r<   r?   rc   r   r3   r`  	word_part
break_partword_part_normr4   r4   r8   r    sB   




z!TextProcessor._split_major_breaksc           	   
   c   s    t |tsd S tt|}|js|jrd S | |j}|jd u r#d S |j	|j
}t|dk r2d S |d }| rQ||}t||d|j|j| ||dfV  |d }ttj|||d|j|jdfV  d S )NrF  r   Tri  rN   rt  )re   r    rh   ri   r   r   r"   rI   minor_breaks_patternr  rR   rk   r   r{   rT   r  r   r   rm   )	r<   r?   rc   r   r3   r`  rv  rx  rw  r4   r4   r8   r    s@   




	z!TextProcessor._split_minor_breaksc                 G   sl   g }| |jD ]}|j| t }t||r|  S || q|D ]}| |||}|dur3|  S q"dS )zDTries to find a node whose type is in classes in the tree above nodeN)rN  rc   rd   r	   re   rf   rp   )r<   r?   rc   classesparentsrS  rT  matchr4   r4   r8   rp     s   
zTextProcessor._find_parentrl   c                 C   s,   |t jkr
tjjgS |t jkrtjjgS d S r[   )r   rn   r   BREAK_MAJORvaluerm   BREAK_MINOR)r<   rl   rI   r4   r4   r8   ro     s
   



z!TextProcessor._phonemes_for_breakr   r   c              	   C   s  |du ri }| j }|dur|d|}| |}|dus#J d| |jdur-||}||D ]R}	||	}
|
s<q2|jsA|
}	|}|rOi |d| i}d}|dur]||
|d}|se| |
|}t	d	t
||
|	d|d|}|j|j|d ||j|j q2dS )
zSplits text into word nodesNrI   zNo settings for rE   r   T)rc   rQ   rR   r   r   r   r4   )r/   r   r"   pre_process_textsplit_wordsr{   rx   r  r  r    rk   r   rc   r  )r<   r?   rS  rQ   r   r   r   rI   r3   r   r1  r0  r   r   r4   r4   r8   r    sJ   




z TextProcessor._pipeline_tokenizec                 c   s    t |tsdS tt|}|jtjkrdS | |j}|	|j
\}}t|jd }t|jD ]H\}}	|j|	}
tj}|
du rM|	 rK|	}
tj}n|	}
|
sPq0|jri|dkr[||
 }
||krd|
|7 }
n|
|j7 }
t||
|
d|j|dfV  q0dS )z$Expand spell-out (a-1 -> a dash one)NrN   r   TrQ   rR   r   rI   r   )re   r    rh   ri   r   r   	SPELL_OUTr"   rI   r^  rR   rk   rQ   r_  spell_out_wordsr   r!   r   isalphaLETTERrx   rz   r{   )r<   r?   rc   r   r3   ra  rb  last_char_idxicr   r   r4   r4   r8   r  ]  sB   


zTextProcessor._split_spell_outc              
   c   s    t |tsdS tt|}|js|jrdS | |j}|js!dS d}|j	}|jD ]\}}t |t
s4J |||\}}	|	dkrBd}q)|rj||D ]!}
||
}|jsV|}
|sYqJt||
d|j| ||dfV  qJdS dS )z"Do regex replacements on word textNFr   TrQ   rR   r   rI   r   )re   r    rh   ri   r   r   r"   rI   replacementsrR   r   subnr  r{   rx   r  )r<   r?   rc   r   r3   matchednew_textpatterntemplatenum_subsre  rf  r4   r4   r8   r    sB   


z!TextProcessor._split_replacementsc              
   c   s    t |tsdS tt|}|js|jrdS | |j}|js!dS d}|j	 D ]\}}t |t
s5J |||j}|durF||} nq(|durp||D ]!}	||	}
|
sZqP|js_|
}	t|
|	d|j| |
|dfV  qPdS dS )zExpand abbreviationsNTr  )re   r    rh   ri   r   r   r"   rI   abbreviationsr;   r   r|  rR   expandr  r{   rx   r  )r<   r?   rc   r   r3   r  r  r  r|  re  rf  r4   r4   r8   r    sB   



z"TextProcessor._split_abbreviationsc                 c   s    t |tsdS tt|}|js|jst|jdk rdS | |j	}|j
du s-|jdu r/dS |
|js7dS ||j\}}||j}t|d }t|D ]>\}	}
||
}|s[qO|jr|	dkrf||
 }
d|	  krp|k rxn n|
|j7 }
n|	|kr|
|7 }
t||
d|j	tjdfV  qOdS )zSplit apart ABC or A.B.C.NrF  rN   r   Tr  )re   r    rh   ri   r   r   rk   rQ   r"   rI   is_initialismsplit_initialismr^  rR   r_  r{   rx   rz   r!   r  )r<   r?   rc   r   r3   ra  rb  r`  rc  rd  re  rf  r4   r4   r8   r    sB   

zTextProcessor._split_initialismc                 c   sd    t |tsdS tt|}|js|jrdS | |j}|jdu r#dS ||j	r0t
i fV  dS dS )zMark non-words as ignoredN)re   r    rh   ri   r   r   r"   rI   is_non_wordrQ   r   )r<   r?   rc   r   r3   r4   r4   r8   r!    s   

z%TextProcessor._split_ignore_non_wordsc                 C   s,  t |tsdS tt|}|jr|jr|jtjkrdS | |j	}|j
s&J |jd urD||j}|d urDtj|_tj|_t||_dS zDtjj|j|j
d}| sWtdtj|_|jsbtj|_||_d|  k rodk rn W dS td|jd urtj|_W dS W dS W dS  ty   d|_Y dS w )NFlocalezNot parsing nan or infi  i  z^\d+$T)re   r    rh   ri   is_maybe_numberr   r   NUMBERr"   rI   babel_localeget_ordinalrQ   r   NUMBER_ORDINALr   r   numberbabelnumbersparse_decimal	is_finite
ValueErrorNUMBER_CARDINALrer|  NUMBER_YEAR)r<   r?   rc   r   r3   ordinal_numr  r4   r4   r8   r  -  sP   




zTextProcessor._transform_numberc           
   	   C   s@  t |tsdS tt|}|jr|jr|jtjkrdS | |j	}|jd ur1||j
s1d|_dS |js6J d}|jD ]3}|j
|rn|j
t|d  }ztjj||jd}tj|_||_||_d}W  n
 tym   Y q;w q;|s|jtjkr|j}	|	rztjj|j
|jd}tj|_|	|_||_W dS  ty   Y dS w dS )NFr  T)re   r    rh   ri   is_maybe_currencyr   r   CURRENCYr"   rI   rQ   r  currency_symbols
startswithrk   r  r  r  currency_symbolr  r  default_currencycurrency_name)
r<   r?   rc   r   r3   parsedr  num_strr  r  r4   r4   r8   r  b  s`   




z!TextProcessor._transform_currencyc                 C   s"  t |tsdS tt|}|jr|jr|jtjkrdS | |j	}z]|jd ur3||j
s3d|_W dS |js8J ddi|jgd}tj|j
fi |}|d urYtj|_||_W dS |jtjkryd|d d< tj|j
fi |}|d ur|||_W dS W dS W dS  ty   td d|_Y dS w )NFSTRICT_PARSINGT)r3   	languagesr3   transform_date)re   r    rh   ri   is_maybe_dater   r   DATEr"   rI   rQ   dateparser_lang
dateparserparsedater   r   r   )r<   r?   rc   r   r3   dateparser_kwargsr  r4   r4   r8   r    sN   

	
zTextProcessor._transform_datec                 C   s   t |tsdS tt|}|jr|jr|jtjkrdS | |j	}|j
d u r(dS z(|jd ur:||js:d|_W dS |
|j}|d urNtj|_||_W dS W dS  tyb   td d|_Y dS w )NFtransform_timeT)re   r    rh   ri   is_maybe_timer   r   TIMEr"   rI   
parse_timerQ   r   r   r   r   )r<   r?   rc   r   r3   r   r4   r4   r8   r    s:   



zTextProcessor._transform_timer   c                 C   s    |j du rdS t|j |ddS )zTrue if word is in the lexiconNF)do_transforms)r   r   )r<   r   r3   r4   r4   r8   r    s   
z!TextProcessor._is_word_in_lexiconc              	   C   s  t |tsdS tt|}|jtjks|jdu rdS | |j	}|j
dur-|
|js-dS |js2J d|ji}|jg}|jtjkrFd|d< n,|jtjkrQd|d< n!|jtjkr\d|d< n|jtjkrrd|d< dd t|j D }|D ]q}|d	 d
k}|rt|}	nt|}	z
t|	fi |}
W n ty   td|j|j	 Y  dS w ||j\}}||
 | }
||
D ]+}||}|sq|js|}tt |d|j	||d}|j!|j"|d |#|j"|j" qqtdS )zSplit numbers into wordsNrI   cardinaltoordinalyearc                 S   r   r4   r   )r5   dr4   r4   r8   r     s    z3TextProcessor._verbalize_number.<locals>.<listcomp>rN   r   z4Failed to convert number %s to words for language %sTrc   r   rI   rQ   rR   r   )$re   r    rh   ri   r   r   r  r  r"   rI   r  rQ   num2words_langr   r   r  r  r  NUMBER_DIGITSr`   to_integral_valuefloatintr   NotImplementedErrorr   r   r^  rR   r  r{   rx   rk   r   rc   r  )r<   r?   rc   r   r3   num2words_kwargsdecimal_numsdecimal_numnum_has_frac	final_numr  ra  rb  number_word_textnumber_word_text_normnumber_wordr4   r4   r8   r    sp   






	
zTextProcessor._verbalize_numberc                 C   s`  t |tsdS tt|}|jtjks|jdu rdS | |j	}|j
s%J |js*J |j}|jp2|j}d|vrI|  }|jdd |D }n|}d}d}	d}
d}zqd|v s\d|v rftjj|d|j
d	}
d
|ji}d|v ssd|v rd|d< t|jfi |}d|v sd|v rd|d< t|jfi |}	d|v sd|v rzd|d< t|jfi |}W n ty   d|d< t|jfi |}Y nw W n ty   td|j|j	 Y dS w |jdi |
|
|||	|	||d}||j\}}|| | }||D ]1}||}|sq|j s|}|sqtt!|d|j	||d}|j"|j#|d |$|j#|j# qdS )zSplit dates into wordsN{c                 s   s    | ]	}d | dV  qdS )r  }Nr4   )r5   r  r4   r4   r8   r^   [  s    z0TextProcessor._verbalize_date.<locals>.<genexpr>r,   z{M}z{m}MMMMr  rI   z{D}z{d}r  r  z{O}z{o}r  z{Y}z{y}r  z(Failed to format date %s for language %s)MmDr  OoYyTr  r   r4   )%re   r    rh   ri   r   r   r  r  r"   rI   r  r  r   default_date_formatr   upperrz   ry   r  datesformat_dater   dayr  r   r   r   rQ   r^  rR   r  r{   rx   rk   r   rc   r  )r<   r?   rc   r   r3   r  date_formatdate_format_strday_card_strday_ord_str	month_stryear_strr  date_strra  rb  date_word_textdate_word_text_norm	date_wordr4   r4   r8   r  F  s   





zTextProcessor._verbalize_datec                 C   sl  t |tsdS tt|}|jtjks|jdu rdS | |j	}|j
du r'dS ||j\}}t|
|j}t|d }t|D ]r\}	}
|	dkrM||
 }
|	|krV|
|7 }
n|
|j7 }
||
}|scqA|jsh|}
|
skqAtt|d|j	||
d}|j|j|d ||j|j | || | ||D ]\}}|ddt|i|}|j|j|d ||j|j qqAdS )	zSplit times into wordsNrN   r   Tr  r   rc   r4   )re   r    rh   ri   r   r   r  r   r"   rI   verbalize_timer^  rR   rr   rk   r_  rz   r{   rx   r   rc   r  r  r  )r<   r?   rc   r   r3   ra  rb  
time_wordslast_idxword_idxtime_word_texttime_word_text_norm	time_word
node_classnode_kwargsnew_noder4   r4   r8   r    sP   




zTextProcessor._verbalize_timec                 C   s  t |tsdS tt|}|jtjks"|jdu r|jdu s"|j	du r$dS | 
|j}|js/J |j	}|d dk}|jdd}|jsU|j}|jrR|j|jpNd|j}||_|j|d< d|d< ztt|fi |}	W n ty|   td	||j Y dS w |r|	dd}	n	|	jddd
d }	||j\}
}|
|	 | }	||	D ]+}||}|sq|js|}tt|d|j||d}|j|j|d ||j|j qdS )z!Split currency amounts into wordsNrN   r   currency)rI   r  r,   |	separatorz/Failed to verbalize currency %s for language %srg  Tr  r   ) re   r    rh   ri   r   r   r  r  r  r  r"   rI   r  r  
currenciesr   r   r  r   r   r   replacer  r^  rR   r  r{   rx   rk   r   rc   r  )r<   r?   rc   r   r3   r  r  r  r  r  ra  rb  currency_word_textcurrency_word_text_normcurrency_wordr4   r4   r8   r    sd   




z!TextProcessor._verbalize_currency)r.   r,   NNN)TTTTTTTr[   )NFTTTTTTTTTTTTr   )NNN)5__name__
__module____qualname____doc__r`   rh   r   r  Unionr   IterableMutableMappingr   r>   r   r   r   r   r   r   rj   r"   r   r  Tupler   r$  r  r   r  r  r  rp   r   r
   ro   r"  Callabler  r  r  r  r  r!  r  r  r  r  r  r  r  r  r  r4   r4   r4   r8   r-   D   s:   
"	

  %(	

     wM0p.*

C1.+059.%
Ke9r-   )>r  r   loggingr  rh   xml.etree.ElementTreer   ElementTreedecimalr   pathlibr   r  babel.numbersr  networkxra   	gruut_ipar   r   gruut.constr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   
gruut.langr"   gruut.utilsr#   r$   r%   r&   r'   r(   r)   r*   r+   	getLoggerr   r   r-   r4   r4   r4   r8   <module>   s(   l,
