o
    Xεiu                     @   sv   d Z ddlZddlZddlmZmZmZmZ ddlm	Z	 ddddZ
G dd dejZd	d
 Zedkr9e  dS dS )Tests for TextProcessor    N)SentenceTextProcessorTextProcessorSettingsWordprint_graphF)explicit_langphonemesposc                   @   s0  e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Zd8d9 Zd:d; Z d<d= Z!d>d? Z"d@dA Z#dBdC Z$dDdE Z%dFdG Z&dHdI Z'dJS )KTextProcessorTestCaser   c                 C   sr   t  }|d\}}t|j||fi t}| |tdddddtdddddtd	dd
ddtdddddg dS )zText whitespace preservationThis is  a   test    r   ThisThis idxsent_idxtexttext_with_ws   iszis     aza      testztest    Nr   listwordsWORDS_KWARGSassertEqualr   self	processorgraphrootr    r%   M/home/ubuntu/.local/lib/python3.10/site-packages/tests/test_text_processor.pytest_whitespace   s   z%TextProcessorTestCase.test_whitespacec                 C   sv   t dd}|d\}}t|j||fi t}| |tdddddtdddddtd	dd
d
dtdddddg dS )z)Test disabling of whitespace preservationF)keep_whitespacer   r   r   r   r   r   r   r   r   r   Nr   r    r%   r%   r&   test_no_whitespace    s   
z(TextProcessorTestCase.test_no_whitespacec                 C   s   t ddhddhdhdhd}|d\}}t|j||fi t}| |tddd	d
dtddddddtdddddtddddddtddddddtdddddtddddddtdddddtddddddtddddddg
 dS )z/Test splitting of punctuation from around words"   «   »,.)begin_punctuationsend_punctuationsminor_breaksmajor_breaksu   This «is»,  a "test".r   r   r   r   r   Tr   r   r   r   is_punctuationr   r   r      z,  r   r   r   r   is_minor_break   r   a       r      	   r   r   r   r   is_major_breakNr   r    r%   r%   r&   test_punctuation1   sD   





z&TextProcessorTestCase.test_punctuationc                 C   s   t dhdhdhd}|d\}}t|j||fi t}| |tdddddtd	dddd
dtdddddtddddd
dtddddd
dtdd	dddtd	d	dddtdd	ddd
dg dS )z Test break inside of punctuationr*   r.   )r/   r0   r2   zTest "one." Test two.r   TestzTest r   r   Tr3   r   oner   r>   r5   z" twoNr   r    r%   r%   r&   !test_punctuation_with_inner_breakY   s2   



z7TextProcessorTestCase.test_punctuation_with_inner_breakc                 C   s   t dhdhg dd}|d\}}t|j||fi t}| |tdddddtd	ddd
ddtdddddtdddddtdddddtdddddtddddddg dS )z+Test regex replacements during tokenizationr-   r.   ))z\B'r*   )z'\Br*   )z[\<\>\(\)\[\]"]+ )r1   r2   replacementsz#"This," [is] <a> (test) 'sentence.'r   r   r   r   , Tr6   r   r   is r   r   r9   r5   r   test r8   sentencer:   r>   Nr   r    r%   r%   r&   test_replacementsz   s,   	

z'TextProcessorTestCase.test_replacementsc                 C   s  t dhddhddddd}|d	\}}t|j||fi t}| |td
d
dddtdd
ddddtd
ddddtdddddtdddddtdddddtddddddtdddddtd dd!d"dtd#dd$d%dtd&dd'd(dtd)dd*d*dtd+dddddg d,S )-z8Test expansion of abbreviations (with case preservation)r-   r.   ?z\1octorz\1isterz\1treet)z
^([dD])r\.z
^([mM])r\.z
^([sS])t\.)r1   r2   abbreviationsz*Mr.? I'm just a dr., on this St. at least.r   Misterr   r   z? Tr>   zI'mzI'm justzjust r   r   r9   r   doctorr5   rG   r6   r8   onzon r:   thiszthis r;   StreetzStreet r<   atzat r=   least
   Nr   r    r%   r%   r&   test_abbreviations   sB   	


z(TextProcessorTestCase.test_abbreviationsc                 C   s6  t ddhd}|d\}}t|j||fi t}| |tdddddtd	dd
d
dtddddddtdd	dddtd	d	d
d
dtdd	ddddg t|j||fi t}| |tddddtdddddtd	dd
d
dtddddddgdtd	dddtdd	dddtd	d	d
d
dtdd	ddddgdg dS )zTest sentence breakr.   !r2   z"First  sentence. Second sentence! r   FirstzFirst  r   r   rJ   r   . Tr>   SecondSecond z! zFirst sentence.zFirst  sentence. zFirst sentence)r   r   r   text_spokenr   zSecond sentence!zSecond sentence! zSecond sentenceN)r   r   r   r   r   r   	sentencesr   )r!   r"   r#   r$   r   r_   r%   r%   r&   test_multiple_sentences   st   

z-TextProcessorTestCase.test_multiple_sentencesc                 C   s~   t  }|ddd\}}t|j||fi t}| |tddddddtdddd	d	dtdddd
ddtdddd	d	dg dS )zTest paragraph indexz<<speak><p>First paragraph</p><p>Second paragraph</p></speak>Tssmlr   rZ   First )r   r   par_idxr   r   r   	paragraphr\   r]   Nr   r    r%   r%   r&   test_multiple_paragraphs  s6   

z.TextProcessorTestCase.test_multiple_paragraphsc                 C   s   t ddhd}|ddd\}}t|j||fi t}| |tdddd	d
tddddd
tddddddtddddd
tddddd
tddddddg dS )z,Test <s> in SSML for avoiding sentence breakr.   rX   rY   z'<s>First sentence. Second sentence!</s>Tra   r   rZ   rc   r   r   rJ   r   r[   r>   r   r\   r]   r5   r8   Nr   r    r%   r%   r&   test_explicit_sentence(  s"   

z,TextProcessorTestCase.test_explicit_sentencec                 C   s   t dhd}|d\}}t|j||fi t}| |tdddddtddddd	d
tdddddtdddddtdddddg dS )zTest minor (phrase) breakr-   )r1   zthis, is a testr   rR   r   r   rG   Tr6   r   r   rH   r   r   r9   r5   r   Nr   r    r%   r%   r&   test_minor_breaks?  s   
z'TextProcessorTestCase.test_minor_breaksc              
   C   s\   t dhd}|d\}}t|j||fi t}| |tdddddtddd	d	dg d
S )zTest inner-word break-)word_breakszninety-niner   ninetyninety r   r   nineNr   r    r%   r%   r&   test_word_breaksS  s   z&TextProcessorTestCase.test_word_breaksc                 C   s   t dd}|ddd\}}t|j||fi t}t|| | |tddddd	td
dddd	tddddd	tddddd	tddddd	tddddd	tddddd	g dS )z%Test interpret-as="spell-out" in SSMLen_USdefault_langz1<say-as interpret-as="spell-out">test123</say-as>Tra   r   tzt r   r   eze r   szs r   r5   rB   one r8   rC   two r:   threeN)r   r   r   r   r   r   r   r    r%   r%   r&   test_spell_outb  s"   


z$TextProcessorTestCase.test_spell_outc                 C   s   t dhdd td}|d\}}t|j||fi t}| |tddddd	td
dddd	tddddd	tddddddg dS )zTest initialism spell outr.   c                 S   s   |   o|  S N)isalphaisupper)rt   r%   r%   r&   <lambda>~      z8TextProcessorTestCase.test_initialisms.<locals>.<lambda>)r2   is_initialismsplit_initialismzTTS.r   TzT r   r   r   Sr   Tr>   Nr   r    r%   r%   r&   test_initialismsz  s"   
z&TextProcessorTestCase.test_initialismsc                 C   sh   t dd}|d\}}t|j||fi t}| |tdddddtddd	d
dtdddddg dS )z+Test number verbalization (single language)ro   rp   z1 2 3r   rB   ru   r   r   rC   rv   r   rw   Nr   r    r%   r%   r&   test_numbers_one_language  s   
z/TextProcessorTestCase.test_numbers_one_languagec                 C   sn   t dd}|ddd\}}t|j||dd}| |tdddd	d
dtddddddtddddddg dS )z4Test number verbalization (SSML, multiple languages)ro   rp   z-1 <w lang="es_ES">2</w> <w lang="de_DE">3</w>Tra   F)r
   r   rB   ru   langr   r   r   r   es_ESr   doszdos de_DEr   dreiNr   r   r   r   r   r    r%   r%   r&   test_numbers_multiple_languages  s   

z5TextProcessorTestCase.test_numbers_multiple_languagesc                 C   s\   t dd}|d\}}t|j||ddd}| |tdddddd	tdd
dddd	g dS )z-Test currency verbalization (single language)ro   rp   z$10Fr
   r   r   tenten r   r   dollarsNr   r    r%   r%   r&   test_currency_one_language  s   
z0TextProcessorTestCase.test_currency_one_languagec                 C   s   t dd}|dddd\}}t|j||ddd}| |tdddd	d
dtddddddtddddddtddddddtddddddtddddddg dS )z6Test currency verbalization (SSML, multiple languages)ro   rp   u9   €10 <w lang="fr_FR">€10</w> <w lang="nl_NL">€10</w>TFrb   	phonemizer   r   r   r   r   r   eurozeuro fr_FRr   dixdix r   euroszeuros nl_NLr5   tienztien r8   Nr   r    r%   r%   r&   test_currency_multiple_language  s.   




z5TextProcessorTestCase.test_currency_multiple_languagec                 C   sb   t ddd}|ddd\}}t|j||ddd}| |tdd	d	d
ddtddd	dddg dS )zKTest default currency use when no currency symbol (interpret-as="currency")ro   USD)rq   default_currencyz+<say-as interpret-as="currency">10</say-as>Tra   Fr   r   r   r   r   r   r   Nr   r    r%   r%   r&   test_currency_default  s"   
z+TextProcessorTestCase.test_currency_defaultc                 C   s   t dd}|d\}}t|j||ddd}| |tdddddd	tdd
dddd	tdddddd	tdddddd	tdddddd	g dS )z!Test time verbalization (English)ro   rp   z  4:01pmFr   r   fourz  four r   r   ohzoh r   rB   ru   r   PzP r5   MNr   r    r%   r%   r&   	test_time  s   

zTextProcessorTestCase.test_timec                 C   sl   t dd}|d\}}t|j||ddd}| |tdddddd	tdd
dddd	tdddddd	g dS )z1Test time verbalization without a colon (English)ro   rp   10amFr   r   r   r   r   r   AA r   r   Nr   r    r%   r%   r&   test_time_no_colon  s   
z(TextProcessorTestCase.test_time_no_colonc                 C   s   t ddhd}|d\}}t|j||ddd}| |tddddd	d
tdddddd
tdddddddtdddddd
tdddddd
tdddddd
g dS )z)Test date verbalization (single language)ro   ri   rq   rj   z4/1/1999Fr   r   AprilApril r   r   firstr   r-   rG   Tr   r   r   r   r   r7   r   nineteen	nineteen r5   rk   rl   r8   rm   Nr   r    r%   r%   r&   test_date_one_language   sH   

z,TextProcessorTestCase.test_date_one_languagec                 C   sh  t ddhd}|dddd\}}t|j||ddd}| |tdd	d	d
ddtddd	dddtddd	ddddtddd	dddtddd	dddtddd	dddtdd	ddddtddddddtdddd d!dtdddd"d#dtdddd$d%dtdddd&d'dtdd(dd)d*dtdd+dd,d-dtdd.dd"d"dtd/d	dd0d1dtd/ddd
ddtd/ddd2d2dg d3S )4z2Test date verbalization (SSML, multiple languages)ro   ri   r   zY<speak><s>4/1/1999</s> <s lang="fr_FR">4/1/1999</s><s lang="de_DE">01.04.1999</s></speak>TFr   r   r   r   r   r   r   r   r   r-   rG   r   r   r   r   r5   rk   rl   r8   rm   r   u
   quatrièmeu   quatrième janvierzjanvier millezmille neufzneuf centzcent quatrezquatre r:   vingtzvingt r;   r   r   r<   r   erstezerste neunzehnhundertneunundneunzigNr   r    r%   r%   r&   test_date_multiple_languagesJ  s   






z2TextProcessorTestCase.test_date_multiple_languagesc                 C   s`   t dd}|ddd\}}t|j||ddd}| |tdddd	d
dtddddddg dS )z"Test date format in SSML (ordinal)ro   rp   z4<say-as interpret-as="date" format="md">4/1</say-as>Tra   Fr   r   r   r   r   r   rB   Nr   r    r%   r%   r&   test_date_format_ordinal  s   


z.TextProcessorTestCase.test_date_format_ordinalc                 C   s   t dd}|ddd\}}t|j||ddd}| |tdddd	d
dtddddddtddddddtddddddg dS )z#Test date format in SSML (cardinal)ro   rp   z:<say-as interpret-as="date" format="dmy">4/1/2000</say-as>Tra   Fr   r   rB   ru   r   r   r   r   r   rC   rv   r   thousandNr   r    r%   r%   r&   test_date_format_cardinal  s*   


z/TextProcessorTestCase.test_date_format_cardinalc                 C   sd   dd }t |d}|d\}}t|j||ddd}| |tdddd	d
dtddddddg dS )zTest part-of-speech taggingc                 _   s   dd | D S )Nc                 S   s   g | ]}|  qS r%   )upper).0wr%   r%   r&   
<listcomp>  s    zbTextProcessorTestCase.test_part_of_speech_tagging.<locals>.get_parts_of_speech.<locals>.<listcomp>r%   )r   argskwargsr%   r%   r&   get_parts_of_speech  s   zNTextProcessorTestCase.test_part_of_speech_tagging.<locals>.get_parts_of_speech)r   za testF)r	   r
   r   r   r9   r   )r   r   r   r   r   r   r   TESTNr   )r!   r   r"   r#   r$   r   r%   r%   r&   test_part_of_speech_tagging  s   z1TextProcessorTestCase.test_part_of_speech_taggingc              
   C   s^   dt fdd}t|d}|d\}}t|j||ddd}| |tddddg d	d
g dS )z!Test phonemizer (single language)wordc                 _      t | S ry   r   r   r   r   r%   r%   r&   lookup_phonemes     zJTextProcessorTestCase.test_phonemize_one_language.<locals>.lookup_phonemesr   r   Fr   r	   r   rr   rs   rt   rr   r   r   r   r   r
   N)strr   r   r   r   r   r!   r   r"   r#   r$   r   r%   r%   r&   test_phonemize_one_language  s"   z1TextProcessorTestCase.test_phonemize_one_languagec                 C   st   ddd}t |d}|dddd\}}t|j||ddd	}| |td
d
ddg ddtdd
ddg ddg dS )z+Test phonemizer (SSML, multiple word roles)Nc                 [   s   |st | S t |  S ry   r   r   )r   roler   r%   r%   r&   r     s   zYTextProcessorTestCase.test_phonemize_one_language_multiple_roles.<locals>.lookup_phonemesr   z0<speak>test <w role="some_role">test</w></speak>TF)rb   r   r   r   r   rI   r   r   r   r   Er   r   ry   r   r   r%   r%   r&   *test_phonemize_one_language_multiple_roles  s4   

z@TextProcessorTestCase.test_phonemize_one_language_multiple_rolesc                 C   s   dt fdd}dt fdd}td|dtd|did	}|d
dd\}}t|||}| |tdddddg ddtdddddg ddg dS )z*Test phonemizer (SSML, multiple languages)r   c                 _   r   ry   r   r   r%   r%   r&   en_lookup_phonemes;  r   zSTextProcessorTestCase.test_phonemize_multiple_languages.<locals>.en_lookup_phonemesc                 _   s   t |  S ry   r   r   r%   r%   r&   de_lookup_phonemes>  s   zSTextProcessorTestCase.test_phonemize_multiple_languages.<locals>.de_lookup_phonemesro   r   )r   r   )rq   r   settingsz,<speak>test <w lang="de_DE">test</w></speak>Tra   r   r   rI   r   )r   r   r   r   r   r
   r   r   N)r   r   r   r   r   r   r   )r!   r   r   r"   r#   r$   r   r%   r%   r&   !test_phonemize_multiple_languages8  sD   	
z7TextProcessorTestCase.test_phonemize_multiple_languagesc                 C   sz   t dd}|ddd\}}t|j||fi t}| |tddddd	td
dddd	tddddd	tddddd	g dS )zTest SSML substitutionro   rp   z?<speak><sub alias="World Wide Web Consortium">W3C</sub></speak>Tra   r   WorldzWorld r   r   WidezWide r   WebzWeb r   
ConsortiumNr   r    r%   r%   r&   test_subf  s   

zTextProcessorTestCase.test_subc                 C   sx   t ddd}|ddd\}}t|j||fi t}| |tddddd	d
tddddd	ddtdddddgdg dS )zTest SSML break tagro   Frq   r(   a2  
        <speak>
          <break time="1s"/>
          <p>
            <break time="2s" />
            <s>
              <break time="3s" />
              Break <break time="4s" /> here
            </s>
            <break time="5s" />
          </p>
          <break time="6s" />
        </speak>
        Tra   r   z
Break herei  i*  Breaki  )r   r   r   r   pause_before_mspause_after_msr   herer   )r   r   r   r^   r   r   r   Nr   r   r_   r   r   r   r   r!   r"   r#   r$   r_   r%   r%   r&   
test_breaky  s8   
z TextProcessorTestCase.test_breakc                 C   s   t ddd}|ddd\}}t|j||fi t}| |tddddd	d
gddgtdddddgdgdtdddddgdg dS )zTest SSML mark tagro   Fr   a%  
        <speak>
          <mark name="a"/>
          <p>
            <mark name="b" />
            <s>
              <mark name="c" />
              Mark <mark name="d" /> here
            </s>
            <mark name="e" />
          </p>
          <mark name="f" />
        </speak>
        Tra   r   z	Mark herer   brs   fMarkcd)r   r   r   r   marks_beforemarks_afterr   r   r   )r   r   r   r^   r   r   r   Nr   r   r%   r%   r&   	test_mark  s8   
zTextProcessorTestCase.test_markc              
   C   sZ   t  }|ddd\}}t|j||fi t}| |tdddddtdddddg d	S )
z"Test SSML with missing <speak> tagz<s>hello</s><s>world</s>Tra   r   hellor   r   worldNr   r    r%   r%   r&   test_missing_speak  s   z(TextProcessorTestCase.test_missing_speakc                 C   sb   t  }|ddd\}}t|j||fddit}| |tdddddd	tdd
dddd	g dS )z$Test SSML with adjacent <voice> tagsz<<voice name="a">hello.</voice><voice name="b">world.</voice>Tra   r2   Fr   r   r   )r   r   voicer   r   r   r   r   Nr   r    r%   r%   r&   test_adjacent_voice  s   
z)TextProcessorTestCase.test_adjacent_voicec                 C   sv   t  }|d\}}t|j||fi t}| |tddddtddddtd	d
ddtddddtddddg dS )z9Test sentence that needs multiple passes to fully resolvezABCD-10r   r   r   r   r   r   r   BzB r   CzC r   DzD r5   r   Nr   r    r%   r%   r&   test_multiple_passes  s   z*TextProcessorTestCase.test_multiple_passesc              	   C   sR   t  }|d\}}t|j||fi t}| |tddddtddddg dS )	zTest sentence with nan or infznan infr   nanznan r   r   infNr   r    r%   r%   r&   test_number_nonfinite  s   z+TextProcessorTestCase.test_number_nonfinitec                 C   s   t  }|d\}}t|j||fi t}| |tddddtddddtd	dddtd
dddtddddtddddg |ddd\}}t|j||fi t}| |tddddg dS )zBTest use of inline lexicon pronunciation to override an initialismROOFUSr   RzR r   r   OzO r   r   FzF r5   UzU r8   r   u   
        <speak>
          <lexicon>
            <lexeme>
              <grapheme>ROOFUS</grapheme>
              <phoneme>ɹ ˈu f ə s</phoneme>
            </lexeme>
          </lexicon>
          <s>ROOFUS</s>
        </speak>Tra   Nr   r    r%   r%   r&   test_override_initialism  s*   

z.TextProcessorTestCase.test_override_initialismN)(__name__
__module____qualname____doc__r'   r)   r@   rD   rK   rW   r`   rf   rg   rh   rn   rx   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r%   r%   r%   r&   r      sL    (! (E!*i&.00r   c                 C   s   t | |dd d dS )zPrint graph to stderrc                  W   s   t | dtjiS )Nfile)printsysstderr)pr%   r%   r&   r|   D  r}   z$print_graph_stderr.<locals>.<lambda>)
print_funcNr   )r#   r$   r%   r%   r&   print_graph_stderrB  s   r  __main__)r  r  unittestgruut.text_processorr   r   r   r   gruut.utilsr   r   TestCaser   r  r  mainr%   r%   r%   r&   <module>   s&           >