o
     ¢i_  ã                   @   s.  d dl mZ d dlZg d¢Zg d¢Zg d¢Zej dg d¢¡dd	„ ƒZej d
e¡ej ddg¡dd„ ƒƒZ	ej d
e¡ej ddg¡dd„ ƒƒZ
ej d
e¡ej ddg¡ej ddg¡dd„ ƒƒƒZej d
e¡ej ddg¡ej ddg¡dd„ ƒƒƒZej d
e¡ej ddg¡dd„ ƒƒZej d
e¡ej ddg¡dd„ ƒƒZej ddg¡dd„ ƒZej ddg¡dd „ ƒZej d!e¡ej dd"g¡d#d$„ ƒƒZej d!e¡ej d%d&g¡ej dd"g¡d'd(„ ƒƒƒZej dd)g¡d*d+„ ƒZd,d-„ Zej dg d.¢¡d/d0„ ƒZej dd1d2g¡d3d4„ ƒZdS )5é    )ÚpunctuationN)ú(ú[Ú{Ú*)ú)ú]Ú}r   ))r   r   )r   r   )r   r	   )r   r   Útext)r   z((ú<c                 C   s    | |ƒ}t |ƒt |ƒksJ ‚d S )N©Úlen©Úru_tokenizerr
   Útokens© r   úV/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/lang/ru/test_tokenizer.pyÚ$test_ru_tokenizer_handles_only_punct
   s   r   Úpunctu   ÐŸÑ€Ð¸Ð²ÐµÑ‚c                 C   sD   | || ƒ}t |ƒdksJ ‚|d j|ksJ ‚|d j|ks J ‚d S ©Né   r   é   ©r   r
   ©r   r   r
   r   r   r   r   Ú#test_ru_tokenizer_splits_open_punct   ó   r   c                 C   sD   | || ƒ}t |ƒdksJ ‚|d j|ksJ ‚|d j|ks J ‚d S r   r   r   r   r   r   Ú$test_ru_tokenizer_splits_close_punct   r   r   Ú	punct_addú`c                 C   sZ   | || | ƒ}t |ƒdksJ ‚|d j|ksJ ‚|d j|ks"J ‚|d j|ks+J ‚d S ©Né   r   r   r   r   ©r   r   r   r
   r   r   r   r   Ú,test_ru_tokenizer_splits_two_diff_open_punct"   ó
   r"   ú'c                 C   sZ   | || | ƒ}t |ƒdksJ ‚|d j|ksJ ‚|d j|ks"J ‚|d j|ks+J ‚d S r   r   r!   r   r   r   Ú-test_ru_tokenizer_splits_two_diff_close_punct-   r#   r%   c                 C   sL   | || | | ƒ}t |ƒdksJ ‚|d j|ksJ ‚|d j|ks$J ‚d S )Né   r   r    r   r   r   r   r   Ú(test_ru_tokenizer_splits_same_open_punct8   ó   r'   c                 C   sL   | || | | ƒ}t |ƒdksJ ‚|d j|ksJ ‚|d j|ks$J ‚d S )Nr&   r   r   r   r   r   r   r   Ú)test_ru_tokenizer_splits_same_close_punctA   r(   r)   u	   'Ð¢ÐµÑÑ‚c                 C   s.   | |ƒ}t |ƒdksJ ‚|d jdksJ ‚d S )Nr   r   r$   r   r   r   r   r   Ú)test_ru_tokenizer_splits_open_appostropheJ   s   r*   u
   Ð¢ÐµÑÑ‚''c                 C   s4   | |ƒ}t |ƒdksJ ‚| dƒ}t |ƒdksJ ‚d S )Nr   z''r   r   )r   r
   r   Útokens_punctr   r   r   Ú)test_ru_tokenizer_splits_double_end_quoteQ   s   r,   zpunct_open,punct_closeu   Ð¢ÐµÑÑ‚c                 C   sZ   | || | ƒ}t |ƒdksJ ‚|d j|ksJ ‚|d j|ks"J ‚|d j|ks+J ‚d S r   r   )r   Ú
punct_openÚpunct_closer
   r   r   r   r   Ú)test_ru_tokenizer_splits_open_close_punctY   s
   r/   zpunct_open2,punct_close2)r   r$   c                 C   s†   | || | | | ƒ}t |ƒdksJ ‚|d j|ksJ ‚|d j|ks&J ‚|d j|ks/J ‚|d j|ks8J ‚|d j|ksAJ ‚d S )Né   r   r   r   r    r&   r   )r   r-   r.   Úpunct_open2Úpunct_close2r
   r   r   r   r   Ú test_ru_tokenizer_two_diff_puncte   s   r3   u	   Ð¢ÐµÑÑ‚.c                 C   s   | |ƒ}|d j dksJ ‚d S )Nr   Ú.)r
   r   r   r   r   Ú%test_ru_tokenizer_splits_trailing_dott   s   r5   c                 C   s*   d}| |ƒ}|t |ƒd  jdksJ ‚d S )Nu+   (Ð Ð°Ð·, Ð´Ð²Ð°, Ñ‚Ñ€Ð¸, Ð¿Ñ€Ð¾Ð²ÐµÑ€ÐºÐ°).r   r4   r   r   r   r   r   Ú'test_ru_tokenizer_splits_bracket_periodz   s   r6   )
uU   Ñ€ÐµÐºÐ¾Ð¼ÐµÐ½Ð´ÑƒÌÑ Ð¿Ð¾Ð´Ð´Ð°ÌÑ‚ÑŒ Ð¶Ð°Ñ€ÑƒÌ. Ð¡Ð°Ð¼Ð¾Ð³Ð¾Ì Ð‘Ð°Ñ€Ð³Ð°Ð¼Ð¾Ñ‚Ð°uU   Ð Ð•ÐšÐžÐœÐ•ÐÐ”Ð£ÌÐ¯ ÐŸÐžÐ”Ð”ÐÌÐ¢Ð¬ Ð–ÐÐ Ð£Ì. Ð¡ÐÐœÐžÐ“ÐžÌ Ð‘ÐÐ Ð“ÐÐœÐžÐ¢ÐuT   Ñ€ÐµÐºÐ¾Ð¼ÐµÐ½Ð´ÑƒÌÑ Ð¿Ð¾Ð´Ð´Ð°ÌÑ‚ÑŒ Ð¶Ð°Ñ€ÑƒÌ.Ð¡Ð°Ð¼Ð¾Ð³Ð¾Ì Ð‘Ð°Ñ€Ð³Ð°Ð¼Ð¾Ñ‚Ð°uU   Ñ€ÐµÐºÐ¾Ð¼ÐµÐ½Ð´ÑƒÌÑ Ð¿Ð¾Ð´Ð´Ð°ÌÑ‚ÑŒ Ð¶Ð°Ñ€ÑƒÌ.'Ð¡Ð°Ð¼Ð¾Ð³Ð¾Ì Ð‘Ð°Ñ€Ð³Ð°Ð¼Ð¾Ñ‚Ð°uT   Ñ€ÐµÐºÐ¾Ð¼ÐµÐ½Ð´ÑƒÌÑ Ð¿Ð¾Ð´Ð´Ð°ÌÑ‚ÑŒ Ð¶Ð°Ñ€ÑƒÌ,ÑÐ°Ð¼Ð¾Ð³Ð¾Ì Ð‘Ð°Ñ€Ð³Ð°Ð¼Ð¾Ñ‚Ð°uT   Ñ€ÐµÐºÐ¾Ð¼ÐµÐ½Ð´ÑƒÌÑ Ð¿Ð¾Ð´Ð´Ð°ÌÑ‚ÑŒ Ð¶Ð°Ñ€ÑƒÌ:ÑÐ°Ð¼Ð¾Ð³Ð¾Ì Ð‘Ð°Ñ€Ð³Ð°Ð¼Ð¾Ñ‚Ð°uU   Ñ€ÐµÐºÐ¾Ð¼ÐµÐ½Ð´ÑƒÌÑ Ð¿Ð¾Ð´Ð´Ð°ÌÑ‚ÑŒ Ð¶Ð°Ñ€ÑƒÌ. ÑÐ°Ð¼Ð¾Ð³Ð¾Ì Ð‘Ð°Ñ€Ð³Ð°Ð¼Ð¾Ñ‚Ð°uU   Ñ€ÐµÐºÐ¾Ð¼ÐµÐ½Ð´ÑƒÌÑ Ð¿Ð¾Ð´Ð´Ð°ÌÑ‚ÑŒ Ð¶Ð°Ñ€ÑƒÌ, ÑÐ°Ð¼Ð¾Ð³Ð¾Ì Ð‘Ð°Ñ€Ð³Ð°Ð¼Ð¾Ñ‚Ð°uU   Ñ€ÐµÐºÐ¾Ð¼ÐµÐ½Ð´ÑƒÌÑ Ð¿Ð¾Ð´Ð´Ð°ÌÑ‚ÑŒ Ð¶Ð°Ñ€ÑƒÌ: ÑÐ°Ð¼Ð¾Ð³Ð¾Ì Ð‘Ð°Ñ€Ð³Ð°Ð¼Ð¾Ñ‚Ð°uT   Ñ€ÐµÐºÐ¾Ð¼ÐµÐ½Ð´ÑƒÌÑ Ð¿Ð¾Ð´Ð´Ð°ÌÑ‚ÑŒ Ð¶Ð°Ñ€ÑƒÌ-ÑÐ°Ð¼Ð¾Ð³Ð¾Ì Ð‘Ð°Ñ€Ð³Ð°Ð¼Ð¾Ñ‚Ð°c                 C   s0   | |ƒ}|d j dv sJ ‚|d j tv sJ ‚d S )Nr   )u
   Ð¶Ð°Ñ€ÑƒÌu
   Ð–ÐÐ Ð£Ìu
   Ð¶Ð°Ñ€ÑƒÌr    )r
   r   r   r   r   r   Ú*test_ru_tokenizer_handles_final_diacritics€   s   r7   uT   Ð Ð•ÐšÐžÐœÐ•ÐÐ”Ð£ÌÐ¯ ÐŸÐžÐ”Ð”ÐÌÐ¢Ð¬ Ð–ÐÐ Ð£Ì.Ð¡ÐÐœÐžÐ“ÐžÌ Ð‘ÐÐ Ð“ÐÐœÐžÐ¢ÐuT   Ñ€ÐµÐºÐ¾Ð¼ÐµÐ½Ð´ÑƒÌÑ Ð¿Ð¾Ð´Ð´Ð°ÌÑ‚ÑŒ Ð¶Ð°Ñ€ÑƒÌ.ÑÐ°Ð¼Ð¾Ð³Ð¾Ì Ð‘Ð°Ñ€Ð³Ð°Ð¼Ð¾Ñ‚Ð°c                 C   s"   | |ƒ}|d j  ¡ dksJ ‚d S )Nr   u   Ð¶Ð°Ñ€ÑƒÌ.ÑÐ°Ð¼Ð¾Ð³Ð¾Ì)r
   Úlowerr   r   r   r   Ú4test_ru_tokenizer_handles_final_diacritic_and_period•   s   r9   )Ústringr   ÚpytestÚ
PUNCT_OPENÚPUNCT_CLOSEÚPUNCT_PAIREDÚmarkÚparametrizer   r   r   r"   r%   r'   r)   r*   r,   r/   r3   r5   r6   r7   r9   r   r   r   r   Ú<module>   sj    




þ
þþ