import re
import string

import hypothesis
import hypothesis.strategies
import pytest

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import get_lang_class

# Languages whose tokenizers are exercised below. Most entries are gated
# behind the slow marker; a few fast languages always run.
LANGUAGES = [
    pytest.param("fr", marks=pytest.mark.slow()),
    pytest.param("af", marks=pytest.mark.slow()),
    pytest.param("ar", marks=pytest.mark.slow()),
    pytest.param("bg", marks=pytest.mark.slow()),
    "bn",
    pytest.param("bo", marks=pytest.mark.slow()),
    pytest.param("ca", marks=pytest.mark.slow()),
    pytest.param("cs", marks=pytest.mark.slow()),
    pytest.param("da", marks=pytest.mark.slow()),
    pytest.param("de", marks=pytest.mark.slow()),
    "el",
    "en",
    pytest.param("es", marks=pytest.mark.slow()),
    pytest.param("et", marks=pytest.mark.slow()),
    pytest.param("fa", marks=pytest.mark.slow()),
    pytest.param("fi", marks=pytest.mark.slow()),
    "fr",
    pytest.param("ga", marks=pytest.mark.slow()),
    pytest.param("he", marks=pytest.mark.slow()),
    pytest.param("hi", marks=pytest.mark.slow()),
    pytest.param("hr", marks=pytest.mark.slow()),
    "hu",
    pytest.param("id", marks=pytest.mark.slow()),
    pytest.param("it", marks=pytest.mark.slow()),
    pytest.param("kn", marks=pytest.mark.slow()),
    pytest.param("lb", marks=pytest.mark.slow()),
    "lt",
    pytest.param("lv", marks=pytest.mark.slow()),
    pytest.param("nb", marks=pytest.mark.slow()),
    pytest.param("nl", marks=pytest.mark.slow()),
    "pl",
    pytest.param("pt", marks=pytest.mark.slow()),
    pytest.param("ro", marks=pytest.mark.slow()),
    pytest.param("si", marks=pytest.mark.slow()),
    pytest.param("sk", marks=pytest.mark.slow()),
    pytest.param("sl", marks=pytest.mark.slow()),
    pytest.param("sq", marks=pytest.mark.slow()),
    pytest.param("sr", marks=pytest.mark.slow()),
    pytest.param("sv", marks=pytest.mark.slow()),
    pytest.param("ta", marks=pytest.mark.slow()),
    pytest.param("te", marks=pytest.mark.slow()),
    pytest.param("tl", marks=pytest.mark.slow()),
    pytest.param("tr", marks=pytest.mark.slow()),
    pytest.param("tt", marks=pytest.mark.slow()),
    pytest.param("ur", marks=pytest.mark.slow()),
    pytest.param("kmr", marks=pytest.mark.slow()),
]


@pytest.mark.parametrize("lang", LANGUAGES)
def test_tokenizer_explain(lang):
    tokenizer = get_lang_class(lang)().tokenizer
    examples = pytest.importorskip(f"spacy.lang.{lang}.examples")
    for sentence in examples.sentences:
        tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
        debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
        assert tokens == debug_tokens


def test_tokenizer_explain_special_matcher(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    infix_re = re.compile(r"[/]")
    rules = {"a.": [{"ORTH": "a."}]}
    tokenizer = Tokenizer(
        en_vocab,
        rules=rules,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
    )
    tokens = [t.text for t in tokenizer("a/a.")]
    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
    assert tokens == explain_tokens


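# tokenizer.explain() returns (pattern, substring) pairs such as ("TOKEN", "a"),
# ("INFIX", "/") or ("SPECIAL-1", "a."), which is why these tests compare each
# token's .text against element [1] of the corresponding explain() tuple.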
def test_tokenizer_explain_special_matcher_whitespace(en_vocab):
    rules = {":]": [{"ORTH": ":]"}]}
    tokenizer = Tokenizer(en_vocab, rules=rules)
    text = ": ]"
    tokens = [t.text for t in tokenizer(text)]
    explain_tokens = [t[1] for t in tokenizer.explain(text)]
    assert tokens == explain_tokens


@hypothesis.strategies.composite
def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
    """
    Composite strategy for fuzzily generating sentence with varying interpunctation.

    draw (hypothesis.strategies.DrawFn): Protocol for drawing function allowing to fuzzily pick from hypothesis'
                                         strategies.
    max_n_words (int): Max. number of words in generated sentence.
    RETURNS (str): Fuzzily generated sentence.
    """
    punctuation_and_space_regex = "|".join(
        [re.escape(p) for p in string.punctuation] + [r"\s"]
    )
    # Each drawn element is a (word, separator) pair; the number of pairs is
    # itself drawn from an integer strategy.
    sentence = [
        [
            draw(hypothesis.strategies.text(min_size=1)),
            draw(hypothesis.strategies.from_regex(punctuation_and_space_regex)),
        ]
        for _ in range(
            draw(hypothesis.strategies.integers(min_value=2, max_value=max_n_words))
        )
    ]

    return " ".join([token for token_pair in sentence for token in token_pair])


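# Because sentence_strategy() is wrapped in @hypothesis.strategies.composite,
# calling it below returns a search strategy rather than a concrete string;
# hypothesis draws a fresh sentence from that strategy for every example run
# of the @hypothesis.given test.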
@pytest.mark.xfail
@pytest.mark.parametrize("lang", LANGUAGES)
@hypothesis.given(sentence=sentence_strategy())
def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
    """
    Tests whether output of tokenizer.explain() matches tokenizer output. Input generated by hypothesis.
    lang (str): Language to test.
    sentence (str): Fuzzily generated sentence to tokenize.
    z\s+rk   c                 S   rM   r3   rN   r6   r3   r3   r9   r:      rO   z0test_tokenizer_explain_fuzzy.<locals>.<listcomp>c                 S   r;   r<   r3   r6   r3   r3   r9   r:      r>   z, N)spacyblankr?   rP   substriprC   )r2   rE   r?   rF   rG   r3   r3   r9   test_tokenizer_explain_fuzzy   s
   
$rx   )rZ   )rP   ro   rb   hypothesis.strategiesr@   rt   spacy.tokenizerr   
spacy.utilr   parammarkslow	LANGUAGESparametrizerH   rX   rY   rc   	compositeDrawFnintstrrs   xfailgivenrx   r3   r3   r3   r9   <module>   s    	
 !"#$%&'()*+,-.2
	 