o
    iE                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZmZ ddlmZmZ d	d
 Zejddd Zejddd Zejddd Zdd Zejdg ddd Zdd ZdS )    N)ENT_IOBENT_TYPE)English)	Tokenizer)Doc)compile_infix_regexcompile_prefix_regexcompile_suffix_regexget_lang_class
load_model   )assert_packed_msg_equalmake_tempdirc                 C   s   t d j}||  |S )Nen)r
   	tokenizer
from_bytes)btok r   b/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/serialize/test_serialize_tokenizer.pyload_tokenizer   s   
r   i  c                 C   s   t | ddgd}tt t|d  W d   n1 sw   Y  tt t|dd  W d   dS 1 s>w   Y  dS )zATest that a custom error is raised if a token or span is pickled.Helloworld)wordsr   Nr   )r   pytestraisesNotImplementedErrorpickledumps)en_vocabdocr   r   r   test_issue2833   s   "r!   i  c                 C   s   g d}g d}g d}g d}t | ||||d}|ds J d}|d j|d j|d j|d jf|ks8J ttg}||}|	|| |d j|d j|d j|d jf|ks]J |
 }	t | |	}
|
d j|
d j|
d j|
d jf|ks~J d	S )
ziTest that the is_tagged attribute doesn't get overwritten when we from_array
    without tag information.)Thisis10%.)DTVBZCDNNr&   )DETVERBNUMNOUNPUNCT)Or0   z	B-PERCENTz	I-PERCENTr0   )r   tagsposentsTAG)r$   r-   r)   PERCENTr   N)r   has_annotationtextpos_tag_	ent_type_r   r   to_array
from_arrayto_bytesr   )r   r   r1   r2   r3   r    expectedheader	ent_array	doc_bytesdoc2r   r   r   test_issue3012%   s   ,
,0rC   i^  c                  C   s   dd } d}t  }||}dd |D }| | ||}dd |D }t }|| t|}W d    n1 s:w   Y  ||}	dd |	D }
||
ksPJ |jjdu sXJ d S )	Nc              	   S   sj   t | jj}t| jj}t| jj}dd t| jj	 D }t
| j||j|j|j| jjdd}|| _d S )Nc                 S   s.   i | ]\}}t |d kr|d dks||qS )r      r&   )len).0kvr   r   r   
<dictcomp>B   s
    z?test_issue4190.<locals>.customize_tokenizer.<locals>.<dictcomp>F)prefix_searchsuffix_searchinfix_finditertoken_matchfaster_heuristics)r   Defaultsprefixesr	   suffixesr   infixesdicttokenizer_exceptionsitemsr   vocabsearchfinditerr   rM   )nlp	prefix_re	suffix_reinfix_re
exceptionsnew_tokenizerr   r   r   customize_tokenizer=   s    
	z+test_issue4190.<locals>.customize_tokenizerzTest c.c                 S      g | ]}|j qS r   r7   rF   tokenr   r   r   
<listcomp>V       z"test_issue4190.<locals>.<listcomp>c                 S   r`   r   ra   rb   r   r   r   rd   Z   re   c                 S   r`   r   ra   rb   r   r   r   rd   a   re   F)r   r   to_diskr   r   rN   )r_   test_stringnlp_1doc_1a	result_1adoc_1b	result_1b	model_dirnlp_2doc_2result_2r   r   r   test_issue4190;   s    

rq   c                 C   s  t | |jd}| }t | | td j}tdj|_	|j
i ks&J |j	dus-J |jdus4J |jdus;J |jdusBJ || |j
i ksNJ |j	du sUJ |jdu s\J |jdu scJ |jdu sjJ t | dddiddigid	}i |_
| }t | |}|j
i ksJ dS )
zTest that custom tokenizer with not all functions defined or empty
    properties can be serialized and deserialized correctly (see #2494,
    #4991).)rK   r   testNzABC.ORTHABCr&   )rules)r   rK   r=   r   r
   r   recompilematchrM   ru   	url_matchrJ   rL   )r   en_tokenizerr   tokenizer_bytestokenizer_reloadedr   r   r   test_serialize_custom_tokenizerf   s*   
r}   r7   )u   I💜youu	   they’reu   “hello”c                 C   sj   | }t | }t| |  | | ksJ ||}||}dd |D dd |D ks3J d S )Nc                 S   r`   r   ra   rb   r   r   r   rd      re   z<test_serialize_tokenizer_roundtrip_bytes.<locals>.<listcomp>)r   r=   r   )rz   r7   r   r^   doc1rB   r   r   r   (test_serialize_tokenizer_roundtrip_bytes   s   $r   c                 C   s`   | }t  !}|d }|| | |}| | ksJ W d    d S 1 s)w   Y  d S )Nr   )r   rf   	from_diskr=   )rz   r   d	file_pathtokenizer_dr   r   r   'test_serialize_tokenizer_roundtrip_disk   s   

"r   )r   rv   r   spacy.attrsr   r   spacy.lang.enr   spacy.tokenizerr   spacy.tokensr   
spacy.utilr   r   r	   r
   r   utilr   r   r   markissuer!   rC   rq   r}   parametrizer   r   r   r   r   r   <module>   s(    


	


*

