o
    i
                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ dd	lmZ ejd
d Zejddd ZdZejddd Zdd Zdd Zdd ZdS )    N)English)Italian)Language)	Tokenizer)Example)load_config_from_str   )make_tempdirc                   C   s    ddddddddddd d	d
S )Nname-in-fixturezversion-in-fixturezdescription-in-fixturezauthor-in-fixturezemail-in-fixturezurl-in-fixturezlicense-in-fixturer   )widthvectorskeysname)r   versiondescriptionauthoremailurllicenser    r   r   r   a/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/serialize/test_serialize_language.py	meta_data   s   r   i	  c                  C   s(   t  } | d |  }t  | dS )zBTest we can serialize and deserialize a blank NER or parser model.nerN)r   add_pipeto_bytes
from_bytes)nlpbr   r   r   test_issue2482   s   
r   a<  
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[components.ner]
factory = "ner"

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
i&  c                      s@   t tt   fdd t   d t  dS )zwTest that the nlp object with initialized tok2vec with listeners pickles
    correctly (and doesn't have lambdas).
    c                      s   t  dddgigS )NhellotagsV)r   	from_dictmake_docr   r   r   r   <lambda>Y   s    z test_issue6950.<locals>.<lambda>r   N)r   from_configr   CONFIG_ISSUE_6950
initializepickledumpsr   r   r$   r   test_issue6950S   s
   
r+   c                 C   sZ   t | d}t }|| t  |}W d    n1 sw   Y  |j|jks+J d S )Nmeta)r   r	   to_disk	from_diskr-   )r   languagednew_languager   r   r   !test_serialize_language_meta_disk_   s   

r3   c                     st   t dt dt d  fdd} t }| ||_t }|| W d   dS 1 s3w   Y  dS )zTest that serialization with custom tokenizer works without token_match.
    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
    z$1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]: z[~]c                    s   t | ji jj jdS )N)prefix_searchsuffix_searchinfix_finditer)r   vocabsearchfinditerr$   infix_re	prefix_re	suffix_rer   r   custom_tokenizero   s   z>test_serialize_with_custom_tokenizer.<locals>.custom_tokenizerN)recompiler   	tokenizerr	   r.   )r?   r   r1   r   r;   r   $test_serialize_with_custom_tokenizerg   s   


	
"rC   c                 C   s   d}t | d}|jd |ksJ t  | }|jd |ks!J t  j| dgd}|jd |kr5J t  |jdgd}|jd |krIJ d S )Nr
   r,   r   r-   )exclude)r   r-   r   r   )r   r   r   new_nlpr   r   r   test_serialize_language_exclude~   s   
rF   )r)   r@   pytestspacy.lang.enr   spacy.lang.itr   spacy.languager   spacy.tokenizerr   spacy.trainingr   
spacy.utilr   utilr	   fixturer   markissuer   r'   r+   r3   rC   rF   r   r   r   r   <module>   s(    



,
