o
    iF                     @   s  d dl Z d dlmZ d dlmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZmZ d dlmZmZmZmZ d dlmZmZ d d	lmZ d d
lmZmZmZmZ ddlmZ dZ dZ!dZ"dZ#e$ddd Z%e j&'ddd Z(dd Z)dd Z*dd Z+dd Z,dd  Z-e j&.d!e"e#gd"d# Z/d$d% Z0d&d' Z1d(d) Z2d*d+ Z3d,d- Z4e j&5d.d/d0 Z6d1d2 Z7d3d4 Z8d5d6 Z9e j&.d!e"e#gd7d8 Z:d9d: Z;d;d< Z<dS )=    N)RegistryError)ConfigConfigValidationError)German)English)DEFAULT_CONFIGDEFAULT_CONFIG_PRETRAIN_PATHLanguage)MaxoutWindowEncoderMultiHashEmbedbuild_tb_parser_modelbuild_Tok2Vec_model)ConfigSchemaConfigSchemaPretrain)Example)load_configload_config_from_strload_model_from_configregistry   )make_tempdira  
[paths]
train = null
dev = null

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}

[training]

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 666

[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.width}
a  
[paths]
train = null
dev = null

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}

[training]

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 666

[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.width}

[pretraining]
aX  
[model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 66
maxout_pieces = 2
use_upper = true

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 333
depth = 4
embed_size = 5555
window_size = 1
maxout_pieces = 7
subword_features = false
aY  
[model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 66
maxout_pieces = 2
use_upper = false

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 333
depth = 4
embed_size = 5555
window_size = 1
maxout_pieces = 7
subword_features = false
my_test_parserc                  C   sB   t tdddgddgddtdddd	d
} t| dddddd}|S )NiA  LOWERSHAPEi8  F)widthattrsrowsinclude_static_vectors      r   )r   window_sizemaxout_piecesdepthparserTA      )tok2vec
state_typeextra_state_tokenshidden_widthr!   	use_upper)r   r   r
   r   )r&   r#    r+   _/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/serialize/test_serialize_config.py	my_parser   s$   	r-   i  c                  C   s   ddiddid} t | }t (}|d }|| tj|dddiid	}|jd d dks1J W d
   d
S 1 s<w   Y  d
S )z?Test that config overrides are not lost after load is complete.langenkeyvalue)nlpcustom
test_modelr3   updated_valueconfigN)r   from_configr   to_diskspacyloadr7   )
source_cfg
source_nlpdir_pathsource_pathr2   r+   r+   r,   test_issue8190   s   

"r@   c                  C   s  t  t} tt t| dd W d    n1 sw   Y  t| dd}|jd d d dks4J t|jd dks?J |j	d	d
gksHJ t|jd dksSJ t|jd d dks`J |
d
 t|jd dkspJ t|jd d dks}J tt di i}tt |dd W d    n1 sw   Y  tt dddii}tt |dd W d    d S 1 sw   Y  d S )NF	auto_fillTtrainingbatchersizei     r&   tagger
componentsr   r2   pipelineyolofoobar)r   from_strnlp_config_stringpytestraisesr   r   r7   len
pipe_namesremove_pipe
ValueError)r7   r2   bad_cfgr+   r+   r,   test_create_nlp_from_config   s*   
"rV   c                  C   s4   t  t} tt}| |}tj|d td dS )z;Test that the default pretraining config validates properlypretraining)schemaN)	r   rM   pretrain_config_stringr   r   merger   resolver   )r7   pretrain_configfilledr+   r+   r,   'test_create_nlp_from_pretraining_config   s   
r^   c                  C   s   t  t} | d d | d d | d d d| d< t| d  | d d< t| dd}|jg dks5J |d	jdks?J |d
jdksIJ |djdksSJ |j	d }t
|dks`J t| g dkslJ |j	d d g dksyJ dS )zmTest that the nlp object is created correctly for a config with multiple
    instances of the same component.rH   r&   rG   )t2vtagger1tagger2r2   rI   TrA   r_   r`   ra   r   N)r   rM   rN   listkeysr   rR   get_pipe_metafactoryr7   rQ   )r7   r2   pipeline_configr+   r+   r,   .test_create_nlp_from_config_multiple_instances   s   




rg   c                  C   s  t  t} t| dd}|dd |  d|jv sJ d|jv s&J d|jvs-J |dj	d
ddks=J t 8}|| t|}d|jv sRJ d|jv sYJ d|jvs`J |dj	d
ddkspJ W d	   d	S 1 s{w   Y  d	S )
zNCreate a custom nlp pipeline from config and ensure it serializes it correctlyTrA   rG   Ar&   r#   nOV  N)r   rM   rN   r   get_pipe	add_label
initializerR   modelget_refget_dimr   r9   r:   r;   )
nlp_configr2   dnlp2r+   r+   r,   test_serialize_nlp  s     

""rt   c                  C   s   t  } t }ddi|d< | jd|d |   t 6}| | t|}|dj	}|
d |
ddd	ks<J |
d
dd	ksHJ W d   dS 1 sSw   Y  dS )zBCreate a custom nlp pipeline and ensure it serializes it correctlyz@architecturesr   rn   r#   r6   r&   uppernIr$   lowerN)r   dictadd_piperm   r   r9   r:   r;   rk   rn   ro   rp   )r2   
parser_cfgrr   rs   rn   r+   r+   r,   test_serialize_custom_nlp  s   


"r{   parser_config_stringc                 C   s   t  }t | }|jd|d}|d |  t ;}|| t	|}|
dj}|d |jd rC|dddksCJ |d	ddksOJ W d
   d
S 1 sZw   Y  d
S )zGCreate a non-default parser config to check nlp serializes it correctlyr#   r6   nsubjr&   	has_upperru   rv   B   rw   N)r   r   rM   ry   rl   rm   r   r9   r:   r;   rk   rn   ro   r   rp   )r|   r2   model_configr#   rr   rs   rn   r+   r+   r,   test_serialize_parser.  s   




"r   c                  C   s|   t  } | d | d t| jdd}|j| jksJ |j| jks$J |j| jks,J |j| jks4J |j| jks<J dS )zTTest that a config produced by the nlp object passes training config
    validation.entity_rulernerFrA   N)r   ry   r   r7   rR   _pipe_configs
_pipe_meta_factory_meta)r2   new_nlpr+   r+   r,   test_config_nlp_roundtripD  s   

r   c                  C   s~   t  } |  }t  |}|j| jksJ t  } t }| | t|}W d   n1 s0w   Y  |j| jks=J dS )zQTest that the config is serialized correctly and not interpolated
    by mistake.N)r   to_bytes
from_bytesr7   r   r9   r:   r;   )r2   	nlp_bytesr   rr   r+   r+   r,   $test_config_nlp_roundtrip_bytes_diskR  s   
r   c                  C   s  d} t j| ddiddtdtdtfdd}t }|| r J t  }|| s*J |j| dd	id
d |jd d
 }|d d	ksCJ |d | ksKJ t }|	| t
|}W d   n1 scw   Y  || soJ |jd
gkswJ |d
j| ksJ |jd d
 }|d d	ksJ |d | ksJ t |j }d|d d< tt t| W d   dS 1 sw   Y  dS )zVTest that config serialization works as expected with language-specific
    factories.'test_serialize_config_language_specificrK      )default_configr2   namec                 S   s   dd S )Nc                 S   s   | S Nr+   )docr+   r+   r,   <lambda>g  s    zQtest_serialize_config_language_specific.<locals>.custom_factory.<locals>.<lambda>r+   )r2   r   rK   r+   r+   r,   custom_factorye  s   z?test_serialize_config_language_specific.<locals>.custom_factoryd   rL   )r7   r   rH   re   Nder.   )r   re   r	   strinthas_factoryry   r7   r   r9   r:   r;   rR   rd   r   rM   to_strrO   rP   rT   r   )r   r   r2   pipe_configrr   rs   r7   r+   r+   r,   r   `  s4   

"r   c                  C   sz   t  t} | d d d| d d v sJ d| d vsJ tt t| dd W d    d S 1 s6w   Y  d S )NrH   r&   r2   rI   TrA   )r   rM   rN   poprO   rP   rT   r   r6   r+   r+   r,   #test_serialize_config_missing_pipes  s   "r   c                  C   s  dddgdi} ddgd}t  jt|d}t|dd}t|ts#J |jdgks+J t  t}t|dd}t|ts>J |jd	dgksGJ t }|	| t
j|| d
}W d    n1 saw   Y  t|tsmJ |jdgksuJ t }|	| t
j||d
}W d    n1 sw   Y  t|tsJ |jdgksJ t }|	| t
|}W d    n1 sw   Y  t|tsJ |jd	dgksJ d S )Nr2   r   rG   )r.   rI   )znlp.langznlp.pipeline)	overridesTrA   r&   r6   )r   rM   rN   r   
isinstancer   rR   r   r   r9   r:   r;   )overrides_nestedoverrides_dotr7   r2   base_configbase_nlprr   r+   r+   r,   test_config_overrides  s8   


r   zignore:\[W036c               	   C   s  t d} | d t m}| | t j|dddddiiiid}|jd d d d dks1J tdd	d
 }t j|ddddddidiiid}|jd d d d ddiks]J t	
|di }||g}d|v srJ W d    d S 1 s}w   Y  d S )Nr/   attribute_rulerrH   scorer@scorerszspacy.tagger_scorer.v1r6   test_some_other_keyc                   S   s   dS )Nsome_other_keyr+   r+   r+   r+   r,   misc_some_other_key  s   zGtest_config_overrides_registered_functions.<locals>.misc_some_other_keyz)spacy.overlapping_labeled_spans_scorer.v1@misc)r   	spans_keyr   za b cspans_some_other_key_f)r:   blankry   r   r9   r;   r7   r   miscr   	from_dictmake_docevaluate)r2   rr   nlp_re1r   nlp_re2examplescoresr+   r+   r,   *test_config_overrides_registered_functions  sN   



"r   c                  C   s>  t  jtdd} | d d d dksJ |  }|d d d d u s$J t| }|jd d d dks6J d}| d d	 d
 d d |ksHJ |jd d	 d
 d d |ksYJ |j }|d d d d u sjJ |d d	 d
 d d dkszJ t|}|jd d d d u sJ |jd d	 d
 d d dksJ d S )NF)interpolatecorporatrainpathz${paths.train}z!${components.tok2vec.model.width}rH   rG   rn   r&   r   rj   )r   rM   rN   r   r   r8   r7   )r7   interpolatedr2   r   interpolated2rs   r+   r+   r,   test_config_interpolation  s   
 "
 
&r   c                  C   sV   t  t} t| } d| vsJ tj| tdd}t  | }|d i ks)J d S )NrW   F)rX   validate)	r   rM   rN   r   rZ   r   fillr   r   )r7   r]   
new_configr+   r+   r,   test_config_optional_sections  s   
r   c                  C   sf   t ddii d} t| ddsJ t ddiddid} t| ddd	}d|jd
 vs,J t|j d S )Nr.   r/   )r2   rC   TrA   extrahelloF)rB   r   rC   )r   r   r7   )r7   r2   r+   r+   r,   "test_config_auto_fill_extra_fields  s   r   c                 C   st   t  }t | }d|d d< tt |jd|d W d    n1 s&w   Y  d|d d< |jd|d d S )Nnonsensern   r'   r#   r6   r   )r   r   rM   rO   rP   r   ry   )r|   r2   r7   r+   r+   r,   test_config_validate_literal  s   r   c                  C   s   t  } | j}ddi|d d< ddi|d d< t|dd} tt |   W d	   n1 s0w   Y  d	| jd d< |   d	S )
zTest that only the relevant blocks are resolved in the different methods
    and that invalid blocks are ignored if needed. For instance, the [initialize]
    shouldn't be resolved at runtime.
    r   nonexistentrC   before_to_diskrm   lookupsTrA   N)r   r7   r   rO   rP   r   rm   )r2   r7   r+   r+   r,   (test_config_only_resolve_relevant_blocks  s   
r   c                  C   sJ   d} t jdG dd dt}tt| }|djddgks#J d S )Nz
    [nlp]
    lang = "en"
    pipeline = ["my_punctual_component"]

    [components]

    [components.my_punctual_component]
    factory = "my_punctual_component"
    punctuation = ["?","-"]
    my_punctual_componentc                   @   s   e Zd ZdZdd ZdS )z2test_hyphen_in_config.<locals>.MyPunctualComponentr   c                 S   s
   || _ d S r   )punctuation)selfr2   r   r   r+   r+   r,   __init__5  s   
z;test_hyphen_in_config.<locals>.MyPunctualComponent.__init__N)__name__
__module____qualname__r   r   r+   r+   r+   r,   MyPunctualComponent1  s    r   ?-)	r:   r	   re   objectr   r8   r   rk   r   )hyphen_config_strr   r2   r+   r+   r,   test_hyphen_in_config$  s
   
r   )=rO   	cataloguer   	thinc.apir   r   r:   spacy.lang.der   spacy.lang.enr   spacy.languager   r   r	   spacy.ml.modelsr
   r   r   r   spacy.schemasr   r   spacy.trainingr   
spacy.utilr   r   r   r   utilr   rN   rY   parser_config_string_upperparser_config_string_no_upperarchitecturesr-   markissuer@   rV   r^   rg   rt   r{   parametrizer   r   r   r   r   r   filterwarningsr   r   r   r   r   r   r   r+   r+   r+   r,   <module>   sZ    36



#	

0


