o
    i /                     @   s  d dl mZ d dlZd dlZd dlZd dlmZmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ ddl
mZ dZ dZ!dZ"i ddidddddgZ#ddddddddddgZ$dd  Z%ej&'d!e#ej&'d"d#d$d% Z(ej&'d!e$d&d' Z)ej&'d!e$d(d) Z*ej&'d*e!e gd+d, Z+d-d. Z,d/d0 Z-d1d2 Z.d3d4 Z/d5d6 Z0d7d8 Z1dS )9    )PathN)Configget_current_ops)util)English)DEFAULT_CONFIG_PATHDEFAULT_CONFIG_PRETRAIN_PATH)create_pretrain_vectors)DocDocBin)init_nlp)train)pretrain)Vectors)Vocab   )make_tempdiraE  
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.width}

[pretraining]
max_epochs = 5

[training]
max_epochs = 5
a  
[nlp]
lang = "en"
pipeline = ["tagger"]

[components]

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"

[components.tagger.model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true

[pretraining]
max_epochs = 5

[training]
max_epochs = 5
a  
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.width}

[pretraining]
max_epochs = 5

[pretraining.objective]
@architectures = spacy.PretrainVectors.v1
maxout_pieces = 3
hidden_size = 300
loss = cosine

[training]
max_epochs = 5
@architectureszspacy.PretrainCharacters.v1   *   )r   maxout_pieceshidden_sizen_characterszspacy.PretrainVectors.v1   ,  cosine)r   r   r   loss   L2c                  C   sR   t  t} tj| ddd}|j}tt}||}d|d d d v s'J dS )	z7Test that pretraining defaults to a character objectiveTF	auto_fillvalidatePretrainCharacterspretraining	objectiver   N)	r   from_strpretrain_string_internalr   load_model_from_configconfigload_configr   merge)r(   nlpfilledpretrain_config r.   Y/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/training/test_pretraining.pytest_pretraining_default   s   

r0   r$   	skip_last)TFc                 C   s&  t  t}| |d d< tj|ddd}|j}tt}||}t	 e}t
|}||d d< | }|d d d	ks=J t|||d
 t|d  sNJ t|d  sXJ t|d  rbJ |rot|d  rnJ nt|d  syJ W d   dS W d   dS 1 sw   Y  dS )z8Test that pretraining works with the character objectiver#   r$   TFr   pathsraw_text	componenttok2vec)r1   
model0.bin
model4.bin
model5.binmodel-last.binNr   r%   pretrain_string_listenerr   r'   r(   r)   r   r*   r   write_sample_jsonlinterpolater   r   exists)r$   r1   r(   r+   r,   r-   tmp_dir	file_pathr.   r.   r/   #test_pretraining_tok2vec_characters   s*   


"rA   c              	   C   s   t  t}| |d d< tj|ddd}|j}tt}||}t	 C}t
|}||d d< | }|d d	 d
u s=J tt t|| W d
   n1 sRw   Y  W d
   d
S W d
   d
S 1 sjw   Y  d
S )z]Test that pretraining doesn't works with the vectors objective if there are no static vectorsr#   r$   TFr   r2   r3   
initializevectorsN)r   r%   r;   r   r'   r(   r)   r   r*   r   r<   r=   pytestraises
ValueErrorr   )r$   r(   r+   r,   r-   r?   r@   r.   r.   r/   %test_pretraining_tok2vec_vectors_fail   s"   

"rG   c                 C   s   t  t}| |d d< tj|ddd}|j}tt}||}t	 &}t
|}||d d< t|}||d d	< | }t|| W d
   d
S 1 sMw   Y  d
S )zQTest that pretraining works with the vectors objective and static vectors definedr#   r$   TFr   r2   r3   rB   rC   N)r   r%   r;   r   r'   r(   r)   r   r*   r   r<   write_vectors_modelr=   r   )r$   r(   r+   r,   r-   r?   r@   nlp_pathr.   r.   r/    test_pretraining_tok2vec_vectors   s   

"rJ   r(   c                 C   s   t  t} tj| ddd}|j}tt}||}t	 P}t
|}||d d< d|d d< d	|d d
< | }t|| t|d  sHJ t|d  sRJ t|d  s\J t|d  rfJ W d   dS 1 sqw   Y  dS )z?Test pretraining of the tagger's tok2vec layer (via a listener)TFr   r2   r3   taggerr#   r4   r5   layerr6   r7   r9   r8   Nr:   r(   r+   r,   r-   r?   r@   r.   r.   r/   test_pretraining_tagger_tok2vec   s"   


"rN   c               	   C   s   t  t} tj| ddd}|j}tt}||}t	 ?}t
|}||d d< d|d d< | }tt t|| W d	   n1 sHw   Y  W d	   d	S W d	   d	S 1 s`w   Y  d	S )
z\Test pretraining of the tagger itself will throw an error (not an appropriate tok2vec layer)TFr   r2   r3   rK   r#   r4   N)r   r%   r&   r   r'   r(   r)   r   r*   r   r<   r=   rD   rE   rF   r   rM   r.   r.   r/   test_pretraining_tagger   s    

"rO   c               	   C   s  t  t} tj| ddd}|j}tt}||}tt	}||}t
 }|d }|  t|}||d d< d|d d	< d
|d d< |d }|  t|\}	}
|	|d d< |
|d d< | }|d }t|}||d	 j|d d}d}| D ]	}|jdkr|}qt|| t|d }| sJ t||d d< t|}||d	 j|d d}d}| D ]	}|jdkr|}qtt|d|dsJ t|| W d   dS 1 sw   Y  dS )z5Test that training can use a pretrained Tok2Vec modelTFr   r   r2   r3   rK   r#   r4   r5   rL   r   devembedN	hashembedz
model3.binrB   init_tok2vecE)r   r%   r&   r   r'   r(   r)   r   r*   r   r   mkdirr<   write_sample_trainingr=   r   get_pipemodelget_refwalknamer   r   r>   strnpany	not_equal	get_paramr   )r(   r+   r,   r-   train_configr?   pretrain_dirr@   	train_dir
train_pathdev_pathPnlp_base
model_base
embed_basenodepretrained_modelrX   rQ   r.   r.   r/   test_pretraining_training  sV   





 
""rl   c                 C   sF   ddidddddddid	ddddg}|  d
}t || |S )Nid1z$This is the best TV you'll ever buy!   r   )posneg)metatextcats2zI wouldn't buy this again.z/text.jsonl)srslywrite_jsonl)r?   datar@   r.   r.   r/   r<   :  s   
r<   c                 C   sb   g d}g d}t t j||d}t }|| |  d}|  d}|| || ||fS )N)Theplayersstart.)DTNNVBZr|   )wordstagsz/train.spacyz
/dev.spacy)r
   r   vocabr   addto_disk)r?   r   r   docdoc_binrd   re   r.   r.   r/   rV   L  s   




rV   c                 C   s~   dd l }t }|jddd|jddd|jdddd}| D ]
\}}||| q#| d }t|}|| t|S )Nr   ro   )r   )dogcatorangevectors_model)	numpyr   randomuniformitems
set_vectorr   r   r\   )r?   r   r   vector_datawordvectorrI   r+   r.   r.   r/   rH   Y  s   
rH   c                  C   s   t  } | d |   tdd| j_tddd| j| dj tt	 j
dddd| j_tddd| j| dj tjtdd	 t | j_tddd| j| dj W d    d S 1 scw   Y  d S )
Nr5   )
   r   )shapero   r   floret)rx   mode
hash_countE875)match)r   add_piperB   r   r   rC   r	   rW   rX   r   xpzerosrD   rE   rF   )r+   r.   r.   r/   test_pretrain_default_vectorsj  s   



"r   )2pathlibr   r   r]   rD   rv   	thinc.apir   r   spacyr   spacy.lang.enr   spacy.languager   r   spacy.ml.models.multi_taskr	   spacy.tokensr
   r   spacy.training.initializer   spacy.training.loopr   spacy.training.pretrainr   spacy.vectorsr   spacy.vocabr   r   r;   r&   pretrain_string_vectorsCHAR_OBJECTIVESVECTOR_OBJECTIVESr0   markparametrizerA   rG   rJ   rN   rO   rl   r<   rV   rH   r   r.   r.   r.   r/   <module>   sl    %,



.