o
    i                     @   s  d dl Z d dlZd dlZd dlZd dlmZmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZmZmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZm Z m!Z!m"Z" ddl#m$Z$ ej%dd Z&e% dd Z'ej%dd Z(ej)*ddd Z+ej)*ddd Z,dZ-ej)*ddd Z.dd Z/dd  Z0d!d" Z1d#d$ Z2d%d& Z3d'd( Z4d)d* Z5d+d, Z6d-d. Z7ej)8d/d0d1 Z9d2d3 Z:d4d5 Z;d6d7 Z<d8d9 Z=d:d; Z>d<d= Z?d>d? Z@d@dA ZAdBdC ZBdDdE ZCdFdG ZDdHdI ZEdJdK ZFdLdM ZGdNdO ZHdPdQ ZIdRdS ZJej)KdTg dUdVdWgd gd gdXggd dXgdggffg dYdZdWgd gd gd gdXggg d[d\ggffd]d^gdVdWgd gd dXggd dXgdXggffg d_g d`d dXgdgdggd gd gdXdggffg d`g dad gdXgdd\ggd gdXgdgdggffdbd]gd]gg d ggdXggffg dcg ddd gd dXgdXgdggd dXgdXdgd\ggffgdedf ZLdgdh ZMdidj ZNdkdl ZOdmdn ZPdodp ZQdqdr ZRdsdt ZSdudv ZTdwdx ZUdydz ZVd{d| ZWd}d~ ZXdd ZYdd ZZdd Z[dd Z\dS )    N)Adamcompounding)English)DocDocBin)	AlignmentCorpusExamplebiluo_tags_to_offsetsbiluo_tags_to_spansdocs_to_jsoniob_to_biluooffsets_to_biluo_tagsget_alignments)AlignmentArray)json_to_docs)train_while_improving)get_words_and_spacesload_config_from_strload_model_from_path	minibatch   )make_tempdirc                  C   s   t  } g d}g d}g d}g d}g d}g d}g d}dgt| }d	|d
< d|d< d|d< d|d< d|d< ddd}	t| j||||||||d	}
|	|
_|
S )N
Sarah'ssisterflewtoSiliconValleyviaLondon.)
NNPPOSNNVBDINr%   r%   r)   r%   r$   )
PROPNPARTNOUNVERBADPr*   r*   r.   r*   PUNCT)
NounType=prop|Number=singzPoss=yeszNumber=singzTense=past|VerbForm=fin r0   r0   r1   r0   zPunctType=peri)
r   r      r2   r2         r2         )
posscasensubjROOTprepcompoundpobjr;   r=   punct)
r   r   r   flyr   r    r!   r"   r#   r$   OB-PERSONr   zI-PERSON   B-LOCr6   I-LOCr3   zB-GPE         ?        )TRAVELBAKING)wordstagsposmorphsheadsdepslemmasents)r   lenr   vocabcats)nlprJ   rK   rL   rM   rN   rO   rP   rQ   rT   doc rW   V/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/training/test_training.pyrV   #   s8   
rV   c                   C   s$   g dg dg dg dg ddS )N)rB   r   r2   r4   r6   r3   r5   )HithereeveryoneItisjustme)TTTTTTF)INTJADVPRONrb   AUXra   rb   )rB   r   r   rB   r   r   r   )idsrJ   spacesrK   sent_startsrW   rW   rW   rW   rX   merged_dictH   s   rg   c                  C   s   t  } | jS N)r   rS   )rU   rW   rW   rX   rS   S   s   rS   i  c               
   C   sj  dg gdg gdg gdg gdg gdg gddggd	d
ggddggg	} t  }|d}| D ]\}}|D ]
\}}}|| q0q*|  tdD ]}t|  | D ]\}	}
t|	|	d|
i}|
|g qMqDt }|| t|}W d   n1 s{w   Y  | D ]0\}	}
||	}dd |jD }|
D ]\}}}||f|v r|||f |ksJ  n|
rt|qqdS )a$  Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    heyhowdyz	hey therehellohizi'm looking for a place to eatz,i'm looking for a place in the north of town)   $   LOCATIONzshow me chinese restaurants)rE      CUISINEzshow me chines restaurants)rE      rq   ner   entitiesNc                 S   s   i | ]
}|j |jf|jqS rW   
start_charend_charlabel_.0entrW   rW   rX   
<dictcomp>       z!test_issue999.<locals>.<dictcomp>)r   add_pipe	add_label
initializerangerandomshuffler	   	from_dictmake_docupdater   to_diskr   rQ   	Exception)
TRAIN_DATArU   rs   _offsetsstartendlabelitnraw_textentity_offsetsexample	model_dirnlp2rV   rQ   rW   rW   rX   test_issue999Y   sR   	



r   i2  c                  C   s  dddddddddddd	ddd
dddddddddddddddddddddddg	g ddddddd	ddddddddddddddd d!ddd"d#ddd$dddd%dddd&dddd'dddd(d)ddgg dgd*d+d,d-d.d,gd/d0dd1dddd2dddd3ddd
d4dddd5dddd6dddd7dddd8dddd9ddddddg
g dddddgg dgd*d.d,d-d+d,gd/gd:} t  }g d;}t `}|d< }t| g}t||d= }|d>}|| W d    n	1 sw   Y  t|}t||}	t	|	dksJ g }
|	D ]
}|

|  qt	|
dks1J W d    d S 1 s=w   Y  d S )?Nr   zRHow should I cook bacon in an oven?
I've heard of people cooking bacon in an oven.Howr@   )idorthrs   rB   shouldr   Ir2   cookr4   baconr6   inr3   anr5   ovenrE   ?)tokensbrackets	   

      z've   heard   ofrr   peoplerp   cooking            rt   r$   bakingrF   )r   value
not_bakingrG   )raw	sentencesrT   z5What is the difference between white and brown eggs?
Whatr]   the
differencebetweenwhiteandbrowneggsr   
paragraphs)ORTH
SENT_STARTENT_IOBENT_TYPEztest4402.spacy)docsattrswb)r   r   r   r   to_bytesopenwriter   listrR   extendsplit_sents)	json_datarU   r   tmpdiroutput_filer   datafile_reader
train_datasplit_train_dataegrW   rW   rX   test_issue4402   s   




















")









F
$r   a  
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "*"
iu  c            	   	      s   ddg difddg difg} t tt}g  | D ]} t||d |d  q|j fdd	d
}t	dD ]}i }|j
 ||d q<g d}t|j|dd}t|j|dd}dd |dd D dd |dd D kswJ dS )z<Test that an empty document doesn't mess up an entire batch.zI like green eggsrK   )NVJr   zEat blue ham)r   r   r   r   rB   c                      s    S rh   rW   rW   train_examplesrW   rX   <lambda>      z test_issue7029.<locals>.<lambda>)get_examples2   sgdlosses)firstsecondthirdfourthr   thensomer1   )
batch_sizer4   c                 S   s   g | ]}|d  j qS )r   tag_)r{   rV   rW   rW   rX   
<listcomp>      z"test_issue7029.<locals>.<listcomp>N)r   from_configr   CONFIG_7029appendr	   r   r   r   r   r   r   pipe)	r   rU   t	optimizerir   textsdocs1docs2rW   r   rX   test_issue7029  s   "4r   c                 C   P   g d}g d}t | ||d}tdtddfg}t||}|g dks&J d S )N)r   r   r   r#   r$   )TTTFTrJ   re   
I flew to zI flew to LondonLOCr@   r@   r@   U-LOCr@   r   rR   r   en_vocabrJ   re   rV   ru   rK   rW   rW   rX   test_gold_biluo_U"     
r  c                 C   r   )N)r   r   r   San	Franciscor$   )TTTTFTr   r   I flew to San Franciscor   )r@   r@   r@   rC   L-LOCr@   r  r  rW   rW   rX   test_gold_biluo_BL+  r  r
  c                 C   r   )Nr   r   r   r  r  r!   r$   TTTTTFTr   r   I flew to San Francisco Valleyr   )r@   r@   r@   rC   rD   r	  r@   r  r  rW   rW   rX   test_gold_biluo_BIL4  r  r  c                 C   s|   g d}g d}t | ||d}tdtddftdtddfg}tt t|| W d    d S 1 s7w   Y  d S )Nr  r  r   r   r  r   r  )r   rR   pytestraises
ValueErrorr   )r  rJ   re   rV   ru   rW   rW   rX   test_gold_biluo_overlap=  s   "r  c                 C   sz   g d}g d}t | ||d}tdtddfg}tt t||}W d    n1 s.w   Y  |g dks;J d S )N)r   r   r   r  r  zValley.)TTTTTFr   r   r  r   )r@   r@   r@   -r  r  )r   rR   r  warnsUserWarningr   r  rW   rW   rX   test_gold_biluo_misalignI  s   r  c                    s|   g d}g d} fdd|D }t  |d}t  |d}|dtj|dd}t||}|jdd	d
}|g dks<J d S )Nr   likestuffr,   r-   r,   c                    s   g | ]} j |qS rW   )stringsadd)r{   tagr  rW   rX   r   V      z,test_example_constructor.<locals>.<listcomp>rJ   TAGuint64)dtypeT	as_string)r   
from_arraynumpyarrayr	   get_aligned)r  rJ   rK   tag_ids	predicted	referencer   rW   r  rX   test_example_constructorS  s   
r-  c                 C   sN   g d}g d}t | |d}t|d|i}|jddd}|g dks%J d S )Nr  r  r   TAGSr!  Tr$  )r   r	   r   r)  )r  rJ   rK   r+  r   rW   rW   rX   test_example_from_dict_tags_  s   r/  c                 C   sJ   g d}g d}t | ||d}t|d|i}| }|g dks#J d S )NabcdTTFTr   rJ   )NNNNr   r	   r   get_aligned_nerr  rJ   re   r+  r   ner_tagsrW   rW   rX   test_example_from_dict_no_nerh  s   r:  c                 C   sP   g d}g d}t | ||d}t||g dd}| }|g dks&J d S )Nr0  r5  r   )r   NNNrJ   ru   r6  r8  rW   rW   rX   test_example_from_dict_some_nerq  s   r<  zignore::UserWarningc                 C   s   dddddddddddd	dd
ddddddddddddddgigigdg}t t|}t|dks8J |D ]	}|drCJ q:|D ]	}|jdksOJ qFtt|jdd |D dd |D d|}| }|g dksqJ d S )NrB   r   r   nnr%   zMs.)depheadr  r   r9   Haagr:   r   VBZplaysdobjr   Eliantir>   r$   r   r   c                 S      g | ]}|j qS rW   textr{   wrW   rW   rX   r         z,test_json_to_docs_no_ner.<locals>.<listcomp>c                 S      g | ]}t |jqS rW   )boolwhitespace_rI  rW   rW   rX   r     r   r   )NNNNN)	r   r   rR   has_annotationent_iobr	   r   rS   r7  )r  r   r   rV   tokenr   r9  rW   rW   rX   test_json_to_docs_no_ner|  sV   #rR  c                 C   s  g d}g d}g d}t | |d}t|||d}|jdks"J | }t|dks.J |d jd	ks7J |d
 jdks@J g d}g d}g d}t | |d}t|||d}|jdksbJ | }t|dksnJ |d jd	kswJ |d
 jdksJ d S )N)r   r   r   San Francisco Valleyhadzloads of fun)
r   r   r   r  r  r!   rT  loadsr   fun)
TFFFFFTFFFr   )rJ   rf   z0I flew to San Francisco Valley had loads of fun r   r   zI flew to San Francisco Valley rB   zhad loads of fun )	r   r   r   r  r  r!   rT  rU  zof fun)r   r   r   San Franciscor!   rT  zloads ofrV  )TFFFFTFF)r   r	   r   rH  r   rR   )r  rJ   
gold_wordsrf   rV   r   split_examplesrW   rW   rX   test_split_sentences  s(   rZ  c           
      C   s$  g d}g d}t | ||d}d}t|t|d dfg}g d}t|||d}| }	|	g d	ks6J td
tddft|t|d dfg}g d}t|||d}| }	|	g dkscJ td
tddft|t|d dfg}g d}t|||d}| }	|	g dksJ d S )N)Mr and 	Mrs Smithflew torS  r$   TTTFFr   Mr and Mrs Smith flew to rS  r   )Mr and Mrs Smithr   r   r  r  r!   r$   r;  r   r[  r`  PERSON	Mr andMrsSmithr   r   r  r  r!   r$   )r@   U-PERSONr@   r   r@   
Mr and Mrs)r@   Nr@   r   r@   r   rR   r	   r   r7  
r  en_tokenizerrJ   re   rV   prefixru   rX  r   r9  rW   rW   rX   test_gold_biluo_one_to_many  s.   rl  c                 C   s   g d}g d}t | ||d}d}t|t|d dfg}g d}t|||d}| }	|	g d	ks6J td
tddft|t|d dfg}g d}t|||d}| }	g d}
|	|
kseJ d S )Nrb  )	TTTTTTTFFr   r_  rS  r   )r`  r]  rS  r$   r;  )	r@   r@   r@   r@   r@   rC   rD   r	  r@   r[  r`  ra  )rc  r\  r]  rS  r$   )	r@   rA   zL-PERSONr@   r@   rC   rD   r	  r@   rh  )r  rj  rJ   re   rV   rk  ru   rX  r   r9  expectedrW   rW   rX   test_gold_biluo_many_to_one  s"   rn  c           
      C   s   g d}g d}t | ||d}d}t|t|d dfg}g d}t|||d}| }	|	g d	ks6J td
tddft|t|d dfg}g d}t|||d}| }	|	g dkscJ d S )N)rg  re  r   r   rW  r!   r$   )TTTTTFFr   r_  rS  r   )Mrzand Mrs Smithr]  r  Francisco Valleyr$   r;  r@   r@   r@   r@   rC   r	  r@   r[  r`  ra  )rc  r\  r]  r  rp  r$   )NNr@   r@   rC   r	  r@   rh  ri  rW   rW   rX   test_gold_biluo_misaligned  s    rr  c                 C   s|   t g dd\}}t| ||d}d}t|t|d dfg}g d}g d}t||||d	}	|	 }
|
g d
ks<J d S )N)r   r   r   rW  r!   r$   z I flew  to San Francisco Valley.r   zI flew  to rS  r   )r   r    r   rS  r$   )TTFTFFrJ   re   ru   rq  )r   r   rR   r	   r   r7  )r  rj  rJ   re   rV   rk  ru   rX  gold_spacesr   r9  rW   rW   rX   %test_gold_biluo_additional_whitespace  s   rv  c                 C   s   |d}g d}g d}dg}t ||||d}| }|g dks%J |d}g d}g d}dg}t ||||d}| }|g d	ksJJ d S )
NzI'll return the A54 amount)r   'llreturnr   A54amount)FTTTFTF)r   r   MONEYrt  )r@   r@   r@   r@   zU-MONEYr@   zI'll return the $54 amount)r   rw  rx  r   $rz  r{  )r@   r@   r@   r@   zB-MONEYzL-MONEYr@   )r	   r   r7  )r  rj  rV   rX  ru  ru   r   r9  rW   rW   rX   test_gold_biluo_4791-  s$   r~  c                 C   sZ   d}g d}ddg}| |}t ||}||ksJ t||}dd |D }||ks+J d S )N$I flew to Silicon Valley via London.r@   r@   r@   rC   r	  r@   U-GPEr@   )r      r   )   #   GPEc                 S   s   g | ]}|d  r|qS )r   rW   rz   rW   rW   rX   r   K  r  z;test_roundtrip_offsets_biluo_conversion.<locals>.<listcomp>)r   r
   )rj  rH  
biluo_tagsr   rV   biluo_tags_convertedoffsets_convertedrW   rW   rX   'test_roundtrip_offsets_biluo_conversionC  s   

r  c                 C   s   | d}g d}t ||}dd |D }t|dksJ |d jdks%J |d jdks.J |d	 jd
ks7J |d	 jdks@J d S )Nr  r  c                 S   s   g | ]}|j r|qS rW   )ry   )r{   spanrW   rW   rX   r   S  r   z$test_biluo_spans.<locals>.<listcomp>r   r   zSilicon Valleyr   rB   r#   r  )r   rR   rH  ry   )rj  rV   r  spansrW   rW   rX   test_biluo_spansO  s   
r  c                 C   s   g d}g d}t | ||d}d}dtddft|t|d d	fg}g d
}t|||d}|jj}	dd |	D ddgksAJ ||	}
dd |
D ddgksSJ d S )N)r`  r   r   rS  r$   r^  r   r_  r   r`  ra  rS  r   )
ro  r   rd  re  r   r   r  r  r!   r$   r;  c                 S      g | ]}|j |jfqS rW   r   r   rz   rW   rW   rX   r   i  r  z*test_aligned_spans_y2x.<locals>.<listcomp>r   r4   r3   r   c                 S   r  rW   r  rz   rW   rW   rX   r   k  r  )r   rB   )r2   r4   )r   rR   r	   r   r,  rQ   get_aligned_spans_y2x)r  rj  rJ   re   rV   rk  ru   
tokens_refr   ents_refents_y2xrW   rW   rX   test_aligned_spans_y2x[  s   
r  c                 C   s   d}t  }ddddddg}|d}|| ||}dd	 |jD d
dgks+J d}dtddft|t|d dfg}g d}	t||	|d}
dd	 |
jjD ddgks[J |
jj}dd	 |D d
dgkslJ |
	|}dd	 |D ddgks~J d S )Nz-Mr and Mrs Smith flew to San Francisco Valleyra  r`  )r   patternr   rS  entity_rulerc                 S   r  rW   r  rz   rW   rW   rX   r   x  r  z*test_aligned_spans_x2y.<locals>.<listcomp>r  r  r_  r   )rg  re  r   r   rW  r!   r;  c                 S   r  rW   r  rz   rW   rW   rX   r     r  )r   r   )r4   r3   c                 S   r  rW   r  rz   rW   rW   rX   r     r  c                 S   r  rW   r  rz   rW   rW   rX   r     r  )
r   r   add_patternsrQ   rR   r	   r   r,  r+  get_aligned_spans_x2y)r  rj  rH  rU   patternsrulerrV   rk  ru   r  r   	ents_predents_x2yrW   rW   rX   test_aligned_spans_x2yn  s(   


r  c                 C   s   d}t  }||}||}g }d}||jt|t|d dd ||jt|t|d dd d}||j|< t||}	|	jj| }
d	d
 |
D ddgksSJ |	j|
dd}dd
 |D dgksfJ |	j|
dd}dd
 |D ddgkszJ d S )Nr  r   rW  CITY)r   rS  VALLEYoverlap_entsc                 S   r  rW   r  rz   rW   rW   rX   r     r  z2test_aligned_spans_y2x_overlap.<locals>.<listcomp>)r2   r6   )r2   r3   F)allow_overlapc                 S   r  rW   r  rz   rW   rW   rX   r     r  Tc                 S   r  rW   r  rz   rW   rW   rX   r     r  )	r   r   r   	char_spanrR   r  r	   r,  r  )r  rj  rH  rU   rV   gold_docr  rk  	spans_keyr   
spans_goldspans_y2x_no_overlapspans_y2x_overlaprW   rW   rX   test_aligned_spans_y2x_overlap  s2   


r  c                 C   s:   | d}g d}t |d|i}|dg dksJ d S )Nr  )Nr@   r@   rC   r	  r@   r  r@   ru   r   )r   r   r   r2   rB   r   r2   r   )r	   r   r)  )rj  rV   r  r   rW   rW   rX   test_gold_ner_missing_tags  s   r  c                 C   s8  | d}g d}dgt | }t|||d}|jdd\}}|jdd\}}|g dks0J |g dks8J | d	}d
g}dg}t|||d}|jdd\}}||ksYJ ||ks_J t|jdgdgdgd
gd}	t|jg dg dg dg dd}
t|	|
}|jdd\}}|d gksJ |d gksJ d S )NzHe pretty quickly walks away)r2   r   r2   r2   r   r>  )rN   rO   T)projectivizeF)r2   r   r2   r2   r2   Conrailr   zDouble-Jointedr:   )rJ   re   rO   rN   )Doubler  Jointed)TTT)amodr>   r:   )r   r   r   )rR   r	   r   get_aligned_parser   rS   )rj  rV   rN   rO   r   
proj_headsproj_labelsnonproj_headsnonproj_labelsdoc_adoc_b	proj_depsrW   rW   rX   test_projectivize  s:   
r  c                  C   sd   g d} g d}g d}t | }||ksJ tt t | W d    d S 1 s+w   Y  d S )N)r@   r@   rC   rD   r@   rA   )r@   r@   rC   r	  r@   rf  )r@   r@   "rC   rD   )r   r  r  r  )good_iob
good_biluobad_iobconverted_biluorW   rW   rX   test_iob_to_biluo  s   
"r  c                 C   sD  | j }dd | D }dd | D }dd | D }dd | D }dd | D }dd | D }dd | D }| j}	d	d | jD }
t /}t }|d
 }t|t| g |d }t| gd	| t
|}t||}W d    n1 stw   Y  t| tdd |D ksJ |d }||jj ksJ |dd |jD ksJ |dd |jD ksJ |dd |jD ksJ |dd |jD ksJ |dd |jD ksJ |dd |jD ksJ |dd |jD ksJ |
dd |jjD ksJ d|jjv sJ d|jjv sJ |	d |jjd ksJ |	d |jjd ks J d S )Nc                 S   rF  rW   idxr{   r   rW   rW   rX   r     rK  z1test_roundtrip_docs_to_docbin.<locals>.<listcomp>c                 S   rF  rW   r   r  rW   rW   rX   r     rK  c                 S   rF  rW   pos_r  rW   rW   rX   r     rK  c                 S   rL  rW   strmorphr  rW   rW   rX   r     r   c                 S   rF  rW   lemma_r  rW   rW   rX   r     rK  c                 S   rF  rW   dep_r  rW   rW   rX   r     rK  c                 S      g | ]}|j jqS rW   r?  r   r  rW   rW   rX   r         c                 S      g | ]
}|j |j|jfqS rW   rv   r{   erW   rW   rX   r     r~   zroundtrip.jsonzroundtrip.spacy)r   c                 s   s    | ]}t |V  qd S rh   )rR   )r{   r   rW   rW   rX   	<genexpr>  s    z0test_roundtrip_docs_to_docbin.<locals>.<genexpr>r   c                 S   rF  rW   r  r  rW   rW   rX   r     rK  c                 S   rF  rW   r   r  rW   rW   rX   r     rK  c                 S   rF  rW   r  r  rW   rW   rX   r     rK  c                 S   rL  rW   r  r  rW   rW   rX   r     r   c                 S   rF  rW   r  r  rW   rW   rX   r     rK  c                 S   rF  rW   r  r  rW   rW   rX   r     rK  c                 S   r  rW   r  r  rW   rW   rX   r     r  c                 S   r  rW   rv   r  rW   rW   rX   r     s    rH   rI   )rH  rT   rQ   r   r   srsly
write_jsonr   r   r   r   r   rR   sumr,  )rV   rH  r  rK   rL   rM   rP   rO   rN   rT   rQ   r   reloaded_nlp	json_filer   r   reloaded_examplesreloaded_examplerW   rW   rX   test_roundtrip_docs_to_docbin  sH   	r  c                 C   s   d| j d< t }t &}|d }t| gdd| t ||j}t|d }W d    n1 s4w   Y  |j d dksBJ d S )NTcheckuserdata.spacyr   store_user_datar   )		user_datar   r   r   r   	from_diskget_docsrS   r   rV   rU   r   r   reloaded_docsreloaded_docrW   rW   rX    test_docbin_user_data_serialized  s   
r  c                 C   s   t  | jd< t }t &}|d }t| gdd| t ||j}t	|d }W d    n1 s5w   Y  d|jvsAJ d S )Nr  r  Fr  r   )
setr  r   r   r   r   r  r  rS   r   r  rW   rW   rX   $test_docbin_user_data_not_serialized  s   r  ztokens_a,tokens_b,expected)r1  r2  r3  abr3  rB   )r1  r2  r  r3  zab"r   rB   r   r2   r1  bc)r  r3  r4  )r1  r2  cdr0  rs  )r1  ''',)za'r  r  c                 C   s@   t | |\}}||f|ksJ t || \}}||f|ksJ d S rh   r   )tokens_atokens_brm  a2bb2arW   rW   rX   
test_align  s   r  c           	      C   sp   d}| |}dg}dg}dg}dg}t |||||d}| }|ddgks)J |jdd	d
d dgks6J d S )Nz ar1  U-DATEr:   r   rJ   ru   rO   rN   r@   DEPTr$  r	   r   r7  r)  	rj  rH  rV   rX  ru   rO   rN   r   r9  rW   rW   rX   test_goldparse_startswith_spaceC     r  c           	      C   sp   d}| |}dg}dg}dg}dg}t |||||d}| }|ddgks)J |jdd	d
dd gks6J d S )Nza
r1  r  r:   r   r  r@   r  Tr$  r  r  rW   rW   rX   test_goldparse_endswith_spaceR  r  r  c                  C   sb   t  } | d}t|ddddi}|jdddg d	ksJ |jjd
 s'J |jjd r/J dS )z,Test that the Example constructor works finezThis is a sentencerT   rF   rG   )cat1cat2r   Tr$  )Thisr]   r1  sentencer  r  N)r   r	   r   r)  r,  rT   )rU   rV   r   rW   rW   rX   test_gold_constructora  s   r  c                  C   s6   dddgifddddgifdddgifg} t |  d	S )
zTest tuple format#Uber blew through $1 million a weekru   r   r4   ORGSpotify steps up Asia expansionr   r5   r  r      r   !Google rebrands its business appsr   r3   r  N)_train_tuplesr   rW   rW   rX   test_tuple_format_implicitp  s   
	r  c                  C   sb   dddgifddddgifddd	gifg} t t t|  W d
   d
S 1 s*w   Y  d
S )z:Test that an error is thrown for an implicit invalid fieldr   frumbler  r  ru   r  r  r  r  N)r  r  KeyErrorr	  r
  rW   rW   rX   "test_tuple_format_implicit_invalid  s   

"r  c           
   	   C   s   t  }|d}|d |d g }| D ]}|t||d |d  q| }tdD ]}i }t	|t
ddd	d
}|D ]
}	|j|	||d qAq1d S )Nrs   r  r   r   rB   r6   g      @g      @@gjt?)sizer   )r   r   r   r   r	   r   r   r   r   r   r   r   )
r   rU   rs   r   r   r   r   r   batchesbatchrW   rW   rX   r	    s   


"r	  c                 C   s  t  }tt|j| d | d d| }|jdksJ | }t|dks&J |d jdks/J |d jd	ks8J |d  d
 }|d g dksJJ |d g dksTJ |d g dks^J |d  d
 }|d g dkspJ |d g dkszJ |d g dksJ d S )NrJ   re   r   zHi there everyone It is just mer   r   zHi there everyone rB   zIt is just metoken_annotationr   )rY   rZ   r[   r!  )r`   ra   rb   r   )rB   r   r   )r\   r]   r^   r_   )rb   rc   ra   rb   )rB   r   r   r   )	r   r	   r   r   rS   rH  r   rR   to_dict)rg   rU   r   rY  token_annotation_1token_annotation_2rW   rW   rX   test_split_sents  s$   r  c                  C      g d} g d}t | |}t|jjg dksJ t|jjg dks&J t|jjg dks2J t|jjg dks>J d S )N)r   listenedr   obamar  spodcastsr$   r   r  r   r  r   r  r$   rB   rB   rB   rB   rB   rB   rB   rB   r   rB   r   r2   r4   r4   r6   r3   rB   rB   rB   rB   r   rB   rB   r   rB   r   r2   r4   r6   r3   r5   r   from_stringsr   x2ylengthsr   y2xother_tokensspacy_tokensalignrW   rW   rX   test_alignment     r*  c                  C   s  t g ddgg g dddgg} t| jg dksJ t| jg dks&J t| d g dks2J t| d g ks<J t| d	 g dksHJ t| d
d g dksVJ t| d
d  g dksdJ t| d d g dksrJ t| d d  t| jksJ t| dd g ksJ t| dd g ksJ t| dd g ksJ tjtdd | d dd  W d    n1 sw   Y  tjtdd | g d  W d    n1 sw   Y  t g g dddgg} t| d g ksJ t| dd
 g ksJ t| d ddgksJ t| dd g dksJ t g dddgg g} t| d g ks4J t| d	d  ddgksCJ d S )Nr  r2   )r4   r6   r3   r5   rE   r   )
r   rB   r   r2   r4   r6   r3   r5   rE   r   )r2   rB   r   r4   r   r   rE  rB   r4   )r2   r4   r6   r3   r5   )r2   r4   r6   r3   r5   rE   r   )r   rB   r   r2   r   r   z&only supports slicing with a step of 1matchz.only supports indexing using an int or a slicer   rB   r2   )rB   r   r2   r6   )r   r   r   r$  r  r  r  )r1  rW   rW   rX   test_alignment_array  s:    "r/  c                  C   r  )N)r   r  r   r  r  r  r  r$   )r   r  r   Obamar   PODCASTSr$   r  r  r  r   r!  r&  rW   rW   rX   test_alignment_case_insensitive  r+  r2  c                  C   r  )Ni listened tor  r  r  r  r$   r   r  r   r  r   	podcasts.r2   rB   rB   rB   rB   rB   r   rB   r   r2   r4   r4   r6   r6   rB   rB   rB   rB   r   r   r   r   r   rB   r   r2   r4   r6   r!  r&  rW   rW   rX   test_alignment_complex  r+  r;  c                 C   s   g d}g d}t | |g dd}t | |g dd}|jdks!J |jdks(J t||}|j}t|jjg dks<J t|jjg dksHJ t|jjg d	ksTJ t|jjg d
ks`J d S )Nr3  r5  )TFFTFFr   )TTTFTFzi listened to obama's podcasts.r7  r8  r9  r:  )	r   rH  r	   	alignmentr   r#  r$  r   r%  )r  r'  r(  r+  r,  r   r)  rW   rW   rX   test_alignment_complex_example  s    


r=  c                  C   sL   g d} g d}t t t| | W d    d S 1 sw   Y  d S )N)sher  r   r  r   r  r$   r  )r  r  r  r   r"  )r'  r(  rW   rW   rX   test_alignment_different_texts  s
   "r?  c                 C   s  g d}g d}t ||}t|jjg dksJ t|jjg dks&J t|jjg dks2J t|jjg dks>J g d}g d}t ||}t|jjg dksXJ t|jjg dksdJ t|jjg dkspJ t|jjg d	ks|J g d}g d
}t ||}t|jjg dksJ t|jjg dksJ t|jjg dksJ t|jjg dksJ g d}g d}t ||}t|jjg dksJ t|jjg dksJ t|jjg dksJ t|jjg dksJ g d}g d}t ||}t|jjg dksJ t|jjg dks J t|jjg dks-J t|jjg dks:J g d}g d}t ||}t|jjg dksUJ t|jjg dksbJ t|jjg dksoJ t|jjg dks|J g d}g d}t ||}t|jjg dksJ t|jjg dksJ t|jjg d ksJ t|jjg d!ksJ g d"}g d#}t ||}t|jjg d$ksJ t|jjg d%ksJ d&d'g}g d(}t ||}d'd&g}d'd)g}t ||}d S )*N)rs  r4  r  r  r  r  r$   r5  )r   r2   rB   rB   rB   rB   rB   r8  r9  )rB   rB   rB   r   r2   r4   r6   r3   )rs  rs  r4  r  r  r  r  r$   )r   r   r2   rB   rB   rB   rB   rB   )r   r   r   r2   r4   r6   r3   r5   )rs  r   r  r   r  r   r6  )rB   r   r2   rB   rB   rB   rB   rB   )	r   rB   r   r2   r4   r6   r6   r3   r3   )rB   rB   rB   rB   rB   r   r   )	r   r   r   r   r2   r4   r6   r3   r5   )  r   r  r   r  r   r6  )rB   rB   r2   rB   rB   rB   rB   rB   )
r   r   rB   r   r2   r4   r6   r6   r3   r3   )r   rB   rB   rB   rB   r   r   )
r   rB   r   r   r   r2   r4   r6   r3   r5   )r4  r  r  r  r  r$   rs  )r2   rB   rB   rB   rB   rB   r   r:  )r4  r  r  r  r  r$   rs  rs  )r   r  r   r  r   r6  rs  )r2   rB   rB   rB   rB   rB   rB   r   )	r   rB   r   r2   r4   r4   r6   r6   r3   )rB   rB   rB   rB   r   r   rB   )	r   r   r   rB   r   r2   r4   r6   r3   )r   r  r   r  r   r6  r@  )r2   rB   rB   rB   rB   rB   rB   rB   )
r   rB   r   r2   r4   r4   r6   r6   r3   r3   )rB   rB   rB   rB   r   r   r   )
r   r   r   rB   r   r2   r4   r6   r3   r5   )r1  z 
 r2  r3  )r1  r2  rs  r3  r.  )r   r   r2   rs  r1  )r@  r1  rs  r@  r!  )r  r'  r(  r)  rW   rW   rX   test_alignment_spaces
  sx   rA  c                 C   s   |  dg}t| jdd | D ddg|}t| jdd | D ddg|}t||}g d}g d}|jdd	d
|ks@J | }||dd  ||dd  W d    n1 saw   Y  |jdd	d
|ksqJ d S )Nr!  c                 S   rF  rW   rG  r  rW   rW   rX   r   ]  rK  z)test_retokenized_docs.<locals>.<listcomp>r   c                 S   rF  rW   rG  r  rW   rW   rX   r   ^  rK  r   )Nr   r   r   Nr"   r#   r$   r   Tr$  r   r   r6   r5   )to_arrayr   rS   r&  r	   r)  
retokenizemerge)rV   r1  doc1doc2r   	expected1	expected2retokenizerrW   rW   rX   test_retokenized_docs[  s   ""

rJ  c                    s   dd } fdd}t d}|d t }t||| dd d	d
ddd
g g |d}tjtdd |D ]}q4W d    d S 1 sBw   Y  d S )Nc                 S   s(   |d dksJ |d dksJ t d)Nstepr   epochrB   ran_before_update)r  )rU   argsrW   rW   rX   before_updatel  s   z2test_training_before_update.<locals>.before_updatec                   3   s    dt   gfV  d S )NrB   )r	   rW   rV   rW   rX   generate_batchu  s   z3test_training_before_update.<locals>.generate_batchentaggerc                   S   s   d S rh   rW   rW   rW   rW   rX   r     r   z-test_training_before_update.<locals>.<lambda>g?d   r   )dropouteval_frequencyaccumulate_gradientpatience	max_stepsexcludeannotating_componentsrO  rM  r,  )spacyblankr   r   r   r  r  r  )rV   rO  rQ  rU   r   	generatorr   rW   rP  rX   test_training_before_updatek  s0   	

"r_  )]r   r'  r  r  	thinc.apir   r   r\  spacy.lang.enr   spacy.tokensr   r   spacy.trainingr   r   r	   r
   r   r   r   r   spacy.training.alignr   spacy.training.alignment_arrayr   spacy.training.convertersr   spacy.training.loopr   
spacy.utilr   r   r   r   utilr   fixturerV   rg   rS   markissuer   r   r   r   r  r
  r  r  r  r-  r/  r:  r<  filterwarningsrR  rZ  rl  rn  rr  rv  r~  r  r  r  r  r  r  r  r  r  r  r  parametrizer  r  r  r  r  r  r	  r  r*  r/  r2  r;  r=  r?  rA  rJ  r_  rW   rW   rW   rX   <module>   s    (

$





0
Y
)
			
		

6"$
(* ("$(

 

Q