o
    iU                     @   s2  d dl Z d dlZd dlmZmZ d dlmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ e ZdZddgZddedgiifddeddgiifgZddedgiifddeg diifddeg iifgZefddZej dedd Z!ej dedd Z"ej dedd Z#ej ded d! Z$ejj%d"d#d$d% Z&ej d&g d'd(d) Z'ej d*g d+d,d- Z(d.d/ Z)d0d1 Z*d2d3 Z+d4d5 Z,d6d7 Z-d8d9 Z.ej ded:d; Z/ej ded<d= Z0ej deej d>d?d@gdAdB Z1dS )C    N)assert_almost_equalassert_array_equal)NumpyOpsRaggedget_current_ops)util)English)Language)	SpanGroup)
SpanGroups)Example)fix_random_seedmake_tempdirregistrylabeled_spansspancatspancat_singlelabelzWho is Shaka Khan?spans)      PERSONzI like London and Berlin.r      LOC      r   I like London and Berlin)r   r   )r   r   
DOUBLE_LOC c                 C   s6   g }|D ]}t | |d |d }|| q|S )Nr      )r   	from_dictmake_docappend)nlpdatatrain_examplesteg r)   U/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/pipeline/test_spancat.pymake_examples&   s
   r+   namec                 C   sP   t  }|j| dtid tt |  W d    d S 1 s!w   Y  d S )N	spans_keyconfig)r	   add_pipeSPAN_KEYpytestraises
ValueError
initialize)r,   r$   r)   r)   r*   test_no_label.   s
   
"r6   c                 C   s   t  }|j| dtid}|d |d |jdksJ |  |jd|jks,J t	
t |d W d    d S 1 sBw   Y  d S )Nr-   r.   ThingPhrase)r7   r8   nOStuff)r	   r0   r1   	add_labellabelsr5   modelget_dim	_n_labelsr2   r3   r4   r,   r$   r   r)   r)   r*   test_no_resize6   s   

"rA   c                    sX   t  }|j| dtid}t|jdksJ t| |j fddd |jdks*J d S )Nr-   r.   r   c                          S Nr)   r)   r&   r)   r*   <lambda>J       z&test_implicit_labels.<locals>.<lambda>get_examplesr   r   )r	   r0   r1   lenr<   r+   r5   r@   r)   rD   r*   test_implicit_labelsD   s   rK   c                 C   sX   t  }|j| dtid}t|jdksJ |d |d |  |jdks*J d S )Nr-   r.   r   r   r   rI   )r	   r0   r1   rJ   r<   r;   r5   r@   r)   r)   r*   test_explicit_labelsN   s   

rL   z%Test is unreliable for unknown reason)reasonc               
   C   s   t  } | jddtid}|d |   g d}dd | |D }t||D ]<\}}t|ts3J |	 D ],\}}t|t
sBJ t|dksJJ tt |d  W d    n1 s^w   Y  q7q(d S )	Nr   r-   r.   r   zJust a sentence.r   zI like Berlinz
I eat ham.c                 S      g | ]}|j qS r)   )r   .0docr)   r)   r*   
<listcomp>g       ztest_doc_gc.<locals>.<listcomp>r   )r	   r0   r1   r;   r5   pipezip
isinstancer   itemsr
   rJ   r2   r3   RuntimeError)r$   r   texts	all_spanstext
spangroupskey	spangroupr)   r)   r*   test_doc_gcZ   s"   

r`   zmax_positive,nr_results))N   )r       )rb      )rc   ra   )ra   ra   c                 C   s2  t d t }|jdtd| dd}|d}tjddd	gd
}||gd j}t	t
|tddgdd	gdd	gg g d}|D ]}|| qDtjg dg dg dgdd}	||||	}
t|
|kskJ |
d jdkstJ |
d jdks}J td|
jd d d |
d jdksJ | dkr|
d jdksJ td|
jd d d n|
d jdksJ td|
jd d d |d	kr|
d	 jdksJ | d	kr|
d	 jdksJ td|
jd d	 d n|
d	 jdksJ td|
jd d	 d |
d jdksJ |
d jdksJ td|
jd d d d S )Nr   r         ?r-   	thresholdmax_positiver.   Greater Londonspacy.ngram_suggester.v1r    rb   sizesr7   CityPerson	GreatCity皙?皙?333333?皙?rt   333333?rq   rr   皙?ffffff?rs   ?fdtypeLondonrm   rv   scores   ro   rz   r7   rx   ry   )r   r	   r0   r1   r"   r   miscgetdataXdr   OPSto_numpynumpyasarrayr;   _make_span_group_multilabelrJ   r\   label_r   attrs)rg   
nr_resultsr$   r   rR   ngram_suggesterindicesr<   labelr   r_   r)   r)   r*   test_make_spangroup_multilabelr   sJ   

(r   z"threshold,allow_overlap,nr_results))皙?Trc   )r   Fr    )rd   Trb   )rd   Fr    c                 C   s2  t d t }|jdt| ddd}|d}tjdddgd	}||gd j}t	t
|tddgddgddgg g d
}|D ]}	||	 qDtjg dg dg dgdd}
||||
|}| dkr|r|d jdkssJ |d jdks|J td|jd d d |d jdksJ |d jdksJ |jd d dksJ td|jd d d d S |d jdksJ |d jdksJ |jd d dksJ d S |r|d jdksJ |d jdksJ |d jdksJ |d jdksJ |d jdksJ |d jdksJ d S |d jdksJ d S )Nr   r   r    re   r.   rh   ri   rb   rj   rl   rp   ru   rw   r{   r|   rr   r~   rm   rv   r   r   ro   rz   Greater)r   r	   r0   r1   r"   r   r   r   r   r   r   r   r   r   r;   _make_span_group_singlelabelr\   r   r   r   )rf   allow_overlapr   r$   r   rR   r   r   r<   r   r   r_   r)   r)   r*   test_make_spangroup_singlelabel   sT   
(r   c                  C   s6  t d t } t }| jdtdddd}|jdtdddd}d|_d|_| d	}g d
}|D ]}|| || q1tj	dddgd}||gd j
}tt|tddgddgddgg tjg dg dg dgdd}	||||	}
||||	}t|dksJ |d jdksJ |d jdksJ td|jd d d |d jd	ksJ |d jdksJ |jd d dksJ td|jd d d t|
dksJ |
d jdksJ |
d jdksJ td|
jd d d |
d jdksJ |
d jdksJ td|
jd d d |
d jdksJ |
d jdks'J td|
jd d d |
d jdks<J |
d jdksFJ td|
jd d d |
d jd	ks[J |
d jd kseJ |
d jd	ksoJ td!|
jd d d |
d jd	ksJ |
d jdksJ td|
jd d d d S )"Nr   r   rt   r    re   r.   rb   Trh   rl   ri   rj   )rq   rr   rs   rt   rt   )rt   rv   rq   rr   rz   )rx   ry   rs   rz   rt   r{   r|   r   rm   rr   r   r   ro   rz      rn   rs   r~   rv   rc   ra   r7   rx   )r   r	   r0   r1   add_negative_labelr"   r;   r   r   r   r   r   r   r   r   r   r   r   rJ   r\   r   r   r   )
nlp_single	nlp_multispancat_singlespancat_multirR   r<   r   r   r   r   spangroup_multispangroup_singler)   r)   r*   "test_make_spangroup_negative_label   s   

(r   c              	      s,  dD ]t jdgd} fdddD }||}|jD ]}|d |d  ks,J qd}t|D ]]\}}|j|||j|   }t }	|D ]5}
d|
d   krYt|k s\J  J d|
d   k rkt|ksnJ  J |		t
|
d t
|
d f qH|jd t|	ksJ ||j| 7 }q3tt|jfd	d|D  qt jdg dd} fd
ddD }||}tt|jg d tt|jg ddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddgddg t jddgd} fdddD }||}tt|jdd |D  t jddgd} fdddD }||}tt|jdd |D  d S )Nr    rb   rc   ri   rj   c                       g | ]} |qS r)   r)   rQ   r\   en_tokenizerr)   r*   rS   %  s    z(test_ngram_suggester.<locals>.<listcomp>)aa ba b ca b c d	a b c d eza a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a r    r   c                    s"   g | ]}t d t| d  qS r   r    )maxrJ   rP   )sizer)   r*   rS   C  s   " c                    r   r)   r)   r   r   r)   r*   rS   H      r   r   r   r   r   r    rc   r   	      rb   rc   ra   r   c                    r   r)   r)   r   r   r)   r*   rS   y      )r   r   r   c                 S      g | ]}t |qS r)   rJ   rP   r)   r)   r*   rS   {  r   c                    r   r)   r)   r   r   r)   r*   rS     r   )r   r   r   c                 S   r   r)   r   rP   r)   r)   r*   rS     r   )r   r   r   r%   	enumerater   lengthssetrJ   addintshaper   r   r   )r   r   docsngramssoffsetirR   r   	spans_setspanr)   )r   r   r*   test_ngram_suggester!  s   

$$ 


	
 !"#$+ r   c                    s   t jdg dd}t jd}|ddd} fdd	d
D }||}||}tt|jg d tt|jt|j tt|jt|j t jd}|ddd}||}tt|jg d d S )Nri   r   rj   zspacy.ngram_range_suggester.v1r    rc   )min_sizemax_sizec                    r   r)   r)   r   r   r)   r*   rS     r   z$test_ngram_sizes.<locals>.<listcomp>r   r   rb   ra   )r   r    rc   r   r   )r   r   r   r   r   r   r   r%   )r   size_suggestersuggester_factoryrange_suggesterr   ngrams_1ngrams_2ngrams_3r)   r   r*   test_ngram_sizes  s   
r   c                  C   s   t  } | d| dg}|d dd g|d jt< |d dd |d dd g|d jt< tjdtd	}||}t|tksCJ t|d
ksKJ t	|j
d ddgksXJ t	|j
d ddgkseJ t	|j
d
 ddgksrJ t	|jdd
gks}J d S )NzThis is an example.zThis is the second example.r   rc   ra   r    r   zspacy.preset_spans_suggester.v1)r-   rb   )r	   r   r1   r   r   r   typer   rJ   listr   r   )r$   r   	suggester
candidatesr)   r)   r*   test_preset_spans_suggester  s   *r   c                     s  t d t } | jddtid}t|  | j fddd}|jdd	ks(J t|j	d
dhks3J t
dD ]}i }| j ||d q7|d dk sLJ d}| |}|j|j |jt ks_J |jt }t|d	kslJ t|jd d	kswJ t|jd dksJ tdd |D ddhksJ tdd |D d
hksJ t V}| | t|}	|	|}
|
jt }t|d	ksJ t|jd d	ksJ t|jd dksJ tdd |D ddhksJ tdd |D d
hksJ W d    n1 sw   Y  |  }dt d|v sJ |dt d dksJ |dt d dks*J |dt d dks7J | d}t|j|j dksHJ d S )Nr   r   r-   r.   c                      rB   rC   r)   r)   rD   r)   r*   rE     rF   z%test_overfitting_IO.<locals>.<lambda>rG   r9   rb   r   r   2   sgdlosses{Gz?r   r   rx   c                 S   rO   r)   r\   rQ   r   r)   r)   r*   rS     rT   z'test_overfitting_IO.<locals>.<listcomp>r~   Berlinc                 S   rO   r)   r   r   r)   r)   r*   rS     rT   c                 S   rO   r)   r   r   r)   r)   r*   rS     rT   c                 S   rO   r)   r   r   r)   r)   r*   rS     rT   spans__f_pg      ?_rr    )r   r   r0   r1   r+   r5   r=   r>   r   r<   rangeupdater   r^   rJ   r   minr   to_diskr   load_model_from_pathevaluate)r$   r   	optimizerr   r   	test_textrR   r   tmp_dirnlp2doc2spans2r   r)   rD   r*   test_overfitting_IO  sL   




r   c                     s  t d t } | jddtid}t| td | j fddd}|jd	d
ks*J t	|j
h dks5J tdD ]}i }| j ||d q9|d dk sNJ d}| |}|jt }t|d
ksaJ t|jd d
kslJ t|jd dkswJ t	dd |D h dksJ t	dd |D ddhksJ t X}| | t|}	|	|}
|
jt }t|d
ksJ t|jd d
ksJ t|jd dksJ t	dd |D h dksJ t	dd |D ddhksJ W d    d S 1 sw   Y  d S )Nr   r   r-   r.   )r%   c                      rB   rC   r)   r)   rD   r)   r*   rE     rF   z1test_overfitting_IO_overlapping.<locals>.<lambda>rG   r9   rc   >   r   r   r   r   r   r   r   r   rz   c                 S   rO   r)   r   r   r)   r)   r*   rS     rT   z3test_overfitting_IO_overlapping.<locals>.<listcomp>>   London and Berlinr   r~   c                 S   rO   r)   r   r   r)   r)   r*   rS     rT   r   r   c                 S   rO   r)   r   r   r)   r)   r*   rS     rT   c                 S   rO   r)   r   r   r)   r)   r*   rS     rT   )r   r   r0   r1   r+   TRAIN_DATA_OVERLAPPINGr5   r=   r>   r   r<   r   r   r   rJ   r   r   r   r   r   r   )r$   r   r   r   r   r   rR   r   r   r   r   r   r)   rD   r*   test_overfitting_IO_overlapping  s<   



 "r   c                    s   t ddd }td t }|j| dditdd}t| |j fdd	d
}|j	d|j
ks5J t|jddhks@J |j |d |d |d |d t|g d t|g d d S )Ntest_mixed_zero_suggesterc                  S   s   d ddd} | S )N)opsc                S   s   |d u rt  }g }g }| D ] }t|dkr(t|d dkr(|d |d q|d q||}||}t|dkrIt|j||}|S t|jjddd|}|S )Nr   rb   r   r    )r   r   r   r|   )	r   rJ   r#   	asarray2i	asarray1ir   xpvstackzeros)r   r   r   r   rR   lengths_arrayoutputr)   r)   r*   mixed_zero_suggester  s    


zVtest_zero_suggestions.<locals>.make_mixed_zero_suggester.<locals>.mixed_zero_suggesterr)   )r   r)   r)   r*   make_mixed_zero_suggester  s   z8test_zero_suggestions.<locals>.make_mixed_zero_suggesterr   z@misc)r   r-   r.   c                      rB   rC   r)   r)   rD   r)   r*   rE   -  rF   z'test_zero_suggestions.<locals>.<lambda>rG   r9   r   r   )r   r   onetwo two)r   r   three three threer   zfour four four four)r   r   r   )r   r   r   r   r0   r1   r+   r5   r=   r>   r?   r   r<   r   r   rU   )r,   r   r$   r   r   r)   rD   r*   test_zero_suggestions	  s(   
r   c                    s   t    j| dtid}t  jfddd g d} fdd|D }|| t|t|ks6J t|d	 jd
 t	ksCJ t|d	 jd
 dksPJ |d	 jd
 d	 j
dks^J |d	 jd
 d j
dkslJ d S )Nr-   r.   c                      rB   rC   r)   r)   rD   r)   r*   rE   C  rF   z%test_set_candidates.<locals>.<lambda>rG   rN   c                    r   r)   r)   r   )r$   r)   r*   rS   K  r   z'test_set_candidates.<locals>.<listcomp>r   r   r   Justra   zJust a)r	   r0   r1   r+   r5   set_candidatesrJ   r   r   r
   r\   )r,   r   rZ   r   r)   )r$   r&   r*   test_set_candidates>  s   
 r   	n_processr    rb   c                    s|   t tts	|dk r<t }|j| dtid}t| |j fddd g d}t|j	||d}t
|t
|ks:J d S d S )	Nrb   r-   r.   c                      rB   rC   r)   r)   rD   r)   r*   rE   \  rF   z.test_spancat_multiprocessing.<locals>.<lambda>rG   rN   )r   )rW   r   r   r	   r0   r1   r+   r5   r   rU   rJ   )r,   r   r$   r   rZ   r   r)   rD   r*   test_spancat_multiprocessingU  s   r   )2r   r2   numpy.testingr   r   	thinc.apir   r   r   spacyr   spacy.lang.enr   spacy.languager	   spacy.tokensr
   spacy.tokens._dict_proxiesr   spacy.trainingr   
spacy.utilr   r   r   r   r1   SPANCAT_COMPONENTS
TRAIN_DATAr   r+   markparametrizer6   rA   rK   rL   skipr`   r   r   r   r   r   r   r   r   r   r   r   r)   r)   r)   r*   <module>   st    	



	


0
1Gc2/
4
