o
    iA                     @   sV  d dl Z d dlZd dlZd dlmZ d dlZd dlmZmZmZ d dl	m
Z
 d dlmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z m!Z! ddl"m#Z# eegZ$ej%dd Z&ej%dd Z'ej%dd Z(ej)*ddd Z+ej)*ddd Z,ej)*ddd Z-ej)*ddd Z.ej)*ddd  Z/ej)*d!d"d# Z0ej)*d!d$d% Z1ej)*d&d'd( Z2ej)3d)e$d*d+ Z4ej)3d)e$d,d- Z5ej)3d)e$d.d/ Z6d0d1 Z7d2d3 Z8d4d5 Z9d6d7 Z:ej)*d8d9d: Z;ej)3d)e$d;d< Z<d=d> Z=d?d@ Z>dAdB Z?dCdD Z@dS )E    N)Linear)Vocabloadregistry)English)Language)DependencyParserEntityRecognizerEntityRulerSentenceRecognizerTaggerTextCategorizerTrainablePipe)DEFAULT_PARSER_MODEL)DEFAULT_SENTER_MODEL)DEFAULT_TAGGER_MODEL)DEFAULT_SINGLE_TEXTCAT_MODEL)Span)ensure_path
load_model   )make_tempdirc                 C   sL   ddddddd}dt i}tj|d	d
d }t| |fi |}|d |S )NF   d            ?        learn_tokensmin_action_frequpdate_with_oracle_cut_size
beam_widthbeam_update_probbeam_densitymodelTvalidatensubj)r   r   resolver   	add_labelen_vocabconfigcfgr$   parser r/   a/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/serialize/test_serialize_pipeline.pyr.       s   
r.   c                 C   sB   ddddddd}dt i}tj|d	d
d }t| |fi |}|S )NFr   r   r   r   r   r   r$   Tr%   )r   r   r(   r   r*   r/   r/   r0   blank_parser1   s   r1   c                 C   s6   dt i}tj|ddd }t| |}t| |}||fS Nr$   Tr%   )r   r   r(   r   )r+   r-   r$   tagger1tagger2r/   r/   r0   taggersA   s
   

r5   i  c                  C   s8   t  } | d}|d |   t| ddg d S )NtaggerAhi )r   add_piper)   
initializelistpipe)nlpr6   r/   r/   r0   test_issue3456J   s
   

r?   i  c                 C   s   ddddddiddigddddigddd	d
dgdddddg}t | d}t||dd}| }t|t|ks>J t|jdksGJ |jsLJ t|}||}t|t|ks_J t|jdkshJ |j|jkspJ |j|jksxJ d S )NHELLOhello worldlabelpatternBYELOWERbyeORTHCOMPLEXfoo*rH   OPTECH_ORGApplea1rC   rD   idvocabTpatternsoverwrite_ents   )r   r
   to_byteslenlabels	overwrite
from_bytes
ent_id_sep)r+   rV   r>   rulerruler_bytes	new_rulerr/   r/   r0   test_issue_3526_1T   s$   



rb   c                 C   s   ddddddiddigddddigddd	d
dgdddddg}t | d}t||dd}t|j}t|}||}t|t|ksIJ |jD ]	}||jv sUJ qL|j|jus^J d S )Nr@   rA   rB   rE   rF   rG   rH   rI   rJ   rK   rL   rN   rO   rP   rQ   rS   TrU   )r   r
   srslymsgpack_dumpsrV   r]   rZ   r\   )r+   rV   r>   r_   bytes_old_stylera   rD   r/   r/   r0   test_issue_3526_2k   s   



rf   c                 C   s   ddddddiddigddddigddd	d
dgdddddg}t | d}t||dd}t =}|d }t|d|j t||}|jD ]	}||jv sUJ qLt|t|ks`J |j	|j	ushJ W d    d S 1 ssw   Y  d S )Nr@   rA   rB   rE   rF   rG   rH   rI   rJ   rK   rL   rN   rO   rP   rQ   rS   TrU   entity_rulerz.jsonl)
r   r
   r   rc   write_jsonlwith_suffixrV   	from_diskrZ   r\   )r+   rV   r>   r_   tmpdirout_filera   rD   r/   r/   r0   test_issue_3526_3   s"   


"rm   c                 C   s   t | d}dddg}ddi}|jd|d}|| t @}|| |d}|jdddgks4J |jdu s;J t|}|d}|jdddgksOJ |jdu sVJ W d    d S 1 saw   Y  d S )	NrS   ORGrO   rB   rW   Trg   r,   )	r   r:   add_patternsr   to_diskget_piperV   r\   r   )r+   r>   rV   r,   r_   rk   nlp2ra   r/   r/   r0   test_issue_3526_4   s   




"rt   i  c            	      C   s   t  } | d}|d |   ddddddidd	igdg}| jd
dd}|| | d}|jd jdks;J t ,}t|}|	 sK|
  | | t|}|d}|jd jdksbJ W d   dS 1 smw   Y  dS )z@Test that serialization of an EntityRuler before NER works fine.ner
SOME_LABELMY_ORGrO   rB   MY_GPElowersan	franciscorg   )beforeWhat do you think about Apple ?r   N)r   r:   r)   r;   rp   entslabel_r   r   existsmkdirrq   r   )	r>   ru   rV   r_   doc1d
output_dirrs   doc2r/   r/   r0   test_issue4042   s(   



"r   c                  C   s(  t  } | d}|d |   | d}t|jdksJ d|jv s%J t|dddd}t|j|g |_|d || t|jd	ksHJ d|jv sOJ d|jv sVJ t	 1}t
|}| sf|  || i }| jd|d
}|| t|jd	ksJ W d   dS 1 sw   Y  dS )z
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    ru   rv   r}   r         rw   )rC   r   ro   N)r   r:   r)   r;   rZ   r[   r   r<   r~   r   r   r   r   rq   create_piperj   )nlp1ner1r   	apple_entr   r   r,   ner2r/   r/   r0   test_issue4042_bug2   s0   




"r   iu  c               	   C   s   t dd} t| d}ddi}|jd|d}t \}|d d	}t|| |jd dks0J W d
   n1 s:w   Y  |d d}t|}|jd dksUJ W d
   n1 s_w   Y  W d
   d
S W d
   d
S 1 sww   Y  d
S )z(Ensure the pickling of the NER goes welltest_vocab_add_vector)vectors_namerS   r    o   ru   ro   zner.pklwbNrb)	r   r   r   r   openpickledumpr-   r   )rT   r>   r,   ru   tmp_pathfile_r   r/   r/   r0   test_issue4725_1   s"   


"r   Parserc                 C   s   dt i}tj|ddd }|| |}|| |}||jdgd}|jdgd}|jdgd}t|t|ks9J ||ks?J d S )Nr$   Tr%   rT   exclude)r   r   r(   r]   rY   rZ   )r+   r   r-   r$   r.   
new_parserbytes_2bytes_3r/   r/   r0   %test_serialize_parser_roundtrip_bytes   s   

r   c                 C   s   t  }d}||jvsJ dti}tj|ddd }| ||}|| ||jjv s+J t  }||jvs5J | ||}||jdgd}||jjv sLJ d S )N
FunnyLabelr$   Tr%   rT   r   )	r   stringsr   r   r(   r)   rT   r]   rY   )r   vocab1rC   r-   r$   parser1vocab2parser2r/   r/   r0   test_serialize_parser_strings  s   


r   c           
      C   s   dt i}tj|ddd }|| |}t <}|d }|| || |}||}|jddgd}|jddgd}	t|t|	ksCJ ||	ksIJ W d    d S 1 sTw   Y  d S )Nr$   Tr%   r.   rT   r   )r   r   r(   r   rq   rj   rY   rZ   )
r+   r   r-   r$   r.   r   	file_pathparser_dparser_bytesparser_d_bytesr/   r/   r0   $test_serialize_parser_roundtrip_disk  s   



"r   c                 C   s   | j dusJ |j dusJ |jj| jjksJ | jdgd}|j jd |j | jj || |j dus7J |jj| jjksAJ d S )NTrT   r   resize_output)r$   movesn_movesrY   attrsr]   )r.   r1   
bytes_datar/   r/   r0   test_to_from_bytes'  s   
r   c                 C   s   |d }|  }||}|  |ksJ dti}tj|ddd }t| ||}|  }t|t|ks8J ||ks>J d S )Nr   r$   Tr%   )rY   r]   r   r   r(   r   rZ   )r+   r5   r3   	tagger1_br-   r$   new_tagger1new_tagger1_br/   r/   r0   %test_serialize_tagger_roundtrip_bytes3  s   
r   c                 C   s   |\}}t  B}|d }|d }|| || dti}tj|ddd }t| ||}	t| ||}
|	 |
 ksAJ W d    d S 1 sLw   Y  d S )Nr3   r4   r$   Tr%   )r   rq   r   r   r(   r   rj   rY   )r+   r5   r3   r4   r   
file_path1
file_path2r-   r$   	tagger1_d	tagger2_dr/   r/   r0   $test_serialize_tagger_roundtrip_disk@  s   

"r   c           
      C   s   d}|| j vs	J ||j vsJ |d }||jj vsJ t <}|| ||jj v s-J |d }|| dti}tj|ddd }t||	|}	||	jj v sSJ W d    d S 1 s^w   Y  d S )NSomeWeirdLabelr   r3   r$   Tr%   )
r   rT   r   r)   rq   r   r   r(   r   rj   )
r+   de_vocabr5   rC   r6   r   r   r-   r$   r4   r/   r/   r0   test_serialize_tagger_stringsN  s   

"r   iQ  c                 C   s:   dt i}tj|ddd }t| |dd}|jdgd d S )Nr$   Tr%   g      ?)	thresholdrT   r   )r   r   r(   r   rY   )r+   r-   r$   textcatr/   r/   r0   test_serialize_textcat_emptya  s   r   c                    s   dt i}tj|ddd  fdd} }d|jd< | |jdgd	}d|jv s1J | j|jdgd	d
gd	}d|jvsFJ | j|jd
gd	dgd	}d|jvs[J d S )Nr$   Tr%   c                     s    } | S Nr/   )r   r   r+   r$   r/   r0   get_new_parsero  s   
z3test_serialize_pipe_exclude.<locals>.get_new_parserbarrJ   rT   r   r-   )r   r   r(   r-   r]   rY   )r+   r   r-   r   r.   r   r/   r   r0   test_serialize_pipe_excludej  s   

r   c                 C   sT   dt i}tj|ddd }t| |}| }t| ||}| | ks(J d S r2   )r   r   r(   r   rY   r]   )r+   r-   r$   srsr_bsr_dr/   r/   r0   !test_serialize_sentencerecognizer  s   
r   c                  C   s  t  } | d | d | d | jd d dgksJ | j }t |}|jdgks0J |jddgks9J |jdgksAJ |jd d dgksMJ t	 }|
| t|}W d    n1 sew   Y  |jdgksrJ |jddgks{J t	 }|
| tj|dgd}W d    n1 sw   Y  |jg ksJ |jddgksJ |jddgksJ t	 }| 
| tj|dgd}W d    n1 sw   Y  |jdgksJ |jdgksJ |jg ksJ d S )Nru   r6   r>   disabled)disabler   )r   r:   disable_piper,   copyfrom_config
pipe_namescomponent_namesr   r   rq   spacyr   )r>   r,   rs   r   nlp3nlp4nlp5r/   r/   r0   &test_serialize_pipeline_disable_enable  s>   







r   c               	   C   s  G dd dt } G dd dt }G dd dt }| t }tt |  W d    n1 s1w   Y  t "}tt || W d    n1 sOw   Y  W d    n1 s^w   Y  |t }tt |  W d    n1 s|w   Y  t "}tt || W d    n1 sw   Y  W d    n1 sw   Y  |t t }| }|t t 	|}| |ksJ t }|| |t t 
|}W d    n1 sw   Y  | |ksJ d S )Nc                   @      e Zd Zdd ZdS )z<test_serialize_custom_trainable_pipe.<locals>.BadCustomPipe1c                 S   s   d S r   r/   selfrT   r/   r/   r0   __init__  s   zEtest_serialize_custom_trainable_pipe.<locals>.BadCustomPipe1.__init__N__name__
__module____qualname__r   r/   r/   r/   r0   BadCustomPipe1      r   c                   @   r   )z<test_serialize_custom_trainable_pipe.<locals>.BadCustomPipe2c                 S   s   || _ d | _d S r   rT   r$   r   r/   r/   r0   r        
zEtest_serialize_custom_trainable_pipe.<locals>.BadCustomPipe2.__init__Nr   r/   r/   r/   r0   BadCustomPipe2  r   r   c                   @   r   )z8test_serialize_custom_trainable_pipe.<locals>.CustomPipec                 S   s   || _ || _d S r   r   )r   rT   r$   r/   r/   r0   r     r   zAtest_serialize_custom_trainable_pipe.<locals>.CustomPipe.__init__Nr   r/   r/   r/   r0   
CustomPipe  r   r   )r   r   pytestraises
ValueErrorrY   r   rq   r   r]   rj   )r   r   r   r=   r   
pipe_bytesnew_piper/   r/   r0   $test_serialize_custom_trainable_pipe  s@   




r   c                  C   s   t d} t| jj}d}| jj| t| jj|d ks J t A}| | t|}t| jjt|jjks;J ||jjv sCJ t|dgd}|t|jjksTJ ||jjvs\J W d    d S 1 sgw   Y  d S )Nen  unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_unlikely_word_r   r   r   )	r   blankrZ   rT   r   addr   rq   r   )r>   orig_strings_lengthwordr   reloaded_nlpr/   r/   r0   test_load_without_strings  s   

"r   )Ar   r   rc   	thinc.apir   r   r   r   r   spacy.lang.enr   spacy.languager   spacy.pipeliner   r	   r
   r   r   r   r   spacy.pipeline.dep_parserr   spacy.pipeline.senterr   spacy.pipeline.taggerr   spacy.pipeline.textcatr   spacy.tokensr   
spacy.utilr   r   utilr   test_parsersfixturer.   r1   r5   markissuer?   rb   rf   rm   rt   r   r   r   parametrizer   r   r   r   r   r   r   r   r   r   r   r   r   r/   r/   r/   r0   <module>   sr    $	





	











"






	%