o
    i(e                     @   s  d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZ d dlmZ d d	lmZ d d
lmZmZ ddgZe jdd Ze jeddd Zeddd Ze j de j!dedd Z"e j de j!dedd Z#e j de j!dedd Z$e j de j!ded d! Z%e j d"e j!ded#d$ Z&e j!ded%d& Z'e j!ded'd( Z(e j!ded)d* Z)e j!ded+d, Z*e j!ded-d. Z+e j!ded/d0 Z,e j!ded1d2 Z-e j!ded3d4 Z.e j!ded5d6 Z/e j!ded7d8 Z0e j!ded9d: Z1e j!ded;d< Z2e j!ded=d> Z3e j!ded?d@ Z4e j!dedAdB Z5e j!dedCdD Z6e j!dedEdF Z7e j!dedGdH Z8e j!dIdJdKge j!dedLdM Z9e j!dedNdO Z:e j!dedPdQ Z;e j!dedRdS Z<e j!dedTdU Z=e j!dedVdW Z>e j!dedXdY Z?e j!dedZd[ Z@e j!ded\d] ZAe j!ded^d_ ZBdS )`    N)NumpyOpsget_current_ops)registry)MatchPatternError)EnglishLanguage)EntityRecognizerEntityRuler	SpanRulermerge_entities)DEFAULT_NER_MODEL)make_tempdir)DocSpanentity_rulerfuture_entity_rulerc                   C   s   t  S Nr    r   r   Z/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/pipeline/test_entity_ruler.pynlp   s   r   entity_ruler_patternsc                	   C   sR   ddddddiddigddddigddd	d
dgdddddddddgS )NHELLOhello worldlabelpatternBYELOWERbyeORTHCOMPLEXfoo*)r    OPTECH_ORGApplea1r   r   id	Microsofta2r   r   r   r   r   patterns   s   

r,   add_entc                 C   s   t | ddddg| _| S )Nr      ORG)r   )r   ents)docr   r   r   add_ent_component"   s   r2   i  entity_ruler_factoryc                 C   s   t  }t|jg dd}d|d _|j| dd}|ddd	g d
ti}tj|ddd
 }t	|j|}|j
dd |d ||}|j
|gd }|j
|d |j
|d |j
|d |j
|dskJ dS )z8Test case where preset entity crosses sentence boundary.)IliveinNewYork)wordsT   r   nameGPEzNew Yorkr   modelvalidate    r   OzB-GPEN)r   r   vocabis_sent_startadd_pipeadd_patternsr   r   resolver	   moves
add_action	add_label
init_batchapply_transitionis_valid)r3   r   r1   rulercfgr>   nerstater   r   r   test_issue3345(   s    

rS   i  c                 C   s   t  }ddddddddg}|j| ddd	id
}|| d}d}|j|gddD ]}|tdd |jD 7 }q*|dks?J tttrcd}|j|gddD ]}|tdd |jD 7 }qN|dkseJ d S d S )NPERSONz	joe bidenz	joe-bidenr(   zbernie sanderszbernie-sandersr   phrase_matcher_attrr   r<   configz
    The left is starting to take aim at Democratic front-runner Joe Biden.
    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
    r      	n_processc                 S      g | ]	}|j d kr|qS r   ent_id.0entr   r   r   
<listcomp>V       z"test_issue4849.<locals>.<listcomp>   c                 S   r[   r\   r]   r_   r   r   r   rb   \   rc   )	r   rF   rG   pipelenr0   
isinstancer   r   )r3   r   r,   rO   text
count_entsr1   r   r   r   test_issue4849A   s,   



rj   i  c                 C   sv   t  }|j| dd}dddddddddg}|| d}||}t|jd	ks,J t|}t|jd	ks9J d S )
Nr   r;   r/   zDigicon Incr   zRotan Mosle Inc'sz#Rotan Mosle Technology Partners Ltdz
        Digicon Inc said it has completed the previously-announced disposition
        of its computer systems division to an investment group led by
        Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate.
        r.   )r   rF   rG   rf   r0   r   )r3   r   rO   r,   rh   r1   r   r   r   test_issue5918`   s   
rk   i  c                 C   sx   t  }|j| dd}ddddddidd	igd
ddddiddigd
dg}|| |d}tdd |D s:J d S )Nr   r;   r/   r&   r   r=   r   san	franciscosan-franciscor(   franzSan Francisco San Franc                 s   s    | ]}|j d kV  qdS )rn   N)ent_id_)r`   tr   r   r   	<genexpr>   s    z!test_issue8168.<locals>.<genexpr>)r   rF   rG   all)r3   r   rO   r,   r1   r   r   r   test_issue8168}   s   
rt   i   c                 C   sx   | j |dddid}|| tdd |jj D }|dks"J |g  tdd |jj D }||ks:J d	S )
z/Test that patterns don't get added excessively.r   r@   TrV   c                 s       | ]}t |V  qd S r   rf   r`   mmr   r   r   rr          z,test_entity_ruler_fix8216.<locals>.<genexpr>r   c                 s   ru   r   rv   rw   r   r   r   rr      ry   N)rF   rG   summatcher	_patternsvalues)r   r,   r3   rO   pattern_countafter_countr   r   r   test_entity_ruler_fix8216   s   


r   c                 C   s   | j |dd}|| t|t|ksJ t|jdksJ d|v s%J d|v s+J | d | j |dd}|| | d}t|jdksIJ |jd jdksSJ |jd	 jdks]J d S )
Nr   r;   r:   r   r   hello world bye byerd   r   rX   )rF   rG   rf   labelsremove_piper0   label_r   r,   r3   rO   r1   r   r   r   test_entity_ruler_init   s   


r   c                 C   s   | j |dd}t|dksJ t|jdksJ | d | j |dd | jdgks,J tt | d}W d    n1 s@w   Y  t|jdksNJ d S )Nr   r;   r   r   )	rF   rf   r   r   
pipe_namespytestwarnsUserWarningr0   )r   r3   rO   r1   r   r   r   #test_entity_ruler_no_patterns_warns   s   

r   c                 C   s  | j |dd}t|jdksJ |jdd |d t|jdks"J | d}|jd jd	ks0J |jd
 jdks:J | d dddii| jd d d< | j |dd}t|jdks\J |   t|jdksiJ | d}|jd jd	kswJ |jd
 jdksJ d S )Nr   r;   r   c                   S      g S r   r   r   r   r   r   <lambda>       z1test_entity_ruler_init_patterns.<locals>.<lambda>r,   r:   r   r   rX   r   r,   @miscr   
initialize
components)rF   rf   r   r   r0   r   r   rW   r   r   r   r   test_entity_ruler_init_patterns   s"   
r   c                 C   sN   | j |dd}|| t|jdksJ |dd  t|jdks%J dS ))Test that initialization clears patterns.r   r;   r:   c                   S   r   r   r   r   r   r   r   r      r   z.test_entity_ruler_init_clear.<locals>.<lambda>r   N)rF   rG   rf   r   r   r   r,   r3   rO   r   r   r   test_entity_ruler_init_clear   s
   
r   c                 C   s   | j |dd}|| t|jdksJ | d}t|jdks"J |  t|jdks/J tt | d}W d   n1 sCw   Y  t|jdksQJ dS )r   r   r;   r:   r   rX   r   N)	rF   rG   rf   r   r0   clearr   r   r   r   r   r   r   test_entity_ruler_clear   s   

r   c                 C   sl   | j |dd}|| | j ddd | d}t|jdks J |jd jdks*J |jd	 jd
ks4J d S )Nr   r;   r-   beforeOH HELLO WORLD bye byerd   r   r/   rX   r   rF   rG   rf   r0   r   r   r   r   r   test_entity_ruler_existing   s   
r   c                 C   s   | j |dddid}|| | j ddd | d}t|jdks#J |jd	 jd
ks-J |jd	 jd
ks7J |jd jdksAJ d S )Nr   overwrite_entsTrV   r-   r   r   rd   r   r   rX   r   )rF   rG   rf   r0   r   rh   r   r   r   r   $test_entity_ruler_existing_overwrite   s   

r   c                 C   s   | j |dddid}|| | j ddd | d}t|jdks#J |jd	 jd
ks-J |jd jdks7J t|jd	 dksBJ t|jd dksMJ d S )Nr   r   TrV   r-   r   zfoo foo bye byerd   r   r!   rX   r   r   r   r   r   r   "test_entity_ruler_existing_complex  s   

r   c                 C   sd   | j |dddid}|| | d}t|jdksJ |jd jdks&J |jd jd	ks0J d S )
Nr   r   TrV   Apple is a technology companyrX   r   r%   r'   )rF   rG   rf   r0   r   rp   r   r   r   r   test_entity_ruler_entity_id  s   

r   c                 C   s   ddd}| j |d|d}|| | d}t|tr"d|jv s"J t|jdks+J |jd	 jd
ks5J |jd	 jdks?J d S )NTz**)r   
ent_id_sepr   rV   r   zTECH_ORG**a1rX   r   r%   r'   )	rF   rG   rg   r
   phrase_patternsrf   r0   r   rp   )r   r,   r3   rW   rO   r1   r   r   r    test_entity_ruler_cfg_ent_id_sep&  s   


r   c                 C   s   t | |d}t|t|ksJ t|jdksJ | }t | }t|dks)J t|jdks2J ||}t|t|ksAJ t|jdksJJ t|jt|jksVJ |jD ]	}||jv sbJ qYt|jt|jksoJ d S )Nr   r:   r   )r
   rf   r   to_bytes
from_bytesr,   sorted)r   r,   r3   rO   ruler_bytes	new_rulerr   r   r   r   !test_entity_ruler_serialize_bytes3  s   

r   c                 C   s   t | d|d}t|t|ksJ t|jdksJ | }t | }t|dks*J t|jdks3J |jd u s:J ||}t|t|ksIJ t|jdksRJ |jdksYJ d S )Nr   )rU   r,   r:   r   )r
   rf   r   r   rU   r   )r   r,   r3   rO   r   r   r   r   r   5test_entity_ruler_serialize_phrase_matcher_attr_bytesE  s   
r   c                 C   s   | j |dd}t| dd}dddigd}dddigd}tt ||g W d    n1 s3w   Y  ||g tt ||g W d    d S 1 sUw   Y  d S )	Nr   r;   Tr?   r   r   r   ASDF)rF   r
   r   raises
ValueErrorrG   r   )r   r3   rO   validated_rulervalid_patterninvalid_patternr   r   r   test_entity_ruler_validateW  s   "r   c                 C   sB   t | |dd}t|jtg dksJ t|jddgksJ d S )NT)r,   r   )r   r   r!   r%   r'   r+   )r
   r   r   ent_idsr   r   r   r   test_entity_ruler_propertiesk  s   r   c                 C   s^   | j |dd}ddddddg}|| | d}t|jd	ks#J |jd
 jdks-J d S )Nr   r;   FOOBARzfoo barr   BARBAZzbar bazzfoo bar bazrX   r   r   r   r3   rO   r,   r1   r   r   r   #test_entity_ruler_overlapping_spansr  s   
r   c                 C   `   | j |dd}ddddiigdg}|| | d}t|jd	ks$J |jd
 jdks.J d S Nr   r;   r   r   FUZZYhellor   helloorX   r   r   r   r   r   r   test_entity_ruler_fuzzy_pipe     
r   c                 C   r   r   r   r   r   r   r   test_entity_ruler_fuzzy  r   r   c                 C   sh   t ddd }| j|ddddiid}dd	d
diigdg}|| | d}t|jdks2J d S )Ntest_fuzzy_compare_disabledc                   S   s   dd S )Nc                 S   s   dS )NFr   )xyzr   r   r   r     r   z\test_entity_ruler_fuzzy_disabled.<locals>.make_test_fuzzy_compare_disabled.<locals>.<lambda>r   r   r   r   r    make_test_fuzzy_compare_disabled  s   zJtest_entity_ruler_fuzzy_disabled.<locals>.make_test_fuzzy_compare_disabledr   matcher_fuzzy_comparer   rV   r   r   r   r   r   r   r   )r   miscrF   rG   rf   r0   )r   r3   r   rO   r,   r1   r   r   r    test_entity_ruler_fuzzy_disabled  s   


r   rZ   rX   rd   c                 C   st   t tts	|dk r6dg}ddddg}| j|dd}|| | j|dd	D ]}|jD ]	}|jdks4J q+q&d S d S )
Nrd   zI enjoy eating Pizza Hut pizza.FASTFOODz	Pizza Hut1234r(   r   r;   rY   )rg   r   r   rF   rG   re   r0   rp   )r   rZ   r3   textsr,   rO   r1   ra   r   r   r   !test_entity_ruler_multiprocessing  s   

r   c              	   C      | j |dd}|| t ;}||d  ||d  tt ||d  W d    n1 s5w   Y  W d    d S W d    d S 1 sMw   Y  d S )Nr   r;   ztest_ruler.jsonlznon_existing.jsonlrF   rG   r   to_disk	from_diskr   r   r   r   r,   r3   rO   dr   r   r   !test_entity_ruler_serialize_jsonl     
"r   c              	   C   r   )Nr   r;   
test_rulernon_existing_dirr   r   r   r   r   test_entity_ruler_serialize_dir  r   r   c                 C   s  | j |dd}ddddddd	ddd
dg}|| | d}t|jdks)J t|jdks2J t|tr>d|jv s>J |jd jdksHJ |jd j	dksRJ t|tr]|
d n|d | d}t|jdksoJ t|tr{d|jvs{J t|jdksJ d S )Nr   r;   rT   Dinadinar(   r/   ACMEacmeACMr   Dina went to schoolr.   rX   PERSON||dinar   rd   )rF   rG   rf   r,   r0   rg   r
   phrase_matcherr   rh   removeremove_by_idr   r   r   r   test_entity_ruler_remove_basic  s*   






r   c                 C   s  | j |dd}dddddddddd	d
dg}|| | d}t|jdks*J t|tr=d|jv s6J d|jv s=J t|jdksFJ t|trQ|d n|	d | d}t|jdkscJ t|trvd|jvsoJ d|jvsvJ t|jdksJ d S )Nr   r;   rT   r   r   r(   r/   DinaCorpr   r   zDina founded DinaCorp and ACME.r.   r   z	ORG||dinarX   )
rF   rG   rf   r,   rg   r
   r   r0   r   r   r   r   r   r   2test_entity_ruler_remove_same_id_multiple_patterns  s*   







r   c                 C   s   | j |dd}ddddddd	ddd
dg}|| t|jdks%J tt |d W d    n1 s:w   Y  t|t	ratt |
d W d    d S 1 sZw   Y  d S d S )Nr   r;   rT   r   r   r(   r/   r   r   r   r   r.   	nepattern)rF   rG   rf   r,   r   r   r   r   rg   r   r   )r   r3   rO   r,   r   r   r   ,test_entity_ruler_remove_nonexisting_pattern   s   



"r   c                 C   sx  | j |dd}ddddddd	ddd
dg}|| | d}t|jdks)J t|jdks2J |jd jdks<J |jd jdksFJ |jd jdksPJ |jd jdksZJ t|tre|	d n|
d | d}t|jdkswJ t|jdksJ |jd jdksJ |jd jdksJ t|tr|	d	 n|
d	 | d}t|jdksJ t|jdksJ d S )Nr   r;   rT   r   r   r(   r/   r   r   r   r   zDina founded her company ACME.r.   rd   r   rX   zDina founded her company ACME)rF   rG   rf   r,   r0   r   rh   rg   r
   r   r   r   r   r   r   )test_entity_ruler_remove_several_patterns  s6   






r   c                 C   s:  | j |dd}ddddddd	dd
ddddddg}|| | d}t|jdks.J |jd jdks8J |jd jdksBJ |jd jdksLJ |jd jdksVJ |jd jd
ks`J |jd jdksjJ t|tr|d |d	 |d n|	d |	d	 |	d | d}t|jdksJ d S )Nr   r;   rT   r   r   r(   r/   r   r   DATEher birthdaybdayr   r   -Dina founded her company ACME on her birthdayr.   r   rX   rd   r   )
rF   rG   rf   r0   r   rh   rg   r
   r   r   r   r   r   r   *test_entity_ruler_remove_patterns_in_a_row3  s0   









r   c                 C   s,  | j |dd}ddddddd	dd
dddg}|| t|jdks&J t|tr1|d n|d t|jdks?J t|trJ|d	 n|d	 t|jdksXJ t|trc|d n|d t|jdksqJ t	t
 | d}t|jdksJ W d    d S 1 sw   Y  d S )Nr   r;   rT   r   r   r(   r/   r   r   r   r   r   r.   rd   rX   r   r   )rF   rG   rf   r,   rg   r
   r   r   r   r   r   r0   r   r   r   r   %test_entity_ruler_remove_all_patternsQ  s.   









"r   c                 C   sx  | j |dd}dddg}|| || d}t|jdks"J t|jdks+J |jd jdks5J |jd jdks?J dd	d
dg}|| || d}t|jdks[J t|jdksdJ |jd jdksnJ |jd jdksxJ |jd jdksJ |jd jd	ksJ t|t	r|
d
 n|d
 || d}t|jdksJ t|jdksJ |jd jdksJ |jd jdksJ || || d}t|jdksJ t|jdksJ ddd
dg}|| || d}t|jdksJ t|jdksJ t|t	r|
d
 n|d
 || d}t|jdks0J t|jdks:J d S )Nr   r;   r   z	last timer   z=I saw him last time we met, this time he brought some flowersrX   r   z	this timettimer(   rd   zanother timez[I saw him last time we met, this time he brought some flowers, another time some chocolate.r.   )rF   rG   make_docrf   r,   r0   r   rh   rg   r
   r   r   )r   r3   rO   r,   r1   	patterns1	patterns2r   r   r    test_entity_ruler_remove_and_addo  sp   






r   )Cr   	thinc.apir   r   spacyr   spacy.errorsr   spacy.lang.enr   spacy.languager   spacy.pipeliner	   r
   r   r   spacy.pipeline.nerr   spacy.tests.utilr   spacy.tokensr   r   ENTITY_RULERSfixturer   r   r,   	componentr2   markissueparametrizerS   rj   rk   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s    










	












	
	








!

