o
    i/F                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
mZmZ d dlmZ ddlmZ ejd	d
d Zejddd Zejddd Zejddd Zejddd Zejddd Zejddd Zejddd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Z d-d. Z!d/d0 Z"d1d2 Z#d3d4 Z$d5d6 Z%d7d8 Z&d9d: Z'd;d< Z(d=d> Z)d?d@ Z*dAdB Z+dCdD Z,dEdF Z-ej.dGdHdIgdJdK Z/dLdM Z0dNdO Z1dS )P    N)Mock)English)MatcherPhraseMatcher)DocSpan)Vocab   )make_tempdiri  c                  C   sT   t  } t| j}|d| d| d| dg |d| dg t|dks(J dS )	zdTest that the PhraseMatcher correctly reports its number of rules, not
    total number of patterns.TEST1abcTEST2dr	   N)r   r   vocabaddlen)nlpmatcher r   [/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/matcher/test_phrase_matcher.pytest_issue3248_1   s
   
r   i  c                 C   s   t | }|dt| ddgdg |dt| ddgdg t| g dd}||}t|dks2J | j|d d  | j|d	 d  g}t|ddgksNJ d
S )zcTest that duplicate patterns for different rules result in multiple
    matches, one per rule.
    ABarackObamawordsB)r   r   liftsAmericar	   r      N)r   r   r   r   stringssorted)en_vocabr   docmatches	match_idsr   r   r   test_issue3331   s   $r(   i  c                    s   t  }|dt ddgdg |dt ddgdg t g dd}||}t|dks2J  fdd	|D }d|v sAJ d|v sGJ d
S )zGTest that the PhraseMatcher returns duplicates for duplicate match IDs.r   NewYorkr   r   )Iliveinr)   r*   r	   c                    s   g | ]
\}}} j | qS r   )r"   ).0ent_id_r$   r   r   
<listcomp>5       z"test_issue3972.<locals>.<listcomp>Nr   r   r   r   )r$   r   r%   r&   	found_idsr   r1   r   test_issue3972)   s   r6   i  c                 C   s   t | dd}t| ddgd}dd |D ddgksJ |d|g t| g d	d}d
d |D g d	ks7J ||}t|dksCJ t | dd}t| ddgd}d|d _d|d _dd |D ddgkshJ |d|g ||}t|dks{J dS )zETest that the PhraseMatcher can match on overwritten NORM attributes.NORMattrr   r   r   c                 S      g | ]}|j qS r   norm_r.   tr   r   r   r2   ?       z"test_issue4002.<locals>.<listcomp>TEST)r   r   r   r   c                 S   r:   r   r;   r=   r   r   r   r2   B   r?   r!   12r   c                 S   r:   r   r;   r=   r   r   r   r2   I   r?   N)r   r   r   r   r<   )r$   r   pattern1r%   r&   pattern2r   r   r   test_issue4002:   s    

rE   i  c                  C   s8   t t } t| jtsJ tt } t| jtsJ dS )zCTest that PhraseMatcher.vocab can be accessed (like Matcher.vocab).N)r   r   
isinstancer   r   )r   r   r   r   test_issue4373O   s   

rG   i+  c                  C   s   d} t  }ddddg}|jdddid	}|| || }d
d |jD }t  }t }|d }|| |d| W d   n1 sHw   Y  || }	dd |	jD }
||
ks_J dS )zTest that the EntityRuler PhraseMatcher is deserialized correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    specified.
    z!Spacy is a python library for nlp
PYTHON_LIBspacyspaCy)labelpatternidentity_rulerphrase_matcher_attrLOWER)configc                 S      g | ]
}|j |j|jfqS r   textlabel_ent_id_r.   entr   r   r   r2   d   r3   z;test_issue4651_with_phrase_matcher_attr.<locals>.<listcomp>entityrulerNc                 S   rR   r   rS   rW   r   r   r   r2   k   r3   )r   add_pipeadd_patternsentsr
   to_disk	from_disk)rT   r   patternsrulerr%   resnlp_reloadedr   	file_pathdoc_reloadedres_reloadedr   r   r   'test_issue4651_with_phrase_matcher_attrX   s    

rf   i  c                 C   sZ   g d}t | |d}|dd }t | g dd}t| }|d|g ||}|s+J dS )z/Ensure that PhraseMatcher accepts Span as inputr+   likeSpansandDocsr-   myinput,rj   nothingelse.r   N   ri   rj   rk   SPACY)r   r   r   )r$   r   r%   spanrL   r   r&   r   r   r   test_issue6839o   s   rv   i)  c           
      C   s   g d}t | |d}t | dgdt | ddgdd}t| }| D ]\}}|||g q"||}|| jd ddf| jd	 dd
fgksFJ |d t|dksSJ ||}|| jd	 dd
fgksdJ |d	 t|dksqJ ||}	|	ryJ dS )z:Ensure overlapping terms can be removed from PhraseMatcher)Onlysaveoutthebinarydataforrz   
individual
componentsrq   r   r{   r|   )0rA   r         rA      r!   r   N)r   r   itemsr   r"   remover   )
r$   r   r%   termsr   match_idtermr&   new_matches
no_matchesr   r   r   test_issue10643~   s$   (

r   c                 C   s2  t | g dd}t | ddgd}t| }|d|g t||dks%J t | dgd}t| }|d|g t||dksAJ t | ddgd}t| }|d	|g t||dks^J t | d
gd}t| }|d|g t||dkszJ t | dd
gd}t| }|d|g t||dksJ d S )Nr+   rh   GoogleNowbestr   r   r   COMPANYr!   r+   rh   ILIKEr   BESTNOWBESTr   r   r   r   )r$   r%   rL   r   r   r   r   test_matcher_phrase_matcher   s*   r   c                 C   sl   t | }t|dksJ |dt| dgdg t|dks J |dt| dgdg t|dks4J d S )	Nr   r@   testr   r!   r   test2r	   )r   r   r   r   r$   r   r   r   r   test_phrase_matcher_length   s   r   c                 C   s<   t | }|dt| dgdg d|v sJ d|vsJ d S )Nr@   r   r   r   )r   r   r   r   r   r   r   test_phrase_matcher_contains   s   r   c                 C   s  t | ddgd}t | dgdt | ddgdg}t| }|jdd g|R   t||dks/J t| }t }|jd|g|R   t||dksJJ |jdksQJ t| }|d| t||dkseJ t| }t }|jd||d	 t||dks~J |jdksJ d S )
Nr   r   r   OLD_APIr	   OLD_API_CALLBACKNEW_APINEW_API_CALLBACKon_match)r   r   r   r   r   
call_count)r$   r%   r_   r   r   r   r   r   test_phrase_matcher_add_new_api   s$   r   c                 C   s   t | }|dt| dgdg |dt| dgdg |dt| dgdg |dt| dgdg t| g dd}d|v sBJ d|vsHJ t||dksRJ d S )Nr@   rh   r   r   r   r!   r4   r$   r   r%   r   r   r    test_phrase_matcher_repeated_add   s   r   c                 C   st  t | }|dt| dgdg |dt| dgdg t| g dd}d|v s*J d|v s0J d|vs6J t||dks@J |d d|vsKJ d|v sQJ d|vsWJ t||d	ksaJ |d d|vslJ d|vsrJ d|vsxJ t||d
ksJ tt |d W d    n1 sw   Y  d|vsJ d|vsJ d|vsJ t||d
ksJ d S )Nr   rh   r   r   r   r   TEST3r	   r!   r   )r   r   r   r   r   pytestraisesKeyErrorr   r   r   r   test_phrase_matcher_remove   s2   

r   c                 C   s  t | }|dt| dgdg |dt| dgdg t| g dd}d|v s*J t|dks2J t||dks<J |d d|vsGJ t|dksOJ t||dksYJ ||d d | jd kshJ |d d|vssJ t|dks{J t||dksJ d S )	Nr@   rh   r   r   r   r	   r!   r   )r   r   r   r   r   r"   r   r   r   r   +test_phrase_matcher_overlapping_with_remove  s    

r   c                 C   s   g d}g d}g d}g d}t | ||d}t| dd}|d|g t | ||d}||}t|d	ks7J |d
 \}	}
}|	| jd ksGJ |
dksMJ |dksSJ d S )Nr+   rh   catsPRONVERBNOUN)Yesrn   youhatedogsverymuch)INTJPUNCTr   r   r   ADVr   r   posPOSr8   r@   r!   r   r	   r   r   r   r   r   r"   )r$   words1pos1words2pos2rL   r   r%   r&   r   startendr   r   r    test_phrase_matcher_string_attrs  s   r   c           	      C   sr   g d}g d}g d}g d}t | ||d}t| dd}|d|g t | ||d}||}t|d	ks7J d
S )zATest that token with the control codes as ORTH are *not* matched.r   r   )zmatcher:POS-PRONzmatcher:POS-VERBzmatcher:POS-NOUN)Xr   r   r   r   r8   r@   r   Nr   )	r$   r   r   r   r   rL   r   r%   r&   r   r   r   )test_phrase_matcher_string_attrs_negative.  s   r   c                 C   s   g d}g d}t | |d}t| dd}|d|g t | |d}||}t|dks-J |d \}}}	|d	 \}
}}|| jd ksDJ |
| jd ksMJ |dksSJ |	d
ksYJ |d
ks_J |dkseJ d S )N)Helloworld!)Noproblemrn   hesaidrq   r   IS_PUNCTr8   r@   r	   r   r!      r   r   )r$   r   r   rL   r   r%   r&   	match_id1start1end1	match_id2start2end2r   r   r   test_phrase_matcher_bool_attrs<  s    r   c                 C   sf  t | dgd}d|d _t | dgd}d|d _d|d _|d d t | dgd}t| dd	}tt |	d
|g W d    n1 sHw   Y  tt |	d|g W d    n1 sdw   Y  t
  t
d |	d|g W d    n1 sw   Y  t| ddd}t
  t
d |	d|g W d    d S 1 sw   Y  d S )NTestr   ROOTr   TAGr   Feat=ValT)validater   r   errorr   r   )r9   r   TEST4)r   dep_tag_pos_	set_morphr   r   warnsUserWarningr   warningscatch_warningssimplefilter)r$   doc1doc2doc3r   r   r   r   test_phrase_matcher_validationO  s.   






"r   c                 C   s<   t t t| dd W d    d S 1 sw   Y  d S )NUNSUPPORTEDr8   )r   r   
ValueErrorr   r1   r   r   r   test_attr_validatione  s   "r   c              	   C   s  t | dgd}d|d _t | dgd}d|d _d|d _|d d d|d _t | dgd}t| d	d
}|d|g t	t
 |d|g W d    n1 sTw   Y  t	t
 |d|g W d    n1 spw   Y  dD ]G}t| |d
}|d|g t	t
 |d|g W d    n1 sw   Y  t	t
 |d|g W d    n1 sw   Y  qwt| dd
}|d|g t| dd
}|d|g d S )Nr   r   r   r   r   r   r   LEMMADEPr8   r   r   r   )r   r   r   ORTHTEXT)r   r   r   r   r   lemma_r   r   r   r   r   )r$   r   r   r   r   r9   r   r   r   test_attr_pipeline_checksj  s<   



r   c                 C   s\   t  }t| g dd}t| ddgd}t| }|jd|g|d ||}|||d| d S )Nr   r   r   r   r   r   r   )r   r   r   r   assert_called_once_with)r$   mockr%   rL   r   r&   r   r   r   test_phrase_matcher_callback  s   r   c                 C   sh   t | }t| dgd}t| ddgd}t| g dd}t| g dd}|d||||g |d d S )Nthisr   is)r   r   r   )r   r   r   wordTHIS)r   r   r   r   )r$   r   rC   rD   pattern3pattern4r   r   r   /test_phrase_matcher_remove_overlapping_patterns  s   r   c                 C   sT   t | }t| ddgd}tt |d| W d    d S 1 s#w   Y  d S )Nhellor   r   r@   )r   r   r   r   r   r   )r$   r   rL   r   r   r   test_phrase_matcher_basic_check  s
   "r   c                 C   s   t | }t }|dt| dgdg |jdt| dgdg|d t| g dd}t|dks1J t|}t|}||}||}t|t|ksMJ ||ksSJ | d	 \}}	}
}t	|

dtsgJ d S )
Nr@   r   r   r   r   r   )thesearetests:r   r   r	   r!   )r   r   r   r   r   srslypickle_dumpspickle_loads
__reduce__rF   get)r$   r   r   r%   r   matcher_unpickledr&   matches_unpickledr   docs	callbacksr9   r   r   r   test_phrase_matcher_pickle  s   

r  c                 C   s   t | }|dt| ddgdg |dt| dgdg t| g dd}||dd	}t|d
ks3J t|d ts<J |d jdksEJ |d jdksNJ t|d tsWJ |d jdks`J |d jdksiJ dS )zTest the new as_spans=True API.r   r   r   r   r   r   )z...r   r   r   r   r   r   T)as_spansr	   r   zhello worldr!   N)r   r   r   r   rF   r   rT   rU   )r$   r   r%   r&   r   r   r   test_phrase_matcher_as_spans  s   r  c                 C   s   t | }|dt| dgdg t| ddgd}tt#}||gD ]}q$|js,J dt|jd j	v s8J W d    d S 1 sCw   Y  d S )Nr@   helllor   r   r   z
spaCy v3.0r   )
r   r   r   r   r   DeprecationWarningpipeliststrmessage)r$   r   r%   recordr0   r   r   r   test_phrase_matcher_deprecated  s   
"r  r9   
SENT_STARTIS_SENT_STARTc                 C   s   t | |d}d S )Nr8   )r   )r$   r9   r0   r   r   r   test_phrase_matcher_sent_start  s   r  c                 C   sz   g d}t | |d}|dd }t | g dd}t| }|d|g ||}||}t|dks3J t|dks;J dS )z7Ensure that PhraseMatcher accepts Span and Doc as inputrg   r   Nrr   rs   rt   r!   r   r$   r   r%   ru   rL   r   matches_docmatches_spanr   r   r   test_span_in_phrasematcher  s   r  c                 C   sz   g d}t | |d}|dd }t | g dd}t| }|d|g ||}||}t|dks3J t|dks;J d	S )
zREnsure that PhraseMatcher only returns matches in input Span and not in entire Doc)r+   rh   ri   rj   rk   r-   rl   rm   rn   ri   rj   rk   r-   rl   matchersz,andri   rj   rk   
everywhererq   r   	      rs   rt   r   r!   Nr   r  r   r   r    test_span_v_doc_in_phrasematcher  s   r#  )2r   r   r  r   r   spacy.lang.enr   spacy.matcherr   r   spacy.tokensr   r   spacy.vocabr   utilr
   markissuer   r(   r6   rE   rG   rf   rv   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  parametrizer  r  r#  r   r   r   r   <module>   s`    
















	


