o
    i-j                  	   @   s  d dl Z d dlZd dlmZmZmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZmZ d dlmZ d	d
id
ddgZd
ddd	d
igZd	d
id	d
igZd	did
ddd	digZdddd
ddd	digZdZdZdZdZdZdZdZdZ dZ!dZ"ej#dd Z$ej#dd Z%ej&'dej&(dddigddiddiggddiddigddigggd d! Z)ej&'dej&(dddigddiddiggddiddigddigggd"d# Z*ej&'d$d%d& Z+ej&'d'd(d) Z,ej&'d*d+d, Z-ej&'d-d.d/ Z.ej&'d0d1d2 Z/ej&'d3d4d5 Z0ej&'d3d6d7 Z1ej&'d8d9d: Z2ej&(d;g d<ej&'d=d>d? Z3ej&'d@dAdB Z4ej&'dCdDdE Z5ej&'dCdFdG Z6ej&'dCdHdI Z7ej&'dCdJdK Z8ej&'dLdMdN Z9ej&'dOdPdQ Z:ej&'dRdSdT Z;ej&'dUdVdW Z<ej&'dXdYdZ Z=ej&'d[d\d] Z>ej&?d^ej&'d_d`da Z@ej&'dbdcdd ZAej&'dedfdg ZBej&'dhdidj ZCej&'dkdldm ZDej&(dneefeefeefeefeefgdodp ZEej&(dqeefeefee fee!fee"fgdrds ZFdtdu ZGdvdw ZHej&(dneefeefeefeefeefgdxdy ZIdzd{ ZJej&'d=d|d} ZKd~d ZLej&Mddd ZNdd ZOdd ZPdS )    N)IS_PUNCTLOWERORTH)MatchPatternError)English)	LEX_ATTRS)Matcher)DocSpanToken)Vocabr   A*r   OPBzAA*zA*AAAzBA*BzB*A*Bz	A A A A AzA AzB A A A A A BzB B A A A A A Bc                   C   s   dS )Nz(BBAAAAAB). r   r   r   Z/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/matcher/test_matcher_logic.pytext    s   r   c                 C   s   | d |}|S )N )join)en_tokenizerr   docr   r   r   r   %   s   r   v   patternsr   celticsbostonc                    s   d}| |}|j jd  t|j }|d| tt|jdks"J  fdd||D }| ddf d	dfgks;J |d
d |_t|j}t|dksOJ |d j ksXJ |d jdksaJ |d j	dksjJ d
S )5Test a bug that arose from having overlapping matchesLhow many points did lebron james score against the boston celtics last nightORGBostonCelticsr   c                       g | ]
\}}} ||fqS r   r   .0_startendr    r   r   
<listcomp>=       z!test_issue118.<locals>.<listcomp>	      
   N   )
vocabstringsr   addlenlistentslabelr&   r'   r   r   r   r   matchermatchesr4   r   r(   r   test_issue118+   s   

r9   c                    s   d}| |}|j jd  t|j }|d| tt|jdks"J  fdd||D }| jt|dd 7  _| d	d
f d	dfgksHJ |j}t|dksSJ |d j ks\J |d j	d	kseJ |d j
dksnJ dS )r   r   r    r!   r   c                    r"   r   r   r#   r(   r   r   r)   Y   r*   z0test_issue118_prefix_reorder.<locals>.<listcomp>r.   Nr+   r-   r,   )r/   r0   r   r1   r2   r3   r4   tupler5   r&   r'   r6   r   r(   r   test_issue118_prefix_reorderG   s   
r;      c                 C   s   d}ddiddigddiddigg}| |}t |j}|d| dd ||D }|\}}|d	 d
ks6J |d dks>J |d	 dksFJ |d dksNJ tt | jt|7  _W d   dS 1 shw   Y  dS )z$Test overlapping multi-word phrases.zAThere are different food safety standards in different countries.r   foodsafety	standardsFOODc                 S   s   g | ]
\}}}|||fqS r   r   )r$   ent_typer&   r'   r   r   r   r)   n   r*   z!test_issue242.<locals>.<listcomp>r.                  N)r   r/   r1   pytestraises
ValueErrorr4   r:   )r   r   r   r   r7   r8   match1match2r   r   r   test_issue242c   s    
"rL   iK  c                 C   s   | d}t |j}|dtditdigg ||}t|dks"J |dtditditditdigg ||}t|d	ksAJ |d
tditditditdigg ||}t|d	ks`J dS )z6Test that Matcher doesn't segfault on particular inputza b; cTEST1abr.   TEST2TcrC   TEST3dN)r   r/   r1   r   r2   r   )r   r   r7   r8   r   r   r   test_issue587z   s   
&&rT   iL  c                 C   sF   t | }tt |dg g W d   dS 1 sw   Y  dS )z=Test if empty specs still cause an error when adding patternsTESTN)r   rG   rH   rI   r1   )en_vocabr7   r   r   r   test_issue588   s   "rW   iN  c                 C   sz   t | g dd}t| }|dddiddiddidd	igg |dddidd
iddigg ||}t|dks;J dS )zTest overlapping matches)n=1;rN   :5%wordsabIS_ALPHATr   r\   LIKE_NUMr^   rY   rC   Nr	   r   r1   r2   rV   r   r7   r8   r   r   r   test_issue590   s    rf   ig  c                 C   sx   dd }d}ddiddig}d}| |}t |j}|j||g|d || t|j}|g ks1J |d	 jd	ks:J d S )
Nc           	         s   |t |d kr
dS  fdd|D }  +}|D ]}|jr!dn|jj}||jd}|j||d  j|f  _qW d   dS 1 sEw   Y  dS )zMerge a phrase. We have to be careful here because we'll change the
        token indices. To avoid problems, merge all the phrases once we're called
        on the last match.r.   Nc                    s"   g | ]\}}}t  |||d qS r5   )r
   )r$   r5   r&   r'   r   r   r   r)      s   " z8test_issue615.<locals>.merge_phrases.<locals>.<listcomp>NNP)taglemma)attrs)r2   
retokenizelabel_roottag_r   merger4   )	r7   r   ir8   spansretokenizerspanrk   rm   r   ri   r   merge_phrases   s   
"z$test_issue615.<locals>.merge_phraseszThe golf club is brokenr   golfclubSport_Equipment)on_matchr   )r   r/   r1   r3   r4   r5   )r   rw   r   patternr5   r   r7   entitiesr   r   r   test_issue615   s   

r~   iR  c                  C   s   t tdd id} t| }ddiddiddig}|d	|g t|jg d
d}||}t|dks4J |d \}}}|dksAJ |dksGJ dS )zfThe variable-length pattern matches the succeeding token. Check we
    handle the ambiguity correctly.c                 S      |   S Nlowerstringr   r   r   <lambda>       ztest_issue850.<locals>.<lambda>lex_attr_gettersr   bobr   r   frankFarAwayr   andr   r   r_   r.   r   rE   Nr   r   r   r1   r	   r/   r2   r/   r7   r|   r   matchent_idr&   r'   r   r   r   test_issue850   s   r   c                  C   s   t tdd id} t| }ddiddddd	ig}|d
|g t|jg dd}||}t|dks5J |d \}}}|dksBJ |dksHJ dS )z7Test Matcher matches with '*' operator and Boolean flagc                 S   r   r   r   r   r   r   r   r      r   z%test_issue850_basic.<locals>.<lambda>r   r   r   r   r   )r   r   r   r   r   r_   r.   r   rE   Nr   r   r   r   r   test_issue850_basic   s   r   i  c                  C   st   ddidddg} t td}t|ddgd}t|dgd}t|}|d	| g ||}|s0J ||}|s8J d
S )z=Test matches occur when optional element at end of short doc.r   HelloT?)rb   r   r   Worldr_   	MyMatcherN)r   r   r	   r   r1   )r|   r/   hello_worldhellor7   r8   r   r   r   test_issue1434   s   
r   zstring,start,end))rN   r   r.   )a br   rC   )a cr   r.   )a b cr   rC   )a b b cr   rB   )a b br   rB   i  c                 C   s   ddidddg}t t }|d|g tt |  d}||}|du s*|du r0|g ks0J |d	 d
 |ks:J |d	 d |ksDJ dS )z5Test matcher works when patterns end with * operator.r   rN   rO   r   r   TSTENDr_   Nr.   rC   )r   r   r1   r	   split)r   r&   r'   r|   r7   r   r8   r   r   r   test_issue1450   s   
r   i  c                  C   s   t t } | dddiddigg t| jg dd}| |}t|dks'J |d dd	 d
ks3J |d dd	 dks?J d	S )z0Test regression in Matcher introduced in v2.0.6.MWEorthrN   )rN   rN   rN   r_   rC   r   r.   N)r   rC   )r.   rB   )r   r   r1   r	   r/   r2   )r7   r   r8   r   r   r   test_issue1945  s   
r   i  c                    s   t  }ddidddddidddddg}tjdd	d
 |d|g t g dd}||}t fdd|D s>J d S )Nr   Doe!r   r   optionalT)r%   r   FdefaultrU   )r   Johnr   r   r_   c                    s   g | ]
\}}}| j v qS r   r0   )r$   match_idr&   r'   rV   r   r   r)     r*   z"test_issue1971.<locals>.<listcomp>)r   r   set_extensionr1   r	   allrV   r7   r|   r   r8   r   r   r   test_issue1971  s   r   c                 C   sl   t | }dddgidddig}ddiddig}t| g dd	}|d
||g ||}t|dks4J d S )NEURINeur)r   r   rc   Tr   )r   10isr   r   r_   rM   rC   r   r	   r1   r2   )rV   r7   pattern1pattern2r   r8   r   r   r   test_issue_1971_2"  s   r   c                    s   t jdddd t jdddd t ddgd	}t }|d
dddiigg |ddddiigg t fdd||D }t|dksIJ |tg dksSJ dS )zFTest that pattern matches correctly for multiple extension attributes.rN   r.   Tr   forcerO   rC   r   worldr_   r   r%   r   c                 3   s&    | ]\}}} j | ||fV  qd S r   r   )r$   m_idser   r   r   	<genexpr>6  s   $ z$test_issue_1971_3.<locals>.<genexpr>rE   ))r   r   r.   )r   r.   rC   )r   r   r.   )r   r.   rC   N)r   r   r	   r   r1   sortedr2   re   r   r   r   test_issue_1971_3-  s   r   c                 C   s   t jdddd t jdddd t| }t| g dd}d	ddd
igd }|d|g ||}t|dks9J |d | jd ddfksGJ dS )zhTest that pattern matches correctly with multiple extension attribute
    values on a single token.
    ext_astr_aTr   ext_bstr_b)thisr   r   r_   r%   )r   r   rB   rU   r.   r   N)r   r   r   r	   r1   r2   r0   )rV   r7   r   r|   r8   r   r   r   test_issue_1971_4;  s    r   i	  c                 C   sN   t | }t| ddgd}|dddiddigg ||}t|dks%J dS )	zITest problem with successive ?. This is the same bug, so putting it here.rN   rO   r_   4r   r   rB   Nr   )rV   r7   r   r8   r   r   r   test_issue2464L  s
   r   i	
  c                    s   | d t  dd jjd dg _t j}|ddddgg  fd	d
| D }t|tdd}t|dks<J t|d dksFJ |d jdksOJ dS )zTest that operator + is greedy.zIt is May 15, 1993.rC   rF   DATErh   RULE+)ENT_TYPEr   c                    s   g | ]\}}} || qS r   r   r#   ri   r   r   r)   ]  s    z"test_issue2569.<locals>.<listcomp>T)keyreverser-   r   rE   zMay 15, 1993N)	r
   r/   r0   r4   r   r1   r   r2   r   )r   r7   matchedr   ri   r   test_issue2569V  s   
r   io
  c                  C   s   t  } t| j}d}ddidddddig}|||g | d}| d	}||}|D ]\}}}	| jj| |ks:J q+||}
|
D ]\}}}	| jj| |ksPJ qAd
S )z^Ensure the correct entity ID is returned for matches with quantifiers.
    See also #2675
    test_patternr   highTr   )r   r   
adrenalinez$This is a high-adrenaline situation.z$This is a high adrenaline situation.N)r   r   r/   r1   r0   )nlpr7   
pattern_idr|   doc1doc2matches1r   r&   r'   matches2r   r   r   test_issue2671d  s"   
r   i  c           
      C   s   ddiddiddiddigddidd	d
dddiddiddigddidd	ddddiddiddigg}g d}g d}g d}t | |||d}t| }t|D ]\}}|t||g ||}	|	sdJ qOdS )z%Test problem with matcher quantifiersr   hasr   todoTAGr   TFr   )IS_ASCIIr   r   r   )alsor   r   r   with)RBVBZTOVBr   )ADVVERBADPr   r   )r`   tagsposN)r	   r   	enumerater1   str)
rV   r   r`   r   r   r   r7   rs   r|   r8   r   r   r   test_issue3009|  s0   


r   i   c                    s   t | g dd t| }ddddgiigddddgiigg}|d	| | }t|d
ks0J  fdd|D }|g dksAJ d S )N)r   ,howareyoudoingr   r_   r   r   r   r   r   r   rU   rE   c                    s    g | ]\}}} || j qS r   )r   r#   ri   r   r   r)     s     z"test_issue3328.<locals>.<listcomp>)r   r   r   r   rd   )rV   r7   r   r8   matched_textsr   ri   r   test_issue3328  s   r   i  c                 C   sn   t | dd}ddiddig}|d|g tt |ddd	igg W d
   d
S 1 s0w   Y  d
S )zATest that match pattern validation doesn't raise on empty errors.T)validater   r   r   GOODBADXYN)r   r1   rG   rH   r   )rV   r7   r|   r   r   r   test_issue3549  s   "r   z5Matching currently only works on strings and integersi  c                 C   sT   t jddd t| }ddidddiig}|d|g t| dd	gd
}|| dS )zBTest that custom extensions with default None don't break matcher.	issue3555Nr   r   haver%   TrU   appler_   )r   r   r   r1   r	   )rV   r7   r|   r   r   r   r   test_issue3555  s   r  i  c                 C   s   t | g dd}t| }d}ddiddiddig}ddiddiddiddig}|||g ||}|d	 d	 | j| ks?J t| }|||g ||}|d	 d	 | j| ks[J d
S )zJTest that match IDs returned by the matcher are correct, are in the string)terrificgroupofpeopler_   PATTERNr   r  r   r   r  r   N)r	   r   r1   r0   )rV   r   r7   r   r   r   r8   r   r   r   test_issue3839  s   r	  i'  c                 C   sf   t | g dd}t|dksJ dddddidd	ig}t| }|d
|g t||dks1J d S )N)Thisr   rN   test.r_   rD   r
  r   r   r   r   r  rU   rC   )r	   r2   r   r1   )rV   r   r|   r7   r   r   r   test_issue3879  s   r  io  c                 C   s`   t | }ddidddddiddig}|d|g t| g d	d
}||}t|dks.J dS )z?Test that combinations of optional rules are matched correctly.r   r   r   r   )r   r   r   r   rU   )r   mynewr   r_   r   Nr   r1   r	   r2   r   r   r   r   test_issue3951  s   r  i  c                 C   s  t | }|dddiddigg t| dgd}t||dks"J t| g dd}t||d	ks4J t | }|dddiddidd
igg t| g dd}t||d	ksZJ t | }|dddiddid
ddgg t| g dd}t||dksJ dS )z=Test that matches without a final {OP: ?} token are returned.rU   r   rN   r   r   r_   r.   )rN   rO   rQ   rC   rO   )rN   rO   rO   rQ   r   rB   Nr  )rV   r7   r   r   doc3doc4r   r   r   test_issue4120  s    "r  zpattern,re_patternc                 C   s|   t | j}|j||gdd || }dd t||D }t||D ]\\}}}	\}
}| ||	 j| |
| jks;J q"dS )z_Test that the greedy matching behavior "FIRST" is consistent with
    other re implementations.FIRSTgreedyc                 S      g | ]}|  qS r   rv   r$   mr   r   r   r)         z.test_greedy_matching_first.<locals>.<listcomp>N)r   r/   r1   refinditerzipr   )r   r   r|   
re_patternr7   r8   
re_matchesr   m_sm_ere_sre_er   r   r   test_greedy_matching_first  s   
"r&  zpattern,longestc           	      C   sN   t | j}|jd|gdd || }|D ]\}}}| || j|ks$J qdS )z+Test the "LONGEST" greedy matching behaviorr   LONGESTr  N)r   r/   r1   r   )	r   r   r|   longestr7   r8   r   r   r   r   r   r   test_greedy_matching_longest  s   
r)  c                 C   s~   | d d}t|j}ddiddig}|jd|gdd ||}t|dks)J |d	 d d	ks3J |d	 d
 d
ks=J dS )zJTest that "LONGEST" matching prefers the first of two equally long matchesr   CCCr   Cr   r'  r  r.   r   rC   N)r   r   r/   r1   r2   )r   r   r7   r|   r8   r   r   r   "test_greedy_matching_longest_first'  s   
r,  c                 C   sL   t | j}tt |jdtgdd W d    d S 1 sw   Y  d S )Nr   GREEDYr  )r   r/   rG   rH   rI   r1   r   )r   r   r7   r   r   r   test_invalid_greediness4  s   
"r.  c                 C   sR   t | j}|j||gdd || }dd t||D }t|t|ks'J dS )zPTest that matcher.__call__ consumes tokens on a match similar to
    re.findall.r  r  c                 S   r  r   r  r  r   r   r   r)   J  r  z(test_match_consuming.<locals>.<listcomp>N)r   r/   r1   r  r  r2   )r   r   r|   r   r7   r8   r!  r   r   r   test_match_consuming:  s
   
r/  c           
      C   s   g d}|D ]P\}}}t | }t|jt|d}g }| D ]}|dr0||d dd q|d|i q|d|g ||}	|rN|	sMJ ||fq|	rVJ ||fqd S )N))aaaba a a bT)r0  a+ bT)r0  a+ a+ bT)r0  	a+ a+ a bT)r0  
a+ a+ a+ bT)r0  a+ a a bTr0  a+ a aTr0  za+T)aaar2  F)r:  r3  F)r:  r5  F)r:  a+ a bF)r:  r6  Fr7  r9  )r0  r;  Tr_   r   r   r   r   r  )r   r	   r/   r3   r   endswithappendr1   )
rV   casesr   pattern_strresultr7   r   r|   partr8   r   r   r   test_operator_combosN  s   
rB  c                    s   t |  ddidddg} d|g  fdd}t |dd	ks&J t |d
dks2J t |dd	ks>J t |ddksJJ t |ddksVJ t |ddksbJ dS )zBTest matcher works when patterns end with * operator. (issue 1450)r   rN   rO   r   r   r   c                    s   t  j|  dS )Nr_   )r	   r/   r   r   r7   r   r   r   x  s    z,test_matcher_end_zero_plus.<locals>.<lambda>r.   r   rC   r   r   r   rB   r   N)r   r1   r2   )rV   r|   r   r   rC  r   test_matcher_end_zero_plusr  s   rD  c                    s   t | }dddgiigdddgiigdddgiigg}|d| t| d d | } fd	d
|D }|g dks>J d S )Nr   r   zeroonetworU   zzero one two threer_   c                    s$   g | ]\}}}t  |||d jqS rg   )r
   r   )r$   Lr   r   ri   r   r   r)     s   $ z;test_matcher_sets_return_correct_tokens.<locals>.<listcomp>)rE  rF  rG  )r   r1   r	   r   )rV   r7   r   r8   textsr   ri   r   'test_matcher_sets_return_correct_tokens  s   rJ  zignore:\[W036c                  C   s   t  } t| j}d}ddiddig}t|dksJ |d|g d|v s'J || |}t|dks5J |d || |}t|dksHJ tt |d W d    d S 1 s^w   Y  d S )	NzThis is a test case.r   r  r   r   r   RulerC   )	r   r   r/   r2   r1   removerG   rH   rI   )r   r7   r   r|   results1results2r   r   r   test_matcher_remove  s   

"rO  c                 C   s4  ddg dfddg dfddg dfdd	g dfd
dg dfddg dfd
dg dfddg dfddg dfddg dfddg dfddg dfddg dfddg dfddg dfddg dfddg dfddg dfdd g dfdd!g dfdd"g d#fdd$g dfg}|D ]\}}}t | }t|jt|d%}g }| D ]M}|d&r||d' d&d( q|d)r||d' d)d( q|d*r||d' d*d( q|d+r||d' |d,d  d( q|d-|i q|jd.|gd/d0 ||d1d2}	t|	}
|	d' \}}}}||ksJ |||||
fqd S )3Nr0  a* br   r   r   r.   baabb a* br   r.   r.   rC   r1  r   r.   rC   rB   r2  aabaa+ b a+r   r   r.   rC   aabaar   r   r.   rC   rC   a+ b a*aaaaa*r   r   r   r   	b a* b b*aabba* b* a*r   r   r.   r.   r4  r5  r6  r8  r   r.   rC   a+ a a?
a a a a a?r;  r3  a{2,} bza{,3} ba{2} br   r   r.   a{2,3} br_   r   r   r   r   r   }r.   r   r  r'  r  Twith_alignments	r   r	   r/   r3   r   r<  r=  r1   r2   )rV   r>  r   r?  r@  r7   r   r|   rA  r8   	n_matchesr%   r   r   expectedr   r   r   +test_matcher_with_alignments_greedy_longest  sV   



rp  c                 C   sT  dddddgg dg ddggfdddg d	gfd
ddg dgfdddddgg dg dgfdddg dg dgfdddg dg dg dg dgfdddddgg dg dg dgfddddgddgg dg dgfddd g d	gfd!d"d#dgd
gd
d
gddgg dg d$g d%ddggfd&dd'g dgfd(dd)g dgfd*dd+g dgfd,dd-g dgfd.dd/ddgg dgfd0dd1g dgfd2dd3g dg dgfd4dd5g dg dgfd6dd7g dg dgfd8dd9g dgfd:dd;g dgfd<dd=g dg dgfg}|D ]\}}}}t | }t|jt|d>}g }| D ]V}	|	d?r.||	d d?d@ q|	dAr@||	d dAd@ q|	dBrR||	d dBd@ q|	dCrh||	d |	dd  d@ q|dD|	i q|dE|g ||dFdG}
t|
}|
D ]!\}}}}||v sJ ||||||ft||| ksJ qqd S )HNr   r0  rP  r.   rh  rQ  rR  rS  rT  rC   r1  rU  rB   r2  rE   rV  rW  rc  rX  rD   rY  rZ  )r   r.   rC   rC   rF   r[     r\  r]  )r   r   r   r^     r_  r+   r`  ra  rb  )r   r.   r.   r-   r4  r,   r5     r6     r8     rd     re     r;     r3     rf     za{3} b   rg     ri  r_   r   r   r   r   rj  r   r  Trk  rm  )rV   r>  case_idr   r?  resultsr7   r   r|   rA  r8   rn  r%   r   r   ro  r   r   r   'test_matcher_with_alignments_non_greedy  sj    " ." r  )Qr  rG   spacy.attrsr   r   r   spacy.errorsr   spacy.lang.enr   spacy.lang.lex_attrsr   spacy.matcherr   spacy.tokensr	   r
   r   spacy.vocabr   r   r   pattern3pattern4pattern5re_pattern1re_pattern2re_pattern3re_pattern4re_pattern5longest1longest2longest3longest4longest5fixturer   r   markissueparametrizer9   r;   rL   rT   rW   rf   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   skipr  r	  r  r  r  r&  r)  r,  r.  r/  rB  rD  rJ  filterwarningsrO  rp  r  r   r   r   r   <module>   s   


































	









	






	






	



$


1