o
    iq                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ d dlmZmZmZ d dlm Z  ddlm!Z! dddgifddddgifgZ"ej#dd Z$ej#dd Z%ej#dd Z&ej#dd Z'ej#dd  Z(ej#d!d" Z)ej*+d#d$gej*,d%d&d' Z-ej*,d(d)d* Z.ej*,d+d,d- Z/ej*,d.d/d0 Z0ej*,d1d2d3 Z1d4d5 Z2ej*,d6d7d8 Z3ej*,d9d:d; Z4d<d= Z5d>d? Z6d@dA Z7dBdC Z8dDdE Z9ej*j:dFdGdHdI Z;ej*j:dFdGdJdK Z<dLdM Z=dNdO Z>dPdQ Z?dRdS Z@dTdU ZAdVdW ZBdXdY ZCdZd[ ZDd\d] ZEej*+d^d_d`gdadb ZFdcdd ZGdedf ZHdgdh ZIdidj ZJdkdl ZKdmdn ZLeMdoG dpdq dqZNdS )r    N)assert_equal)registryutil)ENT_IOB)English)Italian)Language)Lookups)EntityRecognizer)BiluoPushDown)DEFAULT_NER_MODEL)DocSpan)Exampleiob_to_biluosplit_bilu_labelVocab   )make_tempdirWho is Shaka Khan?entities      PERSONzI like London and Berlin.)r      LOC)      r   c                   C   s   dS )Nnon_entities r!   r!   r!   O/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/parser/test_ner.pyneg_key   s   r#   c                   C   s   t  S Nr   r!   r!   r!   r"   vocab!   s   r%   c                 C   s   t | g ddS )N)CaseywenttoNewYork.words)r   )r%   r!   r!   r"   doc&   s   r.   c                 C   s4   | dd }| dd }|j |jdf|j |jdfgS )Nr            r   GPE)
start_charend_char)r.   caseynyr!   r!   r"   entity_annots+   s
   r7   c                 C   s   t tdd | D S )Nc                 S   s   g | ]\}}}|qS r!   r!   ).0selabelr!   r!   r"   
<listcomp>7       z entity_types.<locals>.<listcomp>)sortedset)r7   r!   r!   r"   entity_types5   s   r@   c                 C   s   t j|d}t | j|S )Nr@   )r   get_actionsstrings)r%   r@   actionsr!   r!   r"   tsys:   s   rE   r;   z
U-JOB-NAMEi  c              
   C   sj   t  }i }|jd|d}tt|jdgddgdgdgdgdg| gd}d	|jj|gd
d v s3J d S )Nnerconfigwordr,   r   tagdep)idsr-   tagsheadsdepsr   zJOB-NAME)examplesr/   )r   create_piper   	from_dictr   r%   movesrB   )r;   nlprH   rF   exampler!   r!   r"   test_issue1967@   s    rV   i  c                  C   s   t  } | d}|d |   t  }|d t|djdks%J |dj}|jd || dj	j
 ||   d|djvsIJ |djdksSJ dS )zGTest that spurious 'extra_labels' aren't created when initializing NER.rF   CITIZENSHIPr   resize_outputextra_labels)rW   N)r   add_pipe	add_label
initializelenget_pipelabelsmodelattrsrS   n_moves
from_bytesto_bytescfg)rT   rF   nlp2r`   r!   r!   r"   test_issue2179T   s   


rg   iQ	  c                  C   sd   d} t | g dksJ d}t |g dksJ d}t |g dks$J d}t |g dks0J d	S )
z9Test that IOB tags are correctly converted to BILUO tags.)	B-BRAWLER	I-BRAWLERri   )rh   ri   z	L-BRAWLER)I-ORGrj   B-ORG)rk   L-ORGzU-ORG)B-PERSONzI-PERSONrm   )rm   L-PERSONU-PERSON)B-MULTI-PERSONzI-MULTI-PERSONrp   )rp   zL-MULTI-PERSONzU-MULTI-PERSONN)r   )tags1tags2tags3tags4r!   r!   r"   test_issue2385e   s   ru   i
  c            	      C   s   t  } g }|t| ddg ig dd tdD }| d}t|D ]}|| q&| 	 }tdD ]}i }t
| |D ]}| j|g||dd	 qAq6d
S )zdTest issue that arises when too many labels are added to NER model.
    Used to cause segfault.
    zOne sentencer   c                 S   s   g | ]}t |qS r!   )str)r8   ir!   r!   r"   r<          z"test_issue2800.<locals>.<listcomp>i  rF      g      ?)sgdlossesdropN)r   extendr   rR   make_docrangerZ   listr[   r\   randomshuffleupdate)	rT   
train_datar@   rF   entity_type	optimizerrw   r{   rU   r!   r!   r"   test_issue2800v   s"   

r   i  c                  C   s   t  } | d}|d |   g d}|j|ksJ t  }|d}|j}|jd ||jj |	| 
  |j|ks?J dS )zTest issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels
    were added using ner.add_label().
    rF   ANIMAL)OzB-ANIMALzI-ANIMALzL-ANIMALzU-ANIMALrX   N)r   rZ   r[   r\   
move_namesr`   ra   rS   rb   rc   rd   )rT   rF   r   rf   ner2r`   r!   r!   r"   test_issue3209   s   


r   c                  C   sT   t  } | d}|d |   g d}dh}|j|ksJ t|j|ks(J dS )zBTest that labels are inferred correctly when there's a - in label.rF   zLARGE-ANIMAL)r   zB-LARGE-ANIMALzI-LARGE-ANIMALzL-LARGE-ANIMALzU-LARGE-ANIMALN)r   rZ   r[   r\   r   r?   r_   )rT   rF   r   r_   r!   r!   r"   test_labels_from_BILUO   s   

r   i  c                  C   s   t  } | d}|d |   d| jv sJ | d}|ds#J |D ]	}|jdks.J q%dddg}| d	}|| d	| jv sFJ d| jv sMJ | d}|dsXJ |D ]	}|jdkscJ qZd
S )zDTest that running an entity_ruler after ner gives consistent resultsrF   PEOPLEhir   r   SOFTWAREspacyr;   patternentity_rulerN)r   rZ   r[   r\   
pipe_nameshas_annotationent_iobadd_patterns)rT   rF   doc1tokenpatternsrulerdoc2r!   r!   r"   test_issue4267   s&   



r   i  c                  C   s   d} d}t  }| |d}|jd|d}|d |  |d}t|jdks)J d|jv s0J t|d	d
dd}t|j|g |_|g}|j	|d| |d t|jdksVJ d|jv s]J dS )z:This should not crash or exit with some strange error code   -C6?
beam_widthbeam_densitybeam_nerrG   
SOME_LABELzWhat do you think about Apple ?r/   r1      MY_ORGr;           )r|   r   r   r   N)
r   rZ   r[   r\   r]   r_   r   r   ents
beam_parse)r   r   rT   rH   rF   r.   	apple_entdocsr!   r!   r"   test_issue4313   s$   
r   c                    sD   t |d|i} j|dd} fdd|D }|g dks J d S )Nr   F)_debugc                       g | ]}  |qS r!   get_class_namer8   actrE   r!   r"   r<      r=   z)test_get_oracle_moves.<locals>.<listcomp>)ro   r   r   B-GPEL-GPEr   )r   rR   get_oracle_sequence)rE   r.   r7   rU   act_classesnamesr!   r   r"   test_get_oracle_moves   s   r   c                    s   | j d< t|ddgd}ddg}t|d|i}t|jddd	d
t|jdddd
g|jj|<  |} fdd|D }|sAJ |d d	ksIJ |d dksQJ |d dksYJ dS )zTest that we don't get stuck in a two word input when we have a negative
    span. This could happen if we don't have the right check on the B action.
    r#   ABr,   Nr   r   r/   r   r   r   r   c                    r   r!   r   r   r   r!   r"   r<      r=   z8test_negative_samples_two_word_input.<locals>.<listcomp>rm   rn   re   r   r   rR   r   yspansr   rE   r%   r#   r.   r7   rU   r   r   r!   r   r"   $test_negative_samples_two_word_input   s   

r   c                    s   | j d< t|g dd}g d}t|d|i}t|jdddd	t|jdd
dd	g|jj|<  |} fdd|D }|sAJ |d dksIJ |d dksQJ dS )HTest that we exclude a 2-word entity correctly using a negative example.r#   )r   r   Cr,   )NNNr   r   r/   r   r   r   r   c                    r   r!   r   r   r   r!   r"   r<     r=   z:test_negative_samples_three_word_input.<locals>.<listcomp>rm   Nr   r   r!   r   r"   &test_negative_samples_three_word_input  s   

r   c                    s   | j d< t|dgd}dg}t|d|i}t|jdddd	t|jddd
d	g|jj|<  |} fdd|D }|s?J |d dksGJ |d dksOJ dS )r   r#   r   r,   Nr   r   r/   r   r   r   c                    r   r!   r   r   r   r!   r"   r<   $  r=   z2test_negative_samples_U_entity.<locals>.<listcomp>ro   r   r   r!   r   r"   test_negative_samples_U_entity  s   

r   c                 C   s2   t j|d}t | j|dd}|jd dksJ d S )NrA   r    )incorrect_spans_keyr#   )r   rB   rC   re   )r%   r@   rD   rE   r!   r!   r"   %test_negative_sample_key_is_in_config*  s   r   zNo longer supported)reasonc           
      C   s   g d}g d}t | |d}t|||d}t| j}d}|D ]?}|d u r'q |dkr5||dd q t|\}}	||d|	 ||d	|	 ||d
|	 ||d|	 q || d S )N)r   52Bomber)NNz	L-PRODUCTr,   )r-   r   Mr   ILUr   r    r   r   r   r   	r   r   rR   r   rC   
add_actionindexr   r   
en_vocabr-   
biluo_tagsr.   rU   rS   
move_typesrJ   actionr;   r!   r!   r"   test_oracle_moves_missing_B2  s"   
r   c           
      C   s   g d}g d}t | |d}t|d|i}t| j}d}|D ]$}|d u r&q|dkr4||dd qt|\}}	||||	 q|| d S )N)	
production
ofNorthropr   zCorp.r   z'sradar)	r   r   r   rk   Nrj   rl   r   r   r,   r   r   r   r   r   r   r!   r!   r"   test_oracle_moves_whitespaceL  s   
r   c            	      C   s  t  } | d}i }| jd|d}dd |D g dksJ dd |D g dks*J |jdd	 |d
 |j|gd }|j|d |j|d |j|d |j|ds]J t  }|d}i }|jd|d}|jg |dd gdd dd |D g dksJ dd |D g dksJ |jdd	 |jdd	 |d
 |j|gd }|j|d |j|d |j|d |j|drJ |j|dsJ |j|d |j|drJ |j|dsJ dS )z5Test succesful blocking of tokens to be in an entity.I live in New YorkrF   rG   c                 S      g | ]}|j qS r!   ent_iob_r8   r   r!   r!   r"   r<   h      z-test_accept_blocked_token.<locals>.<listcomp>r   r   r   r   r   c                 S   r   r!   	ent_type_r   r!   r!   r"   r<   i  r   r1   r   r2   r   r   r   r0   
unmodifiedblockeddefaultc                 S   r   r!   r   r   r!   r!   r"   r<   ~  r   )r   r   r   r   r   c                 S   r   r!   r   r   r!   r!   r"   r<     r      zU-N)	r   rQ   rS   r   r[   
init_batchapply_transitionis_validset_ents)	nlp1r   rH   ner1state1rf   r   r   state2r!   r!   r"   test_accept_blocked_tokena  s@   

r   c            	   	   C   s   dddgifddg ifg} t  }g }| D ]}|t||d |d  q|jddd	}|d
 |  tdD ]}i }t	j
|dd}|D ]	}|j||d qHq;dS )z7Test that training an empty text does not throw errors.r   r   r   r   r   r/   rF   Tlastr   r      sizer{   N)r   appendr   rR   r~   rZ   r[   r\   r   r   	minibatchr   	r   rT   train_examplestrF   itnr{   batchesbatchr!   r!   r"   test_train_empty  s"   
"
r  c            	   
   C   s   dddgifg} t  }g }| D ]}|t||d |d  q|jddd}|d	 |  td
D ],}i }t	j
|dd}|D ]}tt |j||d W d   n1 s\w   Y  qCq6dS )zFTest that the deprecated negative entity format raises a custom error.r   r   )r   r   z!PERSONr   r/   rF   Tr   r   r   r   r   r   N)r   r   r   rR   r~   rZ   r[   r\   r   r   r   pytestraises
ValueErrorr   r   r!   r!   r"   test_train_negative_deprecated  s&   "
r
  c                  C   s   t  } | d |   | d}dd |D g dksJ dd |D g dks*J i }| jd|d}|jd	d
 |d |j|gd }|j|dsQJ |j|dsZJ |j	|d |j|dsjJ |j|dssJ d S )NrF   r   c                 S   r   r!   r   r   r!   r!   r"   r<     r   z(test_overwrite_token.<locals>.<listcomp>)r   r   r   r   r   c                 S   r   r!   r   r   r!   r!   r"   r<     r   r   rG   r1   r   r2   r   r   zU-GPEzI-GPEr   )
r   rZ   r\   rQ   rS   r   r[   r   r   r   )rT   r.   rH   r   stater!   r!   r"   test_overwrite_token  s    

r  c                  C   sL   t  } | d}|d |   | d}g d}dd |D |ks$J d S )NrF   MY_LABELz3John is watching the news about Croatia's elections)	r   r   r   r   r   r   r   r   r   c                 S   r   r!   r   r   r!   r!   r"   r<     r   z"test_empty_ner.<locals>.<listcomp>r   rZ   r[   r\   )rT   rF   r.   resultr!   r!   r"   test_empty_ner  s   

r  c                  C   s   t  } dddg}| d}| d}|d |   || | d}g d}g d	}d
d |D |ks8J dd |D |ksCJ dS )zLTest that an NER works after an entity_ruler: the second can add annotationsTHINGThisr   r   rF   r  *This is Antti Korhonen speaking in Finlandr   r   r   r   r   r   r   r  r   r   r   r   r   r   c                 S   r   r!   r   r   r!   r!   r"   r<     r   z)test_ruler_before_ner.<locals>.<listcomp>c                 S   r   r!   r   r   r!   r!   r"   r<     r   Nr   rZ   r[   r\   r   )rT   r   r   untrained_nerr.   expected_iobsexpected_typesr!   r!   r"   test_ruler_before_ner  s   



r  c                 C   sB   ddi}dt i}tj|ddd }t| |fi | t| | d S )Nupdate_with_oracle_cut_sized   r`   T)validate)r   r   resolver
   )r   rH   re   r`   r!   r!   r"   test_ner_constructor  s   r  c                  C   s   t  } | jddd}|d |   dddg}| d}|| | d	}g d
}g d}dd |D |ks:J dd |D |ksEJ dS )zTTest that an entity_ruler works after an NER: the second can overwrite O annotationsrF   uner)namer  r  r  r   r   r  r  r  c                 S   r   r!   r   r   r!   r!   r"   r<     r   z)test_ner_before_ruler.<locals>.<listcomp>c                 S   r   r!   r   r   r!   r!   r"   r<     r   Nr  )rT   r  r   r   r.   r  r  r!   r!   r"   test_ner_before_ruler  s   


r"  c                  C   s~   t  } | jddddd | d}|d |   | d}g d	}g d
}dd |D |ks2J dd |D |ks=J dS )zITest functionality for blocking tokens so they can't be in a named entityblockerr   r1   )startendrG   rF   r  z,This is Antti L Korhonen speaking in Finland)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   c                 S   r   r!   r   r   r!   r!   r"   r<     r   z"test_block_ner.<locals>.<listcomp>c                 S   r   r!   r   r   r!   r!   r"   r<     r   Nr  )rT   r  r.   r  r  r!   r!   r"   test_block_ner  s   

r&  	use_upperTFc                    sD  t    jddd| iid}g }tD ]\}}|t || |dD ]	}||d  q'q 	 }t
dD ]}i } j|||d q:|d d	k sOJ d
}	 |	}
|
j}t|dks`J |d jdksiJ |d jdksrJ t h} | t|}||	}|j}t|dksJ |d jdksJ |d jdksJ |d}|jjd | ksJ |d ||	}|j}t|dksJ |d jdksJ |d jdksJ W d    n1 sw   Y  g d}dd  |D }dd  |D }dd  fdd|D D }t|| t|| d}	 |	}
t|
dddddg|
_|
j}t|dks3J |d jdks=J |d jdksGJ |d jdksQJ  d|
}
|
j}t|dksdJ |d jdksnJ |d jdksxJ |d jdksJ |d jdksJ |d jdksJ |d jdksJ d S )NrF   r`   r'  rG   r   r   2   rz   r{   h㈵>I like London.r/   r   Londonr   	has_upperRANDOM_NEW_LABEL)zJust a sentence.z$Then one more sentence about London.zHere is another one.r+  c                 S      g | ]}| tgqS r!   to_arrayr   r8   r.   r!   r!   r"   r<   Q      z'test_overfitting_IO.<locals>.<listcomp>c                 S   r/  r!   r0  r2  r!   r!   r"   r<   R  r3  c                 S   r/  r!   r0  r2  r!   r!   r"   r<   S  r3  c                    s   g | ]} |qS r!   r!   )r8   textrT   r!   r"   r<   S  rx   zI like London and London.r0   i  )r;   kb_id)r   rZ   
TRAIN_DATAr   r   rR   r~   getr[   r\   r   r   r   r]   r4  label_r   to_diskr   load_model_from_pathr^   r`   ra   piper   r   r6  )r'  rF   r  r4  annotationsentr   rw   r{   	test_textr.   r   tmp_dirrf   r   ents2r   doc3ents3textsbatch_deps_1batch_deps_2no_batch_depsr!   r5  r"   test_overfitting_IO  sv   






rH  c                  C   s  d} d}t  }| |d}|jd|d}g }tD ]\}}|t||| |dD ]	}||d  q,q|	 }	i }
|j
||	|
d d	}||}|g}||}||d
 }tt|D ]%}|jD ]}|||d |f }d}d
| |  krd| ksJ  J qfqad S )Nr   r   r   r   rG   r   r   r)  r+  r   r/   r*  )r   rZ   r7  r   r   rR   r~   r8  r[   r\   r   predictscored_entsr   r]   r_   )r   r   rT   rH   rF   r  r4  r=  r>  r   r{   r?  r.   r   beamsentity_scoresjr;   scoreepsr!   r!   r"   test_beam_ner_scoresl  s8   


&rP  c                 C   s  t  }d}d}||| d}|jd|d}g }tD ]\}}|t||| |dD ]	}	||	d  q-q|	 }
t
dD ]}i }|j||
|d	 q@|d dk sUJ d
}||g}||}||d }|d dksqJ |d dksyJ t||jdksJ t 9}|| t|}||g}|d}||}||d }|d dksJ |d dksJ W d    n1 sw   Y  ||}t||}t|dddg|jj| < |g}t
dD ]}i }|j||
|d	 qt||jdksJ d S )Nr   r   r   r   r   r   rG   r   r   r(  r)  zI like Londonr   )r   r0   r   g      ?)r   r0   r   r   r/   r0   r   ry   )r   rZ   r7  r   r   rR   r~   r8  r[   r\   r   r   rI  rJ  r]   r   r   r:  r   r;  r^   r   	referencer   )r#   rT   r   r   rH   rF   r  r4  r=  r>  r   rw   r{   r?  r   rK  rL  r@  rf   docs2r   beams2entity_scores2neg_docneg_exneg_train_examplesr!   r!   r"   test_beam_overfitting_IO  sX   






rY  c                 C   s   t  }d}d}||| d}|jd|d}d}||}|d |d t|d	d
gi}t|jdddt|jdddt|jdddg|jj| < |	 }	t
dD ]}
i }|j|g|	|d qQdS )zCheck that the NER update works with a negative annotation that is a different label of the correct one,
    or partly overlapping, etcr   r   rQ  r   rG   r   r   ORGr   r   r   r   r0   r/   r)  N)r   rZ   r~   r[   r   rR   r   rR  r   r\   r   r   r#   rT   r   r   rH   rF   
train_textrV  rU   r   rw   r{   r!   r!   r"   test_neg_annotation  s,   


r]  c              	   C   s\  t  }d}d}||| d}|jd|d}d}||}|d |d t|d	d
gi}t|jdddg|jj| < t	|jj
dksEJ |jj
d jdksPJ |jj
d jdks[J t	|jj|  dksgJ |jj|  d jdkstJ |jj|  d jdksJ | }	tdD ]"}
i }tt |j|g|	|d W d    n1 sw   Y  qd S )Nr   r   rQ  r   rG   r   r   r   r   r   r   r   r/   r   z
Shaka Khanr)  )r   rZ   r~   r[   r   rR   r   rR  r   r]   r   r4  r9  r\   r   r  r  r	  r   r[  r!   r!   r"   test_neg_annotation_conflict  s8   


r^  c                 C   s   t  }d}d}||| d}|jd|d g d}g d}t|j|d}t|d	|i}t|jd
dd}	|	g|jj| < |	 }
t
dD ]}i }|j|g|
|d qBd|v sVJ dS )z/Regression test for previously flakey behaviourr   r   rQ  r   rG   )5FEDERALNATIONALMORTGAGEASSOCIATION(FannieMaez):Postedyieldson30yearmortgagecommitmentsfordeliverywithinri  daysrc  pricedatpar)z9.75%,standardconventionalfixed-rate	mortgages;z8.70ru  rv  z6/2r{  cappedonerz  rj  
adjustabler{  r|  r+   Source:TelerateSystemszInc.)5rk   rj   rj   rl   r   rk   rl   r   r   r   r   B-DATEL-DATEr   r   r   r   r   r  r  r   r   r   r   r   	B-PERCENT	L-PERCENTr   r   r   r   r   r   r   r   r  r  r   z
U-CARDINALr   r   r  zI-DATEr  r   r   r   r   r   r   r   r   r   r,   rF   r(  5   rZ  r1   r)  N)r   rZ   r   r%   r   rR   r   rR  r   r\   r   r   )r#   rT   r   r   rH   tokensiobr.   rU   neg_spanr   rw   r{   r!   r!   r"   test_beam_valid_parse  s&   r  c                 C   s   t  }|jtjv sJ t |j_t|jjrJ |d | 	t
j |  d| jv s/J W d    n1 s9w   Y  |   |jjd d|jjdd< | 	t
j |  d| jvsdJ W d    d S 1 sow   Y  d S )NrF   W033lexeme_normr   a)r   langr   LEXEME_NORM_LANGSr	   r%   lookupsr]   rZ   at_levelloggingDEBUGr\   r4  clear	add_table	get_table)caplogrT   r!   r!   r"   test_ner_warns_no_lookups"  s    

"r  r#  c                   @   s   e Zd ZdddZdd ZdS )BlockerComponent1
my_blockerc                 C   s   || _ || _|| _d S r$   )r$  r%  r!  )selfrT   r$  r%  r!  r!   r!   r"   __init__5  s   
zBlockerComponent1.__init__c                 C   s"   |j g || j| j gdd |S )Nr   r   )r   r$  r%  )r  r.   r!   r!   r"   __call__:  s   zBlockerComponent1.__call__N)r  )__name__
__module____qualname__r  r  r!   r!   r!   r"   r  3  s    
r  )Or  r   r  numpy.testingr   r   r   r   spacy.attrsr   spacy.lang.enr   spacy.lang.itr   spacy.languager   spacy.lookupsr	   spacy.pipeliner
   $spacy.pipeline._parser_internals.nerr   spacy.pipeline.nerr   spacy.tokensr   r   spacy.trainingr   r   r   spacy.vocabr   r   r7  fixturer#   r%   r.   r7   r@   rE   markparametrizeissuerV   rg   ru   r   r   r   r   r   r   r   r   r   r   skipr   r   r   r  r
  r  r  r  r  r"  r&  rH  rP  rY  r]  r^  r  r  factoryr  r!   r!   r!   r"   <module>   s    



	
















1

M#: