o
    icf                  
   @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
 d dlmZmZmZ d dlmZ d dlmZ d	d
lmZ ddlmZ ejdd Zejdd Zejddd Zejddd Zejddd Zejddd Zejddd Z ej!d d!gejd"d#d$ Z"ej!d%d&gejd"d'd( Z#ej!d)d*d+d,e $g d-fgejd"d.d/ Z%ej!d0d d e&d1d1fdd e&d2d2fd	e&d3e&d3e&d4 d4fd5gd6d7 Z'd8d9 Z(d:d; Z)d<d= Z*d>d? Z+d@dA Z,dBdC Z-dDdE Z.ej!dFg dGdHdI Z/dJdK Z0dLdM Z1dNdO Z2dPdQ Z3dRdS Z4dTdU Z5dVdW Z6dXdY Z7ej8dZd[d\ Z9d]d^ Z:d_d` Z;dadb Z<dcdd Z=dedf Z>dgdh Z?didj Z@dkdl ZAdmdn ZBdodp ZCdqdr ZDej!dsg dtdudv ZEdwdx ZFdydz ZGd{d| ZHd}d~ ZIdd ZJdS )    N)assert_array_equal)get_current_ops)LENGTHORTH)English)DocSpanToken)filter_spans)Vocab   )add_vecs_to_vocab   )clean_underscorec              	   C   sb   d}g d}g d}g d}| |}dd |D }dd |D }t |jdd |D |||||d	S )
N:This is a sentence. This is another sentence. And a third.)r   r      r   r      r      r   r      r   r   r   )nsubjROOTdetattrpunctr   r   r   r   r   r   r   npadvmodr   )Or   zB-ENTI-ENTr   r   r   r   r   r   r   r   r   r   c                 S      g | ]}|j qS  text.0tr   r   M/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/doc/test_span.py
<listcomp>       zdoc.<locals>.<listcomp>c                 S   s   g | ]}t |jqS r   )boolwhitespace_r!   r   r   r$   r%      s    c                 S   r   r   r   r!   r   r   r$   r%      r&   )wordsspacesheadsdepsentslemmasr   vocab)en_tokenizerr    r+   r,   r-   tokensr.   r*   r   r   r$   doc   s    r3   c                 C   s(   d}| |}t |jdd |D d}|S )Nr   c                 S   r   r   r   r!   r   r   r$   r%   ,   r&   z"doc_not_parsed.<locals>.<listcomp>r)   r/   r1   r    r2   r3   r   r   r$   doc_not_parsed(   s   r6   i  c                  C   s   d} t t t|  d}d|d _|dd D ]}|djdkr&d|_qd	|_qt|j}|d  }|d  }t	|t sBJ t	|t sIJ dS )
z)Test that Span.as_doc() doesn't segfault.z7The sky is blue . The man is pink . The dog is purple .r4   Tr   r   N.F)
r   r   listsplit
sent_startnborr    sentsas_doc
isinstance)stringr3   wordr=   sent0sent1r   r   r$   test_issue15370   s   

rD   iL  c                 C   s(   | d}|dd }|j |jksJ dS )z.Test that span.orth_ is identical to span.textzThe black cat purrs.r   r   N)orth_r    r1   r3   spanr   r   r$   test_issue1612B   s   rH   i  c                  C   sp   g d} t t | dgt|  dgt|  d}tt t|dd j W d   dS 1 s1w   Y  dS )zTest that Span.noun_chunks works correctly if no noun chunks iterator
    is available. To make this test future-proof, we're constructing a Doc
    with a new Vocab here and a parse tree to make sure the noun chunks run.
    )Thisisasentencer   depr)   r+   r,   r   N)r   r   lenpytestraisesNotImplementedErrorr9   noun_chunks)r)   r3   r   r   r$   test_issue3199J   s
   &"rT   i   c                  C   s   t  } | d}| d}| d}|dd }|dd }|dd }|d }tt ||dks4J W d    n1 s>w   Y  ||dksLJ tt ||dk s[J W d    d S 1 sfw   Y  d S )NzTalk about being boring!zTalk of being boring!Letr   r                 ?)r   rP   warnsUserWarning
similarity)nlpr    text_varyrG   span_2span_3tokenr   r   r$   test_issue5152V   s   "ra   ic  c                 C   s4   | d}|d d }|j dksJ |jdksJ d S )NzThis is a magnificent sentence.r    )text_with_wsr    rF   r   r   r$   test_issue6755i   s   rd   z!sentence, start_idx,end_idx,label)Welcome to Mumbai, my friend      GPEi  c                 C   8   t  }|| }|d d  j|||d}|j|ksJ d S )Nlabel)r   	char_spanlabel_)rL   	start_idxend_idxrk   r[   r3   rG   r   r   r$   test_issue6815_1q   s   rp   z!sentence, start_idx,end_idx,kb_id)re   rf   rg      c                 C   ri   )Nkb_id)r   rl   rs   )rL   rn   ro   rs   r[   r3   rG   r   r   r$   test_issue6815_2}   s   rt   z"sentence, start_idx,end_idx,vectorre   rf   rg   )g?g?g333333?c                 C   s<   t  }|| }|d d  j|||d}|j|k sJ d S )N)vector)r   rl   ru   all)rL   rn   ro   ru   r[   r3   rG   r   r   r$   test_issue6815_3   s   rw   zi_sent,i,j,textz	This is azThis is anotherzAnd za third)r   r   r   Nc                 C   s<   t | j}|| ||}|s|rJ d S |j|ksJ d S N)r9   r=   rl   r    )r3   i_sentijr    r=   rG   r   r   r$   test_char_span   s
   

r|   c                 C   s   d}d}d}| j dd|||d}| dd  j dd	|||d}|j|jks&J |j|j  kr3|ks6J  J |j|j  krC|ksFJ  J |j|j  krS|ksVJ  J d S )
NLABELKB_IDSPAN_ID   -   )rk   rs   span_idr      (   )rl   r    rm   kb_id_id_)r3   rk   rs   r   span1span2r   r   r$   test_char_span_attributes   s     $r   c                 C   s`   t | j}|d jdksJ |d jdksJ t|dksJ tdd |D t| ks.J d S )Nr   rq   r   c                 s   s    | ]}t |V  qd S rx   )rO   )r"   sentr   r   r$   	<genexpr>   s    z(test_spans_sent_spans.<locals>.<genexpr>)r9   r=   startendrO   sumr3   r=   r   r   r$   test_spans_sent_spans   s
   
"r   c                 C   sP   | dd }t |dksJ |jdksJ |jjdksJ |jjjdks&J d S )Nr      z
a sentencerL   rJ   )rO   r    rootheadr3   rG   r   r   r$   test_spans_root   s
   r   c                 C   s.   | dd }t |dksJ |jdksJ d S )Nr   r   zThis is a sentence)rO   r    r   r   r   r$   test_spans_string_fn   s   r   c                 C   sZ   d}g d}dgt | }| |}t|jdd |D ||d}|dd  jjdks+J d S )	Nz through North and South Carolina)r   r   r   r   r   rM   c                 S   r   r   r   r!   r   r   r$   r%      r&   z$test_spans_root2.<locals>.<listcomp>rN   Carolina)rO   r   r0   r   r    )r1   r    r+   r,   r2   r3   r   r   r$   test_spans_root2   s   r   c                 C   s4  t t| js	J | dd jjjdksJ | dd jjdks"J | dd jjjjdks0J | dt |  jt| jd ksBJ t| dt |  jt| jksTJ tt	 |dd j W d   n1 skw   Y  d	|d _
d	|d
 _
|dd j|dd
 ksJ |dd j|d
d ksJ dS )zTest span.sent propertyNr   rJ   zThis is a sentence.r      rI   r   Trq   r   r   
      )rO   r9   r=   r   r   r    	left_edgerP   rQ   
ValueErroris_sent_startr3   r6   r   r   r$   test_spans_span_sent   s   $$

"r   c                  C   sd   t t td d} d| d _d| d _d| d _dg| _| jd }t|j}t|dks0J d S )	Nz5This is a sentence . This is another sentence . Thirdr4   Tr   rq   r   )ENTITYr   	   r   )r   r   r9   r:   r   r-   r=   rO   )r3   entity	ent_sentsr   r   r$   test_issue13769   s   





r   zstart,end,expected_sentence))r   r   This is)r   r   r   )r   r   r   )r   r   r   )r   r   zAnd a)r   r   zthird.)r   r   r   c                 C   sZ   dd }|| j d< | || jj|ksJ dd | jd< | || j| || ks+J d S )Nc                        fddt dt dD S )Nc                       g | ]
} ||d   qS r   r   r"   iir3   r   r$   r%         zFtest_spans_span_sent_user_hooks.<locals>.user_hook.<locals>.<listcomp>r   r   rangerO   r   r   r   r$   	user_hook     z2test_spans_span_sent_user_hooks.<locals>.user_hookr=   c                 S   s   | S rx   r   xr   r   r$   <lambda>      z1test_spans_span_sent_user_hooks.<locals>.<lambda>r   )
user_hooksr   r    user_span_hooks)r3   r   r   expected_sentencer   r   r   r$   test_spans_span_sent_user_hooks   s
   
"r   c                 C   s  | d}t |jdd |D g ddgd d}|dd	  }|jd
ks&J |d dks.J |d dks6J |d dks>J |d dksFJ |dd  }|jdksUJ |d dks]J |d dkseJ |d d	ksmJ |d	d  }|jd
ks|J |d dksJ |d dksJ |d dksJ |d dksJ | d}t |jdd |D g ddgt| d}|dd  }t|tg dg dg dg dS )z!Test span's lca matrix generationzthe lazy dog sleptc                 S   r   r   r   r!   r   r   r$   r%     r&   z)test_spans_lca_matrix.<locals>.<listcomp>)r   r   r   r   rM   r   rN   Nr   )r   r   r   r   r   r   r   r7   )r   r   )r   r   r   )r   r   )r   r   zI like New York in Autumnc                 S   r   r   r   r!   r   r   r$   r%   1  r&   )r   r   r   r   r   r   )r   r   r   )r   r   r   )r   r   r   )r   r0   get_lca_matrixshaperO   r   numpyasarray)r1   r2   r3   lcar   r   r$   test_spans_lca_matrix  sB   &r   c                  C   s   t t g dd} | d d }| dd  }tt+ ||dks$J || dks-J |d d | jd dks=J W d    d S 1 sHw   Y  d S )N)rK   brK   r   r4   r   rW   rV   r   rK   )r   r   rP   rX   rY   rZ   r0   r3   r   r   r   r   r$   test_span_similarity_match9  s   ""r   c                 C   s   d}| |}d|j |d j _d|j |d j _t|j dd |D d}|d	d jd
ks/J |dd	 jdks:J |d	d jdksEJ d	S ):Test span.sentiment property's default averaging behaviourgood stuff bad stuff      @r          r   c                 S   r   r   r   r!   r   r   r$   r%   I  r&   z0test_spans_default_sentiment.<locals>.<listcomp>r4   Ng      ?r   g      r7   gUUUUUU?)r0   r    	sentimentr   r5   r   r   r$   test_spans_default_sentimentC  s   r   c                 C   s   d}| |}d|j |d j _d|j |d j _t|j dd |D d}d	d
 |jd< |dd jdks6J |dd jdksAJ |dd jdksLJ dS )r   r   r   r   r   r   c                 S   r   r   r   r!   r   r   r$   r%   U  r&   z1test_spans_override_sentiment.<locals>.<listcomp>r4   c                 S   s   dS )N      $@r   )rG   r   r   r$   r   V  r   z/test_spans_override_sentiment.<locals>.<lambda>r   Nr   r   r7   )r0   r    r   r   r   r5   r   r   r$   test_spans_override_sentimentO  s   r   c                 C   s\   d}| |}|dd }|dd }t |t |ksJ |dd }t |t |ks,J dS )zTest spans can be hashed.r   Nr   r   r   )hash)r1   r    r2   r   r   span3r   r   r$   test_spans_are_hashable\  s   r   c                 C   s  | dd }| j |j|jdd}|j|jksJ |j|jks J |jdks'J | j |j|jddd}|j|jks:J |j|jksBJ |jdksIJ | j |jd |jddd}|j|jks^J |j|jksfJ |jdksmJ | j |jd |jdd	d}|j|jksJ |j|jksJ |jdksJ tt | j |jd |jdd
d}W d    n1 sw   Y  | dd j |jd |jddd}|j|jksJ |j|jksJ |jdksJ d S )Nr   r   rh   rj   strict)rk   alignment_moder   contractexpandunkr   r   )rl   
start_charend_charrm   rP   rQ   r   r   r   r   r$   test_spans_by_characterg  sD   r   c                 C   sb   | dd }| ttg}|jt|dfksJ |d |d jks#J |d t|d ks/J d S )Nr   r   r   r   r   r   )to_arrayr   r   r   rO   orth)r3   rG   arrr   r   r$   test_span_to_array  s
   r   c                 C   s   | dd }|  }|j|j ksJ t|| jsJ || us"J |d jdks+J t|jdks4J | dd   }t|jdksEJ | dd   }t|jdksVJ d S )Nr   r   r   r   r   rq   )r>   r    stripr?   	__class__idxrO   r-   )r3   rG   span_docr   r   r$   test_span_as_doc  s   r   r   c                 C   s   d}d}|| j |< tjddd d| d j_| dd	 }|jdd
}| }| j |d|u s1J |j |d|u s<J |j |ddu sGJ tt|D ]}|dkr^|| jjdu s]J qM|| jjdu shJ qMt	dd |D rtJ dS )z>Test that the user_data can be preserved (but not by default).my_infoiV  is_xF)defaultTr   r   r   )copy_user_dataNr   c                 S   s   g | ]}|j jqS r   )_r   r!   r   r   r$   r%     s    z.test_span_as_doc_user_data.<locals>.<listcomp>)
	user_datar	   set_extensionr   r   r>   getr   rO   any)r3   my_keymy_valuerG   span_doc_withspan_doc_withoutrz   r   r   r$   test_span_as_doc_user_data  s    
r   c                 C   ^   t | ddddd}|jdksJ |j| jjd ksJ |jdks"J |j| jjd ks-J d S )Nr   r   helloQ342)rk   rs   )r   rm   rk   r0   stringsr   rs   r   r   r   r$   test_span_string_label_kb_id  
   r   c                 C   r   )Nr   r   r   r   )rk   r   )r   rm   rk   r0   r   r   idr   r   r   r$   test_span_string_label_id  r   r   c                 C   s"   t | dd}d|_d|_d|_d S )Nr   r   rk   rs   r   )r   rm   r   r   r   r   r   r$   test_span_attrs_writable  s   
r   c                 C   s  | j jd ddf| j jd ddf| j jd ddfg| _tt| jdks&J t| j}t|dks3J t|d jdks>J |d jd jd	ksJJ |d jd jdksVJ |d jd jdksbJ |d jd j	dksnJ t|d jdksyJ |d jd jd
ksJ |d jd jdksJ |d jd jdksJ |d jd j	dksJ |d jd jdksJ |d jd jdksJ |d jd jdksJ |d jd j	dksJ d S )NPRODUCTr   r   r   r   rf   r   r   rI   anotherr   za third.)
r0   r   r-   rO   r9   r=   r    rm   r   r   )r3   	sentencesr   r   r$   test_span_ents_property  s*   
r   c                 C   s  | dd | dd | dd | dd g}t |}t|dks"J |d jdkr0|d jdks2J |d jdkr@|d jdksBJ |d	 jdkrP|d	 jdksRJ | dd | dd | d
d | dd | dd g}t |}t|d	ksyJ t|d dksJ t|d d
ksJ |d jdkr|d jdksJ |d jd
kr|d jdksJ | dd | d	d
 | d
d | dd | dd g}t |}t|d	ksJ t|d dksJ t|d d
ksJ |d jdkr|d jdksJ |d jd
kr|d jdks
J d S )Nr   r   r   r   r   r   r   r   r   rq   r   r   )r
   rO   r   r   )r3   spansfilteredr   r   r$   test_filter_spans  s(   ,   6  6 (r   c                 C   s   | dd | dd ksJ | dd | dd ksJ | dd |dd ks*J t | dd t | dd ks<J t | dd t | dd ksNJ t | dd t |dd ks`J | dt|  | t| t| d  ksvJ d S )Nr   r   r   r   )r   rO   r   r   r   r$   test_span_eq_hash  s   $$$0r   c           	      C   s  d}d}| || }t ||D ]}|||  | | ksJ qtt |d  W d    n1 s2w   Y  tt |d  W d    n1 sKw   Y  | dd }|jdks]J |jdksdJ |jdkskJ |jdksrJ |jdksyJ | dd }|jdksJ |jdksJ |jdksJ |j|jksJ | t	|  d t	|  d  }|jdksJ |jdksJ |jdksJ |jdksJ |jdksJ | t	| d t	| d  }|jdksJ |jt	| ksJ |jt	| ksJ |jt	| jksJ |jt	| jksJ d S )Nr   rq   r   rb   r   )
r   rP   rQ   
IndexErrorr    r   r   r   r   rO   )	r3   r   r   rG   rz   empty_span_0empty_span_1oob_span_startoob_span_endr   r   r$   test_span_boundaries#  sD   

 r  c                 C   s4   | dd }t |jdt |jdksJ d S )Nr   rq    )rO   r    r:   lemma_)r3   spr   r   r$   test_span_lemmaJ  s   (r  c                 C   sZ   | d}|dd }|j drJ tt |j W d    d S 1 s&w   Y  d S )Nz7Check span.sent raises error if doc is not sentencized.r   r   
SENT_START)r3   has_annotationrP   rQ   r   r   rF   r   r   r$   	test_sentP  s   "r  c              	   C   s   t  }| jj}d|g dfd|g dfd|g dfd|g dfd	|g d
fg}t| j| t|| dd jt	d t|| dd jt	d t|| dd jg d || j_d S )Napple)r   r   r   orange)r7   r   And)r7   r7   r7   juice)rq   rq   r   pie)r   g333333@g!@r   )r   r   r   rf   )
r   r0   vectorsr   r   r   to_numpyru   r   zeros)r3   opsprev_vectorsr  r   r   r$   test_span_with_vectorsX  s     r  c                 C   sB  t | ddt | ddksJ t | dddt | dddksJ t | dddddt | dddddks2J t | ddt | dddksAJ t | ddt | dddddksRJ t | dddt | dddddksdJ t | ddt | ddkr|t | ddt | ddks~J t | dddt | dddkrt | dddt | dddksJ t | dddddt | dddddksJ t | dddddt | dddddksJ t | ddt | ddddd  k rt | ddd  k rt | dddddk sJ  J t | ddt | ddddd  krt | ddd  krt | dddddksJ  J t | dddddt | ddd  krEt | ddddd  krEt | ddksHJ  J t | dddddt | ddd  krrt | ddddd  krrt | ddksuJ  J t | dddddt | dddddk sJ t | dddddt | ddk sJ t | dddddt | ddksJ t | ddt | dddddksJ t | ddt | dddddksJ t | dddddt | dddddksJ t | dddddt | ddk sJ t | dddddt | ddksJ t | ddt | dddddksJ t | ddt | dddddks/J t | dddddt | dddddksDJ t | dddddt | ddk sVJ t | dddddt | ddkshJ t | ddt | dddddkszJ t | ddt | dddddksJ t | ddd	d
t | dddd
k sJ d S )Nr   r   r}   r~   rr   rb   r   r   AAA)r   BBB)r   r   r   r   r$   test_span_comparisonm  s<    ("$4<((TZZZ*$$$$*$$$$*$$$$*r  z9start,end,expected_sentences,expected_sentences_with_hook))r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )rf   r   r   r   )r   r   r   r   c                 C   s   t t| || j|ksJ dd }|| jd< t t| || j|ks'J dd | jd< t| || jd | || ksAJ t t| || jdksPJ d S )Nc                    r   )Nc                    r   r   r   r   r   r   r$   r%     r   z6test_span_sents.<locals>.user_hook.<locals>.<listcomp>r   r   r   r   r   r   r$   r     r   z"test_span_sents.<locals>.user_hookr=   c                 S   s   | gS rx   r   r   r   r   r$   r     s    z!test_span_sents.<locals>.<lambda>r   r   )rO   r9   r=   r   r   )r3   r   r   expected_sentencesexpected_sentences_with_hookr   r   r   r$   test_span_sents  s   
&"r   c                 C   sB   t t tt| ddj W d    d S 1 sw   Y  d S )Nr   r   )rP   rQ   r   r9   r   r=   )r6   r   r   r$   test_span_sents_not_parsed  s   "r!  c                 C   s   | dd | dd g| j d< t| j d dksJ |  }t|j d dks)J | j d | dd  t| j d dks@J t|j d dksKJ d S )Nr   r   r   r   testr   )r   rO   copyappend)r3   doc_copyr   r   r$   test_span_group_copy  s   r&  c                  C   s^   t t jg dg dd} | t| dddg t| j| jd jD ]
\}}||ks,J q"dS )	zSpans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
    which this tests for.
    )zMahler'sSymphonyzNo.8wasz
beautiful.)r   r   r   r   r   r   r)   sent_startsr   r   WORKr   N)r   r   r0   set_entsr   zipr=   r-   )r3   doc_sentent_sentr   r   r$   test_for_partial_ent_sents  s   r1  c                  C   s   t t jg dg dd} | t| dddg t| jd j}t|dks(J t	|d t	| jd j
  kr=d	ks@J  J d
S )zySpan.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full
    sentence.
    )rI   rJ   rK   ztest.r   )r   r   r   r   r   r*  r   rq   r,  r   r   r   N)r   r   r0   r-  r   r9   r-   r=   rO   strr   r   r   r   r$   test_for_no_ent_sents  s   4r3  c                 C   st   | d}| d}|dd |d krJ |dd |d kr J |dd |dd kr.J |dd |kr8J d S )Nza bzb cr   r   r   r   )r1   doc1doc2r   r   r$   test_span_api_richcmp_other  s   r6  )Kr   rP   numpy.testingr   	thinc.apir   spacy.attrsr   r   spacy.lang.enr   spacy.tokensr   r   r	   
spacy.utilr
   spacy.vocabr   utilr   test_underscorer   fixturer3   r6   markissuerD   rH   rT   ra   rd   parametrizerp   rt   arrayrw   rO   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   usefixturesr   r   r   r   r   r   r   r  r  r  r  r  r   r!  r&  r1  r3  r6  r   r   r   r$   <module>   s    















			
)
/

'3
