o
    iJ                     @   s   d dl Z d dlmZ d dlmZmZ d dlmZ dd Zdd Z	d	d
 Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Ze jd%d&d'id(d'id)d'id*ggd+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 ZdS )5    N)LEMMA)DocToken)Vocabc                 C   sN  d}ddddd}| |}t |dksJ | }|j|dd	 |d
 |j|d	d |d
 W d    n1 s:w   Y  t |dksGJ |d jdksPJ |d jdksYJ |d jdksbJ |d jdkskJ t|d jdksvJ |d jdksJ |d jdksJ |d jdksJ t|d jdksJ |d jdksJ d S )N-WKRO played songs by the beach boys all nightNAMEDr   TYPEzNumber=Plur)taglemmaent_typemorph	         attrs   the beach boyszthe beach boys    z	all night)	len
retokenizemergetexttext_with_wstag_lemma_strr   )en_tokenizerr   r   docretokenizer r    Y/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/doc/test_retokenize_merge.pytest_doc_retokenize_merge   s.   
r"   c                 C   s   d}dddd}| |}t |dksJ | }|j|dd |d	 W d
   n1 s.w   Y  |D ]%}|j|jjk rI|t|jjv sHJ q5|j|jjkrZ|t|jjv sZJ q5d
S )z3Test that attachments work correctly after merging.r   r   r   r   r	   r
   r   r   r   r   r   N)r   r   r   iheadlistleftsrights)r   r   r   r   r   wordr    r    r!   "test_doc_retokenize_merge_children"   s   
r*   c                 C   sr   d}| |}|  %}|j|dd dddd |j|dd	 dddd W d    d S 1 s2w   Y  d S )
Nz through North and South Carolina   r    ORGr
   r   r         )r   r   r   r   r   r   r    r    r!   test_doc_retokenize_merge_hang1   s   
"r2   c                 C   sh   | d}|  }||dd  W d    n1 sw   Y  t|dks)J |d jdks2J d S )Nr   r   r   r   )r   r   r   r   )r   r   r   r    r    r!   test_doc_retokenize_retokenizer9   s   
r3   c                 C   s   | d}t dd|jjd i}| }|j|dd |d W d    n1 s(w   Y  t|dks5J |d jdks>J |d jdksGJ |d jdksPJ d S )	Nr   boysENT_TYPEr-   r   r   r   r   )	r   vocabstringsr   r   r   r   r   	ent_type_)r   r   r   r   r    r    r!   %test_doc_retokenize_retokenizer_attrsA   s   
r9   c                 C   s   | d}t dd |D rJ | }|j|dd dddd	 W d
   n1 s,w   Y  |d jdks:J |d jdksCJ |d jsJJ t|jdgd}|d jsYJ d
S )z8Test that lexical attributes can be changed (see #2390).zWKRO played beach boys songsc                 s       | ]}|j V  qd S Nis_stop).0tokenr    r    r!   	<genexpr>P       z0test_doc_retokenize_lex_attrs.<locals>.<genexpr>r0   r   r4   T)r   IS_STOPr   Nz
beach boyswordsr   )anyr   r   r   r   r=   r   r6   )r   r   r   new_docr    r    r!   test_doc_retokenize_lex_attrsM   s   
rG   c                 C   s  d}g d}dgt | }| |}t|jdd |D ||d}t |dks'J |d jjd	ks1J |d
 jjdks;J | }dddd}|j|dd |d W d    n1 s[w   Y  t |dkshJ |d jdksqJ |d jjdks{J |d jdksJ d S )NLos Angeles start.r/   r0   r0   r0   depc                 S      g | ]}|j qS r    r   r>   tr    r    r!   
<listcomp>_       z:test_doc_retokenize_spans_merge_tokens.<locals>.<listcomp>rD   headsdepsr   r   Angelesr/   startNNPLos AngelesGPEr#   r0   r   r+   )r   r   r6   r%   r   r   r   r8   )r   r   rR   rS   tokensr   r   r   r    r    r!   &test_doc_retokenize_spans_merge_tokensZ   s    
rZ   c           	   	   C   s  g d}dd |D }g d}dgt | }g d}g d}t| ||||||d}t |d	ks1J |d
 jdks:J |d
 jdksCJ |d
 jdksLJ |d
 jdksUJ | }||d
d  W d    n1 smw   Y  t |dkszJ |d
 jdksJ |d
 jdksJ |d
 jdksJ |d
 jdksJ t| ||||||d}t |d	ksJ |d
 jdksJ |d
 jdksJ |d
 jdksJ |d
 jdksJ | }||d
d  ||dd	  W d    n1 sw   Y  t |dksJ |d
 jdksJ |d
 jdksJ |d
 jdks"J |d
 jdks,J |d jdks6J |d jdks@J |d jdksJJ |d jdksTJ d S )N)TheplayersrU   .c                 S   s   g | ]}|  qS r    )lowerrM   r    r    r!   rO   n   s    zHtest_doc_retokenize_spans_merge_tokens_default_attrs.<locals>.<listcomp>rI   rJ   )DTNNVBZr]   )DETNOUNVERBPUNCT)rD   tagsposrR   rS   lemmasr   r   r[   r_   rb   ther0   r+   zThe playersr`   rc   zthe playersr/   zstart .ra   rd   )r   r   r   r   pos_r   r   r   )	en_vocabrD   rh   rR   rS   rf   rg   r   r   r    r    r!   4test_doc_retokenize_spans_merge_tokens_default_attrsl   sV   

rl   c                 C   s  g d}g d}dgt | }t| |||d}t |dksJ | }|d jddd	}|j|d
d |d W d    n1 sBw   Y  t |dksOJ |d jjdksYJ |d jjdkscJ |d jjd
ksmJ |d
 jjdkswJ |d jjdv sJ |d jjdksJ d S )N)Ifoundapilatesclassnearworkr]   )r/   r/   r   r   r/   r   r   r/   rJ   rQ      r   zpilates classOr#   r+   r   r   r   r   r/   r0   )r/   r+   )r   r   r   r   r   r%   r$   )rk   rD   rR   rS   r   r   r   r    r    r!   %test_doc_retokenize_spans_merge_heads   s    
rv   c              	   C   s   d}| |}t tC | &}|j|dd ddddd |j|dd	 ddddd W d    n1 s9w   Y  W d    d S W d    d S 1 sQw   Y  d S )
NrH   r   r0   rV   rW   rX   r#   r   r/   )pytestraises
ValueErrorr   r   r1   r    r    r!   ,test_doc_retokenize_spans_merge_non_disjoint   s    




"rz   c           	      C   s  d}g d}dgt | }| |}t|jdd |D ||d}|d jjdks)J | }d	d
dd}|j|dd |d W d    n1 sIw   Y  |d jjdksXJ d}g d}dgt | }| |}t|jdd |D ||d}| }|jD ]}|j|j	|jd}|j||d qW d    n1 sw   Y  d}g d}dgt | }| |}t|jdd |D ||d}| }|jD ]}|| qW d    d S 1 sw   Y  d S )N.displaCy is a parse tool built with Javascript)r/   r/   r   r   r/   r   r   r   rJ   c                 S   rK   r    rL   rM   r    r    r!   rO      rP   z6test_doc_retokenize_span_np_merges.<locals>.<listcomp>rQ   r   r/   NPtoolru   r#   r0   r   r   zmdisplaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript.)r/   r/   
   r   r+   r+   r   r~   r   r~   r/   r~            r   r/   c                 S   rK   r    rL   rM   r    r    r!   rO      rP   zFOne test with entities like New York City so the ents list is not void)r/   r/   r/   r0   r+   r   r   r   r   r   r   r   r/   r   r   c                 S   rK   r    rL   rM   r    r    r!   rO      rP   )
r   r   r6   r%   r$   r   r   entslabel_r   )	r   r   rR   rS   rY   r   r   r   entr    r    r!   "test_doc_retokenize_span_np_merges   s@   




"r   c                 C   s  d}g d}dgt | }g d}g d}dgt | }d|d< d	|d
< d|d< d|d< d	|d< | |}t|jdd |D ||||d}t |dksLJ | %}|jD ]}	tdd |	D }
|	jj|
d}|j|	|d qTW d    n1 sxw   Y  t |dksJ d S )NzPStewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.
)r/   r0   r0   r   r   r   r0   rt   r   rt   r   rt   rt      r   r0      rJ   )rV   rV   ra   r_   VBRPr`   WPra   INrV   CCra   rV   rV   r]   SP))PERSONr   r0   )rX   r~   r   )r   r   r   ru   zB-PERSONr   zI-PERSONr/   zB-GPEr~   r   r   c                 S   rK   r    rL   rM   r    r    r!   rO      rP   z:test_doc_retokenize_spans_entity_merge.<locals>.<listcomp>)rD   rR   rS   rf   r      c                 s   r:   r;   )r8   )r>   wr    r    r!   r@      rA   z9test_doc_retokenize_spans_entity_merge.<locals>.<genexpr>r.   r   r   )	r   r   r6   r   r   maxrootr   r   )r   r   rR   rS   rf   r   rY   r   r   r   r   r   r    r    r!   &test_doc_retokenize_spans_entity_merge   s:   

r   c                 C   s  g d}t t |d}|jjdddf|jjdddfg|_|d jdks)J |d	 jd
ks2J |d jd
ks;J |d jdksDJ | }||dd  W d    n1 s\w   Y  t	|t	|d	 ksmJ |d jdksvJ |d	 jd
ksJ g d}t t |d}| #}dd	d}|j|dd |d |j|dd |d W d    n1 sw   Y  |d jdksJ |d	 jd
ksJ g d}t t |d}|jjdddf|jjdddfg|_|d jdksJ |d jd
ksJ |d jdksJ |d jd
ksJ | #}||dd  ||dd  ||dd  W d    n	1 s<w   Y  t	|dksJJ |d jdksTJ |d j
dks^J |d jdkshJ |d j
dksrJ g d}g d}dgt	| }d|d< d|d< d|d< d|d< dgt	| }| jd | jd | jd t | ||||d}|dd j|d ksJ |dd j|d ksJ | #}||dd  ||dd  ||dd  W d    n	1 sw   Y  t	|dksJ |d jdksJ |d j
dksJ |d jd
ks&J |d j
dks0J |d jdks:J |d j
dksDJ g d}g d}dgt	| }d|d< d|d< d|d< d|d< dgt	| }t | ||||d}| }||dd  ||dd  W d    n	1 sw   Y  t	|dksJ |d jdksJ |d j
dksJ |d jdksJ |d j
dksJ d S )N)ro   bcderC   zent-abcr   r+   zent-dr   Br/   rm   r0   )r   ent_iobr   r   )	ro   r   r   r   r   fghr$   zent-dezent-fgr   r   r   )	r   r   r+   r   r   r   r   r   r   ru   zB-ent-dezI-ent-dezB-ent-fgzI-ent-fgrJ   )rD   rR   rS   r   )	r   r   r+   r   r   r   r   r   r   )r   r   r6   r7   addr   ent_iob_r   r   r   r8   r   )rk   rD   r   r   r   rR   r   rS   r    r    r!   *test_doc_retokenize_spans_entity_merge_iob   s   





r   c                 C   s   d}g d}g d}| |}t |jdd |D ||d}t|j\}}t|}t|}	| #}
ddd}|
j|d	d
 |d |
j|dd  |d W d    n1 sUw   Y  t|j\}}t||d kskJ t||	d ksuJ d S )NzOStewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale.)r/   r0   r0   r   r0   r   r   r0   r   r   r   r~   r   r   r   r   r   )compoundnsubjROOTdetamodprtattrpunctr   r   preppobjccconjr   dobjr   c                 S   rK   r    rL   rM   r    r    r!   rO   d  rP   zItest_doc_retokenize_spans_sentence_update_after_merge.<locals>.<listcomp>rQ   noner.   r   r0   r   r/   )r   r6   r&   sentsr   r   r   )r   r   rR   rS   rY   r   sent1sent2init_len	init_len2r   r   r    r    r!   5test_doc_retokenize_spans_sentence_update_after_merge[  s    

r   c           
      C   s   d}g d}g d}| |}t |jdd |D ||d}t|jd }tt|jj}| }ddd	}	|j|dd
 |	d W d    n1 sJw   Y  tt|jj|d ks]J d S )NzNStewart Lee is a stand up comedian who lives in England and loves Joe Pasquale)r/   r0   r0   r   r   r   r0   rt   r   rt   r   rt   rt   r   r   )r   r   r   r   r   r   r   r   relclr   r   r   r   r   r   c                 S   rK   r    rL   rM   r    r    r!   rO   z  rP   z@test_doc_retokenize_spans_subtree_size_check.<locals>.<listcomp>rQ   r   r   r.   r0   r   r/   )	r   r6   r&   r   r   r   subtreer   r   )
r   r   rR   rS   rY   r   r   r   r   r   r    r    r!   ,test_doc_retokenize_spans_subtree_size_checkq  s   

 r   c                 C   s  t jdddd t jdddd t| g dd}| }d	dd
dd}|j|dd |d W d    n1 s:w   Y  |d jd	ksHJ |d jjdu sRJ |d jjd
ks\J t| g dd}| (}|j|dd ddd
did |j|dd dd ddid W d    n1 sw   Y  |d jjdu sJ |d jjd
ksJ |d jjd u sJ |d jjdksJ d S )Nro   FT)defaultforcer   nothinghelloworld!rC   hello world1)ro   r   )r
   _r   r0   r   )r   r   r   r   r   r   2r/   )	r   set_extensionr   r   r   r   r   ro   r   )rk   r   r   r   r    r    r!   )test_doc_retokenize_merge_extension_attrs  s&   

 "r   underscore_attrsro   xr   r   r/   c              	   C   s   t jddd dd t jddd dd t| g d	d
}d|i}tt0 | }|j|dd |d W d    n1 s@w   Y  W d    d S W d    d S 1 sXw   Y  d S )Nro   c                 S      | S r;   r    r   r    r    r!   <lambda>      zCtest_doc_retokenize_merge_extension_attrs_invalid.<locals>.<lambda>T)getterr   r   c                 S   r   r;   r    r   r    r    r!   r     r   )methodr   r   rC   r   r   r0   r   )r   r   r   rw   rx   ry   r   r   )rk   r   r   r   r   r    r    r!   1test_doc_retokenize_merge_extension_attrs_invalid  s   
"r   c                 C   s  t | g dd}tdd |D rJ | }|j|dd ddd	d
 W d   n1 s0w   Y  |d jdks>J |d jsEJ t | g dd}tdd |D rXJ tdd |D rcJ | "}|j|dd ddid
 |j|dd ddid
 W d   n1 sw   Y  |d jsJ |d jsJ |d jrJ |d jrJ t | g dd}|d jdksJ | }|j|dd ddid
 W d   n1 sw   Y  |d jdksJ | d jdksJ dS )a  Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean that
    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
    here is acceptable. Also see #2390.
    r   rC   c                 s   r:   r;   r<   rM   r    r    r!   r@     rA   z7test_doc_retokenizer_merge_lex_attrs.<locals>.<genexpr>r   r0   r   T)r
   r=   r   N)einszweir   r   c                 s   r:   r;   )like_numrM   r    r    r!   r@     rA   c                 s   r:   r;   r<   rM   r    r    r!   r@     rA   r   r   r=   r/   r   normr   )r   rE   r   r   r   r=   r   norm_rk   r   r   r    r    r!   $test_doc_retokenizer_merge_lex_attrs  s2   


r   c                 C   s   t | g dd}| }||dd  ||dd  W d   n1 s)w   Y  t|dks6J |d jdks?J dS )zsTest that the retokenizer automatically skips duplicate spans instead
    of complaining about overlaps. See #3687.r   rC   r   r0   Nr   )r   r   r   r   r   r   r    r    r!   test_retokenize_skip_duplicates  s   
r   c              	   C   s   t | g dd}tt. | }||dd  W d    n1 s&w   Y  W d    d S W d    d S 1 s>w   Y  d S )Nr   rC   r/   )r   rw   rx   ry   r   r   r   r    r    r!   $test_retokenize_disallow_zero_length  s   
"r   c                 C   s  d}g d}| |}t |jdd |D |d}tt|jdks"J | }||dd  W d    n1 s:w   Y  tt|jdksJJ t |jd	d |D |d}tt|jdksbJ | }||dd
  W d    n1 szw   Y  |d jd u sJ t |jdd |D |d}tt|jdksJ | }|j|dd
 ddid W d    n1 sw   Y  tt|jdksJ d S )Nr{   )r/   r   r   r   r/   r   r   r   c                 S   rK   r    rL   rM   r    r    r!   rO     rP   zGtest_doc_retokenize_merge_without_parse_keeps_sents.<locals>.<listcomp>)rD   sent_startsr0   r/   r+   c                 S   rK   r    rL   rM   r    r    r!   rO     rP   r   c                 S   rK   r    rL   rM   r    r    r!   rO     rP   
sent_startTr   )r   r6   r   r&   r   r   r   is_sent_start)r   r   r   rY   r   r   r    r    r!   3test_doc_retokenize_merge_without_parse_keeps_sents  s*   


r   )rw   spacy.attrsr   spacy.tokensr   r   spacy.vocabr   r"   r*   r2   r3   r9   rG   rZ   rl   rv   rz   r   r   r   r   r   r   markparametrizer   r   r   r   r   r    r    r    r!   <module>   s4    , !a"

!