o
    i*                  
   @   s   d dl Z d dlZd dlmZmZ d dlmZ ejddd Z	dd Z
d	d
 Zdd Zdd Zdd Zdd Zdd Zdd Zejdddii gddii gddii gddiddigdddddigddigdd Zdd  Zd!d" Zd#d$ ZdS )%    N)DocToken)Vocabi  c                 C   s&  g d}t jddgddgddgdd	gd
dgddggdd}t| |d}||_g d}dd |D |ks5J g d}t|D ]	\}}||| _q=dd |D |ksRJ dd |D }t|t|kscJ | +}	|d df|d g}
ddgddgddgd}|	j|d ddg|
|d W d    n1 sw   Y  g d}d d |D |ksJ g d}d!d |D |ksJ d"d |D }t|t|ksJ |d# 	 |d# 	 ksJ |d 	 |d 	 ksJ |d 	 |d 	 ksJ |d$ 	 |d% 	 ksJ |d% 	 |d& 	 ksJ d S )'N)IliveinNewYorkrightnowg      ?g?g       @g @g      @g@g      @gffffff@g      @gffffff@g      @gffffff@f)dtypewordsc                 S      g | ]}|j qS  text.0tokenr   r   Y/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/doc/test_retokenize_split.py
<listcomp>       z"test_issue3540.<locals>.<listcomp>c                 S   r   r   lemma_r   r   r   r   r      r   c                 S   r   r   vectorr   r   r   r   r      r            PROPNNewYorkpobjcompound)POSLEMMADEP)headsattrs)r   r   r   r!   r"   r	   r
   c                 S   r   r   r   r   r   r   r   r   $   r   c                 S   r   r   r   r   r   r   r   r   &   r   c                 S   r   r   r   r   r   r   r   r   '   r   r            )
numpyasarrayr   tensor	enumerater   len
retokenizesplittolist)en_vocabr   r/   doc	gold_text
gold_lemmailemma	vectors_1retokenizerr(   r)   	vectors_2r   r   r   test_issue3540   sD   &
	"r>   c              
   C   s  g d}g d}dgt | }t| |||d}t |dksJ t t|dks)J |d jjdks3J |d	 jjd
ks=J | -}|j|d ddg|d d	f|d	 gdgd ddgdgd dgd dd W d    n1 sqw   Y  t |dks~J |d jdksJ |d jjdksJ |d jdksJ t|d jdksJ |d	 jdksJ |d	 jdksJ |d	 jjdksJ t|d	 jdksJ |d jdksJ |d jjd
ksJ |d jd
ksJ |d jjd
ksJ t t|dksJ d S )N
LosAngelesstart.r   r   r   depr   r(   depsr      r   rA   r   rB   LosAngelesNNPr   GPEzNumber=Sing)tagr:   ent_typemorphr)   r*   )	r1   r   strheadr   r2   r3   idxrN   )r5   r   r(   rF   r6   r<   r   r   r   test_doc_retokenize_split0   sD   
rS   c                 C   sT  g d}g d}dgt | }t| |||d}| }||d ddg|d df|d g W d    n1 s9w   Y  |d jd	ksGJ |d jd	ksPJ g d}g d}dgt | }t| |||d}|D ]}d
|_qi| }||d ddg|d df|d g W d    n1 sw   Y  |d jdksJ |d jdksJ d S )Nr?   rC   rD   rE   r   rH   rI   r    a)r1   r   r2   r3   r   )r5   r   r(   rF   r6   r<   tr   r   r    test_doc_retokenize_split_lemmasU   s8   

rW   c                 C   s   t | g dd}|jjd}|jjd}| !}|j|d ddg|d df|d gd	||gid
 W d    n1 s>w   Y  |d j|ksLJ |d j|ksUJ d S )Nr?   r   amodsubjectr   rH   rI   r   rD   rO   )r   vocabstringsaddr2   r3   rD   )r5   r6   dep1dep2r<   r   r   r   &test_doc_retokenize_split_dependenciesu   s   

r_   c              
   C   s  t | g dd}tt* | }||d ddg|d g W d    n1 s+w   Y  W d    n1 s:w   Y  tt9 | }||d ddg|d |d |d g W d    n1 shw   Y  W d    d S W d    d S 1 sw   Y  d S )Nr?   r   r   rH   rI   r   r   pytestraises
ValueErrorr2   r3   r5   r6   r<   r   r   r   %test_doc_retokenize_split_heads_error   s   

*"re   c               	   C   s   g d} t t | d}|jjdddfg|_|d jdks J |d jdks)J |  }||d g d	|d df|d df|d g W d    n1 sPw   Y  |d jdks^J |d jdksgJ |d jdkspJ |d
 jdksyJ d S )N)abcder   zent-abcdr   r   Br   r   )rU   bcr   )	r   r   rZ   r[   r\   entsent_iob_r2   r3   )r   r6   r<   r   r   r   *test_doc_retokenize_spans_entity_split_iob   s   
2rn   c           
      C   s  g d}g d}g d}t | |||d}t|j\}}t|}t|}| :}	|	j|d ddg|d df|d gd	d
dgid |	j|d ddg|d df|d gd	d
dgid W d    n1 sdw   Y  t|j\}}t||d kszJ t||d ksJ d S )N)
StewartLeeisrU   standupcomedianrB   Helivesr   EnglandandlovesJoePasqualerB   )r   r   r   r+   r   r   r      rz   rz   	   rz   rz         )nsubjROOTdetrX   prtattrpunctr~   r   prepr#   ccconjr$   r   rE   r   StewartLeer   rD   r$   r~   rO      JoePasqualer}   dobj)r   listsentsr1   r2   r3   )
r5   r   r(   rF   r6   sent1sent2init_len	init_len2r<   r   r   r   5test_doc_retokenize_spans_sentence_update_after_split   s0   


r   c              	   C   s   t | g dd}tt: | }||d ddg|d df|d dfg W d   n1 s2w   Y  W d   dS W d   dS 1 sJw   Y  dS )al  Test that the regular retokenizer.split raises an error if the orths
    don't match the original token text. There might still be a method that
    allows this, but for the default use cases, merging and splitting should
    always conform with spaCy's non-destructive tokenization policy. Otherwise,
    it can lead to very confusing and unexpected results.
    r?   r   r   LANr`   rd   r   r   r   (test_doc_retokenize_split_orths_mismatch   s   
,"r   c                 C   s  t jdddd t jdddd t| ddgd	}| /}|d
 df|d g}dddddig}ddg|d}|j|d
 ddg||d W d    n1 sNw   Y  |d
 jdks\J |d
 jjdu sfJ |d
 jjdkspJ |d jdksyJ |d jjdu sJ |d jjdksJ d S )NrU   FTdefaultforcerj   nothingr@   rA   r   r   r   1)rU   rj   2losangeles)r:   _rH   rI   rO   )	r   set_extensionr   r2   r3   r   r   rU   rj   )r5   r6   r<   r(   
underscorer)   r   r   r   )test_doc_retokenize_split_extension_attrs   s   
r   underscore_attrsrU   xrj   rk   )rU   r   c              	   C   s   t jdddd t jddd dd t jd	d
d dd t| ddgd}d|i}tt< | }|d df|d g}|j|d ddg||d W d    n1 sTw   Y  W d    d S W d    d S 1 slw   Y  d S )Nr   FTr   rU   c                 S      | S Nr   r   r   r   r   <lambda>       zCtest_doc_retokenize_split_extension_attrs_invalid.<locals>.<lambda>)getterr   rj   c                 S   r   r   r   r   r   r   r   r      r   )methodr   r@   rA   r   r   r   r   rH   rI   rO   )r   r   r   ra   rb   rc   r2   r3   )r5   r   r6   r)   r<   r(   r   r   r   1test_doc_retokenize_split_extension_attrs_invalid   s   
"r   c                 C   s   t | dgdd jrJ t | dgdd jrJ t | ddgd}|d jr'J | %}ddd	gi}|d d
f|d
 g}|j|d ddg||d W d   n1 sSw   Y  |d js_J |d
 jrfJ dS )a  Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean that
    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
    here is acceptable. Also see #2390.
    rH   r   r   rI   r@   rA   is_stopTFr   rO   N)r   r   r2   r3   )r5   r6   r<   r)   r(   r   r   r   $test_doc_retokenizer_split_lex_attrs   s   
r   c                 C   s   d}t | | dd d}| #}|d }|dfgt| }|j||j t|j|d W d   n1 s8w   Y  t | | d}| $}|d }|dfgt| }|j||j t|j|d W d   dS 1 spw   Y  dS )zB#4604: realloc correctly when new tokens outnumber original tokenszOHyperglycemic adverse events following antipsychotic drug administration in theNr   r   r(   )r   r3   r2   r1   r9   r   r   )r5   r   r6   r<   r   r(   r   r   r   test_doc_retokenizer_realloc  s   

"r   c                    s   d}t | | d}d|d _|d  | }|j g d fddtdD d	 W d
   n1 s5w   Y  |d jdksCJ |d jdksLJ |d jdksUJ |d jdks^J d
S )z#6060: reset norm in splitz6The quick brownfoxjumpsoverthe lazy dog w/ white spotsr   withr+   r   )brownfoxjumpsoverthec                    s   g | ]} |fqS r   r   )r   rR   r   r   r   r   "  s    z3test_doc_retokenizer_split_norm.<locals>.<listcomp>r   Nr{   zw/r   )r   r3   norm_r2   ranger   )r5   r   r6   r<   r   r   r   test_doc_retokenizer_split_norm  s   

r   )r-   ra   spacy.tokensr   r   spacy.vocabr   markissuer>   rS   rW   r_   re   rn   r   r   r   parametrizer   r   r   r   r   r   r   r   <module>   s8    

'% 



