o
    i                     @   sp  d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZ d dlmZ d	d
lmZ ejddd Zejddd Zejddd Zejddd Zejddd Zejddd Zejddd Zejd d!d" Zejd#d$d% Z d&d' Z!d(d) Z"d*d+ Z#d,d- Z$d.d/ Z%d0d1 Z&dS )2    N)DEPHEAD)English)Language)MatcherPhraseMatcher)Doc)Vectors)Vocab   )make_tempdiri  c                  C   s   t t } tjddd}t|g dd}| d}|d |jdd	d	ks)J ||j	_
t !}|| | d|}|jdd	d	ksIJ W d
   d
S 1 sTw   Y  d
S )zfTest that models with no pretrained vectors can be deserialized
    correctly after vectors are added.)   i,  fdtype)IamMatt)datakeystaggerPRPpretrained_dimsr   N)r   r
   numpyonesr	   create_pipe	add_labelcfggetvocabvectorsr   to_disk	from_disk)nlpr   r    r   path r%   \/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/serialize/test_serialize_doc.pytest_issue1727   s   



"r'   i  c               	   C   s   t jddgddgddgddgdd	gd
dgddggdd} tt d d}|jjd |t	t
g| }tt|jdks?J dS )z[Test sentence boundaries are deserialized correctly, even for
    non-projective sentences.   i     i  r   i  i  r   l   LP^& l    i  l    i  uint64r   zJust what I was looking for .wordsROOTN)r   asarrayr   r
   splitr   stringsadd
from_arrayr   r   lenlistsents)
heads_depsdocr%   r%   r&   test_issue1799#   s   	r8   i*  c                  C   s   g d} t t | d}d|d _t |j| }|d js!J |dr(J |dr/J t t | dgt|  g ddgt|  d	}t |j| }|d jsVJ |ds]J |dsdJ d
S )zVTest that sentence boundaries & parse/tag flags are not lost
    during serialization.)	Thisisafirstsentence.Andanotheroner+   T   r   TAG)	r   r   r   r   r   r   rB   rB   rB   dep)r,   tagsheadsdepsN)	r   r
   is_sent_startr   
from_bytesto_bytes
sent_starthas_annotationr3   )r,   r7   new_docr%   r%   r&   test_issue18349   s$   
rN   i[  c                  C   st   t t } | dddigg t| jdgd}t| |dks!J t| }t|jdgd}t||dks8J d S )Npat1orthhellor+   r(   )r   r
   r1   r   r   r3   copydeepcopy)matcherr7   new_matcherrM   r%   r%   r&   test_issue1883Q   s   

rV   i
  c                  C   s`   t  } | d}|d |   | d}|dsJ | ddg}t|}|ds.J dS )zQTest the tagger sets has_annotation("TAG") correctly when used via Language.pipe.r   Azhello worldrC   rQ   worldN)r   add_piper   
initializerL   pipenext)r#   r   r7   docs	piped_docr%   r%   r&   test_issue2564\   s   

r_   i  c                  C   sl   t  } t| j}|d| d| d| dg |d| dg t|}t|}t|t|ks4J dS )z5Test that the PhraseMatcher can be pickled correctly.TEST1r;   bcTEST2dN)r   r   r   r1   pickledumpsloadsr3   )r#   rT   r   rU   r%   r%   r&   test_issue3248_2j   s   


rh   i  c                  C   s6   t  } | d |  }t  }|d || dS )zeTest that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model.textcatN)r   rY   rJ   rI   )r#   
bytes_datanew_nlpr%   r%   r&   test_issue3289v   s   

rl   i  c                  C   s   t  } | d | d}|d jsJ |dsJ tt|jdks%J | }t| j	
|}|d js8J |ds?J tt|jdksJJ dS )z|Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
    be restored after serialization.sentencizerzHello worldr   
SENT_STARTr(   N)r   rY   rH   rL   r3   r4   r5   rJ   r   r   rI   )r#   r7   	doc_bytesrM   r%   r%   r&   test_issue3468   s   
rp   iw  c                  C   s   t  } | d}|d jdksJ d|d _|d jdksJ t $}|d }|| | d}|| |d jdks=J W d   dS 1 sHw   Y  dS )z=Ensure that a modified pos attribute is serialized correctly.zSdisplaCy uses JavaScript, SVG and CSS to show you how computers understand languager    NOUNmy_docN)r   pos_r   r!   r"   )r#   r7   tmp_dir	file_pathdoc2r%   r%   r&   test_issue3959   s   


"rx   c                 C   s^   t | }| }t | }|| t|t|ksJ t||D ]\}}|j|jks,J q d S )N)r   rJ   rI   r3   ziptext)en_vocabr7   r   rw   token1token2r%   r%   r&   test_serialize_empty_doc   s   
r~   c                 C   sD   t | ddgd}ddi|_| }t | |}| |ks J d S )NrQ   rX   r+   rW   g      ?)r   catsrJ   rI   )r{   r7   doc_brM   r%   r%   r&   "test_serialize_doc_roundtrip_bytes   s
   
r   c                 C   sp   t | ddgd}t #}|d }|| t | |}| | ks&J W d    d S 1 s1w   Y  d S NrQ   rX   r+   r7   )r   r   r!   r"   rJ   r{   r7   rd   rv   doc_dr%   r%   r&   !test_serialize_doc_roundtrip_disk   s   
"r   c                 C   sx   t | ddgd}t '}|d }t|}|| t | |}| | ks*J W d    d S 1 s5w   Y  d S r   )r   r   strr!   r"   rJ   r   r%   r%   r&   *test_serialize_doc_roundtrip_disk_str_path   s   
"r   c                 C   s   t | ddgd}d|jd< t | | }|jd dksJ t | j| dgd}|jr0J t | |jdgd}|jrAJ d S )NrQ   rX   r+   barfoo	user_data)exclude)r   r   rI   rJ   )r{   r7   rM   r%   r%   r&   test_serialize_doc_exclude   s   

r   c                 C   s   t | g dd}|dd }d|_d|_d|_|g|jd< t | | }t|jd d	ks1J |jd d jdks=J |jd d jdksIJ |jd d jdksUJ d S )
N)rQ   rX   !r+   r   r   $test_serialize_doc_span_groups_label!test_serialize_doc_span_groups_id$test_serialize_doc_span_groups_kb_idcontentr(   )r   label_id_kb_id_spansrI   rJ   r3   )r{   r7   spanrM   r%   r%   r&   test_serialize_doc_span_groups   s   r   )'rR   re   r   pytestspacy.attrsr   r   spacy.lang.enr   spacy.languager   spacy.matcherr   r   spacy.tokensr   spacy.vectorsr	   spacy.vocabr
   utilr   markissuer'   r8   rN   rV   r_   rh   rl   rp   rx   r~   r   r   r   r   r   r%   r%   r%   r&   <module>   sH    



















	
