o
    i                     @   s  d dl Z d dlZd dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZ d	d
lmZ g g fg dg dfgZg ddfgZejddd Zejddd Zejddd Zejddgdd Zejdedd Zejdedd Zejd ed!d" Zejd ed#d$ Z ejd ed%d& Z!ejded'd( Z"ejded)d* Z#ejd ed+d, Z$dS )-    N)get_current_ops)English)StringStore)Doc)ensure_path
load_model)Vectors)Vocab   )make_tempdir)ratsarecute)iliker   HelloiW  c                 C   s2   t | }t |j}||  |dsJ d S )NDEP)r   vocab
from_bytesto_byteshas_annotation)en_vocabdocdoc2 r   f/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/serialize/test_serialize_vocab_strings.pytest_issue599   s   
r   i  c           	      C   s   t  }|j}t G}t|d }| s|  || t |}t	j
d|d}t|d }| s8|  || t|}|jdksHJ W d   dS 1 sSw   Y  dS )zTest that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point.r   en)r   nlpN)r   r   r   r   existsmkdirto_diskr	   	from_diskspacyblankr   lang)	r   nlp1vocab1d	vocab_dirvocab2nlp2nlp_dirnlp3r   r   r   test_issue4054   s    

"r.   i%  c                 C   s   t  }|j }g d}g d}t| |d}t|D ]	\}}|| |_q| }t }	|	|}	t|	|}g }
|D ]}|
|j q;|
|ksJJ d S )N)Appleislookingatbuyingastartup)NOUNVERBADPr7   PROPNr6   r8   )words)	r   r   r   r   	enumeratepos_r	   r   append)r   r   vocab_bytesr:   posr   r   token	doc_bytesr   actualr   r   r   test_issue41331   s   

rC   textratc                 C   sR   | j |}| jdgd}t |}|j | |ksJ |jdgd|ks'J d S )Nlookups)exclude)stringsaddr   r	   r   )r   rD   	text_hashr>   	new_vocabr   r   r   test_serialize_vocabE   s
   rL   zstrings1,strings2c                 C   s   t | d}t |d}| }| }| |kr||ksJ n||ks#J ||}| |ks0J t  |}| |ks>J t|jt| ksIJ tdd |jD t| ksYJ d S )NrH   c                 S      g | ]}|qS r   r   .0sr   r   r   
<listcomp>]       z8test_serialize_vocab_roundtrip_bytes.<locals>.<listcomp>)r	   r   r   lenrH   sorted)strings1strings2r'   r*   vocab1_bvocab2_b
new_vocab1r   r   r   $test_serialize_vocab_roundtrip_bytesN   s   


$r[   c           	      C   s&  t | d}t |d}t |}|d }|d }|| || t  |}t  |}t| tdd |jD ks<J t|tdd |jD ksLJ t| t|krgdd |jD dd |jD ksfJ ndd |jD dd |jD ksyJ W d    d S W d    d S 1 sw   Y  d S )	NrM   r'   r*   c                 S   rN   r   r   rO   r   r   r   rR   l   rS   z7test_serialize_vocab_roundtrip_disk.<locals>.<listcomp>c                 S   rN   r   r   rO   r   r   r   rR   m   rS   c                 S   rN   r   r   rO   r   r   r   rR   o   rS   c                 S   rN   r   r   rO   r   r   r   rR   q   rS   )r	   r   r!   r"   setrH   )	rV   rW   r'   r*   r(   
file_path1
file_path2vocab1_dvocab2_dr   r   r   #test_serialize_vocab_roundtrip_disk`   s"   



  &&"ra   zstrings,lex_attrc                 C   sr   t | d}t  }||| d  _|| d  j|ksJ || d  j|ks%J || }|| d  j|ks7J d S )NrM   r   )r	   norm_r   r   )rH   lex_attrr'   r*   r   r   r   $test_serialize_vocab_lex_attrs_bytest   s   
rd   c                 C   s2   t | d}||  t|jt| ksJ d S NrM   )r	   r   r   rT   rH   )rH   rc   r   r   r   r   #test_deserialize_vocab_seen_entries   s   
rf   c                 C   s   t | d}t  }||| d  _|| d  j|ksJ || d  j|ks%J t }|d }|| ||}W d    n1 sAw   Y  || d  j|ksQJ d S )NrM   r   r   )r	   rb   r   r!   r"   )rH   rc   r'   r*   r(   	file_pathr   r   r   #test_serialize_vocab_lex_attrs_disk   s   

rh   c                 C   s   t | d}t |d}| }| }t| t|kr!||ks J n||ks'J ||}| |ks4J t  |}| |ksBJ t|t| ksLJ d S re   )r   r   r\   r   )rV   rW   sstore1sstore2	sstore1_b	sstore2_bnew_sstore1r   r   r   *test_serialize_stringstore_roundtrip_bytes   s   


rn   c           	      C   s   t | d}t |d}t `}|d }|d }|| || t  |}t  |}t|t|ks6J t|t|ks@J t| t|krSt|t|ksRJ nt|t|ks]J W d    d S W d    d S 1 spw   Y  d S )NrM   rV   rW   )r   r   r!   r"   r\   )	rV   rW   ri   rj   r(   r]   r^   	sstore1_d	sstore2_dr   r   r   )test_serialize_stringstore_roundtrip_disk   s"   




"rq   c                 C   sv   t | d}t }t|jdddd}||_||| d  _t|}t	|}|
 |
 ks1J |jjdks9J d S )NrM   )
   rr   floret   )datamode
hash_countr   )r	   r   r   xpzerosvectorsrb   pickledumpsloadsr   rv   )rH   rc   r   opsrz   vocab_pickledvocab_unpickledr   r   r   test_pickle_vocab   s   


r   )%r{   pytest	thinc.apir   r#   spacy.lang.enr   spacy.stringsr   spacy.tokensr   
spacy.utilr   r   spacy.vectorsr   spacy.vocabr	   utilr   test_stringstest_strings_attrsmarkissuer   r.   rC   parametrizerL   r[   ra   rd   rf   rh   rn   rq   r   r   r   r   r   <module>   sJ    














