o
    
io                     @   s   d dl mZ d dlmZmZ d dlZd dlmZ ejdddd Z	ejd	d
 Z
dZg dZg dZdd Zdd Zdd Zdd Zdd Zdd Zdd ZdS )    )Path)dumpsloadsN)ByteBPEProcessormodule)scopec                 C   s   t | jjS N)r   fspathparent)request r   `/home/ubuntu/.local/lib/python3.10/site-packages/curated_tokenizers/tests/test_bbpe_processor.pytest_dir	   s   r   c                 C   s   t j| d | d dS )Nrobbert-vocab-1000.jsonzrobbert-merges-1000.txtvocabmerges)r   load_from_filesr   r   r   r   toy_processor   s   r   u    Wij bezoeken alle provinciën.)
u   ĠWiju   Ġbezoekenu   Ġalleu   Ġprovincu   iÃ«n.)
i  i1  i  \   i        i7  ir     c                  C   s$   t i g } | dg dksJ d S )Nu   they'll visit Köln)they'lr$      Ġr   isr&   r   r%   K   Ã   ¶r$   n)r   encode_as_pieces)bbper   r   r   test_empty_processor'   s   
r.   c                 C      |  ttks	J d S r   )decode_from_idsEXAMPLE_PIECE_IDSEXAMPLE_TEXTr   r   r   r   test_can_decodeA      r4   c                 C   s   |  tttfksJ d S r   )encoder2   r1   EXAMPLE_PIECESr3   r   r   r   test_can_encodeE   s   r8   c                 C   r/   r   )encode_as_idsr2   r1   r3   r   r   r   test_can_encode_as_idsI   r5   r:   c                 C   r/   r   )r,   r2   r7   r3   r   r   r   test_can_encode_as_piecesM   r5   r;   c                 C   sJ   t jtdd tj| d | d d W d    d S 1 sw   Y  d S )NzMerge must consist of 2 items)matchr   zincorrect-merges.txtr   )pytestraises
ValueErrorr   r   r   r   r   r   test_rejects_incorrect_mergesQ   s   "r@   c                 C   sB   t | }t|}t|tsJ |j| jksJ |j| jksJ d S r   )r   r   
isinstancer   r   r   )r   
serializeddeserializedr   r   r   test_pickleY   s
   rD   )pathlibr   pickler   r   r=   curated_tokenizersr   fixturer   r   r2   r7   r1   r.   r4   r8   r:   r;   r@   rD   r   r   r   r   <module>   s$    


