o
    }oi                     @   s   d dl Z d dlZd dlZd dlmZ d dlZd dlmZmZ ej	dd Z
dd Zdd	 Zd
d Zdd Zdd Zdd Zdd ZdS )    N)Path)TiktokenTokenizerreload_mergeable_ranksc                  c   s    dd t dD } | dtdddddtd	dd
dg tjdddd}t| | |j	}W d    n1 sAw   Y  |V  t
|  d S )Nc                 S   s0   g | ]}|t t|gd d| dqS )utf-8token_ranktoken_bytes	token_str)base64	b64encodebytesdecode).0i r   X/home/ubuntu/.local/lib/python3.10/site-packages/tests/export/test_tiktoken_tokenizer.py
<listcomp>   s    "z%sample_vocab_file.<locals>.<listcomp>      Hellor   Hellor        WorldWorldwz.jsonF)modesuffixdelete)rangeextendr   r   r   tempfileNamedTemporaryFilejsondumpnamer   unlink)
vocab_dataf	temp_pathr   r   r   sample_vocab_file   s   r)   c                 C   s<   t | }t|dksJ |d dksJ |d dksJ d S )Ni  r   r   r   r   )r   len)r)   ranksr   r   r   test_reload_mergeable_ranks1   s   r,   c                 C   s6   t | }|jdksJ |jdksJ |jdksJ d S )N      )r   bos_token_ideos_token_idpad_id)r)   	tokenizerr   r   r   test_tokenizer_initialization8   s   r3   c                 C   sV   t | }d}||}||}t|tsJ tdd |D s"J t|ts)J d S )NzHello Worldc                 s   s    | ]}t |tV  qd S )N)
isinstanceint)r   tr   r   r   	<genexpr>E   s    z%test_encode_decode.<locals>.<genexpr>)r   encoder   r4   listallstr)r)   r2   texttokensdecoded_textr   r   r   test_encode_decode?   s   

r?   c                 C   s.   t | }g dg}||}t|tsJ d S )N    i  )r   batch_decoder4   r;   r)   r2   r=   r>   r   r   r   test_batch_decodeI   s   

rE   c                 C   s2   t | }|jdd|jg}||}|dksJ d S )NrA   rB    r   r/   r0   r   rD   r   r   r   test_special_token_handlingP   s   
rH   c                 C   s.   t | }|j|jg}||}|dksJ d S )NrF   rG   rD   r   r   r   test_empty_decodeX   s   
rI   c                 C   sx   dd l }dd l}t| }|g dg}|g dg}||}||}t|ts-J t|ts4J ||ks:J d S )Nr   r@   )numpytorchr   arraytensorrC   r4   r;   )r)   nprK   r2   	np_tokenstorch_tokens
np_decodedtorch_decodedr   r   r   test_batch_decode_numpy_tensor`   s   

rS   )r   r"   r    pathlibr   pytestnemo.export.tiktoken_tokenizerr   r   fixturer)   r,   r3   r?   rE   rH   rI   rS   r   r   r   r   <module>   s   

