import os
import tempfile
import unittest
from unittest.mock import MagicMock

import numpy as np
import sentencepiece
import torch

from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer


class TestSentencePieceTokenizer(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Train a tiny BPE model on a two-line corpus so the tests do not
        # depend on any external model artifact.
        cls.test_dir = tempfile.mkdtemp()
        with open(os.path.join(cls.test_dir, "test.txt"), "w") as f:
            f.write("Hello world\nThis is a test\n")
        sentencepiece.SentencePieceTrainer.Train(
            f"--input={os.path.join(cls.test_dir, 'test.txt')}"
            f" --model_prefix={os.path.join(cls.test_dir, 'test_model')}"
            f" --vocab_size=55 --model_type=bpe"
        )
        cls.model_path = os.path.join(cls.test_dir, "test_model.model")

    @classmethod
    def tearDownClass(cls):
        import shutil

        shutil.rmtree(cls.test_dir)

    def setUp(self):
        self.tokenizer = SentencePieceTokenizer(model_path=self.model_path)

    def test_initialization(self):
        tokenizer = SentencePieceTokenizer(model_path=self.model_path)
        self.assertIsNotNone(tokenizer.tokenizer)
        self.assertEqual(tokenizer.original_vocab_size, tokenizer.vocab_size)

        # A missing model file, passing both a model path and a tokenizer
        # object, and passing neither should all be rejected.
        with self.assertRaises(ValueError):
            SentencePieceTokenizer(model_path="nonexistent.model")

        mock_tokenizer = MagicMock()
        with self.assertRaises(ValueError):
            SentencePieceTokenizer(model_path=self.model_path, tokenizer=mock_tokenizer)

        with self.assertRaises(ValueError):
            SentencePieceTokenizer()

    def test_text_to_tokens(self):
        text = "Hello world"
        tokens = self.tokenizer.text_to_tokens(text)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))

    def test_encode(self):
        text = "Hello world"
        ids = self.tokenizer.encode(text)
        self.assertIsInstance(ids, list)
        self.assertTrue(all(isinstance(i, int) for i in ids))

    def test_tokens_to_text(self):
        text = "Hello world"
        tokens = self.tokenizer.text_to_tokens(text)
        reconstructed_text = self.tokenizer.tokens_to_text(tokens)
        self.assertIsInstance(reconstructed_text, str)
        self.assertNotEqual(reconstructed_text, "")

    def test_batch_decode(self):
        # batch_decode should accept plain lists, NumPy arrays, and torch tensors.
        text = "Hello world"
        ids = self.tokenizer.encode(text)
        decoded_text = self.tokenizer.batch_decode(ids)
        self.assertIsInstance(decoded_text, str)

        ids_np = np.array(ids)
        decoded_text_np = self.tokenizer.batch_decode(ids_np)
        self.assertIsInstance(decoded_text_np, str)

        ids_torch = torch.tensor(ids)
        decoded_text_torch = self.tokenizer.batch_decode(ids_torch)
        self.assertIsInstance(decoded_text_torch, str)

    def test_token_to_id(self):
        text = "Hello"
        tokens = self.tokenizer.text_to_tokens(text)
        token_id = self.tokenizer.token_to_id(tokens[0])
        self.assertIsInstance(token_id, int)

    def test_ids_to_tokens(self):
        text = "Hello world"
        ids = self.tokenizer.encode(text)
        tokens = self.tokenizer.ids_to_tokens(ids)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))

    def test_tokens_to_ids(self):
        text = "Hello"
        tokens = self.tokenizer.text_to_tokens(text)
        ids = self.tokenizer.tokens_to_ids(tokens)
        self.assertIsInstance(ids, list)
        self.assertTrue(all(isinstance(i, int) for i in ids))

    def test_legacy_mode(self):
        # Legacy mode appends user-defined special tokens, growing the vocab
        # beyond the original SentencePiece vocab size.
        special_tokens = ["[PAD]", "[BOS]", "[EOS]"]
        tokenizer = SentencePieceTokenizer(
            model_path=self.model_path, special_tokens=special_tokens, legacy=True
        )
        self.assertGreater(tokenizer.vocab_size, tokenizer.original_vocab_size)

        text = "Hello [PAD] world"
        tokens = tokenizer.text_to_tokens(text)
        self.assertIn("[PAD]", tokens)

        ids = tokenizer.encode(text)
        decoded_text = tokenizer.batch_decode(ids)
        self.assertIn("[PAD]", decoded_text)

    def test_properties(self):
        self.assertIsInstance(self.tokenizer.pad_id, int)
        self.assertIsInstance(self.tokenizer.bos_token_id, int)
        self.assertIsInstance(self.tokenizer.eos_token_id, int)
        self.assertIsInstance(self.tokenizer.unk_id, int)

    def test_vocab_property(self):
        vocab = self.tokenizer.vocab
        self.assertIsInstance(vocab, list)
        self.assertTrue(all(isinstance(t, str) for t in vocab))

    def test_convert_ids_to_tokens(self):
        text = "Hello world"
        ids = self.tokenizer.encode(text)
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))

    def test_convert_tokens_to_string(self):
        text = "Hello world"
        tokens = self.tokenizer.text_to_tokens(text)
        string = self.tokenizer.convert_tokens_to_string(tokens)
        self.assertIsInstance(string, str)

    def test_len(self):
        self.assertEqual(len(self.tokenizer), self.tokenizer.vocab_size)

    def test_is_fast(self):
        self.assertTrue(self.tokenizer.is_fast)

    def test_get_added_vocab(self):
        self.assertIsNone(self.tokenizer.get_added_vocab())

if __name__ == "__main__":
    unittest.main()