o
    }oi                     @   sH   d dl Z d dlmZ ddddddddZG d	d
 d
ZG dd dZdS )    N)SentencePieceTokenizerz[UNK][SEP]z[PAD][CLS][MASK])	unk_token	sep_token	pad_token	bos_token
mask_token	eos_token	cls_tokenc                   @   s   e Zd ZdZejjdd Zejjdd Zejjdd Z	ejjdd	 Z
ejjd
d Zejjdd Zejjdd ZdS ) TestSentencePieceTokenizerLegacyz/m_common.modelc                 C   sD   t || j dd}t}|| |j|jtt|  ks J d S )NTlegacy)	r   
model_nameMODEL_SPECIAL_TOKENSadd_special_tokens
vocab_sizeoriginal_vocab_sizelensetvalues)selftest_data_dir	tokenizerspecial_tokens r   _/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/common/test_spc_tokenizer.pytest_add_special_tokens!   s   
$z8TestSentencePieceTokenizerLegacy.test_add_special_tokensc                 C   s   t || j dd}t}|| d}||}t|t| ks#J |ddks,J |ddks5J |ddks>J d S 	NTr   ([CLS] a b c [MASK] e f [SEP] g h i [SEP]r      r   r      )r   r   r   r   text_to_tokensr   splitcount)r   r   r   r   texttokensr   r   r   test_text_to_tokens(   s   

z4TestSentencePieceTokenizerLegacy.test_text_to_tokensc                 C   s:   t || j dd}d}||}||}||ksJ d S NTr   r    r   r   r#   tokens_to_textr   r   r   r&   r'   resultr   r   r   test_tokens_to_text6   s
   

z4TestSentencePieceTokenizerLegacy.test_tokens_to_textc                 C   s   t || j dd}t}|| d}||}t|t| ks#J ||ddks/J ||ddks;J ||ddksGJ d S r   )	r   r   r   r   text_to_idsr   r$   r%   token_to_id)r   r   r   r   r&   idsr   r   r   test_text_to_ids@   s   

z1TestSentencePieceTokenizerLegacy.test_text_to_idsc                 C   sH   t || j dd}t}|| d}||}||}||ks"J d S r)   )r   r   r   r   r/   ids_to_text)r   r   r   r   r&   r1   r-   r   r   r   test_ids_to_textN   s   


z1TestSentencePieceTokenizerLegacy.test_ids_to_textc                 C   s   t || j dd}t}|| d}||}||}t|t|ks&J ||ddks2J ||ddks>J ||ddksJJ d S r   )	r   r   r   r   r#   tokens_to_idsr   r%   r0   )r   r   r   r   r&   r'   r1   r   r   r   test_tokens_to_idsZ   s   


z3TestSentencePieceTokenizerLegacy.test_tokens_to_idsc           	      C   s   t || j dd}t}|| d}||}||}||}t|t|ks+J tt|D ]}|| || ks=J q1d S r)   )	r   r   r   r   r#   r5   ids_to_tokensr   range)	r   r   r   r   r&   r'   r1   r-   ir   r   r   test_ids_to_tokensi   s   



z3TestSentencePieceTokenizerLegacy.test_ids_to_tokensN)__name__
__module____qualname__r   pytestmarkunitr   r(   r.   r2   r4   r6   r:   r   r   r   r   r      s     


	


r   c                   @   sp   e Zd ZdZejjdd Zejjdd Zejjdd Z	ejjdd	 Z
ejjd
d Zejjdd ZdS )TestSentencePieceTokenizerz/m_new.modelc                 C   sV   t || j }d}||}|ddksJ |ddks J |ddks)J d S N <cls> a b c <sep> e f g h i </s><cls>r!   <sep>r   </s>)r   r   r#   r%   r   r   r   r&   r'   r   r   r   r(   }   s   
z.TestSentencePieceTokenizer.test_text_to_tokensc                 C   6   t || j }d}||}||}||ksJ d S )Nz<cls> a b c e f g h ir*   r,   r   r   r   r.      s
   

z.TestSentencePieceTokenizer.test_tokens_to_textc                 C   sh   t || j }d}||}||ddksJ ||ddks&J ||ddks2J d S rB   )r   r   r/   r%   r0   rG   r   r   r   r2      s   
z+TestSentencePieceTokenizer.test_text_to_idsc                 C   rH   )NrC   )r   r   r/   r3   )r   r   r   r&   r1   r-   r   r   r   r4      s
   

z+TestSentencePieceTokenizer.test_ids_to_textc                 C   s   t || j }g d}||}t|t|ksJ ||ddks&J ||ddks2J ||ddks>J d S )NrD   abcrE   efrE   ghr9   rF   rD   r!   rF   rE   r"   )r   r   r5   r   r%   r0   )r   r   r   r'   r1   r   r   r   r6      s   
z-TestSentencePieceTokenizer.test_tokens_to_idsc                 C   sh   t || j }g d}||}||}t|t|ksJ tt|D ]}|| || ks1J q%d S )NrI   )r   r   r5   r7   r   r8   )r   r   r   r'   r1   r-   r9   r   r   r   r:      s   

z-TestSentencePieceTokenizer.test_ids_to_tokensN)r;   r<   r=   r   r>   r?   r@   r(   r.   r2   r4   r6   r:   r   r   r   r   rA   z   s    




	
rA   )r>   :nemo.collections.common.tokenizers.sentencepiece_tokenizerr   r   r   rA   r   r   r   r   <module>   s   \