o
    }oia	                     @   sT   d dl Z d dlmZmZ d dlmZ dZe jdddd Ze jddd	d
 Z	dS )    N)CanaryTokenizerSentencePieceTokenizer)create_spt_modela#  
Example system message.
Example user message.
Example assistant message.
TEST
[INST]
[/INST]
<s>
</s>
<<SYS>>
<</SYS>>
User: Assistant:
user model
Instruct Output 



<start_of_turn> <end_of_turn>
<|
|>
<|en|> <|de|> <|fr|> <|es|> <|transcribe|> <|translate|> <|pnc|> <|nopnc|> <|startoftranscript|> <|endoftext|>
Feel free to add new tokens for your own tests!?
But know that if you do so, you may need to update the token IDs in the existing tests! 
So, it might be a good idea to create a new tokenizer instead when adding new prompt formats.
session)scopec              
   C   sL   |  d}|d }|t tt|dddt|dddd tt|d S )	Nbpe_tokenizerztext.txti   FT)
vocab_sizesample_sizedo_lower_case
output_dirremove_extra_whitespacesboseosztokenizer.model)mktemp
write_textTOKENIZER_TRAIN_TEXTr   strr   )tmp_path_factorytmpdir	text_path r   g/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/common/prompt_formatters/conftest.pyr   /   s   


r   c                 C   s*   | d}tddg|}t|| ddS )N
spl_tokens
transcribeen)r   r   )
tokenizers)r   r   build_special_tokenizer)r   r   r   r   r   r   r   canary_tokenizerA   s   
r   )
pytest"nemo.collections.common.tokenizersr   r   :nemo.collections.common.tokenizers.sentencepiece_tokenizerr   r   fixturer   r   r   r   r   r   <module>   s   


