o
    }oi,                     @   s   d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZ d dlmZ ejdd	d
efddZejdd	d
efddZdd Zdd ZdS )    )MockN)	OmegaConf)ASRBPEMixin)DEFAULT_TOKENSCanaryTokenizer)SentencePieceTokenizercreate_spt_model)Serializationsession)scopereturnc                 C   s&   g d}|  d}t|| t|S )Nasrastendefres
spl_tokens)mktempr   build_special_tokenizerstr)tmp_path_factorytokenstmpdir r   _/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/asr/test_custom_tokenizer.pyspecial_tokenizer_path   s   
r   c                 C   s:   |  d}|d }|d t|dddt|d t|S )Nklingon_tokensztext.txtza
b
c
d
   F)
vocab_sizesample_sizedo_lower_case
output_dir)r   
write_textr   r   )r   r   	text_pathr   r   r   lang_tokenizer_path#   s
   

r'   c                 C   s   g d}t || }tdd |D  ddg }g }t|j D ]}||j| q | | f t	|| ||ks?J d S )Nr   c                 S   s   g | ]}d | dqS )z<|z|>r   ).0tr   r   r   
<listcomp>/   s    zAtest_canary_tokenizer_build_special_tokenizer.<locals>.<listcomp>u   ▁z<unk>)
r   r   r   range	tokenizerr!   append	IdToPiecesortprint)tmp_pathr   r,   expected_tokensir   r   r   -test_canary_tokenizer_build_special_tokenizer,   s   
r4   c                 C   s$  G dd dt t}| }tdd d|_tdd | dd|ddd	d
did}|| |j}t|t	s8J t
|jdksAJ t|j ddhksNJ t|jd tsXJ |jd jdksbJ t|jd tslJ |jd jdksvJ |jdddg dksJ |jdddddgksJ d S )Nc                   @   s   e Zd ZdS )z7test_canary_tokenizer_init_from_cfg.<locals>.DummyModelN)__name__
__module____qualname__r   r   r   r   
DummyModel9   s    r8   c                 S   s   |S )Nr   )selfxr   r   r   <lambda>=   s    z5test_canary_tokenizer_init_from_cfg.<locals>.<lambda>)side_effectaggbpe)dirtype)r   r   _target_zCnemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer)r@   r?   langscustom_tokenizer   r   r         z/<|startoftranscript|><|en|><|asr|><|en|><|pnc|>)lang_id)   	      rI      a      )r   r	   r   register_artifactr   create_setup_aggregate_tokenizerr,   
isinstancer   lentokenizers_dictsetkeysr   r!   text_to_ids)r   r'   r8   modelconfigr,   r   r   r   #test_canary_tokenizer_init_from_cfg8   s0   
rZ   )unittest.mockr   pytestsentencepiecespm	omegaconfr   !nemo.collections.asr.parts.mixinsr   3nemo.collections.common.tokenizers.canary_tokenizerr   r   :nemo.collections.common.tokenizers.sentencepiece_tokenizerr   r   	nemo.corer	   fixturer   r   r'   r4   rZ   r   r   r   r   <module>   s   

