o
    ߥi                     @   s   d dl mZ G dd dZdS )    )	Tokenizerc                   @   sZ   e Zd ZdZdd Zedd Zedd Zedd	 ZdddZ	dd Z
edd ZdS )JiebaBPETokenizerz2SentencePiece BPE tokenizer with Jieba integrationc                 C   sf   d| _ t|| _| jd| _zdd l}W n ty!   tdw || _| jd | _	| jd | _
d S )NzJieba BPE Tokenizerz<|endoftext|>r   zhYou need to install rjieba to use JiebaTokenizer. See https://pypi.org/project/rjieba/ for installation.
z<sep>)namer   	from_file	tokenizertoken_to_ideod_idjiebaImportErrorvocabnew_line	sep_token)selftokenizer_json_filer
    r   [/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/gpt_moe/tokenizer.py__init__   s   zJiebaBPETokenizer.__init__c                 C      | j jddS NT)with_added_tokens)r   get_vocab_sizer   r   r   r   
vocab_size$      zJiebaBPETokenizer.vocab_sizec                 C   r   r   )r   	get_vocabr   r   r   r   r   (   r   zJiebaBPETokenizer.vocabc                 C   s*   | j }t }| D ]\}}|||< q
|S N)r   dictitems)r   r   	inv_vocabkeyvalr   r   r   r   ,   s
   
zJiebaBPETokenizer.inv_vocabFc                 C   sB   |sdd | j |D }| jj|dddjS | jj|dddjS )Nc                 S   s   g | ]}|qS r   r   ).0xr   r   r   
<listcomp>6   s    z.JiebaBPETokenizer.tokenize.<locals>.<listcomp>T)is_pretokenizedadd_special_tokensF)r
   cutr   encodeids)r   textis_codeseg_listr   r   r   tokenize4   s   zJiebaBPETokenizer.tokenizec                 C   s   | j j|dd}|S )NF)skip_special_tokens)r   decode)r   	token_idsr*   r   r   r   
detokenize=   s   zJiebaBPETokenizer.detokenizec                 C   s   | j S r   )r	   r   r   r   r   eodA   s   zJiebaBPETokenizer.eodN)F)__name__
__module____qualname____doc__r   propertyr   r   r   r-   r1   r2   r   r   r   r   r      s    



	r   N)
tokenizersr   r   r   r   r   r   <module>   s   