o
    ߥi
                     @   s*   d dl mZ d dlmZ G dd dZdS )    )List)	Tokenizerc                   @   s   e Zd ZdZdd Zedd Zedd Zedd	 Zdde	de
dee fddZddee de
de	fddZedd ZdS )JiebaBPETokenizerz2SentencePiece BPE tokenizer with Jieba integrationc                 C   sz   d| _ t|| _| jd| _zdd l}dd l}||j	 W n t
y+   t
dw || _| jd | _| jd | _d S )NzJieba BPE Tokenizerz<|endoftext|>r   zfYou need to install jieba to use JiebaTokenizer. See https://pypi.org/project/jieba/ for installation.
z<sep>)namer   	from_file	tokenizertoken_to_ideod_idjiebaloggingsetLogLevelINFOImportErrorvocabnew_line	sep_token)selftokenizer_json_filer   r    r   X/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/gpt3/tokenizer.py__init__   s   zJiebaBPETokenizer.__init__c                 C      | j jddS NT)with_added_tokens)r   get_vocab_sizer   r   r   r   
vocab_size(      zJiebaBPETokenizer.vocab_sizec                 C   r   r   )r   	get_vocabr   r   r   r   r   ,   r   zJiebaBPETokenizer.vocabc                 C   s*   | j }t }| D ]\}}|||< q
|S N)r   dictitems)r   r   	inv_vocabkeyvalr   r   r   r#   0   s
   
zJiebaBPETokenizer.inv_vocabFtextis_codereturnc                 C   sB   |sdd | j |D }| jj|dddjS | jj|dddjS )z	
        c                 S   s   g | ]}|qS r   r   ).0xr   r   r   
<listcomp><   s    z.JiebaBPETokenizer.tokenize.<locals>.<listcomp>T)is_pretokenizedadd_special_tokensF)r   cutr   encodeids)r   r&   r'   seg_listr   r   r   tokenize8   s   zJiebaBPETokenizer.tokenizeT	token_ids
early_stopc                 C   s6   |r| j |v r|d || j  }| jj|dd}|S )NT)skip_special_tokens)r   indexr   decode)r   r3   r4   r&   r   r   r   
detokenizeC   s   zJiebaBPETokenizer.detokenizec                 C   s   | j S r    )r
   r   r   r   r   eodI   s   zJiebaBPETokenizer.eodN)F)T)__name__
__module____qualname____doc__r   propertyr   r   r#   strboolr   intr2   r8   r9   r   r   r   r   r      s    


r   N)typingr   
tokenizersr   r   r   r   r   r   <module>   s   