o
    iA
                     @   s*   d Z ddlmZmZmZ G dd dZdS )z Tokenization utils for RoFormer.    )NormalizedStringPreTokenizedStringnormalizersc                   @   s>   e Zd ZdddZdededee fddZd	efd
dZ	dS )JiebaPreTokenizerreturnNc                 C   sH   || _ tjddddd| _zdd l}W n ty   tdw || _d S )NFT)
clean_texthandle_chinese_charsstrip_accents	lowercaser   zkYou need to install rjieba to use RoFormerTokenizer. See https://pypi.org/project/rjieba/ for installation.)vocabr   BertNormalizerrjiebaImportErrorjieba)selfr   r    r   c/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/roformer/tokenization_utils.py__init__   s   
zJiebaPreTokenizer.__init__inormalized_stringc                 C   s   g }| j jt|ddD ]4\}}}|| jv r ||||  q| j| }|D ]}|r?|t| }||||  |}q*q|S )NF)hmm)	r   tokenizestrr   appendr   normalize_strsplitlen)r   r   r   splitstokenstartend
token_listr   r   r   jieba_split&   s   
zJiebaPreTokenizer.jieba_splitpretokc                 C   s   | | j d S )N)r   r"   )r   r#   r   r   r   pre_tokenizeA   s   zJiebaPreTokenizer.pre_tokenize)r   N)
__name__
__module____qualname__r   intr   listr"   r   r$   r   r   r   r   r      s    
r   N)__doc__
tokenizersr   r   r   r   r   r   r   r   <module>   s   