from pathlib import Path
from typing import Iterable
from typing import List
from typing import Union

import sentencepiece as spm

from funasr.tokenizer.abs_tokenizer import BaseTokenizer
from funasr.register import tables


@tables.register("tokenizer_classes", "SentencepiecesTokenizer")
class SentencepiecesTokenizer(BaseTokenizer):
    def __init__(self, bpemodel: Union[Path, str], **kwargs):
        super().__init__(**kwargs)
        self.bpemodel = str(bpemodel)
        self.sp = None
        self._build_sentence_piece_processor()

    def __repr__(self):
        return f'{self.__class__.__name__}(model="{self.bpemodel}")'

    def _build_sentence_piece_processor(self):
        # Construct the SentencePiece processor once and reuse it afterwards.
        if self.sp is None:
            self.sp = spm.SentencePieceProcessor()
            self.sp.load(self.bpemodel)

    def text2tokens(self, line: str) -> List[str]:
        self._build_sentence_piece_processor()
        return self.sp.EncodeAsPieces(line)

    def tokens2text(self, tokens: Iterable[str]) -> str:
        self._build_sentence_piece_processor()
        return self.sp.DecodePieces(list(tokens))

    def encode(self, line: str, **kwargs) -> List[int]:
        self._build_sentence_piece_processor()
        return self.sp.EncodeAsIds(line)

    def decode(self, line: List[int], **kwargs):
        self._build_sentence_piece_processor()
        return self.sp.DecodeIds(line)

    def get_vocab_size(self):
        return self.sp.GetPieceSize()

    def ids2tokens(self, *args, **kwargs):
        return self.decode(*args, **kwargs)

    def tokens2ids(self, *args, **kwargs):
        return self.encode(*args, **kwargs)
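
# Minimal usage sketch: assumes a trained SentencePiece model file exists at the
# (hypothetical) path below; any extra kwargs are simply forwarded to BaseTokenizer.
if __name__ == "__main__":
    tokenizer = SentencepiecesTokenizer(bpemodel="path/to/bpe.model")  # hypothetical model path
    pieces = tokenizer.text2tokens("hello world")   # subword pieces (strings)
    ids = tokenizer.encode("hello world")           # corresponding piece ids (ints)
    print(pieces, ids)
    print(tokenizer.tokens2text(pieces))            # reassembles the original text
    print(tokenizer.get_vocab_size())               # size of the SentencePiece vocabulary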