o
    i                     @   sp   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZ G dd deZG dd	 d	eZdS )
    N)ABC)Path)abstractmethod)UnionIterableListDictc                   @   s@   e Zd Zededee fddZedee defddZdS )	AbsTokenizerlinereturnc                 C      t NNotImplementedErrorselfr
    r   R/home/ubuntu/.local/lib/python3.10/site-packages/funasr/tokenizer/abs_tokenizer.pytext2tokens
      zAbsTokenizer.text2tokenstokensc                 C   r   r   r   r   r   r   r   r   tokens2text   r   zAbsTokenizer.tokens2textN)	__name__
__module____qualname__r   strr   r   r   r   r   r   r   r   r	   	   s
    r	   c                   @   s   e Zd Z		ddeeeee f defddZdd Zd	d
 Z	de
fddZdeejee
 f dee fddZdee dee
 fddZededee fddZedee defddZdS )BaseTokenizerN<unk>
token_list
unk_symbolc           	      K   s  |d urt |ttfrG|drGt|}t|| _g | _|jddd}t|D ]\}}| }| j	| q(W d    n1 sAw   Y  nft |ttfr}|dr}t|}t|| _g | _t|ddd}t
|| _W d    n1 sww   Y  n0t|| _d| _t| jD ]\}}|dkr n|  j| d7  _q|  jd	t| j d
7  _i | _t| jD ]\}}|| jv rtd| d|| j|< q|| _| j| jvrtd| d| j| j | _d S d S )Nz.txtrzutf-8)encodingz.json    z, z... (NVocab=)zSymbol "z" is duplicatedzUnknown symbol 'z!' doesn't exist in the token_list)
isinstancer   r   endswithtoken_list_reprr   open	enumeraterstripappendjsonloadlistlentoken2idRuntimeErrorr    unk_id)	r   r   r    kwargsfidxr
   itr   r   r   __init__   sJ   



zBaseTokenizer.__init__c                 K   s   |  |}| |}|S r   )r   
tokens2ids)r   textr4   r   	text_intsr   r   r   encodeA   s   

zBaseTokenizer.encodec                 C   s   |  |}| |}|S r   )
ids2tokensr   )r   r<   tokenr;   r   r   r   decodeG   s   

zBaseTokenizer.decoder   c                 C   s
   t | jS r   )r0   r   r   r   r   r   get_num_vocabulary_sizeL   s   
z%BaseTokenizer.get_num_vocabulary_sizeintegersc                    s8   t |tjr|jdkrtd|j  fdd|D S )N   zMust be 1 dim ndarray, but got c                    s   g | ]} j | qS r   )r   .0r7   rA   r   r   
<listcomp>R   s    z,BaseTokenizer.ids2tokens.<locals>.<listcomp>)r&   npndarrayndim
ValueError)r   rC   r   rA   r   r>   O   s   zBaseTokenizer.ids2tokensr   c                    s    fdd|D S )Nc                    s   g | ]
} j | jqS r   )r1   getr3   rE   rA   r   r   rG   U   s    z,BaseTokenizer.tokens2ids.<locals>.<listcomp>r   r   r   rA   r   r:   T   s   zBaseTokenizer.tokens2idsr
   c                 C   r   r   r   r   r   r   r   r   W   r   zBaseTokenizer.text2tokensc                 C   r   r   r   r   r   r   r   r   [   r   zBaseTokenizer.tokens2text)Nr   )r   r   r   r   r   r   r   r9   r=   r@   intrB   rH   rI   r   r>   r:   r   r   r   r   r   r   r   r      s"    
-$r   )r-   numpyrH   abcr   pathlibr   r   typingr   r   r   r   r	   r   r   r   r   r   <module>   s    
