o
    i                     @   sV   d dl mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlZG dd dZ	dS )	    )Path)Dict)Iterable)List)UnionNc                   @   s|   e Zd Z	ddeeeee f defddZdefddZ	d	ee
jee f dee fd
dZdee dee fddZdS )TokenIDConverter<unk>
token_list
unk_symbolc                 C   sN  t |ttfr>t|}t|| _g | _|jddd}t|D ]\}}| }| j| qW d    n1 s8w   Y  n0t	|| _d| _t| jD ]\}}|dkrU n|  j| d7  _qK|  jdt
| j d7  _i | _t| jD ]\}}|| jv rtd	| d
|| j|< qv|| _| j| jvrtd| d| j| j | _d S )Nrzutf-8)encoding    z, z... (NVocab=)zSymbol "z" is duplicatedzUnknown symbol 'z!' doesn't exist in the token_list)
isinstancer   strtoken_list_reprr	   open	enumeraterstripappendlistlentoken2idRuntimeErrorr
   unk_id)selfr	   r
   fidxlineit r"   W/home/ubuntu/.local/lib/python3.10/site-packages/funasr/tokenizer/token_id_converter.py__init__   s6   


zTokenIDConverter.__init__returnc                 C   s
   t | jS )N)r   r	   r   r"   r"   r#   get_num_vocabulary_size/   s   
z(TokenIDConverter.get_num_vocabulary_sizeintegersc                    s8   t |tjr|jdkrtd|j  fdd|D S )N   zMust be 1 dim ndarray, but got c                    s   g | ]} j | qS r"   )r	   .0r    r&   r"   r#   
<listcomp>5   s    z/TokenIDConverter.ids2tokens.<locals>.<listcomp>)r   npndarrayndim
ValueError)r   r(   r"   r&   r#   
ids2tokens2   s   zTokenIDConverter.ids2tokenstokensc                    s    fdd|D S )Nc                    s   g | ]
} j | jqS r"   )r   getr   r*   r&   r"   r#   r,   8   s    z/TokenIDConverter.tokens2ids.<locals>.<listcomp>r"   )r   r2   r"   r&   r#   
tokens2ids7   s   zTokenIDConverter.tokens2idsN)r   )__name__
__module____qualname__r   r   r   r   r$   intr'   r-   r.   r   r1   r4   r"   r"   r"   r#   r   
   s    
$$r   )
pathlibr   typingr   r   r   r   numpyr-   r   r"   r"   r"   r#   <module>   s    