o
    i                     @   sX   d dl mZ d dlmZ d dlmZ d dlmZ d dlZd dlmZ G dd deZ	dS )	    )Path)Iterable)List)UnionN)AbsTokenizerc                	   @   sl   e Zd Z			ddedeeeee f defddZdd	 Z	d
ede
e fddZdee defddZdS )WordTokenizerNF	delimiternon_linguistic_symbolsremove_non_linguistic_symbolsc                 C   s   || _ |s|d urtd |d u rt | _nIt|ttfr[t|}z#|jddd}tdd |D | _W d    n1 s?w   Y  W n t	yZ   t| d t | _Y nw t|| _|| _
d S )NzMnon_linguistic_symbols is only used when remove_non_linguistic_symbols = Truerzutf-8)encodingc                 s   s    | ]}|  V  qd S N)rstrip).0line r   S/home/ubuntu/.local/lib/python3.10/site-packages/funasr/tokenizer/word_tokenizer.py	<genexpr>   s    z)WordTokenizer.__init__.<locals>.<genexpr>z doesn't exist.)r   warningswarnsetr	   
isinstancer   stropenFileNotFoundErrorr
   )selfr   r	   r
   fr   r   r   __init__   s(   


zWordTokenizer.__init__c                 C   s   | j j d| j dS )Nz(delimiter="z"))	__class____name__r   )r   r   r   r   __repr__'   s   zWordTokenizer.__repr__r   returnc                 C   s6   g }| | jD ]}| jr|| jv rq|| q|S r   )splitr   r
   r	   append)r   r   tokenstr   r   r   text2tokens*   s   zWordTokenizer.text2tokensr$   c                 C   s    | j d u rd}n| j }||S )N )r   join)r   r$   r   r   r   r   tokens2text2   s   

zWordTokenizer.tokens2text)NNF)r   
__module____qualname__r   r   r   r   boolr   r    r   r&   r)   r   r   r   r   r      s    
r   )
pathlibr   typingr   r   r   r   funasr.tokenizer.abs_tokenizerr   r   r   r   r   r   <module>   s    