o
    i                     @   sT   d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
Z
 G dd de
ZdS )    N)Path)IterableListUnion)check_argument_types)AbsTokenizerc                	   @   sl   e Zd Z			ddedeeeee f defddZdd	 Z	d
ede
e fddZdee defddZdS )WordTokenizerNF	delimiternon_linguistic_symbolsremove_non_linguistic_symbolsc                 C   s   t  sJ || _|s|d urtd |d u rt | _nIt|ttfr`t|}z#|j	ddd}tdd |D | _W d    n1 sDw   Y  W n t
y_   t| d t | _Y nw t|| _|| _d S )NzMnon_linguistic_symbols is only used when remove_non_linguistic_symbols = Truerzutf-8)encodingc                 s   s    | ]}|  V  qd S N)rstrip).0line r   O/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/text/word_tokenizer.py	<genexpr>    s    z)WordTokenizer.__init__.<locals>.<genexpr>z doesn't exist.)r   r	   warningswarnsetr
   
isinstancer   stropenFileNotFoundErrorr   )selfr	   r
   r   fr   r   r   __init__   s*   



zWordTokenizer.__init__c                 C   s   | j j d| j dS )Nz(delimiter="z"))	__class____name__r	   )r   r   r   r   __repr__(   s   zWordTokenizer.__repr__r   returnc                 C   s6   g }| | jD ]}| jr|| jv rq|| q|S r   )splitr	   r   r
   append)r   r   tokenstr   r   r   text2tokens+   s   zWordTokenizer.text2tokensr%   c                 C   s    | j d u rd}n| j }||S )N )r	   join)r   r%   r	   r   r   r   tokens2text3   s   

zWordTokenizer.tokens2text)NNF)r    
__module____qualname__r   r   r   r   boolr   r!   r   r'   r*   r   r   r   r   r   
   s    
r   )r   pathlibr   typingr   r   r   	typeguardr   espnet2.text.abs_tokenizerr   r   r   r   r   r   <module>   s    