o
    ij                     @   s   d dl mZ d dlmZ d dlmZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZ edd	G d
d	 d	e	Zdd Zdd ZdS )    )Path)Iterable)List)UnionN)BaseTokenizer)tablestokenizer_classesCharTokenizerc                       s   e Zd Z					ddeeeee f dedededef
 fd	d
Zdd Z	deee
f dee fddZdee defddZ  ZS )r	   N<space>Fnon_linguistic_symbolsspace_symbolremove_non_linguistic_symbolssplit_with_spaceseg_dictc                    s   t  jdi | || _|d u rt | _nIt|ttfrYt|}z#|jddd}tdd |D | _W d    n1 s=w   Y  W n t	yX   t
| d t | _Y nw t|| _|| _|| _d | _|d urm|n|dd }|d ur~t|| _d S d S )	Nrzutf-8encodingc                 s   s    | ]}|  V  qd S )N)rstrip).0line r   S/home/ubuntu/.local/lib/python3.10/site-packages/funasr/tokenizer/char_tokenizer.py	<genexpr>   s    z)CharTokenizer.__init__.<locals>.<genexpr>z doesn't exist.seg_dict_filer   )super__init__r   setr   
isinstancer   stropenFileNotFoundErrorwarningswarnr   r   r   getload_seg_dict)selfr   r   r   r   r   kwargsf	__class__r   r   r      s.   	

zCharTokenizer.__init__c                 C   s   | j j d| j d| j dS )Nz(space_symbol="z"non_linguistic_symbols="z"))r)   __name__r   r   r%   r   r   r   __repr__,   s   
zCharTokenizer.__repr__r   returnc                 C   s   | j d ur| d}t|| j }|S g }t|dkr_| jD ]}||r>| js4||d t|  |t|d  } nq|d }|dkrN|dd  }q|| |dd  }t|dks|S )N r      )	r   stripsplitseg_tokenizelenr   
startswithr   append)r%   r   tokenswtr   r   r   text2tokens4   s*   



zCharTokenizer.text2tokensr6   c                    s    fdd|D }d |S )Nc                    s   g | ]}| j kr|nd qS )r.   )r   )r   r8   r+   r   r   
<listcomp>M   s    z-CharTokenizer.tokens2text.<locals>.<listcomp> )join)r%   r6   r   r+   r   tokens2textL   s   
zCharTokenizer.tokens2text)Nr
   FFN)r*   
__module____qualname__r   r   r   r   boolr   r,   listr   r9   r=   __classcell__r   r   r(   r   r	      s(    c                 C   s   i }t | ts	J t| ddd)}| }|D ]}|  }|d }|dd  }d|||< qW d    |S 1 s<w   Y  |S )Nr   utf8r   r   r/   r.   )r   r   r   	readlinesr0   r1   r<   )r   r   r'   linesr   skeyvaluer   r   r   r$   Q   s   
r$   c                 C   s   t d}d}| D ]3}| }||v r||| d 7 }q	||r8|D ]}||v r2||| d 7 }q#|d7 }q#q	|d7 }q	|  S )Nz([\u4E00-\u9FA5A-Za-z0-9])r;   r.   z<unk> )recompilelowermatchr0   r1   )txtr   patternout_txtwordcharr   r   r   r2   ^   s   



r2   )pathlibr   typingr   r   r   r!   rI   funasr.tokenizer.abs_tokenizerr   funasr.registerr   registerr	   r$   r2   r   r   r   r   <module>   s    
D