o
    …wÖi¤
  ã                   @   sB   d dl Z d dlmZ d dlZd dlZd dlmZ G dd„ dƒZdS )é    N)ÚList)Úspacingc                   @   sR   e Zd ZdZdd„ Zdedefdd„Zdee defdd	„Zdedefd
d„Z	dS )ÚChineseProcessorzF
    Tokenizer, Detokenizer and Normalizer utilities for Chinese.
    c                 C   s   t  d¡| _d S )Nzt2s.json)ÚopenccÚOpenCCÚ
normalizer)Úself© r	   úr/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/common/tokenizers/chinese_tokenizers.pyÚ__init__/   s   zChineseProcessor.__init__ÚtextÚreturnc                 C   s   | j  |¡S )N)r   Úconvert©r   r   r	   r	   r
   Ú	normalize2   s   zChineseProcessor.normalizec                    s$   t  d¡‰ ‡ fdd„}|d |¡ƒS )NzÙ([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])c                    s   t ˆ  d| ¡ƒ ¡ S )Nz\1)r   ÚsubÚstrip)Ús©ÚRE_WS_IN_FWr	   r
   Ú<lambda>:   s    z-ChineseProcessor.detokenize.<locals>.<lambda>ú )ÚreÚcompileÚjoin)r   r   Ú
detokenizer	   r   r
   r   5   s
   ÿzChineseProcessor.detokenizec                 C   s   t  |¡}d |¡S )Nr   )ÚjiebaÚcutr   r   r	   r	   r
   Útokenize=   s   

zChineseProcessor.tokenizeN)
Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   Ústrr   r   r   r   r	   r	   r	   r
   r   *   s    r   )r   Útypingr   r   r   Úpangur   r   r	   r	   r	   r
   Ú<module>   s   !