o
    5tit                     @   s@   d dl mZ ddlmZ ddlmZ g dZG dd deZdS )	    )	lru_cache   )BaseTokenizer)TokenizerRegexp))u   㐀u   䶵)u   一u   龥)u   龦u   龻)u   豈u   鶴)u   侮u   頻)u   並u   龎)u    0u   ⩭6)u   ⾀0u   ⾡d)u   ＀u   ￯)u   ⺀u   ⻿)u   　u   〿)u   ㇀u   ㇯)u   ⼀u   ⿟)u   ⿰u   ⿿)u   ㄀u   ㄯ)u   ㆠu   ㆿ)u   ︐u   ︟)u   ︰u   ﹏)u   ☀u   ⛿)u   ✀u   ➿)u   ㈀u   ㋿)u   ㌀u   ㏿c                   @   sD   e Zd Zdd Zdd Zeedddd Zeddd	d
 ZdS )TokenizerZhc                 C   s   dS )Nzh selfr   r   U/home/ubuntu/.local/lib/python3.10/site-packages/sacrebleu/tokenizers/tokenizer_zh.py	signatureJ   s   zTokenizerZh.signaturec                 C   s   t  | _d S )N)r   _post_tokenizerr	   r   r   r   __init__M   s   zTokenizerZh.__init__i   )maxsizec                 C   s.   t D ]\}}||   kr|kr dS  qqdS )zu
        :param uchar: input char in unicode
        :return: whether the input char is a Chinese character.
        TF)_UCODE_RANGES)ucharstartendr   r   r   _is_chinese_charP   s
   zTokenizerZh._is_chinese_charc                 C   sL   |  }d}|D ]}| |r|d7 }||7 }|d7 }q||7 }q| |S )aX  The tokenization of Chinese text in this script contains two
        steps: separate each Chinese characters (by utf-8 encoding); tokenize
        the non Chinese part (following the `13a` i.e. mteval tokenizer).

        Author: Shujian Huang huangsj@nju.edu.cn

        :param line: input sentence
        :return: tokenized sentence
          )stripr   r   )r
   lineline_in_charscharr   r   r   __call__\   s   



zTokenizerZh.__call__N)	__name__
__module____qualname__r   r   staticmethodr   r   r   r   r   r   r   r   H   s    
r   N)	functoolsr   tokenizer_baser   tokenizer_rer   r   r   r   r   r   r   <module>   s
   '