import re
from functools import lru_cache

from .tokenizer_base import BaseTokenizer


def _normalize_general_and_western(sent: str) -> str:
    # language-independent part:
    # strip end-of-line hyphenation and join lines
    sent = re.sub(r"\n-", "", sent)

    # join lines
    sent = re.sub(r"\n", " ", sent)

    # handle XML escaped symbols
    sent = re.sub(r"&quot;", "\"", sent)
    sent = re.sub(r"&amp;", "&", sent)
    sent = re.sub(r"&lt;", "<", sent)
    sent = re.sub(r"&gt;", ">", sent)

    # language-dependent part (assuming Western languages):
    # pad with spaces so the boundary-sensitive patterns below also match at the edges
    sent = f" {sent} "

    # tokenize punctuation
    sent = re.sub(r"([{-~[-` -&(-+:-@/])", r" \1 ", sent)

    # handle possessives
    sent = re.sub(r"'s ", r" 's ", sent)
    sent = re.sub(r"'s$", r" 's", sent)

    # tokenize period and comma unless preceded by a digit
    sent = re.sub(r"([^0-9])([\.,])", r"\1 \2 ", sent)

    # tokenize period and comma unless followed by a digit
    sent = re.sub(r"([\.,])([^0-9])", r" \1 \2", sent)

    # tokenize dash when preceded by a digit
    sent = re.sub(r"([0-9])(-)", r"\1 \2 ", sent)

    return sent


def _normalize_asian(sent: str) -> str:
    # split Chinese characters and Japanese kanji down to the character level
    # 4E00-9FFF CJK Unified Ideographs
    # 3400-4DBF CJK Unified Ideographs Extension A
    sent = re.sub(r"([\u4e00-\u9fff\u3400-\u4dbf])", r" \1 ", sent)

    # 31C0-31EF CJK Strokes
    # 2E80-2EFF CJK Radicals Supplement
    sent = re.sub(r"([\u31c0-\u31ef\u2e80-\u2eff])", r" \1 ", sent)

    # 3300-33FF CJK Compatibility
    # F900-FAFF CJK Compatibility Ideographs
    # FE30-FE4F CJK Compatibility Forms
    sent = re.sub(r"([\u3300-\u33ff\uf900-\ufaff\ufe30-\ufe4f])", r" \1 ", sent)

    # 3200-32FF Enclosed CJK Letters and Months
    sent = re.sub(r"([\u3200-\u3f22])", r" \1 ", sent)

    # split Hiragana, Katakana and Katakana Phonetic Extensions
    # only when adjacent to something else
    # 3040-309F Hiragana
    # 30A0-30FF Katakana
    # 31F0-31FF Katakana Phonetic Extensions
    sent = re.sub(r"(^|^[\u3040-\u309f])([\u3040-\u309f]+)(?=$|^[\u3040-\u309f])", r"\1 \2 ", sent)
    sent = re.sub(r"(^|^[\u30a0-\u30ff])([\u30a0-\u30ff]+)(?=$|^[\u30a0-\u30ff])", r"\1 \2 ", sent)
    sent = re.sub(r"(^|^[\u31f0-\u31ff])([\u31f0-\u31ff]+)(?=$|^[\u31f0-\u31ff])", r"\1 \2 ", sent)

    sent = re.sub(TercomTokenizer.ASIAN_PUNCT, r" \1 ", sent)
    sent = re.sub(TercomTokenizer.FULL_WIDTH_PUNCT, r" \1 ", sent)

    return sent


def _remove_punct(sent: str) -> str:
    return re.sub(r"[\.,\?:;!\"\(\)]", "", sent)


def _remove_asian_punct(sent: str) -> str:
    sent = re.sub(TercomTokenizer.ASIAN_PUNCT, r"", sent)
    sent = re.sub(TercomTokenizer.FULL_WIDTH_PUNCT, r"", sent)
    return sent


class TercomTokenizer(BaseTokenizer):
    """Re-implementation of Tercom Tokenizer in Python 3.

    See src/ter/core/Normalizer.java in https://github.com/jhclark/tercom

    Note that Python doesn't support named Unicode blocks so the mapping for
    relevant blocks was taken from here:

    https://unicode-table.com/en/blocks/
    """

    # Character classes for CJK and full-width punctuation, used by the helpers above
    ASIAN_PUNCT = r"([\u3001\u3002\u3008-\u3011\u3014-\u301f\uff61-\uff65\u30fb])"
    FULL_WIDTH_PUNCT = r"([\uff0e\uff0c\uff1f\uff1a\uff1b\uff01\uff02\uff08\uff09])"

    def __init__(self,
                 normalized: bool = False,
                 no_punct: bool = False,
                 asian_support: bool = False,
                 case_sensitive: bool = False):
        """Initialize the tokenizer.

        :param normalized: Enable character normalization. By default, normalizes a couple of things such as
            newlines being stripped, retrieving XML encoded characters, and fixing tokenization for punctuation. When
            'asian_support' is enabled, also normalizes specific Asian (CJK) character sequences, i.e.
            split them down to the character level.
        :param no_punct: Remove punctuation. Can be used in conjunction with 'asian_support' to also remove typical
            punctuation markers in Asian languages (CJK).
        :param asian_support: Enable special treatment of Asian characters. This option only has an effect when
            'normalized' and/or 'no_punct' is enabled. If 'normalized' is also enabled, then Asian (CJK)
            characters are split down to the character level. If 'no_punct' is enabled alongside 'asian_support',
            specific unicode ranges for CJK and full-width punctuations are also removed.
        :param case_sensitive: Enable case sensitivity, i.e., do not lower case data.
        """
        self._normalized = normalized
        self._no_punct = no_punct
        self._asian_support = asian_support
        self._case_sensitive = case_sensitive

    @lru_cache(maxsize=2**16)
    def __call__(self, sent: str) -> str:
        if not sent:
            return ""

        if not self._case_sensitive:
            sent = sent.lower()

        if self._normalized:
            sent = _normalize_general_and_western(sent)
            if self._asian_support:
                sent = _normalize_asian(sent)

        if self._no_punct:
            sent = _remove_punct(sent)
            if self._asian_support:
                sent = _remove_asian_punct(sent)

        # strip extra whitespace and return the tokenized sentence
        return " ".join(sent.split())

    def signature(self):
        return "tercom"
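
# Illustrative usage sketch (not part of the upstream module): how the tokenizer
# is typically constructed and called from user code. The import path below
# assumes the standard sacrebleu package layout seen in this file's location;
# the token output shown is indicative only.
#
#     from sacrebleu.tokenizers.tokenizer_ter import TercomTokenizer
#
#     tok = TercomTokenizer(normalized=True)
#     tok('The "cat" sat, quietly.')
#     # -> roughly: 'the " cat " sat , quietly .'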