o
    ॵi
                     @   sn   d dl Z d dlZdZejZdefddZdefddZded	efd
dZ	ded	efddZ
dd Zdd ZdS )    Nu   ＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､　、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·！？｡。decoded_strc                 C   s   |  d}g }d}t|D ]&\}}t|r|dkr|}q|dkr.|d|||  d}|| q|dkrD|d||d   d| S )N  )split	enumerate_is_chinese_strappendjoinstrip)r   old_word_listnew_word_liststartiword r   R/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/utils/chinese_utils.py"remove_space_between_chinese_chars
   s   
r   stringc                 C   s   d d dd | D  S )Nr   r   c                 S   s,   g | ]}t |s|tv rd | d n|qS )r   )_is_chinese_charCHINESE_PUNCTUATION).0charr   r   r   
<listcomp>   s    z'rebuild_chinese_str.<locals>.<listcomp>)r
   r   r   r   r   r   rebuild_chinese_str   s
   r   returnc                 C   s   t dd | D S )Nc                 s   s(    | ]}t |p|tv p|tv V  qd S )N)r   r   ENGLISH_PUNCTUATION)r   cpr   r   r   	<genexpr>&   s    
z"_is_chinese_str.<locals>.<genexpr>)allr   r   r   r   r   %   s   r   r   c                 C   s   t | } | dkr| dksD| dkr| dksD| dkr| dksD| dkr$| dksD| d	kr,| d
ksD| dkr4| dksD| dkr<| dksD| dkrF| dkrFdS dS )z6Checks whether CP is the codepoint of a CJK character.i N  i  i 4  iM  i   iߦ i  i? i@ i i  i i   i  i  i TF)ord)r   r   r   r   r   +   s    r   c                 C   sH   ddl m} g d}d}| D ]}|dv r|d }||7 }q||d}|S )Nr   )convert)
u   零u   一u   二u   三u   四u   五u   六u   七u   八u   九r   
0123456789zzh-hans)zhconvr"   )textr"   chinese_numbernew_textxr   r   r   normalize_chinese_number:   s   

r)   c                 C   sF   |   tdtd} tdd| } | d} | dd | } | S )Nr   z\s{2,}
)lowerreplacer   r   resubrstripr   )r%   	max_wordsr   r   r   pre_chineseG   s   

r1   )r-   r   r   punctuationr   strr   r   boolr   r   r)   r1   r   r   r   r   <module>   s   