o
    SiX                     @   s   d dl mZ d dlmZmZ ddlmZ g dZdefddZ	d	e
de
fd
dZdd	e
dee
 de
fddZd	e
de
fddZd	e
de
fddZd	e
de
fddZd	e
de
de
de
fddZd dee
ef dee dee dee
ef fddZdS )!    sub)UnionOptional   )T2S_DICT)filter_unusual_charactershandle_blank_characteruppercase_to_lowercasetraditional_to_simplifiedfull_angle_to_half_anglehandle_substitutereturnc                   C   s   t S )u   获取所有的预处理管道)ALL_PIPELINES r   r   E/home/ubuntu/.local/lib/python3.10/site-packages/proces/preprocess.pyget_all_pipelines   s   r   textc                 C   s    d}d}t d| | dd| S )u   过滤所有非常见字符，保留中文、英文、常见标点、空白字符

    Attributes:
        text: input text
    z\u4E00-\u9FA5u{   !\"#$%&\'()*+,\-./:;<=>?@\\\[\]^_`{|}~¥·—‘’“”…、。〈〉《》「」『』【】！（），：；？｜～z[^\w\sz]+ r   )r   chinesepunctuationr   r   r   r      s   r   r   replc                 C   s   t d|| S )uw   处理空白字符，默认替换成空字符

    Attributes:
        text: input text
        repl: replace text
    z\s+r   )r   r   r   r   r   r	   !   s   r	   c                 C   s   |   S )u>   大写转小写

    Attributes:
        text: input text
    )lowerr   r   r   r   r
   +   s   r
   c                 C   s   d dd | D S )u`   繁体转简体

    Attributes:
        text: input text

    convert data from mediawiki.
    r   c                 S   s$   g | ]}|t  v rt | n|qS r   )r   keys).0tr   r   r   
<listcomp><   s   $ z-traditional_to_simplified.<locals>.<listcomp>)joinr   r   r   r   r   4   s   r   c                 C   sT   d}| D ]#}t |}|dkrd}nd|  krdkr!n n|d8 }|t|7 }q|S )u>   全角转半角

    Attributes:
        text: input text
    r   i 0      i  i^  i  )ordchr)r   resultucharinside_coder   r   r   r   ?   s   r   ptnc                 C   s   t ||| S )ut   替换一些字符

    Attributes:
        text: input text
        ptn: re pattern
        repl: replace text
    r   )r   r%   r   r   r   r   r   Q   s   r   Ndata	pipelinesparamsc                 C   s   g d}|du r
|}t | tkr| g}n| }g }|D ]7}|D ]-}|tv rD|du r0t | |}q|| v rCt | |g|| R  }qtd| d|| qt | tkr\|d S |S )u  文本预处理

    Attributes:
        data: input data
        pipelines: default is
            ["handle_blank_character",
            "uppercase_to_lowercase",
            "traditional_to_simplified",
            "full_angle_to_half_angle"]
        params: function parameters
    )r	   r
   r   r   Nz
pipeline: z not support!r   )typestrr   globalsr   
ValueErrorappend)r&   r'   r(   default_pipelines	data_listresultsr   funcr   r   r   
preprocess\   s(   r2   )r   )NN)rer   typingr   r   confr   r   listr   r*   r   r	   r
   r   r   r   dictr2   r   r   r   r   <module>   s    

	 
