o
    Qi                     @   sf   d dl Z d dlmZmZ e de de de dgZddd	Zdd
dZdddZdd Z	dS )    N)	decomposecomposez(\w\w\w\w)\1{3,}z(\w\w\w)\1{3,}z(\w\w)\1{3,}z
(\w)\1{3,}   c                    s    |   }d fdd|D S )N c                 3   s    | ]}t | V  qd S N)_normalize_korean_token).0token
num_repeat P/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/tokenizer/_normalizer.py	<genexpr>   s    znormalize.<locals>.<genexpr>)splitjoin)sentencer   tokensr   r
   r   	normalize   s   r   c                 C   s   t | } t| |} | S r   )_normalize_emoji_remove_repeat)r	   r   r   r   r   r      s   
r   c                 C   s&   |dkrt D ]
}|d| | } q| S )Nr   z\1)repeatchars_patternssub)r	   r   patternr   r   r   r      s   r   c                 C   s  t | dkr| S g }dd | D }t| ||dd  D ]b\}}}|d ks(|d kr.|| q|d dkrN|d |d krN|t|d |d d|d   q|d dkrx|d dkrx|d |d krx||d dkrr|d |d  n|d  q|| qd|| d  S )	N   c                 S   s   g | ]}t |qS r   )r   )r   cr   r   r   
<listcomp>    s    z$_normalize_emoji.<locals>.<listcomp>r   r   r    )lenzipappendr   r   )r	   token_decomposedscharcdndr   r   r   r      s   
$(,r   )r   )
resoynlp.hangler   r   compiler   r   r   r   r   r   r   r   r   <module>   s   


