o
    Qi~                      @   s  d dl Z e jdkree  e d d dlZd dlZd dlZdZdZ	dZ
dZdZd	Zd
ZdZg dZg dZg dZg dZg dZedZedZdddd i fddZdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Z d%d& Z!d'd( Z"G d)d* d*Z#dS )+    N)      zutf-8i   i  iL     i11  iN1  iO1  ic1  )   ㄱ   ㄲ   ㄴ   ㄷ   ㄸ   ㄹ   ㅁ   ㅂ   ㅃ   ㅅ   ㅆ   ㅇ   ㅈ   ㅉ   ㅊ   ㅋ   ㅌ   ㅍ   ㅎ)   ㅏ   ㅐ   ㅑ   ㅒ   ㅓ   ㅔ   ㅕ   ㅖ   ㅗ   ㅘ   ㅙ   ㅚ   ㅛ   ㅜ   ㅝ   ㅞ   ㅟ   ㅠ   ㅡ   ㅢ   ㅣ) r   r      ㄳr      ㄵ   ㄶr   r
      ㄺ   ㄻ   ㄼ   ㄽ   ㄾ   ㄿ   ㅀr   r      ㅄr   r   r   r   r   r   r   r   r   )r   r   r.   r   r/   r0   r   r	   r
   r1   r2   r3   r4   r5   r6   r7   r   r   r   r8   r   r   r   r   r   r   r   r   r   r   z\s+z
(\w)\1{3,}Fc           
      C   sr  d}t j|tdd |dkrtd| | } g }| D ]}|dkr&|| qt|}	t|	  kr4tksLn t	|	  kr?t
ksLn t|	  krJtkrRn n|| q|rj|	dkr\|	dksd|	d	krj|	d
krj|| q|rz|	dkrz|	dkrz|| q|r|	dks|	dks|	dks|	dks|	dks|	dks|	dkr|| q||v r|| q|d qtdd| S )Nzenormalize func will be moved soynlp.normalizer at ver 0.1
argument remains will be removed at ver 0.1r   )
stacklevelr   z\1r-   a   z   A   Z   0   9   !   "   '   ,   .   ?   `    )warningswarnDeprecationWarningrepeatchars_patternsubappendto_base	kor_beginkor_end
jaum_beginjaum_end
moum_beginmoum_enddoublespace_patternjoinstrip)
docenglishnumberpunctuationremove_repeatremainsmessagefci rb   I/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/hangle/_hangle.py	normalize,   s6   
D
$

<

rd   c                 C   s.   t ttt|   tt|  t| S N)chrrO   chosung_basechosung_listindexjungsung_basejungsung_listjongsung_list)chosungjungsungjongsungrb   rb   rc   composeL   s   .rp   c                 C   s   t | sd S t| }t|  krtkrn n| ddfS t|  kr%tkr,n nd| dfS |t8 }|t }||t  t }||t  |t  }t	| t
| t| fS )Nr-   )character_is_koreanrN   rQ   rR   rS   rT   rO   rg   rj   rh   rk   rl   )r`   ra   chojungjongrb   rb   rc   	decomposeO   s   

ru   c                 C   sP   t | }t|  kotkn  p't|  kotkn  p't|  ko%tkS   S re   )rN   rO   rP   rQ   rR   rS   rT   )r`   ra   rb   rb   rc   rq   ]   s   Hrq   c                 C      t t|   kotkS   S re   )rO   rN   rP   r`   rb   rb   rc   character_is_complete_koreana      rx   c                 C   rv   re   )rQ   rN   rR   rw   rb   rb   rc   character_is_jaumd   ry   rz   c                 C   rv   re   )rS   rN   rT   rw   rb   rb   rc   character_is_moumg   ry   r{   c                 C   sT   t jjdkrt| tkst| tkrt| S tt| tks$t| tkr(t| S t)Nr   )	sysversion_infomajortypestrunicodeord	TypeErrorintrw   rb   rb   rc   rN   j   s   rN   c                 C   s   t | } | dko| dkS )Nr>   r?   rN   ra   rb   rb   rc   character_is_numberv   s   r   c                 C   s(   t | } | dkr| dkp| dko| dkS )Nr:   r;   r<   r=   r   r   rb   rb   rc   character_is_englishz   s    r   c                 C   s@   t | } | dkp| dkp| dkp| dkp| dkp| dkp| dkS )Nr@   rA   rB   rC   rD   rE   rF   r   r   rb   rb   rc   character_is_punctuation~   s   8r   c                   @   sH   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dS )ConvolutionHangleEncoderu   초/중/종성을 구성하는 자음/모음과 띄어쓰기만 인코딩
    one hot vector [ㄱ, ㄴ, ㄷ, ... ㅎ, ㅏ, ㅐ, .. ㅢ, ㅣ,"  ", ㄱ, ㄲ, ... ㅍ, ㅎ,"  ", 0, 1, 2, .. 9]
    c                 C   s  d| _ d| _d| _d| _d| _d| _dd td	D }d
}d}tt t	 | |g |g }t
|| _i ddddddddddddddddddddd d	d!d"d#d$d%d&d'd(d)d*d+d,i d-d.d/d0d1dd2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMi dNdOdPdQdRdSdTdUdVdWdXdYd
ddZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndoi| _d S )pN   (   D   N   O   P   c                 S      g | ]}t |qS rb   )r   ).0ra   rb   rb   rc   
<listcomp>       z5ConvolutionHangleEncoder.__init__.<locals>.<listcomp>
   r-   z<unk>r   r   r      r   r   r      r	      r
      r      r   r   r      r   	   r   r      r      r      r      r      r      r      r      r   r      r      r      r      r      r      r      r       r!   r   r"      r#      r$      r%       r&   r@   r'   rA   r(   #   r)   $   r*   %   r+   &   r,   rB   r.   +   r/   -   r0   rD   r1   1   r2   2   r3   3   r4   4   r5   5   r6   6   r7   7   r8   :   )
jung_begin
jong_beginnumber_beginspaceunkdimrangerh   rk   rl   npasarrayidx_to_charjamo_to_idx)selfnumr   r   r   rb   rb   rc   __init__   s   z!ConvolutionHangleEncoder.__init__c                 C   sJ   |  |}tt|| jf}t|D ]\}}|D ]}d|||f< qq|S )Nr   )sent_to_onehotr   zeroslenr   	enumerate)r   sentonehotxra   xijrb   rb   rc   encode   s   
zConvolutionHangleEncoder.encodec                 C   s   |  |}dd |D }g }t||D ]1\}}|dkr#|| jf qd|  kr-dkr;n n||d | j f q|| || q|S )Nc                 S   r   rb   )r   )r   r`   rb   rb   rc   r      r   z;ConvolutionHangleEncoder.sent_to_onehot.<locals>.<listcomp>r   r>   r?   )
_normalizeziprM   r   r   
_decompose)r   r   charsordsr   charidxrb   rb   rc   r      s   
z'ConvolutionHangleEncoder.sent_to_onehotc                    s    fdd}g }|D ]V}t |dkr6d|d   kr jk s+n td|d  jf | j|d   q
t |dkrX|| t fdd|D \}}}|t||| q
| jd	  q
d
|S )Nc                    s~   | \}}}d|  kr j k sn td|  j |  kr# jk s*n td|  j|  kr6 jk s=n td| d S )Nr   zChosung %d is out of indexzJungsung %d is out of indexzJongsung %d is out of index)r   
ValueErrorr   r   )r`   rr   rs   rt   r   rb   rc   	check_cjj   s   
z:ConvolutionHangleEncoder.onehot_to_sent.<locals>.check_cjjr   r   z*character index %d is out of index [0, %d]r   c                 3   s    | ]} j | V  qd S re   )r   )r   cir   rb   rc   	<genexpr>   s    z:ConvolutionHangleEncoder.onehot_to_sent.<locals>.<genexpr>rG   )r   r   r   rM   r   tuplerp   rV   )r   encoded_sentr   r   r`   rr   rs   rt   rb   r   rc   onehot_to_sent   s   	
z'ConvolutionHangleEncoder.onehot_to_sentc                 C   s2   dd l }|d}|d|}td| }|S )Nr   u   [^ㄱ-ㅎㅏ-ㅣ가-힣 0-9]r-   )recompilerL   rU   rW   )r   r   r   regexrb   rb   rc   r      s
   
z#ConvolutionHangleEncoder._normalizec                 C   s   t tt|  t|  | S re   )rf   rO   rg   rj   )r   rr   rs   rt   rb   rb   rc   _compose   ry   z!ConvolutionHangleEncoder._composec                 C   st   t |  kr
tkr1n n%|t 8 }|t }||t  t }||t  |t  }|| j| | j| fS | j|| jfS re   )	rO   rP   rg   rj   r   r   r   getr   )r   r`   ra   rr   rs   rt   rb   rb   rc   r      s   z#ConvolutionHangleEncoder._decomposeN)__name__
__module____qualname____doc__r   r   r   r   r   r   r   rb   rb   rb   rc   r      s    r   )$r|   r}   reloadsetdefaultencodingrH   r   numpyr   rO   rP   rg   rj   rQ   rR   rS   rT   rh   rk   rl   	jaum_list	moum_listr   rU   rK   rd   rp   ru   rq   rx   rz   r{   rN   r   r   r   r   rb   rb   rb   rc   <module>   sD   



 