o
    
il                     @   s   d dl Z d dlmZmZ d dlZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd ZdS )    N)h2jj2hc                 C   sv   dd |  d }tdd|}|dd}|dd}|d	d
}|dd}|dd}|dd}|d S )z-Modify arpabets so that it fits our processes z $z\d z T S z TS z D Z z DZ z AW ER z AWER z IH R $z IH ER z EH R $z EH ER z$ )joinresubreplacestripsplit)arpabetsstring r   F/home/ubuntu/.local/lib/python3.10/site-packages/misaki/g2pkc/utils.pyadjust   s   r   c                 C   s   i dddddddddd	d
ddddddd	dddddddddddddddddddddd d!d	d	d"	}| | | S )#zArpabet to choseong or onsetBu   ᄇCHu   ᄎDu   ᄃDHDZu   ᄌFu   ᄑGu   ᄀHHu   ᄒJHKu   ᄏLu   ᄅMu   ᄆNu   ᄂNGu   ᄋPRSu   ᄉu   ᄐWY)	SHTTHTSVr"   r#   ZZHgetarpabetdr   r   r   to_choseong   sZ   	
r0   c                 C   sp   i ddddddddd	d
dddddddddddddddddddddd}| | | S )zArpabet to jungseong or vowelAAu   ᅡAE   ᅢAHu   ᅥAOu   ᅩAWu	   ᅡ우AWERu	   ᅡ워AYu	   ᅡ이EH   ᅦEREYu	   ᅦ이IH   ᅵIYOWOYu	   ᅩ이UH   ᅮUWr+   r-   r   r   r   to_jungseong4   sB   	
rE   c              	   C   s   i dddddddddd	d
ddddddddddddddddddddddddddddddd}| | | S )zArpabet to jongseong or codar   u   ᆸr   u   ᆾr   u   ᆮr   r   u   ᇁr   u   ᆨr   u   ᇂr   u   ᆽr   r      ᆯr   u   ᆷr   u   ᆫr   u   ᆼr   r    r!   u   ᆺr$   )r%   r&   r(   r"   r#   r)   r*   r+   r-   r   r   r   to_jongseongI   sV   	
rG   c                 C   s&   g d}|D ]
\}}|  ||} q| S )zSome postprocessing rules))u   그Wu   ᄀW)u   흐Wu   ᄒW)u   크Wu   ᄏW)u   ᄂYᅥu   니어)u   ᄃYᅥu   디어)u   ᄅYᅥu   리어)u   Yᅵr>   )u   Yᅡu   ᅣ)u   Yᅢ   ᅤ)u   Yᅥu   ᅧ)u   Yᅦ   ᅨ)u   Yᅩu   ᅭ)u   Yᅮu   ᅲ)u   Wᅡu   ᅪ)u   Wᅢ   ᅫ)u   Wᅥ   ᅯ)u   WᅩrK   )u   WᅮrC   )u   Wᅦ   ᅰ)u   Wᅵu   ᅱ)u   ᅳᅵu   ᅴ)r#   r>   )r"   rC   r	   )r   pairsstr1str2r   r   r   reconstructh   s   rQ   c                  C   s   t tjtjtd ddd  } | d d}g }| dd D ]P}|d}|d }t	|D ]@\}}|| }t
|dkrBq3|dkrGq3| | }	d	|v rg|d	d }
|d	d dd
 d}n|}
g }||	|
|f q3q$|S )zParse the main rule tablez
/table.csvrutf8encodingr   ,   N(/)openospathdirnameabspath__file__read
splitlinesr   	enumeratelenappend)linesonsetstablelinecolscodaionsetcellrO   rP   rule_idsr   r   r   parse_table   s(   *
rp   c                 C   s  t jdkr.|| d}g }|dd D ]}|d\}}|dd }|||f qn
t jdkr8|| }td	d
| d
dd |D krK| S dd t	| D }g }|D ]'\}	}
|
dd }
|
dksk|	dkrnd}
n|
d }
|dt
|	d  |
  qXd
|}|D ]\}}|d| | ||d  }qd
}t| |D ]?\}}
||7 }|dkr|
dkr|d7 }q|
dkrt|d dv r|d7 }q|
dkrt|d dv r|d7 }q|
dkr|d7 }q|S ) zLattach pos tags to the given string using Mecab
    mecab: mecab object
    nt
N	rV   r   posixz[ \n]r   c                 s   s    | ]\}}|V  qd S )Nr   ).0token_r   r   r   	<genexpr>   s    zannotate.<locals>.<genexpr>c                 S   s    g | ]\}}|d v r||fqS ))r   rr   r   )rv   rl   charr   r   r   
<listcomp>   s     zannotate.<locals>.<listcomp>+rY   NNBCu   곳r   rx   rW   u   의Jz/JErF   z/Er(   u   ᆫᆬᆷᆱᆰᆲᆴz/Pz/B)r\   nameparser   re   posr   r   r   rc   rd   zipr   )r   mecabr   tokenspp1p2blankstag_seqrw   tagrl   rz   	annotatedr   r   r   annotate   sP   


"

r   c                 C   sj   t dd| } | }tt d|}|D ]
}||t| }qtt d|}|D ]
}||t| }q(|S )Nu   (^|[^ᄀ-ᄒ])([ᅡ-ᅵ])u   \1ᄋ\2u   [ᄀ-ᄒ][ᅡ-ᅵ][ᆨ-ᇂ]u   [ᄀ-ᄒ][ᅡ-ᅵ])r   r   setfindallr	   r   )lettersr   sylssylr   r   r   compose   s   r   c                 C   s4   |  dd} |  dd} |  dd} |  dd} | S )z]For group_vowels=True
    Contemporarily, Korean speakers don't distinguish some vowels.
    r3   r:   rH   rI   rJ   u   ᅬrL   rM   )inpr   r   r   group   s
   r   c                  C   st   t dddd  } g }| D ]}|dr|td| qg }|D ]\}}|dD ]	}|||f q-q$|S )zFor internal use	rules.txtrR   rS   rT   ->u/   ([ㄱ-힣][ ㄱ-힣]*)\[([ㄱ-힣][ ㄱ-힣]*)]rZ   )	r[   ra   rb   
startswithextendr   r   r   re   )textexamplesri   	_examplesr   gteachr   r   r   _get_examples   s   
r   c                  C   sz   t tjtjtjtdddd  	d} t
 }| D ]}| d | dd }}d	||| < q |S )
zfor verbose=Truer   rR   rS   rT   z

r   rW   Nrr   )r[   r\   r]   r   r^   r_   r`   ra   r
   r   dictrb   )rulesrule_id2textrulerule_idtextsr   r   r   get_rule_id2text  s   6r   c                 C   sN   | r!||kr#|t dd|kr%tt|dt| td|d dS dS dS dS )z-displays the process and relevant informationz/[EJPB]r   r   z[1;31mz[0mN)r   r   printr   )verboseoutr   r   r   r   r   gloss  s   r   )r   jamor   r   r\   r   r0   rE   rG   rQ   rp   r   r   r   r   r   r   r   r   r   r   <module>   s      1
