o
    2wi,                     @   s   d dl Z d dlmZmZ d dlZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd ZdS )    N)h2jj2hc                 C   sv   dd |  d }tdd|}|dd}|dd}|d	d
}|dd}|dd}|dd}|d S )z-Modify arpabets so that it fits our processes z $z\d z T S z TS z D Z z DZ z AW ER z AWER z IH R $z IH ER z EH R $z EH ER z$ )joinresubreplacestripsplit)arpabetsstring r   G/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/g2pk/utils.pyadjust   s   r   c                 C   s   i dddddddddd	d
ddddddd	dddddddddddddddddddddd d!d	d	d"	}| | | S )#zArpabet to choseong or onsetBu   ᄇCHu   ᄎDu   ᄃDHDZu   ᄌFu   ᄑGu   ᄀHHu   ᄒJHKu   ᄏLu   ᄅMu   ᄆNu   ᄂNGu   ᄋPRSu   ᄉu   ᄐWY)	SHTTHTSVr"   r#   ZZHgetarpabetdr   r   r   to_choseong   sZ   	
r0   c                 C   sp   i ddddddddd	d
dddddddddddddddddddddd}| | | S )zArpabet to jungseong or vowelAAu   ᅡAE   ᅢAHu   ᅥAOu   ᅩAWu	   ᅡ우AWERu	   ᅡ워AYu	   ᅡ이EH   ᅦEREYu	   ᅦ이IH   ᅵIYOWOYu	   ᅩ이UH   ᅮUWr+   r-   r   r   r   to_jungseong4   sB   	
rE   c              	   C   s   i dddddddddd	d
ddddddddddddddddddddddddddddddd}| | | S )zArpabet to jongseong or codar   u   ᆸr   u   ᆾr   u   ᆮr   r   u   ᇁr   u   ᆨr   u   ᇂr   u   ᆽr   r      ᆯr   u   ᆷr   u   ᆫr   u   ᆼr   r    r!   u   ᆺr$   )r%   r&   r(   r"   r#   r)   r*   r+   r-   r   r   r   to_jongseongI   sV   	
rG   c                 C   s&   g d}|D ]
\}}|  ||} q| S )zSome postprocessing rules))u   그Wu   ᄀW)u   흐Wu   ᄒW)u   크Wu   ᄏW)u   ᄂYᅥu   니어)u   ᄃYᅥu   디어)u   ᄅYᅥu   리어)u   Yᅵr>   )u   Yᅡu   ᅣ)u   Yᅢ   ᅤ)u   Yᅥu   ᅧ)u   Yᅦ   ᅨ)u   Yᅩu   ᅭ)u   Yᅮu   ᅲ)u   Wᅡu   ᅪ)u   Wᅢ   ᅫ)u   Wᅥ   ᅯ)u   WᅩrK   )u   WᅮrC   )u   Wᅦ   ᅰ)u   Wᅵu   ᅱ)u   ᅳᅵu   ᅴ)r#   r>   )r"   rC   r	   )r   pairsstr1str2r   r   r   reconstructh   s   rQ   c                  C   s   t tjtjtd ddd  } | d d}g }| dd D ]P}|d}|d }t	|D ]@\}}|| }t
|dkrBq3|dkrGq3| | }	d	|v rg|d	d }
|d	d dd
 d}n|}
g }||	|
|f q3q$|S )zParse the main rule tablez
/table.csvrutf8encodingr   ,   N(/)openospathdirnameabspath__file__read
splitlinesr   	enumeratelenappend)linesonsetstablelinecolscodaionsetcellrO   rP   rule_idsr   r   r   parse_table   s(   *
rp   c           
      C   sR  | | }| ddddd |D kr| S dd t| D }g }|D ]#\}}|dd }|d	kr6d
}n|d }|dt|d  |  q$d|}|D ]}|d| d ||d  }qOd}t| |D ]?\}	}||	7 }|	dkr||dkr||d7 }qg|dkrt|	d dv r|d7 }qg|dkrt|	d dv r|d7 }qg|d
kr|d7 }qg|S )zLattach pos tags to the given string using Mecab
    mecab: mecab object
    r   r   c                 s   s    | ]\}}|V  qd S )Nr   ).0token_r   r   r   	<genexpr>   s    zannotate.<locals>.<genexpr>c                 S   s   g | ]
\}}|d kr|qS )r   r   )rq   rl   charr   r   r   
<listcomp>   s    zannotate.<locals>.<listcomp>+rY   NNBCr   r   rs   rW   Nu   의Jz/JErF   z/Er(   u   ᆫᆬᆷᆱᆰᆲᆴz/Pz/B)	posr	   r   rc   r   re   rd   zipr   )
r   mecabtokensblankstag_seqrr   tagrl   	annotatedru   r   r   r   annotate   s>   
 

r   c                 C   sj   t dd| } | }tt d|}|D ]
}||t| }qtt d|}|D ]
}||t| }q(|S )Nu   (^|[^ᄀ-ᄒ])([ᅡ-ᅵ])u   \1ᄋ\2u   [ᄀ-ᄒ][ᅡ-ᅵ][ᆨ-ᇂ]u   [ᄀ-ᄒ][ᅡ-ᅵ])r   r   setfindallr	   r   )lettersr   sylssylr   r   r   compose   s   r   c                 C   s4   |  dd} |  dd} |  dd} |  dd} | S )z]For group_vowels=True
    Contemporarily, Korean speakers don't distinguish some vowels.
    r3   r:   rH   rI   rJ   u   ᅬrL   rM   )inpr   r   r   group   s
   r   c                  C   st   t dddd  } g }| D ]}|dr|td| qg }|D ]\}}|dD ]	}|||f q-q$|S )zFor internal usez	rules.txtrR   rS   rT   ->u/   ([ㄱ-힣][ ㄱ-힣]*)\[([ㄱ-힣][ ㄱ-힣]*)]rZ   )	r[   ra   rb   
startswithextendr   r   r   re   )textexamplesri   	_examplesr   gteachr   r   r   _get_examples   s   
r   c                  C   st   t tjtjtd ddd  d} t	 }| D ]}|
 d |
 dd }}d	||| < q|S )
zfor verbose=Truez
/rules.txtrR   rS   rT   z

r   rW   N
)r[   r\   r]   r^   r_   r`   ra   r
   r   dictrb   r   )rulesrule_id2textrulerule_idtextsr   r   r   get_rule_id2text   s   0r   c                 C   sN   | r!||kr#|t dd|kr%tt|dt| td|d dS dS dS dS )z-displays the process and relevant informationz/[EJPB]r   r   z[1;31mz[0mN)r   r   printr   )verboseoutr   r   r   r   r   gloss  s   r   )r   jamor   r   r\   r   r0   rE   rG   rQ   rp   r   r   r   r   r   r   r   r   r   r   <module>   s      (
