o
    i@                     @   s^  d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dl	Z	d dl
Z
d dlmZ g dZd	ee fd
dZd	ee fddZd	ee fddZd	ee fddZd	ee fddZd)deded	ee fddZdd Zd	ee fddZd	ee fddZG dd  d ZG d!d" d"ZG d#d$ d$ZG d%d& d&ZG d'd( d(eZdS )*    N)Path)Iterable)List)Optional)Union)AbsTokenizer)Ng2p_eng2p_en_no_spacepyopenjtalkpyopenjtalk_kanapyopenjtalk_accentpyopenjtalk_accent_with_pausepyopenjtalk_prosodypypinyin_g2ppypinyin_g2p_phoneespeak_ng_arabicespeak_ng_germanespeak_ng_frenchespeak_ng_spanishespeak_ng_russianespeak_ng_greekespeak_ng_finnishespeak_ng_hungarianespeak_ng_dutchespeak_ng_english_us_vitsespeak_ng_hindig2pkg2pk_no_spacekorean_jasokorean_jaso_no_spacereturnc                 C   s2   d| v r|  dd} dd | dD S | dS )Nz   z	 <space> c                 S      g | ]}| d dqS )<space> replace.0c r)   V/home/ubuntu/.local/lib/python3.10/site-packages/funasr/tokenizer/phoneme_tokenizer.py
<listcomp>0       z"split_by_space.<locals>.<listcomp>r#   )r%   split)textr)   r)   r*   split_by_space-   s   
r/   c                 C   s$   dd l }|j| dd}|d}|S )Nr   Fkanar#   )r
   g2pr-   )r.   r
   phonesr)   r)   r*   pyopenjtalk_g2p5   s   
r4   c                 C   sj   dd l }dd l}g }|| d D ]!}|d|}t|dkr2||d d |d d |d d g7 }q|S )Nr      -\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)   )r
   rerun_frontendfindalllenr.   r
   r8   r3   labelspr)   r)   r*   pyopenjtalk_g2p_accent>   s   &r?   c                 C   s   dd l }dd l}g }|| d D ]5}|dd dd dkr'|dg7 }q|d|}t|dkrF||d d |d d |d d g7 }q|S )Nr   r5   -+paur6   r7   )r
   r8   r9   r-   r:   r;   r<   r)   r)   r*   !pyopenjtalk_g2p_accent_with_pauseJ   s   
&rC   c                 C   s   dd l }|j| dd}t|S )Nr   Tr0   )r
   r2   list)r.   r
   kanasr)   r)   r*   pyopenjtalk_g2p_kanaY   s   rF   Tr.   drop_unvoiced_vowelsc                 C   s  ddl }|| d }t|}g }t|D ]}|| }td|d}|r.|dv r.| }|dkrg|dks>||d ks>J |dkrH|d n||d krft	d|}	|	dkr]|d	 n	|	dkrf|d
 q|dkrq|d q|| t	d|}
t	d|}t	d|}t	d|}t	d||d  }|dkr|dkr|dv r|d q|
dkr||d kr||kr|d q|dkr|dkr|d q|S )u5  Extract phoneme + prosoody symbol sequence from input full-context labels.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.

    Returns:
        List[str]: List of phoneme + prosody symbols.

    Examples:
        >>> from funasr.tokenizer.phoneme_tokenizer import pyopenjtalk_g2p_prosody
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104

    r   Nr5   z	\-(.*?)\+AEIOUsil^z!(\d+)_$?rB   _z/A:([0-9\-]+)\+z	\+(\d+)\+z\+(\d+)/z	/F:(\d+)_aeiouAEIOUNcl#]r7   [)
r
   r9   r;   ranger8   searchgrouplowerappend_numeric_feature_by_regex)r.   rG   r
   r=   Nr3   nlab_currp3e3a1a2a3f1a2_nextr)   r)   r*   pyopenjtalk_g2p_prosody`   sJ   








rb   c                 C   s&   t | |}|d u rdS t|dS )Nir5   )r8   rS   intrT   )regexsmatchr)   r)   r*   rW      s   rW   c                 C   s4   ddl m} ddl m} dd || |jdD }|S )Nr   pinyinStylec                 S   s   g | ]}|d  qS )r   r)   )r'   phoner)   r)   r*   r+          z pypinyin_g2p.<locals>.<listcomp>style)pypinyinrh   rj   TONE3r.   rh   rj   r3   r)   r)   r*   r      s   r   c                    sR   ddl m} ddl m} ddlm  ddlm  fdd|| |jdD }|S )	Nr   rg   ri   )
get_finals)get_initialsc                    sB   g | ]}|d  dd |d  ddfD ]
}t |d kr|qqS )r   T)strict)r;   )r'   rk   r>   rr   rs   r)   r*   r+      s    z&pypinyin_g2p_phone.<locals>.<listcomp>rm   )ro   rh   rj   pypinyin.style._utilsrr   rs   rp   rq   r)   ru   r*   r      s   	r   c                   @   s2   e Zd ZdZd
defddZdee fddZd	S )G2p_enzOn behalf of g2p_en.G2p.

    g2p_en.G2p isn't pickalable and it can't be copied to the other processes
    via multiprocessing module.
    As a workaround, g2p_en.G2p is instantiated upon calling this class.

    Fno_spacec                 C   s   || _ d | _d S N)rx   r2   )selfrx   r)   r)   r*   __init__      
zG2p_en.__init__r    c                 C   s:   | j d u r
t | _ |  |}| jrttdd |}|S )Nc                 S      | dkS Nr#   r)   re   r)   r)   r*   <lambda>       z!G2p_en.__call__.<locals>.<lambda>)r2   r   G2prx   rD   filter)rz   r.   r3   r)   r)   r*   __call__   s   


zG2p_en.__call__N)F)	__name__
__module____qualname____doc__boolr{   r   strr   r)   r)   r)   r*   rw      s    rw   c                   @   s,   e Zd ZdZd	ddZdee fddZdS )
G2pkzOn behalf of g2pk.G2p.

    g2pk.G2p isn't pickalable and it can't be copied to the other processes
    via multiprocessing module.
    As a workaround, g2pk.G2p is instantiated upon calling this class.

    Fc                 C   s"   || _ || _|| _|| _d | _d S ry   )
descritivegroup_vowelsto_sylrx   r2   )rz   r   r   r   rx   r)   r)   r*   r{      s
   
zG2pk.__init__r    c                 C   sT   | j d u rdd l}| | _ t| j || j| j| jd}| jr(ttdd |}|S )Nr   )descriptiver   r   c                 S   r}   r~   r)   r   r)   r)   r*   r     r   zG2pk.__call__.<locals>.<lambda>)	r2   r   r   rD   r   r   r   rx   r   )rz   r.   r   r3   r)   r)   r*   r      s   

zG2pk.__call__N)FFFF)r   r   r   r   r{   r   r   r   r)   r)   r)   r*   r      s    
r   c                   @   s   e Zd ZdZdZddd eddD Zddd ed	d
D Zddd eddD Z	ee e	 e e Z
dddZdedee fddZdd Zdee fddZdS )Jasoz
!'(),-.:;?r#    c                 C      g | ]}t |qS r)   chrr'   rM   r)   r)   r*   r+     rl   zJaso.<listcomp>i   i  c                 C   r   r)   r   r   r)   r)   r*   r+     rl   ia  iv  c                 C   r   r)   r   r   r)   r)   r*   r+     rl   i  i  Fc                 C   s   || _ || _d S ry   space_symbolrx   )rz   r   rx   r)   r)   r*   r{     r|   zJaso.__init__liner    c                 C   s   t t|}|S ry   )rD   jamohangul_to_jamo)rz   r   jasosr)   r)   r*   _text_to_jaso  s   zJaso._text_to_jasoc                    s    fdd|D }|S )Nc                    s   g | ]	}| j v r|qS r)   )VALID_CHARS)r'   tokenrz   r)   r*   r+     s    z6Jaso._remove_non_korean_characters.<locals>.<listcomp>r)   )rz   tokens
new_tokensr)   r   r*   _remove_non_korean_characters  s   z"Jaso._remove_non_korean_charactersc                    sP   dd   |D } |} jrttdd |}|S  fdd|D }|S )Nc                 S   s   g | ]}|qS r)   r)   r'   xr)   r)   r*   r+   #  s    z!Jaso.__call__.<locals>.<listcomp>c                 S   r}   r~   r)   r   r)   r)   r*   r   '  r   zJaso.__call__.<locals>.<lambda>c                    s   g | ]}|d kr
|n j qS )r#   )r   r   r   r)   r*   r+   )  s    )r   r   rx   rD   r   )rz   r.   	graphemesr)   r   r*   r   "  s   
zJaso.__call__N)r#   F)r   r   r   PUNCSPACEjoinrR   
JAMO_LEADSJAMO_VOWELS
JAMO_TAILSr   r{   r   r   r   r   r   r)   r)   r)   r*   r     s    
r   c                	   @   sT   e Zd ZdZ					ddee dee dee defd	d
Zdee fddZ	dS )
Phonemizera5  Phonemizer module for various languages.

    This is wrapper module of https://github.com/bootphon/phonemizer.
    You can define various g2p modules by specifying options for phonemizer.

    See available options:
        https://github.com/bootphon/phonemizer/blob/master/phonemizer/phonemize.py#L32

    Nr#   Fword_separatorsyllable_separatorphone_separatorsplit_by_single_tokenc                 K   sj   ddl m} ddlm}	 |	|||d| _td}
|
tj || di |d|
i| _	|| _
|| _d S )Nr   )BACKENDS)	Separator)wordsyllablerk   
phonemizerloggerr)   )phonemizer.backendr   phonemizer.separatorr   	separatorlogging	getLoggersetLevelERRORr   stripr   )rz   backendr   r   r   r   r   phonemizer_kwargsr   r   r   r)   r)   r*   r{   8  s    



zPhonemizer.__init__r    c                 C   s:   | j j|g| j| jddd }| js| S dd |D S )Nr5   )r   r   njobsr   c                 S   r!   )r#   r"   r$   r&   r)   r)   r*   r+   b  r,   z'Phonemizer.__call__.<locals>.<listcomp>)r   	phonemizer   r   r   r-   )rz   r.   r   r)   r)   r*   r   V  s   zPhonemizer.__call__)NNr#   FF)
r   r   r   r   r   r   r   r{   r   r   r)   r)   r)   r*   r   -  s"    
r   c                	   @   sx   e Zd Z			ddedef deeeee f dedefdd	Zd
d Z	dede
e fddZdee defddZdS )PhonemeTokenizerNr"   Fg2p_typenon_linguistic_symbolsr   remove_non_linguistic_symbolsc              
   C   s   |d u r	t | _n&|dkrtdd| _n|dkr!tdd| _n|dkr*t| _n|dkr2t| _n|dkr:t| _n|d	krBt| _n|d
krJt| _n|dkrRt| _n|dkrZt	| _n|dkrht
ddddd| _n|dkrvt
ddddd| _n|dkrt
ddddd| _n|dkrt
ddddd| _n|dkrt
ddddd| _n|dkrt
ddddd| _n|dkrt
ddddd| _ns|dkrt
ddddd| _ne|dkrt
d dddd| _nW|d!krt
d"dddd| _nI|d#krtdd| _n>|d$krtdd| _n3|d%krt
d&ddddd'd(dd)| _n |d*krt|dd+| _n|d,kr(tdd| _ntd-| || _|| _|d u r?t | _nLt|ttfrt|}z$|jd.d/d0}td1d2 |D | _W d    n	1 siw   Y  W n ty   t| d3 t | _Y nw t|| _|| _d S )4Nr   F)rx   r	   Tr
   r   r   r   r   r   r   r   arespeak)languager   with_stresspreserve_punctuationr   der   zfr-frr   esr   rur   elr   fir   hur   nlr   hir   r   r   zen-usr#   r   )r   r   r   r   r   r   r   r   r   r   r   zNot supported: g2p_type=rzutf-8)encodingc                 s   s    | ]}|  V  qd S ry   )rstrip)r'   r   r)   r)   r*   	<genexpr>  s    z,PhonemeTokenizer.__init__.<locals>.<genexpr>z doesn't exist.)r/   r2   rw   r4   rF   r?   rC   rb   r   r   r   r   r   NotImplementedErrorr   r   setr   
isinstancer   r   openFileNotFoundErrorwarningswarnr   )rz   r   r   r   r   fr)   r)   r*   r{   f  s   




















zPhonemeTokenizer.__init__c                 C   s&   | j j d| j d| j d| j dS )Nz(g2p_type="z", space_symbol="z", non_linguistic_symbols="z"))	__class__r   r   r   r   r   r)   r)   r*   __repr__  s   
zPhonemeTokenizer.__repr__r   r    c                 C   s   g }t |dkr@| jD ]}||r*| js ||d t |  |t |d  } nq|d }|| |dd  }t |dksd|}| |}|S )Nr   r5   r   )r;   r   
startswithr   rV   r   r2   )rz   r   r   wtr)   r)   r*   text2tokens  s    




zPhonemeTokenizer.text2tokensr   c                 C   s
   d |S )Nr   )r   )rz   r   r)   r)   r*   tokens2text
  s   
zPhonemeTokenizer.tokens2text)Nr"   F)r   r   r   r   r   r   r   r   r{   r   r   r   r   r)   r)   r)   r*   r   e  s"    

 
	r   )T)r   pathlibr   r8   typingr   r   r   r   r   r   funasr.tokenizer.abs_tokenizerr   g2p_classesr   r/   r4   r?   rC   rF   r   rb   rW   r   r   rw   r   r   r   r   r)   r)   r)   r*   <module>   s2    	O$!8