o
    }oi|b                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ eG d
d deZdS )    N)defaultdict)CallableDictListOptionalSetTupleUnion)validate_locale)LATIN_CHARS_ALLany_locale_word_tokenizeenglish_word_tokenizenormalize_unicode_text)BaseG2p)GRAPHEME_CASE_MIXEDGRAPHEME_CASE_UPPERset_grapheme_case)logging)experimentalc                       s  e Zd ZddgZede dZede dZddddd	dde	d
df
de
eejeeeee  f f dedeeegef  dedee
eejee f  dedee dee dee dee dee ddf fddZede
eejeeeee  f f deeeee  f fddZde
eejeeeee  f f fddZede
eejf dee fddZd edee fd!d"Zd#eeeee  f deeeeee  f ef fd$d%Zd/d&d'Zd edefd(d)Zd edeee ef fd*d+Zd,edee fd-d.Z   Z!S )0IpaG2pu   ˈu   ˌ[z\d]z[^en-USNTF phoneme_dictlocaleapply_to_oov_wordignore_ambiguous_words
heteronyms	use_charsphoneme_probabilityuse_stressesgrapheme_casegrapheme_prefixmapping_filereturnc                    sD  | _ |	 _|
 _| _| _t  _|durt| |s, jdur,d _	t
d n| _	 |}|r? |\} _nt| d|du rOt
d |dkrVt}nt}t j||||d | _t|tspt|tjryt | _nt|trtdd	 |D rt| _nd _ jr fd
d jD  _dS dS )u  
        Generic IPA G2P module. This module converts words from graphemes to International Phonetic Alphabet
        representations. Optionally, it can ignore heteronyms, ambiguous words, or words marked as unchangeable
        by `word_tokenize_func` (see code for details). Ignored words are left unchanged or passed through
        `apply_to_oov_word` for handling.

        Args:
            phoneme_dict (str, Path, or Dict): Path to file in CMUdict format or an IPA dict object with CMUdict-like
                entries. For example,
                a dictionary file: scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt;
                a dictionary object: {..., "Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]], ...}.
            locale (str): Locale used to determine a locale-specific tokenization logic. Currently, it supports "en-US",
                "de-DE", and "es-ES". Defaults to "en-US". Specify None if implementing custom logic for a new locale.
            apply_to_oov_word (Callable): Function that deals with the out-of-vocabulary (OOV) words that do not exist
                in the `phoneme_dict`.
            ignore_ambiguous_words (bool): Whether to handle word via phoneme_dict with ambiguous phoneme sequences.
                Defaults to True.
            heteronyms (str, Path, List[str]): Path to file that includes heteronyms (one word entry per line), or a
                list of words.
            use_chars (bool): Whether to include chars/graphemes in the token list. It is True if `phoneme_probability`
                is not None or if `apply_to_oov_word` function ever returns graphemes.
            phoneme_probability (Optional[float]): The probability (0.0 <= ε <= 1.0) that is used to balance the action
                that a word in a sentence is whether transliterated into a sequence of phonemes, or kept as a sequence
                of graphemes. If a random number for a word is greater than ε, then the word is kept as graphemes;
                otherwise, the word is transliterated as phonemes. Defaults to None which is equivalent to setting it
                to 1.0, meaning always transliterating the word into phonemes. Note that this code path is only run if
                the word can be transliterated into phonemes, otherwise, if a word does not have an entry in the g2p
                dict, it will be kept as graphemes. If a word has multiple pronunciations as shown in the g2p dict and
                `ignore_ambiguous_words` is True, it will be kept as graphemes as well.
            use_stresses (Optional[bool]): Whether to include the stress symbols (ˈ and ˌ).
            grapheme_case (Optional[str]): Trigger converting all graphemes to uppercase, lowercase, or keeping them as
                original mix-cases. You may want to use this feature to distinguish the grapheme set from the phoneme
                set if there is an overlap in between. Defaults to `upper` because phoneme set only uses lowercase
                symbols. You could explicitly prepend `grapheme_prefix` to distinguish them.
            grapheme_prefix (Optional[str]): Prepend a special symbol to any graphemes in order to distinguish graphemes
                from phonemes because there may be overlaps between the two set. It is suggested to choose a prefix that
                is not used or preserved somewhere else. "#" could be a good candidate. Default to "".
            TODO @borisfom: add docstring for newly added `mapping_file` argument.
        NTzdphoneme_probability was not None, characters will be enabled even though use_chars was set to False.z contains no entries!a  apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.r   )r   word_tokenize_funcr   r#   c                 s   s    | ]}t |tV  qd S )N)
isinstancestr.0het r+   \/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/g2p/models/i18n_ipa.py	<genexpr>   s    z"IpaG2p.__init__.<locals>.<genexpr>c                    s   h | ]	}t | jd qS )case)r   r!   r(   selfr+   r,   	<setcomp>       z"IpaG2p.__init__.<locals>.<setcomp>)r    r!   r"   r   r   randomRandom_rngr
   r   r   warning_parse_phoneme_dict_normalize_dictsymbols
ValueErrorr   r   super__init__r   r&   r'   pathlibPathset_parse_file_by_linesr   listall)r1   r   r   r   r   r   r   r   r    r!   r"   r#   phoneme_dict_obj_phoneme_dictr%   	__class__r0   r,   r=   +   sP   5


zIpaG2p.__init__c                 C   s  t | tst | tjrtt}td}t| ddd}|D ]w}t	|dkr'qt
|}d|d   kr7dksrn d|d   krDd	ksrn d
|d   krQdksrn d|d   kr^dksrn d|d   krkdksrn |d dkr| jdd}t|d|d }tdd|d }|| t| qW d   |S 1 sw   Y  |S td i }|  D ]%\}}t |tsJ dt| dt
|}dd |D }|||i q|S )u  
        parse an input IPA dictionary and save it as a dict object.

        Args:
            phoneme_dict (Union[str, pathlib.Path, dict]): Path to file in CMUdict format or an IPA dict object with
                CMUdict-like entries. For example,
                a dictionary file: scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.06.txt;
                a dictionary object: {..., "Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]], ...}.

        Returns: a dict object (Dict[str, List[List[str]]]).
        z
\([0-9]+\)rutf-8encodingr   AZaz   À   Ö   Ø   ö   ø   ÿ'   )maxsplitr   z\s+NzGLoading phoneme_dict as a Dict object, and validating its entry format.zPronunciation type <z-> is not supported. Please convert to <list>.c                 S   s   g | ]	}d d |D qS )c                 S   s   g | ]}t |qS r+   )r   )r)   pr+   r+   r,   
<listcomp>       z9IpaG2p._parse_phoneme_dict.<locals>.<listcomp>.<listcomp>r+   )r)   pronr+   r+   r,   rZ      r3   z.IpaG2p._parse_phoneme_dict.<locals>.<listcomp>)r&   r'   r>   r?   r   rB   recompileopenlenr   stripsplitsubappendr   infoitemstypeupdate)r   rD   _alt_refdictlinepartswordpronsr+   r+   r,   r8      sH   

++
zIpaG2p._parse_phoneme_dictc                 C   s   |  || _dS )zF
        Replace model's phoneme dictionary with a custom one
        N)r8   r   )r1   r   r+   r+   r,   replace_dict   s   zIpaG2p.replace_dictrY   c                 C   sD   t | ddd}dd | D W  d    S 1 sw   Y  d S )NrH   rI   rJ   c                 S   s   g | ]}|  qS r+   )rstrip)r)   rk   r+   r+   r,   rZ      r[   z/IpaG2p._parse_file_by_lines.<locals>.<listcomp>)r_   	readlines)rY   fr+   r+   r,   rA      s   $zIpaG2p._parse_file_by_linesrm   c                    s    fdd|D S )Nc                       g | ]	} j  | qS r+   r"   )r)   	characterr0   r+   r,   rZ      r3   z7IpaG2p._prepend_prefix_for_one_word.<locals>.<listcomp>r+   r1   rm   r+   r0   r,   _prepend_prefix_for_one_word      z#IpaG2p._prepend_prefix_for_one_wordrD   c           
         s   t t}t }| D ]T\}}t| jd} jr( jd|}|	 
| t } js@|D ]}	| fdd|	D  q0n|}|D ]}	|	|	 qD|||<  jtkr_| s_||| < q||fS )u  
        Parse a python dict object according to the decision on word cases and removal of lexical stress markers.

        Args:
            phoneme_dict_obj (Dict[str, List[List[str]]]): a dictionary object.
                e.g. {..., "Wire": [["ˈ", "w", "a", "ɪ", "ɚ"], ["ˈ", "w", "a", "ɪ", "ɹ"]], ...}

        Returns:
            g2p_dict (dict): processed dict.
            symbols (set): a IPA phoneme set, or its union with grapheme set.

        r.   r   c                    s   g | ]	}| j vr|qS r+   )STRESS_SYMBOLS)r)   symbolr0   r+   r,   rZ     r3   z*IpaG2p._normalize_dict.<locals>.<listcomp>)r   rB   r@   rf   r   r!   r   PUNCT_REGEXrc   rh   rw   r    rd   r   isupperupper)
r1   rD   g2p_dictr:   rm   rn   word_newword_no_punct	prons_newr\   r+   r0   r,   r9      s(   	zIpaG2p._normalize_dictc                 C   s   t |}g }i }| j D ]K\}}t | t|| j}|| }	|	r(|| qg }
|D ]}t || }|s;|
| q,t|
t|krX|sL|| q|
sT|| q|
||< q|D ]}| j|= q[|rj| j| || _	dS )a`  Replaces the vocabulary of symbols with the one given.
        Also filters out any entries with illegal graphemes or phonemes according to the new vocab.

        Args:
            symbols (List, Set): User-provided set of valid symbols, both graphemes and phonemes
            keep_alternate (bool): Whether to keep the other pronunciation(s) of a word if not all contain
                illegal phonemes (and the word doesn't contain illegal graphemes).
                Warning: this may change a word from being ambiguous to having only one valid pronunciation.
                Defaults to True.
        N)
r@   r   rf   rw   r   r!   rd   r`   rh   r:   )r1   r:   keep_alternatenew_symbolsdeletion_wordsreplacement_dictrm   rn   word_graphemes	word_difflegal_pronsr\   	pron_diffdel_wordr+   r+   r,   replace_symbols$  s6   



zIpaG2p.replace_symbolsc                 C   s   t | j| dkS )NrW   )r`   r   rv   r+   r+   r,   is_unique_in_phoneme_dictW  rx   z IpaG2p.is_unique_in_phoneme_dictc           	      C   s"  t || jd}| j|du rt|dfS | jdur)| j | jkr)| |dfS | j	r8|| j	v r8| |dfS | j
dkr-t|dkr|dsN|drd}|| jvr| | jvr|dd | jv rl|dd }n|dd  | jv r|dd  }|dur| jr| |r|d	 d
v r| j| d dg dfS |d	 dv r| j| d ddg dfS | j| d dg dfS t|dkr-|ds|dr-d}|| jvr| | jvr|dd	 | jv r|dd	 }n|dd	  | jv r|dd	  }|dur-| jr| |r-|d	 d
v r!| j| d dg dfS | j| d dg dfS | j
dkrg d}g d}t||D ]x\}}|d }t|dkr||s]|| rd}|| jvr| | jvrt|}||d | jv r||d }n||d  | jv r||d  }|dur| jr| |rdd |D | j| d  df  S q@|| jv r| jr| |r| j| d dfS | jtkr|| jvr| | jv r| }| jr| |r| j| d dfS | jdur
| |dfS | |dfS )zXReturns parsed `word` and `status` (bool: False if word wasn't handled, True otherwise).r.   NTr      z'sz'S)Ttr   s)Sr   u   ɪrO   rW   r   zfr-FR)lcdjmnqur   r   puisqulorsqujusqu)r   r   r   u   ʒr   r   kr   r   pyisku   loʁsku   ʒyskrV   c                 S   s   g | ]}|qS r+   r+   )r)   r   r+   r+   r,   rZ     s    z)IpaG2p.parse_one_word.<locals>.<listcomp>F)r   r!   
CHAR_REGEXsearchrB   r   r6   r4   rw   r   r   r`   endswithr   r}   r   r   zip
startswithr   r   )	r1   rm   
word_foundcontractions_gcontractions_pcont_gcont_pstarterstart_indexr+   r+   r,   parse_one_wordZ  s    $
*
$ zIpaG2p.parse_one_wordtextc              
      s:  t |} jd ur3z jj|gdd d }W n ty2 } ztd| d W Y d }~nd }~ww  |}g }|D ]^\}}|rO| fdd|D  q<t|dksaJ | dt| d	|d } 	|\}}	|	s|
d
}
t|
dkrg }|
D ]} 	|\}}|| |d
 q}|  || q<|S )N)	sentencesrW   r   zHeteronym model failed z
, skippingc                    rs   r+   rt   )r)   rm   r0   r+   r,   rZ     r3   z#IpaG2p.__call__.<locals>.<listcomp>zK should only have a single item when `without_changes` is False, but found .-)r   heteronym_modeldisambiguate	Exceptionr   r7   r%   extendr`   r   rb   rd   pop)r1   r   ewords_list_of_tuplern   wordswithout_changesrm   r\   
is_handledsubwords_by_hyphensub_wordrY   _r+   r0   r,   __call__  s:   



zIpaG2p.__call__)T)"__name__
__module____qualname__ry   r]   r^   r   r   r{   r   r	   r'   r>   r?   r   r   r   r   boolfloatr=   staticmethodr8   ro   rA   rw   r   r   r9   r   r   r   r   __classcell__r+   r+   rF   r,   r   "   sl    	
r*@":
83ir   )r>   r4   r]   collectionsr   typingr   r   r   r   r   r   r	   =nemo.collections.common.tokenizers.text_to_speech.ipa_lexiconr
   Anemo.collections.common.tokenizers.text_to_speech.tokenizer_utilsr   r   r   r   $nemo.collections.tts.g2p.models.baser   nemo.collections.tts.g2p.utilsr   r   r   
nemo.utilsr   nemo.utils.decoratorsr   r   r+   r+   r+   r,   <module>   s   $