o
    }oiI                  
   @   s   d dl Z d dlZd dlZd dlZd dlmZmZmZ g dZdZ	dZ
dZdefdd	ZddedefddZddedededee fddZdee deeef fddZd dededefddZdS )!    N)DictListUnion)read_wordidsset_grapheme_caseGRAPHEME_CASE_UPPERGRAPHEME_CASE_LOWERGRAPHEME_CASE_MIXEDget_heteronym_spansupperlowermixed
wordid_mapc           
      C   s   t j| st|  di }i }t| ddd@}tj|dd}t|D ])\}}|dkr-q$|d }|d }|d	 }	t|||< ||vrGi ||< |	|| |< q$W d
   ||fS 1 s[w   Y  ||fS )ag  
    Reads wordid file from WikiHomograph dataset,
    https://github.com/google-research-datasets/WikipediaHomographData/blob/master/data/wordids.tsv

    Args:
        wordid_map: path to wordids.tsv
    Returns:
        data_dict: a dictionary of graphemes with corresponding word_id - ipa_form pairs
        wordid_to_idx: word id to label id mapping
    
 not foundrutf-8encoding	)	delimiterr         N)	ospathexists
ValueErroropencsvreader	enumeratelen)
r   	data_dictwordid_to_idxftsv_fileilinegraphemeword_idipa_form r*   R/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/g2p/utils.pyr   &   s*   
r   Twordid_to_phonemes_fileto_lowerc                 C   s   t j| st|  di }t| ddd'}t|D ]\}}|r%| }| d}|d ||d < qW d   |S 1 s@w   Y  |S )	a  
    WikiHomograph and NeMo use slightly different phoneme sets, this function reads WikiHomograph word_ids to NeMo
    IPA heteronyms mapping.

    Args:
        wordid_to_phonemes_file: Path to a file with mapping from wordid predicted by the model to phonemes, e.g.,
            NeMo/scripts/tts_dataset_files/wordid_to_ipa-0.7b_nv22.10.tsv
        to_lower: set to True to lower case wordid
    r   r   r   r   z  r   r   N)	r   r   r   r   r   r   r   stripsplit)r,   r-   wordid_to_nemo_cmur#   r%   r&   r*   r*   r+   get_wordid_to_phonemesH   s   

r1   Ftextremove_spacesdo_lowerexcludec                 C   sx   t j}|dur|D ]}||d}q	td| d d| } tdd| } |r2| dddd } |r8|  } |  S )a\  
    Remove punctuation marks form text

    Args:
        text: input text
        remove_spaces: set to True to remove spaces
        do_lower: set to True to lower case the text
        exclude: specify list of punctuation marks keep in the output, e.g., exclude=["'", "."]

    Return:
        processed text with punctuation marks removed
    N [] z +    )stringpunctuationreplaceresubr.   r   )r2   r3   r4   r5   all_punct_markspr*   r*   r+   remove_punctuation_   s   rB   	sentencessupported_heteronymsc                 C   s   g }g }| D ]U}g }g }d}|   D ]<}|d}	|	D ]2}
t|
ddd}||v rG|  ||}|t| }|||f || |}q|t|
d 7 }qq|| || q||fS )a  
    Find heteronyms in sentences and returns span indices

    Args:
        sentences: sentences to find heteronyms in
        supported_heteronyms: heteronyms to look for

    Return:
        start_end: List[Tuple[int]] - start-end indices that indicate location of found heteronym in the sentence
        heteronyms: List[List[str]] - heteronyms found in sentences, each sentence can contain more than one heteronym
    r   -TF)r4   r3   r   )r   r/   rB   indexr    append)rC   rD   	start_end
heteronymssentcur_start_endcur_heteronyms	start_idxwordword_by_hyphensub_wordno_punct_wordend_idxr*   r*   r+   r
   }   s*   



r
   casereturnc                 C   sH   |dkr
|   }|S |dkr|  }|S |dkr| }|S td| d)Nr   r   r   zCase <zG> is not supported. Please specify either 'upper', 'lower', or 'mixed'.)r   r   r   )r2   rS   text_newr*   r*   r+   r      s   r   )T)FFN)r   )r   r   r>   r;   typingr   r   r   __all__r   r   r	   strr   boolr1   rB   r
   r   r*   r*   r*   r+   <module>   s   " #