o
    8wie*                     @   s,  d Z ddlZddlmZ eeZg dZ	 dZdZdZ	dZ
d	d
 eD Zegee	 ee ee
 e Zdd eeD Zdd eeD ZedZedZdd
 dD Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Z d*d+ Z!d,d- Z"d.d/ Z#dS )0z)from https://github.com/keithito/tacotron    N)
get_logger)TAAAA0AA1AA2AEAE0AE1AE2AHAH0AH1AH2AOAO0AO1AO2AWAW0AW1AW2AYAY0AY1AY2BCHDDHEHEH0EH1EH2ERER0ER1ER2EYEY0EY1EY2FGHHIHIH0IH1IH2IYIY0IY1IY2JHKLMNNGOWOW0OW1OW2OYOY0OY1OY2PRSSHTTHUHUH0UH1UH2UWUW0UW1UW2VWYZZH_z
!'(),.:;? -4ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzc                 C      g | ]}d | qS @ .0sr]   r]   _/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/speechbrain/utils/text_to_sequence.py
<listcomp>       rb   c                 C   s   i | ]\}}||qS r]   r]   r_   ir`   r]   r]   ra   
<dictcomp>       rf   c                 C   s   i | ]\}}||qS r]   r]   rd   r]   r]   ra   rf      rg   z(.*?)\{(.+?)\}(.*)z\s+c                 C   s*   g | ]}t d |d  t j|d fqS )z\b%s\.r      )recompile
IGNORECASE)r_   xr]   r]   ra   rb      s    ))mrsmissus)mrmister)drdoctor)stsaint)cocompany)jrjunior)majmajor)gengeneral)drsdoctors)revreverend)lt
lieutenant)hon	honorable)sgtsergeant)captcaptain)esqesquire)ltdlimited)colcolonel)ftfortc                 C   s    t D ]\}}t||| } q| S )z Expand abbreviations pre-defined)_abbreviationsri   sub)textregexreplacementr]   r]   ra   expand_abbreviations   s   r   c                 C   s   |   S )zLowercase the text)lowerr   r]   r]   ra   	lowercase   s   r   c                 C   s   t td| S )z&Replaces whitespace by " " in the text )ri   r   _whitespace_rer   r]   r]   ra   collapse_whitespace      r   c                 C   s   |  dd}| S )zConverts text to asciiasciiignore)encodedecode)r   text_encodedr]   r]   ra   convert_to_ascii   s   r   c                 C   s   t | } t| } | S )zPBasic pipeline that lowercases and collapses whitespace without transliteration.)r   r   r   r]   r]   ra   basic_cleaners   s   r   c                 C   s   t | } | S )zLPipeline for German text, that collapses whitespace without transliteration.)r   r   r]   r]   ra   german_cleaners   s   r   c                 C   s   t | } t| } t| } | S )z;Pipeline for non-English text that transliterates to ASCII.)r   r   r   r   r]   r]   ra   transliteration_cleaners   s   r   c                 C   s$   t | } t| } t| } t| } | S )zGPipeline for English text, including number and abbreviation expansion.)r   r   r   r   r   r]   r]   ra   english_cleaners   s
   r   c                 C   sr   g }t | r7t| }|s|tt| |7 }	 |S |tt|d|7 }|t|d7 }|d} t | s|S )a  Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Arguments
    ---------
    text : str
        string to convert to a sequence
    cleaner_names : list
        names of the cleaner functions to run the text through

    Returns
    -------
    sequence : list
        The integers corresponding to the symbols in the text.
    rh         )len	_curly_rematch_symbols_to_sequence_clean_textgroup_arpabet_to_sequence)r   cleaner_namessequencemr]   r]   ra   text_to_sequence   s   

	r   c                 C   sZ   d}| D ]"}|t v r&t | }t|dkr"|d dkr"d|dd  }||7 }q|ddS )	z+Converts a sequence of IDs back to a string rh   r   r\   z{%s}Nz}{r   )_id_to_symbolr   replace)r   result	symbol_idr`   r]   r]   ra   sequence_to_text  s   r   c                 C   sV   |D ]&}|dkr
t }|dkrt}|dkrt}|dkrt}|s$td| || } q| S )z<Apply different cleaning pipeline according to cleaner_namesr   r   r   r   zUnknown cleaner: %s)r   r   r   r   	Exception)r   r   namecleanerr]   r]   ra   r     s   
r   c                 C   s   dd | D S )zConvert symbols to sequencec                 S   s   g | ]
}t |rt| qS r]   )_should_keep_symbol_symbol_to_idr^   r]   r]   ra   rb   .  s    z(_symbols_to_sequence.<locals>.<listcomp>r]   )symbolsr]   r]   ra   r   ,  r   r   c                 C   s   t dd |  D S )z Prepend "@" to ensure uniquenessc                 S   rZ   r[   r]   r^   r]   r]   ra   rb   3  rc   z(_arpabet_to_sequence.<locals>.<listcomp>)r   splitr   r]   r]   ra   r   1  s   r   c                 C   s   | t v o| dko| dkS )z Whether to keep a certain symbolrW   ~)r   )r`   r]   r]   ra   r   6  s   r   c                 C   sl  t d|}|D ] }|dd}|dd}|dd}|dd}|||}qt d|}z| |}W n tyI   td|  t  Y nw d|d	}g }d
}	z|D ]}
|
dvrn|	||	 d |	d7 }	qY|
|
 qYW n1 ty   td|  |D ]}
|
dvr| |
}dd |D }|	| q|
|
 qY nw d|v r|d d|v s|S )aP  Do grapheme to phoneme and keep the punctuations between the words

    Arguments
    ---------
    g2p_model: speechbrain.inference.text.GraphemeToPhoneme
        Model to apply to the given text while keeping punctuation.
    text: string
        the input text.

    Returns
    -------
    The text string's corresponding phoneme symbols with punctuation symbols.

    Example
    -------
    >>> from speechbrain.inference.text import GraphemeToPhoneme
    >>> g2p_model = GraphemeToPhoneme.from_hparams("speechbrain/soundchoice-g2p") # doctest: +SKIP
    >>> from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations # doctest: +SKIP
    >>> text = "Hi, how are you?" # doctest: +SKIP
    >>> _g2p_keep_punctuations(g2p_model, text) # doctest: +SKIP
    ['HH', 'AY', ',', ' ', 'HH', 'AW', ' ', 'AA', 'R', ' ', 'Y', 'UW', '?']
    z\w+[-':\.][-':\.\w]*\w+rX   r   ':.z[\w]+|[-!'(),.:;? ]zerror with text: r   r   z-!'(),.:;? rh   zEDo g2p word by word because of unexpected outputs from g2p for text: c                 S   s   g | ]}|d kr|qS )r   r]   )r_   re   r]   r]   ra   rb   y  s    z*_g2p_keep_punctuations.<locals>.<listcomp>)ri   findallr   RuntimeErrorloggerinfoquitjoinr   extendappend
IndexErrorwarningg2premove)	g2p_modelr   special_wordsspecial_wordrmpall_phonemesword_phonemesphonemes_with_punccountre   pp_without_spacer]   r]   ra   _g2p_keep_punctuations;  sP   



r   )$__doc__ri   speechbrain.utils.loggerr   __name__r   valid_symbols_pad_punctuation_special_letters_arpabetlistr   	enumerater   r   rj   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r]   r]   r]   ra   <module>   sF    X 

	 