o
    i$                     @   s   d Z ddlZddlZddlZddlmZmZmZ ddlm	Z	 ddl
mZmZmZ e r/ddlZe r6ddlZeeZddiZd	d
 ZG dd de	ZdgZdS )zTokenization class for VITS.    N)AnyOptionalUnion   )PreTrainedTokenizer)is_phonemizer_availableis_uroman_availablelogging
vocab_filez
vocab.jsonc                 C   s    t d}|| }|d u}|S )Nz[^\x00-\x7F])recompilesearch)input_stringnon_roman_patternmatchhas_non_roman r   g/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/vits/tokenization_vits.pyhas_non_roman_characters%   s   

r   c                       s   e Zd ZdZeZddgZ							d&	d' fd
dZedd Z	dd Z
dd Zdd Z	d(dededee d	eeeeef f fddZded	ee fddZdee d	efddZdd Zd d! Zd)d"ed#ee d	eee df fd$d%Z  ZS )*VitsTokenizera  
    Construct a VITS tokenizer. Also supports MMS-TTS.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        language (`str`, *optional*):
            Language identifier.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert token id 0 in between the other tokens.
        normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the input text by removing all casing and punctuation.
        phonemize (`bool`, *optional*, defaults to `True`):
            Whether to convert the input text into phonemes.
        is_uroman (`bool`, *optional*, defaults to `False`):
            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
    	input_idsattention_mask<pad><unk>NTFreturnc	              
      s   t |dd}
t|
| _W d    n1 sw   Y  dd | j D | _|| _|| _|| _|| _	|| _
t jd|||||||d|	 d S )Nutf-8encodingc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>W   s    z*VitsTokenizer.__init__.<locals>.<dictcomp>)	pad_token	unk_tokenlanguage	add_blank	normalize	phonemize	is_uromanr   )openjsonloadencoderitemsdecoderr$   r%   r&   r'   r(   super__init__)selfr
   r"   r#   r$   r%   r&   r'   r(   kwargsvocab_handle	__class__r   r   r0   H   s(   
zVitsTokenizer.__init__c                 C   s
   t | jS N)lenr,   r1   r   r   r   
vocab_sizej   s   
zVitsTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r   )convert_ids_to_tokens)r   ir8   r   r   r!   o   s    z+VitsTokenizer.get_vocab.<locals>.<dictcomp>)ranger9   updateadded_tokens_encoder)r1   vocabr   r8   r   	get_vocabn   s   zVitsTokenizer.get_vocabc                 C   s   t | j t | j  }d}d}|t|k rMd}|D ]}|||t|  |kr8||7 }|t|7 }d} nq|sG|||  7 }|d7 }|t|k s|S )zfLowercase the input string, respecting any special token ids that may be part or entirely upper-cased. r   FT   )listr,   keysr>   r7   lower)r1   r   all_vocabularyfiltered_textr;   found_matchwordr   r   r   normalize_texts   s"   zVitsTokenizer.normalize_textc                 C   s   | j dkr|dd}|S )z4Special treatment of characters in certain languagesronu   țu   ţ)r$   replace)r1   textr   r   r   _preprocess_char   s   
zVitsTokenizer._preprocess_charrM   is_split_into_wordsr&   c                    s   |dur|n j }|r |} |}t|r. jr.t s%td n	t	 }|
|} jrNt s8tdtj|dddddd}tdd	|}||fS |r`d
tt fdd| }||fS )a  
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize.
            normalize (`bool`, *optional*, defaults to `None`):
                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
                text consists only of lower-case characters.
            kwargs (`dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `tuple[str, dict[str, Any]]`: The prepared text and the unused kwargs.
        NaC  Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman` Note `uroman` requires python version >= 3.10Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uromanzEPlease install the `phonemizer` Python package to use this tokenizer.zen-usespeakT)r$   backendstrippreserve_punctuationwith_stressz\s+ rA   c                    s
   |  j v S r6   )r,   )charr8   r   r   <lambda>   s   
 z8VitsTokenizer.prepare_for_tokenization.<locals>.<lambda>)r&   rJ   rN   r   r(   r   loggerwarningurUromanromanize_stringr'   r   ImportError
phonemizerr   subjoinrC   filterrR   )r1   rM   rO   r&   r2   rG   uromanr   r8   r   prepare_for_tokenization   s6   


 z&VitsTokenizer.prepare_for_tokenizationc                 C   s@   t |}| jr| dgt|d d  }||ddd< |}|S )z]Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters.r      rB   N)rC   r%   _convert_id_to_tokenr7   )r1   rM   tokensinterspersedr   r   r   	_tokenize   s   zVitsTokenizer._tokenizerf   c                 C   s*   | j rt|dkr|dd d }d|S )NrB   rd   rA   )r%   r7   r`   )r1   rf   r   r   r   convert_tokens_to_string   s   
z&VitsTokenizer.convert_tokens_to_stringc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r,   getr#   )r1   tokenr   r   r   _convert_token_to_id   s   z"VitsTokenizer._convert_token_to_idc                 C   s   | j |S )z=Converts an index (integer) in a token (str) using the vocab.)r.   rj   )r1   indexr   r   r   re      s   z"VitsTokenizer._convert_id_to_tokensave_directoryfilename_prefixc              	   C   s   t j|std| d d S t j||r|d ndtd  }t|ddd}|t	j
| jd	d
ddd  W d    |fS 1 sEw   Y  |fS )NzVocabulary path (z) should be a directory-rA   r
   wr   r   rd   TF)indent	sort_keysensure_ascii
)ospathisdirrX   errorr`   VOCAB_FILES_NAMESr)   writer*   dumpsr,   )r1   rn   ro   r
   fr   r   r   save_vocabulary   s    
zVitsTokenizer.save_vocabulary)r   r   NTTTF)r   N)FNr6   )__name__
__module____qualname____doc__rz   vocab_files_namesmodel_input_namesr0   propertyr9   r@   rJ   rN   strboolr   tupledictr   rc   rC   rh   ri   rl   re   r   r~   __classcell__r   r   r4   r   r   /   sD    "

A0r   )r   r*   rv   r   typingr   r   r   tokenization_utilsr   utilsr   r   r	   r^   rb   rZ   
get_loggerr   rX   rz   r   r   __all__r   r   r   r   <module>   s"   

 
H