"""Tokenization class for SpeechT5."""

import os
from shutil import copyfile
from typing import Any, Optional

import sentencepiece as spm

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ...utils.import_utils import requires
from .number_normalizer import EnglishNumberNormalizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spm_char.model"}


@requires(backends=("sentencepiece",))
class SpeechT5Tokenizer(PreTrainedTokenizer):
    """
    Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning-of-sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end-of-sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether to convert numeric quantities in the text to their spelled-out English counterparts.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that `nbest_size` is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
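
    Example (a minimal usage sketch; it assumes the `microsoft/speecht5_tts` checkpoint, which bundles
    this tokenizer's `spm_char.model`, is reachable on the Hugging Face Hub):

    ```python
    >>> from transformers import SpeechT5Tokenizer

    >>> tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts", normalize=True)
    >>> tokenizer.tokenize("I owe you 20 dollars")  # "20" is spelled out by the number normalizer
    >>> tokenizer("I owe you 20 dollars")["input_ids"]  # character-level ids, with eos appended
    ```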
    	input_idsattention_mask<s></s><unk><pad>FNsp_model_kwargsreturnc           	   	      sj   |d u ri n|| _ || _|| _d | _tjdi | j | _| j| t j	d|||||| j d| d S )N)	bos_token	eos_token	unk_token	pad_token	normalizer    )
r   r   r   _normalizerspmSentencePieceProcessorsp_modelLoadsuper__init__)	selfr   r   r   r   r   r   r   kwargs	__class__r   o/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/speecht5/tokenization_speecht5.pyr#   Q   s    
zSpeechT5Tokenizer.__init__c                 K   s0   | d| j}|rd| }|r| |}||fS )Nr    )popr   
normalizer)r$   textis_split_into_wordsr%   r   r   r   r(   prepare_for_tokenizationn   s   
z*SpeechT5Tokenizer.prepare_for_tokenizationc                 C   s
   | j  S N)r    get_piece_sizer$   r   r   r(   
vocab_sizev      
zSpeechT5Tokenizer.vocab_sizec                 C   s   | j d u r	t | _ | j S r/   )r   r
   r1   r   r   r(   r+   z   s   
zSpeechT5Tokenizer.normalizerc                 C   s
   || _ d S r/   )r   )r$   valuer   r   r(   r+      r3   c                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r   )convert_ids_to_tokens).0ir1   r   r(   
<dictcomp>   s    z/SpeechT5Tokenizer.get_vocab.<locals>.<dictcomp>)ranger2   updateadded_tokens_encoder)r$   vocabr   r1   r(   	get_vocab   s   zSpeechT5Tokenizer.get_vocabc                 C   s   | j  }d |d< |S )Nr    )__dict__copy)r$   stater   r   r(   __getstate__   s   
zSpeechT5Tokenizer.__getstate__c                 C   s<   || _ t| dsi | _tjdi | j| _| j| j d S )Nr   r   )r>   hasattrr   r   r   r    r!   r   )r$   dr   r   r(   __setstate__   s
   
zSpeechT5Tokenizer.__setstate__r,   c                 C   s   | j j|tdS )zPTake as input a string and return a list of strings (tokens) for words/sub-words)out_type)r    encodestr)r$   r,   r   r   r(   	_tokenize   s   zSpeechT5Tokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r    piece_to_id)r$   tokenr   r   r(   _convert_token_to_id   s   z&SpeechT5Tokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r    	IdToPiece)r$   indexrJ   r   r   r(   _convert_id_to_token   s   z&SpeechT5Tokenizer._convert_id_to_tokenc                 C   sp   g }d}d}|D ]#}|| j v r$|s|d7 }|| j|| 7 }d}g }q|| d}q|| j|7 }| S )z:Converts a sequence of tokens (string) in a single string. Fr)   T)all_special_tokensr    decodeappendstrip)r$   tokenscurrent_sub_tokens
out_stringprev_is_specialrJ   r   r   r(   convert_tokens_to_string   s   

z*SpeechT5Tokenizer.convert_tokens_to_stringc                 C   s$   |du r
|| j g S || | j g S )z=Build model inputs from a sequence by appending eos_token_id.N)eos_token_id)r$   token_ids_0token_ids_1r   r   r(    build_inputs_with_special_tokens   s   z2SpeechT5Tokenizer.build_inputs_with_special_tokensrZ   r[   already_has_special_tokensc                    sV   |rt  j||ddS dg}|d u rdgt| | S dgt| dgt|  | S )NT)rZ   r[   r]   r	   r   )r"   get_special_tokens_masklen)r$   rZ   r[   r]   suffix_onesr&   r   r(   r^      s    z)SpeechT5Tokenizer.get_special_tokens_masksave_directoryfilename_prefixc                 C   s   t j|std| d d S t j||r|d ndtd  }t j| jt j|kr?t j	| jr?t
| j| |fS t j	| jsgt|d}| j }|| W d    |fS 1 sbw   Y  |fS )NzVocabulary path (z) should be a directory-rO   r   wb)ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   isfiler   openr    serialized_model_protowrite)r$   ra   rb   out_vocab_fileficontent_spiece_modelr   r   r(   save_vocabulary   s"   (

z!SpeechT5Tokenizer.save_vocabulary)r   r   r   r   FN)Fr/   )NF) __name__
__module____qualname____doc__rk   vocab_files_namesmodel_input_namesr   dictrG   r   r#   r.   propertyr2   r+   setterr=   rA   rD   listrH   rK   rN   rX   intr\   boolr^   tuplert   __classcell__r   r   r&   r(   r   "   sR    *






(r   )rx   re   shutilr   typingr   r   r   r   tokenization_utilsr   utilsr   utils.import_utilsr   number_normalizerr
   
get_loggerru   rh   rk   r   __all__r   r   r   r(   <module>   s   
 
=