o
    ei                      @   sv   d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 eeZd	d
iZeddG dd deZdgZdS )z Tokenization class for SpeechT5.    )Any   )SentencePieceBackend)logging)requires   )EnglishNumberNormalizer
vocab_filezspm_char.model)sentencepiece)backendsc                
       s   e Zd ZdZeZddgZdZ							dd
ee	e
f d	B dd	f fddZdddZedd Zejdd Zddee fddZ	ddee dee d	B dedee f fddZ		ddee dee d	B dee fddZ  ZS ) SpeechT5Tokenizera	  
    Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The begin of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether to convert numeric quantities in the text to their spelt-out english counterparts.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    	input_idsattention_maskF<s></s><unk><pad>Nsp_model_kwargsreturnc           	   	      s@   || _ d | _|d ur||d< t jd||||||d| d S )Nr   )r	   	bos_token	eos_token	unk_token	pad_token	normalize )r   _normalizersuper__init__)	selfr	   r   r   r   r   r   r   kwargs	__class__r   p/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/speecht5/tokenization_speecht5.pyr   M   s   
zSpeechT5Tokenizer.__init__c                 K   s0   | d| j}|rd| }|r| |}||fS )Nr    )popr   
normalizer)r   textis_split_into_wordsr   r   r   r   r"   prepare_for_tokenizationj   s   
z*SpeechT5Tokenizer.prepare_for_tokenizationc                 C   s   | j d u r	t | _ | j S N)r   r   )r   r   r   r"   r%   r   s   
zSpeechT5Tokenizer.normalizerc                 C   s
   || _ d S r)   )r   )r   valuer   r   r"   r%   x   s   
c                 C   s$   |du r
|| j g S || | j g S )z=Build model inputs from a sequence by appending eos_token_id.N)eos_token_id)r   token_ids_0token_ids_1r   r   r"    build_inputs_with_special_tokens|   s   z2SpeechT5Tokenizer.build_inputs_with_special_tokensr,   r-   already_has_special_tokensc                    sV   |rt  j||ddS dg}|d u rdgt| | S dgt| dgt|  | S )NT)r,   r-   r/   r   r   )r   get_special_tokens_masklen)r   r,   r-   r/   suffix_onesr    r   r"   r0      s    z)SpeechT5Tokenizer.get_special_tokens_maskc                 C   s8   | j g}|du rt|| dg S t|| | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. SpeechT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        Nr   )r+   r1   )r   r,   r-   eosr   r   r"   $create_token_type_ids_from_sequences   s   z6SpeechT5Tokenizer.create_token_type_ids_from_sequences)r   r   r   r   FN)Fr)   )NF)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesis_fastdictstrr   r   r(   propertyr%   setterlistintr.   boolr0   r4   __classcell__r   r   r    r"   r      sP    *





r   N)r8   typingr    tokenization_utils_sentencepiecer   utilsr   utils.import_utilsr   number_normalizerr   
get_loggerr5   loggerr9   r   __all__r   r   r   r"   <module>   s   
 
	