o
    ei                     @   sd   d dl mZmZmZ d dlmZ ddlmZ ddlm	Z	 e	
eZddiZG dd	 d	eZd	gZd
S )    )	Tokenizerdecodersnormalizers)BPE   )TokenizersBackend)loggingtokenizer_fileztokenizer.jsonc                       s   e Zd ZdZeZdZddgZeZ									
dde
ee
ef B dB de
ee
 B dB de
de
de
de
de
f fddZdefddZ  ZS )GemmaTokenizeru  
    Construct a fast Gemma tokenizer (backed by HuggingFace's tokenizers library).

    This tokenizer uses a BPE model with byte fallback, no prefix space, and a normalizer that replaces
    spaces with "▁".

    Args:
        tokenizer_file (`str`, optional):
            A tokenizers JSON file containing the serialization of a tokenizer.
        unk_token (`str`, optional, defaults to "<unk>"):
            The unknown token.
        bos_token (`str`, optional, defaults to "<bos>"):
            The beginning of sequence token.
        eos_token (`str`, optional, defaults to "<eos>"):
            The end of sequence token.
        pad_token (`str`, optional, defaults to "<pad>"):
            The padding token.
        mask_token (`str`, optional, defaults to "<mask>"):
            The mask token.
        add_bos_token (`bool`, optional, defaults to True):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, optional, defaults to False):
            Whether or not to add an `eos_token` at the end of sequences.
        vocab (`str` or `dict[str, int]`, optional):
            Custom vocabulary dict. If not provided, a minimal vocabulary is created using the special tokens.
    left	input_idsattention_maskN<unk><bos><eos><pad><mask>vocabmerges	unk_token	bos_token	eos_token	pad_token
mask_tokenc           	   
      s   |d u rt |dt |dt |dt |dt |di}|| _|p g | _tt| j| jdt |d dd| _ttdd	t	 t
 g| j_td	d| j_t jd|||||d
| d S )Nr         r      T)r   r   fuse_unkr   dropoutbyte_fallbacku   ▁ )r   r   r   r   r    )str_vocab_mergesr   r   
_tokenizerr   SequenceReplaceByteFallbackFusedecoderr   
normalizersuper__init__)	selfr   r   r   r   r   r   r   kwargs	__class__r!   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/gemma/tokenization_gemma.pyr-   ;   s@   

zGemmaTokenizer.__init__returnc                 C   s   dS )Nr   r!   )r.   r!   r!   r2   _unk_idi   s   zGemmaTokenizer._unk_id)NNr   r   r   r   r   )__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namespadding_sidemodel_input_namesr   modelr"   dictintlistr-   r4   __classcell__r!   r!   r0   r2   r
      s:    .r
   N)
tokenizersr   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr   utilsr   
get_loggerr5   loggerr9   r
   __all__r!   r!   r!   r2   <module>   s   

T