o
    i                     @   s   d dl Z d dlmZ d dlmZ d dlmZ ddlmZ ddl	m
Z
mZ e
 r.dd	lmZ ndZeeZd
ddZG dd deZdgZdS )    N)copyfile)Optional)
processors   )PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )GemmaTokenizerztokenizer.modelztokenizer.json)
vocab_filetokenizer_filec                       s   e Zd ZdZeZeZdZddgZ									
		d fdd	Z
dd Zedd Zedd Zejdd Zejdd Zddedee dee fddZdddZ  ZS )GemmaTokenizerFastu
  
    Construct a Gemma tokenizer fast. Based on byte-level Byte-Pair-Encoding.

    This uses notably ByteFallback and no prefix space. Normalization is applied to replace  `" "` with `"▁"`

    ```python
    >>> from transformers import GemmaTokenizerFast

    >>> tokenizer = GemmaTokenizerFast.from_pretrained("hf-internal-testing/dummy-gemma")
    >>> tokenizer.encode("Hello this is a test")
    [2, 4521, 736, 603, 476, 2121]
    ```

    If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
    call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
    values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
    [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation.


    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        tokenizer_file (`str`, *optional*):
            [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
            contains everything needed to load the tokenizer.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
            extra spaces.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The padding token
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add an `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
    left	input_idsattention_maskNF<unk><bos><eos><pad>Tc
                    sD   t  jd|||||||||	d	|
 || _|	| _|   || _d S )N)	r   r   clean_up_tokenization_spaces	unk_token	bos_token	eos_token	pad_tokenadd_bos_tokenadd_eos_token )super__init___add_bos_token_add_eos_tokenupdate_post_processorr   )selfr   r   r   r   r   r   r   r   r   kwargs	__class__r   e/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/gemma/tokenization_gemma_fast.pyr   W   s"   

zGemmaTokenizerFast.__init__c                 C   s   | j }| j}|du r| jrtd| j}| j}|du r"| jr"td| jr)|d nd d| jr5d| d nd }| | jrDd| d	 nd d
| jrPd| d	 nd }g }| jr`|||f | jrj|||f tj	|||d| j
_dS )ze
        Updates the underlying post processor with the current `bos_token` and `eos_token`.
        Nz)add_bos_token = True but bos_token = Nonez)add_eos_token = True but eos_token = Nonez:0  z$A:0 z:0z:1z $B:1)singlepairspecial_tokens)r   bos_token_idr   
ValueErrorr   eos_token_idr   appendr   TemplateProcessing
_tokenizerpost_processor)r"   bosr,   eosr.   r)   r*   r+   r   r   r&   r!   v   s$   .6z(GemmaTokenizerFast.update_post_processorc                 C      | j S N)r    r"   r   r   r&   r         z GemmaTokenizerFast.add_eos_tokenc                 C   r5   r6   )r   r7   r   r   r&   r      r8   z GemmaTokenizerFast.add_bos_tokenc                 C      || _ |   d S r6   )r    r!   r"   valuer   r   r&   r         c                 C   r9   r6   )r   r!   r:   r   r   r&   r      r<   save_directoryfilename_prefixreturnc                 C   s~   | j stdtj|std| d d S tj||r"|d ndtd  }tj	| j
tj	|kr<t| j
| |fS )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory-r'   r   )can_save_slow_tokenizerr-   ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   r   )r"   r=   r>   out_vocab_filer   r   r&   save_vocabulary   s   z"GemmaTokenizerFast.save_vocabularyc                 C   sL   | j r| jgng }| jr| jgng }|| | }|d ur$|| | | }|S r6   )r   r,   r   r.   )r"   token_ids_0token_ids_1r,   r.   outputr   r   r&    build_inputs_with_special_tokens   s   z3GemmaTokenizerFast.build_inputs_with_special_tokens)	NNFr   r   r   r   TFr6   )__name__
__module____qualname____doc__rH   vocab_files_namesr
   slow_tokenizer_classpadding_sidemodel_input_namesr   r!   propertyr   r   setterstrr   tuplerK   rO   __classcell__r   r   r$   r&   r   "   s6    /



 r   )rB   shutilr   typingr   
tokenizersr   tokenization_utils_fastr   utilsr   r   tokenization_gemmar
   
get_loggerrP   rE   rH   r   __all__r   r   r   r&   <module>   s   

 
"