o
    i7                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlZddl	m
Z
mZ ddlmZ ddlmZ er4ddlmZ eeZd	d
iZdZeddG dd deZdgZdS )    N)copyfile)TYPE_CHECKINGAnyOptional   )
AddedTokenPreTrainedTokenizer)logging)requires)	TextInput
vocab_fileztokenizer.modelu   ▁)sentencepiece)backendsc                
       sX  e Zd ZdZeZddgZ								
	
	
	
d4deee	e
f  f fddZdd Zdd Zedd Zdd Zdddee	 f fddZdd Zdd Zdd  Zd!d" Zd5d#ee	 dee	 fd$d%Zd5d&d'Z	
d6d(ee d)eee  d*edee f fd+d,Z	d5d(ee d)eee  dee fd-d.Z	
	
d7d/ee d0ed1ede	fd2d3Z  ZS )8GemmaTokenizera
  
    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`dict[str, Any]`, `Optional`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add an `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for Gemma should be used.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.
    	input_idsattention_mask<unk><bos><eos><pad>NTFsp_model_kwargsc                    s   |d u ri n|| _ t|trt|dddn|}t|tr#t|dddn|}t|tr1t|dddn|}t|tr?t|dddn|}|| _|| _|| _|
| _tj	di | j | _
| j
| t jd||||||||	|
|d
| d S )NFT)
normalizedspecial)
	bos_token	eos_token	unk_token	pad_tokenadd_bos_tokenadd_eos_tokenr   clean_up_tokenization_spacesuse_default_system_promptspaces_between_special_tokens )r   
isinstancestrr   r   r   r   r    spmSentencePieceProcessorsp_modelLoadsuper__init__)selfr   r   r   r   r   r   r   r   r   r    r!   kwargs	__class__r"   `/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/gemma/tokenization_gemma.pyr*   ^   s2   
zGemmaTokenizer.__init__c                 C   s$   | j  }d |d< | j |d< |S )Nr'   sp_model_proto)__dict__copyr'   serialized_model_proto)r+   stater"   r"   r/   __getstate__   s   
zGemmaTokenizer.__getstate__c                 C   s2   | j | tjdi | j| _| j| j d S )Nr"   )r1   updater%   r&   r   r'   LoadFromSerializedProtor0   )r+   dr"   r"   r/   __setstate__   s   zGemmaTokenizer.__setstate__c                 C   s
   | j  S )zReturns vocab size)r'   get_piece_sizer+   r"   r"   r/   
vocab_size   s   
zGemmaTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )zReturns vocab as a dictc                    s   i | ]}  ||qS r"   )convert_ids_to_tokens).0ir;   r"   r/   
<dictcomp>   s    z,GemmaTokenizer.get_vocab.<locals>.<dictcomp>)ranger<   r6   added_tokens_encoder)r+   vocabr"   r;   r/   	get_vocab   s   zGemmaTokenizer.get_vocabtextr   returnc                    s   t  j|fi |S )ze
        Args:
            text: TextInput
        Simply calls PreTrainedTokenizer's method
        )r)   tokenizer+   rE   r,   r-   r"   r/   rG      s   zGemmaTokenizer.tokenizec                 K   s   | j j|tdS )z
        Args:
            text: TextInput
        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
        )out_type)r'   encoder$   rH   r"   r"   r/   	_tokenize   s   zGemmaTokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r'   piece_to_id)r+   tokenr"   r"   r/   _convert_token_to_id   s   z#GemmaTokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r'   	IdToPiece)r+   indexrM   r"   r"   r/   _convert_id_to_token   s   z#GemmaTokenizer._convert_id_to_tokenc                 C   sT   g }d}|D ]}|| j v r|| j|| 7 }g }q|| q|| j|7 }|S )z:Converts a sequence of tokens (string) in a single string. )_added_tokens_encoderr'   decodeappend)r+   tokenscurrent_sub_tokens
out_stringrM   r"   r"   r/   convert_tokens_to_string   s   
z'GemmaTokenizer.convert_tokens_to_stringfilename_prefixc                 C   s   t j|std| d dS t j||r|d ndtd  }t j| jt j|kr?t j	| jr?t
| j| |fS t j	| jsgt|d}| j }|| W d   |fS 1 sbw   Y  |fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        zVocabulary path (z) should be a directoryN-rR   r   wb)ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   isfiler   openr'   r3   write)r+   save_directoryrZ   out_vocab_fileficontent_spiece_modelr"   r"   r/   save_vocabulary   s"   (

zGemmaTokenizer.save_vocabularyc                 C   sL   | j r| jgng }| jr| jgng }|| | }|d ur$|| | | }|S N)r   bos_token_idr   eos_token_idr+   token_ids_0token_ids_1rn   ro   outputr"   r"   r/    build_inputs_with_special_tokens   s   z/GemmaTokenizer.build_inputs_with_special_tokensrq   rr   already_has_special_tokensc                    s   |rt  j||ddS | jrdgng }| jrdgng }|du r*|dgt|  | S |dgt|  | | dgt|  | S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rq   rr   ru      Nr   )r)   get_special_tokens_maskr   r   len)r+   rq   rr   ru   rn   ro   r-   r"   r/   rw      s(   z&GemmaTokenizer.get_special_tokens_maskc                 C   s`   | j r| jgng }| jr| jgng }dgt|| |  }|dur.|dgt|| |  7 }|S )a  
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`list[int]`):
                List of ids.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        r   Nrv   )r   rn   r   ro   rx   rp   r"   r"   r/   $create_token_type_ids_from_sequences  s   z3GemmaTokenizer.create_token_type_ids_from_sequences	token_idsskip_special_tokensr!   c                 K   s   g }g }|D ]+}|r|| j v rq|| jv r,|r || j| || j| j g }q|| q|r=|| j| |rEd|}nd|}|tdS )N rR   )	all_special_ids_added_tokens_decoderrU   r'   rT   contentrb   replaceSPIECE_UNDERLINE)r+   rz   r{   r!   r,   	sub_textscurrent_sub_textidsr"   r"   r/   _decode1  s"   

zGemmaTokenizer._decode)
r   r   r   r   NTFFFFrm   )NF)FF) __name__
__module____qualname____doc__rc   vocab_files_namesmodel_input_namesr   dictr$   r   r*   r5   r9   propertyr<   rD   listrG   rK   rN   rQ   rY   tuplerl   rt   intboolrw   ry   r   __classcell__r"   r"   r-   r/   r   +   st    .*


&

$r   )r]   shutilr   typingr   r   r   r   r%   tokenization_utilsr   r   utilsr	   utils.import_utilsr
   tokenization_utils_baser   
get_loggerr   r`   rc   r   r   __all__r"   r"   r"   r/   <module>   s"   
  
%