"""Tokenization class for model T5."""

import os
import re
import warnings
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Optional

import sentencepiece as spm

from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken
from ...utils import logging
from ...utils.import_utils import requires


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

SPIECE_UNDERLINE = "▁"


@requires(backends=("sentencepiece",))
class T5Tokenizer(PreTrainedTokenizer):
    """
    Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 100):
            Add a number of extra ids to the vocabulary for use as sentinels. These tokens are accessible as
            "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. The tokens can be retrieved by
            calling the `get_sentinel_tokens` method, and their token ids by calling the `get_sentinel_token_ids`
            method.
        additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        legacy (`bool`, *optional*):
            Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy refers to the behaviour
            before the merge of #24622 and #25224, which include fixes to properly handle tokens that appear after
            special tokens. A simple example:

            - `legacy=True`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
            >>> tokenizer.encode("Hello <extra_id_0>.")
            [8774, 32099, 3, 5, 1]
            ```
            - `legacy=False`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
            >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
            [8774, 32099, 5, 1]
            ```
            Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
        add_prefix_space (`bool`, *optional*, defaults to `True`):
            Whether or not to add an initial space to the input. This allows the leading word to be treated just like
            any other word.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
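
    Example (a minimal usage sketch; `google-t5/t5-small` is a public checkpoint, and loading it requires a local
    copy or access to the Hugging Face Hub):

    ```python
    >>> from transformers import T5Tokenizer

    >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
    >>> tokens = tokenizer.tokenize("Translate English to German: Hello.")
    >>> ids = tokenizer.encode("Translate English to German: Hello.")  # appends the `</s>` (eos) id
    >>> sentinels = tokenizer.get_sentinel_tokens()  # the "<extra_id_*>" sentinel tokens described above
    ```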
    	input_idsattention_mask</s><unk><pad>d   NTsp_model_kwargsreturnc
                    s  t |trt|ddn|}t |trt|ddn|}t |tr%t|ddn|}|d u r-i n|| _|| _|| _tjdi | j| _| j	| |d urydd |D }t
|dk rc|dd t|D 7 }n!|dkrx|t
|krxtd| d	| d
ndd t|D }|}i | _tt
|D ]}td| ddddddd| jt
| jd | | < q|d u rtd| j d d}|| _| |
dd| _|	| _t jd|||||| j||	d|
 d S )NT)specialc                 S   s   g | ]
}d t |v r|qS )
<extra_id_)str).0x r   \/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5.py
<listcomp>   s    z(T5Tokenizer.__init__.<locals>.<listcomp>   c                 S      g | ]}d | dqS r   >r   r   ir   r   r   r           r   zBoth extra_ids (z!) and additional_special_tokens (zk) are provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokensc                 S   r"   r#   r   r%   r   r   r   r       r'   r   r$   F)single_wordlstriprstripr   
normalizedz2You are using the default legacy behaviour of the a_  . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565	from_slow)	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensr   legacyadd_prefix_spacer   )
isinstancer   r	   r   r   
_extra_idsspmSentencePieceProcessorsp_modelLoadlenrange
ValueError_added_tokens_decoderloggerwarning_once	__class__r2   get_spm_processorpopr3   super__init__)selfr   r-   r.   r/   r0   r1   r   r2   r3   kwargsextra_tokensr&   r@   r   r   rD      sX    	
zT5Tokenizer.__init__Fc                 C   s   t jdi | j}| js|r|| j |S t| jd3}| }td| j	j
 d}|j|}| }d|_|j| | }|| W d    |S 1 sRw   Y  |S )NrbzThe new behaviour of z (with `self.legacy = False`)Fr   )r6   r7   r   r2   r9   r   openreadr   r@   __name__
ModelProto
FromStringNormalizerSpecadd_dummy_prefixnormalizer_spec	MergeFromSerializeToStringLoadFromSerializedProto)rE   r,   	tokenizerfr8   	model_pb2modelrQ   r   r   r   rA      s"   

		zT5Tokenizer.get_spm_processorc                 C   sZ   | t jv r+t j|  }|d ur||kr|S |d u r+td| d|  d| d| d	t |S )NzGThis tokenizer was incorrectly instantiated with a model max length of z which will be corrected in Transformers v5.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on z( automatically truncating your input to zM when padding/encoding.
- If you want to encode/pad to sequences longer than z you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.)r   max_model_input_sizeswarningswarnFutureWarning)pretrained_model_name_or_pathmax_model_lengthinit_max_model_lengthdeprecated_max_model_lengthr   r   r   !_eventually_correct_t5_max_length   s$   

	z-T5Tokenizer._eventually_correct_t5_max_lengthc                 C   s
   | j  S N)r8   get_piece_sizerE   r   r   r   
vocab_size   s   
zT5Tokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r   )convert_ids_to_tokensr%   rd   r   r   
<dictcomp>   r'   z)T5Tokenizer.get_vocab.<locals>.<dictcomp>)r;   re   updateadded_tokens_encoder)rE   vocabr   rd   r   	get_vocab   s   zT5Tokenizer.get_vocabtoken_ids_0token_ids_1already_has_special_tokensc                    sZ   |rt  j||ddS |du rdgt| dg S dgt| dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
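
        Example (an illustrative sketch; the token ids `10`, `20`, `30` are arbitrary placeholders):

        ```python
        >>> tokenizer.get_special_tokens_mask([10, 20], [30])  # one trailing eos (`</s>`) per sequence
        [0, 0, 1, 0, 1]
        ```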
        T)rl   rm   rn   Nr   r!   )rC   get_special_tokens_maskr:   )rE   rl   rm   rn   rH   r   r   ro      s   (z#T5Tokenizer.get_special_tokens_maskc                 C   s   t ttdd | jS )Nc                 S   s   t td| d uS )Nz<extra_id_\d+>)boolresearch)r   r   r   r   <lambda>  s    z1T5Tokenizer.get_sentinel_tokens.<locals>.<lambda>)listsetfilterr1   rd   r   r   r   get_sentinel_tokens  s   zT5Tokenizer.get_sentinel_tokensc                    s    fdd   D S )Nc                    s   g | ]}  |qS r   )convert_tokens_to_ids)r   tokenrd   r   r   r      s    z6T5Tokenizer.get_sentinel_token_ids.<locals>.<listcomp>)rw   rd   r   rd   r   get_sentinel_token_ids  s   z"T5Tokenizer.get_sentinel_token_ids	token_idsc                 C   s>   t |dkr|d | jkrtd| j d |S || jg S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r:   eos_token_idrZ   r[   r-   )rE   r{   r   r   r   _add_eos_if_not_present  s   z#T5Tokenizer._add_eos_if_not_presentc                 C   s<   | j g}|du rt|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
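
        Example (an illustrative sketch; the token ids are arbitrary placeholders):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([10, 20], [30])  # one zero per token, eos included
        [0, 0, 0, 0, 0]
        ```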
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
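
        Example (an illustrative sketch; the ids `10`, `20`, `30` are placeholders, while `1` is the eos id used by
        the standard T5 vocabularies):

        ```python
        >>> tokenizer.build_inputs_with_special_tokens([10, 20], [30])
        [10, 20, 1, 30, 1]
        ```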
        N)r~   )rE   rl   rm   r   r   r    build_inputs_with_special_tokens=  s
   

z,T5Tokenizer.build_inputs_with_special_tokensc                 C   s   | j  }d |d< |S )Nr8   )__dict__copy)rE   stater   r   r   __getstate__W  s   
zT5Tokenizer.__getstate__c                 C   s<   || _ t| dsi | _tjdi | j| _| j| j d S )Nr   r   )r   hasattrr   r6   r7   r8   r9   r   )rE   dr   r   r   __setstate__\  s
   
zT5Tokenizer.__setstate__textr
   c                    s   | j s	t|dkrt j|fi |S |td}| jr t| }t j|fi |}t|dkrC|d tkrC|d | jv rC|dd }|S )z
        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
        first token is special.
        """
        if self.legacy or len(text) == 0:
            return super().tokenize(text, **kwargs)

        text = text.replace(SPIECE_UNDERLINE, " ")
        if self.add_prefix_space:
            text = SPIECE_UNDERLINE + text

        tokens = super().tokenize(text, **kwargs)

        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We deactivated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type=str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`:
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type=str)[4:]`.
        """
        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return self.sp_model.encode(text, out_type=str)

        # 1. Encode string + prefix, e.g. "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Remove self.unk_token from the result, e.g. ['<', 'unk', '>', '▁Hey'] -> ['▁Hey']
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
            tokens[0] = tokens[0][1:]

        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using the sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)


__all__ = ["T5Tokenizer"]