o
    	۷i>                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
 ddlZddlmZ ddlmZ ddlmZ er@dd	lmZ dd
lmZmZ ddlmZ eeZddiZdZeddG dd deZdgZdS )z$Tokenization class for SigLIP model.    N)copyfile)TYPE_CHECKINGAnyOptional   )import_protobuf)PreTrainedTokenizer)
AddedToken)	TextInput)loggingrequires_backends)requires
vocab_filezspiece.modelu   ▁)sentencepiece)backendsc                
       s  e Zd ZdZeZddgZ							d;d	eee	e
f  d
df fddZdd Zedd Zdd Z	d<dee deee  ded
ee f fddZdee d
ee fddZ	d=dee deee  d
ee fddZ	d=dee deee  d
ee fddZd d! Zd"d# Zd$e	d
e	fd%d&Zdd'd(d)Zd>d$d*d
ee	 f fd+d,Zed-d. Zd/d0 Zd1d2 Zd3d4 Zd5d6 Z d=d7e	d8ee	 d
e!e	 fd9d:Z"  Z#S )?SiglipTokenizera  
    Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"</s>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        model_max_length (`int`, *optional*, defaults to 64):
            The maximum length (in number of tokens) for model inputs.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
    	input_idsattention_mask</s><unk>N@   Tsp_model_kwargsreturnc	           
   
      s   t | d t|trt|dddddn|}t|tr#t|dddddn|}t|tr3t|dddddn|}|d u r;i n|| _|| _|| _|  | _|| _t	 j
d||||| j||d|	 d S )NprotobufTF)rstriplstrip
normalizedspecial)	eos_token	unk_token	pad_tokenadditional_special_tokensr   model_max_lengthdo_lower_case )r   
isinstancestrr	   r   r#   r   get_spm_processorsp_modelsuper__init__)
selfr   r   r   r    r!   r   r"   r#   kwargs	__class__r$   d/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/siglip/tokenization_siglip.pyr*   Z   s:   


zSiglipTokenizer.__init__c                 C   s   t jdi | j}t| jd,}| }t }|j|}|	 }d|_
|j| | }|| W d    |S 1 s>w   Y  |S )NrbFr$   )spmSentencePieceProcessorr   openr   readr   
ModelProto
FromStringNormalizerSpecadd_dummy_prefixnormalizer_spec	MergeFromSerializeToStringLoadFromSerializedProto)r+   	tokenizerfr(   	model_pb2modelr9   r$   r$   r/   r'      s   
		z!SiglipTokenizer.get_spm_processorc                 C   s
   | j  S N)r(   get_piece_sizer+   r$   r$   r/   
vocab_size   s   
zSiglipTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r$   )convert_ids_to_tokens).0irC   r$   r/   
<dictcomp>   s    z-SiglipTokenizer.get_vocab.<locals>.<dictcomp>)rangerD   updateadded_tokens_encoder)r+   vocabr$   rC   r/   	get_vocab   s   zSiglipTokenizer.get_vocabFtoken_ids_0token_ids_1already_has_special_tokensc                    sZ   |rt  j||ddS |du rdgt| dg S dgt| dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rN   rO   rP   Nr      )r)   get_special_tokens_masklen)r+   rN   rO   rP   r-   r$   r/   rR      s   (z'SiglipTokenizer.get_special_tokens_mask	token_idsc                 C   s>   t |dkr|d | jkrtd| j d |S || jg S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)rS   eos_token_idwarningswarnr   )r+   rT   r$   r$   r/   _add_eos_if_not_present   s   z'SiglipTokenizer._add_eos_if_not_presentc                 C   s<   | j g}|du rt|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        Nr   )rV   rS   )r+   rN   rO   eosr$   r$   r/   $create_token_type_ids_from_sequences   s   z4SiglipTokenizer.create_token_type_ids_from_sequencesc                 C   s(   |  |}|du r|S |  |}|| S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)rY   )r+   rN   rO   r$   r$   r/    build_inputs_with_special_tokens   s
   

z0SiglipTokenizer.build_inputs_with_special_tokensc                 C   s   | j  }d |d< |S )Nr(   )__dict__copy)r+   stater$   r$   r/   __getstate__   s   
zSiglipTokenizer.__getstate__c                 C   s<   || _ t| dsi | _tjdi | j| _| j| j d S )Nr   r$   )r]   hasattrr   r1   r2   r(   Loadr   )r+   dr$   r$   r/   __setstate__  s
   
zSiglipTokenizer.__setstate__textc                 C   s   | tddtjS )N )	translater&   	maketransstringpunctuation)r+   re   r$   r$   r/   remove_punctuation  s   z"SiglipTokenizer.remove_punctuationkeep_punctuation_exact_stringc                   sH   |r|  fdd||D }n |}tdd|}| }|S )a  Returns canonicalized `text` (puncuation removed).

        Args:
            text (`str`):
                String to be canonicalized.
            keep_punctuation_exact_string (`str`, *optional*):
                If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
                (but will still remove '{' and '}' that appear separately).
        c                 3   s    | ]}  |V  qd S rA   )rk   )rF   partrC   r$   r/   	<genexpr>  s    

z4SiglipTokenizer.canonicalize_text.<locals>.<genexpr>z\s+ )joinsplitrk   resubstrip)r+   re   rm   r$   rC   r/   canonicalize_text  s   


z!SiglipTokenizer.canonicalize_textr
   c                    sV   t  jt|td fi |}t|dkr)|d tkr)|d | jv r)|dd }|S )z8
        Converts a string to a list of tokens.
        rp   rQ   r   N)r)   tokenizeSPIECE_UNDERLINEreplacerS   all_special_tokens)r+   re   add_special_tokensr,   tokensr-   r$   r/   rw   (  s    &zSiglipTokenizer.tokenizec                 C   s   t | jt| jS rA   )rS   r(   encoder&   r   rC   r$   r$   r/   unk_token_length2  s   z SiglipTokenizer.unk_token_lengthc                 K   sT   | j |dd}| jj|td}| jj| j| td}t|| jkr(|| jd S |S )u*  
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE.

        For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give `['H', 'e', 'y']` instead of `['▁He', 'y']`.

        Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        Nrl   )out_type)rv   r(   r}   r&   r   rS   r~   )r+   re   r,   r|   r$   r$   r/   	_tokenize7  s    zSiglipTokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r(   piece_to_id)r+   tokenr$   r$   r/   _convert_token_to_idL  s   z$SiglipTokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r(   	IdToPiece)r+   indexr   r$   r$   r/   _convert_id_to_tokenQ  s   z$SiglipTokenizer._convert_id_to_tokenc                 C   sp   g }d}d}|D ]#}|| j v r$|s|d7 }|| j|| 7 }d}g }q|| d}q|| j|7 }| S )z:Converts a sequence of tokens (string) in a single string.rf   Frp   T)rz   r(   decodeappendru   )r+   r|   current_sub_tokens
out_stringprev_is_specialr   r$   r$   r/   convert_tokens_to_stringV  s   

z(SiglipTokenizer.convert_tokens_to_stringsave_directoryfilename_prefixc                 C   s   t j|std| d d S t j||r|d ndtd  }t j| jt j|kr?t j	| jr?t
| j| |fS t j	| jsgt|d}| j }|| W d    |fS 1 sbw   Y  |fS )NzVocabulary path (z) should be a directory-rf   r   wb)ospathisdirloggererrorrq   VOCAB_FILES_NAMESabspathr   isfiler   r3   r(   serialized_model_protowrite)r+   r   r   out_vocab_fileficontent_spiece_modelr$   r$   r/   save_vocabularyj  s"   (

zSiglipTokenizer.save_vocabulary)r   r   r   NNr   T)NFrA   )F)$__name__
__module____qualname____doc__r   vocab_files_namesmodel_input_namesr   dictr&   r   r*   r'   propertyrD   rM   listintboolrR   rY   r[   r\   r`   rd   rk   rv   rw   r~   r   r   r   r   tupler   __classcell__r$   r$   r-   r/   r   -   sv    (1








(r   ) r   r   rs   ri   rW   shutilr   typingr   r   r   r   r1   convert_slow_tokenizerr   tokenization_utilsr   tokenization_utils_baser	   r
   utilsr   r   utils.import_utilsr   
get_loggerr   r   r   rx   r   __all__r$   r$   r$   r/   <module>   s.   
  
P