o
    ei7                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	 ddl
ZddlmZ ddlmZ er8ddlmZ dd	lmZmZ dd
lmZ eeZddiZdZeddG dd deZdgZdS )z$Tokenization class for SigLIP model.    N)copyfile)TYPE_CHECKINGAny   )
AddedToken)SentencePieceBackend)	TextInput)loggingrequires_backends)requires
vocab_filezspiece.modelu   ▁)sentencepiece)backendsc                
       s  e Zd ZdZeZddgZ							d9d	eee	f dB d
df fddZ
edd Zdd Z	d:dee dee dB ded
ee f fddZdee d
ee fddZ	d;dee dee dB d
ee fddZ	d;dee dee dB d
ee fddZdd Zd d! Zd"ed
efd#d$Zdd%d&d'Zd<d"d(d
ee f fd)d*Zed+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Zd;d5ed6edB d
ee fd7d8Z   Z!S )=SiglipTokenizera  
    Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"</s>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        model_max_length (`int`, *optional*, defaults to 64):
            The maximum length (in number of tokens) for model inputs.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
    	input_idsattention_mask</s><unk>N@   Tsp_model_kwargsreturnc	           
         s   t | d t|trt|dddddn|}t|tr#t|dddddn|}t|tr3t|dddddn|}|d u r;i n|| _|| _t jd|||||| j||d|	 d S )NprotobufTF)rstriplstrip
normalizedspecial)r   	eos_token	unk_token	pad_tokenadditional_special_tokensr   model_max_lengthdo_lower_case )r
   
isinstancestrr   r   r!   super__init__)
selfr   r   r   r   r   r   r    r!   kwargs	__class__r"   l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/siglip/tokenization_siglip.pyr&   X   s6   
	
zSiglipTokenizer.__init__c                 C   s
   | j  S N)sp_modelget_piece_sizer'   r"   r"   r+   
vocab_size   s   
zSiglipTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r"   )convert_ids_to_tokens).0ir/   r"   r+   
<dictcomp>   s    z-SiglipTokenizer.get_vocab.<locals>.<dictcomp>)ranger0   updateadded_tokens_encoder)r'   vocabr"   r/   r+   	get_vocab   s   zSiglipTokenizer.get_vocabFtoken_ids_0token_ids_1already_has_special_tokensc                    sZ   |rt  j||ddS |du rdgt| dg S dgt| dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r:   r;   r<   Nr      )r%   get_special_tokens_masklen)r'   r:   r;   r<   r)   r"   r+   r>      s   (z'SiglipTokenizer.get_special_tokens_mask	token_idsc                 C   s>   t |dkr|d | jkrtd| j d |S || jg S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r?   eos_token_idwarningswarnr   )r'   r@   r"   r"   r+   _add_eos_if_not_present   s   z'SiglipTokenizer._add_eos_if_not_presentc                 C   s<   | j g}|du rt|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        Nr   )rB   r?   )r'   r:   r;   eosr"   r"   r+   $create_token_type_ids_from_sequences   s   z4SiglipTokenizer.create_token_type_ids_from_sequencesc                 C   s(   |  |}|du r|S |  |}|| S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)rE   )r'   r:   r;   r"   r"   r+    build_inputs_with_special_tokens   s
   

z0SiglipTokenizer.build_inputs_with_special_tokensc                 C   s   | j  }d |d< |S )Nr-   )__dict__copy)r'   stater"   r"   r+   __getstate__   s   
zSiglipTokenizer.__getstate__c                 C   s<   || _ t| dsi | _tjdi | j| _| j| j d S )Nr   r"   )rI   hasattrr   spmSentencePieceProcessorr-   Loadr   )r'   dr"   r"   r+   __setstate__   s
   
zSiglipTokenizer.__setstate__textc                 C   s   | tddtjS )N )	translater$   	maketransstringpunctuation)r'   rS   r"   r"   r+   remove_punctuation   s   z"SiglipTokenizer.remove_punctuationkeep_punctuation_exact_stringc                   sV    j r| }|r| fdd||D }n |}tdd|}| }|S )a  Returns canonicalized `text` (puncuation removed).

        Args:
            text (`str`):
                String to be canonicalized.
            keep_punctuation_exact_string (`str`, *optional*):
                If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
                (but will still remove '{' and '}' that appear separately).
        c                 3   s    | ]}  |V  qd S r,   )rY   )r2   partr/   r"   r+   	<genexpr>  s    

z4SiglipTokenizer.canonicalize_text.<locals>.<genexpr>z\s+ )r!   lowerjoinsplitrY   resubstrip)r'   rS   r[   r"   r/   r+   canonicalize_text   s   


z!SiglipTokenizer.canonicalize_textr   c                    sV   t  jt|td fi |}t|dkr)|d tkr)|d | jv r)|dd }|S )z8
        Converts a string to a list of tokens.
        r^   r=   r   N)r%   tokenizeSPIECE_UNDERLINEreplacer?   all_special_tokens)r'   rS   add_special_tokensr(   tokensr)   r"   r+   rf     s    &zSiglipTokenizer.tokenizec                 C   s   t | jt| jS r,   )r?   r-   encoder$   r   r/   r"   r"   r+   unk_token_length  s   z SiglipTokenizer.unk_token_lengthc                 K   sT   | j |dd}| jj|td}| jj| j| td}t|| jkr(|| jd S |S )u*  
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE.

        For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give `['H', 'e', 'y']` instead of `['▁He', 'y']`.

        Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        NrZ   )out_type)re   r-   rl   r$   r   r?   rm   )r'   rS   r(   rk   r"   r"   r+   	_tokenize  s    zSiglipTokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r-   piece_to_id)r'   tokenr"   r"   r+   _convert_token_to_id2  s   z$SiglipTokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r-   	IdToPiece)r'   indexrq   r"   r"   r+   _convert_id_to_token6  s   z$SiglipTokenizer._convert_id_to_tokenc                 C   sp   g }d}d}|D ]#}|| j v r$|s|d7 }|| j|| 7 }d}g }q|| d}q|| j|7 }| S )z:Converts a sequence of tokens (string) in a single string.rT   Fr^   T)ri   r-   decodeappendrd   )r'   rk   current_sub_tokens
out_stringprev_is_specialrq   r"   r"   r+   convert_tokens_to_string;  s   

z(SiglipTokenizer.convert_tokens_to_stringsave_directoryfilename_prefixc                 C   s   t j|std| d d S t j||r|d ndtd  }t j| jt j|kr?t j	| jr?t
| j| |fS t j	| jsgt|d}| j }|| W d    |fS 1 sbw   Y  |fS )NzVocabulary path (z) should be a directory-rT   r   wb)ospathisdirloggererrorr`   VOCAB_FILES_NAMESabspathr   isfiler   openr-   serialized_model_protowrite)r'   r|   r}   out_vocab_fileficontent_spiece_modelr"   r"   r+   save_vocabularyN  s"   (

zSiglipTokenizer.save_vocabulary)r   r   r   NNr   T)NFr,   )F)"__name__
__module____qualname____doc__r   vocab_files_namesmodel_input_namesdictr$   r   r&   propertyr0   r9   listintboolr>   rE   rG   rH   rL   rR   rY   re   rf   rm   ro   rr   ru   r{   tupler   __classcell__r"   r"   r)   r+   r   +   st    (-








(r   )r   r   rb   rW   rC   shutilr   typingr   r   r   rN   tokenization_utils_baser    tokenization_utils_sentencepiecer   r   utilsr	   r
   utils.import_utilsr   
get_loggerr   r   r   rg   r   __all__r"   r"   r"   r+   <module>   s,   
  
6