o
    eiR                     @   sx   d Z ddlmZmZmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZ eeZddd	d
ZG dd deZdgZdS )zTokenization classes for CLIP.    )Regex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPE   )TokenizersBackend)loggingz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec                       s|   e Zd ZdZeZddgZeZ						dde	e
e	ef B dB de	ee	 B dB d	e	d
e	de	de	f fddZdd Z  ZS )CLIPTokenizerav  
    Construct a CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab (`str`, `dict` or `list`, *optional*):
            Vocabulary dict to use for the tokenizer.
        merges (`str` or `list`, *optional*):
            Merges list to use for the BPE tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
    	input_idsattention_maskN<|endoftext|><|startoftext|>vocabmerges	unk_token	bos_token	eos_token	pad_tokenc           	   
      s  |d ur|nt |dt |dt |di}|pg | _tt|| jd dddt |d| _tt tt	dd	t
 g| j_ttjt	d
dddtjddg| j_t | j_t jd||||d| tjt || jft || jfddd| j_|   d S )Nr          z</w>F)r   r   dropoutcontinuing_subword_prefixend_of_word_suffixfuse_unkr   z\s+ z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedT)behaviorinvert)add_prefix_space)r   r   r   r   )sepclsr%   trim_offsets )str_mergesr   r   
_tokenizerr   SequenceNFCReplacer   	Lowercase
normalizerr   Split	ByteLevelpre_tokenizerr   decodersuper__init__r   RobertaProcessingeos_token_idbos_token_idpost_processor%_wrap_decode_method_backend_tokenizer)	selfr   r   r   r   r   r   kwargs_vocab	__class__r)   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/clip/tokenization_clip.pyr7   8   sb   



zCLIPTokenizer.__init__c                    s,   | j j| j jj  fdd}|| j _d S )Nc                     s"   | i |}|  d }|S )Nr!   )replacestrip)argsr>   textr   orig_decode_methodr)   rB   new_decode_method   s   zNCLIPTokenizer._wrap_decode_method_backend_tokenizer.<locals>.new_decode_method)backend_tokenizerdecodemodelr   )r=   rI   r)   rG   rB   r<      s   
z3CLIPTokenizer._wrap_decode_method_backend_tokenizer)NNr   r   r   r   )__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   rL   r*   dictintlistr7   r<   __classcell__r)   r)   r@   rB   r      s2    Gr   N)rP   
tokenizersr   r   r   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr
   utilsr   
get_loggerrM   loggerrQ   r   __all__r)   r)   r)   rB   <module>   s    

r