o
    ei                     @   sp   d Z ddlmZmZmZmZ ddlmZ ddlm	Z	 ddl
mZ eeZddd	d
ZG dd de	ZdgZdS )z$Tokenization classes for OpenAI GPT.    )	Tokenizerdecodersnormalizerspre_tokenizers)BPE   )TokenizersBackend)loggingz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec                       sn   e Zd ZdZeZddgZeZ			dde	e
e	ef B dB de	ee	 B dB de	f fd	d
Zedd Z  ZS )OpenAIGPTTokenizera  
    Construct a GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
    the following peculiarities:

    - lower case all inputs
    - uses BERT's BasicTokenizer for pre-BPE tokenization

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to a tokenizers JSON file containing the serialization of a tokenizer.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        vocab (`str` or `dict[str, int]`, *optional*):
            Custom vocabulary dictionary. If not provided, a blank vocabulary is initialized.
        merges (`str` or `list[str]`, *optional*):
            Custom merges list. If not provided, an empty list is used.
    	input_idsattention_maskN<unk>vocabmerges	unk_tokenc              
      s   |d ur|nt |di| _|pg | _tt| j| jd dddt |d| _tt t	 t
 g| j_t | j_tjdd| j_t jdd|i| d S )	Nr    z</w>F)r   r   dropoutcontinuing_subword_prefixend_of_word_suffixfuse_unkr   )suffixr    )str_vocab_mergesr   r   
_tokenizerr   SequenceNFD	LowercaseStripAccents
normalizerr   BertPreTokenizerpre_tokenizerr   
BPEDecoderdecodersuper__init__)selfr   r   r   kwargs	__class__r   l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/openai/tokenization_openai.pyr)   ;   s4   


zOpenAIGPTTokenizer.__init__c                 C   s   dS )NTr   )r*   r   r   r.   do_lower_casec   s   z OpenAIGPTTokenizer.do_lower_case)NNr   )__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr   dictintlistr)   propertyr/   __classcell__r   r   r,   r.   r      s"    (r   N)r3   
tokenizersr   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr   utilsr	   
get_loggerr0   loggerr4   r   __all__r   r   r   r.   <module>   s   

L