o
    i	                     @   sd   d Z ddlmZ ddlmZ ddlmZ ddlmZ e	e
Zdd	d
dZG dd deZdgZdS )z)Fast Tokenization classes for OpenAI GPT.    )Optional   )PreTrainedTokenizerFast)logging   )OpenAIGPTTokenizerz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec                       s^   e Zd ZdZeZddgZeZd fdd	Z	e
dd	 Zdd
edee dee fddZ  ZS )OpenAIGPTTokenizerFasta  
    Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
    the following peculiarities:

    - lower case all inputs
    - uses BERT's BasicTokenizer for pre-BPE tokenization

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    	input_idsattention_maskN<unk>c                    s    t  j||f||d| d S )N)r
   	unk_token)super__init__)selfr   r	   r
   r   kwargs	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/openai/tokenization_openai_fast.pyr   6   s    zOpenAIGPTTokenizerFast.__init__c                 C   s   dS )NTr   )r   r   r   r   do_lower_case9   s   z$OpenAIGPTTokenizerFast.do_lower_casesave_directoryfilename_prefixreturnc                 C   s   | j jj||d}t|S )N)name)
_tokenizermodelsavetuple)r   r   r   filesr   r   r   save_vocabulary=   s   z&OpenAIGPTTokenizerFast.save_vocabulary)NNNr   )N)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   slow_tokenizer_classr   propertyr   strr   r    r"   __classcell__r   r   r   r   r      s    
(r   N)r&   typingr   tokenization_utils_fastr   utilsr   tokenization_openair   
get_loggerr#   loggerr'   r   __all__r   r   r   r   <module>   s   

%