o
    ei$                     @   s   d Z ddlZddlmZmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZ eeZdd	iZd
d ZG dd deZdgZdS )z"Tokenization classes for Splinter.    N)	Tokenizerdecodersnormalizerspre_tokenizers
processors)	WordPiece   )TokenizersBackend)logging
vocab_filez	vocab.txtc                 C   sf   t  }t| ddd}| }W d    n1 sw   Y  t|D ]\}}|d}|||< q#|S )Nrzutf-8)encoding
)collectionsOrderedDictopen	readlines	enumeraterstrip)r   vocabreadertokensindextoken r   p/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/splinter/tokenization_splinter.py
load_vocab   s   


r   c                       s   e Zd ZdZeZddgZeZ								
			dde	e
e	ef B dB dede	de	de	de	de	de	dededB f fddZedd Zdd Z  ZS )SplinterTokenizera  
    Construct a Splinter tokenizer (backed by HuggingFace's tokenizers library). Based on WordPiece.

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to a vocabulary file.
        tokenizer_file (`str`, *optional*):
            Path to a tokenizers JSON file containing the serialization of a tokenizer.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values.
        question_token (`str`, *optional*, defaults to `"[QUESTION]"`):
            The token used for constructing question representations.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase`.
        vocab (`str`, `dict` or `list`, *optional*):
            Custom vocabulary dictionary. If not provided, a minimal vocabulary is created.
    	input_idsattention_maskNT[UNK][SEP][PAD][CLS][MASK]
[QUESTION]r   do_lower_case	unk_token	sep_token	pad_token	cls_token
mask_tokenquestion_tokentokenize_chinese_charsstrip_accentsc                    s   |d ur|nt |dt |dt |dt |dt |dt |dddi| _tt| jt |d	| _tjd
|	|
|d| j_t	 | j_
tjdd| j_t jd||||||||	|
d	| || _|	| _|
| _|| _| j| jvrv| j| jgd
d |   d S )Nr         r         .   )r'   T)
clean_texthandle_chinese_charsr.   	lowercasez##)prefix)	r'   r(   r)   r*   r+   r,   r&   r-   r.   )special_tokensr   )str_vocabr   r   
_tokenizerr   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizerr   decodersuper__init__r&   r-   r.   r,   all_special_tokens
add_tokensupdate_post_processor)selfr   r&   r'   r(   r)   r*   r+   r,   r-   r.   kwargs	__class__r   r   rC   Q   sP   

zSplinterTokenizer.__init__c                 C   s   |  | jS )N)convert_tokens_to_idsr,   )rG   r   r   r   question_token_id   s   z#SplinterTokenizer.question_token_idc           
   
   C   s   | j }| j}| j}d}| j}| j}| j}| d}|d u s!|d u r#d S | jdkr:| d| d| d| d| d
}	n| d| d| d| d| d
}	tj	| d| d|	||f||f||f||fgd| j
_d S )	Nr3   rightz:0 $A:0  z:0 $B:1 z:1z:0)singlepairr9   )r*   r(   r,   cls_token_idsep_token_idrL   rK   padding_sider   TemplateProcessingr<   post_processor)
rG   clssepquestiondotrQ   rR   rL   dot_token_idrP   r   r   r   rF      s,   

$"z'SplinterTokenizer.update_post_processor)
NTr    r!   r"   r#   r$   r%   TN)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr:   dictintboolrC   propertyrL   rF   __classcell__r   r   rI   r   r   )   sN    #	
<
r   )r^   r   
tokenizersr   r   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr	   utilsr
   
get_loggerr[   loggerr_   r   r   __all__r   r   r   r   <module>   s   

 
