o
    eiM                     @   sn   d dl mZmZmZmZmZ d dlmZ ddlm	Z	 ddl
mZ eeZdddZG d	d
 d
e	Zd
gZdS )    )	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPE   )TokenizersBackend)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_filec                       s   e Zd ZdZeZddgZeZ										dd
e	e
e	ef B dB de	ee	 B dB de	de	de	de	de	de	dB de	dB f fddZ  ZS )HerbertTokenizera  
    Construct a BPE tokenizer for HerBERT (backed by HuggingFace's tokenizers library).

    Peculiarities:

    - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
      a punctuation character will be treated separately.

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the methods. Users should refer to the
    superclass for more information regarding methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The padding token.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The mask token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token.
        vocab (`str`, `dict` or `list`, *optional*):
            Custom vocabulary dictionary.
        merges (`str` or `list[str]`, *optional*):
            Custom merges list.
    	input_idsattention_maskN<s><unk><pad><mask></s>vocabmerges	cls_token	unk_token	pad_token
mask_token	sep_tokenr   r   c
                    s   |d ur|nt |di| _|pg | _tt| j| jd t |dd| _tjddddd| j_t	
 | j_tjdd| j_t jd|||||d|
 tj| jd	f| jdfd
| j_d S )Nr   z</w>)r   r   dropoutr   end_of_word_suffixFT)	lowercasestrip_accents
clean_texthandle_chinese_chars)suffix)r   r   r   r   r      )sepcls )str_vocab_mergesr   r   
_tokenizerr   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizerr   
BPEDecoderdecodersuper__init__r   BertProcessingr   r   post_processor)selfr   r   r   r   r   r   r   r   r   kwargs	__class__r&   n/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/herbert/tokenization_herbert.pyr2   A   s:   


	zHerbertTokenizer.__init__)	NNr   r   r   r   r   NN)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr'   dictintlistr2   __classcell__r&   r&   r7   r9   r      sB     	
r   N)
tokenizersr   r   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr	   utilsr
   
get_loggerr:   loggerr>   r   __all__r&   r&   r&   r9   <module>   s   


S