o
    ߥi                     @   s   d Z ddlmZmZmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZmZ ddlmZ ddlmZ e ZdejiZdi iZd	d	d
Zddiddid
ZG dd deZdS )zTokenization classes for PoNet     )TYPE_CHECKINGAnyDictListOptionalUnion)PaddingStrategy)BertTokenizer)BatchEncodingEncodedInput)	ModelFile)
get_logger
vocab_filei   )z nlp_ponet_fill-mask_chinese-basez nlp_ponet_fill-mask_english-basedo_lower_caseTc                   @   sj   e Zd ZdZeZeZeZ	e
Zdejddfdeeeef ef dee dedee dee defd	d
ZdS )PoNetTokenizera  
    Construct an PoNet tokenizer. Based on BertTokenizer.

    This tokenizer inherits from :class:`~transformers.BertTokenizer` which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.

    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
    parameters.
    Nencoded_inputs
max_lengthpadding_strategypad_to_multiple_ofreturn_attention_maskreturnc           	      C   s  |du r	d| j v }|| j d  }|tjkrt|}|dur/|dur/|| dkr/|| d | }|tjko9t||k}|r|t| }| jdkr|rWdgt| dg|  |d< d|v rg|d | jg|  |d< d|v rv|d dg|  |d< d|v r|d |d d	 d g|  |d< || jg|  || j d < |S | jd
kr|rdg| dgt|  |d< d|v r| jg| |d  |d< d|v r|d d	 d g| |d  |d< d|v rdg| |d  |d< | jg| | || j d < |S tdt	| j |rd|vrdgt| |d< |S )a5  
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or
            batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_attention_mask: (optional) Set to False to avoid returning
            attention mask (default: set to model specifics)
        Nattention_maskr      righttoken_type_idsspecial_tokens_masksegment_idsleftzInvalid padding strategy:)
model_input_namesr   LONGESTlen
DO_NOT_PADpadding_sidepad_token_type_idpad_token_id
ValueErrorstr)	selfr   r   r   r   r   required_inputneeds_to_be_padded
difference r,   \/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/ponet/tokenization.py_pad@   s    





zPoNetTokenizer._pad)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesPRETRAINED_VOCAB_FILES_MAPpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESmax_model_input_sizesPRETRAINED_INIT_CONFIGURATIONpretrained_init_configurationr   r"   r   r   r'   r   r
   r   intbooldictr.   r,   r,   r,   r-   r   0   s.    
r   N)r2   typingr   r   r   r   r   r   transformers.file_utilsr   *transformers.models.bert.tokenization_bertr	   transformers.tokenization_utilsr
   r   modelscope.utils.constantr   modelscope.utils.loggerr   logger
VOCAB_FILEr3   r5   r7   r9   r   r,   r,   r,   r-   <module>   s&    

