o
    i                      @   s   U d Z ddlmZ ddlmZmZ ddlmZ ee	Z
dZdZdZdZd	Zd
ZdZedededededediZeeef ed< dd e D Zeeef ed< G dd deZdgZdS )z Tokenization classes for CANINE.    )Optional   )
AddedTokenPreTrainedTokenizer)loggingi   i   i  i  i  i  z[CLS]z[SEP]z[BOS]z[MASK]z[PAD]z
[RESERVED]SPECIAL_CODEPOINTSc                 C      i | ]\}}||qS  r	   ).0	codepointnamer	   r	   k/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/canine/tokenization_canine.py
<dictcomp>7       r   SPECIAL_CODEPOINTS_BY_NAMEc                
       s  e Zd ZdZeeeeeeeeeeeeddf fdd	Z	e
defddZd	d
 Zdedee fddZdedefddZdedefddZdd Z	d"dee deee  dee fddZ	d#dee deee  dedee f fddZd"dedee fd d!Z  ZS )$CanineTokenizera  
    Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
    converts each character into its Unicode code point.

    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

    Args:
        model_max_length (`int`, *optional*, defaults to 2048):
                The maximum sentence length the model accepts.
    Fi   c	                    s  t |trt|dddn|}t |trt|dddn|}t |tr(t|dddn|}t |tr6t|dddn|}t |trDt|dddn|}t |trRt|dddn|}i | _t D ]	\}
}|
| j|< q[dd | j D | _t| _t	| j| _
t jd||||||||d|	 d S )NF)lstriprstripTc                 S   r   r	   r	   )r
   r   r   r	   r	   r   r   c   s    
z,CanineTokenizer.__init__.<locals>.<dictcomp>)	bos_token	eos_token	sep_token	cls_token	pad_token
mask_tokenadd_prefix_spacemodel_max_lengthr	   )
isinstancestrr   _special_codepointsr   items_special_codepoint_stringsUNICODE_VOCAB_SIZE_unicode_vocab_sizelen_num_special_tokenssuper__init__)selfr   r   r   r   r   r   r   r   kwargsr   r   	__class__r	   r   r&   H   s4   	
zCanineTokenizer.__init__returnc                 C   s   | j S N)r"   )r'   r	   r	   r   
vocab_sizev   s   zCanineTokenizer.vocab_sizec                 C   s$   dd t | jD }|| j |S )Nc                 S   s   i | ]}t ||qS r	   )chr)r
   ir	   r	   r   r   {   r   z-CanineTokenizer.get_vocab.<locals>.<dictcomp>)ranger-   updateadded_tokens_encoder)r'   vocabr	   r	   r   	get_vocabz   s   zCanineTokenizer.get_vocabtextc                 C   s   t |S )z5Tokenize a string (i.e. perform character splitting).)list)r'   r5   r	   r	   r   	_tokenize   s   zCanineTokenizer._tokenizetokenc                 C   s*   zt |W S  ty   td| dw )zaConverts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).zinvalid token: '')ord	TypeError
ValueError)r'   r8   r	   r	   r   _convert_token_to_id   s
   
z$CanineTokenizer._convert_token_to_idindexc                 C   s:   z|t v r
t | W S t|W S  ty   td| w )z
        Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to
        human-readable format.
        zinvalid id: )r   r.   r;   r<   )r'   r>   r	   r	   r   _convert_id_to_token   s   

z$CanineTokenizer._convert_id_to_tokenc                 C   s
   d |S )N )join)r'   tokensr	   r	   r   convert_tokens_to_string   s   
z(CanineTokenizer.convert_tokens_to_stringNtoken_ids_0token_ids_1c                 C   s4   | j g}| jg}|| | }|dur||| 7 }|S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CANINE sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)sep_token_idcls_token_id)r'   rD   rE   sepclsresultr	   r	   r    build_inputs_with_special_tokens   s   z0CanineTokenizer.build_inputs_with_special_tokensalready_has_special_tokensc                    sT   |rt  j||ddS dgdgt|  dg }|dur(|dgt| dg 7 }|S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rD   rE   rL      r   N)r%   get_special_tokens_maskr#   )r'   rD   rE   rL   rJ   r)   r	   r   rN      s   z'CanineTokenizer.get_special_tokens_masksave_directoryfilename_prefixc                 C   s   dS )Nr	   r	   )r'   rO   rP   r	   r	   r   save_vocabulary   s   zCanineTokenizer.save_vocabularyr,   )NF)__name__
__module____qualname____doc__r.   CLSSEPPADMASKr&   propertyintr-   r4   r   r6   r7   r=   r?   rC   r   rK   boolrN   rQ   __classcell__r	   r	   r)   r   r   :   sJ    .


 r   N)rU   typingr   tokenization_utilsr   r   utilsr   
get_loggerrR   loggerr!   rX   rV   rW   BOSrY   RESERVEDr   dictr[   r   __annotations__r   r   r   __all__r	   r	   r	   r   <module>   s.   

" 
