o
    ei,                     @   s   U d Z ddlmZmZ ddlmZ eeZdZ	dZ
dZdZdZd	Zd
Zedededede
dediZeeef ed< dd e D Zeeef ed< G dd deZdgZdS )z Tokenization classes for CANINE.   )
AddedTokenPreTrainedTokenizer)loggingi       i   i  i  i  i  z[CLS]z[SEP]z[BOS]z[MASK]z[PAD]z
[RESERVED]SPECIAL_CODEPOINTSc                 C      i | ]\}}||qS  r   ).0	codepointnamer   r   l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/canine/tokenization_canine.py
<dictcomp>4       r   SPECIAL_CODEPOINTS_BY_NAMEc                       s   e Zd ZdZg dZeeeeeeeeeeee	ddf fdd	Z
edefdd	Zd
d Zdedee fddZdedefddZdedefddZdd Z  ZS )CanineTokenizera  
    Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
    converts each character into its Unicode code point.

    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

    Args:
        model_max_length (`int`, *optional*, defaults to 2048):
                The maximum sentence length the model accepts.
    )	input_idsattention_masktoken_type_idsFi   c	                    s   t |trt|dddn|}t |trt|dddn|}t |tr(t|dddn|}t |tr6t|dddn|}t |trDt|dddn|}t |trRt|dddn|}i | _t D ]	\}
}|
| j|< q[dd | j D | _t| _t	| j| _
t jd	||||||||dddd|	 d S )
NF)lstriprstripTc                 S   r   r   r   )r	   r   r
   r   r   r   r   b   s    
z,CanineTokenizer.__init__.<locals>.<dictcomp>	all_zeroscls_sep)	bos_token	eos_token	sep_token	cls_token	pad_token
mask_tokenadd_prefix_spacemodel_max_lengthtoken_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_patternr   )
isinstancestrr   _special_codepointsr   items_special_codepoint_stringsUNICODE_VOCAB_SIZE_unicode_vocab_sizelen_num_special_tokenssuper__init__)selfr   r   r   r   r   r   r   r   kwargsr
   r   	__class__r   r   r-   G   s:   
zCanineTokenizer.__init__returnc                 C   s   | j S )N)r)   )r.   r   r   r   
vocab_sizex   s   zCanineTokenizer.vocab_sizec                 C   s$   dd t | jD }|| j |S )Nc                 S   s   i | ]}t ||qS r   )chr)r	   ir   r   r   r   }   r   z-CanineTokenizer.get_vocab.<locals>.<dictcomp>)ranger3   updateadded_tokens_encoder)r.   vocabr   r   r   	get_vocab|   s   zCanineTokenizer.get_vocabtextc                 C   s   t |S )z5Tokenize a string (i.e. perform character splitting).)list)r.   r;   r   r   r   	_tokenize   s   zCanineTokenizer._tokenizetokenc                 C   s*   zt |W S  ty   td| dw )zaConverts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).zinvalid token: '')ord	TypeError
ValueError)r.   r>   r   r   r   _convert_token_to_id   s
   
z$CanineTokenizer._convert_token_to_idindexc                 C   s:   z|t v r
t | W S t|W S  ty   td| w )z
        Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to
        human-readable format.
        zinvalid id: )r   r4   rA   rB   )r.   rD   r   r   r   _convert_id_to_token   s   

z$CanineTokenizer._convert_id_to_tokenc                 C   s
   d |S )N )join)r.   tokensr   r   r   convert_tokens_to_string   s   
z(CanineTokenizer.convert_tokens_to_string)__name__
__module____qualname____doc__model_input_namesr4   CLSSEPPADMASKr-   propertyintr3   r:   r$   r<   r=   rC   rE   rI   __classcell__r   r   r0   r   r   7   s&    1r   N)rM   tokenization_pythonr   r   utilsr   
get_loggerrJ   loggerr(   rQ   rO   rP   BOSrR   RESERVEDr   dictrT   r$   __annotations__r&   r   r   __all__r   r   r   r   <module>   s*   

"
e