o
    eiA                     @   st   d Z ddlmZmZmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZ eeZddiZG d	d
 d
eZd
gZdS )zTokenization classes for XGLM.    )Regex	Tokenizerdecodersnormalizerspre_tokenizers
processors)Unigram   )TokenizersBackend)loggingtokenizer_fileztokenizer.jsonc                       sx   e Zd ZdZeZddgZeZ									dd
e	e
ee	ef  B dB de	de	de	de	de	de	def fddZ  ZS )XGLMTokenizeraW  
    Construct a XGLM tokenizer (backed by HuggingFace's tokenizers library). Based on BPE.

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        tokenizer_file (`str`, *optional*):
            Path to a tokenizers JSON file containing the serialization of a tokenizer.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        vocab (`str`, `dict` or `list`, *optional*):
            Custom vocabulary dictionary. If not provided, a minimal vocabulary is created.
        merges (`list[tuple[str, str]]`, *optional*):
            Custom merge rules for BPE. If not provided, merges are generated from the vocabulary.
        add_prefix_space (`bool`, *optional*, defaults to `True`):
            Whether to add a prefix space before encoding.
    	input_idsattention_maskN<s></s><unk><pad>Tvocab	bos_token	eos_token	sep_token	cls_token	unk_token	pad_tokenadd_prefix_spacec	              
      s  d| _ dd t| j D }
 dg pg  d<  d   fdd|
D 7  < || _|d ur1|| _nt|dft|dft|dft|dfg| _tt| jddd	| _t	
t	td
dt	 t	tddg| j_|rodnd}tjd|d| j_tjd|d| j_t jd|||||||d  tj| j d| j | j d| j d| j d| j | j| jf| j| jfgd| j_d S )N   c                 S   s   g | ]}d | dqS )z<madeupword> ).0ir   r   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/xglm/tokenization_xglm.py
<listcomp>K   s    z*XGLMTokenizer.__init__.<locals>.<listcomp>additional_special_tokensc                    s   g | ]
}| d  vr|qS )r#   r   )r   wordkwargsr   r!   r"   M   s    g        r	   F)r   unk_idbyte_fallbackz[\n\r\t] z {2,}alwaysneveru   ▁)replacementprepend_scheme)r   r   r   r   r   r   r   z $A z $B )singlepairspecial_tokensr   )num_madeup_wordsrangegetr   _vocabstrr   r   
_tokenizerr   SequenceReplacer   NFKC
normalizerr   	Metaspacepre_tokenizerr   decodersuper__init__r   TemplateProcessingr   r   bos_token_ideos_token_idpost_processor)selfr   r   r   r   r   r   r   r   r&   madeup_wordsr-   	__class__r%   r!   r?   >   sT   




 

zXGLMTokenizer.__init__)Nr   r   r   r   r   r   T)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr5   listtuplefloatboolr?   __classcell__r   r   rF   r!   r      s<    	r   N)rK   
tokenizersr   r   r   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr
   utilsr   
get_loggerrH   loggerrL   r   __all__r   r   r   r!   <module>   s    

b