"""Tokenization classes for Qwen2."""

from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers
from tokenizers.models import BPE

from ...tokenization_utils_tokenizers import TokenizersBackend
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_file": "tokenizer.json",
}

MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}

PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
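# Note on PRETOKENIZE_REGEX (breakdown of the alternation above):
#   (?i:'s|'t|'re|'ve|'m|'ll|'d)   common English contractions, kept as single pieces
#   [^\r\n\p{L}\p{N}]?\p{L}+       a run of letters, optionally preceded by one symbol/space
#   \p{N}                          each digit as its own piece
#    ?[^\s\p{L}\p{N}]+[\r\n]*      punctuation runs, optionally space-led
#   \s*[\r\n]+ / \s+(?!\S) / \s+   newline and remaining-whitespace handling
# Illustrative example (an assumption for exposition, not from the original file):
#   "Hello world's 123" pre-tokenizes to "Hello", " world", "'s", " ", "1", "2", "3"
# before the byte-level mapping is applied.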
class Qwen2Tokenizer(TokenizersBackend):
    """Qwen2 tokenizer backed by the `tokenizers` library, using byte-level Byte-Pair-Encoding."""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    model = BPE

    def __init__(
        self,
        vocab: str | dict[str, int] | None = None,
        merges: str | list[str] | None = None,
        unk_token: str = "<|endoftext|>",
        bos_token=None,
        eos_token: str = "<|endoftext|>",
        pad_token: str = "<|endoftext|>",
        add_prefix_space=None,
        **kwargs,
    ):
        self.add_prefix_space = add_prefix_space if add_prefix_space is not None else False
        # Fall back to a minimal one-token vocabulary when none is provided.
        self._vocab = vocab if vocab is not None else {"<|endoftext|>": 0}
        self._merges = merges or []

        # Byte-level BPE model: no dropout, no in-model unk token, no byte fallback.
        self._tokenizer = Tokenizer(
            BPE(
                self._vocab,
                self._merges,
                dropout=None,
                unk_token=None,
                continuing_subword_prefix="",
                end_of_word_suffix="",
                fuse_unk=False,
                byte_fallback=False,
            )
        )
        self._tokenizer.decoder = decoders.ByteLevel()
        self._tokenizer.normalizer = normalizers.NFC()
        # Split on the custom pattern first, then apply the byte-level mapping
        # (ByteLevel's own regex is disabled so PRETOKENIZE_REGEX is authoritative).
        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.Split(Regex(PRETOKENIZE_REGEX), behavior="isolated", invert=False),
                pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False),
            ]
        )

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
        # Register every special token as an AddedToken so it is never split by the model.
        self.add_tokens([AddedToken(token, special=True) for token in self.all_special_tokens])
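# Usage sketch (illustrative only; the checkpoint name below is an assumption,
# not something defined in this module):
#
#     from transformers import Qwen2Tokenizer
#
#     tokenizer = Qwen2Tokenizer.from_pretrained("qwen/qwen-tokenizer")
#     ids = tokenizer("Hello world")["input_ids"]
#     text = tokenizer.decode(ids)
#
# As with GPT-2-style byte-level BPE, a leading space changes the encoding:
# "Hello" and " Hello" map to different token ids.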
__all__ = ["Qwen2Tokenizer"]