o
    ߥi"                     @   s   d Z ddlmZmZmZmZ ddlZddlZddlZddl	Z	ddl
mZ ddlmZmZmZmZ ddlZddlZddlmZmZ ddlmZ e Zdd	iZG d
d deZdS )zTokenization classes for QWen.    )absolute_importdivisionprint_functionunicode_literalsN)open)ListOptionalTupleUnion)
AddedTokenPreTrainedTokenizer)
get_logger
vocab_fileqwen.tiktokenc                       s  e Zd ZdZ	 eZ									d+ fdd	Zd	d
 Zdd Zdd Z	de
dee
 fddZde
dee
 fddZdee
 de
fddZedd Zdede
fddZde
defdd Zedee
 fd!d"Zedee fd#d$Zd%d& Z	d,d'eeee f d(ede
fd)d*Z  ZS )-QWenTokenizerzQWen tokenizer.replaceN<|endoftext|>FTc              	      s  t |trt|dddn|}t |trt|dddn|}t |tr(t|dddn|}t |tr6t|dddn|}t j|||||||	d |	| _|d urN|ntd| _|| _d}d}d}d}|
rv|||d	d
dddft	dd t
dD  }n|||f}d}dtddfdd}||}dd t|t|dD }|| _tj||||d}t|t| |jksJ t|t|  d|j d|| _| j| _dd | j D | _|| _| jj| _|| | _|| | _d S )NF)lstriprstrip)errors	unk_token	bos_token	eos_token	pad_tokenadd_prefix_spaceadd_bos_tokeng   mBQwenr   z<|im_start|>z
<|im_end|>z<R>z<S>z<X>z<mask>z<sep>c                 S   s   g | ]}d | dqS )z<extra_> .0ir   r   [/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/qwen/tokenization.py
<listcomp>W   s    z*QWenTokenizer.__init__.<locals>.<listcomp>   zn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+tiktoken_bpe_filereturnzdict[bytes, int]c                 S   s*   t | d }dd dd | D D S )Nrbc                 S   s    i | ]\}}t |t|qS r   )base64	b64decodeint)r    tokenrankr   r   r"   
<dictcomp>a   s    zEQWenTokenizer.__init__.<locals>.load_tiktoken_bpe.<locals>.<dictcomp>c                 s   s    | ]	}|r|  V  qd S N)split)r    liner   r   r"   	<genexpr>c   s    zDQWenTokenizer.__init__.<locals>.load_tiktoken_bpe.<locals>.<genexpr>)r   read
splitlines)r%   contentsr   r   r"   load_tiktoken_bpe_   s   z1QWenTokenizer.__init__.<locals>.load_tiktoken_bpec                 S      i | ]\}}||qS r   r   )r    indexr+   r   r   r"   r-   h   s    z*QWenTokenizer.__init__.<locals>.<dictcomp>)start)pat_strmergeable_ranksspecial_tokensz != z in encodingc                 S   r6   r   r   )r    kvr   r   r"   r-   z       )
isinstancestrr   super__init__r   r*   max_lenr   tuplerange	enumeratelenr;   tiktokenEncodingn_vocabr:   encoderitemsdecoder	tokenizer	eot_tokeneod_idim_start_id	im_end_id)selfr   r   rC   r   r   r   r   r   r   add_more_sp_tokenskwargsname	ENDOFTEXTIMSTARTIMENDr;   PAT_STRr5   r:   enc	__class__r   r"   rB   !   s   		


zQWenTokenizer.__init__c                 C      | j jS r.   rN   rJ   rS   r   r   r"   __len__   s   zQWenTokenizer.__len__c                 C   s   | j S r.   )r:   r`   r   r   r"   	get_vocab   s   zQWenTokenizer.get_vocabc                 C   s   g }t |tr|| jv r| j| S | j|S |D ]}|| jv r)|| j|  q|| j| qt|| jkrFt	d
t|| j |S )NzToken indices sequence length is longer than the specified maximum  sequence length for this model ({} > {}). Running this sequence through the model will result in indexing errors)r?   r@   r;   rK   getappendrG   rC   loggerwarningformat)rS   tokensidsr+   r   r   r"   convert_tokens_to_ids   s   



z#QWenTokenizer.convert_tokens_to_idssave_directoryr&   c                 K   s   t j|d}t|ddd)}| j D ]\}}t|dd t	| d }|
| qW d   |fS 1 s:w   Y  |fS )z
        Save only the vocabulary of the tokenizer (vocabulary + added tokens).

        Returns:
            `Tuple(str)`: Paths to the files saved.
        r   wutf8)encoding 
N)ospathjoinr   r:   rL   r(   	b64encodedecoder@   write)rS   rk   rU   	file_pathrl   r<   r=   r0   r   r   r"   save_vocabulary   s    
zQWenTokenizer.save_vocabularytextc                 K   s6   g }t d|}| j|D ]
}|| j|  q|S )a  
        Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`.

        Args:
            text (`str`):
                The sequence to be encoded.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific encode method. See details in
                [`~PreTrainedTokenizerBase.__call__`]

        Returns:
            `List[str]`: The list of tokens.
        NFC)unicodedata	normalizerN   encode_ordinaryrd   rM   )rS   ry   rU   rh   tr   r   r"   tokenize   s
   zQWenTokenizer.tokenizerh   c                    s0   d |}t fdd|D jd jd}|S )z
        Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
        often want to remove sub-word tokenization artifacts at the same time.
         c                    s   g | ]} j | qS r   )byte_decoder)r    cr`   r   r"   r#      r>   z:QWenTokenizer.convert_tokens_to_string.<locals>.<listcomp>zutf-8)r   )rs   	bytearrayru   r   )rS   rh   ry   r   r`   r"   convert_tokens_to_string   s
   
z&QWenTokenizer.convert_tokens_to_stringc                 C   r^   r.   r_   r`   r   r   r"   
vocab_size   s   zQWenTokenizer.vocab_sizer7   c                 C   s    || j jkr	| jS | j |gS r.   )rN   rJ   r   ru   )rS   r7   r   r   r"   _convert_id_to_token   s   z"QWenTokenizer._convert_id_to_tokenr+   c                 C   s&   | j |d| jj| jddd S )z*Converts a token to an id using the vocab.zUTF-8all)allowed_specialr   )rK   rc   encoderN   r   )rS   r+   r   r   r"   _convert_token_to_id   s   z"QWenTokenizer._convert_token_to_idc                 C      dd | j  D }|S )z
        `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.

        Convert tokens of `tokenizers.AddedToken` type to string.
        c                 S   s   g | ]}t |qS r   )r@   )r    sr   r   r"   r#      s    z4QWenTokenizer.all_special_tokens.<locals>.<listcomp>)r;   keys)rS   all_toksr   r   r"   all_special_tokens   s   z QWenTokenizer.all_special_tokensc                 C   r   )zy
        `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
        c                 S   s   g | ]}|qS r   r   )r    r=   r   r   r"   r#      s    z1QWenTokenizer.all_special_ids.<locals>.<listcomp>)r;   values)rS   all_idsr   r   r"   all_special_ids   s   zQWenTokenizer.all_special_idsc                 K   s   t )a  
        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.
        )NotImplementedError)rS   ry   rU   r   r   r"   	_tokenize   s   zQWenTokenizer._tokenize	token_idsskip_special_tokensc                    s2   t |tr|g}|r fdd|D } j|S )Nc                    s   g | ]	}| j vr|qS r   )r   r   r`   r   r"   r#      s    z)QWenTokenizer._decode.<locals>.<listcomp>)r?   r*   rN   ru   )rS   r   r   rU   r   r`   r"   _decode   s
   
zQWenTokenizer._decode)	r   Nr   r   r   NFFT)F)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesrB   ra   rb   rj   r@   r	   rx   r   r   r   propertyr   r*   r   r   r   r   r   r
   boolr   __classcell__r   r   r\   r"   r      sJ    _

	r   )r   
__future__r   r   r   r   r(   loggingrq   r{   ior   typingr   r   r	   r
   jsonrH   transformersr   r   modelscope.utils.loggerr   re   r   r   r   r   r   r"   <module>   s   