o
    c۷iS                  	   @   sN  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlZd dlZd dlmZmZ d dlmZmZmZmZ d dlmZ d d	lmZ ejd
edd eeZde e	B de!fddZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&G dd deZ'	dde(e# de)dB de*e+e)f fddZ,dS )     N)cached_property)groupby)Path)	TypedDict)AudioConfigAudioSpectrogramConfig)SpecialTokenPolicySpecialTokens	TokenizerTokenizerVersion)ImageConfig)ModelSettingsBuilderoncez%.*`get_control_token` is deprecated.*)actioncategorymessagepathreturnc                 C   s.   t | tr	t| } |  od| jv o| jdkS )z3Check if the given path is a tekken tokenizer file.tekken.json)
isinstancestrr   is_filenamesuffix)r    r   ]/home/ubuntu/vllm_env/lib/python3.10/site-packages/mistral_common/tokens/tokenizers/tekken.py	is_tekken!   s   
r   c                   @   s.   e Zd ZU dZeed< eed< edB ed< dS )	TokenInfozToken information in the JSON file.

    Attributes:
        rank: The rank of the token.
        token_bytes: The token in bytes, base64 encoded.
        token_str: The token in string format.
    ranktoken_bytesN	token_str)__name__
__module____qualname____doc__int__annotations__r   r   r   r   r   r   )   s
   
 r   c                   @   s*   e Zd ZU dZeed< eed< eed< dS )SpecialTokenInfozSpecial token information in the JSON file.

    Attributes:
        rank: The rank of the token.
        token_str: The token in string format.
        is_control: Whether the token is a control token.
    r   r!   
is_controlN)r"   r#   r$   r%   r&   r'   r   boolr   r   r   r   r(   7   s
   
 r(   c                   @   s:   e Zd ZU dZeed< eed< eed< eed< eed< dS )TekkenConfigaX  Tekken configuration in the JSON file.

    Attributes:
        pattern: The pattern of the tokenizer.
        num_vocab_tokens: The number of vocabulary tokens.
        default_vocab_size: The default vocabulary size.
        default_num_special_tokens: The default number of special tokens.
        version: The version of the tokenizer.
    patternnum_vocab_tokensdefault_vocab_sizedefault_num_special_tokensversionN)r"   r#   r$   r%   r   r'   r&   r   r   r   r   r+   E   s   
 
r+   c                   @   sV   e Zd ZU dZee ed< ee dB ed< eed< e	ed< e
ed< eed< eed	< dS )
	ModelDataa2  The data of the tekken tokenizer model.

    Attributes:
        vocab: The vocabulary of the tokenizer.
        config: The configuration of the tokenizer.
        version: The version of the tokenizer.
        type: The type of the tokenizer.
        image: The image configuration of the tokenizer.
    vocabNspecial_tokensconfigr0   typeimageaudio)r"   r#   r$   r%   listr   r'   r(   r+   r&   r   r   r   r   r   r   r   r1   W   s   
 
r1   c                   @   s  e Zd ZdZedejddedejddedejddedej	ddedej
dded	ejdded
ejddedejddedejddedejddedejddedejddedejddedejddedejddedejddedejddedejddedejddedejddfZdZdddddddee dee deded ed!e d"ed#ee!B dB d$e"dB d%e#dB d&e$dB fd'd(Z%e&d)e!fd*d+Z'e&d)e$dB fd,d-Z(e)d.e*d  d/ee!B d)d fd0d1Z+e&d)e"dB fd2d3Z,e,j-d4e"d)dfd5d3Z,e&d)e#dB fd6d7Z.e.j-d4e#d)dfd8d7Z.e&d)efd9d:Z/e&d)efd;d<Z0e1d)e2e fd=d>Z3e&d)e fd?d@Z4e1d)efdAdBZ5e1d)efdCdDZ6e1d)efdEdFZ7e1d)efdGdHZ8d)ee fdIdJZ9dKedLe:dMe:d)ee fdNdOZ;dPee dQe<d)ee fdRdSZ=dTed)e:fdUdVZ>dKed)efdWdXZ?dYee@jAB eB d)e:fdZd[ZBdKed)efd\d]ZCe<jDfdPee dQe<d)efd^d_ZEdPee d)efd`daZFdTed)efdbdcZGe<jDfdTedQe<d)eHfdddeZIdS )f
TekkenizerzTekken tokenizer.

    This tokenizer is based on the [tiktoken](https://github.com/openai/tiktoken) library. It fastens the tokenization
    for multiple languages.
    r   Tr   r!   r)                           	   
                              z<SPECIAL_{id}>
tekkenizerN)r   _pathimage_configaudio_configmodel_settings_builderr2   r3   r,   
vocab_sizenum_special_tokensr0   r   rO   rP   rQ   rR   c                   s  |j s|durtd|d||t|| ks"J |t||f| _ttdd |D }t||ks=J d| t||ksEJ  fddtt||D }|rhtd|d	 d
  d|d d
   || }ttdd |D t|  kr|ksJ | J ||| }td| d| d t||d _	tt|t j	
 ksJ | j	ftj|| j	i d _| _|	 _|
 _| _dd |D  _dd |D  _ fddt|D  _tj _|durt|nd _| _dS )a  Initialize the tekken tokenizer.

        Args:
            vocab: The vocabulary of the tokenizer.
            special_tokens: The special tokens of the tokenizer.
            pattern: The pattern of the tokenizer.
            vocab_size: The vocabulary size of the tokenizer.
            num_special_tokens: The number of special tokens of the tokenizer.
            version: The version of the tokenizer.
            name: The name of the tokenizer.
            image_config: The image configuration of the tokenizer.
            audio_config: The audio configuration of the tokenizer.
            model_settings_builder: The builder for model settings, or None if unsupported.
        N4model_settings_builder is not supported for version=  but got model_settings_builder=c                 S      g | ]}|d  qS r!   r   .0tr   r   r   
<listcomp>       z'Tekkenizer.__init__.<locals>.<listcomp>zSpecial tokens must be unique: c                    s$   g | ]}t | jj|d ddqS ))idTr:   )r(   SPECIAL_TOKEN_TEMPLATEformatrZ   iselfr   r   r\      s    zAdding special tokens r   r!   z, ..., c                 S   rW   rX   r   rY   r   r   r   r\      r]   zNon special vocabulary size is z with z special tokens.)	max_vocab)r   pat_strmergeable_ranksr3   c                 S      h | ]}|d  qS r   r   rY   r   r   r   	<setcomp>   r]   z&Tekkenizer.__init__.<locals>.<setcomp>c                 S   s   i | ]	}|d  |d qS )r!   r   r   rY   r   r   r   
<dictcomp>   s    z'Tekkenizer.__init__.<locals>.<dictcomp>c                    s   g | ]}  |qS r   )id_to_piecera   rc   r   r   r\          )supports_model_settings
ValueErrorlen_vocab_sizesetrangeloggerinfo_reload_mergeable_ranks_tekken_token2id_nospecialvaluestiktokenEncoding_model_version_image_config_audio_config_all_special_tokens_special_token_ids_special_tokens_reverse_vocab_vocabr   IGNORE_special_token_policyr   
_file_path_model_settings_builder)rd   r2   r3   r,   rS   rT   r0   r   rO   rP   rQ   rR   num_defined_special_tokensspecial_fillerinner_vocab_sizer   rc   r   __init__   sb   
*
zTekkenizer.__init__r   c                 C   s   | j du r	td| j S )zThe path to the tokenizer file.Nz)The tokenizer was not loaded from a file.)r   rp   rc   r   r   r   	file_path   s   
zTekkenizer.file_pathc                 C      | j S )zCThe model settings builder, or None if unsupported by this version.)r   rc   r   r   r   rR         z!Tekkenizer.model_settings_builderclsr   c                 C   s(  t |tr	t|}| sJ |t|ddd}t|}W d   n1 s(w   Y  |d d}|tj	vrIt
d| d| d	ttj	 |dusOJ t|}|d
d}|du rp|tjkrjt
d| dttj}ndd |D }||d
< |d }r|tjkrt
d| d|j dtd!i ||d< n|d }	rtd!i |	|d< |d }
r|
d}td!i |}td!d|i|
|d< |d }dur|jst
d|d||durt|}|}| |d ||d d |d d |d d ||jdd|d|d||d S )"zLoad the tekken tokenizer from a file.

        Args:
            path: The path to the tokenizer file.

        Returns:
            The tekken tokenizer.
        rutf-8)encodingNr4   r0   zUnknown version: z in z+. Make sure to use a valid version string: r3   zSpecial tokens not found in zL. Please update your tokenizer file and include all special tokens you need.c                 S   s   g | ]}|qS r   r   rZ   tokenr   r   r   r\     s    z(Tekkenizer.from_file.<locals>.<listcomp>
multimodalz-The image config has to be called 'image' in z for tokenizers of version .r6   r7   audio_encoding_configencoding_configrR   rU   rV   r2   r,   r.   r/   r    )r2   r3   r,   rS   rT   r0   r   rP   rQ   rR   rO   r   )r   r   r   existsopenjsonloadgetr   __members__rp   r8   v7r9   DEPRECATED_SPECIAL_TOKENSv11valuer   popr   r   ro   r   model_validater   replace)r   r   funtyped_version_strr0   special_tokens_dictsr3   mmr6   r7   r   rR   
model_datar   r   r   	from_file   sx   










zTekkenizer.from_filec                 C   r   )z)The image configuration of the tokenizer.)r~   rc   r   r   r   r6   D  r   zTekkenizer.imager   c                 C      t d)Nz!Can only set Image config at initrp   rd   r   r   r   r   r6   I     c                 C   r   )zThe audio configuration of the tokenizer.

        Returns:
             The audio configuration object if it exists, otherwise None.
        )r   rc   r   r   r   r7   M  s   zTekkenizer.audioc                 C   r   )Nz!Can only set Audio config at initr   r   r   r   r   r7   V  r   c                 C   s
   t | jS )z.The number of special tokens of the tokenizer.)rq   r   rc   r   r   r   rT   Z     
zTekkenizer.num_special_tokensc                 C   r   )z!Vocabulary size of the tokenizer.)rr   rc   r   r   r   n_words_  r   zTekkenizer.n_wordsc                 C   s   dd | j D S )zIds of the special tokens.c                 S   ri   rj   r   r   r   r   r   rk   g  r]   z)Tekkenizer.special_ids.<locals>.<setcomp>r   rc   r   r   r   special_idsd  s   zTekkenizer.special_idsc                 C   r   )zThe version of the tokenizer.)r}   rc   r   r   r   r0   i  r   zTekkenizer.versionc                 C   
   |  dS )z#The beginning of sentence token id.z<s>get_special_tokenrc   r   r   r   bos_idn  r   zTekkenizer.bos_idc                 C   r   )zThe end of sentence token id.z</s>r   rc   r   r   r   eos_ids  r   zTekkenizer.eos_idc                 C   r   )zThe padding token id.z<pad>r   rc   r   r   r   pad_idx  r   zTekkenizer.pad_idc                 C   r   )zThe unknown token id.z<unk>r   rc   r   r   r   unk_id}  r   zTekkenizer.unk_idc                 C   r   )a6  Get all tokens in the vocabulary as strings.

        Note:
           This will collapse all tokens for which we have a decoding error into
           the <?> string. This is bad and results in things like len(set(vocab)) != len(vocab)).

        Returns:
            The vocabulary of the tokenizer.
        )r   rc   r   r   r   r2     s   zTekkenizer.vocabsboseosc                    sD    j |} fdd|D }|r jg|}|r g | j}|S )a  Encode a string into a list of token ids.

        Args:
            s: The string to encode.
            bos: Whether to add the beginning of sentence token.
            eos: Whether to add the end of sentence token.

        Returns:
            The list of token ids.
        c                    s   g | ]}| j  qS r   rT   rY   rc   r   r   r\     rn   z%Tekkenizer.encode.<locals>.<listcomp>)r|   encoder   r   )rd   r   r   r   tokensr   rc   r   r     s   zTekkenizer.encoder   special_token_policyc                    s   g }t | fddD ]>\}}|r9|tjkr tdt| d|tjkr2| fdd|D  q|tjkr8qq| j	
 fdd|D  q|S )	Nc                    s
   |  j k S Nr   )r[   rc   r   r   <lambda>  s   
 z(Tekkenizer._decode_all.<locals>.<lambda>z/Decoding `tokens` that contain special tokens (a  ) is not allowed. 
Either make sure `tokens` do not include any special tokens or, if you want to decode `tokens` that includes special tokens, change the tokenizer's special token policy to IGNORE or KEEP: 
```
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.tekken import SpecialTokenPolicy

tokenizer = MistralTokenizer.v3(is_tekken=True)
tekken = tokenizer.instruct_tokenizer.tokenizer
tekken.special_token_policy = SpecialTokenPolicy.IGNORE  # or SpecialTokenPolicy.KEEP
```c                 3   s    | ]
} j | d  V  qdS )r!   Nr   rY   rc   r   r   	<genexpr>  s    z)Tekkenizer._decode_all.<locals>.<genexpr>c                    s   g | ]}| j  qS r   r   rY   rc   r   r   r\     rn   z*Tekkenizer._decode_all.<locals>.<listcomp>)r   r   RAISErp   r8   KEEPextendr   appendr|   decode)rd   r   r   decoded
is_specialgroupr   rc   r   _decode_all  s   


"zTekkenizer._decode_alltoken_idc                 C   s   d|| j    kodk S   S )z$Check if a token id is a byte token.r      r   rd   r   r   r   r   is_byte  s   zTekkenizer.is_bytec                 C   s"   || j v r
| j | S td| )z$Get the token id of a special token.zUnknown control token )r   rp   rd   r   r   r   r   r     s   

zTekkenizer.get_special_tokenr   c                 C   sB   t |ttjfr|| jv S t |tr|| jv S tdt|j	 )z7Return `True` if the passed `token` is a special token.zExpected int or str, got )
r   r&   npintegerr   r   r   	TypeErrorr5   r"   )rd   r   r   r   r   r     s
   


zTekkenizer.is_specialc                 C   s   t dt | |S )NzC`get_control_token` is deprecated. Use `get_special_token` instead.)warningswarnFutureWarningr   r   r   r   r   get_control_token  s   
zTekkenizer.get_control_tokenc                 C   s6   t |ttfstdt| dd| j||dS )zDecode a list of token ids into a string.

        Args:
            tokens: The list of token ids to decode.
            special_token_policy: The policy for handling special tokens.

        Returns:
            The decoded string.
        z>Expected `special_token_policy` to be SpecialTokenPolicy, got r   r   r   )r   r   r   rp   r5   joinr   )rd   r   r   r   r   r   r     s
   
zTekkenizer.decodec                 C   s   | j |tjdS )Nr   r   r   r   )rd   r   r   r   r   
_to_string  s   zTekkenizer._to_stringc                 C   s   | j |gtjdS )z0Convert a token id to its string representation.r   r   r   r   r   r   rm     s   zTekkenizer.id_to_piecec                 C   sn   || j k r.|tjkr| j| d dS |tjkr t| d|tjkr'dS td| | j	|| j  S )a  Convert a token id to its byte representation.

        Args:
            token_id: The token id to convert.
            special_token_policy: The policy for handling special tokens.

        Returns:
            The byte representation of the token.
        r!   r   z is a special token    zUnknown special token policy )
rT   r   r   r   r   r   rp   r   r|   decode_single_token_bytes)rd   r   r   r   r   r   id_to_byte_piece  s   



zTekkenizer.id_to_byte_piece)Jr"   r#   r$   r%   r(   r	   unkr   r   
begin_instend_instbegin_tools	end_toolsbegin_tool_resultsend_tool_results
tool_callsimgpad	img_breakimg_endprefixmiddler   begin_system
end_systembegin_tool_contentr   r_   r8   r   r   r&   r   r   r   r   r   r   propertyr   rR   classmethodr5   r   r6   setterr7   rT   r   r   rs   r   r0   r   r   r   r   r2   r*   r   r   r   r   r   r   r   r   r   r   r   r   rm   bytesr   r   r   r   r   r9   k   s    	


Y Q	 r9   r2   rf   c                 C   s   |dur)t | |ksJ t | |ft | |kr)| d| } tdt |  d i }t| D ]4\}}| h dks=J |d |ksEJ t|d }|dks]|t|gks]J ||f|d ||< q/t |t | ksnJ t|	 tt
t |ks~J |S )zAReload our tokenizer JSON file and convert it to Tiktoken format.Nz(Cutting non special vocabulary to first z tokens.>   r   r!   r    r   r    r   )rq   ru   rv   	enumeratekeysbase64	b64decoder   rs   ry   rt   )r2   rf   ranksrb   xmerger   r   r   rw     s   " rw   r   )-r   r   loggingr   	functoolsr   	itertoolsr   pathlibr   typingr   numpyr   rz   &mistral_common.tokens.tokenizers.audior   r   %mistral_common.tokens.tokenizers.baser   r	   r
   r   &mistral_common.tokens.tokenizers.imager   7mistral_common.tokens.tokenizers.model_settings_builderr   filterwarningsr   	getLoggerr"   ru   r   r*   r   r   r(   r+   r1   r9   r8   r&   dictr   rw   r   r   r   r   <module>   sJ    
   %
