o
    *i_Z                  	   @   sB  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlZd dlZd dlmZmZ d dlmZmZmZmZ d dlmZ ejd	ed
d eeZdee	B defddZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%	dde&e! de'dB de(e)e'f fddZ*dS )    N)cached_property)groupby)Path)	TypedDict)AudioConfigAudioSpectrogramConfig)SpecialTokenPolicySpecialTokens	TokenizerTokenizerVersion)ImageConfigoncez%.*`get_control_token` is deprecated.*)actioncategorymessagepathreturnc                 C   s.   t | tr	t| } |  od| jv o| jdkS )z3Check if the given path is a tekken tokenizer file.tekken.json)
isinstancestrr   is_filenamesuffix)r    r   d/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/mistral_common/tokens/tokenizers/tekken.py	is_tekken    s   
r   c                   @   s.   e Zd ZU dZeed< eed< edB ed< dS )	TokenInfozToken information in the JSON file.

    Attributes:
        rank: The rank of the token.
        token_bytes: The token in bytes, base64 encoded.
        token_str: The token in string format.
    ranktoken_bytesN	token_str)__name__
__module____qualname____doc__int__annotations__r   r   r   r   r   r   (   s
   
 r   c                   @   s*   e Zd ZU dZeed< eed< eed< dS )SpecialTokenInfozSpecial token information in the JSON file.

    Attributes:
        rank: The rank of the token.
        token_str: The token in string format.
        is_control: Whether the token is a control token.
    r   r    
is_controlN)r!   r"   r#   r$   r%   r&   r   boolr   r   r   r   r'   6   s
   
 r'   c                   @   s:   e Zd ZU dZeed< eed< eed< eed< eed< dS )TekkenConfigaX  Tekken configuration in the JSON file.

    Attributes:
        pattern: The pattern of the tokenizer.
        num_vocab_tokens: The number of vocabulary tokens.
        default_vocab_size: The default vocabulary size.
        default_num_special_tokens: The default number of special tokens.
        version: The version of the tokenizer.
    patternnum_vocab_tokensdefault_vocab_sizedefault_num_special_tokensversionN)r!   r"   r#   r$   r   r&   r%   r   r   r   r   r*   D   s   
 
r*   c                   @   sV   e Zd ZU dZee ed< ee dB ed< eed< e	ed< e
ed< eed< eed	< dS )
	ModelDataa2  The data of the tekken tokenizer model.

    Attributes:
        vocab: The vocabulary of the tokenizer.
        config: The configuration of the tokenizer.
        version: The version of the tokenizer.
        type: The type of the tokenizer.
        image: The image configuration of the tokenizer.
    vocabNspecial_tokensconfigr/   typeimageaudio)r!   r"   r#   r$   listr   r&   r'   r*   r%   r   r   r   r   r   r   r   r0   V   s   
 
r0   c                   @   s  e Zd ZdZedejddedejddedejddedej	ddedej
dded	ejdded
ejddedejddedejddedejddedejddedejddedejddedejddedejddedejddedejddedejddedejddedejddfZdZddddddee dee deded ed!e d"ed#ee!B dB d$e"dB d%e#dB fd&d'Z$e%d(e!fd)d*Z&e'd+e(d  d,ee!B d(d fd-d.Z)e%d(e"dB fd/d0Z*e*j+d1e"d(dfd2d0Z*e%d(e#dB fd3d4Z,e,j+d1e#d(dfd5d4Z,e%d(efd6d7Z-e%d(efd8d9Z.e/d(e0e fd:d;Z1e%d(e fd<d=Z2e%d(e3fd>d?Z4e4j+d@e3d(dfdAd?Z4e/d(efdBdCZ5e/d(efdDdEZ6e/d(efdFdGZ7e/d(efdHdIZ8d(ee fdJdKZ9dLedMe:dNe:d(ee fdOdPZ;dQee dRe3d(ee fdSdTZ<dUed(e:fdVdWZ=dLed(efdXdYZ>dZee?j@B eB d(e:fd[d\ZAdLed(efd]d^ZBdidQee dRe3dB d(efd_d`ZCdQee d(efdadbZDdQee d(efdcddZEdUed(efdedfZFdidUedRe3dB d(eGfdgdhZHdS )j
TekkenizerzTekken tokenizer.

    This tokenizer is based on the [tiktoken](https://github.com/openai/tiktoken) library. It fastens the tokenization
    for multiple languages.
    r   Tr   r    r(                           	   
                              z<SPECIAL_{id}>
tekkenizerN)r   _pathimage_configaudio_configr1   r2   r+   
vocab_sizenum_special_tokensr/   r   rN   rO   rP   c                   s  |t || ksJ |t ||f| _t tdd |D }t ||ks,J d| t ||ks4J  fddtt ||D }|rWtd|d d  d|d	 d   || }t td
d |D t |  krs|ksxJ | J ||| }td| d| d t||d _tt|t j ksJ | jft	j
|| ji d _| _|	 _|
 _| _dd |D  _dd |D  _ fddt|D  _tj _|durt| _dS d _dS )a  Initialize the tekken tokenizer.

        Args:
            vocab: The vocabulary of the tokenizer.
            special_tokens: The special tokens of the tokenizer.
            pattern: The pattern of the tokenizer.
            vocab_size: The vocabulary size of the tokenizer.
            num_special_tokens: The number of special tokens of the tokenizer.
            version: The version of the tokenizer.
            name: The name of the tokenizer.
            image_config: The image configuration of the tokenizer.
        c                 S      g | ]}|d  qS r    r   .0tr   r   r   
<listcomp>       z'Tekkenizer.__init__.<locals>.<listcomp>zSpecial tokens must be unique: c                    s$   g | ]}t | jj|d ddqS ))idTr9   )r'   SPECIAL_TOKEN_TEMPLATEformatrV   iselfr   r   rX      s    zAdding special tokens r   r    z, ..., c                 S   rS   rT   r   rU   r   r   r   rX      rY   zNon special vocabulary size is z with z special tokens.)	max_vocab)r   pat_strmergeable_ranksr2   c                 S      h | ]}|d  qS r   r   rU   r   r   r   	<setcomp>   rY   z&Tekkenizer.__init__.<locals>.<setcomp>c                 S   s   i | ]	}|d  |d qS )r    r   r   rU   r   r   r   
<dictcomp>   s    z'Tekkenizer.__init__.<locals>.<dictcomp>c                    s   g | ]}  |qS r   )id_to_piecer]   r_   r   r   rX          N)len_vocab_sizesetrangeloggerinfo_reload_mergeable_ranks_tekken_token2id_nospecialvaluestiktokenEncoding_model_version_image_config_audio_config_all_special_tokens_special_token_ids_special_tokens_reverse_vocab_vocabr   IGNORE_special_token_policyr   
_file_path)r`   r1   r2   r+   rQ   rR   r/   r   rN   rO   rP   num_defined_special_tokensspecial_fillerinner_vocab_sizer   r_   r   __init__   sX   
* zTekkenizer.__init__r   c                 C   s   | j du r	td| j S )zThe path to the tokenizer file.Nz)The tokenizer was not loaded from a file.)r   
ValueErrorr_   r   r   r   	file_path   s   
zTekkenizer.file_pathclsr   c                 C   s  t |tr	t|}| sJ |t|ddd}t|}W d   n1 s(w   Y  |d d}|tj	vrIt
d| d| d	ttj	 |dusOJ t|}|d
d}|du rq|tdkrkt
d| dttj}ndd |D }||d
< |d }r|tdkrt
d| d|j dtd i ||d< n|d }	rtd i |	|d< |d }
r|
d}td i |}td d|i|
|d< |}| |d ||d d |d d |d d ||jdd|d|d|d
S )!zLoad the tekken tokenizer from a file.

        Args:
            path: The path to the tokenizer file.

        Returns:
            The tekken tokenizer.
        rutf-8)encodingNr3   r/   zUnknown version: z in z+. Make sure to use a valid version string: r2   v7zSpecial tokens not found in zL. Please update your tokenizer file and include all special tokens you need.c                 S   s   g | ]}|qS r   r   rV   tokenr   r   r   rX     s    z(Tekkenizer.from_file.<locals>.<listcomp>
multimodalv11z-The image config has to be called 'image' in z for tokenizers of version .r5   r6   audio_encoding_configencoding_configr1   r+   r-   r.   r    )
r1   r2   r+   rQ   rR   r/   r   rO   rP   rN   r   )r   r   r   existsopenjsonloadgetr   __members__r   r7   r8   DEPRECATED_SPECIAL_TOKENSvaluer   popr   r   r   replace)r   r   funtyped_version_strr/   special_tokens_dictsr2   mmr5   r6   r   
model_datar   r   r   	from_file   sb   







zTekkenizer.from_filec                 C      | j S )z)The image configuration of the tokenizer.)rx   r_   r   r   r   r5   +     zTekkenizer.imager   c                 C      t d)Nz!Can only set Image config at initr   r`   r   r   r   r   r5   0     c                 C   r   )zThe audio configuration of the tokenizer.

        Returns:
             The audio configuration object if it exists, otherwise None.
        )ry   r_   r   r   r   r6   4  s   zTekkenizer.audioc                 C   r   )Nz!Can only set Audio config at initr   r   r   r   r   r6   =  r   c                 C   s
   t | jS )z.The number of special tokens of the tokenizer.)rk   rz   r_   r   r   r   rR   A     
zTekkenizer.num_special_tokensc                 C   r   )z!Vocabulary size of the tokenizer.)rl   r_   r   r   r   n_wordsF  r   zTekkenizer.n_wordsc                 C   s   dd | j D S )zIds of the special tokens.c                 S   re   rf   r   r   r   r   r   rg   N  rY   z)Tekkenizer.special_ids.<locals>.<setcomp>rz   r_   r   r   r   special_idsK  s   zTekkenizer.special_idsc                 C   r   )zThe version of the tokenizer.)rw   r_   r   r   r   r/   P  r   zTekkenizer.versionc                 C   r   )z'The policy for handling special tokens.)r   r_   r   r   r   special_token_policyU  r   zTekkenizer.special_token_policypolicyc                 C   s4   t |tstdt| dtdt || _dS )z+Set the policy for handling special tokens.z!Expected SpecialTokenPolicy, got r   zThe attributed `special_token_policy` is deprecated and will be removed in 1.10.0. Please pass a special token policy explicitly to the relevant methods.N)r   r   r   r4   warningswarnFutureWarningr   )r`   r   r   r   r   r   Z  s   

c                 C   
   |  dS )z#The beginning of sentence token id.z<s>get_special_tokenr_   r   r   r   bos_idj  r   zTekkenizer.bos_idc                 C   r   )zThe end of sentence token id.z</s>r   r_   r   r   r   eos_ido  r   zTekkenizer.eos_idc                 C   r   )zThe padding token id.z<pad>r   r_   r   r   r   pad_idt  r   zTekkenizer.pad_idc                 C   r   )zThe unknown token id.z<unk>r   r_   r   r   r   unk_idy  r   zTekkenizer.unk_idc                 C   r   )a6  Get all tokens in the vocabulary as strings.

        Note:
           This will collapse all tokens for which we have a decoding error into
           the <?> string. This is bad and results in things like len(set(vocab)) != len(vocab)).

        Returns:
            The vocabulary of the tokenizer.
        )r}   r_   r   r   r   r1   ~  s   zTekkenizer.vocabsboseosc                    sD    j |} fdd|D }|r jg|}|r g | j}|S )a  Encode a string into a list of token ids.

        Args:
            s: The string to encode.
            bos: Whether to add the beginning of sentence token.
            eos: Whether to add the end of sentence token.

        Returns:
            The list of token ids.
        c                    s   g | ]}| j  qS r   rR   rU   r_   r   r   rX     rj   z%Tekkenizer.encode.<locals>.<listcomp>)rv   encoder   r   )r`   r   r   r   tokensr   r_   r   r     s   zTekkenizer.encoder   r   c                    s   g }t | fddD ]>\}}|r9|tjkr tdt| d|tjkr2| fdd|D  q|tjkr8qq| j	
 fdd|D  q|S )	Nc                    s
   |  j k S Nr   )rW   r_   r   r   <lambda>  s   
 z(Tekkenizer._decode_all.<locals>.<lambda>z/Decoding `tokens` that contain special tokens (a  ) is not allowed. 
Either make sure `tokens` do not include any special tokens or, if you want to decode `tokens` that includes special tokens, change the tokenizer's special token policy to IGNORE or KEEP: 
```
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.tekken import SpecialTokenPolicy

tokenizer = MistralTokenizer.v3(is_tekken=True)
tekken = tokenizer.instruct_tokenizer.tokenizer
tekken.special_token_policy = SpecialTokenPolicy.IGNORE  # or SpecialTokenPolicy.KEEP
```c                 3   s    | ]
} j | d  V  qdS )r    Nr   rU   r_   r   r   	<genexpr>  s    z)Tekkenizer._decode_all.<locals>.<genexpr>c                    s   g | ]}| j  qS r   r   rU   r_   r   r   rX     rj   z*Tekkenizer._decode_all.<locals>.<listcomp>)r   r   RAISEr   r7   KEEPextendr~   appendrv   decode)r`   r   r   decoded
is_specialgroupr   r_   r   _decode_all  s   


"zTekkenizer._decode_alltoken_idc                 C   s   d|| j    kodk S   S )z$Check if a token id is a byte token.r      r   r`   r   r   r   r   is_byte  s   zTekkenizer.is_bytec                 C   s"   || j v r
| j | S td| )z$Get the token id of a special token.zUnknown control token )r|   r   r`   r   r   r   r   r     s   

zTekkenizer.get_special_tokenr   c                 C   sB   t |ttjfr|| jv S t |tr|| jv S tdt|j	 )z7Return `True` if the passed `token` is a special token.zExpected int or str, got )
r   r%   npintegerr{   r   r|   	TypeErrorr4   r!   )r`   r   r   r   r   r     s
   


zTekkenizer.is_specialc                 C      t dt | |S )NzC`get_control_token` is deprecated. Use `get_special_token` instead.)r   r   r   r   r   r   r   r   get_control_token  s   
zTekkenizer.get_control_tokenc                 C   sb   |durt |ttfstdt| d|du r'td| j dt | j}d	| j
||dS )a  Decode a list of token ids into a string.

        Args:
            tokens: The list of token ids to decode.
            special_token_policy: The policy for handling special tokens.
                Use the tokenizer's [attribute][mistral_common.tokens.tokenizers.tekken.Tekkenizer.special_token_policy]
                if `None`. Passing `None` is deprecated and will be changed
                to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        NzFExpected `special_token_policy` to be None or SpecialTokenPolicy, got r   ,Using the tokenizer's special token policy () is deprecated. It will be removed in 1.10.0. Please pass a special token policy explicitly. Future default will be SpecialTokenPolicy.IGNORE.r   r   )r   r   r   r   r4   r   r   r   r   joinr   )r`   r   r   r   r   r   r     s   	zTekkenizer.decodec                 C   r   )z[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

        Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

        This is a convenient method for debugging.
        z`to_string` is deprecated and will be removed in 1.10.0. Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.)r   r   r   
_to_stringr`   r   r   r   r   	to_string  s
   
zTekkenizer.to_stringc                 C   s   | j |tjdS )Nr   r   r   r   r   r   r   r   r     s   zTekkenizer._to_stringc                 C   s   | j |gtjdS )z0Convert a token id to its string representation.r   r   r   r   r   r   ri   	  s   zTekkenizer.id_to_piecec                 C   s   |du rt d| j dt | j}|| jk r@|tjkr&| j| d dS |tj	kr2t
| d|tjkr9dS t
d| | j|| j S )	a  Convert a token id to its byte representation.

        Args:
            token_id: The token id to convert.
            special_token_policy: The policy for handling special tokens.
                Use the tokenizer's [attribute][mistral_common.tokens.tokenizers.tekken.Tekkenizer.special_token_policy]
                if `None`. Passing `None` is deprecated and will be changed
                to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The byte representation of the token.
        Nr   r   r    r   z is a special token    zUnknown special token policy )r   r   r   r   rR   r   r   rz   r   r   r   r~   rv   decode_single_token_bytes)r`   r   r   r   r   r   id_to_byte_piece  s   	



zTekkenizer.id_to_byte_piecer   )Ir!   r"   r#   r$   r'   r	   unkr   r   
begin_instend_instbegin_tools	end_toolsbegin_tool_resultsend_tool_results
tool_callsimgpad	img_breakimg_endprefixmiddler   begin_system
end_systembegin_tool_contentr   r[   r7   r   r   r%   r   r   r   r   r   propertyr   classmethodr4   r   r5   setterr6   rR   r   r   rm   r   r/   r   r   r   r   r   r   r1   r)   r   r   r   r   r   r   r   r   r   r   r   ri   bytesr   r   r   r   r   r8   j   s    	


P G	   r8   r1   rb   c                 C   s   |dur)t | |ksJ t | |ft | |kr)| d| } tdt |  d i }t| D ]4\}}| h dks=J |d |ksEJ t|d }|dks]|t|gks]J ||f|d ||< q/t |t | ksnJ t|	 tt
t |ks~J |S )zAReload our tokenizer JSON file and convert it to Tiktoken format.Nz(Cutting non special vocabulary to first z tokens.>   r   r    r   r   r   r   )rk   ro   rp   	enumeratekeysbase64	b64decoder   rm   rs   rn   )r1   rb   ranksr^   xmerger   r   r   rq   3  s   " rq   r   )+r   r   loggingr   	functoolsr   	itertoolsr   pathlibr   typingr   numpyr   rt   &mistral_common.tokens.tokenizers.audior   r   %mistral_common.tokens.tokenizers.baser   r	   r
   r   &mistral_common.tokens.tokenizers.imager   filterwarningsr   	getLoggerr!   ro   r   r)   r   r   r'   r*   r0   r8   r7   r%   dictr   rq   r   r   r   r   <module>   sH    
   N
