o
    *i1                  	   @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZ d dl	Z
d dlmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZ d dlmZmZm Z m!Z!m"Z"m#Z# d d	l$m%Z% d d
l&m'Z'm(Z( d dl)m*Z*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z< G dd de2ee1e0e6e f Z=G dd de=ee1e0e6e f Z>G dd de>ee1e0e6e f Z?G dd de?ee1e0e6e f Z@G dd de@ZAG dd deAZBG dd deBZCdS )    N)abstractmethod)AnyGenericSequenceoverload)Audio) InvalidAssistantMessageException InvalidMessageStructureExceptionInvalidRequestExceptionTokenizerException)
FIMRequest)
AudioChunkAudioURLChunkContentChunk
ImageChunkImageURLChunk	TextChunk
ThinkChunkUserContentChunk)UATSAssistantMessageAssistantMessageTypeSystemMessageToolMessageUserMessage)InstructRequest)ToolToolCall)StreamingModeTranscriptionRequest)AudioEncoderTranscriptionFormat)	FIMRequestTypeInstructRequestTypeInstructTokenizerSpecialTokenPolicySpecialTokens	TokenizedTokenizedType	TokenizerUserMessagePosition)ImageEncoder)
Tekkenizerc                
       st  e Zd ZdZ		d*dededB dedB f fddZededB fd	d
Z	de
e fddZededeeef fddZededede
e fddZedededede
e fddZedede
e fddZde
e
e dB  de
e dededdf
ddZede
e ddfd d!Zdeeef defd"d#Zd+d$e
e d%e dB de!fd&d'Z"d$e
e de!fd(d)Z#  Z$S ),InstructTokenizerBasezBase instruct tokenizer.N	tokenizerimage_encoderaudio_encoderc                    s&   || _ || _|| _t ||| dS )zInitialize the instruct tokenizer.

        Args:
            tokenizer: The tokenizer to use.
            image_encoder: The image encoder to use if any.
            audio_encoder: The audio encoder to use.
        N)r.   r/   r0   super__init__selfr.   r/   r0   	__class__ f/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/mistral_common/tokens/tokenizers/instruct.pyr2   :   s   zInstructTokenizerBase.__init__returnc                 C   s   | j S N)r/   r4   r7   r7   r8   
mm_encoderL   s   z InstructTokenizerBase.mm_encoderc                 C   s
   | j jgS )zReturn the start tokens.)r.   bos_idr;   r7   r7   r8   startS   s   
zInstructTokenizerBase.startrequestc                 C   sB   d}d}t t| jD ]\}}t|tr|dkr|}|}q||fS )zFind the first and last user message in the request.

        Args:
            request: The request to search for user messages.

        Returns:
            The index of the first and last user message.
        )list	enumeratemessages
isinstancer   )r?   last_user_idxfirst_user_idximsgr7   r7   r8   find_first_last_userW   s   

z*InstructTokenizerBase.find_first_last_usermessageis_before_last_user_messagec                 C      t d)zEncode a tool message.

        Raises:
            NotImplementedError: The tool message is not implemented for the base tokenizer.
        zTool message not implementedNotImplementedErrorr4   rJ   rK   r7   r7   r8   encode_tool_messagej      z)InstructTokenizerBase.encode_tool_messagecontinue_messagec                 C   rL   )zEncode an assistant message.

        Raises:
            NotImplementedError: The assistant message is not implemented for the base tokenizer.
        z!Assistant message not implementedrM   r4   rJ   rK   rR   r7   r7   r8   encode_assistant_messages   s   	z.InstructTokenizerBase.encode_assistant_messagechunkc                 C   rL   )zEncode a think chunk.

        Raises:
            NotImplementedError: The think chunk is not implemented for the base tokenizer.
        zThink chunk not implementedrM   r4   rU   r7   r7   r8   encode_think~   rQ   z"InstructTokenizerBase.encode_think	tokenizedrC   
max_tokenslast_user_message_indexc                 C      d S r:   r7   )r4   rX   rC   rY   rZ   r7   r7   r8   _truncate_for_max_tokens   s   z.InstructTokenizerBase._truncate_for_max_tokensc                 C   r[   r:   r7   clsrC   r7   r7   r8   validate_messages   s   z'InstructTokenizerBase.validate_messagesc              	   C   s  g }g }d}g }|  |j | |\}}t|jD ]\}}	|jr3|t|jd kr3t|	ts3tdt|	t	rW| j
|	|j||k||k|jdd\}
}}|| || nIt|	tre| |	||k }
n;t|	tr|jou|t|jd k}| j|	||k |d}
|t|jd kr|
}nt|	tr| |	}
n	tdt|	 ||
 q|jdur| ||j|j| |  }|D ]}|dur|| qt|| j|tjd|||d	S )
zEncode an instruct request.

        Args:
            request: The request to encode.

        Returns:
            The encoded tokens.
        N   z?Cannot continue final message if it is not an assistant messageT)system_promptforce_img_first)rR   zUnknown message type special_token_policy)tokenstext
prefix_idsimagesaudios)r_   rC   rI   rB   continue_final_messagelenrD   r   r	   r   encode_user_messageavailable_toolsra   extendr   rP   rT   r   encode_system_messager   typeappendtruncate_at_max_tokensr\   r>   r'   decoder%   KEEP)r4   r?   rh   ri   rg   tokens_listrF   rE   msg_idxrH   
new_tokens
new_images
new_audiosrR   re   tokr7   r7   r8   encode_instruct   sv   







z%InstructTokenizerBase.encode_instructre   rd   c                 C   s   | j j||dS )a  Decode tokens to a string.

        Args:
            tokens: The tokens to decode.
            special_token_policy: The policy to use for special tokens.
                Passing `None` will default to `self._special_token_policy` for
                [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and `SpecialTokenPolicy.IGNORE`
                for [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
                Note that passing `None` will be deprecated and `special_token_policy` will default to
                `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        rc   )r.   rs   )r4   re   rd   r7   r7   r8   rs      s   zInstructTokenizerBase.decodec                 C   s   | j |S r:   )r.   
_to_string)r4   re   r7   r7   r8   r|      s   z InstructTokenizerBase._to_stringNNr:   )%__name__
__module____qualname____doc__r)   r+   r    r2   propertyr<   rA   intr>   staticmethodr   tuplerI   r   r   boolrP   r   rT   r   rW   r\   classmethodr   r_   r   r'   r{   r%   strrs   r|   __classcell__r7   r7   r5   r8   r-   5   sd    




 Qr-   c                   @   s0  e Zd ZdZ		d dedee dB dedededB d	ed
e	ee
 eej ee f fddZded
ee
 fddZ		d deee B dededB d	ed
e	ee
 eej ee f f
ddZdeded
ee
 fddZdededed
ee
 fddZded
ee
 fddZded
efddZded
efddZdS )!InstructTokenizerV1zrInstruct tokenizer V1.

    This tokenizer has basic for messages. It does not support tools or image inputs.
    NFrJ   rm   is_lastis_firstra   rb   r9   c                 C   st   t |jts
J d| jdu sJ dd}|r!|r!|d |j }n|j}d| d}| j|ddd	\}	}
}|	|
|fS )
ar  Encode a user message.

        Args:
            message: The message to encode.
            available_tools: Not used.
            is_last: Not used.
            is_first: Whether the message is the first one.
            system_prompt: The system prompt.
            force_img_first: Not used.

        Returns:
            The encoded tokens and empty list.
        "Message content must be normalizedNz(InstructTokenizerV1 cannot encode images 

z[INST] z [/INST]F)contentr   ra   )rD   r   r   r/   encode_user_content)r4   rJ   rm   r   r   ra   rb   r   message_txtcurr_tokensimageaudior7   r7   r8   rl     s   
z'InstructTokenizerV1.encode_user_messagec                 C      t d| jj )Nz,System message encoding not implemented for )rN   r6   r~   r4   rJ   r7   r7   r8   ro   '     z)InstructTokenizerV1.encode_system_messager   c                 C   s>   t |tsJ |r|r|d | }| jj|ddd}|g g fS )a*  Encode a user content.

        Args:
            content: The content to encode.
            is_last: Whether the message is the last one.
            system_prompt: The system prompt.
            force_img_first: Not used.

        Returns:
            The encoded tokens and empty list.
        r   Fboseos)rD   r   r.   encode)r4   r   r   ra   rb   re   r7   r7   r8   r   *  s
   
z'InstructTokenizerV1.encode_user_contentrK   c                 C   rL   )zEncode a tool message.

        Raises:
            TokenizerException: The tool message is not implemented for this version.
        &Tools not implemented for tokenizer V1r   rO   r7   r7   r8   rP   D     z'InstructTokenizerV1.encode_tool_messagerR   c                 C   s   t |ts	J ||jdurt|jdkrtd|r"|jr"td|jr:t |jts/J d| j	j
|jddd}nt|j d|j |jsQ|sQ|| j	j |S )	[  Encode an assistant message.

        Args:
            message: The message to encode.
            is_before_last_user_message: Not used.
            continue_message: Whether to continue the message generation.
                Only use this if the assistant message is the last message.

        Returns:
            The encoded tokens.
        Nr   r   U`continue_message` is only supported for assistant messages that have `prefix=False`.z4Message content must be a string for tokenizer < V13Fr   z // )rD   r   
tool_callsrk   r   prefixr   r   r   r.   r   rq   eos_idr4   rJ   rK   rR   r   r7   r7   r8   rT   L  s   

z,InstructTokenizerV1.encode_assistant_messagerU   c                 C   rL   )zEncode a think chunk.

        Raises:
            TokenizerException: The think chunk is not implemented for this version.
        z*Think not implemented for tokenizer < V13.r   rV   r7   r7   r8   rW   j  r   z InstructTokenizerV1.encode_thinkr?   c                 C   r   )zEncode a FIM request.

        Raises:
           TokenizerException: The FIM request is not implemented for this version.
        zFIM not available for r   r.   versionr4   r?   r7   r7   r8   
encode_fimr  s   zInstructTokenizerV1.encode_fimc                 C   r   )Nz Transcription not available for r   r   r7   r7   r8   encode_transcriptionz  r   z(InstructTokenizerV1.encode_transcriptionNF)r~   r   r   r   r   rA   r   r   r   r   r   npndarrayr   rl   r   ro   r   r   r   rP   r   rT   r   rW   r   r'   r   r   r   r7   r7   r7   r8   r      s\    

#


r   c                       s~  e Zd ZdZejZ		d-dededB de	dB f fddZ
		d.d	ed
ee dB dedededB dedeee eej ee f fddZdedefddZdeee B defddZdedeeef fddZd	ededee fddZdedeeef fddZd	e dee fd d!Z!d	e dee fd"d#Z"d	e ded$edee fd%d&Z#d'edee fd(d)Z$d*e%de&fd+d,Z'  Z(S )/InstructTokenizerV2z`Instruct tokenizer V2.

    This tokenizer adds supports to images, tools and FIM requests.
    Nr.   r/   r0   c                    s   t  ||| | jtjj| _| jtjj| _	| jtj
j| _| jtjj| _| jtjj| _| jtjj| _| jtjj| _| jtjj| _| jtjj| _| jtjj| _dS Initialize the tokenizer.

        Args:
            tokenizer: The tokenizer to use.
            image_encoder: The image encoder to use.
            audio_encoder: The audio encoder to use.
        N)r1   r2   r.   get_special_tokenr&   
begin_instvalue
BEGIN_INSTend_instEND_INSTbegin_toolsBEGIN_AVAILABLE_TOOLS	end_toolsEND_AVAILABLE_TOOLSbegin_tool_resultsBEGIN_TOOL_RESULTSend_tool_resultsEND_TOOL_RESULTSr   
TOOL_CALLSr   BOSr   PREFIXsuffixSUFFIXr3   r5   r7   r8   r2     s   zInstructTokenizerV2.__init__FrJ   rm   r   r   ra   rb   r9   c                 C   s   d}||o
| j tjkO }||o| j tjkO }g }|r:|r:dd |D }	| jjtj|	ddddd}
| jg|
| j	}| j
|j|||d\}}}g || j}| jg}|| | }|||fS )a  Encode a user message.

        Args:
            message: The message to encode.
            available_tools: The list of available tools if any.
            is_last: Whether the message is the last one.
            is_first: Not used.
            system_prompt: The system prompt.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and the list of images.
        Fc                 S   s   g | ]}|  qS r7   )
model_dump).0toolr7   r7   r8   
<listcomp>  s    z;InstructTokenizerV2.encode_user_message.<locals>.<listcomp>ensure_asciir   )r   r   ra   rb   )&_user_message_position_to_encode_toolsr*   firstlastr.   r   jsondumpsr   r   r   r   r   r   )r4   rJ   rm   r   r   ra   rb   do_encode_toolstools_tokenstoolstools_json_tokensre   r   r   prefix_tokenssuffix_tokensr   r7   r7   r8   rl     s.   
z'InstructTokenizerV2.encode_user_messager   c                 C   s&   zt |W S  t jy   | Y S w r:   )r   loadsJSONDecodeErrorr4   r   r7   r7   r8   _parse_json_content  s
   z'InstructTokenizerV2._parse_json_contentc                 C   s(   t |trddd |D }| |S )Nr   c                 s       | ]}|j V  qd S r:   rf   r   rU   r7   r7   r8   	<genexpr>      z:InstructTokenizerV2._parse_tool_content.<locals>.<genexpr>)rD   rA   joinr   r   r7   r7   r8   _parse_tool_content  s   

z'InstructTokenizerV2._parse_tool_contenttool_messagec                 C   s   |j | |jdS )z8Bit of a hack due to the way tool results are tokenized.)namer   )r   r   r   r4   r   r7   r7   r8   _prepare_tool_result  s   
z(InstructTokenizerV2._prepare_tool_resultrK   c                 C   sB   |rg S t j| |gdd}| jg| jj|ddd| j}|S )a  Encode a tool message.

        Args:
            message: The message to encode.
            is_before_last_user_message: Whether the message is before the last user message. If true, the message is
                not encoded.

        Returns:
            The encoded tokens.
        Fr   r   r   r   r   r   r.   r   r   r4   rJ   rK   tool_result_strr   r7   r7   r8   rP     s   z'InstructTokenizerV2.encode_tool_message	tool_callc                 C   s   |j j| |j jdS )z:Bit of a hack due to the way function calls are tokenized.r   	arguments)functionr   r   r   )r4   r   r7   r7   r8   _prepare_function_call  s   z*InstructTokenizerV2._prepare_function_callc                 C   sB   |j s
J d| t|j tsJ d| jj|j ddddS )Nz)Assistant message must have content. Got 3Message content must be a string for tokenizer < V7 Fr   )r   rD   r   r.   r   rstripr   r7   r7   r8   (_encode_normal_content_assistant_message  s   z<InstructTokenizerV2._encode_normal_content_assistant_messagec                 C   s`   |j s
J d| g }|j D ]
}|| | qtj|dd}| jg| jj|ddd}|S )N,Assistant message must have tool calls. Got Fr   r   )r   rq   r   r   r   r   r.   r   )r4   rJ   prepared_tool_callsr   tool_call_strr   r7   r7   r8   '_encode_tool_calls_in_assistant_message	  s   
z;InstructTokenizerV2._encode_tool_calls_in_assistant_messagerR   c                 C   s   |j r|jrtd| |r|jrtd|j r#|rg S | |}n|jr6t|jts0J d| |}nt	d|j |jsJ|sJ|
| jj |S )a  Encode an assistant message.

        Args:
            message: The message to encode.
            is_before_last_user_message: Whether the message is before the last user message. If has tools and true, the
                message is not encoded.
            continue_message: Whether to continue the message generation.
                Only use this if the assistant message is the last message.

        Returns:
            The encoded tokens.
        zICannot have tool calls and content defined in the same assistant message r   r   Invalid assistant message: )r   r   
ValueErrorr   r   r   rD   r   r   r   rq   r.   r   r   r7   r7   r8   rT     s"   

z,InstructTokenizerV2.encode_assistant_messagerf   c                 C   s   | j jd| ddddd S )z;Remove prefix space in the case of SentencePieceTokenizers.u   ☺Fr      N)r.   r   )r4   rf   r7   r7   r8   _encode_infilling9  s   z%InstructTokenizerV2._encode_infillingr?   c                 C   s\   | j j|jddd}|jr| |jng }| j| jg|| j|}t|| j	|t
jddS )zEncode a FIM request.

        Args:
            request: The request to encode.

        Returns:
            The encoded tokens.
        Fr   rc   )re   rf   )r.   r   promptr   r   r   r   r   r'   rs   r%   rt   )r4   r?   r   r   re   r7   r7   r8   r   >  s   	zInstructTokenizerV2.encode_fimr}   r   ))r~   r   r   r   r*   r   r   r)   r+   r    r2   r   rA   r   r   r   r   r   r   r   r   rl   r   r   r   r   r   dictr   rP   r   r   r   r   r   rT   r   r   r'   r   r   r7   r7   r5   r8   r   ~  s^    

2
$r   c                       s  e Zd ZdZ		d%dededB dedB f fddZded	e	e
ef fd
dZded	e	e
ef fddZdeded	ee fddZdededed	ee f fddZede
eB eB d	eee ddf fddZedeeB d	eee ejdf fddZedeeB d	eee de f fddZde
e!B d	eee ejdB e dB f fddZde"e! d	eee eej ee  f fddZ#		d&de
ee$ B d ed!e
dB d"ed	eee eej ee  f f
 fd#d$Z%  Z&S )'InstructTokenizerV3zxInstruct tokenizer V3.

    The only difference with V2 tokenizer is that it encodes the tool messages differently.
    Nr.   r/   r0   c                    s   t  j|||d dS )r   )r/   r0   N)r1   r2   r3   r5   r7   r8   r2   [  s   zInstructTokenizerV3.__init__r   r9   c                 C   s6   |j j| |j jd}|jr|jdkr|j|d< |S )Nr   nullid)r   r   r   r   r   )r4   r   function_callr7   r7   r8   r   j  s   
z*InstructTokenizerV3._prepare_function_callr   c                 C   s&   |j d us	J d| |j|j dS )Nz7Tool message has to have the tool call id defined in v3)r   call_id)tool_call_idr   r   r   r7   r7   r8   r   u  s   
z(InstructTokenizerV3._prepare_tool_resultrJ   rK   c                 C   s8   t j| |dd}| jg| jj|ddd| j}|S )a  Encode a tool message.

        Note:
            Same as [V2][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV2.encode_tool_message] but tools
            are not wrapped in a list and the history is also tokenized.

        Args:
            message: The message to encode.
            is_before_last_user_message: Whether the message is before the last user message. If true, the message is
                not encoded.

        Returns:
            The encoded tokens.
        Fr   r   r   r   r7   r7   r8   rP   }  s   z'InstructTokenizerV3.encode_tool_messagerR   c                    s   t  |d|S )a  Encode an assistant message.

        Note:
            Same as [V2][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV2.encode_assistant_message] but
            always encode the tool history.
            continue_message: Whether to continue the message generation.
                Only use this if the assistant message is the last message.

        Args:
            message: The message to encode.
            is_before_last_user_message: Not used.

        Returns:
            The encoded tokens.
        F)r1   rT   rS   r5   r7   r8   rT     s   z,InstructTokenizerV3.encode_assistant_messagerU   c                 C   r[   r:   r7   rV   r7   r7   r8   _encode_content_chunk     z)InstructTokenizerV3._encode_content_chunkc                 C   r[   r:   r7   rV   r7   r7   r8   r     r  c                 C   r[   r:   r7   rV   r7   r7   r8   r     r  c                 C   s   t |tr| jj|dddd d fS t |tr#| jj|jdddd d fS t |tr0| |d d fS t |tt	frL| j
d us@J d| 
|}|j|jd fS t |ttfrh| jd us\J d| |}|jd |jfS td| )NFr   z+Make sure to define a image encoder at initz+Make sure to define a audio encoder at initzUnknown chunk type: )rD   r   r.   r   r   rf   r   rW   r   r   r/   re   r   r   r   r0   r   r   )r4   rU   img_encodingaudio_encodingr7   r7   r8   r     s   




r   c           	      C   s^   g }g }g }|D ]!}|  |\}}}|| |d ur || |d ur)|| q|||fS r:   )r   rn   rq   )	r4   r   re   rh   r   rU   chunk_tokensmaybe_imagemaybe_audior7   r7   r8   _encode_content_chunks  s   



z*InstructTokenizerV3._encode_content_chunksFr   ra   rb   c                    s*  t |trt |||S g }g }g }t|dko!t |d ttf}|r.|r.|d |d g}d}	|D ]]}
d}|	rM|rM|rMd}	|d }|| jj|ddd7 }t |
t	t
frn|r`J d	t|
 d
| |
\}}}|| nt |
ttfr| |
\}}}|| n| |
d }|| q2|||fS )H  Encode a user content.

        Args:
            content: The content to encode.
            is_last: Whether the message is the last one.
            system_prompt: The system prompt.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and the images.
        r   r`   r   Tr   Fr   r   zEIt is not possible that `content` is non-empty when chunk is of type .)rD   r   r1   r   rk   r   r   r.   r   r   r   rp   r   rq   rn   )r4   r   r   ra   rb   re   rh   r   has_one_img_one_text_firstfirst_chunkrU   content_strr  _chunk_audiochunk_imager5   r7   r8   r     s6   

z'InstructTokenizerV3.encode_user_contentr}   r   )'r~   r   r   r   r)   r+   r    r2   r   r   r   r   r   r   r   r   rA   r   rP   r   rT   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r7   r7   r5   r8   r   S  s`    *(&.

r   c                       s  e Zd ZdZ		d2dededB dedB ddf fddZd	eee	 dB  d
ee
 de	de	ddf
ddZdedee	 fddZ		d3deee B dededB dedeee	 eej ee f f
 fddZ		d3dedee dB dedededB dedeee	 eej ee f f fddZdedefddZdedefd d!Zd4d"eeB d#edB defd$d%Zdedefd&d'Z e!d
ee" ddfd(d)Z#e$d
ee" defd*d+Z%de&d,edee	 fd-d.Z'de
d,ed/edee	 fd0d1Z(  Z)S )5InstructTokenizerV7a%  Instruct tokenizer V7.

    The difference with V3 tokenizer is that it encodes the system prompts differently:
    - in V7 the system prompts are treated as separate SystemMessages
    - they are no longer prepended to the last user message
    - they are printed between special tokens

    Nr.   r/   r0   r9   c                    sz   t  ||| | jtjj| _| jtjj| _	| jtj
j| _d| _|dur9|jjs;| jtjj| _dS dS dS r   )r1   r2   r.   r   r&   begin_systemr   BEGIN_SYSTEM
end_system
END_SYSTEMbegin_tool_contentBEGIN_TOOL_CONTENT
TRANSCRIBEaudio_configis_streaming
transcriber3   r5   r7   r8   r2     s   zInstructTokenizerV7.__init__tokenized_messagesrC   rY   rZ   c                    s   t dd D | dtdd f fdd}d}dkrb|tk rb|| |d7 }t|d  trX|tk rXt| tsX|| |d7 }|tk rXt| trCdkrb|tk s%dkrjtd	d S )
Nc                 s   s     | ]}|d urt |V  qd S r:   )rk   )r   tr7   r7   r8   r   7  s    z?InstructTokenizerV7._truncate_for_max_tokens.<locals>.<genexpr>idxr9   c                    sJ   t |  tr	d S |  krd S |  }|d usJ t|8 d | < d S r:   )rD   r   rk   )r  rz   rZ   rC   to_dropr  r7   r8   drop9  s   z:InstructTokenizerV7._truncate_for_max_tokens.<locals>.dropr   r`   z+Input couldn't fit in truncate_at_max_token)sumr   rk   rD   r   r   )r4   r  rC   rY   rZ   r   current_idxr7   r  r8   r\   +  s   
z,InstructTokenizerV7._truncate_for_max_tokensrJ   c                 C   sF   | j g}t|j }trt|dg}|| |d 7 }|| j |S )zEncode a system message.

        Args:
            message: The message to encode.

        Returns:
            The encoded tokens.
        r   r   )r  rD   r   r   r   r  rq   r  )r4   rJ   re   r   r7   r7   r8   ro   T  s   
z)InstructTokenizerV7.encode_system_messageFr   r   ra   rb   c           	         sz   |du sJ dt |trt |||S t|dko#t |d ttf}|r0|r0|d |d g}| |\}}}|||fS )r  N?in Tokenizer V7 we don't encode system prompts in user messagesr   r`   r   )rD   r   r1   r   rk   r   r   r  )	r4   r   r   ra   rb   r
  re   rh   r   r5   r7   r8   r   e  s   

z'InstructTokenizerV7.encode_user_contentrm   r   c           
         s8   |du sJ dt  j||||d|d\}}}	|||	fS )a  Encode a user message.

        Args:
            message: The message to encode.
            available_tools: The list of available tools if any.
            is_last: Whether the message is the last one.
            is_first: Whether the message is the first one.
            system_prompt: Not used.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and the list of images.
        Nr#  )r   r   ra   rb   )r1   rl   )
r4   rJ   rm   r   r   ra   rb   re   rh   r   r5   r7   r8   rl     s   
	z'InstructTokenizerV7.encode_user_messager?   c                 C   sd   | j dusJ d| j | j jjtjkr| |S | j jjtjkr'| |S td| j jjd)a  
        Encodes an audio transcription request into a tokenized format.

        This method processes a transcription request containing audio data,
        encodes the user message, and returns the tokenized output.

        Args:
            request: The transcription request object containing
                the audio data to be encoded.

        Returns:
            Tokenized: The tokenized representation of the audio data, including processed audio and tokens
        Nz6Audio encoder must be defined, got self.audio_encoder=zxTranscription format should be one of 'instruct', 'streaming', got self.audio_encoder.audio_config.transcription_format=r	  )	r0   r  transcription_formatr!   INSTRUCT_encode_instruct_transcription	STREAMING_encode_streaming_transcriptionr
   r   r7   r7   r8   r     s   

z(InstructTokenizerV7.encode_transcriptionc                 C   s   |j tjksJ d|j | jd usJ | jj d|  }| jtt	|j
dgdg ddd d\}}}g ||}|jd urQd|j }|| jj|ddd	7 }|| j t|| j||d
S )Nz=Request must not be in streaming mode, got request.streaming=z! needs to have a TRANSCRIBE token)input_audio)r   T)rm   r   r   ra   zlang:Fr   re   rf   ri   )	streamingr   DISABLEDr  r6   r~   r>   rl   r   r   r   languager.   r   rq   r'   r|   )r4   r?   r   re   r  r   language_stringr7   r7   r8   r&    s$   

z2InstructTokenizerV7._encode_instruct_transcriptionr   transcription_delay_msc                 C   sX   | j d usJ d| j t|trt|nt|}| j ||}t|j|j	gdS )NFAudio encoder must be defined to encode audio, got self.audio_encoder=)re   ri   )
r0   rD   r   r   from_base64
from_bytesencode_audior'   re   r   )r4   r   r/  _audio	audio_encr7   r7   r8   _encode_audio  s   
z!InstructTokenizerV7._encode_audioc                 C   s<  |j tjkr| |jj|j}|  |j }|j	}nw|j tj
kr| jd us'J | j|j\}}||g}t|jjdkrhtd|j dt t|jjtsPJ t|jj}tt|j|jf|j|jg}n|jjrpJ d| jd us}J d| j|  | j|j }ntd|j t|| j|tjd|dS )	Nr   z%Passing audio with request.streaming=zf is deprecated. Make sure to not pass any audio to `TranscriptionRequest` when doing online streaming.z~For online streaming, no audio bytes should be passed in the first request. Audio buffering is taken care of directly by vLLM.r0  z9Request must be in streaming mode, got request.streaming=rc   r*  ) r+  r   OFFLINEr6  r   datatarget_streaming_delay_msr>   re   ri   ONLINEr0   get_padding_audiork   warningswarnFutureWarningrD   r   r   r1  r   concatenateaudio_arraysampling_rateformatencode_streaming_tokensr   r'   rs   r%   rt   )r4   r?   rX   re   ri   left_pad	right_padrequest_audior7   r7   r8   r(    sD   

z3InstructTokenizerV7._encode_streaming_transcriptionc                 C   s,   |  |rtdd |D rtdd S d S )Nc                 s       | ]}t |tV  qd S r:   )rD   r   r   rJ   r7   r7   r8   r         z8InstructTokenizerV7.validate_messages.<locals>.<genexpr>z9System messages are not yet allowed when audio is present)
_has_audioanyr   r]   r7   r7   r8   r_     s
   
z%InstructTokenizerV7.validate_messagesc                 C   s   t dd | D S )Nc                 s   s:    | ]}t |tot |jtotd d |jD V  qdS )c                 s   rG  r:   )rD   r   r   r7   r7   r8   r   !  rI  z;InstructTokenizerV7._has_audio.<locals>.<genexpr>.<genexpr>N)rD   r   r   rA   rK  rH  r7   r7   r8   r     s    


z1InstructTokenizerV7._has_audio.<locals>.<genexpr>)rK  )rC   r7   r7   r8   rJ    s   zInstructTokenizerV7._has_audiorK   c                 C   sr   |j dusJ t|jtsJ d| jj|j ddd}| jj|jddd}| jg|| j}g ||| j}|S )a  Encode a tool message.

        Note:
            Same as [V3][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV3.encode_tool_message]
            but tools are not wrapped in a list and history is also tokenized

        Args:
            message: The message to encode.
            is_before_last_user_message: Not used.

        Returns:
            The encoded tokens.
        Nr   Fr   )	r   rD   r   r   r.   r   r   r  r   )r4   rJ   rK   tool_call_id_tokensre   r   r   r7   r7   r8   rP   %  s$   z'InstructTokenizerV7.encode_tool_messagerR   c                 C   s   |j s|jstd| |r|jrtdg }|j r7t|j tr'| |}nt|j tr7|| 	|j d 7 }|jrA|| 
|7 }|jsM|sM|| jj |S )r   r   r   r   )r   r   r   r   r   rD   r   r   rA   r  r   rq   r.   r   r   r7   r7   r8   rT   D  s"   

z,InstructTokenizerV7.encode_assistant_messager}   r   r:   )*r~   r   r   r   r)   r+   r    r2   rA   r   r   r\   r   ro   r   r   r   r   r   r   r   r   r   r   rl   r   r'   r   r&  bytesfloatr6  r(  r   r   r_   r   rJ  r   rP   rT   r   r7   r7   r5   r8   r  
  s    
)
%
# 4r  c                	       sV   e Zd ZdZ		ddededB dedB ddf fddZd	ede	e
 fd
dZ  ZS )InstructTokenizerV11zInstruct tokenizer V11.

    The difference with V7 tokenizer is that it encodes tool calls differently:
    Tool call results are encoded as :
    - [begin tool call] call_name_tokens [call id] call_id_tokens [args] content tokens
    Nr.   r/   r0   r9   c                    s8   t  ||| | jtjj| _| jtjj| _	d S r:   )
r1   r2   r.   r   r&   argsr   ARGSr   CALL_IDr3   r5   r7   r8   r2   o  s   zInstructTokenizerV11.__init__rJ   c                 C   s   |j s
J d| g }|j D ]B}| |}g }d|v r+| jg| jj|d ddd}|| jg| jj|d ddd|| j| jjtj|d ddddd7 }q|S )Nr   r   Fr   r   r   r   )	r   r   rR  r.   r   r   rQ  r   r   )r4   rJ   r   r   preparedidsr7   r7   r8   r   y  s&   

z<InstructTokenizerV11._encode_tool_calls_in_assistant_messager}   )r~   r   r   r   r)   r+   r    r2   r   rA   r   r   r   r7   r7   r5   r8   rO  g  s    

rO  c                	       s   e Zd ZdZejZ		ddededB de	dB ddf fddZ
d	edee fd
dZd	ededee fddZdedee fddZ  ZS )InstructTokenizerV13zInstruct tokenizer V13.

    The difference with V11 tokenizer is that it encodes tool calls differently:
        - available tools are tokenized at the first user message.
        - call id is no longer tokenized for tool calls or results.
    Nr.   r/   r0   r9   c                    s|   t  ||| t|tsJ dt| tjj|jv r6tj	j|jv r6|
tjj| _|
tj	j| _d S d | _d | _d S )Nz$Tokenizer must be a Tekkenizer. Got )r1   r2   rD   r,   rp   r&   begin_thinkr   _special_tokens_reverse_vocab	end_thinkr   BEGIN_THINK	END_THINKr3   r5   r7   r8   r2     s   
zInstructTokenizerV13.__init__rJ   c                 C   s   |j s
J d| g }|j D ]5}|jr|jdksJ | |}|| jg| jj|d ddd| j| jjtj|d ddddd7 }q|S )Nr   r   r   Fr   r   r   )	r   r   r   r   r.   r   rQ  r   r   )r4   rJ   r   r   rS  r7   r7   r8   r     s   

z<InstructTokenizerV13._encode_tool_calls_in_assistant_messagerK   c                 C   s^   |j dus	J d|j}t|tsddd |D }| jj|ddd}| jg|| j}|S )zEncode a tool message.

        Args:
            message: The message to encode.
            is_before_last_user_message: Not used.
        Returns:
            The encoded tokens.
        Nz2Tool call id must be provided for tokenizer >= v13r   c                 s   r   r:   r   r   r7   r7   r8   r     r   z;InstructTokenizerV13.encode_tool_message.<locals>.<genexpr>Fr   )	r   r   rD   r   r   r.   r   r   r   )r4   rJ   rK   r   re   r   r7   r7   r8   rP     s   	
z(InstructTokenizerV13.encode_tool_messagerU   c                 C   sZ   | j dus	J d| jdusJ d| jj|jddd}| j g|}|jr+|| j |S )zEncode a thinking chunk.

        Args:
            chunk: The thinking chunk to encode.
        Returns:
            The encoded tokens.
        Nz2think tokens are not available for this tokenizer.Fr   )rY  rZ  r.   r   thinkingclosedrq   )r4   rU   re   think_tokensr7   r7   r8   rW     s   z!InstructTokenizerV13.encode_thinkr}   )r~   r   r   r   r*   r   r   r)   r+   r    r2   r   rA   r   r   r   r   rP   r   rW   r   r7   r7   r5   r8   rU    s"    rU  )Dr   r<  abcr   typingr   r   r   r   numpyr   mistral_common.audior   mistral_common.exceptionsr   r	   r
   r   #mistral_common.protocol.fim.requestr   &mistral_common.protocol.instruct.chunkr   r   r   r   r   r   r   r   )mistral_common.protocol.instruct.messagesr   r   r   r   r   r   (mistral_common.protocol.instruct.requestr   +mistral_common.protocol.instruct.tool_callsr   r   -mistral_common.protocol.transcription.requestr   r   &mistral_common.tokens.tokenizers.audior    r!   %mistral_common.tokens.tokenizers.baser"   r#   r$   r%   r&   r'   r(   r)   r*   &mistral_common.tokens.tokenizers.imager+   'mistral_common.tokens.tokenizers.tekkenr,   r-   r   r   r   r  rO  rU  r7   r7   r7   r8   <module>   sL    ( 
,
 
H 
 
V 8  _&