o
    c۷i8                     @   s  d dl mZmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( G dd de)eZ*G dd de)eZ+G dd de)eZ,G dd de)eZ-G dd deZ.G dd deZ/e
ded Z0e
d!ed Z1e
d"e.d Z2G d#d$ d$e	e0e1e2ef Z3dS )%    )ABCabstractmethod)Enum)Path)AnyGenericTypeVarN)
ConfigDictField)Audio)MistralBase)
FIMRequest)UserContentChunk)AssistantMessageTypeUserMessage)InstructRequest)Tool)SpeechRequest)TranscriptionRequest)AudioEncoder)ImageEncoder)ModelSettingsBuilderc                   @   s   e Zd ZdZdZdZdS )UserMessagePositionzWhere to encode available toolsfirstlastN)__name__
__module____qualname____doc__r   r    r   r   [/home/ubuntu/vllm_env/lib/python3.10/site-packages/mistral_common/tokens/tokenizers/base.pyr      s    r   c                   @   s   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZ dZ!d Z"d!Z#d"Z$d#S )$SpecialTokensa  Enum of special tokens used in the tokenizer.

    Attributes:
        unk: The unknown token.
        bos: The beginning of string token.
        eos: The end of string token.
        begin_inst: The beginning of instruction token.
        end_inst: The end of instruction token.
        begin_tools: The beginning of tools token.
        end_tools: The end of tools token.
        begin_tool_results: The beginning of tool results token.
        end_tool_results: The end of tool results token.
        tool_calls: The tool calls token.
        img: The image token.
        pad: The pad token.
        img_break: The image break token.
        img_end: The image end token.
        prefix: The prefix token for FIM.
        middle: The middle token for FIM.
        suffix: The suffix token for FIM.
        begin_system: The beginning of system prompt token.
        end_system: The end of system prompt token.
        begin_tool_content: The beginning of tool content token.
        args: The args token.
        call_id: The call id token.
        audio: The audio token.
        begin_audio: The beginning of audio token.
        transcribe: The transcribe token.
        begin_think: The beginning of think token.
        end_think: The end of think token.
        streaming_pad: The streaming pad token.
        streaming_word: The streaming word token.
        text_to_audio: The text to audio token.
        audio_to_text: The audio to text token.

    Examples:
        >>> unk = SpecialTokens.unk
    z<unk>z<s>z</s>z[INST]z[/INST]z[AVAILABLE_TOOLS]z[/AVAILABLE_TOOLS]z[TOOL_RESULTS]z[/TOOL_RESULTS]z[TOOL_CALLS]z[IMG]z<pad>z[IMG_BREAK]z	[IMG_END]z[PREFIX]z[MIDDLE]z[SUFFIX]z[SYSTEM_PROMPT]z[/SYSTEM_PROMPT]z[TOOL_CONTENT]z[ARGS]z	[CALL_ID]z[AUDIO]z[BEGIN_AUDIO]z[TRANSCRIBE]z[THINK]z[/THINK]z[STREAMING_PAD]z[STREAMING_WORD]z[NEXT_AUDIO_TEXT]z[REPEAT_AUDIO_TEXT]z[MODEL_SETTINGS]z[/MODEL_SETTINGS]N)%r   r   r   r   unkboseos
begin_instend_instbegin_tools	end_toolsbegin_tool_resultsend_tool_results
tool_callsimgpad	img_breakimg_endprefixmiddlesuffixbegin_system
end_systembegin_tool_contentargscall_idaudiobegin_audio
transcribebegin_think	end_thinkstreaming_padstreaming_wordtext_to_audioaudio_to_textbegin_model_settingsend_model_settingsr   r   r   r    r!   !   sF    'r!   c                       s:   e Zd ZdZdZdZdZededef fddZ	  Z
S )	SpecialTokenPolicyzWhat to do with special tokens when encoding/decoding.

    Attributes:
        IGNORE: Ignore special tokens.
        KEEP: Keep special tokens.
        RAISE: Raise an error if special tokens are found.
    ignorekeepraisevaluereturnc                    s:   | dkr	 t jS  dkr t jS dkrt jS t |S )Nr         )rC   IGNOREKEEPRAISEsuper	_missing_)clsrG   	__class__r   r    rO   y   s   

zSpecialTokenPolicy._missing_)r   r   r   r   rK   rL   rM   classmethodr   rO   __classcell__r   r   rQ   r    rC   l   s     rC   c                   @   s   e Zd ZdZedefddZedefddZdddefd	d
Z	dddefddZ
dddefddZdddefddZdZdZdZdZdZdZdZdS )TokenizerVersiona  Enum of tokenizer versions.

    Allow to distinguish between different versions of the tokenizer and maintain backward compatibility.

    Attributes:
        v1: The first version of the tokenizer.
        v2: The second version of the tokenizer that includes special control tokens [INST], [\INST].
        v3: The third version of the tokenizer that includes improved function calling.
        v7: The seventh version of the tokenizer that includes improved system prompt and function calling.
        v11: The eleventh version of the tokenizer that includes improved function calling.
        v13: The thirteenth version of the tokenizer that includes no call id tokenization and better prompt caching.

    Examples:
        >>> version = TokenizerVersion.v1
    rH   c                 C   s   t | jdd  S )NrI   )intrG   selfr   r   r    _version_num   s   zTokenizerVersion._version_numc                 C   s
   | t jkS N)rU   v15rW   r   r   r    supports_model_settings   s   
z(TokenizerVersion.supports_model_settingsotherzstr | TokenizerVersionc                 C   s   t |tr	t|}| j|jk S rZ   
isinstancestrrU   rY   rX   r]   r   r   r    __lt__   s   
zTokenizerVersion.__lt__c                 C   s"   t |trt|}| j|jkS d S rZ   r^   ra   r   r   r    __le__      
zTokenizerVersion.__le__c                 C   s"   t |trt|}| j|jkS d S rZ   r^   ra   r   r   r    __gt__   rd   zTokenizerVersion.__gt__c                 C   s"   t |trt|}| j|jkS d S rZ   r^   ra   r   r   r    __ge__   rd   zTokenizerVersion.__ge__v1v2v3v7v11v13r[   N)r   r   r   r   propertyrV   rY   boolr\   rb   rc   re   rf   rg   rh   ri   rj   rk   rl   r[   r   r   r   r    rU      s"    rU   c                   @   sz   e Zd ZU dZeddZee ed< dZ	e
dB ed< dZee dB ed< eedZeej ed	< eedZee ed
< dS )	Tokenizeda  A tokenized [`InstructRequest`][mistral_common.protocol.instruct.request.InstructRequest].

    Attributes:
        tokens: The token ids.
        text: The text representation of the tokens.
        prefix_ids: The prefix ids for FIM.
        images: The loaded images associated with the tokens.

    Examples:
        >>> tokenized = Tokenized(tokens=[1, 2, 3], text="Hello world", prefix_ids=[1], images=[])
    T)arbitrary_types_allowedtokensNtext
prefix_ids)default_factoryimagesaudios)r   r   r   r   r	   model_configlistrV   __annotations__rr   r`   rs   r
   ru   npndarrayrv   r   r   r   r   r    ro      s   
 
ro   c                
   @   s  e Zd ZeedefddZeedee fddZeedefddZ	eede
dB fd	d
Zedee fddZededefddZeedefddZeedefddZeedefddZeedefddZededededee fddZeejfdee dedefdd Zededefd!d"Zed#eejB eB defd$d%Zeedefd&d'Zedee defd(d)Zeede fd*d+Z!dS ),	TokenizerrH   c                 C      dS )z!Vocabulary size of the tokenizer.Nr   rW   r   r   r    n_words       zTokenizer.n_wordsc                 C   r}   )zIds of the special tokens.Nr   rW   r   r   r    special_ids   r   zTokenizer.special_idsc                 C   r}   )z.The number of special tokens of the tokenizer.Nr   rW   r   r   r    num_special_tokens   r   zTokenizer.num_special_tokensNc                 C   r}   )zCThe model settings builder, or None if unsupported by this version.Nr   rW   r   r   r    model_settings_builder   r   z Tokenizer.model_settings_builderc                 C   r}   )z(All tokens in the vocabulary as strings.Nr   rW   r   r   r    vocab   r   zTokenizer.vocabtoken_idc                 C   r}   )z$Convert a token id to the token str.Nr   )rX   r   r   r   r    id_to_piece   r   zTokenizer.id_to_piecec                 C   r}   )z$id of the Beginning of String token.Nr   rW   r   r   r    bos_id   r   zTokenizer.bos_idc                 C   r}   )zid of the End of String token.Nr   rW   r   r   r    eos_id   r   zTokenizer.eos_idc                 C   r}   )zid of the Pad token.Nr   rW   r   r   r    pad_id   r   zTokenizer.pad_idc                 C   r}   )zid of the Unk token.Nr   rW   r   r   r    unk_id   r   zTokenizer.unk_idsr#   r$   c                 C   r}   )z(Convert a string to a list of token ids.Nr   )rX   r   r#   r$   r   r   r    encode  r   zTokenizer.encoderq   special_token_policyc                 C   r}   )zDecode the token ids to a string.

        Args:
            tokens: The token ids to decode.
            special_token_policy: The policy to use for special tokens.

        Returns:
            The decoded string.
        Nr   rX   rq   r   r   r   r    decode  r   zTokenizer.decodec                 C   r}   )zGet the id of a control token.Nr   )rX   r   r   r   r    get_special_token  r   zTokenizer.get_special_tokentokenc                 C   r}   )z2Check if token id or token str is a special token.Nr   )rX   r   r   r   r    
is_special  r   zTokenizer.is_specialc                 C   r}   )z!Get the version of the tokenizer.Nr   rW   r   r   r    version  r   zTokenizer.versionc                 C      d S rZ   r   rX   rq   r   r   r    
_to_string     zTokenizer._to_stringc                 C   r}   )zThe file path of the tokenizer.Nr   rW   r   r   r    	file_path"  s   zTokenizer.file_path)"r   r   r   rm   r   rV   r~   setr   r   r   r   rx   r`   r   r   r   r   r   r   rn   r   rC   rK   r   r   rz   integerr   rU   r   r   r   r   r   r   r   r    r|      sZ     "r|   InstructRequestType)boundFIMRequestTypeTokenizedTypec                   @   s  e Zd ZU dZeed< edB ed< edB ed< ede	fddZ
dededB dedB ddfd	d
ZededefddZededefddZededefddZedee dedefddZededefddZe		d&dedee dB dedededB dedeee eej  ee! f fdd Z"e		d&d!eee# B dededB dedeee eej  ee! f f
d"d#Z$edee defd$d%Z%dS )'InstructTokenizerzBase class for instruct tokenizers.

    Attributes:
        tokenizer: The tokenizer to use.
        image_encoder: The image encoder to use if any.
        audio_encoder: The audio encoder to use if any.
    	tokenizerNimage_encoderaudio_encoderrH   c                 C   s   | j jS )zThe version of the tokenizer.)r   r   rW   r   r   r    r   ;  s   zInstructTokenizer.versionc                 C   r}   )zInitialize the instruct tokenizer.

        Args:
            tokenizer: The tokenizer to use.
            image_encoder: The image encoder to use if any.
            audio_encoder: The audio encoder to use if any.
        Nr   )rX   r   r   r   r   r   r    __init__@  r   zInstructTokenizer.__init__requestc                 C   r}   )zInstruct request to Tokenized object

        Args:
            request: The instruct request to encode.

        Returns:
            The tokenized instruct request.
        Nr   rX   r   r   r   r    encode_instructK  r   z!InstructTokenizer.encode_instructc                 C   r}   )a  
        Encodes an audio transcription request into a tokenized format.

        This method processes a transcription request containing audio data,
        encodes the user message, and returns the tokenized output.

        Args:
            request: The transcription request object containing
                the audio data to be encoded.

        Returns:
            Tokenized: The tokenized representation of the audio data, including processed audio and tokens
        Nr   r   r   r   r    encode_transcriptionV  r   z&InstructTokenizer.encode_transcriptionc                 C   r}   )a  Encodes a speech synthesis request into a tokenized format.

        This method processes a speech request containing text input and
        optional reference audio or voice preset, and returns the tokenized output.

        Args:
            request: The speech request object containing the text and voice/audio data.

        Returns:
            Tokenized: The tokenized representation of the speech request.
        Nr   r   r   r   r    encode_speech_requestf  r   z'InstructTokenizer.encode_speech_requestrq   r   c                 C   r}   )zConvert token ids to string

        Args:
            tokens: The token ids to decode.
            special_token_policy: The policy to use for special tokens.

        Returns:
            The decoded string.
        Nr   r   r   r   r    r   t  r   zInstructTokenizer.decodec                 C   r}   )zFIM request to Tokenized object

        Args:
            request: The FIM request to encode.

        Returns:
            The tokenized FIM request.
        Nr   r   r   r   r    
encode_fim  r   zInstructTokenizer.encode_fimFmessageavailable_toolsis_lastis_firstsystem_promptforce_img_firstc                 C   r}   )a  Encode a user message.

        Args:
            message: The user message to encode.
            available_tools: The available tools.
            is_last: Whether the message is the last one.
            is_first: Whether the message is the first one.
            system_prompt: The system prompt.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and images.
        Nr   )rX   r   r   r   r   r   r   r   r   r    encode_user_message  s   z%InstructTokenizer.encode_user_messagecontentc                 C   r}   )aI  Encode a user content.

        Args:
            content: The user content to encode.
            is_last: Whether the content is the last one.
            system_prompt: The system prompt.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and images.
        Nr   )rX   r   r   r   r   r   r   r    encode_user_content  s   z%InstructTokenizer.encode_user_contentc                 C   r   rZ   r   r   r   r   r    r     r   zInstructTokenizer._to_string)NF)&r   r   r   r   r|   ry   r   r   rm   rU   r   r   r   r   r   r   r   r   r   r   rx   rV   rC   r`   r   r   r   r   r   rn   tuplerz   r{   r   r   r   r   r   r   r   r   r    r   .  sx   
 




r   )4abcr   r   enumr   pathlibr   typingr   r   r   numpyrz   pydanticr	   r
   mistral_common.audior   mistral_common.baser   #mistral_common.protocol.fim.requestr   &mistral_common.protocol.instruct.chunkr   )mistral_common.protocol.instruct.messagesr   r   (mistral_common.protocol.instruct.requestr   +mistral_common.protocol.instruct.tool_callsr   &mistral_common.protocol.speech.requestr   -mistral_common.protocol.transcription.requestr   &mistral_common.tokens.tokenizers.audior   &mistral_common.tokens.tokenizers.imager   7mistral_common.tokens.tokenizers.model_settings_builderr   r`   r   r!   rC   rU   ro   r|   r   r   r   r   r   r   r   r    <module>   s8    K6X 