o
    *i8                     @   s|  d dl mZmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ G dd de%eZ&G dd de%eZ'G dd de%eZ(G dd de%eZ)G dd deZ*G dd deZ+e
dedZ,e
dedZ-e
d e*dZ.G d!d" d"e	e,e-e.ef Z/dS )#    )ABCabstractmethod)Enum)Path)AnyGenericTypeVarN)
ConfigDictField)Audio)MistralBase)
FIMRequest)UserContentChunk)AssistantMessageTypeUserMessage)InstructRequest)Tool)TranscriptionRequest)AudioEncoder)ImageEncoderc                   @   s   e Zd ZdZdZdZdS )UserMessagePositionzWhere to encode available toolsfirstlastN)__name__
__module____qualname____doc__r   r    r   r   b/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/mistral_common/tokens/tokenizers/base.pyr      s    r   c                   @   s   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZ dS ) SpecialTokensa(  Enum of special tokens used in the tokenizer.

    Attributes:
        unk: The unknown token.
        bos: The beginning of string token.
        eos: The end of string token.
        begin_inst: The beginning of instruction token.
        end_inst: The end of instruction token.
        begin_tools: The beginning of tools token.
        end_tools: The end of tools token.
        begin_tool_results: The beginning of tool results token.
        end_tool_results: The end of tool results token.
        tool_calls: The tool calls token.
        img: The image token.
        pad: The pad token.
        img_break: The image break token.
        img_end: The image end token.
        prefix: The prefix token for FIM.
        middle: The middle token for FIM.
        suffix: The suffix token for FIM.
        begin_system: The beginning of system prompt token.
        end_system: The end of system prompt token.
        begin_tool_content: The beginning of tool content token.
        args: The args token.
        call_id: The call id token.
        audio: The audio token.
        begin_audio: The beginning of audio token.
        transcribe: The transcribe token.
        begin_think: The beginning of think token.
        end_think: The end of think token.

    Examples:
        >>> unk = SpecialTokens.unk
    z<unk>z<s>z</s>z[INST]z[/INST]z[AVAILABLE_TOOLS]z[/AVAILABLE_TOOLS]z[TOOL_RESULTS]z[/TOOL_RESULTS]z[TOOL_CALLS]z[IMG]z<pad>z[IMG_BREAK]z	[IMG_END]z[PREFIX]z[MIDDLE]z[SUFFIX]z[SYSTEM_PROMPT]z[/SYSTEM_PROMPT]z[TOOL_CONTENT]z[ARGS]z	[CALL_ID]z[AUDIO]z[BEGIN_AUDIO]z[TRANSCRIBE]z[THINK]z[/THINK]z[STREAMING_PAD]z[STREAMING_WORD]N)!r   r   r   r   unkboseos
begin_instend_instbegin_tools	end_toolsbegin_tool_resultsend_tool_results
tool_callsimgpad	img_breakimg_endprefixmiddlesuffixbegin_system
end_systembegin_tool_contentargscall_idaudiobegin_audio
transcribebegin_think	end_thinkstreaming_padstreaming_wordr   r   r   r   r      s>    #r   c                       s:   e Zd ZdZdZdZdZededef fddZ	  Z
S )	SpecialTokenPolicyzWhat to do with special tokens when encoding/decoding.

    Attributes:
        IGNORE: Ignore special tokens.
        KEEP: Keep special tokens.
        RAISE: Raise an error if special tokens are found.
    ignorekeepraisevaluereturnc                    s:   | dkr	 t jS  dkr t jS dkrt jS t |S )Nr         )r=   IGNOREKEEPRAISEsuper	_missing_)clsrA   	__class__r   r   rI   o   s   

zSpecialTokenPolicy._missing_)r   r   r   r   rE   rF   rG   classmethodr   rI   __classcell__r   r   rK   r   r=   b   s     r=   c                   @   s   e Zd ZdZedefddZdddefddZdddefd	d
Z	dddefddZ
dddefddZdZdZdZdZdZdZdS )TokenizerVersiona  Enum of tokenizer versions.

    Allow to distinguish between different versions of the tokenizer and maintain backward compatibility.

    Attributes:
        v1: The first version of the tokenizer.
        v2: The second version of the tokenizer that includes special control tokens [INST], [\INST].
        v3: The third version of the tokenizer that includes improved function calling.
        v7: The seventh version of the tokenizer that includes improved system prompt and function calling.
        v11: The eleventh version of the tokenizer that includes improved function calling.
        v13: The thirteenth version of the tokenizer that includes no call id tokenization and better prompt caching.

    Examples:
        >>> version = TokenizerVersion.v1
    rB   c                 C   s   t | jdd  S )NrC   )intrA   selfr   r   r   _version_num   s   zTokenizerVersion._version_numotherzstr | TokenizerVersionc                 C   s   t |tr	t|}| j|jk S N
isinstancestrrO   rS   rR   rT   r   r   r   __lt__   s   
zTokenizerVersion.__lt__c                 C   s"   t |trt|}| j|jkS d S rU   rV   rY   r   r   r   __le__      
zTokenizerVersion.__le__c                 C   s"   t |trt|}| j|jkS d S rU   rV   rY   r   r   r   __gt__   r\   zTokenizerVersion.__gt__c                 C   s"   t |trt|}| j|jkS d S rU   rV   rY   r   r   r   __ge__   r\   zTokenizerVersion.__ge__v1v2v3v7v11v13N)r   r   r   r   propertyrP   rS   boolrZ   r[   r]   r^   r_   r`   ra   rb   rc   rd   r   r   r   r   rO   |   s    rO   c                   @   sz   e Zd ZU dZeddZee ed< dZ	e
dB ed< dZee dB ed< eedZeej ed	< eedZee ed
< dS )	Tokenizeda  A tokenized [`InstructRequest`][mistral_common.tokens.instruct.request].

    Attributes:
        tokens: The token ids.
        text: The text representation of the tokens.
        prefix_ids: The prefix ids for FIM.
        images: The loaded images associated with the tokens.

    Examples:
        >>> tokenized = Tokenized(tokens=[1, 2, 3], text="Hello world", prefix_ids=[1], images=[])
    T)arbitrary_types_allowedtokensNtext
prefix_ids)default_factoryimagesaudios)r   r   r   r   r	   model_configlistrP   __annotations__rj   rX   rk   r
   rm   npndarrayrn   r   r   r   r   r   rg      s   
 
rg   c                
   @   s  e Zd ZeedefddZeedee fddZeedefddZ	ede
e fdd	Zed
edefddZeedefddZeedefddZeedefddZeedefddZedededede
e fddZed,de
e dedB defddZededefdd Zed!eejB eB defd"d#Zeedefd$d%Zede
e defd&d'Zede
e defd(d)Zeedefd*d+ZdS )-	TokenizerrB   c                 C      dS )z!Vocabulary size of the tokenizer.Nr   rQ   r   r   r   n_words       zTokenizer.n_wordsc                 C   ru   )zIds of the special tokens.Nr   rQ   r   r   r   special_ids   rw   zTokenizer.special_idsc                 C   ru   )z.The number of special tokens of the tokenizer.Nr   rQ   r   r   r   num_special_tokens   rw   zTokenizer.num_special_tokensc                 C   ru   )z(All tokens in the vocabulary as strings.Nr   rQ   r   r   r   vocab   rw   zTokenizer.vocabtoken_idc                 C   ru   )z$Convert a token id to the token str.Nr   )rR   r{   r   r   r   id_to_piece   rw   zTokenizer.id_to_piecec                 C   ru   )z$id of the Beginning of String token.Nr   rQ   r   r   r   bos_id   rw   zTokenizer.bos_idc                 C   ru   )zid of the End of String token.Nr   rQ   r   r   r   eos_id   rw   zTokenizer.eos_idc                 C   ru   )zid of the Pad token.Nr   rQ   r   r   r   pad_id   rw   zTokenizer.pad_idc                 C   ru   )zid of the Unk token.Nr   rQ   r   r   r   unk_id   rw   zTokenizer.unk_idsr!   r"   c                 C   ru   )z(Convert a string to a list of token ids.Nr   )rR   r   r!   r"   r   r   r   encode   rw   zTokenizer.encodeNri   special_token_policyc                 C   ru   )a  Decode the token ids to a string.

        Args:
            tokens: The token ids to decode.
            special_token_policy: The policy to use for special tokens.
                Passing `None` will default to `self._special_token_policy` for
                [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and `SpecialTokenPolicy.IGNORE`
                for [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
                Note that passing `None` will be deprecated and `special_token_policy` will default to
                `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        Nr   rR   ri   r   r   r   r   decode   rw   zTokenizer.decodec                 C   ru   )zGet the id of a control token.Nr   )rR   r   r   r   r   get_special_token  rw   zTokenizer.get_special_tokentokenc                 C   ru   )z2Check if token id or token str is a special token.Nr   )rR   r   r   r   r   
is_special  rw   zTokenizer.is_specialc                 C   ru   )z!Get the version of the tokenizer.Nr   rQ   r   r   r   version  rw   zTokenizer.versionc                 C   ru   )z[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

        Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

        This is a convenient method for debugging.
        Nr   rR   ri   r   r   r   	to_string  s   zTokenizer.to_stringc                 C      d S rU   r   r   r   r   r   
_to_string     zTokenizer._to_stringc                 C   ru   )zThe file path of the tokenizer.Nr   rQ   r   r   r   	file_path  s   zTokenizer.file_pathrU   ) r   r   r   re   r   rP   rv   setrx   ry   rp   rX   rz   r|   r}   r~   r   r   rf   r   r=   r   r   rr   integerr   rO   r   r   r   r   r   r   r   r   r   rt      sX     "	rt   InstructRequestType)boundFIMRequestTypeTokenizedTypec                   @   s  e Zd ZU dZeed< edB ed< edB ed< ede	fddZ
dededB dedB ddfd	d
ZededefddZededefddZed$dee dedB defddZededefddZe		d%dedee dB dedededB dedeee eej ee f fddZ e		d%deee! B dededB dedeee eej ee f f
d d!Z"edee defd"d#Z#dS )&InstructTokenizerzBase class for instruct tokenizers.

    Attributes:
        tokenizer: The tokenizer to use.
        image_encoder: The image encoder to use if any.
        audio_encoder: The audio encoder to use if any.
    	tokenizerNimage_encoderaudio_encoderrB   c                 C   s   | j jS )zThe version of the tokenizer.)r   r   rQ   r   r   r   r   6  s   zInstructTokenizer.versionc                 C   ru   )zInitialize the instruct tokenizer.

        Args:
            tokenizer: The tokenizer to use.
            image_encoder: The image encoder to use if any.
            audio_encoder: The audio encoder to use if any.
        Nr   )rR   r   r   r   r   r   r   __init__;  rw   zInstructTokenizer.__init__requestc                 C   ru   )zInstruct request to Tokenized object

        Args:
            request: The instruct request to encode.

        Returns:
            The tokenized instruct request.
        Nr   rR   r   r   r   r   encode_instructF  rw   z!InstructTokenizer.encode_instructc                 C   ru   )a  
        Encodes an audio transcription request into a tokenized format.

        This method processes a transcription request containing audio data,
        encodes the user message, and returns the tokenized output.

        Args:
            request: The transcription request object containing
                the audio data to be encoded.

        Returns:
            Tokenized: The tokenized representation of the audio data, including processed audio and tokens
        Nr   r   r   r   r   encode_transcriptionQ  s   z&InstructTokenizer.encode_transcriptionri   r   c                 C   ru   )a  Convert token ids to string

        Args:
            tokens: The token ids to decode.
            special_token_policy: The policy to use for special tokens.
                Passing `None` will default to `self._special_token_policy` for
                [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and `SpecialTokenPolicy.IGNORE`
                for [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
                Note that passing `None` will be deprecated and `special_token_policy` will default to
                `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        Nr   r   r   r   r   r   b  rw   zInstructTokenizer.decodec                 C   ru   )zFIM request to Tokenized object

        Args:
            request: The FIM request to encode.

        Returns:
            The tokenized FIM request.
        Nr   r   r   r   r   
encode_fims  rw   zInstructTokenizer.encode_fimFmessageavailable_toolsis_lastis_firstsystem_promptforce_img_firstc                 C   ru   )a  Encode a user message.

        Args:
            message: The user message to encode.
            available_tools: The available tools.
            is_last: Whether the message is the last one.
            is_first: Whether the message is the first one.
            system_prompt: The system prompt.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and images.
        Nr   )rR   r   r   r   r   r   r   r   r   r   encode_user_message~  s   z%InstructTokenizer.encode_user_messagecontentc                 C   ru   )aI  Encode a user content.

        Args:
            content: The user content to encode.
            is_last: Whether the content is the last one.
            system_prompt: The system prompt.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and images.
        Nr   )rR   r   r   r   r   r   r   r   encode_user_content  s   z%InstructTokenizer.encode_user_contentc                 C   r   rU   r   r   r   r   r   r     r   zInstructTokenizer._to_stringrU   )NF)$r   r   r   r   rt   rq   r   r   re   rO   r   r   r   r   r   r   r   r   rp   rP   r=   rX   r   r   r   r   r   rf   tuplerr   rs   r   r   r   r   r   r   r   r   r   r   )  st   
 

"


r   )0abcr   r   enumr   pathlibr   typingr   r   r   numpyrr   pydanticr	   r
   mistral_common.audior   mistral_common.baser   #mistral_common.protocol.fim.requestr   &mistral_common.protocol.instruct.chunkr   )mistral_common.protocol.instruct.messagesr   r   (mistral_common.protocol.instruct.requestr   +mistral_common.protocol.instruct.tool_callsr   -mistral_common.protocol.transcription.requestr   &mistral_common.tokens.tokenizers.audior   &mistral_common.tokens.tokenizers.imager   rX   r   r   r=   rO   rg   rt   r   r   r   r   r   r   r   r   <module>   s4    C1b 