o
    iI                     @   sv  U d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZmZ d dlmZ d d	lmZmZmZmZmZ d d
lmZ d dlmZm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8 d dl9m:Z:m;Z; d dl<m=Z= de+de:e6B de,fddZ>dede:de fddZ?G dd deeeeee(f Z@i ddd de@jAd d!d d"e@jBd#e@jAd$e@jBd%e@jAd&d'd d(e@jCd)e@jAd*e@jBd+e@jDd,d-d d.e@jBd/e@jBd0d1d d2e@jCe@jCe@jCe@jAe@jAe@jBe@jBe@jBe@jBd3d d4d d5d d6ZEeFeGeg e@f f eHd7< dS )8    N)Path)AnyCallableGeneric)TokenizerException)
FIMRequest)UATSAssistantMessageTypeSystemMessageTypeToolMessageTypeUserMessageType)InstructRequestNormalizer normalizer_for_tokenizer_version)ChatCompletionRequest)MistralRequestValidatorMistralRequestValidatorV3MistralRequestValidatorV5MistralRequestValidatorV13ValidationMode)TranscriptionRequest)AudioConfigAudioEncoderSpecialAudioIDs)InstructRequestInstructRequestTypeInstructTokenizerSpecialTokenPolicySpecialTokensTokenizedTypeTokenizerVersion)ImageConfigImageEncoderSpecialImageIDs)InstructTokenizerV1InstructTokenizerV2InstructTokenizerV3InstructTokenizerV7InstructTokenizerV11InstructTokenizerV13)SentencePieceTokenizerget_image_configis_sentencepiece)
Tekkenizer	is_tekken)download_tokenizer_from_hf_hubimage_config	tokenizerreturnc                 C   s6   t |tjj|tjj|tjjd}t| |S )zLoad a image encoder from a config and a tokenizer.

    Args:
        image_config: The image config.
        tokenizer: The tokenizer.

    Returns:
        The image encoder.
    )img	img_breakimg_end)r"   get_special_tokenr   r2   valuer3   r4   r!   )r/   r0   special_ids r8   \/home/ubuntu/.local/lib/python3.10/site-packages/mistral_common/tokens/tokenizers/mistral.pyload_image_encoder:   s   

r:   audio_configc                    sJ   dt dtdB f fdd}t|tjj|tjj|tjjd}t| |S )zLoad a audio encoder from a config and a tokenizer.

    Args:
        audio_config: The audio config.
        tokenizer: The tokenizer.

    Returns:
        The audio encoder.
    tokenr1   Nc                    s     | sd S  | S N)
is_specialr5   )r<   r0   r8   r9   get_special_token_or_noneW   s   

z5load_audio_encoder.<locals>.get_special_token_or_none)audiobegin_audiostreaming_pad)	strintr   r   rA   r6   rB   rC   r   )r;   r0   r@   r7   r8   r?   r9   load_audio_encoderL   s   



rF   c                   @   s  e Zd ZdZdeeeeef de	e
eeef dee
eeeef fddZdeeeedf f fd	d
ZedefddZedefddZedefddZed:ddZed:ddZed;dededd fddZed<dedd fddZ ed<de!dedd fddZ"e#d d ddej$fd!e!d"ee!B d B d#e!d B d$ed%ed&edd fd'd(Z%eej$fd)e!eB d&edd fd*d+Z&	 d=d,e'e( d-e)d B defd.d/Z*d,e+defd0d1Z,d,edefd2d3Z-d=d4e.e) d5e/d B de!fd6d7Z0d4e.e) de!fd8d9Z1d S )>MistralTokenizerag  Mistral tokenizer.

    This class is a wrapper around a [InstructTokenizer][mistral_common.tokens.tokenizers.base.InstructTokenizer],
    a [MistralRequestValidator][mistral_common.protocol.instruct.validator.MistralRequestValidator] and a
    [InstructRequestNormalizer][mistral_common.protocol.instruct.normalize.InstructRequestNormalizer].

    It provides a convenient interface to tokenize, validate ad normalize Mistral requests.

    Attributes:
        instruct_tokenizer: The instruct tokenizer to use. See
            [InstructTokenizer][mistral_common.tokens.tokenizers.instruct.InstructTokenizer].
    instruct_tokenizer	validatorrequest_normalizerc                 C   s   || _ || _|| _dS )zInitializes a `MistralTokenizer`.

        Args:
            instruct_tokenizer: The instruct tokenizer to use.
            validator: The request validator to use.
            request_normalizer: The request normalizer to use.
        N)"_chat_completion_request_validator_instruct_request_normalizerrH   )selfrH   rI   rJ   r8   r8   r9   __init__u   s   zMistralTokenizer.__init__r1   .c                 C   s   t j| jjj| jffS )z
        Provides a recipe for pickling (serializing) this object, which is necessary for use with multiprocessing.

        Returns:
            A tuple of the factory function and the arguments to reconstruct the object from its source file.
        )rG   	from_filerH   r0   	file_pathmoderM   r8   r8   r9   
__reduce__   s   zMistralTokenizer.__reduce__c                 C   s   | j jS )z%The validation mode of the tokenizer.)rK   rQ   rR   r8   r8   r9   rQ      s   zMistralTokenizer.modec                 C   s
   | j jjS )zThe version of the tokenizer.)rH   r0   versionrR   r8   r8   r9   rT      s   
zMistralTokenizer.versionc                 C   s   t tjd d S )N   data)r   __file__parentsclsr8   r8   r9   
_data_path   s   zMistralTokenizer._data_pathc                 C      | j t|  d tjdS )zGet the Mistral tokenizer v1.ztokenizer.model.v1rQ   rO   rD   r[   r   testrY   r8   r8   r9   v1   s   zMistralTokenizer.v1c                 C   r\   )zGet the Mistral tokenizer v2.z*mistral_instruct_tokenizer_240216.model.v2r]   r^   rY   r8   r8   r9   v2   s   zMistralTokenizer.v2Fr-   is_mmc                 C   sL   |r|rd}n|r|sd}n
|s|rt dd}| jt|  | tjdS )a;  Get the Mistral tokenizer v3.

        Args:
            is_tekken: Whether the tokenizer is a tekken tokenizer. See
                [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer].
            is_mm: Whether to load image tokenizer.

        Returns:
            The Mistral tokenizer v3.
        ztekken_240911.jsonztekken_240718.jsonz;Multimodal tokenizer is currently only supported for tekkenz*mistral_instruct_tokenizer_240323.model.v3r]   )
ValueErrorrO   rD   r[   r   r_   )rZ   r-   rb   tokenizer_namer8   r8   r9   v3   s   zMistralTokenizer.v3c                 C   s<   |r| j t|  d tjdS | j t|  d tjdS )zGet the Mistral tokenizer v7.

        Args:
            is_mm: Whether to load the image tokenizer.

        Returns:
            The Mistral tokenizer v7.
        z,mistral_instruct_tokenizer_241114.model.v7m1r]   z*mistral_instruct_tokenizer_241114.model.v7r^   )rZ   rb   r8   r8   r9   v7   s   
zMistralTokenizer.v7modelstrictc                 C   sX   |st dt t D ]\}}|| v r|   S q|tvr'td| t|  S )ax  Get the Mistral tokenizer for a given model.

        Args:
            model: The model name.
            strict: Whether to use strict model name matching. If `False`, the model name is matched as a substring.
                This is deprecated and will be removed in `mistral_common=1.10.0`.

        Returns:
            The Mistral tokenizer for the given model.
        a  Calling `MistralTokenizer.from_model(..., strict=False)` is deprecated as it can lead to incorrect tokenizers. It is strongly recommended to use MistralTokenizer.from_model(..., strict=True)` which will become the default in `mistral_common=1.10.0`.If you are using `mistral_common` for open-sourced model weights, we recommend using `MistralTokenizer.from_file('<path/to/tokenizer/file>')` instead.zUnrecognized model: )warningswarnFutureWarningMODEL_NAME_TO_TOKENIZER_CLSitemslowerr   )rZ   rg   rh   
model_nametokenizer_clsr8   r8   r9   
from_model   s   

zMistralTokenizer.from_modelNrepo_idr<   revisionforce_downloadlocal_files_onlyrQ   c                 C   s    t | ||||d}tj||dS )aO  Download the Mistral tokenizer for a given Hugging Face repository ID.

        See [here](https://huggingface.co/mistralai/models) for a list of our OSS models.

        Args:
            repo_id: The Hugging Face repo ID.
            token: The Hugging Face token to use to download the tokenizer.
            revision: The revision of the model to use. If `None`, the latest revision will be used.
            mode: The validation mode to use.
            force_download: Whether to force the download of the tokenizer. If `True`, the tokenizer will be downloaded
                even if it is already cached.
            local_files_only: Whether to only use local files. If `True`, the tokenizer will be downloaded only if it is
                already cached.

        Returns:
            The Mistral tokenizer for the given model.
        )rr   r<   rs   rt   ru   r]   )r.   rG   rO   )rr   r<   rs   rt   ru   rQ   tokenizer_pathr8   r8   r9   from_hf_hub   s   zMistralTokenizer.from_hf_hubtokenizer_filenamec           	      C   s  t |rt|}|j}|j}nt|rt|}t|}d}ntd| |dur/t	||nd}d}|durEt
|ts@J dt||}t|j}|jtjkrl|du sXJ d|du s`J dtt|t|d|dS |jtjkr|du szJ d|du sJ dtt|t|d|dS |jtjkr|du sJ dtt||dt|d|dS |jtjkrtt|||d	t|d|dS |jtjkrtt|||d	t|d|dS |jtjkrtt|||d	t|d|dS td
| )zLoads a tokenizer from a file.

        Args:
            tokenizer_filename: The path to the tokenizer file.
            mode: The validation mode to use.

        Returns:
            The loaded tokenizer.
        NzUnrecognized tokenizer file: z-Audio is only supported for tekken tokenizersz#Tokenizer version needs to be >= v3z#Tokenizer version needs to be >= v7r]   )rI   rJ   )image_encoder)ry   audio_encoderz!Unrecognized tokenizer filename: ) r-   r,   rO   imagerA   r+   r)   r*   r   r:   
isinstancerF   r   rT   r   r`   rG   r#   r   ra   r$   re   r%   r   rf   r&   r   v11r'   v13r(   r   )	rZ   rx   rQ   r0   r/   r;   ry   rz   rJ   r8   r8   r9   rO     sr   



zMistralTokenizer.from_filerequestmax_model_input_lenc                 C   sF   | j |}|du r|jrtd| j|}|jr||_| j|S )aD  Encodes a chat completion request.

        Args:
            request: The chat completion request to encode.
            max_model_input_len: The maximum length of the input to the model.
                If `None`, the input will not be truncated.

        Returns:
            The encoded chat completion request.
        NzUencoding a chat completion request with truncation, but no max model len was provided)	rK   validate_requesttruncate_for_context_lengthr   rL   from_chat_completion_requesttruncate_at_max_tokensrH   encode_instruct)rM   r   r   validated_requestinstruct_requestr8   r8   r9   encode_chat_completionm  s   z'MistralTokenizer.encode_chat_completionc                 C      | j |S )zEncodes a transcription request.

        Args:
            request: The transcription request to encode.

        Returns:
            The encoded transcription request.
        )rH   encode_transcriptionrM   r   r8   r8   r9   r        	z%MistralTokenizer.encode_transcriptionc                 C   r   )zEncodes a fill in the middle request.

        Args:
            request: The fill in the middle request to encode.

        Returns:
            The encoded fill in the middle request.
        )rH   
encode_fimr   r8   r8   r9   r     r   zMistralTokenizer.encode_fimtokensspecial_token_policyc                 C   s   | j j||dS )a_  Decodes a list of tokens into a string.

        Args:
            tokens: The tokens to decode.
            special_token_policy: The policy to use for special tokens. Passing `None` is deprecated and will be changed
                to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        )r   )rH   decode)rM   r   r   r8   r8   r9   r     s   zMistralTokenizer.decodec                 C   r   r=   )rH   
_to_string)rM   r   r8   r8   r9   r     s   zMistralTokenizer._to_string)r1   rG   )FF)Fr=   )2__name__
__module____qualname____doc__r   r   r   r   r	   r   r   r   r
   r   r   rN   tupler   r   rS   propertyr   rQ   r   rT   classmethodr   r[   r`   ra   boolre   rf   rD   rq   staticmethodr_   rw   rO   r   r   rE   r   r   r   r   listr   r   r   r8   r8   r8   r9   rG   e   s    
	 
"R
 rG   zministral-8b-2410c                   C      t jddS NT)r-   rG   re   r8   r8   r8   r9   <lambda>      r   zmistral-tiny-2312zopen-mistral-nemo-2407c                   C   r   r   r   r8   r8   r8   r9   r     r   zmistral-tiny-2407zmistral-small-2312zopen-mixtral-8x22b-2404zmistral-small-2402zmistral-small-2409c                   C   r   r   r   r8   r8   r8   r9   r     r   zmistral-medium-2312zmistral-large-2402zmistral-large-2407zmistral-large-2411zpixtral-large-2411c                   C   r   NT)rb   rG   rf   r8   r8   r8   r9   r     r   zcodestral-2405zcodestral-mamba-2407zpixtral-12b-2409c                   C      t jdddS NT)r-   rb   r   r8   r8   r8   r9   r         zopen-mistral-7bc                   C   r   r   r   r8   r8   r8   r9   r     r   c                   C   r   r   r   r8   r8   r8   r9   r     r   c                   C   r   r   r   r8   r8   r8   r9   r     r   )zopen-mixtral-8x7bzmistral-embedzmistral-small-v1zmistral-large-v1zmistral-smallzmistral-largezopen-mixtral-8x22bzcodestral-22bzmistral-nemopixtralzpixtral-largerl   )Iri   pathlibr   typingr   r   r   mistral_common.exceptionsr   #mistral_common.protocol.fim.requestr   )mistral_common.protocol.instruct.messagesr   r	   r
   r   r   *mistral_common.protocol.instruct.normalizer   r   (mistral_common.protocol.instruct.requestr   *mistral_common.protocol.instruct.validatorr   r   r   r   r   -mistral_common.protocol.transcription.requestr   &mistral_common.tokens.tokenizers.audior   r   r   %mistral_common.tokens.tokenizers.baser   r   r   r   r   r   r   &mistral_common.tokens.tokenizers.imager    r!   r"   )mistral_common.tokens.tokenizers.instructr#   r$   r%   r&   r'   r(   .mistral_common.tokens.tokenizers.sentencepiecer)   r*   r+   'mistral_common.tokens.tokenizers.tekkenr,   r-   &mistral_common.tokens.tokenizers.utilsr.   r:   rF   rG   ra   re   r`   rf   rl   dictrD   __annotations__r8   r8   r8   r9   <module>   s   
 $	 
  P	
$