o
    c۷iE                  	   @   s6  U d dl mZ d dlmZmZmZ d dlmZ d dlm	Z	 d dl
mZmZmZmZmZ d dlmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZmZm Z  d dl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8 d dl9m:Z:m;Z; d dl<m=Z= de*de:e6B de+fddZ>dede:defddZ?G dd deeeeee'f Z@i ddd de@jAd d!d d"e@jBd#e@jAd$e@jBd%e@jAd&d'd d(e@jCd)e@jAd*e@jBd+e@jDd,d-d d.e@jBd/e@jBd0d1d ZEeFeGeg e@f f eHd2< d3S )4    )Path)AnyCallableGeneric)TokenizerException)
FIMRequest)UATSAssistantMessageTypeSystemMessageTypeToolMessageTypeUserMessageType)InstructRequestNormalizerget_normalizer)ChatCompletionRequest)MistralRequestValidatorValidationModeget_validator)SpeechRequest)TranscriptionRequest)AudioConfigAudioEncoderSpecialAudioIDs)InstructRequestInstructRequestTypeInstructTokenizerSpecialTokenPolicySpecialTokensTokenizedTypeTokenizerVersion)ImageConfigImageEncoderSpecialImageIDs)InstructTokenizerV1InstructTokenizerV2InstructTokenizerV3InstructTokenizerV7InstructTokenizerV11InstructTokenizerV13InstructTokenizerV15)SentencePieceTokenizerget_image_configis_sentencepiece)
Tekkenizer	is_tekken)download_tokenizer_from_hf_hubimage_config	tokenizerreturnc                 C   s6   t |tjj|tjj|tjjd}t| |S )zLoad a image encoder from a config and a tokenizer.

    Args:
        image_config: The image config.
        tokenizer: The tokenizer.

    Returns:
        The image encoder.
    )img	img_breakimg_end)r!   get_special_tokenr   r2   valuer3   r4   r    )r/   r0   special_ids r8   ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/mistral_common/tokens/tokenizers/mistral.pyload_image_encoder9   s   

r:   audio_configc                    s^   dt dtdB f fdd}t|tjj|tjj|tjj|tjj|tj	jd}t
| |S )zLoad a audio encoder from a config and a tokenizer.

    Args:
        audio_config: The audio config.
        tokenizer: The tokenizer.

    Returns:
        The audio encoder.
    tokenr1   Nc                    s     | sd S  | S N)
is_specialr5   )r<   r0   r8   r9   get_special_token_or_noneV   s   

z5load_audio_encoder.<locals>.get_special_token_or_none)audiobegin_audiostreaming_padtext_to_audioaudio_to_text)strintr   r   rA   r6   rB   rC   rD   rE   r   )r;   r0   r@   r7   r8   r?   r9   load_audio_encoderK   s   





rH   c                   @   s  e Zd ZdZdeeeeef de	e
eeef dee
eeeef fddZdeeeedf f fd	d
ZedefddZedefddZedefddZed=ddZed=ddZed>dededd fddZed?dedd fddZ ed@de!dedd fdd Z"e#d!d!ddej$fd"e!d#ee!B d!B d$e!d!B d%ed&ed'edd fd(d)Z%eej$fd*e!eB d'edd fd+d,Z&	!dAd-e'e( d.e)d!B defd/d0Z*d-e+defd1d2Z,d-e-defd3d4Z.d-edefd5d6Z/e0j1fd7e2e) d8e0de!fd9d:Z3d7e2e) de!fd;d<Z4d!S )BMistralTokenizerag  Mistral tokenizer.

    This class is a wrapper around a [InstructTokenizer][mistral_common.tokens.tokenizers.base.InstructTokenizer],
    a [MistralRequestValidator][mistral_common.protocol.instruct.validator.MistralRequestValidator] and a
    [InstructRequestNormalizer][mistral_common.protocol.instruct.normalize.InstructRequestNormalizer].

    It provides a convenient interface to tokenize, validate ad normalize Mistral requests.

    Attributes:
        instruct_tokenizer: The instruct tokenizer to use. See
            [InstructTokenizer][mistral_common.tokens.tokenizers.instruct.InstructTokenizer].
    instruct_tokenizer	validatorrequest_normalizerc                 C   s   || _ || _|| _dS )zInitializes a `MistralTokenizer`.

        Args:
            instruct_tokenizer: The instruct tokenizer to use.
            validator: The request validator to use.
            request_normalizer: The request normalizer to use.
        N)"_chat_completion_request_validator_instruct_request_normalizerrJ   )selfrJ   rK   rL   r8   r8   r9   __init__v   s   zMistralTokenizer.__init__r1   .c                 C   s   t j| jjj| jffS )z
        Provides a recipe for pickling (serializing) this object, which is necessary for use with multiprocessing.

        Returns:
            A tuple of the factory function and the arguments to reconstruct the object from its source file.
        )rI   	from_filerJ   r0   	file_pathmoderO   r8   r8   r9   
__reduce__   s   zMistralTokenizer.__reduce__c                 C   s   | j jS )z%The validation mode of the tokenizer.)rM   rS   rT   r8   r8   r9   rS      s   zMistralTokenizer.modec                 C   s
   | j jjS )zThe version of the tokenizer.)rJ   r0   versionrT   r8   r8   r9   rV      s   
zMistralTokenizer.versionc                 C   s   t tjd d S )N   data)r   __file__parentsclsr8   r8   r9   
_data_path   s   zMistralTokenizer._data_pathc                 C      | j t|  d tjdS )zGet the Mistral tokenizer v1.ztokenizer.model.v1rS   rQ   rF   r]   r   testr[   r8   r8   r9   v1   s   zMistralTokenizer.v1c                 C   r^   )zGet the Mistral tokenizer v2.z*mistral_instruct_tokenizer_240216.model.v2r_   r`   r[   r8   r8   r9   v2   s   zMistralTokenizer.v2Fr-   is_mmc                 C   sL   |r|rd}n|r|sd}n
|s|rt dd}| jt|  | tjdS )a;  Get the Mistral tokenizer v3.

        Args:
            is_tekken: Whether the tokenizer is a tekken tokenizer. See
                [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer].
            is_mm: Whether to load image tokenizer.

        Returns:
            The Mistral tokenizer v3.
        ztekken_240911.jsonztekken_240718.jsonz;Multimodal tokenizer is currently only supported for tekkenz*mistral_instruct_tokenizer_240323.model.v3r_   )
ValueErrorrQ   rF   r]   r   ra   )r\   r-   rd   tokenizer_namer8   r8   r9   v3   s   zMistralTokenizer.v3c                 C   s<   |r| j t|  d tjdS | j t|  d tjdS )zGet the Mistral tokenizer v7.

        Args:
            is_mm: Whether to load the image tokenizer.

        Returns:
            The Mistral tokenizer v7.
        z,mistral_instruct_tokenizer_241114.model.v7m1r_   z*mistral_instruct_tokenizer_241114.model.v7r`   )r\   rd   r8   r8   r9   v7   s   
zMistralTokenizer.v7Tmodelstrictc                 C   s,   |st d|tvrtd| t|  S )zGet the Mistral tokenizer for a given model.

        Args:
            model: The model name.
            strict: Has to be True, not used.

        Returns:
            The Mistral tokenizer for the given model.
        z&strict has to be `True` since v1.10.0.zUnrecognized model: )re   MODEL_NAME_TO_TOKENIZER_CLSr   )r\   ri   rj   r8   r8   r9   
from_model   s
   
zMistralTokenizer.from_modelNrepo_idr<   revisionforce_downloadlocal_files_onlyrS   c                 C   s    t | ||||d}tj||dS )aO  Download the Mistral tokenizer for a given Hugging Face repository ID.

        See [here](https://huggingface.co/mistralai/models) for a list of our OSS models.

        Args:
            repo_id: The Hugging Face repo ID.
            token: The Hugging Face token to use to download the tokenizer.
            revision: The revision of the model to use. If `None`, the latest revision will be used.
            mode: The validation mode to use.
            force_download: Whether to force the download of the tokenizer. If `True`, the tokenizer will be downloaded
                even if it is already cached.
            local_files_only: Whether to only use local files. If `True`, the tokenizer will be downloaded only if it is
                already cached.

        Returns:
            The Mistral tokenizer for the given model.
        )rm   r<   rn   ro   rp   r_   )r.   rI   rQ   )rm   r<   rn   ro   rp   rS   tokenizer_pathr8   r8   r9   from_hf_hub   s   zMistralTokenizer.from_hf_hubtokenizer_filenamec           
      C   s  t |rt|}|j}|j}nt|rt|}t|}d}ntd| |dur/t	||nd}d}|durEt
|ts@J dt||}t|j|j}t|j|d}	|jtjkrr|du saJ d|du siJ dtt||	|dS |jtjkr|du sJ d|du sJ dtt||	|dS |jtjkr|du sJ dtt||d|	|dS |jtjkrtt|||d	|	|dS |jtjkrtt|||d	|	|dS |jtjkrtt|||d	|	|dS |jtjkrtt|||d	|	|dS td
| )zLoads a tokenizer from a file.

        Args:
            tokenizer_filename: The path to the tokenizer file.
            mode: The validation mode to use.

        Returns:
            The loaded tokenizer.
        NzUnrecognized tokenizer file: z-Audio is only supported for tekken tokenizersr_   z#Tokenizer version needs to be >= v3z#Tokenizer version needs to be >= v7)rK   rL   )image_encoder)rt   audio_encoderz!Unrecognized tokenizer filename: ) r-   r,   rQ   imagerA   r+   r)   r*   r   r:   
isinstancerH   r   rV   model_settings_builderr   r   rb   rI   r"   rc   r#   rg   r$   rh   r%   v11r&   v13r'   v15r(   )
r\   rs   rS   r0   r/   r;   rt   ru   rL   rK   r8   r8   r9   rQ     s   


zMistralTokenizer.from_filerequestmax_model_input_lenc                 C   sF   | j |}|du r|jrtd| j|}|jr||_| j|S )aD  Encodes a chat completion request.

        Args:
            request: The chat completion request to encode.
            max_model_input_len: The maximum length of the input to the model.
                If `None`, the input will not be truncated.

        Returns:
            The encoded chat completion request.
        NzUencoding a chat completion request with truncation, but no max model len was provided)	rM   validate_requesttruncate_for_context_lengthr   rN   from_chat_completion_requesttruncate_at_max_tokensrJ   encode_instruct)rO   r|   r}   validated_requestinstruct_requestr8   r8   r9   encode_chat_completionh  s   z'MistralTokenizer.encode_chat_completionc                 C      | j |S )zEncodes a transcription request.

        Args:
            request: The transcription request to encode.

        Returns:
            The encoded transcription request.
        )rJ   encode_transcriptionrO   r|   r8   r8   r9   r        	z%MistralTokenizer.encode_transcriptionc                 C   r   )zEncodes a speech synthesis request.

        Args:
            request: The speech request to encode.

        Returns:
            The encoded speech request.
        )rJ   encode_speech_requestr   r8   r8   r9   r     r   z&MistralTokenizer.encode_speech_requestc                 C   r   )zEncodes a fill in the middle request.

        Args:
            request: The fill in the middle request to encode.

        Returns:
            The encoded fill in the middle request.
        )rJ   
encode_fimr   r8   r8   r9   r     r   zMistralTokenizer.encode_fimtokensspecial_token_policyc                 C   s   | j j||dS )zDecodes a list of tokens into a string.

        Args:
            tokens: The tokens to decode.
            special_token_policy: The policy to use for special tokens.

        Returns:
            The decoded string.
        )r   )rJ   decode)rO   r   r   r8   r8   r9   r     s   
zMistralTokenizer.decodec                 C   r   r=   )rJ   
_to_string)rO   r   r8   r8   r9   r     s   zMistralTokenizer._to_string)r1   rI   )FF)F)Tr=   )5__name__
__module____qualname____doc__r   r   r   r   r	   r   r   r   r
   r   r   rP   tupler   r   rU   propertyr   rS   r   rV   classmethodr   r]   rb   rc   boolrg   rh   rF   rl   staticmethodra   rr   rQ   r   r   rG   r   r   r   r   r   r   r   IGNORElistr   r   r8   r8   r8   r9   rI   f   s    
	
"Y
 rI   zministral-8b-2410c                   C      t jddS NT)r-   rI   rg   r8   r8   r8   r9   <lambda>      r   zmistral-tiny-2312zopen-mistral-nemo-2407c                   C   r   r   r   r8   r8   r8   r9   r     r   zmistral-tiny-2407zmistral-small-2312zopen-mixtral-8x22b-2404zmistral-small-2402zmistral-small-2409c                   C   r   r   r   r8   r8   r8   r9   r     r   zmistral-medium-2312zmistral-large-2402zmistral-large-2407zmistral-large-2411zpixtral-large-2411c                   C   r   )NT)rd   )rI   rh   r8   r8   r8   r9   r     r   zcodestral-2405zcodestral-mamba-2407zpixtral-12b-2409c                   C   s   t jdddS )NT)r-   rd   r   r8   r8   r8   r9   r     s    rk   N)Ipathlibr   typingr   r   r   mistral_common.exceptionsr   #mistral_common.protocol.fim.requestr   )mistral_common.protocol.instruct.messagesr   r	   r
   r   r   *mistral_common.protocol.instruct.normalizer   r   (mistral_common.protocol.instruct.requestr   *mistral_common.protocol.instruct.validatorr   r   r   &mistral_common.protocol.speech.requestr   -mistral_common.protocol.transcription.requestr   &mistral_common.tokens.tokenizers.audior   r   r   %mistral_common.tokens.tokenizers.baser   r   r   r   r   r   r   &mistral_common.tokens.tokenizers.imager   r    r!   )mistral_common.tokens.tokenizers.instructr"   r#   r$   r%   r&   r'   r(   .mistral_common.tokens.tokenizers.sentencepiecer)   r*   r+   'mistral_common.tokens.tokenizers.tekkenr,   r-   &mistral_common.tokens.tokenizers.utilsr.   r:   rH   rI   rc   rg   rb   rh   rk   dictrF   __annotations__r8   r8   r8   r9   <module>   sr    $	$	
  T	
 