o
    iS                     @   s  d dl mZ d dlmZmZmZmZ d dlmZ	 d dl
mZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ ddlmZ erxd dlm Z  zd dl!m"Z" W n e#yw   d dl!m$Z" Y nw ee%Z&d+ddZ'd+ddZ(			d,de)d de)e*e+ef  dB de,de,de-e)d e)e*e+ef  dB f f
dd Z.d-d"d#Z/d$d%d&e+e0B de1fd'd(Z2G d)d* d*eZ3dS ).    )Path)TYPE_CHECKINGAnycastoverload)ChatCompletionRequest)FunctionTool)ValidationMode)SpecialTokenPolicySpecialTokens)InstructTokenizerV13)SentencePieceTokenizer)
Tekkenizer)ChatCompletionMessageParam)init_logger   )TokenizerLike)BatchEncodingMistralCommonBackendMistralCommonTokenizerrequestMistralChatCompletionRequestc              	   C   sv   t | jD ]3\}}|ddkr8|dd }g }	 zt|}|| W n	 ty/   Y nw q|| j| d< qd S )Nrole	assistant
tool_calls )	enumeratemessagesget__iter__nextappendStopIteration)r   imessagetool_calls_validatorvalidated_tool_calls	tool_callr   r   M/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/tokenizers/mistral.pymaybe_serialize_tool_calls*   s   r,   c              	   C   s   t | jD ]l\}}|ddkrD|dg }|D ]#}t|d dkr;td|d |d dd  |d dd |d< q|| j| d< q|dd	v rqd
|v rq|d
 }t|dkrjtd||dd  |dd }|| j| d
< qdS )z6Truncates tool call IDs for Mistral's ID requirements.r   r   r   id	   z!Truncating tool call ID: %s to %siN>   tooltool_resultstool_call_idz!Truncating tool_call_id: %s to %s)r   r    r!   lenloggerwarning)r   r&   r'   r   r*   r1   r   r   r+   truncate_tool_call_idsO   s4   
r5   NFr    r   toolscontinue_final_messageadd_generation_promptreturnc                 C   sn  |r|rt dttttf | d }|r|d dkrt d|r+|d dkr+t d| D ]}|dd }q-|rdd	 |D D ]}|d
d u rLi |d
< |dd u rWd|d< q?ttj	
 }ttj	
 }	|D ]J}
t|

 }|D ]?}||vr|
| td| d |
d dkrt|
d 
 }|D ]}||	vr|
d | td| d qqrt dqh| |fS )NzMCannot set both `add_generation_prompt` and `continue_final_message` to True.r   r   zCannot set `add_generation_prompt` to True when the last message is from the assistant. Consider using `continue_final_message` instead.z\Cannot set `continue_final_message` to True when the last message is not from the assistant.	reasoningc                 S   s    g | ]}|d  dkr|d qS )typefunctionr   ).0r/   r   r   r+   
<listcomp>   s    zC_prepare_apply_chat_template_tools_and_messages.<locals>.<listcomp>
parametersdescription 'z[' is not supported by mistral-common for tools. It has been poped from the tool definition.r<   r=   zh' is not supported by mistral-common for function tools. It has been poped from the function definition.z,mistral-common only supports function tools.)
ValueErrorr   dictstrr   popr!   setr	   model_fieldskeysr   listr3   warning_once)r    r6   r7   r8   last_messager'   _r=   tools_fieldsfunction_fieldsr/   	tool_keystool_keyfunction_keysfunction_keyr   r   r+   /_prepare_apply_chat_template_tools_and_messagesm   s^   



	rU   r   c                 C   s    | j d us
| jd urtdd S )Nz6chat_template is not supported for Mistral tokenizers.)chat_templatechat_template_kwargsrD   )r   r   r   r+   validate_request_params   s   rX   	tokenizerr   tc                 C   s   t | tsJ t| t |ts|dn|}| j}z|| j|  W S  tyE   |d}|| j	v r:| j	|  Y S t
d| | j Y S w )Nzutf-8z6Failed to convert token %s to id, replacing with <unk>)
isinstancer   r<   bytesencodenum_special_tokens_tekken_token2id_nospecialKeyErrordecode_special_tokens_reverse_vocabr3   r4   unk_id)rY   rZ   t_bytesshiftt_strr   r   r+   _tekken_token_to_id   s   


rg   c                       s4  e Zd ZedddddeeB dededB dedB dd f
d	d
ZdV fddZde	e
 fddZde	e
 de	e fddZde
fddZede	e fddZede	e
 fddZede
fddZede
fddZede
fddZedefd d!Zede
fd"d#Zede
fd$d%Zede
fd&d'Zedefd(d)Zd*e
defd+d,Zde
fd-d.Zde
fd/d0Z		1		dWd2ee	e B d3edB d4ed5ed6e
dB dd7fd8d9Zede	e fd:d;Zdeee
f fd<d=Zdeee
f fd>d?Z 			1dXd2ed5edB d6e
dB d4ede	e
 f
d@dAZ!	dYdBe	dC dDe	eee"f  dB de	e
 fdEdFZ#dZdGe	e
 e
B dHedefdIdJZ$	dZdGe	e	e
  e	e
 B dHedefdKdLZ%e&dMede
fdNdOZ'e&dMe	e de	e
 fdPdOZ'dMee	e B de
e	e
 B fdQdOZ'dMe	e defdRdSZ(	dZdGe	e
 dHede	e fdTdUZ)  Z*S )[MistralTokenizerFN)trust_remote_coderevisiondownload_dirpath_or_repo_idri   rj   rk   r9   c          	      O   sf   zddl m} W n ty   ddl m} Y nw |j|g|R tj||d u r(dn|d|}| |S )Nr   r   r   main)mode	cache_dirrj   )(transformers.tokenization_mistral_commonr   ImportErrorr   from_pretrainedr
   test)	clsrl   ri   rj   rk   argskwargsr   rY   r   r   r+   rr      s"   
	z MistralTokenizer.from_pretrainedrY   r   c                    sF  t    | _|j _ jj _ jj _ jjj}|t	j
kr$tdt jjj}t|dd  _t jt _t jt _ jsS jsStdt j  fddt jd ddD  _tt j dd	 d
 _ j  _ jd  _ t!dd  jD  _" #  _$t% j$ _& ' j$ _(t% j( _)d S )NzzMistral tokenizer must be in test mode. Make sure to set `mode='ValidationMode.test'` when creating the Mistral tokenizer.vr:   zUnsupported tokenizer: c                    s"   i | ]} j |gd dd |qS )Fskip_special_tokensr   )convert_ids_to_tokensr>   r&   selfr   r+   
<dictcomp>	  s    z-MistralTokenizer.__init__.<locals>.<dictcomp>r   c                 S   s   | d S )Nr   r   )xr   r   r+   <lambda>  s    z+MistralTokenizer.__init__.<locals>.<lambda>)keyc                 s   s    | ]}t |V  qd S N)r2   )r>   tokr   r   r+   	<genexpr>      z,MistralTokenizer.__init__.<locals>.<genexpr>)*super__init__transformers_tokenizerrY   mistralinstruct_tokenizerinstruct"_chat_completion_request_validator_moder
   rs   rD   rF   versionvalueintsplitr[   r   	is_tekkenr   is_spm	TypeErrorr<   range
vocab_size_vocab_dictrE   sorteditemsvocab_vocab_max_token_idmax_max_chars_per_token_get_special_token_ids_special_token_idsrH   _special_token_ids_set_get_special_tokens_special_tokens_special_tokens_set)r}   rY   rn   _mistral_version_str	__class__r|   r+   r      s6   






zMistralTokenizer.__init__c                    s    fddt t jD S )Nc                    s   g | ]
} j |r|qS r   )rY   
is_specialr{   r|   r   r+   r?     s    z;MistralTokenizer._get_special_token_ids.<locals>.<listcomp>)r   r2   r   r|   r   r|   r+   r     s   z'MistralTokenizer._get_special_token_idsall_special_idsc                    s    fdd|D S )Nc                    s    g | ]} j j|gtjd qS ))special_token_policy)rY   ra   r   KEEPr{   r|   r   r+   r?     s    z8MistralTokenizer._get_special_tokens.<locals>.<listcomp>r   )r}   r   r   r|   r+   r     s   
z$MistralTokenizer._get_special_tokensc                 C   s   t | dS )NrB   )r2   r]   r|   r   r   r+   num_special_tokens_to_add$  s   z*MistralTokenizer.num_special_tokens_to_addc                 C      | j S r   )r   r|   r   r   r+   all_special_tokens)     z#MistralTokenizer.all_special_tokensc                 C   r   r   )r   r|   r   r   r+   r   -  r   z MistralTokenizer.all_special_idsc                 C      | j jS r   )rY   bos_idr|   r   r   r+   bos_token_id1     zMistralTokenizer.bos_token_idc                 C   r   r   )rY   eos_idr|   r   r   r+   eos_token_id5  r   zMistralTokenizer.eos_token_idc                 C   r   r   )rY   pad_idr|   r   r   r+   pad_token_id9  r   zMistralTokenizer.pad_token_idc                 C   s   dS )NTr   r|   r   r   r+   is_fast=     zMistralTokenizer.is_fastc                 C   r   r   )r   r   r|   r   r   r+   r   A  r   zMistralTokenizer.vocab_sizec                 C   r   r   )r   r|   r   r   r+   max_token_idE  r   zMistralTokenizer.max_token_idc                 C   r   r   )r   r|   r   r   r+   max_chars_per_tokenI  r   z$MistralTokenizer.max_chars_per_tokenc                 C   r   r   )r   truncation_sider|   r   r   r+   r   M  r   z MistralTokenizer.truncation_sidetoken_idc                 C   s
   || j v S r   )r   )r}   r   r   r   r+   _is_special_token_idQ  s   
z%MistralTokenizer._is_special_token_idc                 C   s   t t| S r   )hashr-   r|   r   r   r+   __hash__T     zMistralTokenizer.__hash__c                 C   r   r   )r   r|   r   r   r+   __len__W     zMistralTokenizer.__len__Ttext	text_pairadd_special_tokens
truncation
max_lengthr   c                 C   sh   |d urt d| j|||||d}|d r2|d d | jkr2|d d |d }r2|d |S )Nz<`text_pair` is not supported by `MistralTokenizer.__call__`.)r   r   r   r   r   	input_idsr:   attention_mask)rD   r   r   rG   r!   )r}   r   r   r   r   r   encodedr   r   r   r+   __call__Z  s    
zMistralTokenizer.__call__c                 C   r   r   )r   r|   r   r   r+   r   x  r   zMistralTokenizer.vocabc                 C   r   r   )r   r|   r   r   r+   	get_vocab|  r   zMistralTokenizer.get_vocabc                 C   s   i S r   r   r|   r   r   r+   get_added_vocab  r   z MistralTokenizer.get_added_vocabc                 C   s2   | j j||dd}|dur|d ur|d | S |S )NF)boseos)rY   r]   )r}   r   r   r   r   r   r   r   r+   r]     s   	zMistralTokenizer.encoder    r   r6   c           
      K   sv   | dd}|dd}|dd}|dd}|dd}|d}	t||||\}}| jj|||||||	d dd		S )
Nr8   Fr7   tokenizeTpaddingr   r   )	conversationr6   r7   r   r   r   r   return_tensorsreturn_dict)rG   r!   rU   r   apply_chat_template)
r}   r    r6   rv   r8   r7   r   r   r   r   r   r   r+   r     s(   
z$MistralTokenizer.apply_chat_templateidsry   c                 C   s    t |tr|g}| jj||dS Nrx   )r[   r   r   ra   r}   r   ry   r   r   r+   ra     s
   
zMistralTokenizer.decodec                 C   s   | j j||dS r   )r   batch_decoder   r   r   r+   r     s   zMistralTokenizer.batch_decodetokensc                 C      d S r   r   r}   r   r   r   r+   convert_tokens_to_ids     z&MistralTokenizer.convert_tokens_to_idsc                 C   r   r   r   r   r   r   r+   r     r   c                 C   s   | j |S r   )r   r   r   r   r   r+   r     r   c                    s  t jh jrAt jtsJ t j fdd|D }tdd |D r: fdd|D } j|t	j
}|S d|}|S t jtsNJ t jg }g }d}|D ] }|v rq|rk| j|t	j g }|| qV|| qV|r| j|t	j d|}|S )Nc                    s"   g | ]}|v s| j vr|qS r   )r   r>   rZ   r}   to_decode_special_tokensr   r+   r?     
    z=MistralTokenizer.convert_tokens_to_string.<locals>.<listcomp>c                 s   s    | ]}t |tV  qd S r   )r[   r\   r   r   r   r+   r     s    z<MistralTokenizer.convert_tokens_to_string.<locals>.<genexpr>c                    s   g | ]}t  j|qS r   )rg   rY   r   r|   r   r+   r?         rB   )r   r   r   r[   rY   r   r<   anyra   r   r   joinr   r$   IGNORE)r}   r   r   decodedregular_tokensdecoded_listtokenr   r   r+   convert_tokens_to_string  sF   !

z)MistralTokenizer.convert_tokens_to_stringc                    s   |sfdd|D S j tjh tjtr/jjr$ jj jj	r/ jj	  fdd|D }fdd|D }t
dd |D rWjrWfdd|D }|S )Nc                       g | ]} j |qS r   rY   id_to_piecer>   r   r|   r   r+   r?     r   z:MistralTokenizer.convert_ids_to_tokens.<locals>.<listcomp>c                    s"   g | ]}| v s |s|qS r   )r   r{   non_skip_special_tokens_idsr}   r   r+   r?     r   c                    r   r   r   r   r|   r   r+   r?     r   c                 s   s    | ]}d |v V  qdS )u   �Nr   r   r   r   r+   r     r   z9MistralTokenizer.convert_ids_to_tokens.<locals>.<genexpr>c                    s8   g | ]}| j vr j|tjn j|gtjqS r   )r   rY   id_to_byte_piecer   r   ra   r   r|   r   r+   r?     s    
)rY   get_control_tokenr   r   r[   r   r   BEGIN_THINKadd	END_THINKr   r   )r}   r   ry   ids_keptr   r   r   r+   rz     s$   
z&MistralTokenizer.convert_ids_to_tokens)rY   r   r9   N)NTFN)NNTr   )F)+__name__
__module____qualname__classmethodrF   r   boolrr   r   rK   r   r   r   r   propertyr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rE   r   r   r]   r   r   ra   r   r   r   r   rz   __classcell__r   r   r   r+   rh      s    +



 
"5rh   )r   r   )NFF)r   r   )4pathlibr   typingr   r   r   r   (mistral_common.protocol.instruct.requestr   r   +mistral_common.protocol.instruct.tool_callsr   r	   *mistral_common.protocol.instruct.validatorr
   %mistral_common.tokens.tokenizers.baser   r   )mistral_common.tokens.tokenizers.instructr   .mistral_common.tokens.tokenizers.sentencepiecer   'mistral_common.tokens.tokenizers.tekkenr   vllm.entrypoints.chat_utilsr   0vllm.entrypoints.openai.chat_completion.protocolvllm.loggerr   protocolr   transformersr   rp   r   rq   r   r   r3   r,   r5   rK   rE   rF   r   tuplerU   rX   r\   r   rg   rh   r   r   r   r+   <module>   sP   

% 

N