o
    
۾i8                     @   sf  d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ eeZdZ dZ!dZ"dd e#ddD Z$dd e#ddD Z%e e"e!gZ&e e"e!dZ'dZ(dZ)dede*dB de*dB de*dB de+e*e	f f
dd Z,d!ede-e	e+e*e.f f fd"d#Z/G d$d% d%eZ0dS )&z&Tokenizer for Grok-2 .tok.json format.    N)
CollectionSet)Path)AnyLiteraloverload)hf_hub_download)EntryNotFoundErrorHfHubHTTPErrorRepositoryNotFoundErrorRevisionNotFoundError)BatchEncoding)chat_template_utils)ChatCompletionMessageParam)init_logger   )TokenizerLikez<|pad|>z<|eos|>z<|separator|>c                 C      g | ]}d | dqS )z<|reserved_|> .0ir   r   I/home/ubuntu/.local/lib/python3.10/site-packages/vllm/tokenizers/grok2.py
<listcomp>       r         c                 C   r   )z	<|controlr   r   r   r   r   r   r       r   i  )padsepeosa  {% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}zn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
model_pathrepo_idrevisiondownload_dirreturnc             
   C   sh  | d }|  r$|jddd}t|W  d    S 1 sw   Y  |d u r*i S z
t|d||d}W n( tttfyB   i  Y S  ty\ } zt	
d|| i W  Y d }~S d }~ww z"t|jddd}t|W  d    W S 1 sxw   Y  W d S  tjy } zt	
d| i W  Y d }~S d }~w ty } zt	
d| i W  Y d }~S d }~ww )	Nztokenizer_config.jsonrutf-8)encodingr"   filenamer#   	cache_dirzFailed to download tokenizer_config.json from %s. This may be due to a network or authentication issue. The default chat template will be used. Error: %szXFailed to parse tokenizer_config.json. The default chat template will be used. Error: %szWFailed to open tokenizer_config.json. The default chat template will be used. Error: %s)is_fileopenjsonloadr   r   r   r	   r
   loggerwarningr   JSONDecodeErrorOSError)r!   r"   r#   r$   config_pathfconfig_fileexcr   r   r   _maybe_load_tokenizer_config9   sX    

(r8   
vocab_filec              
      s  zdd l  W n ty } ztd|d }~ww | d}t|}W d    n1 s-w   Y  dd |dg D }dd |dg D }|d	d
krRt}n
td|d	|d|}t| |||d}d|v ru|d |d<  j	di |}d }	d|v rdd |d D }	|	pt
 |_t|_t
 dddtdtd tt B dtd tt B dtt f fdd}
t|
||_| jt
t O  _| jt
tt O  _||fS )Nr   z1Grok-2 tokenizer requires the `tiktoken` package.rbc                 S   s   i | ]}t |d  |d qS )bytestoken)r;   r   itemr   r   r   
<dictcomp>z       z+_load_tiktoken_encoding.<locals>.<dictcomp>regular_tokensc                 S   s(   i | ]}t |d  jddd|d qS )r;   r'   replaceerrorsr<   r;   decoder=   r   r   r   r?   ~   s    special_tokens
word_splitV1zUnknown word_split: pat_str)namerJ   mergeable_ranksrG   
vocab_sizeexplicit_n_vocabdefault_allowed_specialc                 S   s   h | ]}t |jd ddqS )r'   rB   rC   rE   )r   
bytes_listr   r   r   	<setcomp>   r@   z*_load_tiktoken_encoding.<locals>.<setcomp>allallowed_specialdisallowed_specialtextrT   rU   r%   c                   s*   ~t |tr|| jO } jj| ||ddS )Nr   rS   )
isinstanceset_default_allowed_specialEncodingencode)selfrV   rT   rU   tiktokenr   r   encode_patched   s   

z/_load_tiktoken_encoding.<locals>.encode_patchedr   )r^   ImportErrorr-   r.   r/   get	PAT_STR_B
ValueErrorstrrZ   rX   rY   DEFAULT_CONTROL_TOKENS_control_tokensr   r   r   listint	functoolspartialr[   valuesCONTROL_TOKEN_TEXTSRESERVED_TOKEN_TEXTS)r9   r7   r5   	xtok_dictrL   rG   rJ   kwargs	tokenizerrO   r_   r   r]   r   _load_tiktoken_encodingo   sf   


rq   c                       s0  e Zd ZedddddeeB dededB dedB dd f
d	d
ZdddededededB deee	f dB ddf fddZ
defddZedee fddZedee fddZedefddZedefddZedefddZedefdd Zedefd!d"Zedefd#d$Zedefd%d&Zedefd'd(Zdeeef fd)d*Zdeeef fd+d,Zd-ee d.edB dee fd/d0Z			1dMd2ed3edB d.edB d4edee f
d5d6ZdNd7ee eB d8edefd9d:Zed-edefd;d<Zed-ee dee fd=d<Zd-eee B deee B fd>d<Z	dNd7ee d8edee fd?d@Z d-ee defdAdBZ!		1		dOd2eee B dCedB d4ed3ed.edB de"fdDdEZ#	dPdedB dFeeee	f  dB dedB fdGdHZ$			dQdIee% dFeeee	f  dB dedB dJedeee B f
dKdLZ&  Z'S )RGrok2TokenizerFN)trust_remote_coder#   r$   path_or_repo_idrs   r#   r$   r%   c                O   s   |rt d t|}| r|}|j}	d }
n | r$|d }|}	d }
nttt|d||d}|j}	t|}
| sCtd| dt	|	|
||d}| |t||
dd|
d	|d
S )Nz2Ignoring extra positional args for Grok2Tokenizer.ztokenizer.tok.jsonr)   z tokenizer.tok.json not found at .)r"   r#   r$   truncation_sideleftchat_template)r9   name_or_pathrv   rx   init_kwargs)r0   
debug_oncer   r,   parentis_dirr   rd   FileNotFoundErrorr8   ra   )clsrt   rs   r#   r$   argsro   pathr9   r!   r"   configr   r   r   from_pretrained   sH   


zGrok2Tokenizer.from_pretrained)rz   r9   ry   rv   rx   rz   c          
         s2  t    || _|| _|pi | _|pt| _t|\| _| _	i | _
i | _| jj D ]\}}|jddd}|| j
|< || j|< q)| j	 D ]\}}|| j
|< || j|< qD| j	t}	|	d u rc| j	t}	|	d u rm| j	t}	|	d u rsd}	|	| _| j	t| j| _| j	t| j| _| j| _tdd | j
D | _d S )Nr'   rB   rC   r   c                 s   s    | ]}t |V  qd S Nlen)r   tokr   r   r   	<genexpr>  s    z*Grok2Tokenizer.__init__.<locals>.<genexpr>)super__init__ry   _truncation_siderz   DEFAULT_CHAT_TEMPLATE_chat_templaterq   
_tokenizer_special_tokens_token_to_id_id_to_token_mergeable_ranksitemsrF   ra   SEPPADEOS_bos_token_id_eos_token_id_pad_token_id_unk_token_idmax_max_chars_per_token)
r\   r9   ry   rv   rx   rz   r<   token_id	token_strbos_token_id	__class__r   r   r      s6   
	



zGrok2Tokenizer.__init__c                 C      dS )Nr   r   r\   r   r   r   num_special_tokens_to_add     z(Grok2Tokenizer.num_special_tokens_to_addc                 C      t | j S r   )rg   r   keysr   r   r   r   all_special_tokens     z!Grok2Tokenizer.all_special_tokensc                 C   r   r   )rg   r   rk   r   r   r   r   all_special_ids!  r   zGrok2Tokenizer.all_special_idsc                 C      | j S r   )r   r   r   r   r   r   %     zGrok2Tokenizer.bos_token_idc                 C   r   r   )r   r   r   r   r   eos_token_id)  r   zGrok2Tokenizer.eos_token_idc                 C   r   r   )r   r   r   r   r   pad_token_id-  r   zGrok2Tokenizer.pad_token_idc                 C   r   )NFr   r   r   r   r   is_fast1  s   zGrok2Tokenizer.is_fastc                 C   s   | j jS r   r   n_vocabr   r   r   r   rM   5  s   zGrok2Tokenizer.vocab_sizec                 C   s   | j jd S )Nr   r   r   r   r   r   max_token_id9  s   zGrok2Tokenizer.max_token_idc                 C   r   r   )r   r   r   r   r   max_chars_per_token=  r   z"Grok2Tokenizer.max_chars_per_tokenc                 C   r   r   )r   r   r   r   r   rv   A  r   zGrok2Tokenizer.truncation_sidec                 C   
   t | jS r   )dictr   r   r   r   r   	get_vocabE     
zGrok2Tokenizer.get_vocabc                 C   r   r   )r   r   r   r   r   r   get_added_vocabH  r   zGrok2Tokenizer.get_added_vocabtokens
max_lengthc                 C   s<   |d u s
t ||kr|S | jdkr|| d  S |d | S )Nrw   )r   rv   )r\   r   r   r   r   r   _maybe_truncateK  s
   
zGrok2Tokenizer._maybe_truncateTrV   
truncationadd_special_tokensc                 C   s"   ~| j |}|r| ||}|S r   )r   r[   r   )r\   rV   r   r   r   r   r   r   r   r[   R  s
   zGrok2Tokenizer.encodeidsskip_special_tokensc                    s2   t |tr|g}|r fdd|D } j|S )Nc                    s   g | ]}| j  vr|qS r   )r   rk   )r   r   r   r   r   r   c  s
    z)Grok2Tokenizer.decode.<locals>.<listcomp>)rW   rh   r   rF   )r\   r   r   r   r   r   rF   _  s   

zGrok2Tokenizer.decodec                 C      d S r   r   r\   r   r   r   r   convert_tokens_to_idsj  r   z$Grok2Tokenizer.convert_tokens_to_idsc                 C   r   r   r   r   r   r   r   r   m  r   c                    s,   t |tr j| jS  fdd|D S )Nc                    s   g | ]
} j | jqS r   )r   ra   r   )r   r<   r   r   r   r   s  s    z8Grok2Tokenizer.convert_tokens_to_ids.<locals>.<listcomp>)rW   rd   r   ra   r   r   r   r   r   r   p  s   
c                 C   s:   g }|D ]}|r|| j  v rq|| j|d q|S )Nz<|unk|>)r   rk   appendr   ra   )r\   r   r   r   r   r   r   r   convert_ids_to_tokensu  s   z$Grok2Tokenizer.convert_ids_to_tokensc                 C   s   |  |}| j|ddS )NF)r   )r   rF   )r\   r   	token_idsr   r   r   convert_tokens_to_string  s   
z'Grok2Tokenizer.convert_tokens_to_string	text_pairc           
         s|   |d urt dt|tr' fdd|D }dd |D }t||dS j| d}dgt| }	t||	dS )Nz.text_pair is not supported for Grok2Tokenizer.c                    s   g | ]}j | d qS )r   r   r   )r[   r=   r   r   r\   r   r   r   r     s    z+Grok2Tokenizer.__call__.<locals>.<listcomp>c                 S   s   g | ]	}d gt | qS )r   r   )r   r   r   r   r   r     s    )	input_idsattention_maskr   r   )NotImplementedErrorrW   rg   r   r[   r   )
r\   rV   r   r   r   r   input_ids_batchattention_mask_batchr   r   r   r   r   __call__  s$   
	zGrok2Tokenizer.__call__toolsc                 C   s   ~|p| j S r   )r   )r\   rx   r   r   r   r   get_chat_template  s   
z Grok2Tokenizer.get_chat_templatemessagestokenizec                 K   sT   | j ||d}|d u rtdd|d< tjd|||d|}|r(| j|ddS |S )N)r   z?No chat template available. Provide `chat_template` explicitly.Freturn_dict)conversationrx   r   )r   r   )r   rc   hf_chat_utilsapply_chat_templater[   )r\   r   r   rx   r   ro   templatepromptr   r   r   r     s    z"Grok2Tokenizer.apply_chat_template)NNT)F)NTFNr   )NNF)(__name__
__module____qualname__classmethodrd   r   boolr   r   r   r   rh   r   propertyrg   r   r   r   r   r   r   rM   r   r   rv   r   r   r   r[   rF   r   r   r   r   r   r   r   r   r   __classcell__r   r   r   r   rr      s    :+"

 "



$
	
rr   )1__doc__ri   r.   collections.abcr   r   pathlibr   typingr   r   r   huggingface_hubr   huggingface_hub.utilsr	   r
   r   r   transformersr   transformers.utilsr   r   vllm.entrypoints.chat_utilsr   vllm.loggerr   protocolr   r   r0   r   r   r   rangerm   rl   DEFAULT_SPECIAL_TOKENSre   r   rb   rd   r   r8   tuplerh   rq   rr   r   r   r   r   <module>   sT   


6
K