o
    پi                     @   s   d dl Z d dlZd dlmZmZmZmZmZ G dd dZdd e	ddD Z
d	d e	d
dD ZdZdZdZeeegZeeedZdZG dd dZdS )    N)AbstractSet
CollectionListLiteralUnionc                   @   s"   e Zd ZdefddZdd ZdS )TiktokenProcessornamec                 C   s   t || _d S N)TiktokenTokenizer	tokenizer)selfr    r   [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/tokenizer/tiktoken_tokenizer.py__init__   s   zTiktokenProcessor.__init__c                 C   s
   d|giS )Npixel_valuesr   )r   imager   r   r   image_processor
   s   
z!TiktokenProcessor.image_processorN)__name__
__module____qualname__strr   r   r   r   r   r   r      s    r   c                 C      g | ]}d | dqS )z<|reserved_|>r   .0ir   r   r   
<listcomp>       r         c                 C   r   )z	<|controlr   r   r   r   r   r   r      r      i  z<|pad|>z<|eos|>z<|separator|>)padsepeoszn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+c                   @   sZ   e Zd Zdd ZdddZdd Z	dd	d
Z		dddZdee	 fddZ
dd ZdS )r
   c              
      s  dd l  ddlm} t|d}t|}W d    n1 sw   Y  dd |d D }dd |d D }|d	 d
kr?t}n	J d|d	  |d|}||||d}d|v rdtdd |d D }	d|v rn|d |d< d }	t	}
 j
di |}|	p~t |_|
|_t dddtdttd tt f dttd tt f dtt f fdd}t|||_| jtt	 O  _| jttt O  _|| _d | _|jt | _|j| _ d| _!|| j!| _"d | _#d S )Nr   )Templaterbc                 S   s   i | ]}t |d  |d qS bytestoken)r'   r   itemr   r   r   
<dictcomp>'   s    z.TiktokenTokenizer.__init__.<locals>.<dictcomp>regular_tokensc                 S   s"   i | ]}t |d   |d qS r&   r'   decoder)   r   r   r   r+   *   s    special_tokens
word_splitV1FzUnknown word_split: pat_str)r   r2   mergeable_ranksr/   default_allowed_specialc                 S   s   g | ]}t | qS r   r-   )r   
bytes_listr   r   r   r   <   s    
z.TiktokenTokenizer.__init__.<locals>.<listcomp>
vocab_sizeexplicit_n_vocaballallowed_specialdisallowed_specialtextr:   r;   returnc                   s(   t |tr
|| jO } jj| ||ddS )Nr   r9   )
isinstanceset_default_allowed_specialEncodingencode)r   r<   r:   r;   tiktokenr   r   encode_patchedK   s   
	
z2TiktokenTokenizer.__init__.<locals>.encode_patcheda  {% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>

' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>

' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: '  + message['content'] + '<|separator|>

' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}r   )$rD   jinja2r$   openjsonload	PAT_STR_Bgetr?   DEFAULT_CONTROL_TOKENSrA   r@   _control_tokensr   r   r   r   r   r   int	functoolspartialrB   valuesCONTROL_TOKEN_TEXTSRESERVED_TOKEN_TEXTSr   bos_token_id_special_tokensEOSeos_token_idn_vocabr6   chat_templatechat_template_jinjaadditional_stop_token_ids)r   tokenizer_pathr$   fin	xtok_dictr3   r/   pad_strkwargsr4   control_tokensr   rE   r   rC   r   r      st   
zTiktokenTokenizer.__init__Fc                 C      | j |S r	   )r   rB   )r   xadd_special_tokensr   r   r   rB   n      zTiktokenTokenizer.encodec                 O   rb   r	   )r   r.   )r   rc   argsr`   r   r   r   r.   q   re   zTiktokenTokenizer.decodeTc                 C   s4   t |dkrt|d trdd |D }| j|S )Nr   c                 S   s   g | ]}|gqS r   r   r   rc   r   r   r   r   x   s    z2TiktokenTokenizer.batch_decode.<locals>.<listcomp>)lenr>   rN   r   decode_batch)r   batchskip_special_tokensspaces_between_special_tokensr   r   r   batch_decodet   s   zTiktokenTokenizer.batch_decodeNc                 K   s"   | j j||d}|r| |S |S )N)messagesadd_generation_prompt)rZ   renderrB   )r   rn   tokenizero   toolsreasoning_effortr`   retr   r   r   apply_chat_template{   s   	z%TiktokenTokenizer.apply_chat_templater<   c                    s   d fdd|D iS )N	input_idsc                    s   g | ]}  |qS r   )rB   rg   r   r   r   r      s    z.TiktokenTokenizer.__call__.<locals>.<listcomp>r   )r   r<   r`   r   rw   r   __call__   s   zTiktokenTokenizer.__call__c                 C   s   ddl m} d}| j}i |j|j}dd t| dd dD }d	g}g }t|D ]\}}t|t	r>|
d
r>|| q+t|D ]\}}	||||	< qC|||d}
t|
jdks^J |
|fS )Nr   )TokenizerInfoz<|xg_special_token_{}|>c                 S   s   g | ]\}}|qS r   r   )r   r(   _r   r   r   r      s    z3TiktokenTokenizer.init_xgrammar.<locals>.<listcomp>c                 S   s   | d S )Nr    r   )rc   r   r   r   <lambda>   s    z1TiktokenTokenizer.init_xgrammar.<locals>.<lambda>)key       )stop_token_ids)xgrammarry   r   _mergeable_ranksrU   sorteditems	enumerater>   r'   
startswithappendformatrh   special_token_ids)r   ry   XGRAMMAR_SPECIAL_TOKEN_TEMPLATEencencoded_vocaboverride_stop_tokensxgrammar_special_token_idsr   r(   idtokenizer_infor   r   r   init_xgrammar   s(   
zTiktokenTokenizer.init_xgrammar)F)TF)NN)r   r   r   r   rB   r.   rm   ru   r   r   rx   r   r   r   r   r   r
      s    
P

r
   )rO   rH   typingr   r   r   r   r   r   rangerS   rR   PADrV   SEPDEFAULT_SPECIAL_TOKENSrL   rJ   r
   r   r   r   r   <module>   s    
