o
    ]۷i+                     @   s   d dl mZmZ d dlZd dlmZmZmZmZm	Z	m
Z
mZmZ d dlZddlmZ ddlmZmZmZ ddlmZmZ ddlmZ G d	d
 d
ZG dd dZdS )    )	dataclassfieldN)AnyCallableDictHashableListOptionalTupleUnion   )LMFormatEnforcerException)CharacterLevelParserForceStopParserCharacterLevelParserConfig)TokenizerPrefixTreeTokenizerPrefixTreeNode)	TokenListc                
   @   sV   e Zd ZdZdeeeeef  de	ee gef de
eee f dedef
ddZd	S )
TokenEnforcerTokenizerDatazTokenEnforcerTokenizerData contains all of the preprocessing for preparing the TokenEnforcer to work with a 
    specific tokenizer. It does some calculations, so it is recommended to reuse it for multiple TokenEnforcersregular_tokensdecodereos_token_iduse_bitmask
vocab_sizec                    sd    fdd|D }|| _ t| j | | _|| _|| _ddd | jjj D | _	 | _
|| _dS )aA  
        Create the tokenizer data that the TokenEnforcer needs. This can be reused for multiple TokenEnforcers if they work with the same tokenizer.
        :param regular_tokens: A list of tuples (token_id, token_string, is_new_word_token) for all the regular (not special) tokens in the tokenizer vocabulary.
        Note that token_string is expected to include leading / trailing whitespaces if relevant.
        :param decoder: A function that decodes a list of token ids into a string.
        :param eos_token_id: The token id(s) of the end-of-string token(s).
        c                    s   g | ]
}|d   kr|qS )r    ).0token_tupler   r   T/home/ubuntu/vllm_env/lib/python3.10/site-packages/lmformatenforcer/tokenenforcer.py
<listcomp>   s    z7TokenEnforcerTokenizerData.__init__.<locals>.<listcomp> c                 s   s     | ]}t |d kr|V  qdS )r   N)len)r   	token_strr   r   r   	<genexpr>!   s    z6TokenEnforcerTokenizerData.__init__.<locals>.<genexpr>N)r   r   tokenizer_treer   r   joinrootchildrenkeystokenizer_alphabetr   r   )selfr   r   r   r   r   filtered_regular_tokensr   r   r   __init__   s    
z#TokenEnforcerTokenizerData.__init__N)__name__
__module____qualname____doc__r   r
   intstrboolr   r   r,   r   r   r   r   r      s    r   c                	   @   s   e Zd ZdZeG dd dZdedefddZde	e
 d	efd
dZdeddfddZdedededee fddZddde	e
 fddZdS )TokenEnforcerzTokenEnforcer provides a token filtering mechanism, given a CharacterLevelParser and some information about the tokenizer.
    It is the main entry point for extending lm-format-enforcer to new inference libraries. See __init__() and get_allowed_tokens()c                   @   sB   e Zd ZU eed< eddZedB ed< eedZ	e
e ed< dS )TokenEnforcer.OutputTensorStateparserN)defaultallowed_tokens)default_factorycurrent_word_tokens)r-   r.   r/   r   __annotations__r   r8   r   listr:   r   r1   r   r   r   r   OutputTensorState)   s   
 r=   tokenizer_datar6   c                 C   sX   i | _ || _|j| _|j| _|j| _|j| _i | _|j| _|j| _t	|j
d}||_dS )z
        Create a new TokenEnforcer.
        :param tokenizer_data: Per tokenizer data that the token enforcer needs in order to operate.
        :param parser: A CharacterLevelParser that defines the allowed strings.
        )alphabetN)prefix_statesroot_parserr$   r   r   r   allowed_token_cacher   r   r   r)   config)r*   r>   r6   rC   r   r   r   r,   0   s   
zTokenEnforcer.__init__token_sequencereturnc                 C   s   t |}|dd }|| jv r| j| jS || jvr/tj| jd}|| j|< | || |jS | j| }| ||}|| j|< | || |jS )a  
        Get a list of allowed tokens, given a list of tokens that were already generated.
        :param token_sequence: The tokens that were already generated, and the next token will be generated for.
        :return: A list of token ids that are allowed to be selected next.
        Nr6   )tupler@   r8   r4   r=   rA   _compute_allowed_tokens_apply_new_characters)r*   rD   
sent_tupleprev_step_tuplestateprev_step_state	new_stater   r   r   get_allowed_tokensD   s   	




z TokenEnforcer.get_allowed_tokensstate_tokensrM   r5   c                 C   sL  z^t | j| j}|j }|d ur|| jv r| j| |_W d S |j }| |j| j	j
|| |j rGt| jtrA|| j n|| j |sMtd||_|d ur\|| j|< W d S W d S  tyf     ty   tjtjd | t|}td| d t | j| j|_t| jtr|j| j Y d S |j| j Y d S w )Nz+Parser reached state with no allowed tokens)levelz+Unknown LMFormatEnforcer Problem. Prefix: 'z'
Terminating the parser. Please open an issue at 
https://github.com/noamgat/lm-format-enforcer/issues with the prefix and CharacterLevelParser parameters)r   r   r   r6   	cache_keyrB   r8   shortcut_key_collect_allowed_tokensr$   r&   can_end
isinstancer   r<   extendappend
ValueErrorr   	ExceptionloggingbasicConfigERRORr   	exception)r*   rQ   rM   r8   rS   rT   prefixr   r   r   rI   b   s:   


z%TokenEnforcer._compute_allowed_tokens	tree_noder8   rT   c                 C   s   | |j | }|j }t||}t|trV|d dkrVt	|dks)J |\}}	}
}| j
j}t|jtd|
|	 }t|j||	 }| |||j |dg}|D ]}||}|j| }| |||d  qXd S )Nr   json_freetext   ")rX   tokensget_allowed_charactersr'   r(   setintersectionrW   rH   r!   r$   json_freetext_tokensminmax_token_lenmaxlookup_allowed_tokensr8   add_characterrU   )r*   r6   ra   r8   rT   allowed_charactersrelevant_characterscharacters_to_explore_cur_lenmin_lenmax_lencachemin_remainingmax_allowed_len	characternext_parsernext_tree_noder   r   r   rU      s"   


z%TokenEnforcer._collect_allowed_tokensc           
      C   s   t j|jd}|d }|| jjv r|g|_| jj| }n|j|g |_| |j}| |j}|t|d  }|D ]-}z	|j	||_W q9 t
yf }	 ztd| d|	 d t |_W Y d }	~	q9d }	~	ww |S )NrG   rF   zReceived an invalid character 'z+', switching to ForceStopParser (Exception:))r4   r=   r6   r$   new_word_tokensr:   tokens_to_strsr   r!   rn   r[   r\   debugr   )
r*   rM   rD   rO   	new_tokennew_charactersprev_decodednew_decodedry   er   r   r   rJ      s$   z#TokenEnforcer._apply_new_charactersN)r-   r.   r/   r0   r   r=   r   r   r,   r   r1   r   rP   r
   rI   r   r	   r   rU   rJ   r   r   r   r   r4   &   s    (r4   )dataclassesr   r   systypingr   r   r   r   r   r	   r
   r   r\   
exceptionsr   characterlevelparserr   r   r   tokenizerprefixtreer   r   	tokenlistr   r   r4   r   r   r   r   <module>   s    (