from collections import OrderedDict
from typing import Dict, List, Set, Tuple
import json

from .tokenlist import TokenList


class TokenizerPrefixTreeNode:
    def __init__(self) -> None:
        self.tokens: List[int] = []
        self.children: Dict[str, TokenizerPrefixTreeNode] = {}


class JsonFreetextTokenCache:
    """
    JSON string can contain almost any unicode character, so creating a list of allowed tokens is very expensive.
    The list can be cached, but JSON Schema also allows 'minLength' and 'maxLength' constraints on the string,
    that make some tokens illegal depending on how long the generated string already is. This class precalculates
    a separate allowlist for all possible constraint states up to the maximum token length (16 in Llama, for example).
    After deduplication, this results in about ~75 lists for the Llama tokenizer.
    """

    class _StringLengthTokenCache:
        """This is an internal data structure that, given a list of string+token pairs,
        can quickly return all token ids of strings between certain lengths"""

        def __init__(self):
            self.tokens: List[int] = []
            # first_index_geq_than_length[L] is the index of the first token whose string is at least L characters long.
            self.first_index_geq_than_length: List[int] = [0]

        def build(self, token_strs_to_idx: List[Tuple[str, int]]):
            # Sort tokens by string length so that a (min_length, max_length) query becomes a contiguous slice.
            token_strs_to_idx = sorted(token_strs_to_idx, key=lambda p: len(p[0]))
            self.tokens = [pair[1] for pair in token_strs_to_idx]
            token_lengths = [len(pair[0]) for pair in token_strs_to_idx]
            for idx, token_length in enumerate(token_lengths):
                while len(self.first_index_geq_than_length) <= token_length:
                    self.first_index_geq_than_length.append(idx)
            self.first_index_geq_than_length.append(len(token_lengths))

        def get_indices_between_length(self, min_length: int = -1, max_length: int = -1) -> List[int]:
            if min_length >= len(self.first_index_geq_than_length):
                return []
            start_index = self.first_index_geq_than_length[min_length] if min_length > 0 else 0
            if max_length == 0:
                end_index = 0
            elif max_length + 1 < len(self.first_index_geq_than_length):
                end_index = self.first_index_geq_than_length[max_length + 1]
            else:
                end_index = len(self.tokens)
            return self.tokens[start_index:end_index]

    def __init__(self, use_bitmask: bool, vocab_size: int) -> None:
        self.token_num_to_str: Dict[int, str] = {}
        self.allowlist_cache: Dict[Tuple[int, int], TokenList] = {}
        self.max_token_len = 0
        self.regular_tokens_length_cache = JsonFreetextTokenCache._StringLengthTokenCache()
        self.quote_tokens_length_cache = JsonFreetextTokenCache._StringLengthTokenCache()
        self.use_bitmask = use_bitmask
        self.vocab_size = vocab_size

    def add_token(self, token_str: str, token_int: int):
        assert not self.allowlist_cache, "Cannot add more tokens after allowlists were precalculated"

        has_non_trailing_backslash = "\\" in token_str[:-1]
        has_quote_before_end = '"' in token_str[0:-1]
        has_newline = "\n" in token_str or "\r" in token_str
        if has_non_trailing_backslash or has_quote_before_end or has_newline:
            try:
                json.loads(f'"{token_str}"')
            except json.decoder.JSONDecodeError:
                return  # Illegal inside a JSON string, skip this token

        if len(token_str) == 0:
            # Tokens that don't decode to anything are ignored and will not be allowed in JSON freetext fields.
            return

        self.token_num_to_str[token_int] = token_str

    def lookup_allowed_tokens(self, min_remaining: int, max_len: int) -> TokenList:
        """
        Get the list of tokens that are allowed within a JSON string, such that:
        1. all candidate tokens are at most `max_len` characters long (excluding the trailing quote), and
        2. if a token ends with a quote, it's at least `min_remaining` chars long (excluding the quote).
        """
        cache_key = (min_remaining, max_len)
        if cache_key not in self.allowlist_cache:
            tokens_with_quote = self.quote_tokens_length_cache.get_indices_between_length(min_remaining + 1, max_len + 1)
            tokens_without_quote = self.regular_tokens_length_cache.get_indices_between_length(-1, max_len)
            combined = tokens_with_quote + tokens_without_quote
            new_tokenlist = TokenList(self.use_bitmask, self.vocab_size)
            new_tokenlist.extend(combined)
            self.allowlist_cache[cache_key] = new_tokenlist
        return self.allowlist_cache[cache_key]

    def freeze(self) -> None:
        """
        Precalculate token allowlists for all valid combinations of `min_remaining` and `max_len`
        based on the tokens that were added with `add_token()`.
        """
        all_tokens = list((s, n) for n, s in self.token_num_to_str.items())
        assert all_tokens, "Cannot precalculate allowlists for an empty token list"
        assert not any(pair[0] == '' for pair in all_tokens), "Tokenizer must not contain empty tokens"

        # Tokens that end with a quote close the JSON string, so they are cached separately.
        regular_tokens: List[Tuple[str, int]] = []
        quote_tokens: List[Tuple[str, int]] = []
        for pair in all_tokens:
            if pair[0].endswith('"'):
                quote_tokens.append(pair)
            else:
                regular_tokens.append(pair)

        self.regular_tokens_length_cache.build(regular_tokens)
        self.quote_tokens_length_cache.build(quote_tokens)
        self.max_token_len = max(len(self.regular_tokens_length_cache.first_index_geq_than_length),
                                 len(self.quote_tokens_length_cache.first_index_geq_than_length))
        del self.token_num_to_str


class TokenizerPrefixTree:
    def __init__(self, regular_tokens: List[Tuple[int, str, bool]], use_bitmask: bool, vocab_size: int):
        self.root = TokenizerPrefixTreeNode()
        self.json_freetext_tokens = JsonFreetextTokenCache(use_bitmask, vocab_size)
        self.new_word_tokens: Set[int] = set()
        self.tokens_to_strs = {token_idx: token_str for token_idx, token_str, _ in regular_tokens}
        for token_idx, decoded, is_new_word in regular_tokens:
            self._add_token_to_tree(decoded, token_idx, self.root)
            self.json_freetext_tokens.add_token(decoded, token_idx)
            if is_new_word:
                self.new_word_tokens.add(token_idx)

        self.json_freetext_tokens.freeze()

    def _add_token_to_tree(self, token_str: str, token_idx: int, node: TokenizerPrefixTreeNode):
        for character in token_str:
            if character not in node.children:
                node.children[character] = TokenizerPrefixTreeNode()
            node = node.children[character]
        node.tokens.append(token_idx)
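

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: how a caller might
# build a TokenizerPrefixTree from a toy vocabulary and walk the prefix tree.
# The three-field tuples (token id, decoded string, is-new-word flag) mirror
# the `regular_tokens` argument above; the toy vocabulary and the variable
# names below are invented purely for illustration.
if __name__ == "__main__":
    toy_vocab = [
        (0, "the", True),
        (1, "then", True),
        (2, "re", False),
        (3, '"', False),
    ]
    tree = TokenizerPrefixTree(toy_vocab, use_bitmask=False, vocab_size=len(toy_vocab))

    # Walk the prefix tree for the text "the": each node visited along the way
    # holds the ids of tokens whose decoded string ends exactly at that node.
    node = tree.root
    matching: List[int] = []
    for ch in "the":
        node = node.children[ch]
        matching.extend(node.tokens)
    print(matching)  # -> [0]; "then" would only match one character later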