o
    i                     @   sD   d Z ddlmZmZmZmZ ddlZG dd dZdefddZdS )	u  
BiCodec Token Utilities - Incremental Parsing

Provides O(1) per-token parsing instead of O(n) decode + regex scanning.

The key optimization: instead of:
    tokenizer.decode(all_tokens)  # O(n)
    re.findall(entire_text)       # O(n)

We do:
    for new_token_id in new_tokens:  # Only new tokens
        parsed = token_cache.get(token_id)  # O(1) lookup

This eliminates the O(n²) CPU burn in the streaming hot loop.
    )DictOptionalTupleListNc                   @   s   e Zd ZdZedZedZdd Zddd	Z	d
e
deee
ef  fddZdedeee
ef  fddZdee dee dee deee ee f fddZdee
ef fddZdS )BiCodecTokenParsera  
    Caching parser for BiCodec tokens.
    
    Converts vLLM token IDs to (type, value) tuples efficiently.
    
    BiCodec tokens in the vocabulary look like:
        - <|bicodec_semantic_123|> -> ("semantic", 123)
        - <|bicodec_global_7|>     -> ("global", 7)
    
    Usage:
        parser = BiCodecTokenParser(tokenizer)
        
        # In hot loop (O(1) per token):
        for token_id in new_token_ids:
            result = parser.parse(token_id)
            if result:
                token_type, value = result
                if token_type == "semantic":
                    semantic_buffer.append(value)
                elif token_type == "global":
                    global_buffer.append(value)
    z^<\|bicodec_semantic_(\d+)\|>$z^<\|bicodec_global_(\d+)\|>$c                 C   s   || _ i | _|   dS )z
        Initialize parser with tokenizer.
        
        Args:
            tokenizer: HuggingFace tokenizer with BiCodec special tokens
        N)	tokenizer_cache_prewarm_cache)selfr    r   7/home/ubuntu/veenaModal/veena3modal/core/token_utils.py__init__1   s   zBiCodecTokenParser.__init__returnNc                 C   sZ   z"| j  }| D ]\}}|dr| |}|r|| j|< q
W dS  ty,   Y dS w )z
        Pre-populate cache with BiCodec tokens from vocabulary.
        
        This avoids lazy cache misses during streaming.
        z
<|bicodec_N)r   	get_vocabitems
startswith_parse_token_stringr   	Exception)r
   vocab	token_strtoken_idparsedr   r   r   r	   A   s   



z!BiCodecTokenParser._prewarm_cacher   c                 C   sH   | j |}|rdt|dfS | j|}|r"dt|dfS dS )a  
        Parse a token string into (type, value) tuple.
        
        Args:
            token_str: Token string like "<|bicodec_semantic_123|>"
        
        Returns:
            ("semantic", 123) or ("global", 7) or None if not BiCodec token
        semantic   globalN)SEMANTIC_PATTERNmatchintgroupGLOBAL_PATTERN)r
   r   r   r   r   r   r   S   s   z&BiCodecTokenParser._parse_token_stringr   c                 C   sd   || j v r
| j | S z| jj|gdd}| | }|| j |< |W S  ty1   d| j |< Y dS w )a  
        Parse a single token ID into (type, value) tuple.
        
        O(1) for cached tokens, O(1) amortized for uncached.
        
        Args:
            token_id: vLLM token ID
        
        Returns:
            ("semantic", value) or ("global", value) or None
        F)skip_special_tokensN)r   r   decoder   stripr   )r
   r   r   r   r   r   r   parsei   s   



zBiCodecTokenParser.parsenew_token_idssemantic_bufferglobal_bufferc                 C   sN   |D ] }|  |}|r"|\}}|dkr|| q|dkr"|| q||fS )a  
        Parse new tokens and append to existing buffers.
        
        This is the main method for incremental streaming:
        - Only processes NEW tokens (not entire history)
        - Modifies buffers in-place for efficiency
        - Returns the updated buffers (same objects)
        
        Args:
            new_token_ids: Only the new token IDs since last call
            semantic_buffer: Existing semantic token values (modified in-place)
            global_buffer: Existing global token values (modified in-place)
        
        Returns:
            Tuple of (semantic_buffer, global_buffer) - same objects, modified
        r   r   )r#   append)r
   r$   r%   r&   r   r   
token_typevaluer   r   r   parse_incremental   s   

z$BiCodecTokenParser.parse_incrementalc                 C   s2   t | j}tdd | j D }|||| dS )z
        Return cache statistics for debugging.
        
        Returns:
            Dict with cache size and breakdown
        c                 s   s    | ]	}|d urdV  qd S )Nr   r   ).0vr   r   r   	<genexpr>   s    z5BiCodecTokenParser.get_cache_stats.<locals>.<genexpr>)total_cachedbicodec_tokensnon_bicodec_tokens)lenr   sumvalues)r
   totalbicodec_countr   r   r   get_cache_stats   s   
z"BiCodecTokenParser.get_cache_stats)r   N)__name__
__module____qualname____doc__recompiler   r   r   r	   strr   r   r   r   r#   r   r*   r   r6   r   r   r   r   r      s$    



!r   r   c                 C   s   t | S )z
    Factory function to create a BiCodec token parser.
    
    Args:
        tokenizer: HuggingFace tokenizer
    
    Returns:
        Configured BiCodecTokenParser instance
    )r   )r   r   r   r   create_bicodec_parser   s   
r>   )	r:   typingr   r   r   r   r;   r   r>   r   r   r   r   <module>   s     !