o
    i%                     @   s@   d dl mZ d dlZd dlmZ defddZdefdd	ZdS )
    )ListN)PreTrainedTokenizer	tokenizerc                    s.   dd | j  D  G  fddd}|| S )u  Create a tokenizer wrapper that converts multi-character Chinese tokens to single characters.
    
    This function creates a wrapper around the provided tokenizer that automatically
    splits multi-character Chinese tokens into individual characters. This is useful
    for ensuring consistent tokenization of Chinese text.
    
    Args:
        tokenizer: The base tokenizer to wrap
        
    Returns:
        A CharTokenizerWrapper instance that handles multi-character Chinese tokens
        
    Example:
        >>> from transformers import LlamaTokenizerFast
        >>> tokenizer = LlamaTokenizerFast.from_pretrained("path/to/tokenizer")
        >>> wrapped_tokenizer = mask_multichar_chinese_tokens(tokenizer)
        >>> tokens = wrapped_tokenizer("你好世界")
    c                 S   s.   h | ]}t |d krtdd |D r|qS )   c                 s   s(    | ]}d |  kodkn  V  qdS )u   一u   鿿N ).0cr   r   F/home/ubuntu/.local/lib/python3.10/site-packages/voxcpm/model/utils.py	<genexpr>   s   & z:mask_multichar_chinese_tokens.<locals>.<setcomp>.<genexpr>)lenall)r   tokenr   r   r	   	<setcomp>   s
    z0mask_multichar_chinese_tokens.<locals>.<setcomp>c                       sR   e Zd ZdZdeddf fddZdedee fdd	Zdedee	 fd
dZ
dS )z;mask_multichar_chinese_tokens.<locals>.CharTokenizerWrapperzWrapper class for tokenizers that handles multi-character Chinese tokens.
        
        This wrapper automatically splits multi-character Chinese tokens into
        individual characters while preserving the original tokenizer's interface.
        base_tokenizerreturnNc                    s   || _  | _dS )zInitialize the wrapper with a base tokenizer.
            
            Args:
                base_tokenizer: The tokenizer to wrap
            N)r   multichar_tokens)selfr   r   r   r	   __init__&   s   
zDmask_multichar_chinese_tokens.<locals>.CharTokenizerWrapper.__init__textc                 [   sv   t |tstdt| | jj|fi |}g }|D ]}|dd}|| jv r3t|}|	| q|
| q|S )ui  Tokenize text and split multi-character Chinese tokens into single characters.
            
            Args:
                text: Input text to tokenize
                **kwargs: Additional arguments passed to the base tokenizer
                
            Returns:
                List of processed tokens with multi-character Chinese tokens split
                
            Example:
                >>> wrapper = CharTokenizerWrapper(tokenizer)
                >>> tokens = wrapper.tokenize("你好世界")
                >>> # Returns ["你", "好", "世", "界"] instead of ["你好", "世界"]
            zExpected string input, got u   ▁ )
isinstancestr	TypeErrortyper   tokenizereplacer   listextendappend)r   r   kwargstokens	processedr   clean_tokencharsr   r   r	   r   /   s   

zDmask_multichar_chinese_tokens.<locals>.CharTokenizerWrapper.tokenizec              
   [   sR   z| j |fi |}| j|}|W S  ty( } z
tdt| |d}~ww )a8  Call the tokenizer and return token IDs.
            
            This method provides the same interface as the original tokenizer
            but with multi-character Chinese token handling.
            
            Args:
                text: Input text to tokenize
                **kwargs: Additional arguments passed to the base tokenizer
                
            Returns:
                List of token IDs
                
            Raises:
                TypeError: If input is not a string
                ValueError: If tokenization fails
            zTokenization failed: N)r   r   convert_tokens_to_ids	Exception
ValueErrorr   )r   r   r    r!   resulter   r   r	   __call__Q   s   zDmask_multichar_chinese_tokens.<locals>.CharTokenizerWrapper.__call__)__name__
__module____qualname____doc__r   r   r   r   r   intr*   r   r   r   r	   CharTokenizerWrapper   s
    	"r0   )vocabkeys)r   r0   r   r   r	   mask_multichar_chinese_tokens   s
   Jr3   dtypec                 C   sb   | dkrt jS | dkrt jS | dkrt jS | dkrt jS | dkr#t jS | dkr*t jS td|  )Nbfloat16bf16float16fp16float32fp32zUnsupported dtype: )torchr5   r7   r9   r'   )r4   r   r   r	   	get_dtypel   s   r<   )typingr   r;   transformersr   r3   r   r<   r   r   r   r	   <module>   s
    f