
    h                         d Z ddlmZmZmZ ddlmZ ddlmZ ddl	m
Z
  ej        e          Z G d de          Z G d d	e
          Zdd	gZd
S )z#Tokenization classes for vibevoice.    )ListOptionalUnion)logging)Qwen2Tokenizer)Qwen2TokenizerFastc                        e Zd ZdZddgZ	 	 	 	 	 	 	 d fd		Zd
 Zedefd            Z	edefd            Z
edefd            Zedefd            Zedefd            Z xZS )VibeVoiceTextTokenizera  
    Construct a VibeVoice tokenizer. Based on the Qwen2 tokenizer with additional special tokens for speech.
    
    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not used for vibevoice.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding.
        add_special_tokens (`bool`, *optional*, defaults to `True`):
            Whether or not to add special tokens when encoding.
    	input_idsattention_maskreplace<|endoftext|>NFTc
                 v     t                      j        d|||||||||	d	|
 |                                  d S )N)	
vocab_filemerges_fileerrors	unk_token	bos_token	eos_token	pad_tokenadd_prefix_spaceadd_special_tokens super__init___add_vibevoice_special_tokens)selfr   r   r   r   r   r   r   r   r   kwargs	__class__s              Z/workspace/chatterbox-finetuning/src/vibevoice/modular/modular_vibevoice_text_tokenizer.pyr   zVibeVoiceTextTokenizer.__init__%   si     	 	
!#-1	
 	
 	
 	
 	
 	**,,,,,    c                    dg di}|                      |          }|                     d          | _        |                     d          | _        |                     d          | _        |                     d          | _        |S )&Add VibeVoice-specific special tokens.additional_special_tokens<|vision_start|><|vision_end|><|vision_pad|>r'   r(   r)   r   )r   convert_tokens_to_ids_speech_start_id_speech_end_id_speech_diffusion_id_eos_idr   special_tokens	num_addeds      r!   r   z4VibeVoiceTextTokenizer._add_vibevoice_special_tokensB   s     ( * * *
 ++N;;	 !% : :;M N N"889IJJ$($>$>?O$P$P!11/BBr"   returnc                     | j         S z Id of the end of sequence token.r.   r   s    r!   eos_idzVibeVoiceTextTokenizer.eos_idV        |r"   c                     | j         S zId of the speech start token.r+   r6   s    r!   speech_start_idz&VibeVoiceTextTokenizer.speech_start_id[        $$r"   c                     | j         S zId of the speech end token.r,   r6   s    r!   speech_end_idz$VibeVoiceTextTokenizer.speech_end_id`        ""r"   c                     | j         S z!Id of the speech diffusion token.r-   r6   s    r!   speech_diffusion_idz*VibeVoiceTextTokenizer.speech_diffusion_ide        ((r"   c                     dS )4Id used for padding (returns -100 for loss masking).ir   r6   s    r!   pad_idzVibeVoiceTextTokenizer.pad_idj   s	     tr"   )r   r   Nr   r   FT__name__
__module____qualname____doc__model_input_namesr   r   propertyintr7   r<   rA   rF   rJ   __classcell__r    s   @r!   r
   r
      s>        , %&67 !!!- - - - - -:  (     X % % % % X% #s # # # X# )S ) ) ) X)     X    r"   r
   c                        e Zd ZdZddgZ	 	 	 	 	 	 	 	 d fd	Zd Zed	efd
            Z	ed	efd            Z
ed	efd            Zed	efd            Zed	efd            Z xZS )VibeVoiceTextTokenizerFasta  
    Construct a "fast" VibeVoice tokenizer (backed by HuggingFace's *tokenizers* library).
    Based on the Qwen2 tokenizer with additional special tokens for speech.
    
    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to [tokenizers](https://github.com/huggingface/tokenizers) file.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not used for vibevoice.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding.
    r   r   Nr   Fc	                 t     t                      j        d||||||||d|	 |                                  d S )N)r   r   tokenizer_filer   r   r   r   r   r   r   )r   r   r   rX   r   r   r   r   r   r   r    s             r!   r   z#VibeVoiceTextTokenizerFast.__init__   sf     	 
	
!#)-
	
 
	
 
	
 
	
 
	
 	**,,,,,r"   c                 $   dg di}|                      |          }|                     d          | _        |                     d          | _        |                     d          | _        | j        | _        |                     d          | _        |S )r$   r%   r&   r'   r(   r)   z<|image_pad|>)r   r*   r+   r,   r-   eos_token_idr.   _pad_idr/   s      r!   r   z8VibeVoiceTextTokenizerFast._add_vibevoice_special_tokens   s     ( * * *
 ++N;;	 !% : :;M N N"889IJJ$($>$>?O$P$P! (11/BBr"   r2   c                     | j         S r4   r5   r6   s    r!   r7   z!VibeVoiceTextTokenizerFast.eos_id   r8   r"   c                     | j         S r:   r;   r6   s    r!   r<   z*VibeVoiceTextTokenizerFast.speech_start_id   r=   r"   c                     | j         S r?   r@   r6   s    r!   rA   z(VibeVoiceTextTokenizerFast.speech_end_id   rB   r"   c                     | j         S rD   rE   r6   s    r!   rF   z.VibeVoiceTextTokenizerFast.speech_diffusion_id   rG   r"   c                     | j         S )rI   )r[   r6   s    r!   rJ   z!VibeVoiceTextTokenizerFast.pad_id   r8   r"   )NNNr   Nr   r   FrK   rT   s   @r!   rV   rV   p   sA        * %&67 !!!- - - - - -6  ,     X % % % % X% #s # # # X# )S ) ) ) X)     X    r"   rV   N)rO   typingr   r   r   transformers.utilsr   ,transformers.models.qwen2.tokenization_qwen2r   1transformers.models.qwen2.tokenization_qwen2_fastr   
get_loggerrL   loggerr
   rV   __all__r   r"   r!   <module>rh      s    ) ) ( ( ( ( ( ( ( ( ( ( & & & & & & G G G G G G P P P P P P		H	%	%a a a a a^ a a aH` ` ` ` `!3 ` ` `H  r"   