o
    聱i                     @   sr   d Z ddlmZmZmZ ddlmZ ddlmZ ddl	m
Z
 eeZG dd deZG dd	 d	e
Zdd	gZd
S )z#Tokenization classes for vibevoice.    )ListOptionalUnion)logging)Qwen2Tokenizer)Qwen2TokenizerFastc                       s   e Zd ZdZddgZ							d fd	d
	Zdd ZedefddZ	edefddZ
edefddZedefddZedefddZ  ZS )VibeVoiceTextTokenizera  
    Construct a VibeVoice tokenizer. Based on the Qwen2 tokenizer with additional special tokens for speech.
    
    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not used for vibevoice.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding.
        add_special_tokens (`bool`, *optional*, defaults to `True`):
            Whether or not to add special tokens when encoding.
    	input_idsattention_maskreplace<|endoftext|>NFTc
                    s2   t  jd|||||||||	d	|
 |   d S )N)	
vocab_filemerges_fileerrors	unk_token	bos_token	eos_token	pad_tokenadd_prefix_spaceadd_special_tokens super__init___add_vibevoice_special_tokens)selfr   r   r   r   r   r   r   r   r   kwargs	__class__r   [/home/ubuntu/VibeVoice-finetuning/src/vibevoice/modular/modular_vibevoice_text_tokenizer.pyr   %   s   
zVibeVoiceTextTokenizer.__init__c                 C   sJ   dg di}|  |}| d| _| d| _| d| _| d| _|S )&Add VibeVoice-specific special tokens.additional_special_tokens<|vision_start|><|vision_end|><|vision_pad|>r#   r$   r%   r   )r   convert_tokens_to_ids_speech_start_id_speech_end_id_speech_diffusion_id_eos_idr   special_tokens	num_addedr   r   r   r   B   s   
z4VibeVoiceTextTokenizer._add_vibevoice_special_tokensreturnc                 C      | j S z Id of the end of sequence token.r*   r   r   r   r   eos_idV      zVibeVoiceTextTokenizer.eos_idc                 C   r/   zId of the speech start token.r'   r2   r   r   r   speech_start_id[   r4   z&VibeVoiceTextTokenizer.speech_start_idc                 C   r/   zId of the speech end token.r(   r2   r   r   r   speech_end_id`   r4   z$VibeVoiceTextTokenizer.speech_end_idc                 C   r/   z!Id of the speech diffusion token.r)   r2   r   r   r   speech_diffusion_ide   r4   z*VibeVoiceTextTokenizer.speech_diffusion_idc                 C   s   dS )4Id used for padding (returns -100 for loss masking).ir   r2   r   r   r   pad_idj   s   zVibeVoiceTextTokenizer.pad_id)r   r   Nr   r   FT__name__
__module____qualname____doc__model_input_namesr   r   propertyintr3   r7   r:   r=   r?   __classcell__r   r   r   r   r      s,    r   c                       s   e Zd ZdZddgZ								d fdd	Zd	d
 ZedefddZ	edefddZ
edefddZedefddZedefddZ  ZS )VibeVoiceTextTokenizerFasta  
    Construct a "fast" VibeVoice tokenizer (backed by HuggingFace's *tokenizers* library).
    Based on the Qwen2 tokenizer with additional special tokens for speech.
    
    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to [tokenizers](https://github.com/huggingface/tokenizers) file.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not used for vibevoice.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding.
    r	   r
   Nr   Fc	           
         s0   t  jd||||||||d|	 |   d S )N)r   r   tokenizer_filer   r   r   r   r   r   r   )
r   r   r   rJ   r   r   r   r   r   r   r   r   r   r      s   	z#VibeVoiceTextTokenizerFast.__init__c                 C   sR   dg di}|  |}| d| _| d| _| d| _| j| _| d| _|S )r    r!   r"   r#   r$   r%   z<|image_pad|>)r   r&   r'   r(   r)   eos_token_idr*   _pad_idr+   r   r   r   r      s   
z8VibeVoiceTextTokenizerFast._add_vibevoice_special_tokensr.   c                 C   r/   r0   r1   r2   r   r   r   r3      r4   z!VibeVoiceTextTokenizerFast.eos_idc                 C   r/   r5   r6   r2   r   r   r   r7      r4   z*VibeVoiceTextTokenizerFast.speech_start_idc                 C   r/   r8   r9   r2   r   r   r   r:      r4   z(VibeVoiceTextTokenizerFast.speech_end_idc                 C   r/   r;   r<   r2   r   r   r   r=      r4   z.VibeVoiceTextTokenizerFast.speech_diffusion_idc                 C   r/   )r>   )rL   r2   r   r   r   r?      r4   z!VibeVoiceTextTokenizerFast.pad_id)NNNr   Nr   r   Fr@   r   r   r   r   rI   p   s.    rI   N)rD   typingr   r   r   transformers.utilsr   ,transformers.models.qwen2.tokenization_qwen2r   1transformers.models.qwen2.tokenization_qwen2_fastr   
get_loggerrA   loggerr   rI   __all__r   r   r   r   <module>   s    
dd