o
    ꁱiH                     @   s   d Z ddlZddlZddlmZmZmZmZmZm	Z	 ddl
Z
ddlZddlZddlZddlmZmZmZmZmZ ddlmZmZ ddlmZ eeZG dd	 d	Zd	gZdS )
z
VibeVoice Streaming Processor

This processor handles input preparation for the streaming 0.5B model,
including text tokenization and cached voice prompt handling.
    N)ListOptionalUnionDictAnyTuple)BatchEncodingPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypelogging   )AudioNormalizerc                   @   s  e Zd ZdZd0ddZedd Zd	eee	j
f fd
dZdefddZ							d1dee deeeef  deeeef deeeef dee deeeef  dedefddZ					d2deeeef  deeeef deeeef dee deeeef  dedefddZ			d3deej deeeef  deeeejf  deej deeef f
dd Zd!d" Zd#d$ Z e!d%d& Z"	'			(d4d)eej#ejeeej#ejf  f d*ed+ee d,ed-edefd.d/Z$dS )5VibeVoiceStreamingProcessora"  
    Constructs a VibeVoice Streaming processor which wraps a VibeVoice tokenizer and audio processor into a single processor.

    The streaming processor is designed for the 0.5B real-time model that uses pre-computed voice embeddings
    (.pt files) instead of live audio conditioning.

    Args:
        tokenizer (`VibeVoiceTextTokenizer` or `VibeVoiceTextTokenizerFast`):
            The tokenizer for text processing.
        audio_processor (`VibeVoiceTokenizerProcessor`):
            The audio processor for speech processing.
        speech_tok_compress_ratio (`int`, *optional*, defaults to 3200):
            The compression ratio for speech tokenization.
        db_normalize (`bool`, *optional*, defaults to True):
            Whether to apply decibel normalization to audio inputs.
    N  Tc                 K   s2   || _ || _|| _|| _|rt | _d S d | _d S )N)	tokenizeraudio_processorspeech_tok_compress_ratiodb_normalizer   audio_normalizer)selfr   r   r   r   kwargs r   U/home/ubuntu/vibevoice-community/vibevoice/processor/vibevoice_streaming_processor.py__init__*   s
   z$VibeVoiceStreamingProcessor.__init__c              
   K   s  ddl }ddl}ddlm} ddlm} ddlm}m} |j	
|d}	d}
|j	|	rFt|	d}||}
W d   n1 s@w   Y  nMz%||dfi |}t|d}||}
W d   n1 sew   Y  W n' ty } ztd	| d
|  td ddd}
W Y d}~nd}~ww |
dd}|
dd}|
ddp|dd}td|  d| v r|j|fi |}ntd| dd|
v r|
d }||dd|dd|dd|ddd}n| }| ||||d S )!a  
        Instantiate a VibeVoiceStreamingProcessor from a pretrained VibeVoice Streaming processor.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:
                - a string, the *model id* of a pretrained model
                - a path to a *directory* containing processor config

        Returns:
            [`VibeVoiceStreamingProcessor`]: The processor object instantiated from pretrained model.
        r   N)cached_filer   )VibeVoiceTokenizerProcessor)VibeVoiceTextTokenizerVibeVoiceTextTokenizerFastpreprocessor_config.jsonrz-Could not load preprocessor_config.json from z: zUsing default configurationr   T)r   r   r   r   language_model_pretrained_namezQwen/Qwen2.5-0.5BzLoading tokenizer from qwenzUnsupported tokenizer type for z. Supported types: Qwen.r   sampling_rate]  normalize_audiotarget_dB_FSepsư>)r%   r'   r(   r*   )r   r   r   r   )osjsontransformers.utilsr   vibevoice_tokenizer_processorr   2vibevoice.modular.modular_vibevoice_text_tokenizerr   r    pathjoinexistsopenload	Exceptionloggerwarninggetpopinfolowerfrom_pretrained
ValueError)clspretrained_model_name_or_pathr   r,   r-   r   r   r   r    config_pathconfigfconfig_fileer   r   r#   r   audio_configr   r   r   r   r=   1   sr   
	



z+VibeVoiceStreamingProcessor.from_pretrainedsave_directoryc                 K   s   ddl }ddl}|j|dd d| j| jdt| jddt| jd	dt| jd
dt| jdddd}|j|d}t	|d}|j
||dd W d   n1 sQw   Y  td|  dS )a   
        Save a processor to a directory, so that it can be re-loaded using the
        [`~VibeVoiceStreamingProcessor.from_pretrained`] class method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the processor will be saved.
        r   NT)exist_okr   r   r%   r&   r'   r(   r)   r*   r+   )feature_extractor_typer%   r'   r(   r*   )processor_classr   r   r   r!   w   )indentz!Processor configuration saved in )r,   r-   makedirsr   r   getattrr   r1   r2   r4   dumpr7   r;   )r   rG   r   r,   r-   processor_configrA   rC   r   r   r   save_pretrained   s$   	z+VibeVoiceStreamingProcessor.save_pretrainedreturnc                 C   s   t d)z
        Note:
            This method is intentionally not implemented in the streaming processor.
            Use `process_input_with_cached_prompt` for streaming use cases.
        zsVibeVoiceStreamingProcessor.__call__ is not implemented. Use process_input_with_cached_prompt for streaming inputs.)NotImplementedErrorr   r   r   r   __call__   s   z$VibeVoiceStreamingProcessor.__call__Ftextcached_promptpadding
truncation
max_lengthreturn_tensorsreturn_attention_maskc                 K   s   |g}	|g}
d}g }t |	|
D ]B\}}| jj| d dd}|d d d}|d d d}| jjg| }| jjg| }dg| }|||d|d	}|| q| j||||||d
}|S )a  
        Main method to process one text script based on cached prompt.

        The streaming model uses pre-computed voice embeddings (cached prompts) loaded from .pt files
        instead of processing audio on-the-fly.

        Args:
            text (`str`):
                The input text to process.
            cached_prompt (`Dict[str, Any]`, *optional*):
                The cached prompt to use for processing. It contains the kv cache of the voice prompt.
                Load this from a .pt file using torch.load().
            padding (`bool`, `str` or `PaddingStrategy`, defaults to `True`):
                Whether to pad sequences to the same length
            truncation (`bool`, `str` or `TruncationStrategy`, defaults to `False`):
                Whether to truncate sequences
            max_length (`int`, *optional*):
                Maximum length of the returned sequences
            return_tensors (`str` or `TensorType`, *optional*):
                If set, will return tensors of a particular framework
            return_attention_mask (`bool`, defaults to `True`):
                Whether to return the attention mask

        Returns:
            `BatchEncoding`: A BatchEncoding with the following fields:
                - **input_ids** -- List of token id sequences or tensor
                - **attention_mask** -- List of attention masks or tensor
                - **tts_lm_input_ids** -- List of token id sequences or tensor used for TTS LM
                - **tts_lm_attention_mask** -- List of attention masks or tensor used for TTS LM
                - **tts_text_ids** -- List of token id sequences or tensor for TTS text input
                - **speech_tensors** -- Padded speech inputs (if voice_samples provided)
                - **speech_masks** -- Speech masks (if voice_samples provided)
                - **speech_input_mask** -- Boolean masks indicating speech token positions
        F
)add_special_tokenslmlast_hidden_stater   tts_lmN)	input_idstts_lm_input_idstts_text_idsspeech_inputsspeech_input_mask)rY   rZ   r[   r\   r]   )zipr   encodestripsizepad_idappend_batch_encode)r   rW   rX   rY   rZ   r[   r\   r]   r   textscached_prompts
is_batchedall_encodings
text_inputcached_prompt_inputscript_tokensinput_id_lengthtts_lm_input_id_lengthrc   rd   rg   encodingbatch_encodingr   r   r    process_input_with_cached_prompt   s6   .
	z<VibeVoiceStreamingProcessor.process_input_with_cached_prompt	encodingsc                 C   s  dd |D }dd |D }dd |D }	dd |D }
|r%dd |D nd}|r0dd |D nd}g }d	}|D ]}|d
 durI| |d
  d}q8t }|durtj|tjd|d< tj|tjd|d< tj|	tjd|d< |r|durtj|tjd|d< tj|tjd|d< tj|
tjd|d< n||d< ||d< |	|d< |r|dur||d< ||d< |
|d< |r| j||d}|d |d< |d |d< |S d|d< d|d< |S )z5Combine multiple encodings into a batch with padding.c                 S      g | ]}|d  qS )rc   r   .0encr   r   r   
<listcomp>      z=VibeVoiceStreamingProcessor._batch_encode.<locals>.<listcomp>c                 S   r|   )rd   r   r}   r   r   r   r     r   c                 S   r|   )re   r   r}   r   r   r   r     r   c                 S   r|   )rg   r   r}   r   r   r   r     r   c                 S      g | ]	}d gt | qS r   lenr~   idsr   r   r   r         Nc                 S   r   r   r   r   r   r   r   r     r   Frf   Tdtyperc   rd   re   attention_masktts_lm_attention_maskrg   )r\   padded_speechesspeech_tensorsspeech_masks)extendr   torchtensorlongboolprepare_speech_inputs)r   r{   rY   rZ   r[   r\   r]   input_ids_listtts_lm_input_ids_listtts_text_ids_listspeech_input_masks_listattention_maskstts_lm_attention_masksall_speech_inputs
has_speechr   ry   speech_dictr   r   r   rn     sP   z)VibeVoiceStreamingProcessor._batch_encoderf   devicer   c                    s$  |sdddS  fdd|D }t dd |D }|d jdkr.tjt||fdtjd	}ntjt|||d jd
 fdtjd	}tjt|t |ftjd}t	t
||D ]\}	\}
}|
||	dt|
f< d||	d|f< qV||d}|dkrtj|||ptjd|d< tj||tjd|d< |S )aT  
        Prepare speech inputs for model consumption.

        Args:
            speech_inputs: List of speech arrays
            return_tensors: Output tensor type
            device: Device to place tensors on
            dtype: Data type for tensors

        Returns:
            Dictionary with padded_speeches and speech_masks
        N)r   r   c                    s"   g | ]}t |jd   j qS )r   )mathceilshaper   r~   srU   r   r   r   ^  s   " zEVibeVoiceStreamingProcessor.prepare_speech_inputs.<locals>.<listcomp>c                 s   s    | ]}|j d  V  qdS )r   N)r   r   r   r   r   	<genexpr>_  s    zDVibeVoiceStreamingProcessor.prepare_speech_inputs.<locals>.<genexpr>r   r   )
fill_valuer   r   Tpt)r   r   r   r   )maxndimnpfullr   float32r   zerosbool_	enumeraterh   r   r   r   )r   rf   r\   r   r   vae_tok_seqlensmax_speech_lengthr   r   ispeechvae_tok_lengthresultr   rU   r   r   G  s$   
&z1VibeVoiceStreamingProcessor.prepare_speech_inputsc                 O      | j j|i |S )z
        This method forwards all its arguments to VibeVoiceTextTokenizer's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of this method for more information.
        )r   batch_decoder   argsr   r   r   r   r   x     z(VibeVoiceStreamingProcessor.batch_decodec                 O   r   )z
        This method forwards all its arguments to VibeVoiceTextTokenizer's [`~PreTrainedTokenizer.decode`].
        Please refer to the docstring of this method for more information.
        )r   decoder   r   r   r   r     r   z"VibeVoiceStreamingProcessor.decodec                 C   s*   | j j}| jj}tt|| ddg S )zB
        Return the list of inputs accepted by the model.
        rf   rg   )r   model_input_namesr   listdictfromkeys)r   tokenizer_input_namesaudio_processor_input_namesr   r   r   r     s   z-VibeVoiceStreamingProcessor.model_input_names
output.wavaudio_audiooutput_pathr%   	normalizebatch_prefixc                 C   s   | j j|||||dS )a  
        Save audio data to a file.

        Args:
            audio (Union[torch.Tensor, np.ndarray, List[Union[torch.Tensor, np.ndarray]]]):
                The audio data to save. Can be a single tensor/array or a list of them.
            output_path (str, optional): Path to save the audio file. Defaults to "output.wav".
            sampling_rate (int, optional): Sampling rate for the audio. If None, uses the processor's default.
            normalize (bool, optional): Whether to normalize the audio before saving. Defaults to False.
            batch_prefix (str, optional): Prefix for batch audio files. Defaults to "audio_".

        Returns:
            str: The path to the saved audio file.
        )r   r%   r   r   )r   
save_audio)r   r   r   r%   r   r   r   r   r   r     s   z&VibeVoiceStreamingProcessor.save_audio)NNr   T)NNTFNNT)TFNNT)NNN)r   NFr   )%__name__
__module____qualname____doc__r   classmethodr=   r   strr,   PathLikerR   r   rV   r   r   r   r   r	   r   intr   rz   r   rn   r   ndarrayr   r   r   r   r   r   propertyr   Tensorr   r   r   r   r   r      s    

U"

V
C

1
 r   )r   r   warningstypingr   r   r   r   r   r   r,   renumpyr   r   $transformers.tokenization_utils_baser   r	   r
   r   r   r.   r   r   r/   r   
get_loggerr   r7   r   __all__r   r   r   r   <module>   s$     
   