APEX FusedRMSNorm not available, using native implementation /home/ubuntu/vibevoice/vibevoice/processor/vibevoice_asr_processor.py:23: UserWarning: audio_utils not available, will fall back to soundfile for audio loading warnings.warn("audio_utils not available, will fall back to soundfile for audio loading") loading file vocab.json from cache at /home/ubuntu/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/vocab.json loading file merges.txt from cache at /home/ubuntu/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/merges.txt loading file tokenizer.json from cache at /home/ubuntu/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/tokenizer.json loading file added_tokens.json from cache at None loading file special_tokens_map.json from cache at None loading file tokenizer_config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/tokenizer_config.json loading file chat_template.jinja from cache at None The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. The tokenizer class you load from this checkpoint is 'Qwen2Tokenizer'. The class this function is called from is 'VibeVoiceTextTokenizerFast'. Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--microsoft--VibeVoice-1.5B/snapshots/c00898d257e6b46004e3e2866a47534085fb685a/config.json Model config VibeVoiceConfig { "acoustic_tokenizer_config": { "causal": true, "channels": 1, "conv_bias": true, "conv_norm": "none", "corpus_normalize": 0.0, "decoder_depths": null, "decoder_n_filters": 32, "decoder_ratios": [ 8, 5, 5, 4, 2, 2 ], "disable_last_norm": true, "encoder_depths": "3-3-3-3-3-3-8", "encoder_n_filters": 32, "encoder_ratios": [ 8, 5, 5, 4, 2, 2 ], "fix_std": 0.5, "layer_scale_init_value": 1e-06, "layernorm": "RMSNorm", "layernorm_elementwise_affine": true, "layernorm_eps": 1e-05, "mixer_layer": "depthwise_conv", "model_type": "vibevoice_acoustic_tokenizer", "pad_mode": "constant", "std_dist_type": "gaussian", "vae_dim": 64, "weight_init_value": 0.01 }, "acoustic_vae_dim": 64, "architectures": [ "VibeVoiceForConditionalGeneration" ], "decoder_config": { "attention_dropout": 0.0, "hidden_act": "silu", "hidden_size": 1536, "initializer_range": 0.02, "intermediate_size": 8960, "max_position_embeddings": 65536, "max_window_layers": 28, "model_type": "qwen2", "num_attention_heads": 12, "num_hidden_layers": 28, "num_key_value_heads": 2, "rms_norm_eps": 1e-06, "rope_scaling": null, "rope_theta": 1000000.0, "sliding_window": null, "tie_word_embeddings": true, "torch_dtype": "bfloat16", "use_cache": true, "use_sliding_window": false, "vocab_size": 151936 }, "diffusion_head_config": { "ddpm_batch_mul": 4, "ddpm_beta_schedule": "cosine", "ddpm_num_inference_steps": 20, "ddpm_num_steps": 1000, "diffusion_type": "ddpm", "head_ffn_ratio": 3.0, "head_layers": 4, "hidden_size": 1536, "latent_size": 64, "model_type": "vibevoice_diffusion_head", "prediction_type": "v_prediction", "rms_norm_eps": 1e-05, "speech_vae_dim": 64 }, "model_type": "vibevoice", "semantic_tokenizer_config": { "causal": true, "channels": 1, "conv_bias": true, "conv_norm": "none", "corpus_normalize": 0.0, "disable_last_norm": true, "encoder_depths": "3-3-3-3-3-3-8", "encoder_n_filters": 32, "encoder_ratios": [ 8, 5, 5, 4, 2, 2 ], "fix_std": 0, "layer_scale_init_value": 1e-06, "layernorm": "RMSNorm", "layernorm_elementwise_affine": true, "layernorm_eps": 1e-05, "mixer_layer": "depthwise_conv", "model_type": "vibevoice_semantic_tokenizer", "pad_mode": "constant", "std_dist_type": "none", "vae_dim": 128, "weight_init_value": 0.01 }, "semantic_vae_dim": 128, "torch_dtype": "bfloat16", "transformers_version": "4.51.3" } loading weights file model.safetensors from cache at /home/ubuntu/.cache/huggingface/hub/models--microsoft--VibeVoice-1.5B/snapshots/c00898d257e6b46004e3e2866a47534085fb685a/model.safetensors.index.json Using device: cuda Setting seed: 42 Found 10 voice files in /home/ubuntu/vibevoice/demo/voices Available voices: en-Alice_woman, en-Carter_man, en-Frank_man, en-Mary_woman_bgm, en-Maya_woman, in-Samuel_man, modi, zh-Anchen_man_bgm, zh-Bowen_man, zh-Xinran_woman Reading script from: demo/text_examples/modi_hindi.txt Found 2 speaker segments: 1. Speaker 1 Text preview: Speaker 1: Mere pyaare deshvasiyon, aaj main aapke saath kuch bahut zaroori baatein karna chahta hoo... 2. Speaker 1 Text preview: Speaker 1: Aaj hum Digital India ki baat karte hain. Gaon gaon mein internet pahunch raha hai. Kisan... Speaker mapping: Speaker 1 -> modi Speaker 1 ('modi') -> Voice: modi.wav Loading processor & model from microsoft/VibeVoice-1.5B Using device: cuda, torch_dtype: torch.bfloat16, attn_implementation: flash_attention_2 Fetching 3 files: 0%| | 0/3 [00:00