o
    ꁱi"                     @   s  d Z ddlmZ ddlmZmZmZmZmZm	Z	 ddl
m
Z
 ddlZddlZddlmZ ddlm  mZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
l m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- e'.e/Z0e1e!dre!j2du rg de!_2G dd dej3Z4G dd dej3Z5G dd de#Z6G dd de6Z7e8e-e7 g dZ9dS )aU  
VibeVoice Streaming Model Architecture (0.5B)

This module implements the streaming-optimized version of VibeVoice for real-time TTS.
Key differences from the multi-speaker model:
- No semantic tokenizer (only acoustic)
- Split language model architecture: lower layers for text, upper layers for TTS
- Optimized for low-latency generation
    )	dataclass)DictListOptionalTupleUnionCallable)tqdmN)	AutoModelAutoModelForCausalLM)ACT2FN)CausalLMOutputBaseModelOutputWithPastModelOutput)LlamaRMSNorm)modeling_utils)PreTrainedModel)FlashAttentionKwargs)logging   )VibeVoiceDiffusionHead)DPMSolverMultistepScheduler)VibeVoiceStreamingConfigALL_PARALLEL_STYLES)tpnonecolwiserowwisec                       (   e Zd ZdZ fddZdd Z  ZS )BinaryClassifierz?Binary classifier for end-of-speech detection in streaming TTS.c                    s.   t t|   t||| _t|d| _d S )Nr   )superr   __init__nnLinearfc1fc2)selfhidden_size	__class__ R/home/ubuntu/vibevoice-community/vibevoice/modular/modeling_vibevoice_streaming.pyr!   -   s   zBinaryClassifier.__init__c                 C   s   t | |}| |}|S N)torchrelur$   r%   )r&   xr*   r*   r+   forward2   s   
zBinaryClassifier.forward__name__
__module____qualname____doc__r!   r0   __classcell__r*   r*   r(   r+   r   *   s    r   c                       r   )SpeechConnectorzQConnector module that projects speech latents to language model hidden dimension.c                    s8   t    t||| _t|dd| _t||| _d S )Ngư>)eps)r    r!   r"   r#   r$   r   normr%   )r&   	input_dim
output_dimr(   r*   r+   r!   ;   s   
zSpeechConnector.__init__c                 K   s"   |  |}| |}| |}|S r,   )r$   r9   r%   )r&   featureskwargsr/   r*   r*   r+   r0   A   s   


zSpeechConnector.forwardr1   r*   r*   r(   r+   r7   8   s    r7   c                   @   s@   e Zd ZdZeZdZdZdZdZ	dZ
dZdZdZdZdd ZdS )!VibeVoiceStreamingPreTrainedModelz*Base class for VibeVoice Streaming models.modelTpast_key_valuesc                 C   s   t |tr|  d S t| jdrt| jjdr| jjj}nt| jdr1t| jjdr1| jjj}nd}t |tj	rQ|j
jjd|d |jd urO|jj  d S d S t |tjrf|j
jd |jj  d S d S )Nlanguage_model_configinitializer_rangedecoder_configg{Gz?g        )meanstdg      ?)
isinstancer   initialize_weightshasattrconfigrA   rB   rC   r"   r#   weightdatanormal_biaszero_	LayerNormfill_)r&   modulerE   r*   r*   r+   _init_weightsV   s"   

z/VibeVoiceStreamingPreTrainedModel._init_weightsN)r2   r3   r4   r5   r   config_classbase_model_prefixsupports_gradient_checkpointing_skip_keys_device_placement_supports_cache_class_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_static_cache_supports_attention_backendrR   r*   r*   r*   r+   r>   H   s    r>   c                       sB   e Zd ZdZ fddZdd Zdd Zdd	d
Zdd Z  Z	S )VibeVoiceStreamingModela8  
    VibeVoice Streaming Model for real-time TTS.

    The model uses a split architecture:
    - language_model: Lower transformer layers for text encoding
    - tts_language_model: Upper transformer layers for TTS generation

    This separation enables streaming text input and low-latency speech output.
    c                    s@  t  | t|dr!|jd ur!t|jtrtt|j}n|j}ntj}t	
|j}t|dd|j }||_t|| _t | j_t	
|}|j|_t|| _tjd|jjd| _t|j|| _t|j|j|| _| dtt d | dtt d t|j!|| _"t#|j!j$|j!j%|j!j&d	| _'d S )
Ntorch_dtypenum_hidden_layers      )num_embeddingsembedding_dimspeech_scaling_factornanspeech_bias_factor)num_train_timestepsbeta_scheduleprediction_type)(r    r!   rH   r^   rF   strgetattrr-   float32copydeepcopyrC   tts_backbone_num_hidden_layersr_   r
   from_configlanguage_modelr"   Identityr9   tts_language_model	Embeddingr'   tts_input_typesacoustic_tokenizer_configtoacoustic_tokenizerr7   acoustic_vae_dimacoustic_connectorregister_buffertensorfloatdiffusion_head_configprediction_headr   ddpm_num_stepsddpm_beta_scheduleri   noise_scheduler)r&   rI   dtype	lm_configlm_backbone_num_hidden_layerstts_lm_configr(   r*   r+   r!   w   s2   
z VibeVoiceStreamingModel.__init__c                 C   sL   t | jdr
| jjS | jj D ]\}}|jdkr!t| j|  S qJ d)Nembed_tokenszembed_tokens.weightFzshould not arrive here)rH   rq   r   fullmapitems	orig_namerk   )r&   nameattrr*   r*   r+   get_input_embeddings   s   
z,VibeVoiceStreamingModel.get_input_embeddingsc                 C   s   || j _d S r,   )rq   r   )r&   valuer*   r*   r+   set_input_embeddings   s   z,VibeVoiceStreamingModel.set_input_embeddingsNc                 C   s"   || _ | j dur| j   dS dS )z@Set the speech tokenizers used for encoding and decoding speech.N)rx   eval)r&   rx   r*   r*   r+   set_speech_tokenizers   s   
z-VibeVoiceStreamingModel.set_speech_tokenizersc                 O   s   t d)aq  
        Intentionally not implemented.

        This streaming model is split into two explicit submodules:
          - `language_model`      for plain text processing (lower layers).
          - `tts_language_model`  for TTS-related upper layers.

        We deliberately avoid a unified `forward` to prevent accidental calls
        that mix responsibilities.

        To use the model:
          - Call `self.language_model(...)` for text embeddings / hidden states.
          - Call `self.tts_language_model(...)` for the TTS portion.
          - Use the dedicated inference class for combined generation logic.
        zVibeVoiceStreamingModel.forward is intentionally disabled. Use `model.language_model(...)` or `model.tts_language_model(...)` instead.)RuntimeError)r&   argsr=   r*   r*   r+   r0      s   zVibeVoiceStreamingModel.forwardr,   )
r2   r3   r4   r5   r!   r   r   r   r0   r6   r*   r*   r(   r+   r]   l   s    
/

r]   )r>   r]   r   r7   ):r5   dataclassesr   typingr   r   r   r   r   r   r	   rm   r-   torch.nnr"   torch.nn.functional
functionalFtorch.distributeddistributeddisttransformers.models.autor
   r   transformers.activationsr   transformers.modeling_outputsr   r   r   (transformers.models.llama.modeling_llamar   transformersr   transformers.modeling_utilsr   +transformers.modeling_flash_attention_utilsr   transformers.utilsr    modular_vibevoice_diffusion_headr   vibevoice.schedule.dpm_solverr   !configuration_vibevoice_streamingr   
get_loggerr2   loggerrH   r   Moduler   r7   r>   r]   register__all__r*   r*   r*   r+   <module>   s:    
 

$e