o
    ꁱiY                     @   s\   d Z ddlmZ ddlmZ ddlmZ ddlmZm	Z	 e
eZG dd deZdgZd	S )
z( VibeVoice Streaming model configuration    )PretrainedConfig)logging)Qwen2Config   ) VibeVoiceAcousticTokenizerConfigVibeVoiceDiffusionHeadConfigc                       sR   e Zd ZdZdZdZeeedZ	ddddddddZ
					d fd
d	Z  ZS )VibeVoiceStreamingConfiga  
    Configuration class for the VibeVoice Streaming model (0.5B).

    The streaming model differs from the multi-speaker model:
    - No semantic tokenizer (only acoustic)
    - Split language model: lower layers for text encoding, upper layers for TTS
    - Optimized for low-latency real-time generation

    Args:
        acoustic_tokenizer_config: Configuration for the acoustic tokenizer
        decoder_config: Configuration for the Qwen2 language model backbone
        diffusion_head_config: Configuration for the diffusion prediction head
        tts_backbone_num_hidden_layers: Number of upper transformer layers used for TTS (default: 20)
    vibevoice_streamingT)acoustic_tokenizer_configdecoder_configdiffusion_head_configcolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projN   c                    sD  d|d< |d u r| j d  | _nt|tr%d|d< | j d di || _nt|tr-|| _|d u r9| j d  | _n*t|trZ|dddkrOtdi || _ntd	|dd t|tfrc|| _|d u ro| j d
  | _	nt|trd|d< | j d
 di || _	nt|t
r|| _	t| jdd| _|| _t jdi | d S )NF_attn_implementation_autosetr
   vibevoice_acoustic_tokenizer
model_typer    qwen2z Unsupported decoder model type: r   vibevoice_diffusion_headvae_dim@    )sub_configsr
   
isinstancedictr   r   getr   
ValueErrorr   r   getattracoustic_vae_dimtts_backbone_num_hidden_layerssuper__init__)selfr
   r   r   r    kwargs	__class__r   W/home/ubuntu/vibevoice-community/vibevoice/modular/configuration_vibevoice_streaming.pyr"   .   s4   	




z!VibeVoiceStreamingConfig.__init__)NNNr   )__name__
__module____qualname____doc__r   is_compositionr   r   r   r   base_model_tp_planr"   __classcell__r   r   r%   r'   r      s*    r   N)r+    transformers.configuration_utilsr   transformers.utilsr   -transformers.models.qwen2.configuration_qwen2r   configuration_vibevoicer   r   
get_loggerr(   loggerr   __all__r   r   r   r'   <module>   s    
R