o
    聱i%                     @   s   d Z ddlmZmZmZmZ ddlmZ ddlm	Z	 ddl
mZ e	eZG dd deZG dd	 d	eZG d
d deZG dd deZg dZdS )z0 VibeVoice_AcousticTokenizer model configuration    )DictListOptionalTuple)PretrainedConfig)logging)Qwen2Configc                .       s   e Zd ZdZdddddddd	d
ddddddddg dddddfdededededededededededededed ed!ed"ed#ed$ee	e  d%ed&ed'ee	e  d(ee f, fd)d*Z
  ZS )+ VibeVoiceAcousticTokenizerConfigvibevoice_acoustic_tokenizer           T@   g      ?gaussiandepthwise_convnoneconstantRMSNormh㈵>ư>{Gz?          r         r   3-3-3-3-3-3-8Nchannelscorpus_normalizecausalvae_dimfix_stdstd_dist_typemixer_layer	conv_normpad_modedisable_last_norm	layernormlayernorm_epslayernorm_elementwise_affine	conv_biaslayer_scale_init_valueweight_init_valueencoder_n_filtersencoder_ratiosencoder_depthsdecoder_n_filtersdecoder_ratiosdecoder_depthsc                    s   t  jdi | || _|| _|| _|| _|| _|| _|| _|	| _	|| _
|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|d urH|n|| _|| _|| _d S N )super__init__r   r   r   r    r!   r"   r$   r%   r(   r&   r'   r)   r*   r+   r,   r#   r-   r.   r/   r1   r0   r2   )selfr   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   kwargs	__class__r4   R/home/ubuntu/VibeVoice-finetuning/src/vibevoice/modular/configuration_vibevoice.pyr6      s.   
z)VibeVoiceAcousticTokenizerConfig.__init____name__
__module____qualname__
model_typeintfloatboolstrr   r   r6   __classcell__r4   r4   r9   r;   r	      s    	


r	   c                '       s   e Zd ZdZddddddddd	dd
ddddddg ddfdededededededededededededededed ed!ed"ee	e  d#ef& fd$d%Z
  ZS )& VibeVoiceSemanticTokenizerConfigvibevoice_semantic_tokenizerr   r   Tr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   c                    s   t  jdi | || _|| _|| _|| _|| _|| _|| _|	| _	|| _
|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _d S r3   )r5   r6   r   r   r   r    r!   r"   r$   r%   r(   r&   r'   r)   r*   r+   r,   r#   r-   r.   r/   )r7   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r8   r9   r4   r;   r6   O   s(   
z)VibeVoiceSemanticTokenizerConfig.__init__r<   r4   r4   r9   r;   rF   L   sx    	

rF   c                       s:   e Zd ZdZ										
			d fdd	Z  ZS )VibeVoiceDiffusionHeadConfigvibevoice_diffusion_head   r         @r   r   Nv_predictionddpm     cosinec                    s^   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _t jdi | d S r3   )hidden_sizehead_layershead_ffn_ratiorms_norm_epslatent_sizespeech_vae_dimprediction_typediffusion_typeddpm_num_stepsddpm_num_inference_stepsddpm_beta_scheduleddpm_batch_mulr5   r6   )r7   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r8   r9   r4   r;   r6      s   z%VibeVoiceDiffusionHeadConfig.__init__)rJ   r   rK   r   r   NrL   rM   rN   rO   rP   r   )r=   r>   r?   r@   r6   rE   r4   r4   r9   r;   rH      s    rH   c                       sP   e Zd ZdZdZeeeedZ	ddddddddZ
				d
 fdd		Z  ZS )VibeVoiceConfig	vibevoiceT)acoustic_tokenizer_configsemantic_tokenizer_configdecoder_configdiffusion_head_configcolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projNc                    s  d|d< |d u r| j d  | _nt|tr%d|d< | j d di || _nt|tr-|| _|d u r9| j d  | _nt|trNd|d< | j d di || _nt|trV|| _|d u rb| j d  | _n*t|tr|dd	d
krxt	di || _nt
d|dd	 t|t	fr|| _|d u r| j d  | _nt|trd|d< | j d di || _nt|tr|| _t| jdd| _t| jdd| _t jdi | d S )NF_attn_implementation_autosetr_   r
   r@   r`   rG   ra    qwen2z Unsupported decoder model type: rb   rI   r    r      r4   )sub_configsr_   
isinstancedictr	   r`   rF   ra   getr   
ValueErrorrb   rH   getattracoustic_vae_dimsemantic_vae_dimr5   r6   )r7   r_   r`   ra   rb   r8   r9   r4   r;   r6      sB   







zVibeVoiceConfig.__init__)NNNN)r=   r>   r?   r@   is_compositionr	   rF   r   rH   ri   base_model_tp_planr6   rE   r4   r4   r9   r;   r]      s*    	r]   )r	   rF   rH   r]   N)__doc__typingr   r   r   r    transformers.configuration_utilsr   transformers.utilsr   -transformers.models.qwen2.configuration_qwen2r   
get_loggerr=   loggerr	   rF   rH   r]   __all__r4   r4   r4   r;   <module>   s    
?6"O