o
    ꁱi7                     @   s   d Z ddlmZmZmZmZ ddlmZ ddlm	Z	 ddl
mZ e	eZG dd deZG dd	 d	eZG d
d deZG dd deZG dd deZg dZdS )z0 VibeVoice_AcousticTokenizer model configuration    )DictListOptionalTuple)PretrainedConfig)logging)Qwen2Configc                .       s   e Zd ZdZdddddddd	d
ddddddddg dddddfdededededededededededededed ed!ed"ed#ed$ee	e  d%ed&ed'ee	e  d(ee f, fd)d*Z
  ZS )+ VibeVoiceAcousticTokenizerConfigvibevoice_acoustic_tokenizer           T@   g      ?gaussiandepthwise_convnoneconstantRMSNormh㈵>ư>{Gz?          r         r   3-3-3-3-3-3-8Nchannelscorpus_normalizecausalvae_dimfix_stdstd_dist_typemixer_layer	conv_normpad_modedisable_last_norm	layernormlayernorm_epslayernorm_elementwise_affine	conv_biaslayer_scale_init_valueweight_init_valueencoder_n_filtersencoder_ratiosencoder_depthsdecoder_n_filtersdecoder_ratiosdecoder_depthsc                    s   t  jdi | || _|| _|| _|| _|| _|| _|| _|	| _	|| _
|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|d urH|n|| _|| _|| _d S N )super__init__r   r   r   r    r!   r"   r$   r%   r(   r&   r'   r)   r*   r+   r,   r#   r-   r.   r/   r1   r0   r2   )selfr   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   kwargs	__class__r4   M/home/ubuntu/vibevoice-community/vibevoice/modular/configuration_vibevoice.pyr6      s.   
z)VibeVoiceAcousticTokenizerConfig.__init____name__
__module____qualname__
model_typeintfloatboolstrr   r   r6   __classcell__r4   r4   r9   r;   r	      s    	


r	   c                '       s   e Zd ZdZddddddddd	dd
ddddddg ddfdededededededededededededededed ed!ed"ee	e  d#ef& fd$d%Z
  ZS )& VibeVoiceSemanticTokenizerConfigvibevoice_semantic_tokenizerr   r   Tr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   c                    s   t  jdi | || _|| _|| _|| _|| _|| _|| _|	| _	|| _
|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _d S r3   )r5   r6   r   r   r   r    r!   r"   r$   r%   r(   r&   r'   r)   r*   r+   r,   r#   r-   r.   r/   )r7   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r8   r9   r4   r;   r6   O   s(   
z)VibeVoiceSemanticTokenizerConfig.__init__r<   r4   r4   r9   r;   rF   L   sx    	

rF   c                       s:   e Zd ZdZ										
			d fdd	Z  ZS )VibeVoiceDiffusionHeadConfigvibevoice_diffusion_head   r         @r   r   Nv_predictionddpm     cosinec                    s^   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _t jdi | d S r3   )hidden_sizehead_layershead_ffn_ratiorms_norm_epslatent_sizespeech_vae_dimprediction_typediffusion_typeddpm_num_stepsddpm_num_inference_stepsddpm_beta_scheduleddpm_batch_mulr5   r6   )r7   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r8   r9   r4   r;   r6      s   z%VibeVoiceDiffusionHeadConfig.__init__)rJ   r   rK   r   r   NrL   rM   rN   rO   rP   r   )r=   r>   r?   r@   r6   rE   r4   r4   r9   r;   rH      s    rH   c                       sP   e Zd ZdZdZeeeedZ	ddddddddZ
				d
 fdd		Z  ZS )VibeVoiceConfig	vibevoiceT)acoustic_tokenizer_configsemantic_tokenizer_configdecoder_configdiffusion_head_configcolwiserowwisezlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projNc                    s  d|d< |d u r| j d  | _nt|tr%d|d< | j d di || _nt|tr-|| _|d u r9| j d  | _nt|trNd|d< | j d di || _nt|trV|| _|d u rb| j d  | _n*t|tr|dd	d
krxt	di || _nt
d|dd	 t|t	fr|| _|d u r| j d  | _nt|trd|d< | j d di || _nt|tr|| _t| jdd| _t| jdd| _t jdi | d S )NF_attn_implementation_autosetr_   r
   r@   r`   rG   ra    qwen2 Unsupported decoder model type: rb   rI   r    r      r4   )sub_configsr_   
isinstancedictr	   r`   rF   ra   getr   
ValueErrorrb   rH   getattracoustic_vae_dimsemantic_vae_dimr5   r6   )r7   r_   r`   ra   rb   r8   r9   r4   r;   r6      sB   







zVibeVoiceConfig.__init__)NNNN)r=   r>   r?   r@   is_compositionr	   rF   r   rH   rk   base_model_tp_planr6   rE   r4   r4   r9   r;   r]      s*    	r]   c                       s   e Zd ZdZdZeeedZddddddddZ				d fdd		Z
ddefddZedd Zedd Zedd Zedd Zedd Zedd Z  ZS )VibeVoiceASRConfigr^   T)r_   r`   ra   rc   rd   re   Nc                    sL  d|d< |d u r| j d  | _nt|tr%d|d< | j d di || _nt|tr-|| _|d u r9| j d  | _nt|trNd|d< | j d di || _nt|trV|| _|d u rb| j d  | _n)t|tr|dd	d
krxt	di || _nt
d|dd	 t|t	r|| _t| jdd| _t| jdd| _t jdi | d S )NFrf   r_   r
   r@   r`   rG   ra   rg   rh   ri   r    r   rj   r4   )rk   r_   rl   rm   r	   r`   rF   ra   rn   r   ro   rp   rq   rr   r5   r6   )r7   r_   r`   ra   r8   r9   r4   r;   r6     s4   	





zVibeVoiceASRConfig.__init__Fdecoderc                 C   s   | j S )z0Return the text (decoder) config for generation.)ra   )r7   rv   r4   r4   r;   get_text_config7  s   z"VibeVoiceASRConfig.get_text_configc                 C      | j jS )zCReturn vocab_size from decoder config for generation compatibility.)ra   
vocab_sizer7   r4   r4   r;   ry   ;     zVibeVoiceASRConfig.vocab_sizec                 C   rx   )zLReturn num_attention_heads from decoder config for Ulysses SP compatibility.)ra   num_attention_headsrz   r4   r4   r;   r|   @  r{   z&VibeVoiceASRConfig.num_attention_headsc                 C   rx   )zLReturn num_key_value_heads from decoder config for Ulysses SP compatibility.)ra   num_key_value_headsrz   r4   r4   r;   r}   E  r{   z&VibeVoiceASRConfig.num_key_value_headsc                 C   rx   )z?Return hidden_size from decoder config for model compatibility.)ra   rQ   rz   r4   r4   r;   rQ   J  r{   zVibeVoiceASRConfig.hidden_sizec                 C   rx   )zJReturn num_hidden_layers from decoder config for Ulysses SP compatibility.)ra   num_hidden_layersrz   r4   r4   r;   r~   O  r{   z$VibeVoiceASRConfig.num_hidden_layersc                 C   s   t | jd| j| j S )zAReturn head_dim from decoder config for Ulysses SP compatibility.head_dim)rp   ra   rQ   r|   rz   r4   r4   r;   r   T  s   zVibeVoiceASRConfig.head_dim)NNN)F)r=   r>   r?   r@   rs   r	   rF   r   rk   rt   r6   rC   rw   propertyry   r|   r}   rQ   r~   r   rE   r4   r4   r9   r;   ru      s@    0




ru   )r	   rF   rH   r]   ru   N)__doc__typingr   r   r   r    transformers.configuration_utilsr   transformers.utilsr   -transformers.models.qwen2.configuration_qwen2r   
get_loggerr=   loggerr	   rF   rH   r]   ru   __all__r4   r4   r4   r;   <module>   s    
?6"Of