
    h%                         d Z ddlmZmZmZmZ ddlmZ ddlm	Z	 ddl
mZ  e	j        e          Z G d de          Z G d d	e          Z G d
 de          Z G d de          Zg dZdS )z0 VibeVoice_AcousticTokenizer model configuration    )DictListOptionalTuple)PretrainedConfig)logging)Qwen2Configc            .            e Zd ZdZdddddddd	d
ddddddddg dddddfdededededededededededededed ed!ed"ed#ed$ee	e                  d%ed&ed'ee	e                  d(ee         f, fd)Z
 xZS )* VibeVoiceAcousticTokenizerConfigvibevoice_acoustic_tokenizer           T@   g      ?gaussiandepthwise_convnoneconstantRMSNormh㈵>ư>{Gz?          r         r   3-3-3-3-3-3-8Nchannelscorpus_normalizecausalvae_dimfix_stdstd_dist_typemixer_layer	conv_normpad_modedisable_last_norm	layernormlayernorm_epslayernorm_elementwise_affine	conv_biaslayer_scale_init_valueweight_init_valueencoder_n_filtersencoder_ratiosencoder_depthsdecoder_n_filtersdecoder_ratiosdecoder_depthsc                 v    t                      j        di | || _        || _        || _        || _        || _        || _        || _        |	| _	        || _
        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        ||n|| _        || _        || _        d S N )super__init__r   r    r!   r"   r#   r$   r&   r'   r*   r(   r)   r+   r,   r-   r.   r%   r/   r0   r1   r3   r2   r4   )selfr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   kwargs	__class__s                           Q/workspace/chatterbox-finetuning/src/vibevoice/modular/configuration_vibevoice.pyr9   z)VibeVoiceAcousticTokenizerConfig.__init__   s    : 	""6"""  0* # *!2",H)"&<#!2& "3,, 1?0JnnP^!2,    __name__
__module____qualname__
model_typeintfloatboolstrr   r   r9   __classcell__r<   s   @r=   r   r      s       /J "%'+""&"#-1(,#'!#.;mm-!#.2(,59- 9-9-  9- 	9-
 9- 9- 9- 9- 9- 9-  9- 9- 9- '+9-  !9-" !&#9-$ !%9-( )9-* !c++9-, -9-0 19-2 !c+39-4 !59- 9- 9- 9- 9- 9- 9- 9- 9- 9-r>   r   c            '            e Zd ZdZddddddddd	dd
ddddddg ddfdededededededededededededededed ed!ed"ee	e                  d#ef& fd$Z
 xZS )% VibeVoiceSemanticTokenizerConfigvibevoice_semantic_tokenizerr   r   Tr   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   c                 D    t                      j        di | || _        || _        || _        || _        || _        || _        || _        |	| _	        || _
        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        d S r6   )r8   r9   r   r    r!   r"   r#   r$   r&   r'   r*   r(   r)   r+   r,   r-   r.   r%   r/   r0   r1   )r:   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r;   r<   s                        r=   r9   z)VibeVoiceSemanticTokenizerConfig.__init__O   s    2 	""6"""  0* # *!2",H)"&<#!2& "3,,r>   r?   rI   s   @r=   rK   rK   L   sK       /J "%#+""&"#-1(,#'!#.;mm--0- 0-0-  0- 	0-
 0- 0- 0- 0- 0- 0-  0- 0- 0- '+0-  !0-" !&#0-$ !%0-( )0-* !c++0-, -0- 0- 0- 0- 0- 0- 0- 0- 0- 0-r>   rK   c                   <     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )VibeVoiceDiffusionHeadConfigvibevoice_diffusion_head   r         @r   r   Nv_predictionddpm     cosinec                     || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _         t                      j        di | d S r6   )hidden_sizehead_layershead_ffn_ratiorms_norm_epslatent_sizespeech_vae_dimprediction_typediffusion_typeddpm_num_stepsddpm_num_inference_stepsddpm_beta_scheduleddpm_batch_mulr8   r9   )r:   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   r;   r<   s                 r=   r9   z%VibeVoiceDiffusionHeadConfig.__init__   s      '&,(&,.,,(@%"4,""6"""""r>   )rQ   r   rR   r   r   NrS   rT   rU   rV   rW   r   )r@   rA   rB   rC   r9   rH   rI   s   @r=   rO   rO      sh        +J &!### # # # # # # # # #r>   rO   c                   R     e Zd ZdZdZeeeedZ	ddddddddZ
	 	 	 	 d	 fd	Z xZS )
VibeVoiceConfig	vibevoiceT)acoustic_tokenizer_configsemantic_tokenizer_configdecoder_configdiffusion_head_configcolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projNc                 z   d|d<   | | j         d                     | _        nOt          |t                    rd|d<    | j         d         di || _        nt          |t                    r|| _        | | j         d                     | _        nOt          |t                    rd|d<    | j         d         di || _        nt          |t                    r|| _        | | j         d                     | _        nt          |t                    rR|                    dd	          d
k    rt          di || _        nCt          d|                    dd	                     t          |t          f          r|| _        | | j         d                     | _        nOt          |t                    rd|d<    | j         d         di || _        nt          |t                    r|| _        t          | j        dd          | _        t          | j        dd          | _         t!                      j        di | d S )NF_attn_implementation_autosetrh   r   rC   ri   rL   rj    qwen2z Unsupported decoder model type: rk   rP   r"   r      r7   )sub_configsrh   
isinstancedictr   ri   rK   rj   getr	   
ValueErrorrk   rO   getattracoustic_vae_dimsemantic_vae_dimr8   r9   )r:   rh   ri   rj   rk   r;   r<   s         r=   r9   zVibeVoiceConfig.__init__   s    27-.$,-ZT-=>Y-Z-\-\D**1488 	G6T%l3-ZT-=>Y-Z-w-w]v-w-wD**13STT 	G-FD*$,-ZT-=>Y-Z-\-\D**1488 	G6T%l3-ZT-=>Y-Z-w-w]v-w-wD**13STT 	G-FD*!"D$"23C"D"F"FD-- 		1 !!,33w>>&1&C&CN&C&C## !jNDVDVWcegDhDh!j!jkkk77 	1"0D ()R)9:Q)R)T)TD&&-t44 	?2L!,/)R)9:Q)R)k)kUj)k)kD&&-/KLL 	?)>D& !((F	SU V V '(F	SV W W""6"""""r>   )NNNN)r@   rA   rB   rC   is_compositionr   rK   r	   rO   rs   base_model_tp_planr9   rH   rI   s   @r=   rf   rf      s        JN%E%E%!=	 K &/%.%.%."+ )"+  #'"&"8# 8# 8# 8# 8# 8# 8# 8# 8# 8#r>   rf   )r   rK   rO   rf   N)__doc__typingr   r   r   r    transformers.configuration_utilsr   transformers.utilsr   -transformers.models.qwen2.configuration_qwen2r	   
get_loggerr@   loggerr   rK   rO   rf   __all__r7   r>   r=   <module>r      sD   6 6 . . . . . . . . . . . . = = = = = = & & & & & & E E E E E E		H	%	%<- <- <- <- <-'7 <- <- <-~3- 3- 3- 3- 3-'7 3- 3- 3-l #  #  #  #  ##3  #  #  #DM# M# M# M# M#& M# M# M#^  r>   