
    )lh]                        d dl mZ d dlmZmZmZmZmZmZ d dl	m	Z	 d dl
Z
d dlmZ d dlmc mZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
l m!Z! d dl"m#Z# d dl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ d dl,m-Z- ddl.m/Z/  e%j0        e1          Z2 e3ed          rej4        	g de_4        e G d de                      Z5e G d de                      Z6 G d dej7                  Z8 G d de!          Z9 G d de9          Z: G d de9          Z; ej<        e/e:            ej<        e/e;           g d Z=dS )!    )	dataclass)DictListOptionalTupleUnionCallable)tqdmN)	AutoModelAutoModelForCausalLM)ACT2FN)CausalLMOutputBaseModelOutputWithPastModelOutput)LlamaRMSNorm)modeling_utils)PreTrainedModel)FlashAttentionKwargs)logging   ) VibeVoiceTokenizerStreamingCacheVibeVoiceAcousticTokenizerModelVibeVoiceSemanticTokenizerModel)VibeVoiceDiffusionHead)DPMSolverMultistepScheduler)VibeVoiceConfigALL_PARALLEL_STYLES)tpnonecolwiserowwisec                   .   e Zd ZU dZeej                 ed<   dZeej                 ed<   dZ	ee
         ed<   dZej        ed<   dZeeeej                                   ed<   dZeeej        df                  ed<   dZeeej        df                  ed	<   dS )
VibeVoiceCausalLMOutputWithPastNlossdiffusion_lossspeech_token_numlogitspast_key_values.hidden_states
attentions)__name__
__module____qualname__r$   r   torchFloatTensor__annotations__r%   r&   intr'   r(   r   r)   r*        L/workspace/chatterbox-finetuning/src/vibevoice/modular/modeling_vibevoice.pyr#   r#       s         (,D(5$
%,,,26NHU./666&*hsm*** $FE$$$AEOXeE%*;$<=>EEE=AM8E%"3S"89:AAA:>Ju0#567>>>>>r3   r#   c                   \    e Zd ZU dZdZej        ed<   dZe	e
ej                          ed<   dS )VibeVoiceGenerationOutputaH  
    Output type for VibeVoice generation.
    
    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. 
        speech_outputs (`List[torch.FloatTensor]`, *optional*):
            List of generated speech waveforms or latents for each speech segment.
    N	sequencesspeech_outputs)r+   r,   r-   __doc__r7   r.   
LongTensorr0   r8   r   r   r/   r2   r3   r4   r6   r6   +   sN           #'Iu&&&8<NHT%"345<<<<<r3   r6   c                   $     e Zd Z fdZd Z xZS )SpeechConnectorc                     t                                                       t          j        ||          | _        t          |d          | _        t          j        ||          | _        d S )Ngư>)eps)super__init__nnLinearfc1r   normfc2)self	input_dim
output_dim	__class__s      r4   r@   zSpeechConnector.__init__;   sW    9Y
33 666	9Z44r3   c                     |                      |          }|                     |          }|                     |          }|S N)rC   rD   rE   )rF   featureskwargsxs       r4   forwardzSpeechConnector.forwardA   s6    HHXIIaLLHHQKKr3   )r+   r,   r-   r@   rO   __classcell__rI   s   @r4   r<   r<   :   sG        5 5 5 5 5      r3   r<   c                   <    e Zd ZeZdZdZdZdZdZ	dZ
dZdZdZd ZdS )VibeVoicePreTrainedModelmodelTr(   c                    t          |t                    r|                                 d S t          | j        d          r,t          | j        j        d          r| j        j        j        }nCt          | j        d          r,t          | j        j        d          r| j        j        j        }nd}t          |t          j	                  rJ|j
        j                            d|           |j         |j        j                                         d S d S t          |t          j                  r?|j
        j                            d           |j        j                                         d S d S )Nlanguage_model_configinitializer_rangedecoder_configg{Gz?        )meanstd      ?)
isinstancer   initialize_weightshasattrconfigrV   rW   rX   rA   rB   weightdatanormal_biaszero_	LayerNormfill_)rF   moduler[   s      r4   _init_weightsz&VibeVoicePreTrainedModel._init_weightsU   s\   f455 	%%'''F 4; 788 	WT[Efh{=|=| 	+3ECCT["233 	@Z\o8p8p 	+,>CCCfbi(( 	%M&&CS&999{& &&((((( '&-- 	%M$$S)))K""$$$$$	% 	%r3   N)r+   r,   r-   r   config_classbase_model_prefixsupports_gradient_checkpointing_skip_keys_device_placement_supports_cache_class_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_static_cache_supports_attention_backendri   r2   r3   r4   rS   rS   I   s]        "L&*#"3 !N $!"&% % % % %r3   rS   c                   L    e Zd Z fdZd Zd ZddZ	 	 	 	 	 	 	 	 	 	 ddej        de	ej
                 de	ej                 d	e	eeej                                   d
e	ej                 de	e         de	e         de	e         de	e         de	ej                 deeef         fdZ xZS )VibeVoiceModelc                 v   t                                          |           t          |d          rD|j        =t	          |j        t
                    rt          t          |j                  }n|j        }nt          j        }|j	        }t          j        |          | _        t          j        |j                                      |          | _        t          j        |j                                      |          | _        t%          |j        |j                                      |          | _        t%          |j        |j                                      |          | _        |                     dt          j        t5          d                               |                     dt          j        t5          d                               t          j        |j                                      |          | _        t;          |j        j        |j        j        |j        j                   | _!        d S )Ntorch_dtypespeech_scaling_factornanspeech_bias_factor)num_train_timestepsbeta_scheduleprediction_type)"r?   r@   r_   rw   r]   strgetattrr.   float32rX   r   from_configlanguage_modelacoustic_tokenizer_configtoacoustic_tokenizersemantic_tokenizer_configsemantic_tokenizerr<   acoustic_vae_dimhidden_sizeacoustic_connectorsemantic_vae_dimsemantic_connectorregister_buffertensorfloatdiffusion_head_configprediction_headr   ddpm_num_stepsddpm_beta_scheduler}   noise_scheduler)rF   r`   dtype	lm_configrI   s       r4   r@   zVibeVoiceModel.__init__l   s      6=)) 	"f.@.L&,c22 +v'9::*ME )	'3I>> #,"78X"Y"Y"\"\]b"c"c"+"78X"Y"Y"\"\]b"c"c"1&2I9K`"a"a"d"dej"k"k"1&2I9K`"a"a"d"dej"k"k 	4el5<<6P6PQQQ15<e3M3MNNN  )4V5QRRUUV[\\  ; & < K 6I"8H 
  
  
r3   c                     t          | j        d          r| j        j        S | j        j                                        D ]'\  }}|j        dk    rt          | j        |          c S (J d            )Nembed_tokenszembed_tokens.weightFzshould not arrive here)r_   r   r   fullmapitems	orig_namer   )rF   nameattrs      r4   get_input_embeddingsz#VibeVoiceModel.get_input_embeddings   s    4&77 	4&33-5;;== 	: 	:JD$~!666t2D99999 7....ur3   c                     || j         _        d S rK   )r   r   rF   values     r4   set_input_embeddingsz#VibeVoiceModel.set_input_embeddings   s    +0(((r3   Nc                     || _         || _        | j         | j                                          | j        | j                                         dS dS )z@Set the speech tokenizers used for encoding and decoding speech.N)r   r   eval)rF   r   r   s      r4   set_speech_tokenizersz$VibeVoiceModel.set_speech_tokenizers   s^    "4"4 ".#((***".#((***** /.r3   	input_idsattention_maskposition_idsr(   inputs_embeds	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_positionreturnc                     |	|	n| j         j        }	 | j        d|||||||||	|
d
|}|	s|S t          |j        |j        |j        |j                  S )N
r   r   r   r(   r   r   r   r   r   r   )last_hidden_stater(   r)   r*   r2   )r`   use_return_dictr   r   r   r(   r)   r*   )rF   r   r   r   r(   r   r   r   r   r   r   rM   outputss                r4   rO   zVibeVoiceModel.forward   s     &1%<kk$+B] &$% 
)%+'/!5#)
 
 
 
  	N&%7#3!/)	
 
 
 	
r3   )NN)
NNNNNNNNNN)r+   r,   r-   r@   r   r   r   r.   r:   r   Tensorr   r/   boolr   r   rO   rP   rQ   s   @r4   ru   ru   k   s_       "
 "
 "
 "
 "
H/ / /1 1 1
+ 
+ 
+ 
+ '+1537EI59$(,0/3&*59(
 (
#(
 !.(
 u/0	(

 "%e.?(@"AB(
   12(
 D>(
 $D>(
 'tn(
 d^(
 !!12(
 
u--	.(
 (
 (
 (
 (
 (
 (
 (
r3   ru   c            ,       x    e Zd ZdgZddiZ fdZd Zd Zd Zd Z	d	 Z
d
 Zd Z	 	 	 	 d&dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d'dej        deej                 deej                 deeej                          deej                 deej                 dee         dee         dee         dee         deej                 deej                 deej                 deej                 deej                 d eej                 d!eej                 d"ed#eeeeej        ef         f                  d$eeef         f(d%Z xZS )(!VibeVoiceForConditionalGenerationzlm_head.weightlm_headcolwise_repc                    t                                          |           t          |          | _        |j        j        | _        t          j        |j        j        | j        d          | _	        | 
                                 d S )NF)rd   )r?   r@   ru   rT   rX   
vocab_sizerA   rB   r   r   	post_init)rF   r`   rI   s     r4   r@   z*VibeVoiceForConditionalGeneration.__init__   sn       #F++
 /:y!6!BDOZ_```r3   c                 4    | j                                         S rK   )rT   r   rF   s    r4   r   z6VibeVoiceForConditionalGeneration.get_input_embeddings   s    z..000r3   c                 :    | j                             |           d S rK   )rT   r   r   s     r4   r   z6VibeVoiceForConditionalGeneration.set_input_embeddings   s    
''.....r3   c                     | j         S rK   r   r   s    r4   get_output_embeddingsz7VibeVoiceForConditionalGeneration.get_output_embeddings   s
    |r3   c                     || j         _        d S rK   rT   r   )rF   decoders     r4   set_decoderz-VibeVoiceForConditionalGeneration.set_decoder   s    $+
!!!r3   c                     | j         j        S rK   r   r   s    r4   get_decoderz-VibeVoiceForConditionalGeneration.get_decoder   s    z((r3   c                    t          | j        j        dd          r|                                 }|                                 }t          |d          r|j        |_        n||_        t          |dd          [t          j        	                    |j
        j        d|j        j        d         |j
        j        d         z
  fdd          |j
        _        t          d           dS t          d	           dS )
zY
        Tie the weights between the input embeddings and the output embeddings.
        tie_word_embeddingsFra   rd   Nr   constantu?   ✅ Tied input and output embeddings using standard assignment.u8   ℹ️  tie_word_embeddings is False, not tying weights.)r   r`   rX   r   r   r_   ra   rA   
functionalpadrd   rb   shapeprint)rF   output_embeddingsinput_embeddingss      r4   tie_weightsz-VibeVoiceForConditionalGeneration.tie_weights   s    4;-/DeLL 	N !% : : < <#88::'22 <+;+B!(( ,<!((&$77C.0m.?.?%*/)06q9<M<R<XYZ<[[\	/ /!&+ STTTTTLMMMMMr3   c                     || _         d S rK   r   )rF   new_embeddingss     r4   set_output_embeddingsz7VibeVoiceForConditionalGeneration.set_output_embeddings  s     &r3   NaudioFc                    |o| j         j        j        }t          j        dd|                              |                                 j                  }| j        	                    |          }||fS t          j
                    5  |dk    rt          j
                    5  | j        j                            |                    d                    d         d         }d d d            n# 1 swxY w Y   |                    | j        j        j                  d         }	n|dk    r| j         j        j        }|                    |                    d          d|          }
|
                    d          }| j        j        j        dz  }t          j        ||
j        |
j                  |z  } |j        dgdg|
                                dz
  z  R  }|
|t          j        |
j                                      |
          z  z   }	nt3          d| d	          t          j        | j        j                  st          j        | j        j                  rd
|	|                                                                         z  }|	|                                                                          }tA          j!                    rtA          j"                    rtA          j#        |t@          j$        j%                   tA          j#        |t@          j$        j%                   tA          j&                    }| j        j        '                    ||z             | j        j        '                    ||z             tQ          d| j        j         d| j        j         d           ni| j        j        '                    |           | j        j        '                    |           tQ          d| j        j         d| j        j         d           |	| j        j        z   | j        j        z  }d d d            n# 1 swxY w Y   | j        	                    |          }|r||fS ||         ||         fS )Nr   r   r   vaeg?r   devicezSpeech type  not implementedr\   )opz%Speech scaling factor (distributed): z, bias factor: T)flushz(Speech scaling factor (single process): ))r`   r   vae_dimr.   zerosr   r   ra   rT   r   no_gradr   encode	unsqueezesamplestd_dist_typereshapesizefix_stdrandnr   r   viewdimr   NotImplementedErrorisnanrx   rz   flattenr[   rZ   distis_availableis_initialized
all_reduceReduceOpSUMget_world_sizecopy_r   )rF   speech_tensorsspeech_masksspeech_typereturn_unmaskr   audio_featuresconnect_featuresframesaudio_tokensspeech_mode
batch_sizer   r[   scaling_factorbias_factor
world_sizes                    r4   forward_speech_featuresz9VibeVoiceForConditionalGeneration.forward_speech_features  s    !k;CG"[Aw77::4;T;T;V;V;]^^N#z<<^LL!#333 &s &s')) i i!%!>!E!EnF^F^_`FaFa!b!bcd!efg!hi i i i i i i i i i i i i i i#)==1N1\#]#]^_#`LL E))"kCKG"0"8"89L9LQ9O9OQSU\"]"]K "-!1!1!!4!4J J9ACGE+j8IR]RdeeehmmC"#(2F{/@/@1/D(EFFFC#.u{;CT7U7U7X7XYd7e7e1e#eLL-.Z[.Z.Z.Z[[[;tz?@@ gEKPTPZPmDnDn g%',|*D*L*L*N*N*R*R*T*T%TN#/#=#E#E#G#G#L#L#N#N"NK (** gt/B/D/D g4=;LMMMM8IJJJJ%)%8%:%:

8>>~PZ?Z[[[
5;;K*<TUUU  WdjFf  W  Ww{  xB  xU  W  W  _c  d  d  d  d  d 
8>>~NNN
5;;KHHH  ZIi  Z  Zz~  {E  {X  Z  Z  bf  g  g  g  g".1N"NRVR\Rr!rM&s &s &s &s &s &s &s &s &s &s &s &s &s &s &sP  $z<<^LL 8%'777!,/1A,1OOOs7   P8?C*P8*C.	.P81C.	2L:P88P<?P<r   r   r   r   r(   r   labelsr   r   r   r   r   r   r   speeches_loss_inputspeech_semantic_tensorsacoustic_input_maskacoustic_loss_maskddpm_batch_mulrM   r   c                 |   |
|
n| j         j        }
 |                                 |          }| j                            |          }|%|                     ||                    |          nd ||                    dd          d          \  }}||||         ||         z   ||<   n||         ||<   |||z           }|||z           }	 ||j        d         t          |
                                                                          k    sMJ d|j        d          dt          |
                                                                           d            n^# t          $ r Y nRw xY wnM|                     ||                    |          nd ||                    dd          	          \  }}||||<   |	 |B|                                                    d
          d
z
  }|                    d           n_|j        d d         \  }}t!          j        |t           j        |j                                      d                              |d          }n# t          $ r Y nw xY w|                     d ||||||d|
|
  
        }|j        }|                     |          }d }|	 d } ||
                                                                dk    rYt!          j        |t           j                  }!|d d d
d f         |!d d d df<   d|!d d df<   ||!         }"|j        \  }#}$	 |"j        d         |#k    sJ d|"j        d          d|#             n# t          $ r Y nw xY wt!          j        |#|z  |$f|j        |j                  }%t!          j        t!          j        | j         j        j                  |#|z  d                              |j                  }&|                     |d          }'|"                     |d          }(| j        j!        "                    |'|%|&          })| j        #                    |)|&                    |          |(          }*| j         j        j$        }+|+dk    r|%},n;|+dk    r"| j        j!        %                    |'|%|&          },ntM          d|+ d          tO          j(        |*)                                |,)                                d          } |$dk    r |dk    r| |$z  |z  tU          |#d
          z  } nt!          j+        d| j                  } nt          d | j        j#        ,                                D                       dz  } | t          d  | j        j-        ,                                D                       dz  z  } | t          d! | j        j        ,                                D                       dz  z  } |
s(||#f|.                                d
d          z   }-|| f|-z   S t_          || ||#nd||j0        |j1        |j2        "          S )#Nr   r   T)r   r   r   r   r   z-Mismatch between selected speech connectors (z) and acoustic_input_mask sum ())r   r   r   r   )r   )min   r   r   Fr   )r   zMismatch: condition_features=z vs speech_features=)r   r   )replacementepsilonv_predictionzPrediction type r   sum)	reductionrY   )r   c              3   >   K   | ]}|                                 V  d S rK   r  .0ps     r4   	<genexpr>z<VibeVoiceForConditionalGeneration.forward.<locals>.<genexpr>  s*       Z ZQ Z Z Z Z Z Zr3   c              3   >   K   | ]}|                                 V  d S rK   r  r  s     r4   r  z<VibeVoiceForConditionalGeneration.forward.<locals>.<genexpr>  *      !^!^a!%%''!^!^!^!^!^!^r3   c              3   >   K   | ]}|                                 V  d S rK   r  r  s     r4   r  z<VibeVoiceForConditionalGeneration.forward.<locals>.<genexpr>  r  r3   )r$   r%   r&   r'   r(   r)   r*   )3r`   r   r   rT   r   r  type_asgetr   r1   r  item	Exceptionlongcumsumclamp_r.   aranger   r   expandr   r   
zeros_liker   r   r   multinomialonesr   r   r   repeat_interleaver   	add_noiser   r}   get_velocityr   Fmse_lossr   maxr   
parametersr   to_tupler#   r(   r)   r*   ).rF   r   r   r   r(   r   r  r   r   r   r   r   r   r   r  r  r	  r
  r  rM   rN   $semantic_speech_all_connect_featuresspeech_all_featuresspeech_all_connect_featuresspeech_featuresspeech_connect_featuresBTr   r)   r'   r$   r%   	cond_maskcondition_features
speech_lenlatent_sizenoise	timestepsspeech_features_repeatedcondition_features_repeatednoisy_speech_featuresmodel_outputr}   target_for_lossoutputs.                                                 r4   rO   z)VibeVoiceForConditionalGeneration.forwardL  s   0 &1%<kk$+B]'D%%''	22/3z/L/LMd/e/e,*?C?[?[@N@Z>#9#9!#<#<#<`d!- &

=' B B"&	 @\ @ @<!< )7C-H-VY}  K  ZL  .LA)**-H-VA)*"56IL6X"Y*EFY\hFh*i'*66<Q?3GZG^G^G`G`GeGeGgGgChChhhh vLcLijkLl  v  v  NQ  Re  Ri  Ri  Rk  Rk  Rp  Rp  Rr  Rr  Ns  Ns  v  v  v  ihh !   D *  8<7S7S@N@Z>#9#9!#<#<#<`d!- &

=' B B 8T 8 84O4
 ))@%& !-#1#6#6#8#8#?#?A#?#F#F#JL ''A'....72A2;DAq#(<AH#U#U#U#_#_`a#b#b#i#ijkmo#p#pL    **)%+/!&#)  
 
  1m,,   %*<*@*@*B*B*G*G*I*IA*M*M();5:NNNI 2111abb5 9Iaaa"f#IaaadO!.y!9&5&;#J)/2j@@@q4F4LQ4Oqqeoqq A@@@     Kn,k:$+#)  E )
4;<KLL^+    b%&&	  (7'H'H]^'H'_'_$*<*N*N~cd*N*e*e'$(J$>$H$H(%% %!  :55%!!!$$+ L #k?OO)++"' N22"&*"<"I"I,eY# # **^_*^*^*^___Z(:(:(<(<o>S>S>U>UafgggNQ>A#5#5!/+!=!NQTU_abQcQc!c!&c.:O!P!P!P
 ! Z Z$*2L2W2W2Y2Y Z Z ZZZ]``Nc!^!^4:3P3[3[3]3]!^!^!^^^adddNc!^!^4:3P3[3[3]3]!^!^!^^^adddN  	3j)G,<,<,>,>qrr,BBF.)F22.)+9+EZZ1#3!/)
 
 
 	
s8   BE 
E,+E, B#I$ $
I10I17,M$ $
M10M1)NNr   F)NNNNNNFNNNNNNNNNNr   )r+   r,   r-   _tied_weights_keys_tp_planr@   r   r   r   r   r   r   r   r  r.   r:   r   r   r   r/   r   
BoolTensorr1   r   r~   r   r   r#   rO   rP   rQ   s   @r4   r   r      s       *+=)H    1 1 1/ / /  , , ,) ) )N N N:& & &  9P 9P 9P 9Pz '+1537=A59-1$),0/3&*596:37;??C:>9=)q
 q
#q
 !.q
 u/0	q

 "$u'8"9:q
   12q
 )*q
 D>q
 $D>q
 'tnq
 d^q
 !!12q
 !!23q
 u/0q
  &e&78!q
" "*%*;!<#q
$ &e&67%q
& %U%56'q
( )q
* 4U5<+<%= =>?+q
, 599:-q
 q
 q
 q
 q
 q
 q
 q
r3   r   )ru   rS   r   r#   r6   )>dataclassesr   typingr   r   r   r   r   r	   r
   r.   torch.nnrA   torch.nn.functionalr   r-  torch.distributeddistributedr   transformers.models.autor   r   transformers.activationsr   transformers.modeling_outputsr   r   r   (transformers.models.llama.modeling_llamar   transformersr   transformers.modeling_utilsr   +transformers.modeling_flash_attention_utilsr   transformers.utilsr   modular_vibevoice_tokenizerr   r   r    modular_vibevoice_diffusion_headr   vibevoice.schedule.dpm_solverr   configuration_vibevoicer   
get_loggerr+   loggerr_   r   r#   r6   Moduler<   rS   ru   r   register__all__r2   r3   r4   <module>r_     sa   ! ! ! ! ! ! ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?                                   D D D D D D D D + + + + + + ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ A A A A A A ' ' ' ' ' ' 7 7 7 7 7 7 L L L L L L & & & & & & L  L  L  L  L  L  L  L  L  L D D D D D D E E E E E E 4 4 4 4 4 4 
	H	%	%w~455 N9[9c)M)M)MN&
? ? ? ? ?k ? ? ? = = = = = = = =    bi   % % % % % % % %Df
 f
 f
 f
 f
- f
 f
 f
Ri
 i
 i
 i
 i
(@ i
 i
 i
V	 	 ?N 3 3 3   o/P Q Q Q  r3   