o
    ~µi>C  ã                   @   s¤   d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZ d	d
„ Zddd„Zdd„ ZedkrPeƒ  dS dS )a,  
RTF optimization v3: keep CFG for voice quality, but optimize compute.

Approach 1: torch.compile(mode=default) on LM + diffusion WITH full CFG
Approach 2: Cached negative condition - compute once, reuse for all tokens
            (saves the negative LM forward on every token)
Approach 3: Combined
é    N)Údefaultdict)Ú*VibeVoiceForConditionalGenerationInferenceÚVibeVoiceGenerationOutputÚ!VibeVoiceTokenConstraintProcessor)Ú VibeVoiceTokenizerStreamingCache)ÚAudioStreamer)ÚVibeVoiceProcessor)ÚLogitsProcessorListc                 C   s:   t  ¡ 							ddd„ƒ}| j| _t || ¡| _dS )zÇ
    Patch generate to cache the negative condition after first computation.
    Instead of running negative LM forward every diffusion token,
    run it once and reuse the hidden state for CFG.
    NTçÍÌÌÌÌÌô?c           J   	   [   s>  |  dd ¡}|  dd ¡ |  dd ¡ |  dd¡}| dd ¡d u r.| jjj|d jd  |d< | j|||fd	d
i|¤Ž\}}}}}tj|d jd df|j	tj
|d jdtj|d jd dftj
|d jd| dd¡dœ}| jd d |fd	di|¤Ž\}}}| j|fi |¤Ž}| d#i |¤dd
dddœ¤Ž}|jd d …dd d …f  ¡ }tƒ }tƒ } |jd }!|j}"tj|!tj|"d}#d }$| dd¡}%dd„ t|!ƒD ƒ}&|jd }'|d jdd}(|j	|j|j|jg})t|dƒrò|jd urò|) |j¡ t|)|"d}*|d u rÿtƒ }| |*¡ t|j|' t||' ƒƒ}+t |j|( ||(  
¡ ¡},tj|!tj|"d}-t|+ƒD ]Ä}.|#  ¡ r5 n»|jd |jkrA n¯| j|fi |¤Ž}/|rwi }0|d ur\|j!|"d|0d< |d urh| !|"¡|0d< |d urt| !|"¡|0d< d}n
|/  dd ¡}1d|$i}0| d#i |/¤|0¤dd
dddœ¤Ž}2| j"|2|dd}|2j#d d …dd d …f j!d
tj$|"d}3|||3ƒ}4tj%|4dd}5|j|5|#< tj&||5d d …d f gdd}|5|jk '¡ rü|5|jkj(dd )d¡}6|6|#|6   }7|7 *¡ dkrüd
|#|7< |d urü| +|7¡ |5|jkj(dd )d¡}8|8 *¡ dkr| ,|8¡ |  ,|8¡ |.|,k}9tj(|9|# @ dd )d¡}:|: *¡ dkrCd
|#|:< d
|-|:< |d urC| +|:¡ | j- .¡ |5ƒ /d¡};tj0|!|"d|# |5|jk@  }<|< *¡ dkrì|2j|<dd d …f }=||< }>| j1|=|>|d  /d¡}?|?| j-j2 !|?j¡ | j-j3 !|?j¡ }@| j-j4j5|@ !| j-j4j¡||< !| j-j4j¡d
dd!}At6|<ƒD ]\}B}C|C 7¡ }D|#|D sÀ|&|D  |A|B ¡ qª|d urÍ| 8|A|<¡ | j-j9j:|A| |<d
dd!j;}E| j- <|?¡}F| j- =|E¡}G|F|G |;|<< |;}$q*|d urù| +¡  g }H|&D ]}I|H |Irtj&|Iddnd ¡ qýt>||r|H|-d"S d |-d"S )$NÚ	tokenizerÚparsed_scriptsÚall_speakers_listÚmax_length_timesé   Úmax_new_tokensÚ	input_idséÿÿÿÿÚreturn_processorsTr   é   )ÚdtypeÚdeviceéd   )r   Úattention_maskr   F)Úlogits_to_keepÚreturn_dictÚoutput_attentionsÚoutput_hidden_statesÚverbosec                 S   s   g | ]}g ‘qS © r   )Ú.0Ú_r   r   ú)/home/ubuntu/vibevoice/rtf_optimize_v3.pyÚ
<listcomp>R   s    zKpatch_cached_negative.<locals>.fast_generate_cached_neg.<locals>.<listcomp>r   )ÚdimÚbos_token_id)r   Úspeech_tensorsÚspeech_masksÚspeech_input_maskÚinputs_embeds)Úis_encoder_decoder)Úcopyr   r   )Úas_tuple)Ú	cfg_scale)ÚcacheÚsample_indicesÚ	use_cacheÚdebug)Ú	sequencesÚspeech_outputsÚreach_max_step_sampler   )?ÚpopÚgetÚconfigÚdecoder_configÚmax_position_embeddingsÚshapeÚ#_build_generate_config_model_kwargsÚtorchÚfullÚspeech_start_idÚlongr   ÚonesÚprepare_inputs_for_generationÚlast_hidden_stateÚcloner   ÚzerosÚboolÚrangeÚsumÚspeech_end_idÚspeech_diffusion_idÚeos_token_idÚhasattrr$   Úappendr   r	   ÚminÚ
max_lengthÚintÚallÚtoÚ#_update_model_kwargs_for_generationÚlogitsÚfloat32ÚargmaxÚcatÚanyÚnonzeroÚsqueezeÚnumelÚendÚset_to_zeroÚmodelÚget_input_embeddingsÚ	unsqueezeÚarangeÚsample_speech_tokensÚspeech_scaling_factorÚspeech_bias_factorÚacoustic_tokenizerÚdecodeÚ	enumerateÚitemÚputÚsemantic_tokenizerÚencodeÚmeanÚacoustic_connectorÚsemantic_connectorr   )JÚselfÚinputsÚgeneration_configÚlogits_processorÚstopping_criteriaÚprefix_allowed_tokens_fnÚsynced_gpusÚassistant_modelÚaudio_streamerÚnegative_prompt_idsÚnegative_prompt_attention_maskr%   r&   r'   Ú
is_prefillÚreturn_speechr,   Ústop_check_fnÚ
tqdm_classÚkwargsr   r   Úmodel_kwargsr   Únegative_kwargsÚnegative_generation_configÚnegative_model_kwargsÚnegative_input_idsÚneg_model_inputsÚnegative_outputsÚcached_neg_conditionÚacoustic_cacheÚsemantic_cacheÚ
batch_sizer   Úfinished_tagsr(   r   Úaudio_chunksÚinitial_lengthÚinitial_length_per_sampleÚvalid_tokensÚtoken_constraintÚ	max_stepsÚmax_step_per_sampler3   ÚstepÚmodel_inputsÚprefill_inputsr    ÚoutputsÚnext_token_logitsÚnext_token_scoresÚnext_tokensÚeos_idxÚnew_eosÚdiffusion_end_indicesÚmax_length_reachedÚnew_maxÚnext_inputs_embedsÚdiffusion_indicesÚpositive_conditionÚnegative_conditionÚspeech_latentÚscaled_latentÚaudio_chunkÚiÚ
sample_idxÚidxÚsemantic_featuresÚacoustic_embedÚsemantic_embedÚfinal_audioÚchunksr   r   r!   Úfast_generate_cached_neg"   s6  
ÿÿÿ($
ýÿÿÿÿ
þ

þ

þ



ÿÿ
þÿ&







ÿÿþ$ý
€
þý
"
þþz7patch_cached_negative.<locals>.fast_generate_cached_neg)NNNNNNNNNNNNNTTr
   NN)r;   Úno_gradÚgenerateÚ_orig_generateÚtypesÚ
MethodType)r\   r«   r   r   r!   Úpatch_cached_negative   s   ù 5r±   é   r
   c                    s~  |  d¡s
d|› }||g|ggdddd}| ¡ D ]\}}t |¡r*| d¡||< qtdd d‰d g‰ d	g‰d g}	‡ ‡‡fd
d„}
tj|
dd}| ¡  t 	d¡ tj
 d¡ | j|d tj
 ¡  t ¡ |	d	< | jdi |¤d ||jddiddˆddœ¤Ž}tj
 ¡  t ¡ }|jdd ||	d	  }ˆ d	 r¢ˆ d	 |	d	  d nd}ˆd	 d }|d	kr²|| ntdƒ}||||dœ|fS )NÚSpeakerzSpeaker 1: TÚpt)ÚtextÚvoice_samplesÚpaddingÚreturn_tensorsÚreturn_attention_maskÚcudar   )r‡   Ústop_signalr   c                     sF   ˆ  d¡D ]} t ¡ }ˆ d d u r|ˆ d< ˆd  | jd 7  < qd S )Nr   r   )Ú
get_streamÚtimeÚperf_counterr9   )ÚchunkÚt©Úfirst_chunkÚstreamerÚ
total_sampr   r!   Úconsumerë   s   üzmeasure.<locals>.consumer)ÚtargetÚdaemoné*   )Ú	num_stepsÚ	do_sampleF)r   r,   r   ro   r   rx   ru   Úshow_progress_baré   )Útimeoutiè  r   g     p×@Úinf)Úttfb_msÚgen_sÚaudio_sÚrtfr   )Ú
startswithÚitemsr;   Ú	is_tensorrP   r   Ú	threadingÚThreadÚstartÚmanual_seedrº   Úmanual_seed_allÚset_ddpm_inference_stepsÚsynchronizer½   r¾   r­   r   ÚjoinÚfloat)r\   Ú	processorÚ
voice_pathrµ   Ú
ddpm_stepsr,   rn   ÚkÚvrØ   rÅ   Úthr“   rZ   ÚgenÚttfbÚdurrÒ   r   rÁ   r!   ÚmeasureÛ   sH   


þ
€
ÿ

ý
 rè   c                  C   sÖ  dd l } d}d}t |¡}ztj|tjddd}W n   tj|tjddd}Y | ¡  d}g }td	ƒ t|||d
dd}tdƒ t||||ddd\}}| 	d|f¡ td|d d›d|d d›dƒ tdƒ t
|ƒ t|||d
ddd}t||||ddd\}}| 	d|f¡ td|d d›d|d d›dƒ tdƒ tj|jjdd|j_t|||dddd}t|||dddd}t||||ddd\}}| 	d|f¡ td|d d›d|d d›dƒ td ƒ tj|jjdd|j_t|||d!ddd}t||||ddd\}}	| 	d"|f¡ td|d d›d|d d›dƒ | jd#d$d% |	jd d ur<|j|	jd d&d' td(ƒ td)d*› ƒ td+d,›d-d.d/›d-d0d1›d-d2d1›d-d3d1›	ƒ td4› ƒ |D ]-\}
}|d d5k rnd6nd7}t|
d,›d-|d d8›d9|d d:›d;|d< d=›d>|d/›	ƒ qatd*› ƒ td?ƒ g d@¢}|D ]J\}}t dA¡ tj dA¡ t||||ddd\}}	|	jd d urçdB|› dC}|j|	jd |d' tdD|› dE|d< dF›dG|d d›dH|› ƒ qžd S )INr   zdemo/voices/modi.wavzmicrosoft/VibeVoice-1.5Brº   Úflash_attention_2)Útorch_dtypeÚ
device_mapÚattn_implementationÚsdpau¾  Speaker 1: à¤®à¥‡à¤°à¥‡ à¤ªà¥à¤¯à¤¾à¤°à¥‡ à¤¦à¥‡à¤¶à¤µà¤¾à¤¸à¤¿à¤¯à¥‹à¤‚, à¤†à¤œ à¤®à¥ˆà¤‚ à¤†à¤ªà¤•à¥‡ à¤¸à¤¾à¤¥ à¤•à¥à¤› à¤¬à¤¹à¥à¤¤ à¤œà¤¼à¤°à¥‚à¤°à¥€ à¤¬à¤¾à¤¤à¥‡à¤‚ à¤•à¤°à¤¨à¤¾ à¤šà¤¾à¤¹à¤¤à¤¾ à¤¹à¥‚à¤. à¤¹à¤®à¤¾à¤°à¤¾ à¤¦à¥‡à¤¶ à¤à¤• à¤¨à¤¯à¥‡ à¤¦à¥Œà¤° à¤®à¥‡à¤‚ à¤ªà¥à¤°à¤µà¥‡à¤¶ à¤•à¤° à¤°à¤¹à¤¾ à¤¹à¥ˆ, à¤œà¤¹à¤¾à¤ à¤Ÿà¥‡à¤•à¥à¤¨à¥‹à¤²à¥‰à¤œà¥€ à¤”à¤° à¤‡à¤¨à¥‹à¤µà¥‡à¤¶à¤¨ à¤¹à¤®à¤¾à¤°à¥€ à¤¤à¤¾à¤•à¤¤ à¤¬à¤¨ à¤°à¤¹à¥€ à¤¹à¥ˆ.zWarming up baseline...zSpeaker 1: test.r²   )rá   z)[1] Baseline: original, cfg=1.3, 20 stepsr
   )rá   r,   zBaseline (full CFG, 20 steps)z    RTF=rÒ   z.3fzx  TTFB=rÏ   z.0fÚmsz(
[2] Cached negative + cfg=1.3, 20 stepszCached neg + CFG=1.3, 20 stepsz0
[3] Cached neg + compile(LM), cfg=1.3, 20 stepsÚdefault)ÚmodezSpeaker 1: compile warmup1.zSpeaker 1: compile warmup2.z"Cached neg + compile(LM), 20 stepsz5
[4] Cached neg + compile(LM+diff), cfg=1.3, 20 stepszSpeaker 1: diff warmup.z'Cached neg + compile(LM+diff), 20 stepsÚ
samples_v3T)Úexist_okz'samples_v3/modi_cached_cfg_compiled.wav)Úoutput_pathz2    Saved: samples_v3/modi_cached_cfg_compiled.wavÚ
zZ==========================================================================================ÚConfigz<50Ú ÚRTFz>7ÚTTFBz>8ÚAudiozStream?zZ------------------------------------------------------------------------------------------g      ð?ÚYESÚNOz>6.3fzx z>7.0fzms rÑ   z>7.2fzs zH
Generating samples with best config (cached neg + compile + CFG=1.3)...))Úshortu^   Speaker 1: à¤¨à¤®à¤¸à¥à¤¤à¥‡, à¤®à¥‡à¤°à¥‡ à¤ªà¥à¤¯à¤¾à¤°à¥‡ à¤¦à¥‡à¤¶à¤µà¤¾à¤¸à¤¿à¤¯à¥‹à¤‚.)Úmediumu  Speaker 1: à¤†à¤œ à¤¹à¤® à¤¡à¤¿à¤œà¤¿à¤Ÿà¤² à¤‡à¤‚à¤¡à¤¿à¤¯à¤¾ à¤•à¥€ à¤¬à¤¾à¤¤ à¤•à¤°à¤¤à¥‡ à¤¹à¥ˆà¤‚. à¤—à¤¾à¤à¤µ à¤—à¤¾à¤à¤µ à¤®à¥‡à¤‚ à¤‡à¤‚à¤Ÿà¤°à¤¨à¥‡à¤Ÿ à¤ªà¤¹à¥à¤à¤š à¤°à¤¹à¤¾ à¤¹à¥ˆ. à¤•à¤¿à¤¸à¤¾à¤¨ à¤…à¤ªà¤¨à¥‡ à¤«à¥‹à¤¨ à¤¸à¥‡ à¤®à¤‚à¤¡à¥€ à¤•à¥‡ à¤­à¤¾à¤µ à¤¦à¥‡à¤– à¤°à¤¹à¤¾ à¤¹à¥ˆ. à¤¯à¤¹ à¤¬à¤¦à¤²à¤¾à¤µ à¤›à¥‹à¤Ÿà¤¾ à¤¨à¤¹à¥€à¤‚ à¤¹à¥ˆ, à¤¯à¤¹ à¤à¤• à¤•à¥à¤°à¤¾à¤‚à¤¤à¤¿ à¤¹à¥ˆ.)Úspeechu¡  Speaker 1: à¤­à¤¾à¤°à¤¤ à¤†à¤œ à¤¦à¥à¤¨à¤¿à¤¯à¤¾ à¤•à¥€ à¤ªà¤¾à¤à¤šà¤µà¥€à¤‚ à¤¸à¤¬à¤¸à¥‡ à¤¬à¤¡à¤¼à¥€ à¤…à¤°à¥à¤¥à¤µà¥à¤¯à¤µà¤¸à¥à¤¥à¤¾ à¤¹à¥ˆ. à¤¹à¤®à¤¾à¤°à¥‡ à¤¯à¥à¤µà¤¾à¤“à¤‚ à¤•à¥€ à¤Šà¤°à¥à¤œà¤¾, à¤¹à¤®à¤¾à¤°à¥‡ à¤µà¥ˆà¤œà¥à¤žà¤¾à¤¨à¤¿à¤•à¥‹à¤‚ à¤•à¥€ à¤ªà¥à¤°à¤¤à¤¿à¤­à¤¾, à¤”à¤° à¤¹à¤®à¤¾à¤°à¥‡ à¤•à¤¿à¤¸à¤¾à¤¨à¥‹à¤‚ à¤•à¥€ à¤®à¥‡à¤¹à¤¨à¤¤, à¤¯à¤¹à¥€ à¤¹à¤®à¤¾à¤°à¥€ à¤…à¤¸à¤²à¥€ à¤¤à¤¾à¤•à¤¤ à¤¹à¥ˆ.rÈ   zsamples_v3/modi_z.wavz  z: z.2fzs | RTF=zx | )Úosr   Úfrom_pretrainedr   r;   Úbfloat16ÚevalÚprintrè   rK   r±   Úcompiler\   Úlanguage_modelÚprediction_headÚmakedirsr2   Ú
save_audiorÙ   rº   rÚ   )rÿ   rà   Ú
model_pathrß   r\   rµ   Úconfigsr    ÚrÚoutÚnameÚcanÚtextsÚlabelÚtxtÚpathr   r   r!   Úmain
  s€   

þþ"""".
>
,€úr  Ú__main__)r²   r
   )Ú__doc__r½   r¯   rÖ   r;   Úcollectionsr   Ú.vibevoice.modular.modeling_vibevoice_inferencer   r   r   Ú-vibevoice.modular.modular_vibevoice_tokenizerr   Úvibevoice.modular.streamerr   Ú'vibevoice.processor.vibevoice_processorr   Útransformers.generationr	   r±   rè   r  Ú__name__r   r   r   r!   Ú<module>   s$    	 
A/V
ÿ