o
    J…µis<  ã                   @   s€   d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 dd„ Z
dd	„ Zd
d„ Zddd„Zdd„ Zedkr>eƒ  dS dS )a  
RTF optimization v2: properly skip CFG + torch.compile with safe mode.

Key insight: cfg_scale=1.0 still runs negative LM + doubles diffusion batch.
We patch sample_speech_tokens to run single-path when cfg disabled.
We also patch generate() to skip negative LM forward entirely.
é    N)Ú*VibeVoiceForConditionalGenerationInference)ÚAudioStreamer)ÚVibeVoiceProcessorc                    s(   t  ¡ d‡ fdd„	ƒ}ˆ jˆ _|ˆ _dS )zEPatch model to truly skip CFG: no negative LM, single-path diffusion.Nç      ð?c                    sŽ   ˆ j j ˆ j¡ ˆ j jj}|  |¡} tj| j	d ˆ j
j|| jd}ˆ j jjD ]}ˆ j j|| |j	d ¡ |¡| d}ˆ j j |||¡j}q&|S )Nr   )ÚdeviceÚdtype)Ú	condition)ÚmodelÚnoise_schedulerÚset_timestepsÚddpm_inference_stepsÚprediction_headr   ÚtoÚtorchÚrandnÚshapeÚconfigÚacoustic_vae_dimr   Ú	timestepsÚrepeatÚstepÚprev_sample)r   Úneg_conditionÚ	cfg_scaler   ÚspeechÚtÚeps©r	   © ú)/home/ubuntu/vibevoice/rtf_optimize_v2.pyÚsample_speech_tokens_no_cfg   s   

$z1patch_no_cfg.<locals>.sample_speech_tokens_no_cfg)Nr   )r   Úno_gradÚsample_speech_tokensÚ_original_sample)r	   r    r   r   r   Úpatch_no_cfg   s   

r$   c                 C   s   t | dƒr| j| _d S d S )Nr#   )Úhasattrr#   r"   r   r   r   r   Úunpatch_no_cfg#   s   
ÿr&   c                    sœ   ddl }ddlm‰ ddlm‰ ddlm}m‰ m} ddlm	‰ t
| jdƒr*| jjnd}t ¡ 								d‡ ‡‡‡fd
d„	ƒ}| j| _| || ¡| _dS )z4Patch generate to skip all negative/CFG computation.r   N)ÚVibeVoiceGenerationOutput)Ú VibeVoiceTokenizerStreamingCache)ÚGenerationConfigÚLogitsProcessorListÚStoppingCriteriaList)Ú!VibeVoiceTokenConstraintProcessorÚ__wrapped__Tr   c           D   	      sl  |  dd ¡}|  dd ¡}|  dd ¡}|  dd¡}| dd ¡d u r.| jjj|d jd  |d< | j|||fd	d
i|¤Ž\}}}}}ˆƒ }ˆƒ }|jd }|j}tj	|tj
|d}d }| dd¡} dd„ t|ƒD ƒ}!|jd }"|d jdd}#|j|j|j|jg}$t|dƒr|jd ur|$ |j¡ ˆ|$|d}%|d u rœˆ ƒ }| |%¡ t|j|" t||" ƒƒ}&t |j|# ||#  ¡ ¡}'tj	|tj
|d}(t|&ƒD ]¹})| ¡ rÑ n±|jd |jkrÜ n¦| j|fi |¤Ž}*|ri }+|d urö|j|d|+d< |d ur| |¡|+d< |d ur| |¡|+d< d}n
|*  dd ¡},d|i}+| d i |*¤|+¤dd
dddœ¤Ž}-| j|-|dd}|-jd d …dd d …f jd
tj|d}.|||.ƒ}/tj|/dd}0|j|0|< tj||0d d …d f gdd}|0|jk  ¡ r–|0|jkj!dd "d¡}1|1||1   }2|2 #¡ dkr–d
||2< |d ur–| $|2¡ |0|jkj!dd "d¡}3|3 #¡ dkr³| %|3¡ | %|3¡ |)|'k}4tj!|4| @ dd "d¡}5|5 #¡ dkrÝd
||5< d
|(|5< |d urÝ| $|5¡ | j& '¡ |0ƒ (d¡}6tj)||d| |0|jk@  }7|7 #¡ dkr|-j*|7dd d …f }8|  +|8¡ (d¡}9|9| j&j, |9j¡ | j&j- |9j¡ }:| j&j.j/|: | j&j.j¡||7 | j&j.j¡d
dd};t0|7ƒD ]\}<}=|= 1¡ }>||> sS|!|>  |;|< ¡ q=|d ur`| 2|;|7¡ | j&j3j4|;||7d
ddj5}?| j& 6|9¡}@| j& 7|?¡}A|@|A |6|7< |6}qÇ|d ur‹| $¡  g }B|!D ]}C|Cr |B tj|Cdd¡ q|B d ¡ qˆ||r±|B|(dS d |(dS )!NÚ	tokenizerÚparsed_scriptsÚall_speakers_listÚmax_length_timesé   Úmax_new_tokensÚ	input_idséÿÿÿÿÚreturn_processorsTr   )r   r   ÚverboseFc                 S   s   g | ]}g ‘qS r   r   )Ú.0Ú_r   r   r   Ú
<listcomp>Q   s    zGpatch_generate_skip_negative.<locals>.fast_generate.<locals>.<listcomp>Úattention_mask)ÚdimÚbos_token_id)r   Úspeech_tensorsÚspeech_masksÚspeech_input_maskÚinputs_embedsé   )Úlogits_to_keepÚreturn_dictÚoutput_attentionsÚoutput_hidden_states)Úis_encoder_decoder)Úcopyr   r   )Úas_tuple)ÚcacheÚsample_indicesÚ	use_cacheÚdebug)Ú	sequencesÚspeech_outputsÚreach_max_step_sampler   )8ÚpopÚgetr   Údecoder_configÚmax_position_embeddingsr   Ú#_build_generate_config_model_kwargsr   r   ÚzerosÚboolÚrangeÚsumÚspeech_start_idÚspeech_end_idÚspeech_diffusion_idÚeos_token_idr%   r=   ÚappendÚminÚ
max_lengthÚintÚlongÚallÚprepare_inputs_for_generationr   Ú#_update_model_kwargs_for_generationÚlogitsÚfloat32ÚargmaxÚcatÚanyÚnonzeroÚsqueezeÚnumelÚendÚset_to_zeror	   Úget_input_embeddingsÚ	unsqueezeÚarangeÚlast_hidden_stater"   Úspeech_scaling_factorÚspeech_bias_factorÚacoustic_tokenizerÚdecodeÚ	enumerateÚitemÚputÚsemantic_tokenizerÚencodeÚmeanÚacoustic_connectorÚsemantic_connector)DÚselfÚinputsÚgeneration_configÚlogits_processorÚstopping_criteriaÚprefix_allowed_tokens_fnÚsynced_gpusÚassistant_modelÚaudio_streamerÚnegative_prompt_idsÚnegative_prompt_attention_maskr>   r?   r@   Ú
is_prefillÚreturn_speechr   Ústop_check_fnÚ
tqdm_classÚkwargsr.   r/   r0   r1   Úmodel_kwargsr4   Úacoustic_cacheÚsemantic_cacheÚ
batch_sizer   Úfinished_tagsrA   r7   Úaudio_chunksÚinitial_lengthÚinitial_length_per_sampleÚvalid_tokensÚtoken_constraintÚ	max_stepsÚmax_step_per_samplerP   r   Úmodel_inputsÚprefill_inputsr9   ÚoutputsÚnext_token_logitsÚnext_token_scoresÚnext_tokensÚeos_idxÚnew_eosÚdiffusion_end_indicesÚmax_length_reachedÚnew_maxÚnext_inputs_embedsÚdiffusion_indicesÚpositive_conditionÚspeech_latentÚscaled_latentÚaudio_chunkÚiÚ
sample_idxÚidxÚsemantic_featuresÚacoustic_embedÚsemantic_embedÚfinal_audioÚchunks©r*   r'   r,   r(   r   r   Úfast_generate2   s
  ÿÿÿ

þ

þ

ÿÿ
þÿ&







ÿ$ý
€
þý

þþz3patch_generate_skip_negative.<locals>.fast_generate)NNNNNNNNNNNNNTTr   NN)ÚtypesÚ.vibevoice.modular.modeling_vibevoice_inferencer'   Ú-vibevoice.modular.modular_vibevoice_tokenizerr(   Útransformers.generationr)   r*   r+   r,   r%   Úgenerater-   r   r!   Ú_original_generateÚ
MethodType)r	   r·   r)   r+   Úoriginal_generater¶   r   rµ   r   Úpatch_generate_skip_negative(   s$   ø #r¿   é   çÍÌÌÌÌÌô?c                    sz  |  d¡s
d|› }||g|ggdddd}| ¡ D ]\}}t |¡r*| d¡||< qtdd d‰d g‰ d	g‰d g}	‡ ‡‡fd
d„}
tj|
dd}| ¡  t 	d¡ tj
 d¡ | j|d tj
 ¡  t ¡ |	d	< | jdi |¤d ||jddiddˆddœ¤Ž}tj
 ¡  t ¡ }|jdd ||	d	  }ˆ d	 r¢ˆ d	 |	d	  d nd}ˆd	 d }|d	kr²|| ntdƒ}||||dœS )NÚSpeakerzSpeaker 1: TÚpt)ÚtextÚvoice_samplesÚpaddingÚreturn_tensorsÚreturn_attention_maskÚcudarB   )r“   Ústop_signalr   c                     sF   ˆ  d¡D ]} t ¡ }ˆ d d u r|ˆ d< ˆd  | jd 7  < qd S )Nr   r5   )Ú
get_streamÚtimeÚperf_counterr   )Úchunkr   ©Úfirst_chunkÚstreamerÚ
total_sampr   r   Úconsumeré   s   üzmeasure.<locals>.consumer)ÚtargetÚdaemoné*   )Ú	num_stepsÚ	do_sampleF)r3   r   r.   r‚   r7   r‹   rˆ   Úshow_progress_baré   )Útimeoutiè  r5   g     p×@Úinf)Úttfb_msÚgen_sÚaudio_sÚrtfr   )Ú
startswithÚitemsr   Ú	is_tensorr   r   Ú	threadingÚThreadÚstartÚmanual_seedrÉ   Úmanual_seed_allÚset_ddpm_inference_stepsÚsynchronizerÌ   rÍ   r»   r.   ÚjoinÚfloat)r	   Ú	processorÚ
voice_pathrÄ   Ú
ddpm_stepsr   r   ÚkÚvræ   rÓ   Úthrž   rn   ÚgenÚttfbÚdurrà   r   rÏ   r   ÚmeasureÙ   sJ   


þ
€

ÿ

ý
 rö   c            
      C   s0  d} d}d}t  |¡}ztj|tjddd}W n   tj|tjddd}Y | ¡  g }tdƒ | d	¡ t||| d
d	d}tdƒ t||| |d	dd}| 	d|f¡ td|d d›d|d d›dƒ tdƒ t
|ƒ t|ƒ t||| d
d	d}t||| |d	dd}| 	d|f¡ td|d d›d|d d›dƒ tdƒ tj|jjdd|j_t||| dd	d}t||| dd	d}t||| |d	dd}| 	d|f¡ td|d d›d|d d›dƒ td ƒ tj|jjdd|j_t||| d!d	d}t||| |d	dd}| 	d"|f¡ td|d d›d|d d›dƒ td#ƒ t||| |d$dd}| 	d%|f¡ td|d d›d|d d›dƒ td&d'› ƒ td(d)›d*d+d,›d*d-d.›d*d/d.›d*d0d.›	ƒ td1› ƒ |D ]-\}}|d dk rpd2nd3}	t|d)›d*|d d4›d5|d d6›d7|d8 d9›d:|	d,›	ƒ qctd'› ƒ d S );Nzdemo/voices/modi.wavzmicrosoft/VibeVoice-1.5Bu?  Speaker 1: à¤®à¥‡à¤°à¥‡ à¤ªà¥à¤¯à¤¾à¤°à¥‡ à¤¦à¥‡à¤¶à¤µà¤¾à¤¸à¤¿à¤¯à¥‹à¤‚, à¤†à¤œ à¤®à¥ˆà¤‚ à¤†à¤ªà¤•à¥‡ à¤¸à¤¾à¤¥ à¤•à¥à¤› à¤¬à¤¹à¥à¤¤ à¤œà¤¼à¤°à¥‚à¤°à¥€ à¤¬à¤¾à¤¤à¥‡à¤‚ à¤•à¤°à¤¨à¤¾ à¤šà¤¾à¤¹à¤¤à¤¾ à¤¹à¥‚à¤. à¤¹à¤®à¤¾à¤°à¤¾ à¤¦à¥‡à¤¶ à¤à¤• à¤¨à¤¯à¥‡ à¤¦à¥Œà¤° à¤®à¥‡à¤‚ à¤ªà¥à¤°à¤µà¥‡à¤¶ à¤•à¤° à¤°à¤¹à¤¾ à¤¹à¥ˆ.rÉ   Úflash_attention_2)Útorch_dtypeÚ
device_mapÚattn_implementationÚsdpazWarming up...rÀ   zSpeaker 1: test.)rï   z2[1] Baseline: original generate, cfg=1.3, 20 stepsrÁ   )rï   r   zBaseline (cfg=1.3, 20 steps)z    RTF=rà   z.3fzx  TTFB=rÝ   z.0fÚmszG
[2] NO-CFG patched: skip negative LM + single-path diffusion, 20 stepsr   zNO-CFG patched, 20 stepsz7
[3] NO-CFG + torch.compile(LM, mode=default), 20 stepsÚdefault)ÚmodezSpeaker 1: compile warmup.zSpeaker 1: compile warmup2.zNO-CFG + compile(LM), 20 stepsz-
[4] NO-CFG + compile(LM+diffusion), 20 stepszSpeaker 1: diff compile warmup.z#NO-CFG + compile(LM+diff), 20 stepsz(
[5] NO-CFG + compile(LM+diff), 10 stepsé
   zNO-CFG + compile all, 10 stepsÚ
zZ==========================================================================================ÚConfigz<45Ú ÚRTFz>7ÚTTFBz>8ÚAudiozStream?zZ------------------------------------------------------------------------------------------ÚYESÚNOz>6.3fzx z>7.0fzms rß   z>7.2fzs )r   Úfrom_pretrainedr   r   Úbfloat16ÚevalÚprintré   rö   r^   r$   r¿   Úcompiler	   Úlanguage_modelr   )
rî   Ú
model_pathrÄ   rí   r	   Úconfigsr9   ÚrÚnameÚcanr   r   r   Úmain	  sl   

þþ
""""".
>r  Ú__main__)rÀ   rÁ   )Ú__doc__rÌ   rä   r   r¸   r   Úvibevoice.modular.streamerr   Ú'vibevoice.processor.vibevoice_processorr   r$   r&   r¿   rö   r  Ú__name__r   r   r   r   Ú<module>   s      
20G
ÿ