o
    Ñµiž  ã                   @   s‚   d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddd„Zd	d
„ Zddd„Zdd„ Zedkr?eƒ  dS dS )zx
CUDA graph accelerated VibeVoice diffusion head.
Captures prediction_head as a CUDA graph, replays for each DDPM step.
é    N)Ú*VibeVoiceForConditionalGenerationInference)ÚAudioStreamer)ÚVibeVoiceProcessoré   c                 C   sì   | j j}t| ¡ ƒj}t| ¡ ƒj}|d }tj|| jj	||d}tj|||d}tj|| jj
j||d}tdƒD ]	}	||||d q9tj ¡  tj ¡ }
tj |
¡ ||||d}W d  ƒ n1 sew   Y  tj ¡  |
||||fS )zMCapture prediction_head as CUDA graph for given batch size (doubled for CFG).é   ©ÚdeviceÚdtypeé   )Ú	conditionN)ÚmodelÚprediction_headÚnextÚ
parametersr   r	   ÚtorchÚzerosÚconfigÚacoustic_vae_dimÚdecoder_configÚhidden_sizeÚrangeÚcudaÚsynchronizeÚ	CUDAGraphÚgraph)r   Ú
batch_sizeÚphr   r	   Ú	cfg_batchÚstatic_xÚstatic_tÚstatic_condÚ_r   Ú
static_out© r#   ú'/home/ubuntu/vibevoice/cuda_graph_v2.pyÚsetup_cuda_graph_diffusion   s    

ÿ
r%   c                    s0   ˆj }t ¡ d‡ ‡‡‡‡‡fdd„	ƒ}|ˆ_ |S )z5Replace sample_speech_tokens with CUDA graph version.ç      @c                    s  ˆj j ˆj¡ tj| |gdd}ˆ |¡ tj|jd ˆj	j
|j|jd}ˆj jjD ]R}|d t|ƒd … }tj||gdd}ˆ |¡ ˆ | ¡ ¡ ˆ  ¡  ˆ ¡ }tj|t|ƒd dd\}	}
|
||	|
   }tj||gdd}ˆj j |||¡j}q+|d t|ƒd … S )Nr   )Údimr   r   )r   Únoise_schedulerÚset_timestepsÚddpm_inference_stepsr   ÚcatÚcopy_ÚrandnÚshaper   r   r   r	   Ú	timestepsÚlenÚfill_ÚitemÚreplayÚcloneÚsplitÚstepÚprev_sample)r   Úneg_conditionÚ	cfg_scaleÚcombined_condÚspeechÚtÚhalfÚcombinedÚepsÚcond_epsÚ
uncond_epsÚhalf_epsÚfull_eps©r   r   r    r"   r   r   r#   r$   Úfast_sample,   s$   
ÿ
z1patched_sample_speech_tokens.<locals>.fast_sampleN)r&   )Úsample_speech_tokensr   Úno_grad)r   r   r   r   r    r"   Úoriginal_samplerE   r#   rD   r$   Úpatched_sample_speech_tokens(   s
   rI   é
   Ú c                    sä  ||g|ggdddd}|  ¡ D ]\}}t |¡r | d¡||< qtdd d‰d g‰ dg‰d g}	g ‰dg‰‡ ‡‡fd	d
„}
‡‡fdd„}tj|
dd}| ¡  tj|dd}| ¡  t d¡ tj	 
d¡ | j|d tj	 ¡  t ¡ |	d< | jd$i |¤d d|jddiddˆddœ¤Ž}tj	 ¡  t ¡ }dˆd< |jdd t d¡ ||	d  }ˆ d r¸ˆ d |	d  d nd}ˆd d }|dkrÈ|| nd}ˆrÔtˆƒtˆƒ nd}t|d›d|d›d|d›d|d ›d!|d›d"
dd# |S )%NTÚpt)ÚtextÚvoice_samplesÚpaddingÚreturn_tensorsÚreturn_attention_maskr   r   )r   Ústop_signalr   Fc                     sF   ˆ  d¡D ]} t ¡ }ˆ d d u r|ˆ d< ˆd  | jd 7  < qd S )Nr   éÿÿÿÿ)Ú
get_streamÚtimeÚperf_counterr.   )Úchr<   )ÚfcÚstreamerÚtsr#   r$   ÚconsumerS   s
   ýzrun_benchmark.<locals>.consumerc                     sX   ˆd s*t jg d¢ddd} zˆ  t| j ¡ ƒ¡ W n   Y t d¡ ˆd rd S d S )Nr   )z
nvidia-smiz--query-gpu=utilization.gpuz--format=csv,noheader,nounitsT)Úcapture_outputrM   g333333Ã?)Ú
subprocessÚrunÚappendÚfloatÚstdoutÚstriprU   Úsleep)Úr)Úgpu_sÚstopr#   r$   ÚmonY   s   
ÿ
ûzrun_benchmark.<locals>.mon)ÚtargetÚdaemoné*   )Ú	num_stepsgÍÌÌÌÌÌô?Ú	do_sample)Úmax_new_tokensr9   Ú	tokenizerÚgeneration_configÚverboseÚ
is_prefillÚaudio_streamerÚshow_progress_barr
   )Útimeoutg333333Ó?iè  rS   g     p×@iç  z<45z RTF=z.3fzx TTFB=z.0fz	ms Audio=z.2fzs GPU=ú%©Úflushr#   )Úitemsr   Ú	is_tensorÚtor   Ú	threadingÚThreadÚstartÚmanual_seedr   Úmanual_seed_allÚset_ddpm_inference_stepsr   rU   rV   Úgeneratern   Újoinrc   Úsumr0   Úprint)r   Ú	processorÚ
voice_pathrM   Ú
ddpm_stepsÚlabelÚinpÚkÚvÚstr[   rg   ÚthÚmtÚoutÚendÚgenÚttfbÚdurÚrtfÚavg_gpur#   )rX   re   rf   rY   rZ   r$   Úrun_benchmarkI   s:   ÿ€


þ 4r–   c               	   C   sÆ  d} d}t  d¡}ztjdtjddd}W n   tjdtjddd}Y | ¡  t||| dd	d
d t||| |d	dd t||| |ddd tddd t|dd\}}}}}t	||||||ƒ}	tddd t||| |d	dd t||| |ddd tddd tj
|jjdd|j_t||| dd	dd t||| dd	dd t||| |d	dd t||| |ddd dd l}
|
jd dd! t||| |dd"d}|jd d urá|j|jd d#d$ td%dd d S d S )&Nzdemo/voices/modi.wavu?  Speaker 1: à¤®à¥‡à¤°à¥‡ à¤ªà¥à¤¯à¤¾à¤°à¥‡ à¤¦à¥‡à¤¶à¤µà¤¾à¤¸à¤¿à¤¯à¥‹à¤‚, à¤†à¤œ à¤®à¥ˆà¤‚ à¤†à¤ªà¤•à¥‡ à¤¸à¤¾à¤¥ à¤•à¥à¤› à¤¬à¤¹à¥à¤¤ à¤œà¤¼à¤°à¥‚à¤°à¥€ à¤¬à¤¾à¤¤à¥‡à¤‚ à¤•à¤°à¤¨à¤¾ à¤šà¤¾à¤¹à¤¤à¤¾ à¤¹à¥‚à¤. à¤¹à¤®à¤¾à¤°à¤¾ à¤¦à¥‡à¤¶ à¤à¤• à¤¨à¤¯à¥‡ à¤¦à¥Œà¤° à¤®à¥‡à¤‚ à¤ªà¥à¤°à¤µà¥‡à¤¶ à¤•à¤° à¤°à¤¹à¤¾ à¤¹à¥ˆ.zmicrosoft/VibeVoice-1.5Br   Úflash_attention_2)Útorch_dtypeÚ
device_mapÚattn_implementationÚsdpazSpeaker 1: warmup.rJ   Úwarmup)r‡   rˆ   z[1] Baseline (10 steps)é   z[1] Baseline (20 steps)z,
Setting up CUDA graph for diffusion head...Trv   r   )r   zCUDA graph ready.z#[2] CUDA graph diffusion (10 steps)z#[2] CUDA graph diffusion (20 steps)z
Compiling LM backbone...Údefault)ÚmodezSpeaker 1: compile warmup.zcompile warmup 1zcompile warmup 2z+[3] CUDA graph diff + compile LM (10 steps)z+[3] CUDA graph diff + compile LM (20 steps)r   Úsamples_cuda_graph_v2)Úexist_okz[FINAL] Best config 20 stepsz#samples_cuda_graph_v2/modi_best.wav)Úoutput_pathz*Saved: samples_cuda_graph_v2/modi_best.wav)r   Úfrom_pretrainedr   r   Úbfloat16Úevalr–   r„   r%   rI   Úcompiler   Úlanguage_modelÚosÚmakedirsÚspeech_outputsÚ
save_audio)r†   rM   r…   r   r   ÚsxÚst_bufÚscÚsoÚorigr¨   r   r#   r#   r$   Úmainv   sH   

þþþr±   Ú__main__)r   )rJ   rK   )Ú__doc__r   rU   r{   r]   Ú.vibevoice.modular.modeling_vibevoice_inferencer   Úvibevoice.modular.streamerr   Ú'vibevoice.processor.vibevoice_processorr   r%   rI   r–   r±   Ú__name__r#   r#   r#   r$   Ú<module>   s    

!-1
ÿ