o
    i=-                     @   s,  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dl	Z
d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z" e j#d
dddZ$e$j%dde&ej'(ed)dddd e$j%dde&dd e$j%dde&dd e$j%dde&dd e$j%dd e&d!d e$j%d"d#e&d$d e$j%d%d&e&d'd e$j%d(d)e&d*d e$j%d+d,e&d-d e$j%d.d/e&d0d e$j%d1d2e&d3d e$j%d4d5d6d7 e$j%d8d9d:d7 e$j%d;d5d<d7 e$j%d=d5d>d7 e$j%d?e&d@dAgdBe dC e$j%dDe*dEe" d e$j%dFe*dGe d e$j%dHe+dIe d e$j%dJe*dKe d e$j%dLe*dMe! d e$j%dNe*dOe  d e$j%dPe*dQe d e$j%dRe&dSd e$, Z-e.e/e-j0dTZ0e-j1pue02dUdVZ1e-j3pe02dWdXZ3e-j4pe02dYdXZ4e-j5pe02dZd[Z5e-j6dure-j6ne02d\d]Z6e-j7pe02d^d_Z7e-j8pe02d`dXZ8e-j9pe02dadbZ9e-j:pe02dcdde; <de dfZ:e-j=pe02dgdhZ=e-j>pe02didhZ?e=re?re@dj e-jApe02dkdhZAe-jBpe02dldhZBe-jCpe02dmeZCe-j"pe02dne"Z"e-jp#e02doeZe-jp-e02dpeZe-jp7e02dqeZe-j!pAe02dre!Z!e-j pKe02dse Z e-jpUe02dteZe-jp_e02dueZdve5v roe&ed)e5 Z5dve8v r~e&ed)e8 Z8dwe0v re0dw D ] ZDe0dw eD dZ ZEdveEv re&ed)eE e0dw eD dZ< qe8re/e8dxdyF Z7ee9e: ZGe=rej'(e9ee:jH dzZIej'JeIseKeI eCd@krd{ZLneCdAkrd|ZLeeCeBeLed}ZMe.e-jNpe02d~e&ed)de1 dZNedeNj1jO ZPeNj1jQZRd\ZSZTZUe1dkr&eCeNj1jVjks&J e1dkr=eCd@kr3dZTneCdAkr<dZ1dZUn	e1dkrFdZSdZTe3s\e&edeS de1 deT deU Z3ne3Wdrhe&ee3Z3e4Wdrte&ee4Z4e@de1 d eePeRe3eCe4edZXdd ZYeZdkreY  dS dS )    N)datetime)files)Path)cached_path)	get_class)	OmegaConf)	unidecode)cfg_strengthcross_fade_durationdevicefix_durationinfer_process
load_modelload_vocodermel_spec_typenfe_steppreprocess_ref_audio_text remove_silence_for_generated_wavspeedsway_sampling_coef
target_rmszpython3 infer-cli.pyzCCommandline interface for E2/F5 TTS with Advanced Batch Processing.zCSpecify options above to override one or more settings from config.)progdescriptionepilogz-cz--configf5_ttszinfer/examples/basicz
basic.tomlzCThe configuration file, default see infer/examples/basic/basic.toml)typedefaulthelpz-mz--modelz>The model name: F5TTS_v1_Base | F5TTS_Base | E2TTS_Base | etc.)r   r   z-mcz--model_cfgz*The path to F5-TTS model config file .yamlz-pz--ckpt_filez<The path to model checkpoint .pt, leave blank to use defaultz-vz--vocab_filez7The path to vocab file .txt, leave blank to use defaultz-rz--ref_audiozThe reference audio file.z-sz
--ref_textz/The transcript/subtitle for the reference audioz-tz
--gen_textz*The text to make model synthesize a speechz-fz
--gen_filez6The file with text to generate, will ignore --gen_textz-oz--output_dirzThe path to output folderz-wz--output_filezThe name of output filez--save_chunk
store_truez*To save each audio chunks during inference)actionr   z--no_legacy_textstore_falsezLNot to use lossy ASCII transliterations of unicode text in saved file names.z--remove_silencez%To remove long silence found in ouputz--load_vocoder_from_localzITo load vocoder from local dir, default to ../checkpoints/vocos-mel-24khzz--vocoder_namevocosbigvganz,Used vocoder name: vocos | bigvgan, default )r   choicesr   z--target_rmsz;Target output speech loudness normalization value, default z--cross_fade_durationzBDuration of cross-fade between audio segments in seconds, default z
--nfe_stepz=The number of function evaluation (denoising steps), default z--cfg_strengthz+Classifier-free guidance strength, default z--sway_sampling_coefz#Sway Sampling coefficient, default z--speedz*The speed of the generated audio, default z--fix_durationz@Fix the total duration (ref and gen audios) in seconds, default z--devicezSpecify the device to run onrbmodelF5TTS_v1_Base	ckpt_file 
vocab_file	ref_audioz%infer/examples/basic/basic_ref_en.wavref_textz2Some call me nature, others call me mother nature.gen_textz)Here we generate something just for test.gen_file
output_dirtestsoutput_file
infer_cli_z%Y%m%d_%H%M%S.wav
save_chunkFno_legacy_textz
Warning to --save_chunk: lossy ASCII transliterations of unicode text for legacy (.wav) file names, --no_legacy_text to disable.
remove_silenceload_vocoder_from_localvocoder_namer   r
   r   r	   r   r   r   r   zinfer/examples/voicesrzutf-8_chunksz../checkpoints/vocos-mel-24khzz,../checkpoints/bigvgan_v2_24khz_100band_256x)r7   is_local
local_pathr   	model_cfgzconfigs/z.yamlzf5_tts.model.)zF5-TTSi safetensors
F5TTS_BaseiO F5TTS_Base_bigvganpt
E2TTS_BasezE2-TTSzhf://SWivid//z/model_.zhf://zUsing z...)r   r)   r   c                  C   sb  t td} dtvrd| i}ntd }| |d< |D ]1}td| td|| d  t|| d || d \|| d< || d< td|| d d	 qg }d
}t|t}d}|D ]}| s_qXt	||}|rl|d }ntd d}||vrtd| d d}t
|d|}|| d }	|| d }
|| dt}| }td|  t|	|
|tttttttt|ttd\}}}|| trt|dkr|d d d }trt|}ttj !t"t|d  d| d|| qX|r/t#$|}tj %t&st't& t(t)d}t|j*|| t+rt,|j* t|j* W d    d S 1 s(w   Y  d S d S )N)r*   r+   r8   mainzVoice:z
ref_audio r*   r+   
ref_audio_z

z(?=\[\w+\])z	\[(\w+)\]   zNo voice tag found, using main.zVoice z not found, using main.r(   r   zVoice: )	r   r   r
   r   r	   r   r   r   r      z ... _r2   wb)-r*   r+   configprintr   resplitr,   stripmatchsubgetr   r   	ema_modelvocoderr7   r   r
   r   r	   r   r   r   appendr3   lenuse_legacy_textr   sfwriteospathjoinoutput_chunk_dirnpconcatenateexistsr.   makedirsopen	wave_pathnamer5   r   )
main_voicer8   voicegenerated_audio_segmentsreg1chunksreg2textrP   rF   	ref_text_local_speed	gen_text_audio_segmentfinal_sample_ratespectrogram
final_wavef rt   J/home/ubuntu/.local/lib/python3.10/site-packages/f5_tts/infer/infer_cli.pyrE   3  s   




 


$rE   __main__)[argparsecodecsrZ   rM   r   importlib.resourcesr   pathlibr   numpyr^   	soundfilerX   tomlir   hydra.utilsr   	omegaconfr   r   f5_tts.infer.utils_inferr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   ArgumentParserparseradd_argumentstrr[   r\   joinpathfloatint
parse_argsargsloadrb   rK   r%   rR   r'   r)   r*   r+   r,   r-   r.   r0   nowstrftimer3   r4   rW   rL   r5   r6   r7   rf   voice_ref_audioreadrc   stemr]   r`   ra   vocoder_local_pathrT   r=   backbone	model_clsarch	model_arc	repo_name	ckpt_step	ckpt_typemel_spec
startswithrS   rE   __name__rt   rt   rt   ru   <module>   s   @




 


(





&
P
