o
    isN                     @   sT  d dl Z d dlZd dlmZ de jd< eje je je	 d d dl
Z
d dlZd dlZd dlmZ d dlZed d dlmZ d dlZd dlZd dlZd dlZd dlmZ d d	lmZmZ d d
lmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& i a'i a(ej)* rdnej+* rdn	ej,j-* rdndZ.ej/dkrddinddiZ0dZ1dZ2dZ3dZ4dZ5dZ6dZ7dZ8dZ9dZ:dZ;d Z<d!Z=dZ>dBd#d$Z?ddd%e.dfd&d'Z@daAe.dfd(eBfd)d*ZCdCd+d,ZDdDd(eBfd.d/ZEe6d%e9d-e.fd0d1ZFdEd3d4ZGeHfd5d6ZIe6eHee7e8e:e;e<e=e>e.fd7d8ZJdeddddd9d:dddd;fd<d=ZKd>d? ZLd@dA ZMdS )F    N)ThreadPoolExecutor1PYTORCH_ENABLE_MPS_FALLBACKz/../../third_party/BigVGAN/)filesAgg)hf_hub_download)AudioSegmentsilence)pipeline)Vocos)CFM)convert_char_to_pinyinget_tokenizercudaxpumpscpu)      delete_on_closeFdeletei]  d      i   vocosg?g333333?euler    g       @g      g      ?   c                 C   s   g }d}t d| }|D ]D}t|dt|d |kr4||r0t|d ddkr0|d n|7 }q|r=||  |rNt|d ddkrN|d n|}q|rZ||  |S )a  
    Splits the input text into chunks, each with a maximum number of characters.

    Args:
        text (str): The text to be split.
        max_chars (int): The maximum number of characters per chunk.

    Returns:
        List[str]: A list of text chunks.
     u*   (?<=[;:,.!?])\s+|(?<=[；：，。！？])utf-8    )resplitlenencodeappendstrip)text	max_charschunkscurrent_chunk	sentencessentence r.   L/home/ubuntu/.local/lib/python3.10/site-packages/f5_tts/infer/utils_infer.py
chunk_textI   s    ,(r0   r   c                 C   s@  | dkre|rt d|  | d}| d}nt d d}t||dd}t||d	d}t|}tj|d
dd}	ddlm}
 t|j	|
rWdd |j	j
  D }|	| ||	 | |}|S | dkrzddlm} W n ty~   t d Y nw |r|jj|dd}n	|jjdd|d}|  | |}|S )Nr   zLoad vocos from local path z/config.yamlz/pytorch_model.binz8Download Vocos from huggingface charactr/vocos-mel-24khzzcharactr/vocos-mel-24khzzconfig.yaml)repo_id	cache_dirfilenamezpytorch_model.binr   Tmap_locationweights_onlyr   )EncodecFeaturesc                 S   s   i | ]	\}}d | |qS )zfeature_extractor.encodec.r.   ).0keyvaluer.   r.   r/   
<dictcomp>y   s    z load_vocoder.<locals>.<dictcomp>bigvgan)r<   zSYou need to follow the README to init submodule and change the BigVGAN source code.F)use_cuda_kernelz$nvidia/bigvgan_v2_24khz_100band_256x)r=   r2   )printr   r   from_hparamstorchloadvocos.feature_extractorsr7   
isinstancefeature_extractorencodec
state_dictitemsupdateload_state_dictevaltothird_party.BigVGANr<   ImportErrorBigVGANfrom_pretrainedremove_weight_norm)vocoder_nameis_local
local_pathdevicehf_cache_dirconfig_path
model_pathr1   vocoderrF   r7   encodec_parametersr<   r.   r.   r/   load_vocoderh   sD   



rZ   rT   c                 C   sR   |d u rd| v rt j| jdkrt j dst jnt j}tdd|| da	d S )Nr      [ZLUDA]zautomatic-speech-recognitionzopenai/whisper-large-v3-turbo)modeltorch_dtyperT   )
r@   r   get_device_propertiesmajorget_device_nameendswithfloat16float32r
   asr_pipe)rT   dtyper.   r.   r/   initialize_asr_pipeline   s   
rg   c                 C   s>   t d u r	ttd t | dd|rd|dnddiddd	  S )
NrT         
transcribe)tasklanguagerl   F)chunk_length_s
batch_sizegenerate_kwargsreturn_timestampsr(   )re   rg   rT   r'   )	ref_audiorm   r.   r.   r/   rk      s   
rk   Tc           	      C   s  |d u rd|v rt j|jdkrt j dst jnt j}| |} |	dd }|dkr<ddl
m} |||d	}nt j||d
d}|rs|dkrNd|i}dd |d  D |d< dD ]}||d v rj|d |= q]| |d  n|dkr{d|i}| |d  ~t j  | |S )Nr   r[   r\   .r   safetensorsr   )	load_filerh   Tr4   ema_model_state_dictc                 S   s&   i | ]\}}|d vr| dd|qS ))inittedstepz
ema_model.r   )replace)r8   kvr.   r.   r/   r;      s
    z#load_checkpoint.<locals>.<dictcomp>model_state_dict)zmel_spec.mel_stft.mel_scale.fbz$mel_spec.mel_stft.spectrogram.window)r@   r   r_   r`   ra   rb   rc   rd   rK   r#   safetensors.torchru   rA   rG   rI   empty_cache)	r]   	ckpt_pathrT   rf   use_ema	ckpt_typeru   
checkpointr9   r.   r.   r/   load_checkpoint   s>   





r   c              
   C   s   |dkrt tdd}d}td| td| td|d t||\}	}
t| di ||
td	ttt	t
tt|d
t|d|	d|}|dkrMtjnd }t|||||d}|S )Nr   f5_ttszinfer/examples/vocab.txtcustomz	
vocab : ztoken : zmodel : 
)text_num_embedsmel_dim)n_fft
hop_length
win_lengthn_mel_channelstarget_sample_ratemel_spec_type)method)transformermel_spec_kwargsodeint_kwargsvocab_char_mapr<   )rf   r   r.   )strr   joinpathr>   r   r   r   dictr   r   r   r   rK   r@   rd   r   )	model_cls	model_cfgr   r   
vocab_file
ode_methodr   rT   	tokenizerr   
vocab_sizer]   rf   r.   r.   r/   
load_model   s6   


r   c                 C   s\   t j| |d}| |d  } | j}t| D ]}|j|kr n|d8 }q| d t|d  }|S )N)silence_thresholdgMbP?  )r	   detect_leading_silenceduration_secondsreverseddBFSint)audior   non_silent_start_idxnon_silent_end_durationmstrimmed_audior.   r.   r/   remove_silence_edges  s   

r   c                 C   sD  |d t | d}| }t| }W d    n1 sw   Y  |tv r1|d t| }ntjdddit}|j	}W d    n1 sIw   Y  t
| }	tj|	ddddd	}
t
jd
d}|
D ]}t|dkr{t|| dkr{|d  n||7 }qet|dkrtj|	ddddd	}
t
jd
d}|
D ]}t|dkrt|| dkr|d  n||7 }q|}	t|	dkr|	d d }	|d t|	t
jdd }	|	j|dd |}|t|< | s|tv r|d t| }n|d t|}|t|< n|d |ds|ds|dr|d7 }n|d7 }td| ||fS )NzConverting audio...rbz,Using cached preprocessed reference audio...suffixz.wavr   
   min_silence_lensilence_threshkeep_silence	seek_stepr   durationip  i.  z&Audio is over 12s, clipping short. (1)r   iz&Audio is over 12s, clipping short. (2)z&Audio is over 12s, clipping short. (3)2   wavformatzUsing cached reference text...z;No reference text provided, transcribing reference audio...zUsing custom reference text...z. u   。rs   r!   z
ref_text  r.   )openreadhashlibmd5	hexdigest_ref_audio_cachetempfileNamedTemporaryFiletempfile_kwargsnamer   	from_filer	   split_on_silencesilentr$   r   exportr'   _ref_text_cacherk   rb   r>   )ref_audio_origref_text	show_info
audio_file
audio_data
audio_hashrr   f	temp_pathasegnon_silent_segsnon_silent_wavenon_silent_segr.   r.   r/   preprocess_ref_audio_text(  sj   









r   c                 C   s   t | \}}tt|d|jd |  d|jd |   | }t||d}t|D ]\}}td| | q,td |dt| d t	t
||f||||||||	|
|||||d	S )
Nr   r      )r)   z	gen_text r   zGenerating audio in z batches...)
r   progress
target_rmscross_fade_durationnfe_stepcfg_strengthsway_sampling_coefspeedfix_durationrT   )
torchaudiorA   r   r$   r%   shaper0   	enumerater>   nextinfer_batch_process)rr   r   gen_text	model_objrX   r   r   r   r   r   r   r   r   r   r   rT   r   srr)   gen_text_batchesir.   r.   r/   infer_process~  s4   6r   r   r    i   c           (      #   s   | \ } j d dkrtj ddd ttt 

k r)  
  |tkr8tj|t}|   	| g }g }t
	d ddkrP	d 	 	
fdd	r|d urn||n|D ]}|D ]}|V  qvqpd S t 6fd
d|D }|d ur||n|D ]}| }|rt|\}}|| || qW d    n1 sw   Y  |rB|dkrt|}nf|d }tdt
|D ]Z}|}|| }t|t } t| t
|t
|} | dkrt||g}q||  d  }!|d |  }"tdd| }#tdd| }$|!|# |"|$  }%t|d |   |%|| d  g}&|&}qtj|dd}'|t|'fV  d S d td fV  d S )Nr   r    T)dimkeepdimr   r   r!   c              	   3   s   	}t | ddk rd}|  g}t|} jd t }d ur+tt t }nt d}t | d}|t|| | |  }t  j	 ||d\}}	~	|
tj}|d d |d d d f }|ddd}d	kr||}
nd
kr|}
k r|
  }
|
   }

rtdt |
D ]}|
||  tfV  qn|d   }~|
|fV  W d    d S W d    d S 1 sw   Y  d S )Nr   r   g333333?r   )condr(   r   stepsr   r   r      r    r   r<   )r$   r%   r   r   r   r   r   r@   inference_modesamplerK   rd   permutedecodesqueezer   numpyrange)r   local_speed	text_listfinal_text_listref_audio_lenr   ref_text_lengen_text_len	generated_generated_wavejgenerated_cpu)r   r   
chunk_sizer   r   r   r   r   rmsr   	streamingr   r   rX   r.   r/   process_batch  sT   


"z*infer_batch_process.<locals>.process_batchc                    s   g | ]}  |qS r.   )submit)r8   r   )executorr  r.   r/   
<listcomp>  s    z'infer_batch_process.<locals>.<listcomp>)axis)r   r@   meansqrtsquarer   r   
transformsResamplerK   r$   r%   tqdmr   resultr   r&   npconcatenater   r   minlinspace)(rr   r   r   r   rX   r   r   r   r   r   r   r   r   r   rT   r  r  r   	resamplergenerated_wavesspectrogramsr   chunkfuturesfuturer  r   generated_mel_spec
final_waver   	prev_wave	next_wavecross_fade_samplesprev_overlapnext_overlapfade_outfade_incross_faded_overlapnew_wavecombined_spectrogramr.   )r   r   r  r  r   r   r   r   r  r   r  r   r  r   r   rX   r/   r     sr   
&3

	r   c                 C   sR   t | }tj|ddddd}t jdd}|D ]}||7 }q|}|j| dd	 d S )
Nr   r   i  r   r   r   r   r   r   )r   r   r	   r   r   r   )r3   r   r   r   r   r.   r.   r/    remove_silence_for_generated_wavI  s   


r'  c                 C   s:   t jdd t j| ddd t   t | t   d S )N)r      )figsizelowerauto)originaspect)pltfigureimshowcolorbarsavefigclose)spectrogrampathr.   r.   r/   save_spectrogramX  s
   
r6  )r   )N)NT)r   )Nossysconcurrent.futuresr   environr5  r&   dirnameabspath__file__r   r"   r   importlib.resourcesr   
matplotlibusematplotlib.pylabpylabr.  r   r  r@   r   r  huggingface_hubr   pydubr   r	   transformersr
   r   r   f5_tts.modelr   f5_tts.model.utilsr   r   r   r   r   is_availabler   backendsr   rT   version_infor   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r0   rZ   re   r   rg   rk   r   r   r   r>   r   r   r   r'  r6  r.   r.   r.   r/   <module>   s   
"



,
4

)\
9
 