o
    :i                     @   s`  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
Zd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dlm	Z	m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- e%G dd dZ.ej/d#ddZ0d$ddZ1d%ddZ2d&d!d"Z3dS )'    N)Path)Optional)pipeline)snapshot_download)fix_random_seed)ZipVoiceDistill)EmiliaTokenizer)load_checkpoint)AttributeDictstr2bool)
VocosFbank)rms_norm)	dataclassfield)r   List)Vocos)	OnnxModel)parametrizec                   @   sb   e Zd ZU dZee ed< dZeed< dZee ed< dZ	ee ed< dZ
eed< d	Zeed
< dS )LuxTTSConfigN	model_dirzmodel.ptcheckpoint_namevocoder_pathtrt_engine_pathemilia	tokenizerzen-uslang)__name__
__module____qualname__r   r   str__annotations__r   r   r   r   r    r!   r!   ./home/ubuntu/LuxTTS/zipvoice/modeling_utils.pyr   !   s   
 r   皙?   c                 C   s   t j| d|d\}}	t j| d|d\}
}	||
d }t| t|d}t||\}}|j|dd|}|d| }tj	|
dg|d}||g}||||fS )	Ni]  )srdurationi>  textr   )sampling_rate   )device)librosaloadprinttorch
from_numpy	unsqueezer   extracttotensorsizetexts_to_token_ids)audiotranscriberr   feature_extractorr*   
target_rmsr&   
feat_scale
prompt_wavr%   prompt_wav2prompt_text
prompt_rmsprompt_featuresprompt_features_lensprompt_tokensr!   r!   r"   process_audio.   s   rB         @      ?      ?c                 C   s   | |g}t| j}|
d }
t  |j|| |||
|d||	d	\}}}}W d    n1 s2w   Y  |dddd }||	d
dd}||k rV|||  }|S )	Ng?predict)	tokensrA   r?   r@   speedt_shiftr&   num_stepguidance_scaler      r)   r#   )r5   next
parametersr*   r.   inference_modesamplepermutedecodesqueezeclamp)rA   r@   r?   r>   r'   modelvocoderr   rJ   rK   rH   rI   r9   rG   r*   pred_features_wavr!   r!   r"   generate@   s*   
r[   cudac                 C   sL  t  }| d u rtd} |  d}|  d}|  d}tdd|d}t|d}|j|jd	}t|d
}	t|	}W d    n1 sBw   Y  t	di |d |}
t
||
dd t|d|_|
|j }
t }t|  d|}t|jjd d t|jjd d |tj|  d|jd |d d |_|
||||fS )NYatharthS/LuxTTS/tokens.txtz	/model.pt/config.jsonautomatic-speech-recognitionzopenai/whisper-baserV   r*   
token_file
vocab_sizepad_idrrV   T)filenamerV   strictr   /vocoder/config.yamlweightr)   /vocoder/vocos.binmap_locationfeaturer(   r!   )r   r   r   r   re   rf   openjsonr,   r   r	   r.   r*   r2   evalr   r   from_hparamsr   remove_parametrizations	upsamplerupsample_layersload_state_dictr(   )
model_pathr*   paramsrc   
model_ckptmodel_configr7   r   tokenizer_configfrV   r8   vocosr!   r!   r"   load_models_gpu]   s6   



r   rL   c                 C   s(  t  }d|_td} |  d}|  d}|  d}|  d}tddd	d
}t|d}|j|jd}	t|d}
t	|
}W d    n1 sFw   Y  t
|||d}t|  d }t|jjd d t|jjd d |tj	|  dtd	d t }|d d |_d|_|||||fS )N*   r]   r^   z/text_encoder.onnxz/fm_decoder.onnxr_   r`   zopenai/whisper-tinycpura   rb   rd   rg   )
num_threadrj   r   rk   r)   rl   rm   ro   r(   T)r   seedr   r   r   re   rf   rp   rq   r,   r   r   rs   rr   r   rt   ru   rv   rw   r.   r*   r   r(   	onnx_int8)rx   r   ry   rc   text_encoder_pathfm_decoder_pathr{   r7   r   r|   r}   rV   r~   r8   r!   r!   r"   load_models_cpu   s,   




 r   )r#   r$   r#   )r$   rC   rD   rE   r#   )Nr\   )NrL   )4argparsedatetimedtrq   loggingospathlibr   typingr   numpynpsafetensors.torchsafetensorsr.   r+   
torchaudiotransformersr   huggingface_hubr   lhotse.utilsr    zipvoice.models.zipvoice_distillr   zipvoice.tokenizer.tokenizerr   zipvoice.utils.checkpointr	   zipvoice.utils.commonr
   r   zipvoice.utils.featurer   zipvoice.utils.inferr   dataclassesr   r   r   linacodec.vocoder.vocosr   zipvoice.onnx_modelingr   torch.nn.utilsr   r   rP   rB   r[   r   r   r!   r!   r!   r"   <module>   sB    

"