o
    :i                     @   s^  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
 d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZmZ d d
l m!Z!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ G dd dZ,				d$de,de	e	e-  de	e	e-  dede.de.de.de-dejfddZ/d%d"d#Z0dS )&    N)Path)ListTuple)hf_hub_download)fix_random_seed)Tensornn)get_vocoder)get_time_steps)EmiliaTokenizerEspeakTokenizerLibriTTSTokenizerSimpleTokenizer)AttributeDictstr2bool)
VocosFbank)add_punctuationchunk_tokens_punctuationcross_fade_concatload_prompt_wavremove_silencerms_normc                   @   s   e Zd Z	ddededefddZdefdd	Zdefd
dZdedededede	eef f
ddZ
dedededejdedefddZdS )	OnnxModel   text_encoder_pathfm_decoder_path
num_threadc                 C   s2   t  }||_||_|| _| | | | d S )N)ortSessionOptionsinter_op_num_threadsintra_op_num_threadssession_optsinit_text_encoderinit_fm_decoder)selfr   r   r   r!    r%   -/home/ubuntu/LuxTTS/zipvoice/onnx_modeling.py__init__%   s   
zOnnxModel.__init__
model_pathc                 C   s   t j|| jdgd| _d S )NCPUExecutionProvidersess_options	providers)r   InferenceSessionr!   text_encoder)r$   r(   r%   r%   r&   r"   4   s
   zOnnxModel.init_text_encoderc                 C   s4   t j|| jdgd| _| j j}t|d | _d S )Nr)   r*   feat_dim)r   r-   r!   
fm_decoderget_modelmetacustom_metadata_mapintr/   )r$   r(   metar%   r%   r&   r#   ;   s   zOnnxModel.init_fm_decodertokensprompt_tokensprompt_features_lenspeedreturnc                 C   sz   | j | j  d jg| j  d j| | j  d j| | j  d j| | j  d j| i}t|d S )Nr   r         )r.   runget_outputsname
get_inputsnumpytorch
from_numpy)r$   r5   r6   r7   r8   outr%   r%   r&   run_text_encoderD   s   zOnnxModel.run_text_encodertxtext_conditionspeech_conditionguidance_scalec                 C   s   | j | j  d jg| j  d j| | j  d j| | j  d j| | j  d j| | j  d j| i}t|d S )Nr   r   r:   r;      )r0   r<   r=   r>   r?   r@   rA   rB   )r$   rE   rF   rG   rH   rI   rC   r%   r%   r&   run_fm_decoderX   s   zOnnxModel.run_fm_decoderN)r   )__name__
__module____qualname__strr3   r'   r"   r#   r   r   rD   rA   rK   r%   r%   r%   r&   r   $   sF    
	

r   ?      ?      ?   modelr5   r6   prompt_featuresr8   t_shiftrI   num_stepr9   c              	   C   sv  t |t |  krdksJ  J tj|tjd}tj|tjd}tj|dtjd}tj|tjd}| ||||}	|	j\}
}}| j}t	dd||d}t
|
||}tjj|ddd||jd  f}tj|tjd}t|D ]7}|| }||d  }| j|||	||d}|d| |  }|||  }||d k rd| | ||  }qr|}qr|d d | d d d f }|S )Nr   )dtypeg        rR   )t_startt_endrW   rV   r   )rE   rF   rG   rH   rI   )lenrA   tensorint64sizefloat32rD   shaper/   r
   randnr   
functionalpadrangerK   item)rT   r5   r6   rU   r8   rV   rI   rW   r7   rG   
batch_size
num_frames_r/   	timestepsrF   rH   stept_curt_nextvx_1_predx_0_predr%   r%   r&   samplen   sL   $rp   rJ         @?皙?c              
   C   sn   | |g}|
d }
t||| ||
||	|d}|dddd }||ddd}||k r5|||  }|S )NrP   )rT   r5   r6   rU   r8   rV   rI   rW   r   r:   r   rs   )texts_to_token_idsrp   permutedecodesqueezeclamp)r6   prompt_features_lensrU   
prompt_rmstextrT   vocoder	tokenizerrW   rI   r8   rV   
target_rmsr5   pred_featureswavr%   r%   r&   generate_cpu   s"   r   )rP   rQ   rR   rS   )rJ   rq   rR   rr   rs   )1argparsedatetimedtjsonloggingospathlibr   typingr   r   r@   nponnxruntimer   rA   
torchaudiohuggingface_hubr   lhotse.utilsr   r   r   zipvoice.bin.infer_zipvoicer	   zipvoice.models.modules.solverr
   zipvoice.tokenizer.tokenizerr   r   r   r   zipvoice.utils.commonr   r   zipvoice.utils.featurer   zipvoice.utils.inferr   r   r   r   r   r   r   r3   floatrp   r   r%   r%   r%   r&   <module>   sZ     	O



H