o
    id                  7   @   s  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
mZmZmZmZ ddlZddlZddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4 G dd dZ5de6de7de6de7de7de7dee7e6f deee6e6e6f  dee6 dee6 d ee6 d!ee6 d"e8d#e8d$e8d%e9d&e9d'e7d(e7d)e8d*e8d+e8d,e9d-e9d.ee6 d/ee6 d0ee6 f6d1d2Z:d3d4 Z;d8d5d6Z<e=d7kre<  dS dS )9z5Script to run the inference of text-to-speeech model.    N)Path)AnyDictOptionalSequenceTupleUnion)parse)check_argument_types)NpyScpWriter)VITS)TTSTask)	to_device)set_all_random_seed)
FastSpeech)FastSpeech2)	Tacotron2)Transformer)DurationCalculator)config_argparse)str2boolstr2triple_strstr_or_none)get_commandline_argsc                '   @   s  e Zd ZdZ												
								d=deeef deeef dededededede	de	dedededeeef deeef dedede	d ed!ef&d"d#Z
e 						d>d$eeejejf d%eejejf d&eejejf d'eejejf d(eejejf d)eejejf d*eeeef  d+eeejf fd,d-Zed+ee	 fd.d/Zed+efd0d1Zed+efd2d3Zed+efd4d5Zed+efd6d7Ze		d?d8ee d9ee d:ee fd;d<ZdS )@Text2Speecha*  Text2Speech class.

    Examples:
        >>> from espnet2.bin.tts_inference import Text2Speech
        >>> # Case 1: Load the local model and use Griffin-Lim vocoder
        >>> text2speech = Text2Speech(
        >>>     train_config="/path/to/config.yml",
        >>>     model_file="/path/to/model.pth",
        >>> )
        >>> # Case 2: Load the local model and the pretrained vocoder
        >>> text2speech = Text2Speech.from_pretrained(
        >>>     train_config="/path/to/config.yml",
        >>>     model_file="/path/to/model.pth",
        >>>     vocoder_tag="kan-bayashi/ljspeech_tacotron2",
        >>> )
        >>> # Case 3: Load the pretrained model and use Griffin-Lim vocoder
        >>> text2speech = Text2Speech.from_pretrained(
        >>>     model_tag="kan-bayashi/ljspeech_tacotron2",
        >>> )
        >>> # Case 4: Load the pretrained model and the pretrained vocoder
        >>> text2speech = Text2Speech.from_pretrained(
        >>>     model_tag="kan-bayashi/ljspeech_tacotron2",
        >>>     vocoder_tag="parallel_wavegan/ljspeech_parallel_wavegan.v1",
        >>> )
        >>> # Run inference and save as wav file
        >>> import soundfile as sf
        >>> wav = text2speech("Hello, World")["wav"]
        >>> sf.write("out.wav", wav.numpy(), text2speech.fs, "PCM_16")

    N      ?              $@F            ?MbX?皙?float32cpu	  train_config
model_file	thresholdminlenratiomaxlenratiouse_teacher_forcinguse_att_constraintbackward_windowforward_windowspeed_control_alphanoise_scalenoise_scale_durvocoder_configvocoder_filedtypedeviceseedalways_fix_seedprefer_normalized_featsc                 C   s  t  sJ t|||\}}|jtt|d  || _|| _|| _	|| _
|j| _|j| _|j| _t | _t|d| _|| _|| _|| _d| _|| _| jjrlt||||}t|tjjri|jtt|d  || _td| j  td| j  td| j  | jdurtd| j  i }|j|d t| jtt fr|j|||d	 t| jtr|j||	|d
 t| jt!t"t#fr|j|
d t| jt#r|j||d || _$dS )zInitialize Text2Speech module.)r4   FNzExtractor:
zNormalizer:
zTTS:
z	Vocoder:
)r+   )r(   r*   r)   )r,   r.   r-   )alpha)r0   r1   )%r
   r   build_model_from_filetogetattrtorchevalr5   r4   
train_argsmodeltts	normalizefeats_extractr   duration_calculatorbuild_preprocess_fnpreprocess_fnr+   r6   r7   vocoderr8   require_vocoderbuild_vocoder_from_file
isinstancennModulelogginginfoupdater   r   r   r   r   decode_conf)selfr&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r@   r?   rG   rP    rR   M/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/tts_inference.py__init__B   sj   


zText2Speech.__init__textspeech	durationsspembssidslidsrP   returnc                 C   s  t  sJ | jr|du rtd| jr|du rtd| jr&|du r&td| jr1|du r1tdt|trA| dt	|dd }t	|d}|durP|j
|d	 |durZ|j
|d
 |durd|j
|d |durn|j
|d |durx|j
|d t|| j}| j}	|dur| j }	|	
| | jrt| j | jjdi ||	}
|
ddur| |
d \}}|
j
||d | jdur| js|
ddu r|
d }n|
d }| |}|
j
|d |
S )zRun text-to-speech.Nz#Missing required argument: 'speech'z!Missing required argument: 'sids'z!Missing required argument: 'lids'z#Missing required argument: 'spembs'z<dummy>)rU   rU   )rV   )rW   )rX   )rY   )rZ   att_w)duration
focus_ratefeat_gen_denormfeat_gen)wavrR   )r
   
use_speechRuntimeErroruse_sidsuse_lids
use_spembsrJ   strrF   dictrO   r   r5   rP   copyr7   r   r6   r@   	inferencegetrD   rG   r8   )rQ   rU   rV   rW   rX   rY   rZ   rP   batchcfgoutput_dictr]   r^   
input_featra   rR   rR   rS   __call__   sT   








zText2Speech.__call__c                 C   s,   t | jdr
| jjS t | jdr| jjS dS )zReturn sampling rate.fsN)hasattrrG   rq   rA   rQ   rR   rR   rS   rq      s
   zText2Speech.fsc                 C   s   | j p	t| jddS )z0Return speech is needed or not in the inference.use_gstF)r+   r<   rA   rs   rR   rR   rS   rb      s   zText2Speech.use_speechc                 C      | j jduS z-Return sid is needed or not in the inference.N)rA   spksrs   rR   rR   rS   rd         zText2Speech.use_sidsc                 C   ru   rv   )rA   langsrs   rR   rR   rS   re      rx   zText2Speech.use_lidsc                 C   ru   )z/Return spemb is needed or not in the inference.N)rA   spk_embed_dimrs   rR   rR   rS   rf      rx   zText2Speech.use_spembs	model_tagvocoder_tagkwargsc           	      K   s   | dur(zddl m} W n ty   td  w | }|jdi ||  |dury|drrzddlm	} W n tyF   td  w ddl
m} t|td	ksYJ d
|dd}||}t|jd }|j||d nt| dtdi |S )a  Build Text2Speech instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.
            vocoder_tag (Optional[str]): Vocoder tag of the pretrained vocoders.
                Currently, the tags of parallel_wavegan are supported, which should
                start with the prefix "parallel_wavegan/".

        Returns:
            Text2Speech: Text2Speech instance.

        Nr   )ModelDownloaderzZ`espnet_model_zoo` is not installed. Please install via `pip install -U espnet_model_zoo`.zparallel_wavegan/)download_pretrained_modelzZ`parallel_wavegan` is not installed. Please install via `pip install -U parallel_wavegan`.)__version__z0.5.1zQPlease install the latest parallel_wavegan via `pip install -U parallel_wavegan`. z
config.yml)r2   r3   z is unsupported format.rR   )espnet_model_zoo.downloaderr~   ImportErrorrM   errorrO   download_and_unpack
startswithparallel_wavegan.utilsr   parallel_waveganr   Vreplacer   parent
ValueErrorr   )	r{   r|   r}   r~   dr   r   r3   r2   rR   rR   rS   from_pretrained   s>   
zText2Speech.from_pretrained)NNr   r   r   FFr   r   r    r!   r"   NNr#   r$   r%   FF)NNNNNN)NN)__name__
__module____qualname____doc__r   r   rg   floatboolintrT   r=   no_gradTensornpndarrayr   r   r   rp   propertyrq   rb   rd   re   rf   staticmethodr   rR   rR   rR   rS   r   "   s    !

	



Q	E	r   
output_dir
batch_sizer4   ngpur6   num_workers	log_leveldata_path_and_name_and_typekey_filer&   r'   r{   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r7   allow_variable_data_keysr2   r3   r|   c           :      C   s  t  sJ |dkrtd|dkrtdtj|dd |dkr#d}nd}t| td^i d|	d	|
d
|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|}tjd^||d|}|js{t	t
dd |}tj|||||t|jdt|jd|dd	}t| } | d  jddd! | d" jddd! | d# jddd! | d$ jddd! | d% jddd! | d& jddd! | d' jddd! | d( jddd! d)d*l}|d+ d)d*lm}  d)d,lm}! t| d  | d- }"t| d" | d. }#t| d/ d0}$t| d1 d0}%t| d2 d0t}&t|dD ]e\}'\}(})t|)ts=J t|)td3d4 |(D sKJ |(ttt |)! }*|*dks^J |*d5d6 |)" D })t#$ }+|d^i |)},|(d) }-tt |)! %d)d }.|,&d7d*ur|,d7 }/t'd8(t)|/%d)t#$ |+   t'|- d9|. d:|/%d) d; |/%d)|.| krt*d<|- d= |,d7 + , |"|-< |$-|- d>d?.t/t0|,d7 j1 d@  |,&dAd*ur|,dA + , |#|-< n(|,d$ }0t'dB(t)|0%d)t#$ |+   t'|- d9|. d:|0%d) d; |,&dCd*urB|%-|- d>d>.t/t0|,dC 2 + ,  d@  |,&dDd*urZ|&-|- d>t3|,dD dEd@ |,&dFd*ur"|,dF + , }1|1j4dGkrw|1d* d* }1n|1j4dHkrt5dI|1j4 | 6|1j1d) |1j1d  \}2}3| j7|2dJ t8|1j1d) dK |3dJ t8|1j1d dK fdL}4|49|-  |4:|1j1d) |1j1d }5t|1dkr|5gg}5t;|5|1D ]7\}6}1t;|6|1D ],\}7}8|7j<|8=t>j?dMdN |7@dO |7AdP |7jBC|!ddQ |7jDC|!ddQ qېq|4EdRg dSi |4F| dT|- dU  |4G  |,&dVd*urr|,dV + , }9| 7 }4|4Hddd}6|6I|9 |6J|-  |6@dP |6AdW |6Kd)d |6jLdXdY |4Ed |4F| dZ|- dU  |4G  |,&d$d*urtM-|  d[|- d\|,d$ + , |jNd] q*W d*   n	1 sw   Y  W d*   n	1 sw   Y  W d*   n	1 sw   Y  W d*   n	1 sw   Y  W d*   n	1 sw   Y  |,&d7d*u rtOP| d   |,&dAd*u rtOP| d"  |,&dFd*u rtOP| d%  |,&dCd*u rtOP| d'  |,&dDd*u r,tOP| d(  |,&dVd*u r;tOP| d&  |,&d$d*u rLtOP| d$  d*S d*S )_zRun text-to-speech inference.r   z!batch decoding is not implementedz%only single GPU decoding is supportedz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformatcudar$   r&   r'   r(   r*   r)   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   )r{   r|   c                 S   s   | d dkS )Nr   rV   rR   xrR   rR   rS   <lambda>  s    zinference.<locals>.<lambda>FT)r4   r   r   r   rF   
collate_fnr   rj   norm)parentsexist_okdenormspeech_shapera   att_wsprobsrW   focus_ratesr   NAgg)MaxNLocatorznorm/feats.scpzdenorm/feats.scpzspeech_shape/speech_shapewzdurations/durationszfocus_rates/focus_ratesc                 s   s    | ]}t |tV  qd S N)rJ   rg   ).0srR   rR   rS   	<genexpr>  s    zinference.<locals>.<genexpr>c                 S   s$   i | ]\}}| d s||d qS )_lengthsr   )endswith)r   kvrR   rR   rS   
<dictcomp>  s   $ zinference.<locals>.<dictcomp>r`   z&inference speed = {:.1f} frames / sec.z (size:z->)z&output length reaches maximum length (z). ,
r_   z&inference speed = {:.1f} points / sec.r]   r^   z.5fr\         zMust be 2 or 4 dimension: g?g      @)figsizeauto)aspectInputOutput)integerrect)r   gQ?r   gffffff?zatt_ws/z.pngprobzStop probabilityboth)whichzprobs/z/wav/z.wavPCM_16rR   )Qr
   NotImplementedErrorrM   basicConfigr   rh   r   r   rb   listfilterr   build_streaming_iteratorrE   r?   build_collate_fnr   mkdir
matplotlibusematplotlib.pyplotpyplotmatplotlib.tickerr   r   open	enumeraterJ   typealllennextitervaluesitemstimeperf_countersizerk   rN   r   r   warningr$   numpywritejoinmaprg   shapelongr   ndimrc   	figaspectFigureminsuptitlesubplotszipimshowastyper   r#   
set_xlabel
set_ylabelxaxisset_major_locatoryaxisset_tight_layoutsavefigclfadd_subplotplot	set_titleset_ylimgridsfrq   shutilrmtree):r   r   r4   r   r6   r   r   r   r   r&   r'   r{   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r7   r   r2   r3   r|   r5   text2speech_kwargstext2speechloaderr   pltr   norm_writerdenorm_writershape_writerduration_writerfocus_rate_writeridxkeysrl   _bs
start_timern   keyinsizer`   ra   r\   r   hfigaxesaxax_att_w_r   rR   rR   rS   rj   5  s  
	

	"""





    rj   c                  C   s  t jdtjd} | jddd dddd	 | jd
tddd | jdtddd | jdtddd | jddg ddd | jdtddd | jdtddd | d}|jdtdd d! |jd"t	d# |jd$t
d%d& | d'}|jd(td)d* |jd+td,d* |jd-td.d* | d/}|jd0td1d2d |jd3td4d5d |jd6td7d8d |jd9t
d%d:d |jd;tdd<d |jd=td>d?d |jd@t
d%dAd | jdBtdCdDd | jdEtdFdGd | jdHtdIdJd |jdKt
d%dLd | dM}|jdNt	dOd* |jdPt	dQd* |jdRtdSd* | S )TzGet argument parser.zTTS inference)descriptionformatter_classz--log_levelc                 S   s   |   S r   )upperr   rR   rR   rS   r   =  s    zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGr   DEBUGNOTSETzThe verbose level of logging)r   defaultchoiceshelpz--output_dirTzThe path of output directory)r   requiredr(  z--ngpur   z(The number of gpus. 0 indicates CPU mode)r   r&  r(  z--seedzRandom seedz--dtyper#   )float16r#   float64z	Data type)r&  r'  r(  z--num_workersr   z)The number of workers used for DataLoaderz--batch_sizezThe batch size for inferencezInput data relatedz--data_path_and_name_and_typeappend)r   r)  actionz
--key_file)r   z--allow_variable_data_keysF)r   r&  zThe model configuration relatedz--train_configzTraining configuration file)r   r(  z--model_filezModel parameter filez--model_tagz]Pretrained model tag. If specify this option, train_config and model_file will be overwrittenzDecoding relatedz--maxlenratior   z Maximum length ratio in decodingz--minlenratior   z Minimum length ratio in decodingz--thresholdr   zThreshold value in decodingz--use_att_constraintz#Whether to use attention constraintz--backward_windowz-Backward window value in attention constraintz--forward_windowr   z,Forward window value in attention constraintz--use_teacher_forcingzWhether to use teacher forcingz--speed_control_alphar    z;Alpha in FastSpeech to change the speed of generated speechz--noise_scaler!   z*Noise scale parameter for the flow in vitsz--noise_scale_durr"   zCNoise scale parameter for the stochastic duration predictor in vitsz--always_fix_seedzWhether to always fix seedzVocoder relatedz--vocoder_configzVocoder configuration filez--vocoder_filezVocoder parameter filez--vocoder_tagzcPretrained vocoder tag. If specify this option, vocoder_config and vocoder_file will be overwritten)r   ArgumentParserargparseArgumentDefaultsHelpFormatteradd_argumentrg   r   add_argument_groupr   r   r   r   )parsergrouprR   rR   rS   
get_parser2  sF  



r5  c                 C   sF   t t tjd t }|| }t|}|dd tdi | dS )zRun TTS model inference.)fileconfigNrR   )	printr   sysstderrr5  
parse_argsvarspoprj   )cmdr3  argsr}   rR   rR   rS   main  s   
r@  __main__r   )>r   r/  rM   r  r9  r   pathlibr   typingr   r   r   r   r   r   r   r   	soundfiler  r=   packaging.versionr	   r   	typeguardr
   espnet2.fileio.npy_scpr   espnet2.gan_tts.vitsr   espnet2.tasks.ttsr    espnet2.torch_utils.device_funcsr   'espnet2.torch_utils.set_all_random_seedr   espnet2.tts.fastspeechr   espnet2.tts.fastspeech2r   espnet2.tts.tacotron2r   espnet2.tts.transformerr   espnet2.tts.utilsr   espnet2.utilsr   espnet2.utils.typesr   r   r   espnet.utils.cli_utilsr   r   rg   r   r   r   rj   r5  r@  r   rR   rR   rR   rS   <module>   s      
	

 ~ 
2


