o
    i\                     @   s  d Z ddlmZ ddlmZ ddlmZmZmZ ddl	Z	ddl
mZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZ ddlm Z  deiZ!eeeeedZ"ee	j#edkrvddl$m%Z% nedddZ%G dd deZ&dS )zVITS module for GAN-TTS task.    )contextmanager)LooseVersion)AnyDictOptionalN)check_argument_types)	AbsGANTTS)HiFiGANMultiPeriodDiscriminatorHiFiGANMultiScaleDiscriminator)HiFiGANMultiScaleMultiPeriodDiscriminatorHiFiGANPeriodDiscriminatorHiFiGANScaleDiscriminator)DiscriminatorAdversarialLossFeatureMatchLossGeneratorAdversarialLossMelSpectrogramLoss)get_segments)VITSGenerator)KLDivergenceLoss)force_gatherablevits_generator)hifigan_period_discriminatorhifigan_scale_discriminator"hifigan_multi_period_discriminator!hifigan_multi_scale_discriminator.hifigan_multi_scale_multi_period_discriminatorz1.6.0)autocastTc                 c   s    d V  d S )N enabledr   r   M/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/gan_tts/vits/vits.pyr   /   s   
r   c                #       s
  e Zd ZdZddi ddddddd	dd
dddddddddddddddddddddd d!d"d#i d$d#d%d&d'dd(dd)d&d*d+d,g d-d.g d/d0g d1d2g d3g d3g d3gd4dd5d6d7d8d9dd:dd;d#d<ddd6ddd#ddd=d>dd=d?d@ddAddddBddg dCdDdEd8dg dFdGdHd!iddIdJdIg dKddd6d=gdg dLdEddGdHd!iddIdMdNdIdOdPdIdOdPdIdIddQddEdRddSdTdUdddV	dWdXdYdWdWdfdZed[ed\ed]ed^eeef d_ed`eeef daeeef dbeeef dceeef ddeeef deedfedgedhediedje	f" fdkdlZ
edmdn Zedodp Z				ddqejdrejdsejdtejduejdvejdweej dxeej dyeej dze	d{eeef fd|d}Z			ddqejdrejdsejdtejduejdvejdweej dxeej dyeej d{eeef fd~dZ			ddqejdrejdsejdtejduejdvejdweej dxeej dyeej d{eeef fddZ								W		Iddqejdseej dweej dxeej dyeej deej dedededee de	d{eeejf fddZ  ZS )VITSaN  VITS module (generator + discriminator).

    This is a module of VITS described in `Conditional Variational Autoencoder
    with Adversarial Learning for End-to-End Text-to-Speech`_.

    .. _`Conditional Variational Autoencoder with Adversarial Learning for End-to-End
        Text-to-Speech`: https://arxiv.org/abs/2006.04558

    i"V  r   hidden_channels   spksNlangsspk_embed_dimglobal_channelssegment_size    text_encoder_attention_heads   text_encoder_ffn_expand   text_encoder_blocks   $text_encoder_positionwise_layer_typeconv1d*text_encoder_positionwise_conv_kernel_size   +text_encoder_positional_encoding_layer_typerel_pos&text_encoder_self_attention_layer_typerel_selfattntext_encoder_activation_typeswishtext_encoder_normalize_beforeTtext_encoder_dropout_rateg?$text_encoder_positional_dropout_rateg        #text_encoder_attention_dropout_rate"text_encoder_conformer_kernel_size   !use_macaron_style_in_text_encoder"use_conformer_conv_in_text_encoderdecoder_kernel_sizedecoder_channelsi   decoder_upsample_scales)   rF   r,   r,   decoder_upsample_kernel_sizes)   rH   r.   r.   decoder_resblock_kernel_sizes)   r@      decoder_resblock_dilations)r4   rJ      use_weight_norm_in_decoderposterior_encoder_kernel_sizerM   posterior_encoder_layersrH   posterior_encoder_stacksposterior_encoder_base_dilationposterior_encoder_dropout_rate$use_weight_norm_in_posterior_encoderrJ   g      ?)
flow_flowsflow_kernel_sizeflow_base_dilationflow_layersflow_dropout_rateuse_weight_norm_in_flowuse_only_mean_in_flow)stochastic_duration_predictor_kernel_size*stochastic_duration_predictor_dropout_rate#stochastic_duration_predictor_flows-stochastic_duration_predictor_dds_conv_layersr   	AvgPool1d)kernel_sizestridepadding)   )   rM   rJ      i   )r,   r,   r.   r.   r4   	LeakyReLUnegative_slopeF)in_channelsout_channelskernel_sizeschannelsmax_downsample_channels
max_groupsbiasdownsample_scalesnonlinear_activationnonlinear_activation_paramsuse_weight_normuse_spectral_norm)r,   rJ   rM   r@   rK   )rJ   rJ   rJ   rJ   r4   )ri   rj   rk   rl   rp   rm   ro   rq   rr   rs   rt   )scalesscale_downsample_poolingscale_downsample_pooling_paramsscale_discriminator_paramsfollow_official_normperiodsperiod_discriminator_paramsmse)average_by_discriminators	loss_type)r}   average_by_layersinclude_final_outputs   hannP   r   )	fsn_fft
hop_length
win_lengthwindown_melsfminfmaxlog_base      ?g     F@g       @idimodimsampling_rategenerator_typegenerator_paramsdiscriminator_typediscriminator_paramsgenerator_adv_loss_paramsdiscriminator_adv_loss_paramsfeat_match_loss_paramsmel_loss_params
lambda_adv
lambda_mellambda_feat_match
lambda_dur	lambda_klcache_generator_outputsc                    s   t  sJ t   t| }|dkr|j||d |di || _t| }|di || _tdi || _	t
di |	| _tdi |
| _tdi || _t | _|| _|| _|| _|| _|| _|| _d| _|| _| jj| _| jj| _| jj| _dS )a  Initialize VITS module.

        Args:
            idim (int): Input vocabrary size.
            odim (int): Acoustic feature dimension. The actual output channels will
                be 1 since VITS is the end-to-end text-to-wave model but for the
                compatibility odim is used to indicate the acoustic feature dimension.
            sampling_rate (int): Sampling rate, not used for the training but it will
                be referred in saving waveform during the inference.
            generator_type (str): Generator type.
            generator_params (Dict[str, Any]): Parameter dict for generator.
            discriminator_type (str): Discriminator type.
            discriminator_params (Dict[str, Any]): Parameter dict for discriminator.
            generator_adv_loss_params (Dict[str, Any]): Parameter dict for generator
                adversarial loss.
            discriminator_adv_loss_params (Dict[str, Any]): Parameter dict for
                discriminator adversarial loss.
            feat_match_loss_params (Dict[str, Any]): Parameter dict for feat match loss.
            mel_loss_params (Dict[str, Any]): Parameter dict for mel loss.
            lambda_adv (float): Loss scaling coefficient for adversarial loss.
            lambda_mel (float): Loss scaling coefficient for mel spectrogram loss.
            lambda_feat_match (float): Loss scaling coefficient for feat match loss.
            lambda_dur (float): Loss scaling coefficient for duration loss.
            lambda_kl (float): Loss scaling coefficient for KL divergence loss.
            cache_generator_outputs (bool): Whether to cache generator outputs.

        r   )vocabsaux_channelsNr   )r   super__init__AVAILABLE_GENERATERSupdate	generatorAVAILABLE_DISCRIMINATORSdiscriminatorr   generator_adv_lossr   discriminator_adv_lossr   feat_match_lossr   mel_lossr   kl_lossr   r   r   r   r   r   _cacher   r$   r%   r&   )selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   generator_classdiscriminator_class	__class__r   r    r   ?   sJ    



zVITS.__init__c                 C      dS )z)Return whether or not speech is required.Tr   r   r   r   r    require_raw_speech     zVITS.require_raw_speechc                 C   r   )z*Return whether or not vocoder is required.Fr   r   r   r   r    require_vocoder  r   zVITS.require_vocodertexttext_lengthsfeatsfeats_lengthsspeechspeech_lengthssidsspembslidsforward_generatorreturnc                 C   s<   |
r| j |||||||||	d	S | j|||||||||	d	S )a  Perform generator forward.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
            feats_lengths (Tensor): Feature length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).
            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
            forward_generator (bool): Whether to forward generator.

        Returns:
            Dict[str, Any]:
                - loss (Tensor): Loss scalar tensor.
                - stats (Dict[str, float]): Statistics to be monitored.
                - weight (Tensor): Weight tensor to summarize losses.
                - optim_idx (int): Optimizer index (0 for G and 1 for D).

        )	r   r   r   r   r   r   r   r   r   )_forward_generator_forward_discrminator)r   r   r   r   r   r   r   r   r   r   r   r   r   r    forward  s.   #zVITS.forwardc
           "   	   C   s  | d}
|dd}|d}d}| jr| jdu r)d}| j|||||||	d}n| j}| jr7| jr7|s7|| _|\}}}}}}}|\}}}}}}t||| jj | jj	| jj d}| 
|}t  | 
|}W d   n1 srw   Y  tdd	L | ||}| |||||}t| }| |}| ||}|| j }|| j }|| j }|| j }|| j }|| | | | }W d   n1 sw   Y  t| | | | | | d
} t|| |
f|j\}} }!|s| jsd| _|| |!ddS )a  Perform generator forward.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
            feats_lengths (Tensor): Feature length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).
            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).

        Returns:
            Dict[str, Any]:
                * loss (Tensor): Loss scalar tensor.
                * stats (Dict[str, float]): Statistics to be monitored.
                * weight (Tensor): Weight tensor to summarize losses.
                * optim_idx (int): Optimizer index (0 for G and 1 for D).

        r   r4   r,   TNFr   r   r   r   r   r   r   x
start_idxsr)   r   )generator_lossgenerator_mel_lossgenerator_kl_lossgenerator_dur_lossr   generator_feat_match_losslossstatsweight	optim_idx)size	transpose	unsqueezer   r   r   trainingr   upsample_factorr)   r   torchno_gradr   r   r   sumfloatr   r   r   r   r   r   r   dictitemr   device)"r   r   r   r   r   r   r   r   r   r   
batch_sizereuse_cacheoutsspeech_hat_dur_nll_r   z_maskouts_z_pm_plogs_plogs_qspeech_p_hatpr   r   dur_lossadv_lossr   r   r   r   r   r   r    r   R  st   
"










	
zVITS._forward_generatorc
              	   C   sB  | d}
|dd}|d}d}| jr| jdu r)d}| j|||||||	d}n| j}| jr4|s4|| _|^}}}}}t||| jj | jj| jj d}| 	|
 }| 	|}tdd	 | ||\}}|| }W d   n1 stw   Y  t| | | d
}t|||
f|j\}}}|s| jsd| _|||ddS )a  Perform discriminator forward.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
            feats_lengths (Tensor): Feature length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).
            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).

        Returns:
            Dict[str, Any]:
                * loss (Tensor): Loss scalar tensor.
                * stats (Dict[str, float]): Statistics to be monitored.
                * weight (Tensor): Weight tensor to summarize losses.
                * optim_idx (int): Optimizer index (0 for G and 1 for D).

        r   r4   r,   TNFr   r   r   )discriminator_lossdiscriminator_real_lossdiscriminator_fake_lossr   )r   r   r   r   r   r   r   r   r)   r   detachr   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	real_loss	fake_lossr   r   r   r   r   r    r     sV   
"






zVITS._forward_discrminatorMbX?皙?	durationsnoise_scalenoise_scale_duralphamax_lenuse_teacher_forcingc                 C   s  |d }t j|dgt j|jd}|dur|d}|dur$|d}|dur/|ddd}|r`|dus7J |d dd}t j|dgt j|jd}| jj||||||||
|d	\}}}n| jj|||||||||	|
d
\}}}t	|d|d |d d	S )
a$  Run inference.

        Args:
            text (Tensor): Input text index tensor (T_text,).
            feats (Tensor): Feature tensor (T_feats, aux_channels).
            sids (Tensor): Speaker index tensor (1,).
            spembs (Optional[Tensor]): Speaker embedding tensor (spk_embed_dim,).
            lids (Tensor): Language index tensor (1,).
            durations (Tensor): Ground-truth duration tensor (T_text,).
            noise_scale (float): Noise scale value for flow.
            noise_scale_dur (float): Noise scale value for duration predictor.
            alpha (float): Alpha parameter to control the speed of generated speech.
            max_len (Optional[int]): Maximum length.
            use_teacher_forcing (bool): Whether to use teacher forcing.

        Returns:
            Dict[str, Tensor]:
                * wav (Tensor): Generated waveform tensor (T_wav,).
                * att_w (Tensor): Monotonic attention weight tensor (T_feats, T_text).
                * duration (Tensor): Predicted duration tensor (T_text,).

        Nr4   )dtyper   r(   r,   )	r   r   r   r   r   r   r   r   r   )
r   r   r   r   r   durr   r   r   r   r   )wavatt_wduration)
r   tensorr   longr   viewr   r   	inferencer   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r   r   r    r    sX   %



zVITS.inference)NNNT)NNN)
NNNNNr   r   r   NF)__name__
__module____qualname____doc__intstrr   r   r   boolr   propertyr   r   r   Tensorr   r   r   r   r  __classcell__r   r   r   r    r!   4   sP   	
 !"#0(
7
8
_
c
g
lwxyz{| N

	


D	


v	


`	
r!   )T)'r  
contextlibr   distutils.versionr   typingr   r   r   r   	typeguardr   espnet2.gan_tts.abs_gan_ttsr   espnet2.gan_tts.hifiganr	   r
   r   r   r   espnet2.gan_tts.hifigan.lossr   r   r   r   espnet2.gan_tts.utilsr   espnet2.gan_tts.vits.generatorr   espnet2.gan_tts.vits.lossr    espnet2.torch_utils.device_funcsr   r   r   __version__torch.cuda.ampr   r!   r   r   r   r    <module>   s4   