o
    i6^                     @   s   d Z ddlmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZmZmZmZmZ ddlmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ deiZeeeeedZG dd de	Z dS )zJETS module for GAN-TTS task.    )AnyDictOptionalN)check_argument_types)	AbsGANTTS)HiFiGANMultiPeriodDiscriminatorHiFiGANMultiScaleDiscriminator)HiFiGANMultiScaleMultiPeriodDiscriminatorHiFiGANPeriodDiscriminatorHiFiGANScaleDiscriminator)DiscriminatorAdversarialLossFeatureMatchLossGeneratorAdversarialLossMelSpectrogramLoss)JETSGenerator)ForwardSumLossVarianceLoss)get_segments)force_gatherablejets_generator)hifigan_period_discriminatorhifigan_scale_discriminator"hifigan_multi_period_discriminator!hifigan_multi_scale_discriminator.hifigan_multi_scale_multi_period_discriminatorc                #       s  e Zd ZdZddi dddddd	d
ddd	ddddddddddddddddddddddddi ddd dd!dd"dd#dd$dd%d&d'd(d)d*d+d,d-dd.dd/dd0d1d2d3d4dd5d6i d7d8d9dd:dd;d6d<d8d=d>d?dd@d>dAddBdCdDd6dEdCdFd>dGddHd>dIddJdi dKdLdMdNdOd1dPg dQdRg dSdTg dUdVg dWg dWg dWgdXddYddZd[d\d]did^dd_d`dadNdbdNdcdddedfddgd	dhg did8dddjdkdldldddmdnddod	dddpddg dqdjddrdg dsd[d]didddtdg dudddCd8gdvg dwddd[d]didddxdyddzd{ddzd{dddd|dddddd}d~dddddd	dldddlddfdededededeeef dedeeef deeef deeef deeef deeef dededededede	f" fddZ
edd Zedd Z	d	d	d	ddejdejdejdejdejdejdeej deej deej de	deeef fddZ	d	d	dddejdejdejdejdejdejdeej deej deej deeef fddZ	d	d	dddejdejdejdejdejdejdeej deej deej deeef fddZ	d	d	d	ddejdeej deej deej de	deeejf fddZ  ZS )JETSa3  JETS module (generator + discriminator).

    This is a module of JETS described in `JETS: Jointly Training FastSpeech2
    and HiFi-GAN for End to End Text to Speech'_.

    .. _`JETS: Jointly Training FastSpeech2 and HiFi-GAN for End to End Text to Speech`
        : https://arxiv.org/abs/2203.16852

    i"V  r   adim   aheads   elayers   eunitsi   dlayersdunitspositionwise_layer_typeconv1dpositionwise_conv_kernel_size   use_scaled_pos_encTuse_batch_normencoder_normalize_beforedecoder_normalize_beforeencoder_concat_afterFdecoder_concat_afterreduction_factorencoder_typetransformerdecoder_typetransformer_enc_dropout_rateg?'transformer_enc_positional_dropout_rate!transformer_enc_attn_dropout_ratetransformer_dec_dropout_rate'transformer_dec_positional_dropout_rate!transformer_dec_attn_dropout_rateconformer_rel_pos_typelatestconformer_pos_enc_layer_typerel_posconformer_self_attn_layer_typerel_selfattnconformer_activation_typeswishuse_macaron_style_in_conformeruse_cnn_in_conformer	zero_triuconformer_enc_kernel_size   conformer_dec_kernel_size   duration_predictor_layersduration_predictor_chansi  duration_predictor_kernel_size   duration_predictor_dropout_rateenergy_predictor_layersenergy_predictor_chansenergy_predictor_kernel_sizeenergy_predictor_dropoutg      ?energy_embed_kernel_sizeenergy_embed_dropout#stop_gradient_from_energy_predictorpitch_predictor_layers   pitch_predictor_chanspitch_predictor_kernel_sizepitch_predictor_dropoutpitch_embed_kernel_sizepitch_embed_dropout"stop_gradient_from_pitch_predictorgenerator_out_channelsgenerator_channelsi   generator_global_channelsgenerator_kernel_sizegenerator_upsample_scales)   rb   r   r   generator_upsample_kernel_sizes)   rd   r!   r!   generator_resblock_kernel_sizes)rK   rE      generator_resblock_dilations)r(   rK   rU   generator_use_additional_convsgenerator_biasgenerator_nonlinear_activation	LeakyReLU%generator_nonlinear_activation_paramsnegative_slopegenerator_use_weight_normsegment_size@   spkslangsspk_embed_dimNspk_embed_integration_typeadd
      )    rx   rp   rp      ry   ry   xavier_uniformg      ?)use_gst
gst_tokens	gst_headsgst_conv_layersgst_conv_chans_listgst_conv_kernel_sizegst_conv_stridegst_gru_layersgst_gru_units	init_typeinit_enc_alphainit_dec_alphause_maskinguse_weighted_maskingr   	AvgPool1d)kernel_sizestridepadding)   )   rU   rK   rd   )r   r   r!   r!   r(   )in_channelsout_channelskernel_sizeschannelsmax_downsample_channels
max_groupsbiasdownsample_scalesnonlinear_activationnonlinear_activation_paramsuse_weight_normuse_spectral_norm)r   rK   rU   rE   rf   rx   )rK   rK   rK   rK   r(   )r   r   r   r   r   r   r   r   r   r   r   )scalesscale_downsample_poolingscale_downsample_pooling_paramsscale_discriminator_paramsfollow_official_normperiodsperiod_discriminator_paramsmse)average_by_discriminators	loss_type)r   average_by_layersinclude_final_outputshannP   r   )	fsn_fft
hop_length
win_lengthwindown_melsfminfmaxlog_baseg     F@g       @idimodimsampling_rategenerator_typegenerator_paramsdiscriminator_typediscriminator_paramsgenerator_adv_loss_paramsdiscriminator_adv_loss_paramsfeat_match_loss_paramsmel_loss_params
lambda_adv
lambda_mellambda_feat_match
lambda_varlambda_aligncache_generator_outputsc                    s   t  sJ t   t| }|j||d |di || _t| }|di || _tdi || _	t
di |	| _tdi |
| _tdi || _t | _t | _|| _|| _|| _|| _|| _|| _d| _|| _| jj| _| jj| _| jj| _dS )a  Initialize JETS module.

        Args:
            idim (int): Input vocabrary size.
            odim (int): Acoustic feature dimension. The actual output channels will
                be 1 since JETS is the end-to-end text-to-wave model but for the
                compatibility odim is used to indicate the acoustic feature dimension.
            sampling_rate (int): Sampling rate, not used for the training but it will
                be referred in saving waveform during the inference.
            generator_type (str): Generator type.
            generator_params (Dict[str, Any]): Parameter dict for generator.
            discriminator_type (str): Discriminator type.
            discriminator_params (Dict[str, Any]): Parameter dict for discriminator.
            generator_adv_loss_params (Dict[str, Any]): Parameter dict for generator
                adversarial loss.
            discriminator_adv_loss_params (Dict[str, Any]): Parameter dict for
                discriminator adversarial loss.
            feat_match_loss_params (Dict[str, Any]): Parameter dict for feat match loss.
            mel_loss_params (Dict[str, Any]): Parameter dict for mel loss.
            lambda_adv (float): Loss scaling coefficient for adversarial loss.
            lambda_mel (float): Loss scaling coefficient for mel spectrogram loss.
            lambda_feat_match (float): Loss scaling coefficient for feat match loss.
            lambda_var (float): Loss scaling coefficient for variance loss.
            lambda_align (float): Loss scaling coefficient for alignment loss.
            cache_generator_outputs (bool): Whether to cache generator outputs.

        )r   r   N )r   super__init__AVAILABLE_GENERATERSupdate	generatorAVAILABLE_DISCRIMINATORSdiscriminatorr   generator_adv_lossr   discriminator_adv_lossr   feat_match_lossr   mel_lossr   var_lossr   forwardsum_lossr   r   r   r   r   r   _cacher   rq   rr   rs   )selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   generator_classdiscriminator_class	__class__r   M/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/gan_tts/jets/jets.pyr   5   sJ    
?


zJETS.__init__c                 C      dS )z)Return whether or not speech is required.Tr   r   r   r   r   require_raw_speech#     zJETS.require_raw_speechc                 C   r   )z*Return whether or not vocoder is required.Fr   r   r   r   r   require_vocoder(  r   zJETS.require_vocodertexttext_lengthsfeatsfeats_lengthsspeechspeech_lengthssidsspembslidsforward_generatorreturnc                 K   sL   |
r| j d|||||||||	d	|S | jd|||||||||	d	|S )a  Perform generator forward.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
            feats_lengths (Tensor): Feature length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).
            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
            forward_generator (bool): Whether to forward generator.

        Returns:
            Dict[str, Any]:
                - loss (Tensor): Loss scalar tensor.
                - stats (Dict[str, float]): Statistics to be monitored.
                - weight (Tensor): Weight tensor to summarize losses.
                - optim_idx (int): Optimizer index (0 for G and 1 for D).

        )	r   r   r   r   r   r   r   r   r   Nr   )_forward_generator_forward_discrminator)r   r   r   r   r   r   r   r   r   r   r   kwargsr   r   r   forward-  s6   $

zJETS.forwardc
           (      K   s  | d}|d}d}| jr| jdu r'd}| jd
|||||||	d|
}n| j}| jr5| jr5|s5|| _|\
}}}}}}}}}}t||| jj | jj| jj d}| 	|}t
  | 	|}W d   n1 skw   Y  | ||}| |}| ||}| |||||||\}}} | |||}!|| j }|| j }|| j }|| | }"|| |  | j }#|!| | j }$|"|# |$ }%t|% |" |# |$ | | | | | |  |! | d}&t|%|&|f|%j\}%}&}'|s| jsd| _|%|&|'dd	S )a  Perform generator forward.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
            feats_lengths (Tensor): Feature length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).
            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).

        Returns:
            Dict[str, Any]:
                * loss (Tensor): Loss scalar tensor.
                * stats (Dict[str, float]): Statistics to be monitored.
                * weight (Tensor): Weight tensor to summarize losses.
                * optim_idx (int): Optimizer index (0 for G and 1 for D).

        r   r(   TNFr   r   r   r   r   r   r   x
start_idxsro   )generator_lossgenerator_g_lossgenerator_var_lossgenerator_align_lossgenerator_g_mel_lossgenerator_g_adv_lossgenerator_g_feat_match_lossgenerator_var_dur_lossgenerator_var_pitch_lossgenerator_var_energy_lossgenerator_align_forwardsum_lossgenerator_align_bin_losslossstatsweight	optim_idxr   )size	unsqueezer   r   r   trainingr   upsample_factorro   r   torchno_gradr   r   r   r   r   r   r   r   r   r   dictitemr   device)(r   r   r   r   r   r   r   r   r   r   r   
batch_sizereuse_cacheoutsspeech_hat_bin_loss
log_p_attnr   d_outsdsp_outspse_outsesspeech_p_hatpr   adv_lossr   dur_loss
pitch_lossenergy_lossr   g_lossr   
align_lossr   r   r   r   r   r   r   l  s   
#









zJETS._forward_generatorc
              
   K   s  | d}|d}d}| jr| jdu r'd}| jd
|||||||	d|
}n| j}| jr2|s2|| _|^}}}}}t||| jj | jj| jj d}| |	 }| |}| 
||\}}|| }t| | | d}t|||f|j\}}}|s| jsd| _|||dd	S )a  Perform discriminator forward.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
            feats_lengths (Tensor): Feature length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).
            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).

        Returns:
            Dict[str, Any]:
                * loss (Tensor): Loss scalar tensor.
                * stats (Dict[str, float]): Statistics to be monitored.
                * weight (Tensor): Weight tensor to summarize losses.
                * optim_idx (int): Optimizer index (0 for G and 1 for D).

        r   r(   TNFr   r   )discriminator_lossdiscriminator_real_lossdiscriminator_fake_lossr   r   )r  r  r   r   r   r   r  ro   r   detachr   r  r  r   r	  r  )r   r   r   r   r   r   r   r   r   r   r   r
  r  r  r  _r   r  r  r  	real_loss	fake_lossr   r   r   r   r   r   r     sT   
#




zJETS._forward_discrminatorpitchenergyuse_teacher_forcingc              
   K   s   |d }t j|dgt j|jd}d|v r|d d |d< |r`|dus&J |d }t j|dgt j|jd}|dus>J |d }|dusHJ |d }| jjd
|||||||d|\}	}
n| jjd
||d|\}	}
t|	d|
d d	S )a   Run inference.

        Args:
            text (Tensor): Input text index tensor (T_text,).
            feats (Tensor): Feature tensor (T_feats, aux_channels).
            pitch (Tensor): Pitch tensor (T_feats, 1).
            energy (Tensor): Energy tensor (T_feats, 1).
            use_teacher_forcing (bool): Whether to use teacher forcing.

        Returns:
            Dict[str, Tensor]:
                * wav (Tensor): Generated waveform tensor (T_wav,).
                * duration (Tensor): Predicted duration tensor (T_text,).

        Nr(   )dtyper	  r   )r   r   r   r   r&  r'  r(  )r   r   r_   r   )wavdurationr   )	r  tensorr  longr	  r   	inferencer  view)r   r   r   r&  r'  r(  r   r   r   r*  durr   r   r   r.  K  sL   


zJETS.inference)NNNT)NNN)NNNF)__name__
__module____qualname____doc__intstrr   r   floatboolr   propertyr   r   r  Tensorr   r   r   r   r.  __classcell__r   r   r   r   r   *   s   	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEU( 
\
] 
  
	  
  
           !  "  o

	


G	


 	


`r   )!r4  typingr   r   r   r  	typeguardr   espnet2.gan_tts.abs_gan_ttsr   espnet2.gan_tts.hifiganr   r   r	   r
   r   espnet2.gan_tts.hifigan.lossr   r   r   r   espnet2.gan_tts.jets.generatorr   espnet2.gan_tts.jets.lossr   r   espnet2.gan_tts.utilsr    espnet2.torch_utils.device_funcsr   r   r   r   r   r   r   r   <module>   s(   	