o
    i]                  	   @   s:  d Z ddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
mZmZmZmZmZ ddlmZmZmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, e*e,e&e(dZ-e
eeedZ.eeeeeeeedZ/G dd deZ0dS )z1Joint text-to-wav module for end-to-end training.    )AnyDictN)check_argument_types)	AbsGANTTS)HiFiGANGeneratorHiFiGANMultiPeriodDiscriminatorHiFiGANMultiScaleDiscriminator)HiFiGANMultiScaleMultiPeriodDiscriminatorHiFiGANPeriodDiscriminatorHiFiGANScaleDiscriminator)DiscriminatorAdversarialLossFeatureMatchLossGeneratorAdversarialLossMelSpectrogramLoss)MelGANGeneratorMelGANMultiScaleDiscriminator)PQMF)ParallelWaveGANDiscriminatorParallelWaveGANGenerator)StyleMelGANDiscriminatorStyleMelGANGenerator)get_random_segmentsget_segments)force_gatherable)
FastSpeech)FastSpeech2)	Tacotron2)Transformer)	tacotron2transformer
fastspeechfastspeech2)hifigan_generatormelgan_generatorparallel_wavegan_generatorstyle_melgan_generator)hifigan_period_discriminatorhifigan_scale_discriminator"hifigan_multi_period_discriminator!hifigan_multi_scale_discriminator.hifigan_multi_scale_multi_period_discriminator melgan_multi_scale_discriminatorparallel_wavegan_discriminatorstyle_melgan_discriminatorc                /       sb  e Zd ZdZdddi ddddd	d
dddd
dddddddddddddddddddddddd i d!d d"dd#d$d%d$d&d'd(d'd)d'd*d'd+d'd,d'd-d.d/d0d1d2d3d4d5dd6dd7d i d8d9d:d;d<dd=dd>d?d@d'dAddBddCd?dDddEddFddGd dHddIddJddKdi dLddMddNddOdPdQdPdRdSdTdUdVd dWdXdYd
dZd[d\g d]d^d?d_dd`ddadbdcdddeded d dfdgdddPd9g dhg dig djg dkg dkg dkgdddldmd'iddnd d
dodpdqdrdsddtd
ddduddg dvdbdwdxdg dydldmd'idd dzd g d{dddd?gdg d|dwddldmd'idd d}d~d ddd dddd d dddddwddSddddSdSd	dededdd fdedededededeeef dedeeef dedeeef dedeeef deeef deeef dedeeef dedeeef de	de	de	de	def. fddZ
edd Zedd Z	ddejdejdejdejdejdejdedeeef fddZdejdejdejdejdejdejdeeef fddZdejdejdejdejdejdejdeeef fddZdejdeeejf fddZ  ZS )JointText2Wavz:General class to jointly train text2mel and vocoder parts.    i"V  r!   adimi  aheads   elayers   eunitsi   dlayersdunitspostnet_layers   postnet_chansi   postnet_filtspostnet_dropout_rateg      ?positionwise_layer_typeconv1dpositionwise_conv_kernel_size   use_scaled_pos_encTuse_batch_normencoder_normalize_beforedecoder_normalize_beforeencoder_concat_afterFdecoder_concat_afterreduction_factorencoder_type	conformerdecoder_typetransformer_enc_dropout_rateg?'transformer_enc_positional_dropout_rate!transformer_enc_attn_dropout_ratetransformer_dec_dropout_rate'transformer_dec_positional_dropout_rate!transformer_dec_attn_dropout_rateconformer_rel_pos_typelatestconformer_pos_enc_layer_typerel_posconformer_self_attn_layer_typerel_selfattnconformer_activation_typeswishuse_macaron_style_in_conformeruse_cnn_in_conformer	zero_triuconformer_enc_kernel_size   conformer_dec_kernel_size   duration_predictor_layersduration_predictor_chansduration_predictor_kernel_size   duration_predictor_dropout_rateenergy_predictor_layersenergy_predictor_chansenergy_predictor_kernel_sizeenergy_predictor_dropoutenergy_embed_kernel_sizeenergy_embed_dropout#stop_gradient_from_energy_predictorpitch_predictor_layerspitch_predictor_chanspitch_predictor_kernel_sizepitch_predictor_dropoutpitch_embed_kernel_sizepitch_embed_dropout"stop_gradient_from_pitch_predictorspkslangsspk_embed_dimNspk_embed_integration_typeadduse_gst
gst_tokens
   	gst_headsgst_conv_layers   gst_conv_chans_list)r/   r/   @   r      r   gst_conv_kernel_sizegst_conv_stridegst_gru_layersgst_gru_unitsr   	init_typexavier_uniformg      ?)init_enc_alphainit_dec_alphause_maskinguse_weighted_maskingr"   )   r   r2   r2   )   r   r4   r4   )rc   r]      )r@   rc   r9   	LeakyReLUnegative_slope)out_channelschannelsglobal_channelskernel_sizeupsample_scalesupsample_kernel_sizesresblock_kernel_sizesresblock_dilationsuse_additional_convsbiasnonlinear_activationnonlinear_activation_paramsuse_weight_norm>   gV-?g      "@)subbandstapscutoff_ratiobetar*   	AvgPool1d)r   stridepadding)   )   r9   rc   i   r   )r2   r2   r4   r4   r@   )in_channelsr   kernel_sizesr   max_downsample_channels
max_groupsr   downsample_scalesr   r   r   use_spectral_norm)r2   rc   r9   r]   r   )rc   rc   rc   rc   r@   )r   r   r   r   r   r   r   r   r   r   r   )scalesscale_downsample_poolingscale_downsample_pooling_paramsscale_discriminator_paramsfollow_official_normperiodsperiod_discriminator_paramsmse)average_by_discriminators	loss_type)r   average_by_layersinclude_final_outputs   hannP   r   )	fsn_fft
hop_length
win_lengthwindown_melsfminfmaxlog_baseg       @g     F@idimodimsegment_sizesampling_ratetext2mel_typetext2mel_paramsvocoder_typevocoder_paramsuse_pqmfpqmf_paramsdiscriminator_typediscriminator_paramsgenerator_adv_loss_paramsdiscriminator_adv_loss_paramsuse_feat_match_lossfeat_match_loss_paramsuse_mel_lossmel_loss_paramslambda_text2mel
lambda_advlambda_feat_match
lambda_melcache_generator_outputsc                    s  t  sJ t   || _|	| _tj | _t	| }|j
||d |d	i || jd< t| }|dv r:|j
|d n
|dv rD|j
|d |d	i || jd< | jrYtd	i |
| _t| }|d	i || _td	i || _td	i || _|| _| jrtd	i || _|| _| jrtd	i || _|| _|| _| jr|| _| jr|| _|| _d| _|| _ | jd j!| _!| jd j"| _"| jd j#| _#dS )
a.  Initialize JointText2Wav module.

        Args:
            idim (int): Input vocabrary size.
            odim (int): Acoustic feature dimension. The actual output channels will
                be 1 since the model is the end-to-end text-to-wave model but for the
                compatibility odim is used to indicate the acoustic feature dimension.
            segment_size (int): Segment size for random windowed inputs.
            sampling_rate (int): Sampling rate, not used for the training but it will
                be referred in saving waveform during the inference.
            text2mel_type (str): The text2mel model type.
            text2mel_params (Dict[str, Any]): Parameter dict for text2mel model.
            use_pqmf (bool): Whether to use PQMF for multi-band vocoder.
            pqmf_params (Dict[str, Any]): Parameter dict for PQMF module.
            vocoder_type (str): The vocoder model type.
            vocoder_params (Dict[str, Any]): Parameter dict for vocoder model.
            discriminator_type (str): Discriminator type.
            discriminator_params (Dict[str, Any]): Parameter dict for discriminator.
            generator_adv_loss_params (Dict[str, Any]): Parameter dict for generator
                adversarial loss.
            discriminator_adv_loss_params (Dict[str, Any]): Parameter dict for
                discriminator adversarial loss.
            use_feat_match_loss (bool): Whether to use feat match loss.
            feat_match_loss_params (Dict[str, Any]): Parameter dict for feat match loss.
            use_mel_loss (bool): Whether to use mel loss.
            mel_loss_params (Dict[str, Any]): Parameter dict for mel loss.
            lambda_text2mel (float): Loss scaling coefficient for text2mel model loss.
            lambda_adv (float): Loss scaling coefficient for adversarial loss.
            lambda_feat_match (float): Loss scaling coefficient for feat match loss.
            lambda_mel (float): Loss scaling coefficient for mel loss.
            cache_generator_outputs (bool): Whether to cache generator outputs.

        )r   r   text2mel)r"   r#   )r   )r$   r%   )aux_channelsvocoderN )$r   super__init__r   r   torchnn
ModuleDict	generatorAVAILABLE_TEXT2MELupdateAVAILABLE_VOCODERr   pqmfAVAILABLE_DISCRIMINATORSdiscriminatorr   generator_adv_lossr   discriminator_adv_lossr   r   feat_match_lossr   r   mel_lossr   r   r   r   r   _cacher   rs   ru   rv   )selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   text2mel_classvocoder_classdiscriminator_class	__class__r   X/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/gan_tts/joint/joint_text2wav.pyr   C   sj    
T
zJointText2Wav.__init__c                 C      dS )z)Return whether or not speech is required.Tr   r   r   r   r   require_raw_speechV     z JointText2Wav.require_raw_speechc                 C   r   )z*Return whether or not vocoder is required.Fr   r   r   r   r   require_vocoder[  r   zJointText2Wav.require_vocodertexttext_lengthsfeatsfeats_lengthsspeechspeech_lengthsforward_generatorreturnc           	   	   K   s@   |r| j d||||||d|S | jd||||||d|S )a/  Perform generator forward.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
            feats_lengths (Tensor): Feature length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).
            forward_generator (bool): Whether to forward generator.

        Returns:
            Dict[str, Any]:
                - loss (Tensor): Loss scalar tensor.
                - stats (Dict[str, float]): Statistics to be monitored.
                - weight (Tensor): Weight tensor to summarize losses.
                - optim_idx (int): Optimizer index (0 for G and 1 for D).

        )r   r   r   r  r  r  Nr   )_forward_generator_forward_discrminator)	r   r   r   r   r  r  r  r  kwargsr   r   r   forward`  s*   
zJointText2Wav.forwardc                 K   s   | d}|d}d}	| jr| jdu rHd}	| jd d||||dd|\}
}}t|dd|| jd	\}}| jd
 |}| jrG| j	
|}n| j\}
}}}| jr^| jr^|	s^|
|||f| _t||| jd
 j | j| jd
 j d}| |}t  | |}W d   n1 sw   Y  | |}|| j }|
| j }
||
 }| jr| ||}|| j }|| }|j| d | jr| ||}| j| }|| }|j| d |j| |
 | d t|||f|j\}}}|	s| jsd| _|||ddS )a  Perform generator forward.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
            feats_lengths (Tensor): Feature length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).

        Returns:
            Dict[str, Any]:
                * loss (Tensor): Loss scalar tensor.
                * stats (Dict[str, float]): Statistics to be monitored.
                * weight (Tensor): Weight tensor to summarize losses.
                * optim_idx (int): Optimizer index (0 for G and 1 for D).

        r   r@   TNFr   r   r   r   r  joint_trainingr2   x	x_lengthsr   r   r  
start_idxsr   )r   )r   )adv_losstext2mel_losslossr  statsweight	optim_idxr   )size	unsqueezer   r   r   r   	transposer   r   r   	synthesistrainingr   upsample_factorr   r   no_gradr   r   r   r   r   r   r   itemr   r   r   r   device)r   r   r   r   r  r  r  r  
batch_sizereuse_cacher  r  	feats_gen
feats_gen_r  speech_hat_speech_p_hatpr  r  r   r   r  r   r   r   r    s|   


	









z JointText2Wav._forward_generatorc                 K   s^  | d}|d}d}	| jr| jdu rHd}	| jd d||||dd|\}
}}t|dd|| jd	\}}| jd
 |}| jrG| j	
|}n| j\}}}}| jr[|	s[|
|||f| _t||| jd
 j | j| jd
 j d}| | }| |}| ||\}}|| }t| | | d}t|||f|j\}}}|	s| jsd| _|||ddS )a  Perform discriminator forward.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
            feats_lengths (Tensor): Feature length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).

        Returns:
            Dict[str, Any]:
                * loss (Tensor): Loss scalar tensor.
                * stats (Dict[str, float]): Statistics to be monitored.
                * weight (Tensor): Weight tensor to summarize losses.
                * optim_idx (int): Optimizer index (0 for G and 1 for D).

        r   r@   TNFr   r
  r2   r  r   r  )discriminator_loss	real_loss	fake_lossr  r   )r  r  r   r   r   r   r  r   r   r   r  r   r  r   detachr   dictr  r   r   r  )r   r   r   r   r  r  r  r  r!  r"  r  r  r#  r$  r  r%  _r&  r'  r(  r*  r+  r  r  r   r   r   r     s`   


	




z#JointText2Wav._forward_discrminatorc                 K   sp   | j d jd
d|i|}| j d |d }| jr0| j|ddd}|ddd}|j|d |S )a%  Run inference.

        Args:
            text (Tensor): Input text index tensor (T_text,).

        Returns:
            Dict[str, Tensor]:
                * wav (Tensor): Generated waveform tensor (T_wav,).
                * feat_gan (Tensor): Generated feature tensor (T_text, C).

        r   r   r   feat_genr   r@   r2   )wavNr   )	r   	inferencer   r   r  r  r  squeezer   )r   r   r  output_dictr0  r   r   r   r1  _  s   zJointText2Wav.inference)T)__name__
__module____qualname____doc__intstrr   r   boolfloatr   propertyr   r   r   Tensorr	  r  r  r1  __classcell__r   r   r   r   r.   @   s   	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEJ( 
R
Sb
cj
k 
  
    
  !  
"  -  .  /  0  1   




3
	
m
	
_r.   )1r7  typingr   r   r   	typeguardr   espnet2.gan_tts.abs_gan_ttsr   espnet2.gan_tts.hifiganr   r   r   r	   r
   r   espnet2.gan_tts.hifigan.lossr   r   r   r   espnet2.gan_tts.melganr   r   espnet2.gan_tts.melgan.pqmfr    espnet2.gan_tts.parallel_waveganr   r   espnet2.gan_tts.style_melganr   r   espnet2.gan_tts.utilsr   r    espnet2.torch_utils.device_funcsr   espnet2.tts.fastspeechr   espnet2.tts.fastspeech2r   espnet2.tts.tacotron2r   espnet2.tts.transformerr   r   r   r   r.   r   r   r   r   <module>   sJ    