"""Generator module in JETS."""

import logging
from typing import Any, Dict, List, Optional, Sequence, Tuple

import numpy as np
import torch
import torch.nn.functional as F

from espnet2.gan_tts.hifigan import HiFiGANGenerator
from espnet2.gan_tts.jets.alignments import (
    AlignmentModule,
    average_by_duration,
    viterbi_decode,
)
from espnet2.gan_tts.jets.length_regulator import GaussianUpsampling
from espnet2.gan_tts.utils import get_random_segments
from espnet2.torch_utils.initialize import initialize
from espnet2.tts.fastspeech2.variance_predictor import VariancePredictor
from espnet2.tts.gst.style_encoder import StyleEncoder
from espnet.nets.pytorch_backend.conformer.encoder import Encoder as ConformerEncoder
from espnet.nets.pytorch_backend.fastspeech.duration_predictor import DurationPredictor
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask, make_pad_mask
from espnet.nets.pytorch_backend.transformer.embedding import (
    PositionalEncoding,
    ScaledPositionalEncoding,
)
from espnet.nets.pytorch_backend.transformer.encoder import (
    Encoder as TransformerEncoder,
)


class JETSGenerator(torch.nn.Module):
    """Generator module in JETS."""

    def __init__(
        self,
        idim: int,
        odim: int,
        adim: int = 256,
        aheads: int = 2,
        elayers: int = 4,
        eunits: int = 1024,
        dlayers: int = 4,
        dunits: int = 1024,
        positionwise_layer_type: str = "conv1d",
        positionwise_conv_kernel_size: int = 1,
        use_scaled_pos_enc: bool = True,
        use_batch_norm: bool = True,
        encoder_normalize_before: bool = True,
        decoder_normalize_before: bool = True,
        encoder_concat_after: bool = False,
        decoder_concat_after: bool = False,
        reduction_factor: int = 1,
        encoder_type: str = "transformer",
        decoder_type: str = "transformer",
        transformer_enc_dropout_rate: float = 0.1,
        transformer_enc_positional_dropout_rate: float = 0.1,
        transformer_enc_attn_dropout_rate: float = 0.1,
        transformer_dec_dropout_rate: float = 0.1,
        transformer_dec_positional_dropout_rate: float = 0.1,
        transformer_dec_attn_dropout_rate: float = 0.1,
        conformer_rel_pos_type: str = "legacy",
        conformer_pos_enc_layer_type: str = "rel_pos",
        conformer_self_attn_layer_type: str = "rel_selfattn",
        conformer_activation_type: str = "swish",
        use_macaron_style_in_conformer: bool = True,
        use_cnn_in_conformer: bool = True,
        zero_triu: bool = False,
        conformer_enc_kernel_size: int = 7,
        conformer_dec_kernel_size: int = 31,
        duration_predictor_layers: int = 2,
        duration_predictor_chans: int = 384,
        duration_predictor_kernel_size: int = 3,
        duration_predictor_dropout_rate: float = 0.1,
        energy_predictor_layers: int = 2,
        energy_predictor_chans: int = 384,
        energy_predictor_kernel_size: int = 3,
        energy_predictor_dropout: float = 0.5,
        energy_embed_kernel_size: int = 9,
        energy_embed_dropout: float = 0.5,
        stop_gradient_from_energy_predictor: bool = False,
        pitch_predictor_layers: int = 2,
        pitch_predictor_chans: int = 384,
        pitch_predictor_kernel_size: int = 3,
        pitch_predictor_dropout: float = 0.5,
        pitch_embed_kernel_size: int = 9,
        pitch_embed_dropout: float = 0.5,
        stop_gradient_from_pitch_predictor: bool = False,
        spks: Optional[int] = None,
        langs: Optional[int] = None,
        spk_embed_dim: Optional[int] = None,
        spk_embed_integration_type: str = "add",
        use_gst: bool = False,
        gst_tokens: int = 10,
        gst_heads: int = 4,
        gst_conv_layers: int = 6,
        gst_conv_chans_list: Sequence[int] = (32, 32, 64, 64, 128, 128),
        gst_conv_kernel_size: int = 3,
        gst_conv_stride: int = 2,
        gst_gru_layers: int = 1,
        gst_gru_units: int = 128,
        init_type: str = "xavier_uniform",
        init_enc_alpha: float = 1.0,
        init_dec_alpha: float = 1.0,
        use_masking: bool = False,
        use_weighted_masking: bool = False,
        segment_size: int = 64,
        generator_out_channels: int = 1,
        generator_channels: int = 512,
        generator_global_channels: int = -1,
        generator_kernel_size: int = 7,
        generator_upsample_scales: List[int] = [8, 8, 2, 2],
        generator_upsample_kernel_sizes: List[int] = [16, 16, 4, 4],
        generator_resblock_kernel_sizes: List[int] = [3, 7, 11],
        generator_resblock_dilations: List[List[int]] = [
            [1, 3, 5],
            [1, 3, 5],
            [1, 3, 5],
        ],
        generator_use_additional_convs: bool = True,
        generator_bias: bool = True,
        generator_nonlinear_activation: str = "LeakyReLU",
        generator_nonlinear_activation_params: Dict[str, Any] = {"negative_slope": 0.1},
        generator_use_weight_norm: bool = True,
    ):
        """Initialize JETS generator module.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            elayers (int): Number of encoder layers.
            eunits (int): Number of encoder hidden units.
            dlayers (int): Number of decoder layers.
            dunits (int): Number of decoder hidden units.
            use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding.
            use_batch_norm (bool): Whether to use batch normalization in encoder prenet.
            encoder_normalize_before (bool): Whether to apply layernorm layer before
                encoder block.
            decoder_normalize_before (bool): Whether to apply layernorm layer before
                decoder block.
            encoder_concat_after (bool): Whether to concatenate attention layer's input
                and output in encoder.
            decoder_concat_after (bool): Whether to concatenate attention layer's input
                and output in decoder.
            reduction_factor (int): Reduction factor.
            encoder_type (str): Encoder type ("transformer" or "conformer").
            decoder_type (str): Decoder type ("transformer" or "conformer").
            transformer_enc_dropout_rate (float): Dropout rate in encoder except
                attention and positional encoding.
            transformer_enc_positional_dropout_rate (float): Dropout rate after encoder
                positional encoding.
            transformer_enc_attn_dropout_rate (float): Dropout rate in encoder
                self-attention module.
            transformer_dec_dropout_rate (float): Dropout rate in decoder except
                attention & positional encoding.
            transformer_dec_positional_dropout_rate (float): Dropout rate after decoder
                positional encoding.
            transformer_dec_attn_dropout_rate (float): Dropout rate in decoder
                self-attention module.
            conformer_rel_pos_type (str): Relative pos encoding type in conformer.
            conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer.
            conformer_self_attn_layer_type (str): Self-attention layer type in
                conformer.
            conformer_activation_type (str): Activation function type in conformer.
            use_macaron_style_in_conformer (bool): Whether to use macaron style FFN.
            use_cnn_in_conformer (bool): Whether to use CNN in conformer.
            zero_triu (bool): Whether to use zero triu in relative self-attention
                module.
            conformer_enc_kernel_size (int): Kernel size of encoder conformer.
            conformer_dec_kernel_size (int): Kernel size of decoder conformer.
            duration_predictor_layers (int): Number of duration predictor layers.
            duration_predictor_chans (int): Number of duration predictor channels.
            duration_predictor_kernel_size (int): Kernel size of duration predictor.
            duration_predictor_dropout_rate (float): Dropout rate in duration predictor.
            pitch_predictor_layers (int): Number of pitch predictor layers.
            pitch_predictor_chans (int): Number of pitch predictor channels.
            pitch_predictor_kernel_size (int): Kernel size of pitch predictor.
            pitch_predictor_dropout (float): Dropout rate in pitch predictor.
            pitch_embed_kernel_size (int): Kernel size of pitch embedding.
            pitch_embed_dropout (float): Dropout rate for pitch embedding.
            stop_gradient_from_pitch_predictor (bool): Whether to stop gradient
                from pitch predictor to encoder.
            energy_predictor_layers (int): Number of energy predictor layers.
            energy_predictor_chans (int): Number of energy predictor channels.
            energy_predictor_kernel_size (int): Kernel size of energy predictor.
            energy_predictor_dropout (float): Dropout rate in energy predictor.
            energy_embed_kernel_size (int): Kernel size of energy embedding.
            energy_embed_dropout (float): Dropout rate for energy embedding.
            stop_gradient_from_energy_predictor (bool): Whether to stop gradient
                from energy predictor to encoder.
            spks (Optional[int]): Number of speakers. If set to > 1, assume that the
                sids will be provided as the input and use sid embedding layer.
            langs (Optional[int]): Number of languages. If set to > 1, assume that the
                lids will be provided as the input and use lid embedding layer.
            spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
                assume that spembs will be provided as the input.
            spk_embed_integration_type (str): How to integrate speaker embedding.
            use_gst (bool): Whether to use global style token.
            gst_tokens (int): The number of GST embeddings.
            gst_heads (int): The number of heads in GST multihead attention.
            gst_conv_layers (int): The number of conv layers in GST.
            gst_conv_chans_list (Sequence[int]): List of the number of channels of
                conv layers in GST.
            gst_conv_kernel_size (int): Kernel size of conv layers in GST.
            gst_conv_stride (int): Stride size of conv layers in GST.
            gst_gru_layers (int): The number of GRU layers in GST.
            gst_gru_units (int): The number of GRU units in GST.
            init_type (str): How to initialize transformer parameters.
            init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the
                encoder.
            init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the
                decoder.
            use_masking (bool): Whether to apply masking for padded part in loss
                calculation.
            use_weighted_masking (bool): Whether to apply weighted masking in loss
                calculation.
            segment_size (int): Segment size for random windowed discriminator.
            generator_out_channels (int): Number of output channels.
            generator_channels (int): Number of hidden representation channels.
            generator_global_channels (int): Number of global conditioning channels.
            generator_kernel_size (int): Kernel size of initial and final conv layer.
            generator_upsample_scales (List[int]): List of upsampling scales.
            generator_upsample_kernel_sizes (List[int]): List of kernel sizes for
                upsample layers.
            generator_resblock_kernel_sizes (List[int]): List of kernel sizes for
                residual blocks.
            generator_resblock_dilations (List[List[int]]): List of list of dilations
                for residual blocks.
            generator_use_additional_convs (bool): Whether to use additional conv layers
                in residual blocks.
            generator_bias (bool): Whether to add bias parameter in convolution layers.
            generator_nonlinear_activation (str): Activation function module name.
            generator_nonlinear_activation_params (Dict[str, Any]): Hyperparameters for
                activation function.
            generator_use_weight_norm (bool): Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.

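        Examples:
            A minimal construction sketch (illustrative only; ``idim`` and
            ``odim`` below are arbitrary vocabulary and mel-bin sizes, and all
            other hyperparameters keep the defaults above):

            >>> generator = JETSGenerator(idim=78, odim=80)
            >>> isinstance(generator, torch.nn.Module)
            True
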
        """
        super().__init__()
        self.segment_size = segment_size
        self.upsample_factor = int(np.prod(generator_upsample_scales))
        self.idim = idim
        self.odim = odim
        self.reduction_factor = reduction_factor
        self.encoder_type = encoder_type
        self.decoder_type = decoder_type
        self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor
        self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor
        self.use_scaled_pos_enc = use_scaled_pos_enc
        self.use_gst = use_gst

        # use idx 0 as padding idx
        self.padding_idx = 0

        # get positional encoding class
        pos_enc_class = (
            ScaledPositionalEncoding if self.use_scaled_pos_enc else PositionalEncoding
        )

        # check relative positional encoding compatibility
        if "conformer" in [encoder_type, decoder_type]:
            if conformer_rel_pos_type == "legacy":
                if conformer_pos_enc_layer_type == "rel_pos":
                    conformer_pos_enc_layer_type = "legacy_rel_pos"
                    logging.warning(
                        "Fallback to conformer_pos_enc_layer_type = 'legacy_rel_pos' "
                        "due to the compatibility. If you want to use the new one, "
                        "please use conformer_pos_enc_layer_type = 'latest'."
                    )
                if conformer_self_attn_layer_type == "rel_selfattn":
                    conformer_self_attn_layer_type = "legacy_rel_selfattn"
                    logging.warning(
                        "Fallback to "
                        "conformer_self_attn_layer_type = 'legacy_rel_selfattn' "
                        "due to the compatibility. If you want to use the new one, "
                        "please use conformer_pos_enc_layer_type = 'latest'."
                    )
            elif conformer_rel_pos_type == "latest":
                assert conformer_pos_enc_layer_type != "legacy_rel_pos"
                assert conformer_self_attn_layer_type != "legacy_rel_selfattn"
            else:
                raise ValueError(f"Unknown rel_pos_type: {conformer_rel_pos_type}")

        # define encoder
        encoder_input_layer = torch.nn.Embedding(
            num_embeddings=idim,
            embedding_dim=adim,
            padding_idx=self.padding_idx,
        )
        if encoder_type == "transformer":
            self.encoder = TransformerEncoder(
                idim=idim,
                attention_dim=adim,
                attention_heads=aheads,
                linear_units=eunits,
                num_blocks=elayers,
                input_layer=encoder_input_layer,
                dropout_rate=transformer_enc_dropout_rate,
                positional_dropout_rate=transformer_enc_positional_dropout_rate,
                attention_dropout_rate=transformer_enc_attn_dropout_rate,
                pos_enc_class=pos_enc_class,
                normalize_before=encoder_normalize_before,
                concat_after=encoder_concat_after,
                positionwise_layer_type=positionwise_layer_type,
                positionwise_conv_kernel_size=positionwise_conv_kernel_size,
            )
        elif encoder_type == "conformer":
            self.encoder = ConformerEncoder(
                idim=idim,
                attention_dim=adim,
                attention_heads=aheads,
                linear_units=eunits,
                num_blocks=elayers,
                input_layer=encoder_input_layer,
                dropout_rate=transformer_enc_dropout_rate,
                positional_dropout_rate=transformer_enc_positional_dropout_rate,
                attention_dropout_rate=transformer_enc_attn_dropout_rate,
                normalize_before=encoder_normalize_before,
                concat_after=encoder_concat_after,
                positionwise_layer_type=positionwise_layer_type,
                positionwise_conv_kernel_size=positionwise_conv_kernel_size,
                macaron_style=use_macaron_style_in_conformer,
                pos_enc_layer_type=conformer_pos_enc_layer_type,
                selfattention_layer_type=conformer_self_attn_layer_type,
                activation_type=conformer_activation_type,
                use_cnn_module=use_cnn_in_conformer,
                cnn_module_kernel=conformer_enc_kernel_size,
                zero_triu=zero_triu,
            )
        else:
            raise ValueError(f"{encoder_type} is not supported.")

        # define GST
        if self.use_gst:
            self.gst = StyleEncoder(
                idim=odim,  # the input is mel-spectrogram
                gst_tokens=gst_tokens,
                gst_token_dim=adim,
                gst_heads=gst_heads,
                conv_layers=gst_conv_layers,
                conv_chans_list=gst_conv_chans_list,
                conv_kernel_size=gst_conv_kernel_size,
                conv_stride=gst_conv_stride,
                gru_layers=gst_gru_layers,
                gru_units=gst_gru_units,
            )

        # define spk and lang embedding
        self.spks = None
        if spks is not None and spks > 1:
            self.spks = spks
            self.sid_emb = torch.nn.Embedding(spks, adim)
        self.langs = None
        if langs is not None and langs > 1:
            self.langs = langs
            self.lid_emb = torch.nn.Embedding(langs, adim)

        # define additional projection for speaker embedding
        self.spk_embed_dim = None
        if spk_embed_dim is not None and spk_embed_dim > 0:
            self.spk_embed_dim = spk_embed_dim
            self.spk_embed_integration_type = spk_embed_integration_type
        if self.spk_embed_dim is not None:
            if self.spk_embed_integration_type == "add":
                self.projection = torch.nn.Linear(self.spk_embed_dim, adim)
            else:
                self.projection = torch.nn.Linear(adim + self.spk_embed_dim, adim)

        # define duration predictor
        self.duration_predictor = DurationPredictor(
            idim=adim,
            n_layers=duration_predictor_layers,
            n_chans=duration_predictor_chans,
            kernel_size=duration_predictor_kernel_size,
            dropout_rate=duration_predictor_dropout_rate,
        )

        # define pitch predictor
        self.pitch_predictor = VariancePredictor(
            idim=adim,
            n_layers=pitch_predictor_layers,
            n_chans=pitch_predictor_chans,
            kernel_size=pitch_predictor_kernel_size,
            dropout_rate=pitch_predictor_dropout,
        )
        # continuous pitch + FastPitch-style token averaging
        self.pitch_embed = torch.nn.Sequential(
            torch.nn.Conv1d(
                in_channels=1,
                out_channels=adim,
                kernel_size=pitch_embed_kernel_size,
                padding=(pitch_embed_kernel_size - 1) // 2,
            ),
            torch.nn.Dropout(pitch_embed_dropout),
        )

        # define energy predictor
        self.energy_predictor = VariancePredictor(
            idim=adim,
            n_layers=energy_predictor_layers,
            n_chans=energy_predictor_chans,
            kernel_size=energy_predictor_kernel_size,
            dropout_rate=energy_predictor_dropout,
        )
        # continuous energy + FastPitch-style token averaging
        self.energy_embed = torch.nn.Sequential(
            torch.nn.Conv1d(
                in_channels=1,
                out_channels=adim,
                kernel_size=energy_embed_kernel_size,
                padding=(energy_embed_kernel_size - 1) // 2,
            ),
            torch.nn.Dropout(energy_embed_dropout),
        )

        # define alignment module
        self.alignment_module = AlignmentModule(adim, odim)

        # define length regulator
        self.length_regulator = GaussianUpsampling()

        # define decoder (an encoder stack is reused as the decoder)
        if decoder_type == "transformer":
            self.decoder = TransformerEncoder(
                idim=0,
                attention_dim=adim,
                attention_heads=aheads,
                linear_units=dunits,
                num_blocks=dlayers,
                input_layer=None,
                dropout_rate=transformer_dec_dropout_rate,
                positional_dropout_rate=transformer_dec_positional_dropout_rate,
                attention_dropout_rate=transformer_dec_attn_dropout_rate,
                pos_enc_class=pos_enc_class,
                normalize_before=decoder_normalize_before,
                concat_after=decoder_concat_after,
                positionwise_layer_type=positionwise_layer_type,
                positionwise_conv_kernel_size=positionwise_conv_kernel_size,
            )
        elif decoder_type == "conformer":
            self.decoder = ConformerEncoder(
                idim=0,
                attention_dim=adim,
                attention_heads=aheads,
                linear_units=dunits,
                num_blocks=dlayers,
                input_layer=None,
                dropout_rate=transformer_dec_dropout_rate,
                positional_dropout_rate=transformer_dec_positional_dropout_rate,
                attention_dropout_rate=transformer_dec_attn_dropout_rate,
                normalize_before=decoder_normalize_before,
                concat_after=decoder_concat_after,
                positionwise_layer_type=positionwise_layer_type,
                positionwise_conv_kernel_size=positionwise_conv_kernel_size,
                macaron_style=use_macaron_style_in_conformer,
                pos_enc_layer_type=conformer_pos_enc_layer_type,
                selfattention_layer_type=conformer_self_attn_layer_type,
                activation_type=conformer_activation_type,
                use_cnn_module=use_cnn_in_conformer,
                cnn_module_kernel=conformer_dec_kernel_size,
            )
        else:
            raise ValueError(f"{decoder_type} is not supported.")

        # define hifigan generator
        self.generator = HiFiGANGenerator(
            in_channels=adim,
            out_channels=generator_out_channels,
            channels=generator_channels,
            global_channels=generator_global_channels,
            kernel_size=generator_kernel_size,
            upsample_scales=generator_upsample_scales,
            upsample_kernel_sizes=generator_upsample_kernel_sizes,
            resblock_kernel_sizes=generator_resblock_kernel_sizes,
            resblock_dilations=generator_resblock_dilations,
            use_additional_convs=generator_use_additional_convs,
            bias=generator_bias,
            nonlinear_activation=generator_nonlinear_activation,
            nonlinear_activation_params=generator_nonlinear_activation_params,
            use_weight_norm=generator_use_weight_norm,
        )

        # initialize parameters
        self._reset_parameters(
            init_type=init_type,
            init_enc_alpha=init_enc_alpha,
            init_dec_alpha=init_dec_alpha,
        )

    def forward(
        self,
        text: torch.Tensor,
        text_lengths: torch.Tensor,
        feats: torch.Tensor,
        feats_lengths: torch.Tensor,
        pitch: torch.Tensor,
        pitch_lengths: torch.Tensor,
        energy: torch.Tensor,
        energy_lengths: torch.Tensor,
        sids: Optional[torch.Tensor] = None,
        spembs: Optional[torch.Tensor] = None,
        lids: Optional[torch.Tensor] = None,
    ) -> Tuple[
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
    ]:
        """Calculate forward propagation.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
            feats_lengths (Tensor): Feature length tensor (B,).
            pitch (Tensor): Batch of padded frame-level pitch (B, T_feats, 1).
            pitch_lengths (LongTensor): Batch of pitch lengths (B,).
            energy (Tensor): Batch of padded frame-level energy (B, T_feats, 1).
            energy_lengths (LongTensor): Batch of energy lengths (B,).
            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).

        Returns:
            Tensor: Waveform tensor (B, 1, segment_size * upsample_factor).
            Tensor: Binarization loss (scalar).
            Tensor: Log probability attention matrix (B, T_feats, T_text).
            Tensor: Segments start index tensor (B,).
            Tensor: Predicted duration (B, T_text).
            Tensor: Ground-truth duration obtained from an alignment module (B, T_text).
            Tensor: Predicted pitch (B, T_text, 1).
            Tensor: Ground-truth averaged pitch (B, T_text, 1).
            Tensor: Predicted energy (B, T_text, 1).
            Tensor: Ground-truth averaged energy (B, T_text, 1).

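        Examples:
            A smoke-test sketch with random inputs (illustrative only; real
            pitch/energy come from the feature-extraction pipeline, and
            ``T_feats`` must exceed ``segment_size`` so a random segment can
            be cut):

            >>> generator = JETSGenerator(idim=78, odim=80)
            >>> text = torch.randint(1, 78, (2, 10))
            >>> text_lengths = torch.tensor([10, 8])
            >>> feats = torch.randn(2, 100, 80)
            >>> feats_lengths = torch.tensor([100, 90])
            >>> pitch, energy = torch.randn(2, 100, 1), torch.randn(2, 100, 1)
            >>> outs = generator(
            ...     text, text_lengths, feats, feats_lengths,
            ...     pitch, feats_lengths, energy, feats_lengths,
            ... )
            >>> outs[0].shape  # (B, 1, segment_size * upsample_factor)
            torch.Size([2, 1, 16384])
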
        """
        text = text[:, : text_lengths.max()]  # for data-parallel
        feats = feats[:, : feats_lengths.max()]  # for data-parallel
        pitch = pitch[:, : pitch_lengths.max()]  # for data-parallel
        energy = energy[:, : energy_lengths.max()]  # for data-parallel

        # forward encoder
        x_masks = self._source_mask(text_lengths)
        hs, _ = self.encoder(text, x_masks)  # (B, T_text, adim)

        # integrate with GST
        if self.use_gst:
            style_embs = self.gst(feats)
            hs = hs + style_embs.unsqueeze(1)

        # integrate with SID and LID embeddings
        if self.spks is not None:
            sid_embs = self.sid_emb(sids.view(-1))
            hs = hs + sid_embs.unsqueeze(1)
        if self.langs is not None:
            lid_embs = self.lid_emb(lids.view(-1))
            hs = hs + lid_embs.unsqueeze(1)

        # integrate speaker embedding
        if self.spk_embed_dim is not None:
            hs = self._integrate_with_spk_embed(hs, spembs)

        # forward alignment module and obtain duration, averaged pitch, energy
        h_masks = make_pad_mask(text_lengths).to(hs.device)
        log_p_attn = self.alignment_module(hs, feats, h_masks)
        ds, bin_loss = viterbi_decode(log_p_attn, text_lengths, feats_lengths)
        ps = average_by_duration(
            ds, pitch.squeeze(-1), text_lengths, feats_lengths
        ).unsqueeze(-1)
        es = average_by_duration(
            ds, energy.squeeze(-1), text_lengths, feats_lengths
        ).unsqueeze(-1)

        # forward duration predictor and variance predictors
        if self.stop_gradient_from_pitch_predictor:
            p_outs = self.pitch_predictor(hs.detach(), h_masks.unsqueeze(-1))
        else:
            p_outs = self.pitch_predictor(hs, h_masks.unsqueeze(-1))
        if self.stop_gradient_from_energy_predictor:
            e_outs = self.energy_predictor(hs.detach(), h_masks.unsqueeze(-1))
        else:
            e_outs = self.energy_predictor(hs, h_masks.unsqueeze(-1))
        d_outs = self.duration_predictor(hs, h_masks)

        # use groundtruth pitch/energy in training
        p_embs = self.pitch_embed(ps.transpose(1, 2)).transpose(1, 2)
        e_embs = self.energy_embed(es.transpose(1, 2)).transpose(1, 2)
        hs = hs + e_embs + p_embs

        # upsampling
        d_masks = make_non_pad_mask(text_lengths).to(ds.device)
        h_masks = make_non_pad_mask(feats_lengths).to(hs.device)
        hs = self.length_regulator(hs, ds, h_masks, d_masks)  # (B, T_feats, adim)

        # forward decoder
        h_masks = self._source_mask(feats_lengths)
        zs, _ = self.decoder(hs, h_masks)  # (B, T_feats, adim)

        # get random segments
        z_segments, z_start_idxs = get_random_segments(
            zs.transpose(1, 2),
            feats_lengths,
            self.segment_size,
        )

        # forward generator
        wav = self.generator(z_segments)

        return (
            wav,
            bin_loss,
            log_p_attn,
            z_start_idxs,
            d_outs,
            ds,
            p_outs,
            ps,
            e_outs,
            es,
        )

    def inference(
        self,
        text: torch.Tensor,
        text_lengths: torch.Tensor,
        feats: Optional[torch.Tensor] = None,
        feats_lengths: Optional[torch.Tensor] = None,
        pitch: Optional[torch.Tensor] = None,
        energy: Optional[torch.Tensor] = None,
        sids: Optional[torch.Tensor] = None,
        spembs: Optional[torch.Tensor] = None,
        lids: Optional[torch.Tensor] = None,
        use_teacher_forcing: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run inference.

        Args:
            text (Tensor): Input text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, T_feats, aux_channels).
            feats_lengths (Tensor): Feature length tensor (B,).
            pitch (Tensor): Pitch tensor (B, T_feats, 1).
            energy (Tensor): Energy tensor (B, T_feats, 1).
            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
            use_teacher_forcing (bool): Whether to use teacher forcing.

        Returns:
            Tensor: Generated waveform tensor (B, T_wav).
            Tensor: Duration tensor (B, T_text).

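        Examples:
            A free-running synthesis sketch (illustrative only; the token IDs
            are random, so an untrained model emits noise):

            >>> generator = JETSGenerator(idim=78, odim=80)
            >>> text = torch.randint(1, 78, (1, 12))
            >>> text_lengths = torch.tensor([12])
            >>> wav, dur = generator.inference(text, text_lengths)
            >>> dur.shape  # one predicted duration per input token
            torch.Size([1, 12])
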
        """
        # forward encoder
        x_masks = self._source_mask(text_lengths)
        hs, _ = self.encoder(text, x_masks)  # (B, T_text, adim)

        # integrate with GST
        if self.use_gst:
            style_embs = self.gst(feats)
            hs = hs + style_embs.unsqueeze(1)

        # integrate with SID and LID embeddings
        if self.spks is not None:
            sid_embs = self.sid_emb(sids.view(-1))
            hs = hs + sid_embs.unsqueeze(1)
        if self.langs is not None:
            lid_embs = self.lid_emb(lids.view(-1))
            hs = hs + lid_embs.unsqueeze(1)

        # integrate speaker embedding
        if self.spk_embed_dim is not None:
            hs = self._integrate_with_spk_embed(hs, spembs)

        h_masks = make_pad_mask(text_lengths).to(hs.device)
        if use_teacher_forcing:
            # forward alignment module and obtain duration, averaged pitch, energy
            log_p_attn = self.alignment_module(hs, feats, h_masks)
            ds, _ = viterbi_decode(log_p_attn, text_lengths, feats_lengths)
            ps = average_by_duration(
                ds, pitch.squeeze(-1), text_lengths, feats_lengths
            ).unsqueeze(-1)
            es = average_by_duration(
                ds, energy.squeeze(-1), text_lengths, feats_lengths
            ).unsqueeze(-1)
        else:
            # forward duration predictor and variance predictors
            ps = self.pitch_predictor(hs, h_masks.unsqueeze(-1))
            es = self.energy_predictor(hs, h_masks.unsqueeze(-1))
            ds = self.duration_predictor.inference(hs, h_masks)

        p_embs = self.pitch_embed(ps.transpose(1, 2)).transpose(1, 2)
        e_embs = self.energy_embed(es.transpose(1, 2)).transpose(1, 2)
        hs = hs + e_embs + p_embs

        # upsampling
        if feats_lengths is not None:
            h_masks = make_non_pad_mask(feats_lengths).to(hs.device)
        else:
            h_masks = None
        d_masks = make_non_pad_mask(text_lengths).to(ds.device)
        hs = self.length_regulator(hs, ds, h_masks, d_masks)  # (B, T_feats, adim)

        # forward decoder
        if feats_lengths is not None:
            h_masks = self._source_mask(feats_lengths)
        else:
            h_masks = None
        zs, _ = self.decoder(hs, h_masks)  # (B, T_feats, adim)

        # forward generator
        wav = self.generator(zs.transpose(1, 2))

        return wav.squeeze(1), ds

    def _integrate_with_spk_embed(
        self,
        hs: torch.Tensor,
        spembs: torch.Tensor,
    ) -> torch.Tensor:
        """Integrate speaker embedding with hidden states.

        Args:
            hs (Tensor): Batch of hidden state sequences (B, T_text, adim).
            spembs (Tensor): Batch of speaker embeddings (B, spk_embed_dim).

        Returns:
            Tensor: Batch of integrated hidden state sequences (B, T_text, adim).

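        Examples:
            A shape-check sketch (illustrative only; ``spk_embed_dim=192`` is an
            arbitrary choice and ``adim`` keeps its default of 256):

            >>> generator = JETSGenerator(idim=78, odim=80, spk_embed_dim=192)
            >>> hs = torch.randn(2, 10, 256)
            >>> spembs = torch.randn(2, 192)
            >>> generator._integrate_with_spk_embed(hs, spembs).shape
            torch.Size([2, 10, 256])
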
        """
        if self.spk_embed_integration_type == "add":
            # apply projection and then add to hidden states
            spembs = self.projection(F.normalize(spembs))
            hs = hs + spembs.unsqueeze(1)
        elif self.spk_embed_integration_type == "concat":
            # concat hidden states with spk embeds and then apply projection
            spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.size(1), -1)
            hs = self.projection(torch.cat([hs, spembs], dim=-1))
        else:
            raise NotImplementedError("support only add or concat.")

        return hs

    def _source_mask(self, ilens: torch.Tensor) -> torch.Tensor:
        """Make masks for self-attention.

        Args:
            ilens (LongTensor): Batch of lengths (B,).

        Returns:
            Tensor: Mask tensor for self-attention.
                dtype=torch.uint8 in PyTorch 1.2-
                dtype=torch.bool in PyTorch 1.2+ (including 1.2)

        Examples:
            >>> ilens = [5, 3]
            >>> self._source_mask(ilens)
            tensor([[[1, 1, 1, 1, 1],
                     [1, 1, 1, 0, 0]]], dtype=torch.uint8)

        """
        x_masks = make_non_pad_mask(ilens).to(next(self.parameters()).device)
        return x_masks.unsqueeze(-2)

    def _reset_parameters(
        self,
        init_type: str,
        init_enc_alpha: float,
        init_dec_alpha: float,
    ):
        # initialize parameters
        if init_type != "pytorch":
            initialize(self, init_type)

        # initialize alpha in scaled positional encoding
        if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
            self.encoder.embed[-1].alpha.data = torch.tensor(init_enc_alpha)
        if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
            self.decoder.embed[-1].alpha.data = torch.tensor(init_dec_alpha)