"""Generator module in VITS.

This code is based on https://github.com/jaywalnut310/vits.

"""

import math

from typing import List, Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F

from espnet2.gan_tts.hifigan import HiFiGANGenerator
from espnet2.gan_tts.utils import get_random_segments
from espnet2.gan_tts.vits.duration_predictor import StochasticDurationPredictor
from espnet2.gan_tts.vits.posterior_encoder import PosteriorEncoder
from espnet2.gan_tts.vits.residual_coupling import ResidualAffineCouplingBlock
from espnet2.gan_tts.vits.text_encoder import TextEncoder
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask


class VITSGenerator(torch.nn.Module):
    """Generator module in VITS.

    This is a module of VITS described in `Conditional Variational Autoencoder
    with Adversarial Learning for End-to-End Text-to-Speech`_.

    As the text encoder, we use the conformer architecture instead of the relative
    positional Transformer; the conformer contains additional convolution layers.

    .. _`Conditional Variational Autoencoder with Adversarial Learning for End-to-End
        Text-to-Speech`: https://arxiv.org/abs/2106.06103
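
    Example:
        A minimal smoke test (the vocabulary size, feature dimension, and
        tensor shapes below are illustrative assumptions, not requirements
        of this module)::

            import torch

            model = VITSGenerator(vocabs=100, aux_channels=80)
            text = torch.randint(0, 100, (2, 10))
            text_lengths = torch.tensor([10, 8])
            feats = torch.randn(2, 80, 60)
            feats_lengths = torch.tensor([60, 44])
            outs = model(text, text_lengths, feats, feats_lengths)
            wav = outs[0]  # (B, 1, segment_size * upsample_factor)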

    """

    def __init__(
        self,
        vocabs: int,
        aux_channels: int = 513,
        hidden_channels: int = 192,
        spks: Optional[int] = None,
        langs: Optional[int] = None,
        spk_embed_dim: Optional[int] = None,
        global_channels: int = -1,
        segment_size: int = 32,
        text_encoder_attention_heads: int = 2,
        text_encoder_ffn_expand: int = 4,
        text_encoder_blocks: int = 6,
        text_encoder_positionwise_layer_type: str = "conv1d",
        text_encoder_positionwise_conv_kernel_size: int = 1,
        text_encoder_positional_encoding_layer_type: str = "rel_pos",
        text_encoder_self_attention_layer_type: str = "rel_selfattn",
        text_encoder_activation_type: str = "swish",
        text_encoder_normalize_before: bool = True,
        text_encoder_dropout_rate: float = 0.1,
        text_encoder_positional_dropout_rate: float = 0.0,
        text_encoder_attention_dropout_rate: float = 0.0,
        text_encoder_conformer_kernel_size: int = 7,
        use_macaron_style_in_text_encoder: bool = True,
        use_conformer_conv_in_text_encoder: bool = True,
        decoder_kernel_size: int = 7,
        decoder_channels: int = 512,
        decoder_upsample_scales: List[int] = [8, 8, 2, 2],
        decoder_upsample_kernel_sizes: List[int] = [16, 16, 4, 4],
        decoder_resblock_kernel_sizes: List[int] = [3, 7, 11],
        decoder_resblock_dilations: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        use_weight_norm_in_decoder: bool = True,
        posterior_encoder_kernel_size: int = 5,
        posterior_encoder_layers: int = 16,
        posterior_encoder_stacks: int = 1,
        posterior_encoder_base_dilation: int = 1,
        posterior_encoder_dropout_rate: float = 0.0,
        use_weight_norm_in_posterior_encoder: bool = True,
        flow_flows: int = 4,
        flow_kernel_size: int = 5,
        flow_base_dilation: int = 1,
        flow_layers: int = 4,
        flow_dropout_rate: float = 0.0,
        use_weight_norm_in_flow: bool = True,
        use_only_mean_in_flow: bool = True,
        stochastic_duration_predictor_kernel_size: int = 3,
        stochastic_duration_predictor_dropout_rate: float = 0.5,
        stochastic_duration_predictor_flows: int = 4,
        stochastic_duration_predictor_dds_conv_layers: int = 3,
    ):
        """Initialize VITS generator module.

        Args:
            vocabs (int): Input vocabulary size.
            aux_channels (int): Number of acoustic feature channels.
            hidden_channels (int): Number of hidden channels.
            spks (Optional[int]): Number of speakers. If set to > 1, assume that the
                sids will be provided as the input and use sid embedding layer.
            langs (Optional[int]): Number of languages. If set to > 1, assume that the
                lids will be provided as the input and use lid embedding layer.
            spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
                assume that spembs will be provided as the input.
            global_channels (int): Number of global conditioning channels.
            segment_size (int): Segment size for decoder.
            text_encoder_attention_heads (int): Number of heads in conformer block
                of text encoder.
            text_encoder_ffn_expand (int): Expansion ratio of FFN in conformer block
                of text encoder.
            text_encoder_blocks (int): Number of conformer blocks in text encoder.
            text_encoder_positionwise_layer_type (str): Position-wise layer type in
                conformer block of text encoder.
            text_encoder_positionwise_conv_kernel_size (int): Position-wise convolution
                kernel size in conformer block of text encoder. Only used when the
                above layer type is conv1d or conv1d-linear.
            text_encoder_positional_encoding_layer_type (str): Positional encoding layer
                type in conformer block of text encoder.
            text_encoder_self_attention_layer_type (str): Self-attention layer type in
                conformer block of text encoder.
            text_encoder_activation_type (str): Activation function type in conformer
                block of text encoder.
            text_encoder_normalize_before (bool): Whether to apply layer norm before
                self-attention in conformer block of text encoder.
            text_encoder_dropout_rate (float): Dropout rate in conformer block of
                text encoder.
            text_encoder_positional_dropout_rate (float): Dropout rate for positional
                encoding in conformer block of text encoder.
            text_encoder_attention_dropout_rate (float): Dropout rate for attention in
                conformer block of text encoder.
            text_encoder_conformer_kernel_size (int): Conformer conv kernel size. It
                will be used only when use_conformer_conv_in_text_encoder = True.
            use_macaron_style_in_text_encoder (bool): Whether to use macaron style FFN
                in conformer block of text encoder.
            use_conformer_conv_in_text_encoder (bool): Whether to use convolution in
                conformer block of text encoder.
            decoder_kernel_size (int): Decoder kernel size.
            decoder_channels (int): Number of decoder initial channels.
            decoder_upsample_scales (List[int]): List of upsampling scales in decoder.
            decoder_upsample_kernel_sizes (List[int]): List of kernel size for
                upsampling layers in decoder.
            decoder_resblock_kernel_sizes (List[int]): List of kernel size for resblocks
                in decoder.
            decoder_resblock_dilations (List[List[int]]): List of list of dilations for
                resblocks in decoder.
            use_weight_norm_in_decoder (bool): Whether to apply weight normalization in
                decoder.
            posterior_encoder_kernel_size (int): Posterior encoder kernel size.
            posterior_encoder_layers (int): Number of layers of posterior encoder.
            posterior_encoder_stacks (int): Number of stacks of posterior encoder.
            posterior_encoder_base_dilation (int): Base dilation of posterior encoder.
            posterior_encoder_dropout_rate (float): Dropout rate for posterior encoder.
            use_weight_norm_in_posterior_encoder (bool): Whether to apply weight
                normalization in posterior encoder.
            flow_flows (int): Number of flows in flow.
            flow_kernel_size (int): Kernel size in flow.
            flow_base_dilation (int): Base dilation in flow.
            flow_layers (int): Number of layers in flow.
            flow_dropout_rate (float): Dropout rate in flow.
            use_weight_norm_in_flow (bool): Whether to apply weight normalization in
                flow.
            use_only_mean_in_flow (bool): Whether to use only mean in flow.
            stochastic_duration_predictor_kernel_size (int): Kernel size in stochastic
                duration predictor.
            stochastic_duration_predictor_dropout_rate (float): Dropout rate in
                stochastic duration predictor.
            stochastic_duration_predictor_flows (int): Number of flows in stochastic
                duration predictor.
            stochastic_duration_predictor_dds_conv_layers (int): Number of DDS conv
                layers in stochastic duration predictor.
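
        Example:
            A multi-speaker configuration sketch (the argument values below
            are illustrative assumptions)::

                model = VITSGenerator(
                    vocabs=100,
                    aux_channels=80,
                    spks=10,
                    global_channels=256,
                )
                # speaker indices in [0, spks) are then passed to forward()
                # and inference() via the `sids` argument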

        """
        super().__init__()
        self.segment_size = segment_size
        self.text_encoder = TextEncoder(
            vocabs=vocabs,
            attention_dim=hidden_channels,
            attention_heads=text_encoder_attention_heads,
            linear_units=hidden_channels * text_encoder_ffn_expand,
            blocks=text_encoder_blocks,
            positionwise_layer_type=text_encoder_positionwise_layer_type,
            positionwise_conv_kernel_size=text_encoder_positionwise_conv_kernel_size,
            positional_encoding_layer_type=text_encoder_positional_encoding_layer_type,
            self_attention_layer_type=text_encoder_self_attention_layer_type,
            activation_type=text_encoder_activation_type,
            normalize_before=text_encoder_normalize_before,
            dropout_rate=text_encoder_dropout_rate,
            positional_dropout_rate=text_encoder_positional_dropout_rate,
            attention_dropout_rate=text_encoder_attention_dropout_rate,
            conformer_kernel_size=text_encoder_conformer_kernel_size,
            use_macaron_style=use_macaron_style_in_text_encoder,
            use_conformer_conv=use_conformer_conv_in_text_encoder,
        )
        self.decoder = HiFiGANGenerator(
            in_channels=hidden_channels,
            out_channels=1,
            channels=decoder_channels,
            global_channels=global_channels,
            kernel_size=decoder_kernel_size,
            upsample_scales=decoder_upsample_scales,
            upsample_kernel_sizes=decoder_upsample_kernel_sizes,
            resblock_kernel_sizes=decoder_resblock_kernel_sizes,
            resblock_dilations=decoder_resblock_dilations,
            use_weight_norm=use_weight_norm_in_decoder,
        )
        self.posterior_encoder = PosteriorEncoder(
            in_channels=aux_channels,
            out_channels=hidden_channels,
            hidden_channels=hidden_channels,
            kernel_size=posterior_encoder_kernel_size,
            layers=posterior_encoder_layers,
            stacks=posterior_encoder_stacks,
            base_dilation=posterior_encoder_base_dilation,
            global_channels=global_channels,
            dropout_rate=posterior_encoder_dropout_rate,
            use_weight_norm=use_weight_norm_in_posterior_encoder,
        )
        self.flow = ResidualAffineCouplingBlock(
            in_channels=hidden_channels,
            hidden_channels=hidden_channels,
            flows=flow_flows,
            kernel_size=flow_kernel_size,
            base_dilation=flow_base_dilation,
            layers=flow_layers,
            global_channels=global_channels,
            dropout_rate=flow_dropout_rate,
            use_weight_norm=use_weight_norm_in_flow,
            use_only_mean=use_only_mean_in_flow,
        )
        self.duration_predictor = StochasticDurationPredictor(
            channels=hidden_channels,
            kernel_size=stochastic_duration_predictor_kernel_size,
            dropout_rate=stochastic_duration_predictor_dropout_rate,
            flows=stochastic_duration_predictor_flows,
            dds_conv_layers=stochastic_duration_predictor_dds_conv_layers,
            global_channels=global_channels,
        )
        self.upsample_factor = int(np.prod(decoder_upsample_scales))
        self.spks = None
        if spks is not None and spks > 1:
            assert global_channels > 0
            self.spks = spks
            self.global_emb = torch.nn.Embedding(spks, global_channels)
        self.spk_embed_dim = None
        if spk_embed_dim is not None and spk_embed_dim > 0:
            assert global_channels > 0
            self.spk_embed_dim = spk_embed_dim
            self.spemb_proj = torch.nn.Linear(spk_embed_dim, global_channels)
        self.langs = None
        if langs is not None and langs > 1:
            assert global_channels > 0
            self.langs = langs
            self.lang_emb = torch.nn.Embedding(langs, global_channels)

        # delayed import to avoid an import-time dependency on the compiled
        # monotonic alignment search extension
        from espnet2.gan_tts.vits.monotonic_align import maximum_path

        self.maximum_path = maximum_path

    def forward(
        self,
        text: torch.Tensor,
        text_lengths: torch.Tensor,
        feats: torch.Tensor,
        feats_lengths: torch.Tensor,
        sids: Optional[torch.Tensor] = None,
        spembs: Optional[torch.Tensor] = None,
        lids: Optional[torch.Tensor] = None,
    ) -> Tuple[
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        Tuple[
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
        ],
    ]:
        """Calculate forward propagation.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, aux_channels, T_feats).
            feats_lengths (Tensor): Feature length tensor (B,).
            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).

        Returns:
            Tensor: Waveform tensor (B, 1, segment_size * upsample_factor).
            Tensor: Duration negative log-likelihood (NLL) tensor (B,).
            Tensor: Monotonic attention weight tensor (B, 1, T_feats, T_text).
            Tensor: Segments start index tensor (B,).
            Tensor: Text mask tensor (B, 1, T_text).
            Tensor: Feature mask tensor (B, 1, T_feats).
            Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
                - Tensor: Posterior encoder hidden representation (B, H, T_feats).
                - Tensor: Flow hidden representation (B, H, T_feats).
                - Tensor: Expanded text encoder projected mean (B, H, T_feats).
                - Tensor: Expanded text encoder projected scale (B, H, T_feats).
                - Tensor: Posterior encoder projected mean (B, H, T_feats).
                - Tensor: Posterior encoder projected scale (B, H, T_feats).
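
        Example:
            A shape sketch assuming the default configuration, where
            ``segment_size = 32`` and ``upsample_factor = 8 * 8 * 2 * 2 = 256``::

                wav, dur_nll, attn, start_idxs, x_mask, y_mask, stats = model(
                    text, text_lengths, feats, feats_lengths
                )
                assert wav.shape[2] == 32 * 256  # segment_size * upsample_factor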

        """
        # forward text encoder
        x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths)

        # calculate global conditioning
        g = None
        if self.spks is not None:
            # speaker one-hot vector embedding: (B, global_channels, 1)
            g = self.global_emb(sids.view(-1)).unsqueeze(-1)
        if self.spk_embed_dim is not None:
            # pretrained speaker embedding, e.g., X-vector: (B, global_channels, 1)
            g_ = self.spemb_proj(F.normalize(spembs)).unsqueeze(-1)
            g = g_ if g is None else g + g_
        if self.langs is not None:
            # language one-hot vector embedding: (B, global_channels, 1)
            g_ = self.lang_emb(lids.view(-1)).unsqueeze(-1)
            g = g_ if g is None else g + g_

        # forward posterior encoder
        z, m_q, logs_q, y_mask = self.posterior_encoder(feats, feats_lengths, g=g)

        # forward flow
        z_p = self.flow(z, y_mask, g=g)  # (B, H, T_feats)

        # monotonic alignment search (MAS); no gradient flows through the alignment
        with torch.no_grad():
            # negative cross-entropy
            s_p_sq_r = torch.exp(-2 * logs_p)  # (B, H, T_text)
            # (B, 1, T_text)
            neg_x_ent_1 = torch.sum(
                -0.5 * math.log(2 * math.pi) - logs_p,
                [1],
                keepdim=True,
            )
            # (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text)
            neg_x_ent_2 = torch.matmul(
                -0.5 * (z_p**2).transpose(1, 2),
                s_p_sq_r,
            )
            # (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text)
            neg_x_ent_3 = torch.matmul(
                z_p.transpose(1, 2),
                m_p * s_p_sq_r,
            )
            # (B, 1, T_text)
            neg_x_ent_4 = torch.sum(
                -0.5 * (m_p**2) * s_p_sq_r,
                [1],
                keepdim=True,
            )
            # (B, T_feats, T_text)
            neg_x_ent = neg_x_ent_1 + neg_x_ent_2 + neg_x_ent_3 + neg_x_ent_4
            # (B, 1, T_feats, T_text)
            attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
            # monotonic attention weight: (B, 1, T_feats, T_text)
            attn = (
                self.maximum_path(
                    neg_x_ent,
                    attn_mask.squeeze(1),
                )
                .unsqueeze(1)
                .detach()
            )

        # forward duration predictor
        w = attn.sum(2)  # (B, 1, T_text)
        dur_nll = self.duration_predictor(x, x_mask, w=w, g=g)
        dur_nll = dur_nll / torch.sum(x_mask)

        # expand the length to match with the feature sequence
        # (B, T_feats, T_text) x (B, T_text, H) -> (B, H, T_feats)
        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
        # (B, T_feats, T_text) x (B, T_text, H) -> (B, H, T_feats)
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)

        # get random segments
        z_segments, z_start_idxs = get_random_segments(
            z,
            feats_lengths,
            self.segment_size,
        )

        # forward decoder with random segments
        wav = self.decoder(z_segments, g=g)

        return (
            wav,
            dur_nll,
            attn,
            z_start_idxs,
            x_mask,
            y_mask,
            (z, z_p, m_p, logs_p, m_q, logs_q),
        )

    def inference(
        self,
        text: torch.Tensor,
        text_lengths: torch.Tensor,
        feats: Optional[torch.Tensor] = None,
        feats_lengths: Optional[torch.Tensor] = None,
        sids: Optional[torch.Tensor] = None,
        spembs: Optional[torch.Tensor] = None,
        lids: Optional[torch.Tensor] = None,
        dur: Optional[torch.Tensor] = None,
        noise_scale: float = 0.667,
        noise_scale_dur: float = 0.8,
        alpha: float = 1.0,
        max_len: Optional[int] = None,
        use_teacher_forcing: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Run inference.

        Args:
            text (Tensor): Input text index tensor (B, T_text,).
            text_lengths (Tensor): Text length tensor (B,).
            feats (Tensor): Feature tensor (B, aux_channels, T_feats,).
            feats_lengths (Tensor): Feature length tensor (B,).
            sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
            lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
            dur (Optional[Tensor]): Ground-truth duration (B, T_text,). If provided,
                skip the prediction of durations (i.e., teacher forcing).
            noise_scale (float): Noise scale parameter for flow.
            noise_scale_dur (float): Noise scale parameter for duration predictor.
            alpha (float): Alpha parameter to control the speed of generated speech.
            max_len (Optional[int]): Maximum length of acoustic feature sequence.
            use_teacher_forcing (bool): Whether to use teacher forcing.

        Returns:
            Tensor: Generated waveform tensor (B, T_wav).
            Tensor: Monotonic attention weight tensor (B, T_feats, T_text).
            Tensor: Duration tensor (B, T_text).
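
        Example:
            A rough usage sketch (assumes ``model`` is a trained
            ``VITSGenerator`` and the input is a single dummy utterance)::

                text = torch.randint(0, 100, (1, 10))
                text_lengths = torch.tensor([10])
                with torch.no_grad():
                    wav, attn, dur = model.inference(text, text_lengths)
                # wav: (1, T_wav), attn: (1, T_feats, T_text), dur: (1, T_text)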

        """
        # encoder
        x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths)
        g = None
        if self.spks is not None:
            # (B, global_channels, 1)
            g = self.global_emb(sids.view(-1)).unsqueeze(-1)
        if self.spk_embed_dim is not None:
            # (B, global_channels, 1)
            g_ = self.spemb_proj(F.normalize(spembs.unsqueeze(0))).unsqueeze(-1)
            g = g_ if g is None else g + g_
        if self.langs is not None:
            # (B, global_channels, 1)
            g_ = self.lang_emb(lids.view(-1)).unsqueeze(-1)
            g = g_ if g is None else g + g_

        if use_teacher_forcing:
            # forward posterior encoder
            z, m_q, logs_q, y_mask = self.posterior_encoder(
                feats, feats_lengths, g=g
            )

            # forward flow
            z_p = self.flow(z, y_mask, g=g)  # (B, H, T_feats)

            # monotonic alignment search
            s_p_sq_r = torch.exp(-2 * logs_p)  # (B, H, T_text)
            # (B, 1, T_text)
            neg_x_ent_1 = torch.sum(
                -0.5 * math.log(2 * math.pi) - logs_p,
                [1],
                keepdim=True,
            )
            # (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text)
            neg_x_ent_2 = torch.matmul(
                -0.5 * (z_p**2).transpose(1, 2),
                s_p_sq_r,
            )
            # (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text)
            neg_x_ent_3 = torch.matmul(
                z_p.transpose(1, 2),
                m_p * s_p_sq_r,
            )
            # (B, 1, T_text)
            neg_x_ent_4 = torch.sum(
                -0.5 * (m_p**2) * s_p_sq_r,
                [1],
                keepdim=True,
            )
            # (B, T_feats, T_text)
            neg_x_ent = neg_x_ent_1 + neg_x_ent_2 + neg_x_ent_3 + neg_x_ent_4
            # (B, 1, T_feats, T_text)
            attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
            # monotonic attention weight: (B, 1, T_feats, T_text)
            attn = self.maximum_path(
                neg_x_ent,
                attn_mask.squeeze(1),
            ).unsqueeze(1)
            dur = attn.sum(2)  # (B, 1, T_text)

            # forward decoder with teacher-forced latent
            wav = self.decoder(z * y_mask, g=g)
        else:
            # duration
            if dur is None:
                logw = self.duration_predictor(
                    x,
                    x_mask,
                    g=g,
                    inverse=True,
                    noise_scale=noise_scale_dur,
                )
                w = torch.exp(logw) * x_mask * alpha
                dur = torch.ceil(w)
            y_lengths = torch.clamp_min(torch.sum(dur, [1, 2]), 1).long()
            y_mask = make_non_pad_mask(y_lengths).unsqueeze(1).to(text.device)
            attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
            attn = self._generate_path(dur, attn_mask)

            # expand the length to match with the feature sequence
            # (B, T_feats, T_text) x (B, T_text, H) -> (B, H, T_feats)
            m_p = torch.matmul(
                attn.squeeze(1),
                m_p.transpose(1, 2),
            ).transpose(1, 2)
            # (B, T_feats, T_text) x (B, T_text, H) -> (B, H, T_feats)
            logs_p = torch.matmul(
                attn.squeeze(1),
                logs_p.transpose(1, 2),
            ).transpose(1, 2)

            # decoder
            z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
            z = self.flow(z_p, y_mask, g=g, inverse=True)
            wav = self.decoder((z * y_mask)[:, :, :max_len], g=g)

        return wav.squeeze(1), attn.squeeze(1), dur.squeeze(1)

    def _generate_path(self, dur: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """Generate path a.k.a. monotonic attention.

        Args:
            dur (Tensor): Duration tensor (B, 1, T_text).
            mask (Tensor): Attention mask tensor (B, 1, T_feats, T_text).

        Returns:
            Tensor: Path tensor (B, 1, T_feats, T_text).
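
        Example:
            A hand-worked sketch (values are illustrative): with durations
            ``[2, 1]`` the first token is assigned frames 0-1 and the second
            token frame 2::

                dur = torch.tensor([[[2, 1]]])   # (B=1, 1, T_text=2)
                mask = torch.ones(1, 1, 3, 2)    # (B, 1, T_feats=3, T_text=2)
                path = model._generate_path(dur, mask)
                # path[0, 0] == [[1., 0.],
                #                [1., 0.],
                #                [0., 1.]]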

        """
        b, _, t_y, t_x = mask.shape
        cum_dur = torch.cumsum(dur, -1)
        cum_dur_flat = cum_dur.view(b * t_x)
        path = torch.arange(t_y, dtype=dur.dtype, device=dur.device)
        path = path.unsqueeze(0) < cum_dur_flat.unsqueeze(1)
        path = path.view(b, t_x, t_y).to(dtype=mask.dtype)
        # take the difference with the one-step shifted cumulative path so that
        # each output frame is assigned to exactly one text token
        path = path - F.pad(path, [0, 0, 1, 0, 0, 0])[:, :-1]
        return path.unsqueeze(1).transpose(2, 3) * mask