o
    i                     @   sX   d Z ddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 G dd dejjZdS )	z`Posterior encoder module in VITS.

This code is based on https://github.com/jaywalnut310/vits.

    )OptionalTupleN)WaveNet)Conv1d)make_non_pad_maskc                       s   e Zd ZdZ													dd
ededededededededededef fddZ	ddej	dej	de
ej	 deej	ej	ej	ej	f fddZ  ZS ) PosteriorEncoderaS  Posterior encoder module in VITS.

    This is a module of posterior encoder described in `Conditional Variational
    Autoencoder with Adversarial Learning for End-to-End Text-to-Speech`_.

    .. _`Conditional Variational Autoencoder with Adversarial Learning for End-to-End
        Text-to-Speech`: https://arxiv.org/abs/2006.04558
                          Tin_channelsout_channelshidden_channelskernel_sizelayersstacksbase_dilationglobal_channelsdropout_ratebiasuse_weight_normc                    s   t    t||d| _tdi ddddd|d|d|d|d	|d
dd|d d|d|d|	d|
d|dddddddd| _t||d d| _dS )a  Initilialize PosteriorEncoder module.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            hidden_channels (int): Number of hidden channels.
            kernel_size (int): Kernel size in WaveNet.
            layers (int): Number of layers of WaveNet.
            stacks (int): Number of repeat stacking of WaveNet.
            base_dilation (int): Base dilation factor.
            global_channels (int): Number of global conditioning channels.
            dropout_rate (float): Dropout rate.
            bias (bool): Whether to use bias parameters in conv.
            use_weight_norm (bool): Whether to apply weight norm.

        r   r   r   r   r   r   r   r   residual_channelsaux_channelsgate_channels   skip_channelsr   r   r   r   use_first_convFuse_last_convscale_residualscale_skip_connectTN )super__init__r   
input_convr   encoderproj)selfr   r   r   r   r   r   r   r   r   r   r   	__class__r#   Z/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/gan_tts/vits/posterior_encoder.pyr%      sP   
	
zPosteriorEncoder.__init__Nx	x_lengthsgreturnc           	      C   s   t |dj|j|jd}| || }| j|||d}| || }|j|	dd dd\}}|t
|t
|  | }||||fS )a7  Calculate forward propagation.

        Args:
            x (Tensor): Input tensor (B, in_channels, T_feats).
            x_lengths (Tensor): Length tensor (B,).
            g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).

        Returns:
            Tensor: Encoded hidden representation tensor (B, out_channels, T_feats).
            Tensor: Projected mean tensor (B, out_channels, T_feats).
            Tensor: Projected scale tensor (B, out_channels, T_feats).
            Tensor: Mask tensor for input tensor (B, 1, T_feats).

        r   )dtypedevice)r/   r   )dim)r   	unsqueezetor1   r2   r&   r'   r(   splitsizetorch
randn_likeexp)	r)   r-   r.   r/   x_maskstatsmlogszr#   r#   r,   forwardU   s   zPosteriorEncoder.forward)r   r	   r	   r
   r   r   r   r   r   TT)N)__name__
__module____qualname____doc__intfloatboolr%   r8   Tensorr   r   r@   __classcell__r#   r#   r*   r,   r      s\    	
9r   )rD   typingr   r   r8   espnet2.gan_tts.wavenetr   &espnet2.gan_tts.wavenet.residual_blockr   &espnet.nets.pytorch_backend.nets_utilsr   nnModuler   r#   r#   r#   r,   <module>   s   