o
    i)                     @   sf   d Z ddlZddlmZ ddlZddlm  mZ ddl	m
Z
mZmZmZmZ G dd dejjZdS )zmStochastic duration predictor modules in VITS.

This code is based on https://github.com/jaywalnut310/vits.

    N)Optional)ConvFlowDilatedDepthSeparableConvElementwiseAffineFlowFlipFlowLogFlowc                       s   e Zd ZdZ						ddeded	ed
ededef fddZ				ddejdejde	ej de	ej de
dedejfddZ  ZS )StochasticDurationPredictorad  Stochastic duration predictor module.

    This is a module of stochastic duration predictor described in `Conditional
    Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech`_.

    .. _`Conditional Variational Autoencoder with Adversarial Learning for End-to-End
        Text-to-Speech`: https://arxiv.org/abs/2006.04558

                ?   channelskernel_sizedropout_rateflowsdds_conv_layersglobal_channelsc              	      sX  t    tj||d| _t||||d| _tj||d| _t	 | _
tj | _|  jtdg7  _t|D ]}|  jtd|||dg7  _|  jt g7  _q8tjd|d| _t||||d| _tj||d| _tj | _|  jtdg7  _t|D ]}|  jtd|||dg7  _|  jt g7  _q|dkrtj||d| _dS dS )a  Initialize StochasticDurationPredictor module.

        Args:
            channels (int): Number of channels.
            kernel_size (int): Kernel size.
            dropout_rate (float): Dropout rate.
            flows (int): Number of flows.
            dds_conv_layers (int): Number of conv layers in DDS conv.
            global_channels (int): Number of global conditioning channels.

           )layersr      )r   r   N)super__init__torchnnConv1dprer   ddsprojr   log_flow
ModuleListr   r   ranger   r   post_prepost_dds	post_proj
post_flowsglobal_conv)selfr   r   r   r   r   r   i	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/gan_tts/vits/duration_predictor.pyr   $   sZ   
z$StochasticDurationPredictor.__init__NF      ?xx_maskwginversenoise_scalereturnc                 C   sv  |  }| |}|dur|| |   }| ||}| || }|s|dus-J d| |}| ||}| || }t	|
dd|
dj|j|jd| }|}	d}
| jD ]}||	||| d\}	}|
|7 }
q\t|	ddgd\}}t|| }|| | }|
tt|t|  | ddg7 }
td	tdtj |d   | ddg|
 }d}| ||\}}||7 }t||gd}| jD ]}|||||d
\}}|| }qtdtdtj |d   | ddg| }|| S tt| j}|dd |d g }t	|
dd|
dj|j|jd| }|D ]}|||||d
}q#|dd\}}|}|S )aJ  Calculate forward propagation.

        Args:
            x (Tensor): Input tensor (B, channels, T_text).
            x_mask (Tensor): Mask tensor (B, 1, T_text).
            w (Optional[Tensor]): Duration tensor (B, 1, T_text).
            g (Optional[Tensor]): Global conditioning tensor (B, channels, 1)
            inverse (bool): Whether to inverse the flow.
            noise_scale (float): Noise scale value.

        Returns:
            Tensor: If not inverse, negative log-likelihood (NLL) tensor (B,).
                If inverse, log-duration tensor (B, 1, T_text).

        Nzw must be provided.r   r   )devicedtypeg        )r1   r   g      )r1   r2   r   r   )detachr   r&   r   r   r"   r#   r$   r   randnsizetor5   r6   r%   splitsigmoidsumF
logsigmoidmathlogpir   catr   listreversed)r'   r.   r/   r0   r1   r2   r3   h_we_qz_qlogdet_tot_qflowlogdet_qz_uz1uz0logq
logdet_totlogdetznllr   logwr+   r+   r,   forwardi   s   




*

*
z#StochasticDurationPredictor.forward)r	   r
   r   r   r
   r   )NNFr-   )__name__
__module____qualname____doc__intfloatr   r   Tensorr   boolrW   __classcell__r+   r+   r)   r,   r      sP    Ir   )r[   rA   typingr   r   torch.nn.functionalr   
functionalr?   espnet2.gan_tts.vits.flowr   r   r   r   r   Moduler   r+   r+   r+   r,   <module>   s   	