o
    ii                     @   s   d Z ddlZddlZddlmZmZmZmZ ddlZ	ddl
Z
ddlm  mZ ddlmZ G dd de
jjZG dd de
jjZG d	d
 d
e
jjZG dd de
jjZG dd de
jjZG dd de
jjZdS )z_HiFi-GAN Modules.

This code is modified from https://github.com/kan-bayashi/ParallelWaveGAN.

    N)AnyDictListOptional)ResidualBlockc                       s  e Zd ZdZdddddg dg dg d	g d
g d
g d
gdddddidfdedededededee dee dee deee  dedededeee	f def fddZ
	d-d ejd!eej d"ejfd#d$Zd%d& Zd'd( Zd)d* Z	d-d ejd!eej d"ejfd+d,Z  ZS ).HiFiGANGeneratorzHiFiGAN generator module.P      i      )   r      r   )   r      r   )   r      )r	   r      T	LeakyReLUnegative_slope皙?in_channelsout_channelschannelsglobal_channelskernel_sizeupsample_scalesupsample_kernel_sizesresblock_kernel_sizesresblock_dilationsuse_additional_convsbiasnonlinear_activationnonlinear_activation_paramsuse_weight_normc                    s  t    |d dksJ dt|t|ksJ t|	t|ks#J tt|| | _t|| _t|| _t	j
j|||d|d d d| _t	j
 | _t	j
 | _tt|D ]n}|| d||  ksgJ |  jt	j
tt	j
|d	i |t	j
j|d|  |d|d   || || || d || d  || d dg7  _tt|D ]}|  jt|| |d|d   |	| ||
||dg7  _qqYt	j
t	j
 t	j
j|d|d   ||d|d d dt	j
 | _|dkrt	j
||d| _|r|   |   dS )
a  Initialize HiFiGANGenerator module.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            channels (int): Number of hidden representation channels.
            global_channels (int): Number of global conditioning channels.
            kernel_size (int): Kernel size of initial and final conv layer.
            upsample_scales (List[int]): List of upsampling scales.
            upsample_kernel_sizes (List[int]): List of kernel sizes for upsample layers.
            resblock_kernel_sizes (List[int]): List of kernel sizes for residual blocks.
            resblock_dilations (List[List[int]]): List of list of dilations for residual
                blocks.
            use_additional_convs (bool): Whether to use additional conv layers in
                residual blocks.
            bias (bool): Whether to add bias parameter in convolution layers.
            nonlinear_activation (str): Activation function module name.
            nonlinear_activation_params (Dict[str, Any]): Hyperparameters for activation
                function.
            use_weight_norm (bool): Whether to use weight norm. If set to true, it will
                be applied to all of the conv layers.

        r   r	   Kernel size must be odd number.padding)r&   output_padding)r   r   	dilationsr    r   r!   r"   r   N )super__init__lenintnpprodupsample_factornum_upsamples
num_blockstorchnnConv1d
input_conv
ModuleList	upsamplesblocksrange
SequentialgetattrConvTranspose1dr   r   Tanhoutput_convglobal_convapply_weight_normreset_parameters)selfr   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   ij	__class__r)   S/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/gan_tts/hifigan/hifigan.pyr+      s|   
(






zHiFiGANGenerator.__init__Ncgreturnc                 C   s   |  |}|dur|| | }t| jD ]&}| j| |}d}t| jD ]}|| j|| j |  |7 }q%|| j }q| |}|S )a  Calculate forward propagation.

        Args:
            c (Tensor): Input tensor (B, in_channels, T).
            g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1).

        Returns:
            Tensor: Output tensor (B, out_channels, T).

        N        )r6   r@   r:   r1   r8   r2   r9   r?   )rC   rI   rJ   rD   csrE   r)   r)   rH   forward   s   

zHiFiGANGenerator.forwardc                 C       dt jjfdd}| | dS )zReset parameters.

        This initialization follows the official implementation manner.
        https://github.com/jik876/hifi-gan/blob/master/models.py

        mc                 S   s@   t | tjjtjjfr| jjdd t	d|  d d S d S )NrL   g{Gz?zReset parameters in .)

isinstancer3   r4   r5   r=   weightdatanormal_loggingdebugrP   r)   r)   rH   _reset_parameters   s   z<HiFiGANGenerator.reset_parameters.<locals>._reset_parametersNr3   r4   Moduleapply)rC   rY   r)   r)   rH   rB      s   z!HiFiGANGenerator.reset_parametersc                 C   rO   )z:Remove weight normalization module from all of the layers.rP   c                 S   s<   zt d|  d tjj|  W d S  ty   Y d S w )NzWeight norm is removed from rQ   )rV   rW   r3   r4   utilsremove_weight_norm
ValueErrorrX   r)   r)   rH   _remove_weight_norm   s   z@HiFiGANGenerator.remove_weight_norm.<locals>._remove_weight_normNrZ   )rC   r`   r)   r)   rH   r^         z#HiFiGANGenerator.remove_weight_normc                 C   rO   )9Apply weight normalization module from all of the layers.rP   c                 S   sD   t | tjjst | tjjr tjj|  td|  d d S d S NzWeight norm is applied to rQ   )	rR   r3   r4   r5   r=   r]   weight_normrV   rW   rX   r)   r)   rH   _apply_weight_norm   s   z>HiFiGANGenerator.apply_weight_norm.<locals>._apply_weight_normNrZ   rC   re   r)   r)   rH   rA      ra   z"HiFiGANGenerator.apply_weight_normc                 C   s@   |dur	| d}| j|dd d|d}|dddS )a  Perform inference.

        Args:
            c (torch.Tensor): Input tensor (T, in_channels).
            g (Optional[Tensor]): Global conditioning tensor (global_channels, 1).

        Returns:
            Tensor: Output tensor (T ** upsample_factor, out_channels).

        Nr   r	   )rJ   )	unsqueezerN   	transposesqueeze)rC   rI   rJ   r)   r)   rH   	inference   s   
zHiFiGANGenerator.inference)N)__name__
__module____qualname____doc__r-   r   boolstrr   r   r+   r3   Tensorr   rN   rB   r^   rA   rj   __classcell__r)   r)   rF   rH   r      s    	


q
r   c                       s   e Zd ZdZdddddgdg dddd	d
diddfdedededee dedee dedededeee	f dedef fddZ
dejdejfddZdd  Zd!d" Z  ZS )#HiFiGANPeriodDiscriminatorz$HiFiGAN period discriminator module.r	   r   r       r   r   r   r   r	      Tr   r   r   Fr   r   periodkernel_sizesr   downsample_scalesmax_downsample_channelsr    r!   r"   r#   use_spectral_normc                    s>  t    t|dksJ |d d dksJ d|d d dks%J d|| _tj | _|}|}|D ]8}|  jtjtjj	|||d df|df|d d d dfdt
tj|	d	i |
g7  _|}t|d |}q4tjj	|||d d dfd|d d d dfd| _|r|rtd|r|   |r|   dS dS )
a  Initialize HiFiGANPeriodDiscriminator module.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            period (int): Period.
            kernel_sizes (list): Kernel sizes of initial conv layers and the final conv
                layer.
            channels (int): Number of initial channels.
            downsample_scales (List[int]): List of downsampling scales.
            max_downsample_channels (int): Number of maximum downsampling channels.
            use_additional_convs (bool): Whether to use additional conv layers in
                residual blocks.
            bias (bool): Whether to add bias parameter in convolution layers.
            nonlinear_activation (str): Activation function module name.
            nonlinear_activation_params (Dict[str, Any]): Hyperparameters for activation
                function.
            use_weight_norm (bool): Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
            use_spectral_norm (bool): Whether to use spectral norm.
                If set to true, it will be applied to all of the conv layers.

        r   r   r	   r$   r%   r   0Either use use_weight_norm or use_spectral_norm.Nr)   )r*   r+   r,   rw   r3   r4   r7   convsr;   Conv2dr<   minr?   r_   rA   apply_spectral_norm)rC   r   r   rw   rx   r   ry   rz   r    r!   r"   r#   r{   in_chsout_chsdownsample_scalerF   r)   rH   r+      sN   
&
z#HiFiGANPeriodDiscriminator.__init__xrK   c                 C   s   |j \}}}|| j dkr"| j|| j  }t|d|fd}||7 }||||| j | j}g }| jD ]}||}||g7 }q3| |}t|dd}||g7 }|S )zCalculate forward propagation.

        Args:
            c (Tensor): Input tensor (B, in_channels, T).

        Returns:
            list: List of each layer's tensors.

        r   reflectr	   r
   )	shaperw   Fpadviewr}   r?   r3   flatten)rC   r   brI   tn_padoutslayerr)   r)   rH   rN   3  s   


z"HiFiGANPeriodDiscriminator.forwardc                 C   rO   )rb   rP   c                 S   6   t | tjjrtjj|  td|  d d S d S rc   rR   r3   r4   r~   r]   rd   rV   rW   rX   r)   r)   rH   re   S     zHHiFiGANPeriodDiscriminator.apply_weight_norm.<locals>._apply_weight_normNrZ   rf   r)   r)   rH   rA   P     z,HiFiGANPeriodDiscriminator.apply_weight_normc                 C   rO   );Apply spectral normalization module from all of the layers.rP   c                 S   r   NzSpectral norm is applied to rQ   rR   r3   r4   r~   r]   spectral_normrV   rW   rX   r)   r)   rH   _apply_spectral_norm]  r   zLHiFiGANPeriodDiscriminator.apply_spectral_norm.<locals>._apply_spectral_normNrZ   rC   r   r)   r)   rH   r   Z  r   z.HiFiGANPeriodDiscriminator.apply_spectral_norm)rk   rl   rm   rn   r-   r   ro   rp   r   r   r+   r3   rq   rN   rA   r   rr   r)   r)   rF   rH   rs      sT    	

T
rs   c                       sz   e Zd ZdZg dddddgdg ddd	d
ddid	ddfdee deeef f fddZ	de
jde
jfddZ  ZS )HiFiGANMultiPeriodDiscriminatorz*HiFiGAN multi-period discriminator module.r   r   r   r   r   r	   r   r   rt   ru   rv   Tr   r   r   Fr   r   rx   r   ry   rz   r    r!   r"   r#   r{   periodsdiscriminator_paramsc                    sP   t    tj | _|D ]}t|}||d< |  jtdi |g7  _qdS )a  Initialize HiFiGANMultiPeriodDiscriminator module.

        Args:
            periods (List[int]): List of periods.
            discriminator_params (Dict[str, Any]): Parameters for hifi-gan period
                discriminator module. The period parameter will be overwritten.

        rw   Nr)   )	r*   r+   r3   r4   r7   discriminatorscopydeepcopyrs   )rC   r   r   rw   paramsrF   r)   rH   r+   h  s   

z(HiFiGANMultiPeriodDiscriminator.__init__r   rK   c                 C   s"   g }| j D ]	}|||g7 }q|S )zCalculate forward propagation.

        Args:
            x (Tensor): Input noise signal (B, 1, T).

        Returns:
            List: List of list of each discriminator outputs, which consists of each
                layer output tensors.

        )r   rC   r   r   fr)   r)   rH   rN     s   
z'HiFiGANMultiPeriodDiscriminator.forward)rk   rl   rm   rn   r   r-   r   rp   r   r+   r3   rq   rN   rr   r)   r)   rF   rH   r   e  s*    
 r   c                       s   e Zd ZdZddg dddddg dd	d
diddfdededee dededededee dedeeef de	de	f fddZ
dejdeej fddZdd  Zd!d" Z  ZS )#HiFiGANScaleDiscriminatorz$HiFi-GAN scale discriminator module.r	      )   r   r      rv   r   Tr   r   r   r   r	   r   r   r   Fr   r   rx   r   rz   
max_groupsr    ry   r!   r"   r#   r{   c                    s  t    tj | _t|dksJ |D ]
}|d dksJ q|  jtjtjj|||d ||d d d dt	tj|	di |
g7  _|}|}d}|D ];}|  jtjtjj|||d ||d d d ||dt	tj|	di |
g7  _|}t
|d |}t
|d |}qOt
|d |}|  jtjtjj|||d d|d d d |dt	tj|	di |
g7  _|  jtjj|||d d|d d d |dg7  _|r|rtd	|r|   |r|   d
S d
S )a  Initilize HiFiGAN scale discriminator module.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_sizes (List[int]): List of four kernel sizes. The first will be used
                for the first conv layer, and the second is for downsampling part, and
                the remaining two are for the last two output layers.
            channels (int): Initial number of channels for conv layer.
            max_downsample_channels (int): Maximum number of channels for downsampling
                layers.
            bias (bool): Whether to add bias parameter in convolution layers.
            downsample_scales (List[int]): List of downsampling scales.
            nonlinear_activation (str): Activation function module name.
            nonlinear_activation_params (Dict[str, Any]): Hyperparameters for activation
                function.
            use_weight_norm (bool): Whether to use weight norm. If set to true, it will
                be applied to all of the conv layers.
            use_spectral_norm (bool): Whether to use spectral norm. If set to true, it
                will be applied to all of the conv layers.

        r   r   r	   r   )r    r&   )r   strider&   groupsr    )r   r   r&   r    r   r|   Nr)   )r*   r+   r3   r4   r7   layersr,   r;   r5   r<   r   r_   rA   r   )rC   r   r   rx   r   rz   r   r    ry   r!   r"   r#   r{   ksr   r   r   r   rF   r)   rH   r+     s   
%	z"HiFiGANScaleDiscriminator.__init__r   rK   c                 C   s&   g }| j D ]}||}||g7 }q|S )zCalculate forward propagation.

        Args:
            x (Tensor): Input noise signal (B, 1, T).

        Returns:
            List[Tensor]: List of output tensors of each layer.

        )r   r   r)   r)   rH   rN     s
   

z!HiFiGANScaleDiscriminator.forwardc                 C   rO   )rb   rP   c                 S   r   rc   r   rX   r)   r)   rH   re   .  r   zGHiFiGANScaleDiscriminator.apply_weight_norm.<locals>._apply_weight_normNrZ   rf   r)   r)   rH   rA   +  r   z+HiFiGANScaleDiscriminator.apply_weight_normc                 C   rO   )r   rP   c                 S   r   r   r   rX   r)   r)   rH   r   8  r   zKHiFiGANScaleDiscriminator.apply_spectral_norm.<locals>._apply_spectral_normNrZ   r   r)   r)   rH   r   5  r   z-HiFiGANScaleDiscriminator.apply_spectral_norm)rk   rl   rm   rn   r-   r   rp   r   r   ro   r+   r3   rq   rN   rA   r   rr   r)   r)   rF   rH   r     sT    	

}
r   c                       s   e Zd ZdZddddddddg dd	d
ddg ddddid
dfdededeeef deeef def
 fddZ	de
jdeee
j  fddZ  ZS )HiFiGANMultiScaleDiscriminatorz*HiFi-GAN multi-scale discriminator module.r   	AvgPool1dr   r   r   r   r&   r	   r   r   rv   r   Tr   r   r   r   
r   r   rx   r   rz   r   r    ry   r!   r"   Fscalesdownsample_poolingdownsample_pooling_paramsr   follow_official_normc                    s   t    tj | _t|D ]+}t|}|r-|dkr%d|d< d|d< nd|d< d|d< |  jt	di |g7  _qd| _
|dkrPttj|di || _
dS dS )	a  Initilize HiFiGAN multi-scale discriminator module.

        Args:
            scales (int): Number of multi-scales.
            downsample_pooling (str): Pooling module name for downsampling of the
                inputs.
            downsample_pooling_params (Dict[str, Any]): Parameters for the above pooling
                module.
            discriminator_params (Dict[str, Any]): Parameters for hifi-gan scale
                discriminator module.
            follow_official_norm (bool): Whether to follow the norm setting of the
                official implementaion. The first discriminator uses spectral norm
                and the other discriminators use weight norm.

        r   Fr#   Tr{   Nr	   r)   )r*   r+   r3   r4   r7   r   r:   r   r   r   poolingr<   )rC   r   r   r   r   r   rD   r   rF   r)   rH   r+   C  s"   
'

z'HiFiGANMultiScaleDiscriminator.__init__r   rK   c                 C   s6   g }| j D ]}|||g7 }| jdur| |}q|S )a  Calculate forward propagation.

        Args:
            x (Tensor): Input noise signal (B, 1, T).

        Returns:
            List[List[torch.Tensor]]: List of list of each discriminator outputs,
                which consists of eachlayer output tensors.

        N)r   r   r   r)   r)   rH   rN   ~  s   


z&HiFiGANMultiScaleDiscriminator.forward)rk   rl   rm   rn   r-   rp   r   r   ro   r+   r3   rq   r   rN   rr   r)   r)   rF   rH   r   @  s@    


&;r   c                       s   e Zd ZdZddddddddg dd	d
ddg ddddid
dg dddddgdg dd
ddddidddfdededeeef deeef dede	e deeef f fdd Z
d!ejd"e	e	ej  fd#d$Z  ZS )%)HiFiGANMultiScaleMultiPeriodDiscriminatorz9HiFi-GAN multi-scale + multi-period discriminator module.r   r   r   r   r   r	   r   r   rv   r   Tr   r   r   r   r   r   r   rt   ru   Fr   r   scale_downsample_poolingscale_downsample_pooling_paramsscale_discriminator_paramsr   r   period_discriminator_paramsc                    s0   t    t|||||d| _t||d| _dS )a  Initilize HiFiGAN multi-scale + multi-period discriminator module.

        Args:
            scales (int): Number of multi-scales.
            scale_downsample_pooling (str): Pooling module name for downsampling of the
                inputs.
            scale_downsample_pooling_params (dict): Parameters for the above pooling
                module.
            scale_discriminator_params (dict): Parameters for hifi-gan scale
                discriminator module.
            follow_official_norm (bool): Whether to follow the norm setting of the
                official implementaion. The first discriminator uses spectral norm and
                the other discriminators use weight norm.
            periods (list): List of periods.
            period_discriminator_params (dict): Parameters for hifi-gan period
                discriminator module. The period parameter will be overwritten.

        )r   r   r   r   r   )r   r   N)r*   r+   r   msdr   mpd)rC   r   r   r   r   r   r   r   rF   r)   rH   r+     s   
9z2HiFiGANMultiScaleMultiPeriodDiscriminator.__init__r   rK   c                 C   s   |  |}| |}|| S )aL  Calculate forward propagation.

        Args:
            x (Tensor): Input noise signal (B, 1, T).

        Returns:
            List[List[Tensor]]: List of list of each discriminator outputs,
                which consists of each layer output tensors. Multi scale and
                multi period ones are concatenated.

        )r   r   )rC   r   msd_outsmpd_outsr)   r)   rH   rN     s   

z1HiFiGANMultiScaleMultiPeriodDiscriminator.forward)rk   rl   rm   rn   r-   rp   r   r   ro   r   r+   r3   rq   rN   rr   r)   r)   rF   rH   r     sb    



&Fr   )rn   r   rV   typingr   r   r   r   numpyr.   r3   torch.nn.functionalr4   
functionalr   &espnet2.gan_tts.hifigan.residual_blockr   r[   r   rs   r   r   r   r   r)   r)   r)   rH   <module>   s"    H 
5 'R