o
    }oiw[                     @   s   d dl Z d dlmZ d dlm  mZ d dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d dlmZ G dd	 d	e jjZG d
d de jjZG dd deZG dd deZG dd deZG dd deZG dd deZdS )    N)Conv2d)remove_weight_normspectral_normweight_norm)	typecheck)NeuralModule)AudioSignalMelSpectrogramTypeVoidType)
NeuralTypec                       sB   e Zd ZdZdddddddif fdd		Zd
d Zdd Z  ZS )KernelPredictorz8 Kernel predictor for the location-variable convolutions   @           	LeakyReLUnegative_slopeg?c                    sD  t    || _|| _|| _|| _|| | | }|| }ttj	tj
||ddddtt|	di |
| _t | _|d d }tdD ]:}| jtt|tj	tj
||||ddtt|	di |
tj	tj
||||ddtt|	di |
 qGtj	tj
||||dd| _tj	tj
||||dd| _dS )	a7  
        Args:
            cond_channels (int): number of channel for the conditioning sequence,
            conv_in_channels (int): number of channel for the input sequence,
            conv_out_channels (int): number of channel for the output sequence,
            conv_layers (int): number of layers
              T)paddingbias   r   N )super__init__conv_in_channelsconv_out_channelsconv_kernel_sizeconv_layersnn
Sequentialutilsr   Conv1dgetattr
input_conv
ModuleListresidual_convsrangeappendDropoutkernel_conv	bias_conv)selfcond_channelsr   r   r   r   kpnet_hidden_channelskpnet_conv_sizekpnet_dropoutkpnet_nonlinear_activation!kpnet_nonlinear_activation_paramskpnet_kernel_channelskpnet_bias_channelsr   _	__class__r   `/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/modules/univnet_modules.pyr   @   sJ   




zKernelPredictor.__init__c           
      C   s   |j \}}}| |}| jD ]}||j ||| }q| |}| |}| || j	| j
| j| j|}| || j	| j|}	||	fS )zm
        Args:
            c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
        )shaper#   r%   todevicer)   r*   
contiguousviewr   r   r   r   )
r+   cbatchr4   cond_lengthresidual_convkbkernelsr   r   r   r7   forward~   s   



zKernelPredictor.forwardc                 C   sJ   t | jd  t | j t | j | jD ]}t |d  t |d  qd S )Nr   r   r   )r   r#   r)   r*   r%   r+   blockr   r   r7   r      s   


z"KernelPredictor.remove_weight_norm)__name__
__module____qualname____doc__r   rD   r   __classcell__r   r   r5   r7   r   =   s    >r   c                       sN   e Zd ZdZg dddddddf fdd		Zd
d ZdddZdd Z  ZS )LVCBlockz"the location-variable convolutions)r   r   	      g?r      r   r   c                    s   t    || _t|| _|| _t||d| t||||	|
d|id	| _t	t
|tjtj||d| ||d |d  |d d| _t | _|D ]%}| jt	t
|tjtj|||||d  d |dt
| qLd S )Nr   r   )	r,   r   r   r   r   r-   r.   r/   r1   )strider   output_paddingr   )r   dilation)r   r   cond_hop_lengthlenr   r   r   kernel_predictorr   r   r   r    r   ConvTranspose1d	convt_prer$   conv_blocksr'   r!   )r+   in_channelsr,   rP   	dilationslReLU_sloper   rS   r-   r.   r/   rR   r5   r   r7   r      s\   


	zLVCBlock.__init__c              
   C   s   |j \}}}| |}| |\}}t| jD ]V\}}||}	|dd|ddddddddf }
|dd|ddddf }| j|	|
|| jd}	|t|	ddd|ddf t	|	dd|dddf   }q|S )aM   forward propagation of the location-variable convolutions.
        Args:
            x (Tensor): the input sequence (batch, in_channels, in_length)
            c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)

        Returns:
            Tensor: the output sequence (batch, in_channels, in_length)
        N)hop_size)
r8   rW   rU   	enumeraterX   location_variable_convolutionrS   torchsigmoidtanh)r+   xr=   r4   rY   rC   r   iconvoutputrA   rB   r   r   r7   rD      s   	
(
$
zLVCBlock.forwardr   c                 C   s,  |j \}}}|j \}}}	}
}||| ksJ d|t|
d d  }t|||fdd}|d|d|  |}||k rEt|d|fdd}|d||}|ddddddddd|f }|dd}|d|
d}td	||}|jtj	d
}|
d
djtj	d
}|| }| ||	d}|S )a   perform location-variable convolution operation on the input sequence (x) using the local convolution kernel
        Args:
            x (Tensor): the input sequence (batch, in_channels, in_length).
            kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length)
            bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length)
            dilation (int): the dilation of convolution.
            hop_size (int): the hop_size of the conditioning sequence.
        Returns:
            (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length).
        z$length of (x, kernel) is not matchedr   r   constantr   r   N   zbildsk,biokl->bolsd)memory_format)r8   intFpadunfold	transposer_   einsumr9   channels_last_3d	unsqueezer;   r<   )r+   rb   kernelr   rR   r\   r>   r4   	in_lengthout_channelskernel_sizekernel_lengthr   or   r   r7   r^      s(   &z&LVCBlock.location_variable_convolutionc                 C   s4   | j   t| jd  | jD ]}t|d  qd S )Nr   )rU   r   rW   rX   rE   r   r   r7   r     s
   

zLVCBlock.remove_weight_norm)r   rO   )	rG   rH   rI   rJ   r   rD   r^   r   rK   r   r   r5   r7   rL      s    ?
$rL   c                       sX   e Zd Zg dZ		d fdd	Zedd Zedd	 Ze d
d Z	dd Z
  ZS )	Generator)lrelu_slopenum_kernelsnum_upsamplesP   rO   c	                    s  t t|   || _|| _|| _|| _|| _|| _|| _	|| _
t | _d}	| jD ]}
|
|	 }	| jt| j| j	|
| j| j|	| jd q)|	| j
ksSJ d| j| j
tjtj| j| jdddd| _tt| jtjtj| jdddddt | _d S )Nr   )rP   rZ   r[   rS   r.   z>multiplied value of strides {} should match n_window_stride {}   r   reflect)r   padding_mode)r   rx   r   	noise_dimchannel_sizerZ   stridesry   r.   mel_channel
hop_lengthr   r$   	res_stackr'   rL   formatr    r   r!   conv_prer   r   Tanh	conv_post)r+   r   r   rZ   r   ry   r.   n_mel_channelsr   hop_length_lvcrP   r5   r   r7   r   '  sF   



zGenerator.__init__c                 C      dt dt iS )Nrb   )BDT)r   r	   r+   r   r   r7   input_types[     zGenerator.input_typesc                 C   r   )Naudior   Sr   r   r   r   r   r   r7   output_typesa  r   zGenerator.output_typesc                 C   sT   t j|d| j|d|j|jd}| |}| jD ]}|||}q| |}|S )Nr   r   )dtyper:   )	r_   randnsizer   r   r:   r   r   r   )r+   rb   z	res_blockr   r   r7   rD   g  s   &


zGenerator.forwardc                 C   sN   t d t| j | jD ]}t| dkrt| q| jD ]}|  qd S )NzRemoving weight norm...r   )printr   r   r   rT   
state_dictr   )r+   layerr   r   r   r7   r   t  s   



zGenerator.remove_weight_norm)r|   rO   )rG   rH   rI   __constants__r   propertyr   r   r   rD   r   rK   r   r   r5   r7   rx   $  s    
4


rx   c                       sD   e Zd Zd fdd	Zedd Zedd	 Ze d
d Z  Z	S )DiscriminatorPr   r   Fc           	         s:  t    || _|| _|dkrtnt}|sg dng d}t|td|d |df|df|d dfd|t|d |d |df|df|d dfd|t|d |d |df|df|d dfd|t|d |d |df|df|d dfd|t|d |d	 |dfd|d dfdg| _	|t|d	 dd
ddd| _
d S )NF)r      rO   i   i   )          r   r   r   r   r   r   r   rg   )r   r   )r   r   )r   r   ry   periodr   r   r   r$   r   convsr   )	r+   ry   r   ru   rP   use_spectral_normdebugnorm_fconv_chr5   r   r7   r     s   
(,,,( 	zDiscriminatorP.__init__c                 C   r   Nrb   r   r   r   r   r   r7   r     r   zDiscriminatorP.input_typesc                 C      t dt t dt gdS )Nr   r   r   CHWdecisionfeature_mapsr   r
   r   r   r   r7   r        
zDiscriminatorP.output_typesc                 C   s   g }|j \}}}|| j dkr$| j|| j  }t|d|fd}|| }||||| j | j}| jD ]}||}t|| j}|| q3| 	|}|| t
|dd}||fS )Nr   r~   r   ri   )r8   r   rk   rl   r<   r   
leaky_relury   r'   r   r_   flatten)r+   rb   fmaprB   r=   tn_padlr   r   r7   rD     s   


zDiscriminatorP.forward)r   r   FF
rG   rH   rI   r   r   r   r   r   rD   rK   r   r   r5   r7   r   ~  s    

r   c                       sD   e Zd Zd
 fdd	Zedd Zedd Ze dd	 Z  Z	S )MultiPeriodDiscriminatorFc                    s   t    |j| _|j| _t| jdksJ d|j|j| _|j| _|j| _t	
t| j| jd | j| j| j|dt| j| jd | j| j| j|dt| j| jd | j| j| j|dt| j| jd | j| j| j|dt| j| jd | j| j| j|dg| _d S )	Nr   z"MPD requires list of len=5, got {}r   )r   r   r   r   rg   )r   r   ry   periodsrT   r   ru   rP   r   r   r$   r   discriminatorsr+   cfgr   r5   r   r7   r     sd   

z!MultiPeriodDiscriminator.__init__c                 C      t dt t dt dS Nr   )yy_hatr   r   r   r   r7   r        

z$MultiPeriodDiscriminator.input_typesc                 C   :   t dt gt dt gt dt ggt dt ggdS )Nr   r   real_scoresfake_scoresreal_feature_mapsfake_feature_mapsr   r   r   r   r7   r     
   z%MultiPeriodDiscriminator.output_typesc                 C   t   g }g }g }g }t | jD ]&\}}||d\}	}
||d\}}||	 ||
 || || q||||fS N)rb   r]   r   r'   r+   r   r   y_d_rsy_d_gsfmap_rsfmap_gsrc   dy_d_rfmap_ry_d_gfmap_gr   r   r7   rD     s   


z MultiPeriodDiscriminator.forwardFr   r   r   r5   r7   r     s    6

r   c                       sD   e Zd Z fddZedd Zedd Zdd Zd	d
 Z  Z	S )DiscriminatorRc                    s   t    || _t| jdksJ d| j|j| _|jdkr"tnt}t	
|t	jddddd|t	jdddd	dd
|t	jdddd	dd
|t	jdddd	dd
|t	jdddddg| _|t	jddddd| _d S )Nr   z*MRD layer requires list with len=3, got {}Fr   r   )r   rM   )r   rg   r   )r   r   )rP   r   )r   r   )r   r   )r   r   
resolutionrT   r   ry   r   r   r   r   r$   r   r   r   )r+   r   r   r   r5   r   r7   r     s   
	zDiscriminatorR.__init__c                 C   r   r   r   r   r   r   r7   r   !  r   zDiscriminatorR.input_typesc                 C   r   )Nr   r   r   r   r   r   r   r   r   r7   r   '  r   zDiscriminatorR.output_typesc                 C   sn   g }|  |}|d}| jD ]}||}t|| j}|| q| |}|| t	|dd}||fS )Nr   ri   )
spectrogramrq   r   rk   r   ry   r'   r   r_   r   )r+   rb   r   r   r   r   r7   rD   .  s   




zDiscriminatorR.forwardc              
   C   sr   | j \}}}tj|t|| d t|| d fdd}|d}ttj||||ddd}tj|ddd	}|S )
Nr   r~   )moder   FT)n_fftr   
win_lengthcenterreturn_complexri   )pdim)	r   rk   rl   rj   squeezer_   view_as_realstftnorm)r+   rb   r   r   r   magr   r   r7   r   =  s   ,
zDiscriminatorR.spectrogram)
rG   rH   rI   r   r   r   r   rD   r   rK   r   r   r5   r7   r     s    

r   c                       s>   e Zd Zd
 fdd	Zedd Zedd Zdd	 Z  ZS )MultiResolutionDiscriminatorFc                    sP   t     j| _t| jdksJ d| jt fdd| jD | _d S )Nr   zSMRD requires list of list with len=3, each element having a list with len=3. got {}c                    s   g | ]}t  |qS r   )r   ).0r   r   r   r7   
<listcomp>R  s    z9MultiResolutionDiscriminator.__init__.<locals>.<listcomp>)r   r   resolutionsrT   r   r   r$   r   r   r5   r   r7   r   J  s   
 z%MultiResolutionDiscriminator.__init__c                 C   r   r   r   r   r   r   r7   r   T  r   z(MultiResolutionDiscriminator.input_typesc                 C   r   )Nr   r   r   r   r   r   r   r7   r   [  r   z)MultiResolutionDiscriminator.output_typesc                 C   r   r   r   r   r   r   r7   rD   d  s   


z$MultiResolutionDiscriminator.forwardr   )	rG   rH   rI   r   r   r   r   rD   rK   r   r   r5   r7   r   I  s    


r   )r_   torch.nnr   torch.nn.functional
functionalrk   r   torch.nn.utilsr   r   r   nemo.core.classes.commonr   nemo.core.classes.moduler   nemo.core.neural_types.elementsr   r	   r
   "nemo.core.neural_types.neural_typer   Moduler   rL   rx   r   r   r   r   r   r   r   r7   <module>   s"   0] Z6X=