o
    ei                     @   s  d Z ddlZddlmZ ddlm  mZ ddlmZ ddl	Z
ddlmZmZmZ dZd9dd	Zd
d Zdd ZG dd dejjZG dd dejjZG dd dejjZG dd dejZG dd deZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG d d! d!ejZd:d#d$ZG d%d& d&ejZG d'd( d(ejjZ G d)d* d*ejZ!G d+d, d,ejZ"G d-d. d.ejZ#G d/d0 d0ejZ$d1d2 Z%d3d4 Z&G d5d6 d6ejZ'G d7d8 d8ejZ(dS );a  
Neural network modules for the HiFi-GAN: Generative Adversarial Networks for
Efficient and High Fidelity Speech Synthesis

For more details: https://arxiv.org/pdf/2010.05646.pdf, https://arxiv.org/abs/2406.10735

Authors
 * Jarod Duret 2021
 * Yingzhi WANG 2022
    N)
transforms)Conv1dConv2dConvTranspose1dg?   h㈵>c                 C   s   t t j| |d| S )z-Dynamique range compression for audio signalsmin)torchlogclamp)xCclip_val r   ^/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/HifiGAN.pydynamic_range_compression.   s   r   c                 C   s@   t j| |||||||||	|
d|j}||}|rt|}|S )a  calculates MelSpectrogram for a raw audio signal

    Arguments
    ---------
    sample_rate : int
        Sample rate of audio signal.
    hop_length : int
        Length of hop between STFT windows.
    win_length : int
        Window size.
    n_fft : int
        Size of FFT.
    n_mels : int
        Number of mel filterbanks.
    f_min : float
        Minimum frequency.
    f_max : float
        Maximum frequency.
    power : float
        Exponent for the magnitude spectrogram.
    normalized : bool
        Whether to normalize by magnitude after stft.
    norm : str or None
        If "slaney", divide the triangular mel weights by the width of the mel band
    mel_scale : str
        Scale to use: "htk" or "slaney".
    compression : bool
        whether to do dynamic range compression
    audio : torch.tensor
        input audio signal

    Returns
    -------
    Mel spectrogram
    )sample_rate
hop_length
win_lengthn_fftn_melsf_minf_maxpower
normalizednorm	mel_scale)r   MelSpectrogramtodevicer   )r   r   r   r   r   r   r   r   r   r   r   compressionaudioaudio_to_melmelr   r   r   mel_spectogram3   s&   3r%   c              	   C   sb  g }g }t | dD ]K}tj| |ddf dd\}}t|dkr6||dd  |jdddd	 }n|| |jddd }||||ddf d|d qt|}t	d
d |D }|d 
t|||d df}	t|t|d}
t|D ]"\}}||	|d|df< |
|ddf |dk |
|ddf< q|	|
 | fS )a  
    Process a given batch of code to extract consecutive unique elements and their associated features.

    Arguments
    ---------
    code : torch.Tensor (batch, time)
        Tensor of code indices.
    code_feat : torch.Tensor (batch, time, channel)
        Tensor of code features.

    Returns
    -------
    uniq_code_feat_filtered : torch.Tensor (batch, time)
        Features of consecutive unique codes.
    mask : torch.Tensor (batch, time)
        Padding mask for the unique codes.
    uniq_code_count : torch.Tensor (n)
        Count of unique codes.

    Example
    -------
    >>> code = torch.IntTensor([[40, 18, 18, 10]])
    >>> code_feat = torch.rand([1, 4, 128])
    >>> out_tensor, mask, uniq_code = process_duration(code, code_feat)
    >>> out_tensor.shape
    torch.Size([1, 1, 128])
    >>> mask.shape
    torch.Size([1, 1])
    >>> uniq_code.shape
    torch.Size([1])
    r   NT)return_counts   r   dimc                 s   s    | ]}| d V  qdS )r   N)size).0featr   r   r   	<genexpr>   s    z#process_duration.<locals>.<genexpr>)ranger,   r
   unique_consecutivelenappendcumsumviewcatmax	new_zerosarangerepeat	enumerateboolfloat)code	code_featuniq_code_countuniq_code_feati_countuniq_code_idxmax_lenuniq_code_feat_filteredmaskvr   r   r   process_duration|   s,    

(rJ   c                       2   e Zd ZdZd
 fdd	Zdd Zdd	 Z  ZS )	ResBlock1ak  
    Residual Block Type 1, which has 3 convolutional layers in each convolution block.

    Arguments
    ---------
    channels : int
        number of hidden channels for the convolutional layers.
    kernel_size : int
        size of the convolution filter in each layer.
    dilation : list
        list of dilation value for each conv layer in a block.
       r   rM      c                    s   t    tt|||d|d ddddt|||d|d ddddt|||d|d ddddg| _tt|||ddddddt|||ddddddt|||ddddddg| _d S )Nr   r   sameTin_channelsout_channelskernel_sizestridedilationpaddingskip_transposeweight_normr'   )super__init__nn
ModuleListr   convs1convs2selfchannelsrT   rV   	__class__r   r   r[      s   


#


zResBlock1.__init__c                 C   sL   t | j| jD ]\}}t|t}||}t|t}||}|| }q|S zReturns the output of ResBlock1

        Arguments
        ---------
        x : torch.Tensor (batch, channel, time)
            input tensor.

        Returns
        -------
        The ResBlock outputs
        )zipr^   r_   F
leaky_reluLRELU_SLOPE)ra   r   c1c2xtr   r   r   forward  s   
zResBlock1.forwardc                 C   s,   | j D ]}|  q| jD ]}|  qdS z=This functions removes weight normalization during inference.N)r^   remove_weight_normr_   ra   layerr   r   r   ro   )  s
   



zResBlock1.remove_weight_norm)rM   rN   __name__
__module____qualname____doc__r[   rm   ro   __classcell__r   r   rc   r   rL      s
    HrL   c                       rK   )	ResBlock2ak  
    Residual Block Type 2, which has 2 convolutional layers in each convolution block.

    Arguments
    ---------
    channels : int
        number of hidden channels for the convolutional layers.
    kernel_size : int
        size of the convolution filter in each layer.
    dilation : list
        list of dilation value for each conv layer in a block.
    rM   r   rM   c                    sN   t    tt|||d|d ddddt|||d|d ddddg| _d S )Nr   r   rP   TrQ   )rZ   r[   r\   r]   r   convsr`   rc   r   r   r[   ?  s0   


zResBlock2.__init__c                 C   s,   | j D ]}t|t}||}|| }q|S re   )rz   rg   rh   ri   )ra   r   crl   r   r   r   rm   Z  s
   

zResBlock2.forwardc                 C   s   | j D ]}|  qdS rn   )rz   ro   rp   r   r   r   ro   m  s   

zResBlock2.remove_weight_norm)rM   ry   rr   r   r   rc   r   rx   1  s
    rx   c                       sL   e Zd ZdZ			d fdd	Zddd	Zd
d Ze dddZ	  Z
S )HifiganGeneratora  HiFiGAN Generator with Multi-Receptive Field Fusion (MRF)

    Arguments
    ---------
    in_channels : int
        number of input tensor channels.
    out_channels : int
        number of output tensor channels.
    resblock_type : str
        type of the `ResBlock`. '1' or '2'.
    resblock_dilation_sizes : List[List[int]]
        list of dilation values in each layer of a `ResBlock`.
    resblock_kernel_sizes : List[int]
        list of kernel sizes for each `ResBlock`.
    upsample_kernel_sizes : List[int]
        list of kernel sizes for each transposed convolution.
    upsample_initial_channel : int
        number of channels for the first upsampling layer. This is divided by 2
        for each consecutive upsampling layer.
    upsample_factors : List[int]
        upsampling factors (stride) for each upsampling layer.
    inference_padding : int
       constant padding applied to the input at inference time. Defaults to 5.
    cond_channels : int
        If provided, adds a conv layer to the beginning of the forward.
    conv_post_bias : bool
        Whether to add a bias term to the final conv.

    Example
    -------
    >>> inp_tensor = torch.rand([4, 80, 33])
    >>> hifigan_generator= HifiganGenerator(
    ...    in_channels = 80,
    ...    out_channels = 1,
    ...    resblock_type = "1",
    ...    resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    ...    resblock_kernel_sizes = [3, 7, 11],
    ...    upsample_kernel_sizes = [16, 16, 4, 4],
    ...    upsample_initial_channel = 512,
    ...    upsample_factors = [8, 8, 2, 2],
    ... )
    >>> out_tensor = hifigan_generator(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 1, 8448])
    rO   r   Tc                    sD  t    |	| _t|| _t|| _t||dddddd| _|dkr$tnt	}t
 | _tt||D ]#\}\}}| jt|d|  |d|d   |||| d ddd q2t
 | _tt| jD ]"}|d|d   }tt||D ]\}\}}| j|||| qsqbt|ddddd|dd| _|
d	krt|
|dd
| _d S d S )N   r   rP   TrR   rS   rT   rU   rW   rX   rY   1r'   )rR   rS   rT   rU   rW   rX   biasrY   r   )rR   rS   rT   )rZ   r[   inference_paddingr2   num_kernelsnum_upsamplesr   conv_prerL   rx   r\   r]   upsr;   rf   r3   r   	resblocksr0   	conv_post
cond_layer)ra   rR   rS   resblock_typeresblock_dilation_sizesresblock_kernel_sizesupsample_kernel_sizesupsample_initial_channelupsample_factorsr   cond_channelsconv_post_biasresblockrB   ukchrC   drc   r   r   r[     sl   


	




zHifiganGenerator.__init__Nc                 C   s   |  |}t| dr|| | }t| jD ]=}t|t}| j| |}d}t| j	D ]!}|du r?| j
|| j	 |  |}q,|| j
|| j	 |  |7 }q,|| j	 }qt|}| |}t|}|S )a  
        Arguments
        ---------
        x : torch.Tensor (batch, channel, time)
            feature input tensor.
        g : torch.Tensor (batch, 1, time)
            global conditioning input tensor.

        Returns
        -------
        The generator outputs
        r   N)r   hasattrr   r0   r   rg   rh   ri   r   r   r   r   r
   tanh)ra   r   gorB   z_sumjr   r   r   rm     s    




zHifiganGenerator.forwardc                 C   s@   | j D ]}|  q| jD ]}|  q| j  | j  dS rn   )r   ro   r   r   r   rp   r   r   r   ro   	  s   




z#HifiganGenerator.remove_weight_normc                 C   s(   |rt jj|| j| jfd}| |S )aN  The inference function performs a padding and runs the forward method.

        Arguments
        ---------
        c : torch.Tensor (batch, channel, time)
            feature input tensor.
        padding : bool
            Whether to pad tensor before forward.

        Returns
        -------
        The generator outputs
        	replicate)r
   r\   
functionalpadr   rm   )ra   r{   rW   r   r   r   	inference  s
   
zHifiganGenerator.inference)rO   r   TN)T)rs   rt   ru   rv   r[   rm   ro   r
   no_gradr   rw   r   r   rc   r   r|   s  s    8
G 
r|   c                       (   e Zd ZdZ fddZdd Z  ZS )VariancePredictora  Variance predictor inspired from FastSpeech2

    Arguments
    ---------
    encoder_embed_dim : int
        number of input tensor channels.
    var_pred_hidden_dim : int
        size of hidden channels for the convolutional layers.
    var_pred_kernel_size : int
        size of the convolution filter in each layer.
    var_pred_dropout : float
        dropout probability of each layer.

    Example
    -------
    >>> inp_tensor = torch.rand([4, 80, 128])
    >>> duration_predictor = VariancePredictor(
    ...    encoder_embed_dim = 128,
    ...    var_pred_hidden_dim = 128,
    ...    var_pred_kernel_size = 3,
    ...    var_pred_dropout = 0.5,
    ... )
    >>> out_tensor = duration_predictor (inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 80])
    c              
      sf   t    tt|||ddddt | _|| _tt|||ddddt | _t	|d| _
d S )NrP   T)rR   rS   rT   rW   rX   rY   r   )rZ   r[   r\   
Sequentialr   ReLUconv1dropoutconv2Linearproj)ra   encoder_embed_dimvar_pred_hidden_dimvar_pred_kernel_sizevar_pred_dropoutrc   r   r   r[   E  s2   
zVariancePredictor.__init__c                 C   sn   |  |dddd}tj|| j| jd}| |dddd}tj|| j| jd}| |jddS )z
        Arguments
        ---------
        x : torch.Tensor (batch, channel, time)
            feature input tensor.

        Returns
        -------
        Variance predictor output
        r   r'   )ptrainingr)   )r   	transposerg   r   r   r   r   squeeze)ra   r   r   r   r   rm   f  s
   zVariancePredictor.forwardrs   rt   ru   rv   r[   rm   rw   r   r   rc   r   r   )  s    !r   c                       sn   e Zd ZdZ															
d fdd	Zedd Zd fdd	Ze	 d fdd	Z
  ZS )UnitHifiganGeneratora  The UnitHiFiGAN generator takes discrete speech tokens as input.
    The generator is adapted to support bitrate scalability training.
    For more details, refer to: https://arxiv.org/abs/2406.10735.

    Arguments
    ---------
    in_channels : int
        number of input tensor channels.
    out_channels : int
        number of output tensor channels.
    resblock_type : str
        type of the `ResBlock`. '1' or '2'.
    resblock_dilation_sizes : List[List[int]]
        list of dilation values in each layer of a `ResBlock`.
    resblock_kernel_sizes : List[int]
        list of kernel sizes for each `ResBlock`.
    upsample_kernel_sizes : List[int]
        list of kernel sizes for each transposed convolution.
    upsample_initial_channel : int
        number of channels for the first upsampling layer. This is divided by 2
        for each consecutive upsampling layer.
    upsample_factors : List[int]
        upsampling factors (stride) for each upsampling layer.
    inference_padding : int
        constant padding applied to the input at inference time. Defaults to 5.
    cond_channels : int
        Whether to add a conv to the front
    conv_post_bias : bool
        Whether to add a bias to the last conv
    vocab_size : int
        size of the dictionary of embeddings.
    embedding_dim : int
        size of each embedding vector.
    attn_dim : int
        size of attention dimension.
    duration_predictor : bool
        enable duration predictor module.
    var_pred_hidden_dim : int
        size of hidden channels for the convolutional layers of the duration predictor.
    var_pred_kernel_size : int
        size of the convolution filter in each layer of the duration predictor.
    var_pred_dropout : float
        dropout probability of each layer in the duration predictor.
    multi_speaker : bool
        enable multi speaker training.
    normalize_speaker_embeddings: bool
        enable normalization of speaker embeddings.
    skip_token_embedding: bool
        Whether to skip the embedding layer in the case of continuous input.
    pooling_type: str, optional
        The type of pooling to use. Must be one of ["attention", "sum", "none"].
        Defaults to "attention" for scalable vocoder.

    Example
    -------
    >>> inp_tensor = torch.randint(0, 100, (4, 10, 1))
    >>> unit_hifigan_generator= UnitHifiganGenerator(
    ...    in_channels = 128,
    ...    out_channels = 1,
    ...    resblock_type = "1",
    ...    resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    ...    resblock_kernel_sizes = [3, 7, 11],
    ...    upsample_kernel_sizes = [11, 8, 8, 4, 4],
    ...    upsample_initial_channel = 512,
    ...    upsample_factors = [5, 4, 4, 2, 2],
    ...    vocab_size = 100,
    ...    embedding_dim = 128,
    ...    duration_predictor = True,
    ...    var_pred_hidden_dim = 128,
    ...    var_pred_kernel_size = 3,
    ...    var_pred_dropout = 0.5,
    ... )
    >>> out_tensor, _ = unit_hifigan_generator(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 1, 3200])
    rO   r   Td      FrM         ?	attentionc                    s   t  |||||||||	|
| tj||| _|| _|dkr7tjtj||tj	 tjj|ddd| _
|| _|rDt||||| _|| _|| _|| _d S )Nr   r   F)r   )rZ   r[   r
   r\   	Embeddingunit_embeddingpooling_typer   r   r   attn_poolingduration_predictorr   var_predictormulti_speakernormalize_speaker_embeddingsskip_token_embedding)ra   rR   rS   r   r   r   r   r   r   r   r   r   
vocab_sizeembedding_dimattn_dimr   r   r   r   r   r   r   r   rc   r   r   r[     s@   
zUnitHifiganGenerator.__init__c                 C   s:   |   \}}}| dddd|| } | |||} | S )zO
        Upsamples the input tensor to match the specified max_frames.
        rM   r   )r,   	unsqueezer:   r5   )r   
max_framesbatch
hidden_dimcond_lengthr   r   r   	_upsample  s   zUnitHifiganGenerator._upsampleNc                    sT  | j r|}n| |}|j\}}}}||| ||}	| jdkr8| |	}
tj|
dd}|	| }tj	|dd}n| jdkrEtj	|	dd}n| jdkrL|	}||||}|
dd}d}d}| jr|t||
dd\}}}| |}|| }t|d }| jr| jrtjj|}|d}| ||jd }tj||gdd}t |||ffS )	aK  
        Arguments
        ---------
        x : torch.Tensor (batch, time, channel)
            feature input tensor.
        g : torch.Tensor (batch, 1, time)
            global conditioning input tensor.
        spk : torch.Tensor
            Speaker embeddings

        Returns
        -------
        Generator output
        r   r   r)   sumnoner'   Nr(   )r   r   shaper5   r   r   rg   softmaxr
   r   r   r   rJ   r   r   r   r   r\   r   	normalizer   r   r6   rZ   rm   )ra   r   r   spkr   
batch_sizetimechannelemb_sizeu_attn_scoresattn_weights
u_weightedu_pooledlog_durlog_dur_predrA   uniq_code_maskdurrc   r   r   rm     s@   







zUnitHifiganGenerator.forwardc                    sf  | j s| |}|j\}}}}||| ||}| jdkr5| |}tj|dd}	||	 }
tj	|
dd}n| jdkrBtj	|dd}n| jdkrI|}||||}|
dd}| jr|ddksdJ d| |
dd}tjtt|d  dd	}tj||d
dd}| jr| jrtjj|}|d
}| ||jd
 }tj||gdd}t |S )aD  The inference function performs duration prediction and runs the forward method.

        Arguments
        ---------
        x : torch.Tensor (batch, time, channel)
            feature input tensor.
        spk : torch.Tensor
            Speaker embeddings

        Returns
        -------
        Generator output
        r   r   r)   r   r   r'   r   z-only support single sample batch in inferencer   r(   )r   r   r   r5   r   r   rg   r   r
   r   r   r   r,   r   r   roundexplongrepeat_interleaver   r   r\   r   r   r   r   r6   rZ   rm   )ra   r   r   r   r   r   r   x_r   r   
x_weightedx_pooledr   dur_outrc   r   r   r   C  s>   





zUnitHifiganGenerator.inference)rO   r   Tr   r   r   Fr   rM   r   FFFr   )NNr   )rs   rt   ru   rv   r[   staticmethodr   rm   r
   r   r   rw   r   r   rc   r   r   x  s,    W;
	8r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	DiscriminatorPa  HiFiGAN Periodic Discriminator
    Takes every Pth value from the input waveform and applies a stack of convolutions.
    Note:
        if period is 2
        waveform = [1, 2, 3, 4, 5, 6 ...] --> [1, 3, 5 ... ] --> convs -> score, feat

    Arguments
    ---------
    period : int
       Take every a new value every `period`
    kernel_size : int
        Size of 1-d kernel for conv stack
    stride : int
        Stride of conv stack
    rO   rM   c                    s   t    || _ttdd|df|dfddddtdd|df|dfddddtdd|df|dfddddtdd|df|dfddddtdd|dfdddddg| _tddd	ddddd| _d S )
Nr       rP   Tr~   r         )rM   r   )rZ   r[   periodr\   r]   r   rz   r   )ra   r   rT   rU   rc   r   r   r[     sv   
				1zDiscriminatorP.__init__c                 C   s   g }|j \}}}|| j dkr$| j|| j  }t|d|fd}|| }||||| j | j}| jD ]}||}t|t}|| q3| 	|}|| t
|dd}||fS )
        Arguments
        ---------
        x : torch.Tensor (batch, 1, time)
            input waveform.

        Returns
        -------
        Scores and features
        r   reflectr   r(   )r   r   rg   r   r5   rz   rh   ri   r3   r   r
   flatten)ra   r   r.   br{   tn_padrq   r   r   r   rm     s   


zDiscriminatorP.forward)rO   rM   r   r   r   rc   r   r     s    ?r   c                       r   )MultiPeriodDiscriminatorzHiFiGAN Multi-Period Discriminator (MPD)
    Wrapper for the `PeriodDiscriminator` to apply it in different periods.
    Periods are suggested to be prime numbers to reduce the overlap between each discriminator.
    c                    s8   t    ttdtdtdtdtdg| _d S )Nr'   rM   rO   r}      )rZ   r[   r\   r]   r   discriminatorsra   rc   r   r   r[     s   

z!MultiPeriodDiscriminator.__init__c                 C   sD   g }g }t | jD ]\}}||\}}|| || q	||fS )zReturns Multi-Period Discriminator scores and features

        Arguments
        ---------
        x : torch.Tensor (batch, 1, time)
            input waveform.

        Returns
        -------
        Scores and features
        )r;   r   r3   )ra   r   scoresfeatsrC   r   scorer.   r   r   r   rm     s   
z MultiPeriodDiscriminator.forwardr   r   r   rc   r   r     s    r   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )DiscriminatorSaX  HiFiGAN Scale Discriminator.
    It is similar to `MelganDiscriminator` but with a specific architecture explained in the paper.
    SpeechBrain CNN wrappers are not used here because spectral_norm is not often used

    Arguments
    ---------
    use_spectral_norm : bool
        if `True` switch to spectral norm instead of weight norm.
    Fc                    s   t    |rtjjntjj}t|tjdddddd|tjdddddd	d
|tjdddddd	d
|tjdddddd	d
|tjdddddd	d
|tjdddddd	d
|tjddddddg| _|tjdddddd| _	d S )Nr   r      r}   rW   )   r'         )groupsrW         r   r   rO   rM   )
rZ   r[   r\   utilsspectral_normrY   r]   r   rz   r   )ra   use_spectral_normnorm_frc   r   r   r[   #  s    

zDiscriminatorS.__init__c                 C   sX   g }| j D ]}||}t|t}|| q| |}|| t|dd}||fS )r   r   r(   )rz   rg   rh   ri   r3   r   r
   r   )ra   r   r.   rq   r   r   r   rm   7  s   


zDiscriminatorS.forward)Fr   r   r   rc   r   r     s    
r   c                       r   )MultiScaleDiscriminatorzHiFiGAN Multi-Scale Discriminator.
    Similar to MultiScaleMelganDiscriminator but specially tailored for HiFiGAN as in the paper.
    c                    sR   t    ttddt t g| _ttjddddtjddddg| _d S )NT)r  r   r'   r   )rZ   r[   r\   r]   r   r   	AvgPool1d	meanpoolsr   rc   r   r   r[   S  s   

z MultiScaleDiscriminator.__init__c                 C   s^   g }g }t | jD ]!\}}|dkr| j|d  |}||\}}|| || q	||fS )r   r   r   )r;   r   r	  r3   )ra   r   r   r   rB   r   r   r.   r   r   r   rm   `  s   
zMultiScaleDiscriminator.forwardr   r   r   rc   r   r  N  s    r  c                       r   )HifiganDiscriminatora  HiFiGAN discriminator wrapping MPD and MSD.

    Example
    -------
    >>> inp_tensor = torch.rand([4, 1, 8192])
    >>> hifigan_discriminator= HifiganDiscriminator()
    >>> scores, feats = hifigan_discriminator(inp_tensor)
    >>> len(scores)
    8
    >>> len(feats)
    8

    c                    s   t    t | _t | _d S r   )rZ   r[   r   mpdr  msdr   rc   r   r   r[     s   
zHifiganDiscriminator.__init__c                 C   s,   |  |\}}| |\}}|| || fS )zReturns list of list of features from each layer of each discriminator.

        Arguments
        ---------
        x : torch.Tensor
            input waveform.

        Returns
        -------
        Features from each discriminator layer
        )r  r  )ra   r   r   r   scores_feats_r   r   r   rm     s   zHifiganDiscriminator.forwardr   r   r   rc   r   r
  w  s    r
  hann_windowc           	      C   sr   t | d|||}|dddddddf }|dddddddf }t t j|d |d  dd}|S )zHcomputes the Fourier transform of short overlapping windows of the inputr   Nr   r'   g:0yE>r   )r
   stftr   sqrtr   )	r   r   r   r   	window_fnr   MPSr   r   r   r    s    r  c                       r   )STFTLossa  STFT loss. Input generate and real waveforms are converted
    to spectrograms compared with L1 and Spectral convergence losses.
    It is from ParallelWaveGAN paper https://arxiv.org/pdf/1910.11480.pdf

    Arguments
    ---------
    n_fft : int
        size of Fourier transform.
    hop_length : int
        the distance between neighboring sliding window frames.
    win_length : int
        the size of window frame and STFT filter.
    c                    s    t    || _|| _|| _d S r   )rZ   r[   r   r   r   )ra   r   r   r   rc   r   r   r[     s   

zSTFTLoss.__init__c                 C   sh   t || j| j| j}t || j| j| j}tt|t|}tj|| ddtj|dd }||fS )a5  Returns magnitude loss and spectral convergence loss

        Arguments
        ---------
        y_hat : torch.tensor
            generated waveform tensor
        y : torch.tensor
            real waveform tensor

        Returns
        -------
        Magnitude loss and spectral convergence loss
        fro)r   )	r  r   r   r   rg   l1_lossr
   r   r   )ra   y_hatyy_hat_My_Mloss_magloss_scr   r   r   rm     s
    zSTFTLoss.forwardr   r   r   rc   r   r    s    r  c                       s0   e Zd ZdZ			d	 fdd	Zdd Z  ZS )
MultiScaleSTFTLosszMulti-scale STFT loss. Input generate and real waveforms are converted
    to spectrograms compared with L1 and Spectral convergence losses.
    It is from ParallelWaveGAN paper https://arxiv.org/pdf/1910.11480.pdfr   i   r   x      2   iX  i  r#  c                    sF   t    tj | _t|||D ]\}}}| jt||| qd S r   )	rZ   r[   r
   r\   r]   
loss_funcsrf   r3   r  )ra   n_fftshop_lengthswin_lengthsr   r   r   rc   r   r   r[     s   
zMultiScaleSTFTLoss.__init__c           	      C   sT   t | j}d}d}| jD ]}|||\}}||7 }||7 }q|| }|| }||fS )aA  Returns multi-scale magnitude loss and spectral convergence loss

        Arguments
        ---------
        y_hat : torch.tensor
            generated waveform tensor
        y : torch.tensor
            real waveform tensor

        Returns
        -------
        Magnitude loss and spectral convergence loss
        r   )r2   r&  )	ra   r  r  Nr  r  flmlscr   r   r   rm     s   


zMultiScaleSTFTLoss.forward)r   r!  r%  r   r   r   rc   r   r    s    r  c                       sD   e Zd ZdZ										
				d fdd	Zdd Z  ZS )
L1SpecLossa  L1 Loss over Spectrograms as described in HiFiGAN paper https://arxiv.org/pdf/2010.05646.pdf
    Note : L1 loss helps leaning details compared with L2 loss

    Arguments
    ---------
    sample_rate : int
        Sample rate of audio signal.
    hop_length : int
        Length of hop between STFT windows.
    win_length : int
        Window size.
    n_mel_channels : int
        Number of mel filterbanks.
    n_fft : int
        Size of FFT.
    n_stft : int
        Size of STFT.
    mel_fmin : float
        Minimum frequency.
    mel_fmax : float
        Maximum frequency.
    mel_normalized : bool
        Whether to normalize by magnitude after stft.
    power : float
        Exponent for the magnitude spectrogram.
    norm : str or None
        If "slaney", divide the triangular mel weights by the width of the mel band
    mel_scale : str
        Scale to use: "htk" or "slaney".
    dynamic_range_compression : bool
        whether to do dynamic range compression
    "V  r     P   r                  @@F      ?slaneyTc                    sd   t    || _|| _|| _|| _|| _|d d | _|| _|| _	|	| _
|
| _|| _|| _|| _d S )Nr'   r   )rZ   r[   r   r   r   n_mel_channelsr   n_stftmel_fminmel_fmaxmel_normalizedr   r   r   r   )ra   r   r   r   r7  r   r8  r9  r:  r;  r   r   r   r   rc   r   r   r[   ,  s   

zL1SpecLoss.__init__c                 C   s   t | j| j| j| j| j| j| j| j| j	| j
| j| j|}t | j| j| j| j| j| j| j| j| j	| j
| j| j|}t||}|S )zReturns L1 Loss over Spectrograms

        Arguments
        ---------
        y_hat : torch.tensor
            generated waveform tensor
        y : torch.tensor
            real waveform tensor

        Returns
        -------
        L1 loss
        )r%   r   r   r   r   r7  r9  r:  r   r;  r   r   r   rg   r  )ra   r  r  r  r  r  r   r   r   rm   L  s@   zL1SpecLoss.forward)r/  r  r0  r1  r   r2  r3  r4  Fr5  r6  r6  Tr   r   r   rc   r   r.  
  s"    # r.  c                   @   s   e Zd ZdZdd ZdS )MSEGLosszMean Squared Generator Loss
    The generator is trained to fake the discriminator by updating the sample quality
    to be classified to a value almost equal to 1.
    c                 C   s   t |||j}|S )zReturns Generator GAN loss

        Arguments
        ---------
        score_fake : list
            discriminator scores of generated waveforms D(G(s))

        Returns
        -------
        Generator loss
        )rg   mse_lossnew_onesr   )ra   
score_fake	loss_faker   r   r   rm     s   zMSEGLoss.forwardN)rs   rt   ru   rv   rm   r   r   r   r   r<    s    r<  c                       r   )MelganFeatureLosszCalculates the feature matching loss, which is a learned similarity metric measured by
    the difference in features of the discriminator between a ground truth sample and a generated
    sample (Larsen et al., 2016, Kumar et al., 2019).
    c                       t    t | _d S r   )rZ   r[   r\   L1Loss	loss_funcr   rc   r   r   r[        
zMelganFeatureLoss.__init__c           	      C   sZ   d}d}t |D ]\}}t|| || D ]\}}|| ||7 }|d7 }qq|| }|S )a4  Returns feature matching loss

        Arguments
        ---------
        fake_feats : list
            discriminator features of generated waveforms
        real_feats : list
            discriminator features of groundtruth waveforms

        Returns
        -------
        Feature matching loss
        r   r   )r;   rf   rD  )	ra   
fake_feats
real_feats
loss_feats	num_featsidxrC   	fake_feat	real_featr   r   r   rm     s   
zMelganFeatureLoss.forwardr   r   r   rc   r   rA    s    rA  c                       r   )MSEDLosszMean Squared Discriminator Loss
    The discriminator is trained to classify ground truth samples to 1,
    and the samples synthesized from the generator to 0.
    c                    rB  r   )rZ   r[   r\   MSELossrD  r   rc   r   r   r[     rE  zMSEDLoss.__init__c                 C   s:   |  |||j}|  |||j}|| }|||fS )a2  Returns Discriminator GAN losses

        Arguments
        ---------
        score_fake : list
            discriminator scores of generated waveforms
        score_real : list
            discriminator scores of groundtruth waveforms

        Returns
        -------
        Discriminator losses
        )rD  r>  r   r8   )ra   r?  
score_real	loss_realr@  loss_dr   r   r   rm     s   
zMSEDLoss.forwardr   r   r   rc   r   rM    s    rM  c                 C   s<   d}t | tr| D ]
}||}||7 }q	|S || }|}|S )a  Compute Generator adversarial loss function
    and normalize values

    Arguments
    ---------
    scores_fake : list
        discriminator scores of generated waveforms
    loss_func : object
        object of target generator loss

    Returns
    -------
    Generator loss
    r   )
isinstancelist)scores_fakerD  adv_lossr?  	fake_lossr   r   r   _apply_G_adv_loss  s   

rW  c           	      C   st   d}d}d}t | tr+t| |D ]\}}|||d\}}}||7 }||7 }||7 }qn
|| |\}}}|}|||fS )ae  Compute Discriminator losses and normalize loss values

    Arguments
    ---------
    scores_fake : list
        discriminator scores of generated waveforms
    scores_real : list
        discriminator scores of groundtruth waveforms
    loss_func : object
        object of target discriminator loss

    Returns
    -------
    Discriminator losses
    r   )r?  rO  )rR  rS  rf   )	rT  scores_realrD  loss	real_lossrV  r?  rO  
total_lossr   r   r   _apply_D_loss
  s   


r\  c                       sN   e Zd ZdZ										d fdd	Z							d	ddZ  ZS )
GeneratorLossa  Creates a summary of generator losses
    and applies weights for different losses

    Arguments
    ---------
    stft_loss : object
        object of stft loss
    stft_loss_weight : float
        weight of STFT loss
    mseg_loss : object
        object of mseg loss
    mseg_loss_weight : float
        weight of mseg loss
    feat_match_loss : object
        object of feature match loss
    feat_match_loss_weight : float
        weight of feature match loss
    l1_spec_loss : object
        object of L1 spectrogram loss
    l1_spec_loss_weight : float
        weight of L1 spectrogram loss
    mseg_dur_loss : object
        object of mseg duration loss
    mseg_dur_loss_weight : float
        weight of mseg duration loss
    Nr   c                    sJ   t    || _|| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _d S r   )rZ   r[   	stft_lossstft_loss_weight	mseg_lossmseg_loss_weightfeat_match_lossfeat_match_loss_weightl1_spec_lossl1_spec_loss_weightmseg_dur_lossmseg_dur_loss_weight)ra   r^  r_  r`  ra  rb  rc  rd  re  rf  rg  rc   r   r   r[   S  s   

zGeneratorLoss.__init__c	              	   C   sP  d}	d}
d}i }| j r8|  |ddddd|df d|d\}}||d< ||d< |	| j||   }	| jrL| ||}||d< |	| j|  }	| jrd|durdt|| j}||d< |
| j|  }
| j	r||dur|| 	||}||d	< |
| j
|  }
| jr|tjjkrtj||d
d}||d< || j9 }|	|
 | |d< |	|d< |
|d< |S )a  Returns a dictionary of generator losses and applies weights

        Arguments
        ---------
        stage : speechbrain.Stage
            training, validation or testing
        y_hat : torch.tensor
            generated waveform tensor
        y : torch.tensor
            real waveform tensor
        scores_fake : list
            discriminator scores of generated waveforms
        feats_fake : list
            discriminator features of generated waveforms
        feats_real : list
            discriminator features of groundtruth waveforms
        log_dur_pred : torch.Tensor
            Predicted duration for duration loss
        log_dur : torch.Tensor
            Real duration for duration loss

        Returns
        -------
        Dictionary of generator losses
        r   Nr'   r   G_stft_loss_mgG_stft_loss_scG_l1_spec_lossG_mse_fake_lossG_feat_match_lossmean)	reduction
G_dur_lossG_loss
G_gen_loss
G_adv_loss)r^  r,   r   r_  rd  re  r`  rW  ra  rb  rc  rf  sbStageTRAINrg   r=  rg  )ra   stager  r  rT  
feats_fake
feats_realr   r   gen_lossrU  dur_lossrY  stft_loss_mgstft_loss_scrd  mse_fake_lossrb  r   r   r   rm   l  sB   %,
zGeneratorLoss.forward)
Nr   Nr   Nr   Nr   Nr   )NNNNNNNr   r   r   rc   r   r]  7  s*    r]  c                       r   )DiscriminatorLosszCreates a summary of discriminator losses

    Arguments
    ---------
    msed_loss : object
        object of MSE discriminator loss
    Nc                    s   t    || _d S r   )rZ   r[   	msed_loss)ra   r  rc   r   r   r[     s   

zDiscriminatorLoss.__init__c                 C   sP   d}i }| j r"t||| j d\}}}||d< ||d< ||d< ||7 }||d< |S )aN  Returns a dictionary of discriminator losses

        Arguments
        ---------
        scores_fake : list
            discriminator scores of generated waveforms
        scores_real : list
            discriminator scores of groundtruth waveforms

        Returns
        -------
        Dictionary of discriminator losses
        r   )rT  rX  rD  D_mse_gan_lossD_mse_gan_real_lossD_mse_gan_fake_lossD_loss)r  r\  )ra   rT  rX  	disc_lossrY  
mse_D_lossmse_D_real_lossmse_D_fake_lossr   r   r   rm     s   zDiscriminatorLoss.forwardr   r   r   r   rc   r   r~    s    r~  )r   r   )r  ))rv   r
   torch.nnr\   torch.nn.functionalr   rg   
torchaudior   speechbrainrs  speechbrain.nnet.CNNr   r   r   ri   r   r%   rJ   ModulerL   rx   r|   r   r   r   r   r   r  r
  r  r  r  r.  r<  rA  rM  rW  r\  r]  r~  r   r   r   r   <module>   sH    "
IBsB 7O  	q(6)
+--v+*- 
