o
    %ݫit                    @   s  d Z ddlZddlZddlm  mZ ddlmZ ddlm	Z	 ddl
mZmZmZmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ G d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZdd Zd6ddZG dd dZ G dd dejZ!dd Z"d7d"d#Z#G d$d% d%ejjZ$G d&d' d'e	Z%G d(d) d)Z&d*d+ Z'G d,d- d-ejjZ(G d.d/ d/ejZ)G d0d1 d1ejZ*G d2d3 d3ejZ+G d4d5 d5ejZ,dS )8z
Neural network modules for the FastSpeech 2: Fast and High-Quality End-to-End Text to Speech
synthesis model
Authors
* Sathvik Udupa 2022
* Pradnya Kandarkar 2023
* Yingzhi Wang 2023
    N)nn)_Loss)PositionalEncodingTransformerEncoderget_key_padding_maskget_mask_from_lengths)CNNlinear)	Embedding)bce_loss)	LayerNormc                       *   e Zd ZdZd fdd	Zdd Z  ZS )EncoderPreNeta?  Embedding layer for tokens

    Arguments
    ---------
    n_vocab: int
        size of the dictionary of embeddings
    blank_id: int
        padding index
    out_channels: int
        the size of each embedding vector

    Example
    -------
    >>> from speechbrain.nnet.embedding import Embedding
    >>> from speechbrain.lobes.models.FastSpeech2 import EncoderPreNet
    >>> encoder_prenet_layer = EncoderPreNet(n_vocab=40, blank_id=0, out_channels=384)
    >>> x = torch.rand(3, 5)
    >>> y = encoder_prenet_layer(x)
    >>> y.shape
    torch.Size([3, 5, 384])
       c                    s   t    t|||d| _d S )N)num_embeddingsembedding_dimblank_id)super__init__r
   token_embedding)selfn_vocabr   out_channels	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/FastSpeech2.pyr   3   s   
zEncoderPreNet.__init__c                 C   s   | j |j| _ |  |}|S )zComputes the forward pass

        Arguments
        ---------
        x: torch.Tensor
            a (batch, tokens) input tensor

        Returns
        -------
        output: torch.Tensor
            the embedding layer output
        )r   todevice)r   xr   r   r   forward;   s   
zEncoderPreNet.forward)r   __name__
__module____qualname____doc__r   r    __classcell__r   r   r   r   r      s    r   c                       s4   e Zd ZdZ					d
 fdd	Zdd	 Z  ZS )PostNeta  
    FastSpeech2 Conv Postnet
    Arguments
    ---------
    n_mel_channels: int
       input feature dimension for convolution layers
    postnet_embedding_dim: int
       output feature dimension for convolution layers
    postnet_kernel_size: int
       postnet convolution kernel size
    postnet_n_convolutions: int
       number of convolution layers
    postnet_dropout: float
        dropout probability for postnet
    P   r            ?c              	      s   t t|   tj|||dd| _t | _t	d|d D ]}| j
tj|||dd qtj|||dd| _t | _t|| _t|| _t|| _t|| _t|| _t|| _d S )Nsamein_channelsr   kernel_sizepadding   )r   r'   r   r   Conv1dconv_prer   
ModuleListconvs_intermediaterangeappend	conv_postTanhtanhr   ln1ln2ln3Dropoutdropout1dropout2dropout3)r   n_mel_channelspostnet_embedding_dimpostnet_kernel_sizepostnet_n_convolutionspostnet_dropoutir   r   r   r   ^   s<   
	
zPostNet.__init__c                 C   s   |  |}| ||j}| |}| |}tt| jD ]	}| j| |}q| 	||j}| |}| 
|}| |}| ||j}| |}|S )zComputes the forward pass

        Arguments
        ---------
        x: torch.Tensor
            a (batch, time_steps, features) input tensor

        Returns
        -------
        output: torch.Tensor
            the spectrogram predicted
        )r2   r:   r   dtyper9   r>   r5   lenr4   r;   r?   r7   r<   r@   )r   r   rF   r   r   r   r       s   






zPostNet.forward)r(   r   r)   r)   r*   r!   r   r   r   r   r'   M   s    *r'   c                       s,   e Zd ZdZ	d fdd	Zdd Z  ZS )	DurationPredictora  Duration predictor layer

    Arguments
    ---------
    in_channels: int
       input feature dimension for convolution layers
    out_channels: int
       output feature dimension for convolution layers
    kernel_size: int
       duration predictor convolution kernel size
    dropout: float
       dropout probability, 0 by default
    n_units: int

    Example
    -------
    >>> from speechbrain.lobes.models.FastSpeech2 import FastSpeech2
    >>> duration_predictor_layer = DurationPredictor(in_channels=384, out_channels=384, kernel_size=3)
    >>> x = torch.randn(3, 400, 384)
    >>> mask = torch.ones(3, 400, 384)
    >>> y = duration_predictor_layer(x, mask)
    >>> y.shape
    torch.Size([3, 400, 1])
            r0   c                    s|   t    tj|||dd| _tj|||dd| _tj||d| _t|| _	t|| _
t | _t|| _t|| _d S )Nr+   r,   	n_neurons
input_size)r   r   r   r1   conv1conv2r	   Linearr   r:   r;   r   ReLUrelur=   r>   r?   )r   r-   r   r.   dropoutn_unitsr   r   r   r      s&   



zDurationPredictor.__init__c                 C   sn   |  | || }| ||j}| |}|  | || }| ||j}| |}| 	|| S )a?  Computes the forward pass

        Arguments
        ---------
        x: torch.Tensor
            a (batch, time_steps, features) input tensor
        x_mask: torch.Tensor
            mask of input tensor

        Returns
        -------
        output: torch.Tensor
            the duration predictor outputs
        )
rR   rN   r:   r   rG   r>   rO   r;   r?   r	   )r   r   x_maskr   r   r   r       s   

zDurationPredictor.forward)rJ   r0   r!   r   r   r   r   rI      s
    rI   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )SPNPredictora<  
    This module for the silent phoneme predictor. It receives phoneme sequences without any silent phoneme token as
    input and predicts whether a silent phoneme should be inserted after a position. This is to avoid the issue of fast
    pace at inference time due to having no silent phoneme tokens in the input sequence.

    Arguments
    ---------
    enc_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in encoder
    enc_num_head: int
        number of multi-head-attention (MHA) heads in encoder transformer layers
    enc_d_model: int
        the number of expected features in the encoder
    enc_ffn_dim: int
        the dimension of the feedforward network model
    enc_k_dim: int
        the dimension of the key
    enc_v_dim: int
        the dimension of the value
    enc_dropout: float
        Dropout for the encoder
    normalize_before: bool
        whether normalization should be applied before or after MHA or FFN in Transformer layers.
    ffn_type: str
        whether to use convolutional layers instead of feed forward network inside transformer layer
    ffn_cnn_kernel_size_list: list of int
        conv kernel size of 2 1d-convs if ffn_type is 1dcnn
    n_char: int
        the number of symbols for the token embedding
    padding_idx: int
        the index for padding
    c                    sf   t    || _|| _t|||d| _t|| _t|||||||t	j
||	|
d| _tjd|d| _d S )Nr   
num_layersnheadd_ffnd_modelkdimvdimrS   
activationnormalize_beforeffn_typeffn_cnn_kernel_size_listr0   rK   )r   r   enc_num_headpadding_idxr   	encPreNetr   #sinusoidal_positional_embed_encoderr   r   rQ   spn_encoderr	   rP   
spn_linear)r   enc_num_layersrc   enc_d_modelenc_ffn_dim	enc_k_dim	enc_v_dimenc_dropoutr`   ra   rb   n_charrd   r   r   r   r     s.   
zSPNPredictor.__init__c                 C   s   |  |}t|ddd|jd }|| }t|| jd}| d}| |}t||| }tj	tj
|jd |jd |jddd | j|jd  dd}| j|||d\}}	| |d}
|
S )	a  forward pass for the module

        Arguments
        ---------
        tokens: torch.Tensor
            input tokens without silent phonemes
        last_phonemes: torch.Tensor
            indicates if a phoneme at an index is the last phoneme of a word or not

        Returns
        -------
        spn_decision: torch.Tensor
            indicates if a silent phoneme should be inserted after a phoneme
           r0   pad_idxr   )diagonalr   src_masksrc_key_padding_mask)re   torch	unsqueezerepeatshaper   rd   rf   addtriuonesr   boolrc   rg   rh   squeeze)r   tokenslast_phonemestoken_featssrcmasksrcmask_invertedposspn_maskspn_token_feats_spn_decisionr   r   r   r    ?  s2   


zSPNPredictor.forwardc                 C   s   |  ||}t|dk}|S )a  inference function

        Arguments
        ---------
        tokens: torch.Tensor
            input tokens without silent phonemes
        last_phonemes: torch.Tensor
            indicates if a phoneme at an index is the last phoneme of a word or not

        Returns
        -------
        spn_decision: torch.Tensor
            indicates if a silent phoneme should be inserted after a phoneme
        g?)r    ry   sigmoid)r   r   r   r   r   r   r   infern  s   zSPNPredictor.infer)r"   r#   r$   r%   r   r    r   r&   r   r   r   r   rV      s
    !+/rV   c                       s6   e Zd ZdZ fddZ						dddZ  ZS )	FastSpeech2a  The FastSpeech2 text-to-speech model.
    This class is the main entry point for the model, which is responsible
    for instantiating all submodules, which, in turn, manage the individual
    neural network layers
    Simplified STRUCTURE: input->token embedding ->encoder ->duration/pitch/energy predictor ->duration
    upsampler -> decoder -> output
    During training, teacher forcing is used (ground truth durations are used for upsampling)

    Arguments
    ---------
    enc_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in encoder
    enc_num_head: int
        number of multi-head-attention (MHA) heads in encoder transformer layers
    enc_d_model: int
        the number of expected features in the encoder
    enc_ffn_dim: int
        the dimension of the feedforward network model
    enc_k_dim: int
        the dimension of the key
    enc_v_dim: int
        the dimension of the value
    enc_dropout: float
        Dropout for the encoder
    dec_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in decoder
    dec_num_head: int
        number of multi-head-attention (MHA) heads in decoder transformer layers
    dec_d_model: int
        the number of expected features in the decoder
    dec_ffn_dim: int
        the dimension of the feedforward network model
    dec_k_dim: int
        the dimension of the key
    dec_v_dim: int
        the dimension of the value
    dec_dropout: float
        dropout for the decoder
    normalize_before: bool
        whether normalization should be applied before or after MHA or FFN in Transformer layers.
    ffn_type: str
        whether to use convolutional layers instead of feed forward network inside transformer layer.
    ffn_cnn_kernel_size_list: list of int
        conv kernel size of 2 1d-convs if ffn_type is 1dcnn
    n_char: int
        the number of symbols for the token embedding
    n_mels: int
        number of bins in mel spectrogram
    postnet_embedding_dim: int
       output feature dimension for convolution layers
    postnet_kernel_size: int
       postnet convolution kernel size
    postnet_n_convolutions: int
       number of convolution layers
    postnet_dropout: float
        dropout probability for postnet
    padding_idx: int
        the index for padding
    dur_pred_kernel_size: int
        the convolution kernel size in duration predictor
    pitch_pred_kernel_size: int
        kernel size for pitch prediction.
    energy_pred_kernel_size: int
        kernel size for energy prediction.
    variance_predictor_dropout: float
        dropout probability for variance predictor (duration/pitch/energy)

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import FastSpeech2
    >>> model = FastSpeech2(
    ...    enc_num_layers=6,
    ...    enc_num_head=2,
    ...    enc_d_model=384,
    ...    enc_ffn_dim=1536,
    ...    enc_k_dim=384,
    ...    enc_v_dim=384,
    ...    enc_dropout=0.1,
    ...    dec_num_layers=6,
    ...    dec_num_head=2,
    ...    dec_d_model=384,
    ...    dec_ffn_dim=1536,
    ...    dec_k_dim=384,
    ...    dec_v_dim=384,
    ...    dec_dropout=0.1,
    ...    normalize_before=False,
    ...    ffn_type='1dcnn',
    ...    ffn_cnn_kernel_size_list=[9, 1],
    ...    n_char=40,
    ...    n_mels=80,
    ...    postnet_embedding_dim=512,
    ...    postnet_kernel_size=5,
    ...    postnet_n_convolutions=5,
    ...    postnet_dropout=0.5,
    ...    padding_idx=0,
    ...    dur_pred_kernel_size=3,
    ...    pitch_pred_kernel_size=3,
    ...    energy_pred_kernel_size=3,
    ...    variance_predictor_dropout=0.5)
    >>> inputs = torch.tensor([
    ...     [13, 12, 31, 14, 19],
    ...     [31, 16, 30, 31, 0],
    ... ])
    >>> input_lengths = torch.tensor([5, 4])
    >>> durations = torch.tensor([
    ...     [2, 4, 1, 5, 3],
    ...     [1, 2, 4, 3, 0],
    ... ])
    >>> mel_post, postnet_output, predict_durations, predict_pitch, avg_pitch, predict_energy, avg_energy, mel_lens = model(inputs, durations=durations)
    >>> mel_post.shape, predict_durations.shape
    (torch.Size([2, 15, 80]), torch.Size([2, 5]))
    >>> predict_pitch.shape, predict_energy.shape
    (torch.Size([2, 5, 1]), torch.Size([2, 5, 1]))
    c                    s  t    || _|	| _|| _t|| _t|
| _t|||d| _	t
||||d| _t
||||d| _t
||||d| _tjd||ddd| _tjd||ddd| _t|||||||tj|||d| _t||	||
|||tj|||d| _tj||
d| _t|||||d	| _d S )
NrW   r-   r   r.   rS   r0   r+   Tr-   r   r.   r/   skip_transposerX   rK   rA   rB   rC   rD   rE   )r   r   rc   dec_num_headrd   r   rf   #sinusoidal_positional_embed_decoderr   re   rI   durPred	pitchPred
energyPredr   r1   
pitchEmbedenergyEmbedr   r   rQ   encoderdecoderr	   rP   r'   postnet)r   ri   rc   rj   rk   rl   rm   rn   dec_num_layersr   dec_d_modeldec_ffn_dim	dec_k_dim	dec_v_dimdec_dropoutr`   ra   rb   ro   n_melsrB   rC   rD   rE   rd   dur_pred_kernel_sizepitch_pred_kernel_sizeenergy_pred_kernel_sizevariance_predictor_dropoutr   r   r   r     s   
!zFastSpeech2.__init__N      ?c              
   C   s  t || jd}| d}	| |}
| |
}t|
||	 }
|d| jd|
j	d 
ddd }| j|
||d\}
}|
|	 }
| |
|	d}| dkrV|d}|du rdttj|d}d}| |
|	}|| }|durt|d|}| |}|
ddd}n
| |
ddd}|
ddd}|
|}
d}| |
|	}|| }|durt|d|}| |}|
ddd}n
| |
ddd}|
ddd}|
|}
t|
|dur|n||d\}}tt|}||j}| d}	|d| jd|j	d 
ddd }| |}t|||	 }| j|||d^}}}|  ||	 }| !|| }|||||||t|fS )	a  forward pass for training and inference

        Arguments
        ---------
        tokens: torch.Tensor
            batch of input tokens
        durations: torch.Tensor
            batch of durations for each token. If it is None, the model will infer on predicted durations
        pitch: torch.Tensor
            batch of pitch for each frame. If it is None, the model will infer on predicted pitches
        energy: torch.Tensor
            batch of energy for each frame. If it is None, the model will infer on predicted energies
        pace: float
            scaling factor for durations
        pitch_rate: float
            scaling factor for pitches
        energy_rate: float
            scaling factor for energies

        Returns
        -------
        mel_post: torch.Tensor
            mel outputs from the decoder
        postnet_output: torch.Tensor
            mel outputs from the postnet
        predict_durations: torch.Tensor
            predicted durations of each token
        predict_pitch: torch.Tensor
            predicted pitches of each token
        avg_pitch: torch.Tensor
            target pitches for each token if input pitch is not None
            None if input pitch is None
        predict_energy: torch.Tensor
            predicted energies of each token
        avg_energy: torch.Tensor
            target energies for each token if input energy is not None
            None if input energy is None
        mel_length:
            predicted lengths of mel spectrograms
        rq   rs   r0   r   rp   rv   Npace)"r   rd   rz   re   rf   ry   r}   r{   rc   r|   permuter   r   r   r   dimclampspecialexpm1r   average_over_durationsr   r   r   upsampler   tensorr   r   r   r   r   r	   r   )r   r   	durationspitchenergyr   
pitch_rateenergy_rater   r   r   r   	attn_maskr   predict_durationsdur_pred_reverse_log	avg_pitchpredict_pitch
avg_energypredict_energy
spec_featsmel_lensoutput_mel_featsmemorymel_postpostnet_outputr   r   r   r    l  s   2











zFastSpeech2.forwardNNNr   r   r   r!   r   r   r   r   r     s    txr   c                 C   s  t j|dd }t jj|ddddf d}t jjt j| dkddd}t jjt j| ddd}| \}}| d}|dddddf |||}	|dddddf |||}
t |d|
t |d|	 	 }t |d|
t |d|	 	 }t 
|dk||| }|S )zAverage values over durations.

    Arguments
    ---------
    values: torch.Tensor
        shape: [B, 1, T_de]
    durs: torch.Tensor
        shape: [B, T_en]

    Returns
    -------
    avg: torch.Tensor
        shape: [B, 1, T_en]
    r0   r   Nrs   r0   r   rJ   rp   )ry   cumsumlongr   
functionalpadsizeexpandgatherfloatwhere)valuesdursdurs_cums_endsdurs_cums_startsvalues_nonzero_cumsvalues_cumsbslength
n_formantsdcsdcevalues_sumsvalues_nelemsavgr   r   r   r     s,    
  r   r   rJ   c                    sJ    fddt t D }dd |D }tjjjj|d|d}||fS )a  upsample encoder output according to durations

    Arguments
    ---------
    feats: torch.Tensor
        batch of input tokens
    durs: torch.Tensor
        durations to be used to upsample
    pace: float
        scaling factor for durations
    padding_value: int
        padding index

    Returns
    -------
    mel_post: torch.Tensor
        mel outputs from the decoder
    predict_durations: torch.Tensor
        predicted durations for each token
    c                    s,   g | ]}t j|  |   d dqS )r   r   )ry   repeat_interleaver   ).0rF   r   featsr   r   r   
<listcomp>=  s    zupsample.<locals>.<listcomp>c                 S   s   g | ]}|j d  qS r   )r|   )r   melr   r   r   r   B  s    T)batch_firstpadding_value)r5   rH   ry   r   utilsrnnpad_sequence)r   r   r   r   upsampled_melsr   padded_upsampled_melsr   r   r   r   (  s   

r   c                   @      e Zd ZdZdd ZdS )TextMelCollatezEZero-pads model inputs and targets based on number of frames per stepc           !      C   sZ  t |}tt|D ]
}|| d ||< q
tjtdd |D ddd\}}|d }tjtdd |D ddd\}}|d }	tt||}
tt||	}tt||	}tt||}tt||	}|
  |  |  |  |  tt|D ]g}|||  d }|||  d }t|||  d	 }|||  d
 }t|||  d }||
|d|df< |||d|df< |||d|df< |||d|df< |||d|df< q|d d d}t	dd |D }tt|||}|  tt||}|  tt||}|  tt|}g g }}tt|D ]V}|| }|| d }|| d }|| d }|||ddd|d
f< |||d|df< |||d|df< |d
||< |
|| d  |
|| d  q4dd |D } t| } |ddd
}|
||||||| |||||fS )a  Collate's training batch from normalized text and mel-spectrogram

        Arguments
        ---------
        batch: list
            [text_normalized, mel_normalized]

        Returns
        -------
        text_padded: torch.Tensor
        dur_padded: torch.Tensor
        input_lengths: torch.Tensor
        mel_padded: torch.Tensor
        pitch_padded: torch.Tensor
        energy_padded: torch.Tensor
        output_lengths: torch.Tensor
        len_x: torch.Tensor
        labels: torch.Tensor
        wavs: torch.Tensor
        no_spn_seq_padded: torch.Tensor
        spn_labels_padded: torch.Tensor
        last_phonemes_padded: torch.Tensor
        mel_text_pairc                 S      g | ]}t |d  qS r   rH   r   r   r   r   r   r   o      z+TextMelCollate.__call__.<locals>.<listcomp>r   Tr   
descendingc                 S   r   )r   r   r   r   r   r   u  r   r   r0   rs   Nrp   c                 S   s   g | ]	}|d   dqS )rp   r0   r   r   r   r   r   r               labelwavc                 S   s   g | ]}|d  qS )r)   r   r   r   r   r   r     s    )listr5   rH   ry   sort
LongTensorFloatTensorzero_r   maxr6   Tensorr   )!r   batch	raw_batchrF   input_lengthsids_sorted_decreasingmax_input_lenno_spn_seq_lengthsno_spn_ids_sorted_decreasingmax_no_spn_seq_lentext_paddedno_spn_seq_paddedlast_phonemes_padded
dur_paddedspn_labels_paddedtext
no_spn_seqr   dur
spn_labelsnum_melsmax_target_len
mel_paddedpitch_paddedenergy_paddedoutput_lengthslabelswavsidxr   r   r   len_xr   r   r   __call__N  s   



zTextMelCollate.__call__Nr"   r#   r$   r%   r   r   r   r   r   r   J  s    r   c                       s.   e Zd ZdZ		d fdd	Zdd Z  ZS )	Lossas  Loss Computation

    Arguments
    ---------
    log_scale_durations: bool
        applies logarithm to target durations
    ssim_loss_weight: float
        weight for ssim loss
    duration_loss_weight: float
        weight for the duration loss
    pitch_loss_weight: float
        weight for the pitch loss
    energy_loss_weight: float
        weight for the energy loss
    mel_loss_weight: float
        weight for the mel loss
    postnet_mel_loss_weight: float
        weight for the postnet mel loss
    spn_loss_weight: float
        weight for spn loss
    spn_loss_max_epochs: int
        Max number of epochs
    r      c
           
         s~   t    t | _t | _t | _t | _t | _	t | _
|| _|| _|| _|| _|| _|| _|| _|| _|	| _d S N)r   r   SSIMLoss	ssim_lossr   MSELossmel_losspostnet_mel_lossdur_loss
pitch_lossenergy_losslog_scale_durationsssim_loss_weightmel_loss_weightpostnet_mel_loss_weightduration_loss_weightpitch_loss_weightenergy_loss_weightspn_loss_weightspn_loss_max_epochs)
r   r-  r.  r1  r2  r3  r/  r0  r4  r5  r   r   r   r     s    






zLoss.__init__c              
   C   s  |\}}}}}}	}
t |jdksJ |\	}}}}}}}}}|d}|d}|d}|d}|d}| jr@t| }t|jd D ]}|dkr| ||d|| ddf ||d|| ddf }| 	||d|| ddf ||d|| ddf }| 
||d|	| f ||d|	| f tj}| ||d|| f ||d|| f tj}| ||d|| f ||d|| f tj}qG|| ||d|| ddf ||d|| ddf  }|| 	||d|| ddf ||d|| ddf  }|| 
||d|	| f ||d|	| f tj }|| ||d|| f ||d|| f tj }|| ||d|| f ||d|| f tj }qG| |||}t|t |}t|t |}t|t |}t|t |}t|t |}t||
}|| jkrd| _|| j || j  || j  || j  || j  || j  || j  }||| j || j || j || j || j || j || j d}|S )as  Computes the value of the loss function and updates stats

        Arguments
        ---------
        predictions: tuple
            model predictions
        targets: tuple
            ground truth data
        current_epoch: int
            The count of the current epoch.

        Returns
        -------
        loss: torch.Tensor
            the loss value
        r   rs   r   N)
total_lossr&  r(  r)  r*  r+  r,  spn_loss)rH   r|   r   r-  ry   log1pr   r5   r(  r)  r*  r   float32r+  r,  r&  divr   r5  r4  r.  r/  r0  r1  r2  r3  )r   predictionstargetscurrent_epoch
mel_targettarget_durationstarget_pitchtarget_energy
mel_lengthphon_lenr  mel_outpostnet_mel_outlog_durationspredicted_pitchaverage_pitchpredicted_energyaverage_energyr   	spn_predslog_target_durationsrF   r(  r)  r*  r+  r,  r&  r7  r6  lossr   r   r   r      s   






zLoss.forward)r   r#  r!   r   r   r   r   r"    s    !r"  c              	   C   s   ddl m} |j|||||d|j}|j| |d d ||||
|d|j}||}||}| dks8J |jd |ksAJ tj	|dd}|	r[|t
| t|t
|  }|rat|}||fS )a  calculates MelSpectrogram for a raw audio signal

    Arguments
    ---------
    sample_rate : int
        Sample rate of audio signal.
    hop_length : int
        Length of hop between STFT windows.
    win_length : int
        Window size.
    n_fft : int
        Size of FFT.
    n_mels : int
        Number of mel filterbanks.
    f_min : float
        Minimum frequency.
    f_max : float
        Maximum frequency.
    power : float
        Exponent for the magnitude spectrogram.
    normalized : bool
        Whether to normalize by magnitude after stft.
    min_max_energy_norm : bool
        Whether to normalize by min-max
    norm : str or None
        If "slaney", divide the triangular mel weights by the width of the mel band
    mel_scale : str
        Scale to use: "htk" or "slaney".
    compression : bool
        whether to do dynamic range compression
    audio : torch.Tensor
        input audio signal

    Returns
    -------
    mel : torch.Tensor
    rmse : torch.Tensor
    r   )
transforms)
hop_length
win_lengthn_fftpower
normalizedrp   r0   )sample_raten_stftr   f_minf_maxnorm	mel_scaler   )
torchaudiorN  Spectrogramr   r   MelScaler   r|   ry   rX  minr  dynamic_range_compression)rT  rO  rP  rQ  r   rV  rW  rR  rS  min_max_energy_normrX  rY  compressionaudiorN  audio_to_melspecr   rmser   r   r   mel_spectogramy  s>   6
	"re  r0   h㈵>c                 C   s   t t j| |d| S )z+Dynamic range compression for audio signalsr]  )ry   logr   )r   Cclip_valr   r   r   r^    s   r^  c                       sH   e Zd ZdZ fddZdddZdejdejfd	d
Zdd Z	  Z
S )r%  zkSSIM loss as (1 - SSIM)
    SSIM is explained here https://en.wikipedia.org/wiki/Structural_similarity
    c                    s   t    t | _d S r$  )r   r   	_SSIMLoss	loss_funcr   r   r   r   r     s   
zSSIMLoss.__init__Nc                 C   s>   |du r	|j  }tj||j|jd}|d|dk }|S )a:  Create a sequence mask for filtering padding in a sequence tensor.

        Arguments
        ---------
        sequence_length: torch.Tensor
            Sequence lengths.
        max_len: int
            Maximum sequence length. Defaults to None.

        Returns
        -------
        mask: [B, T_max]
        N)rG   r   r   r0   )datar  ry   arangerG   r   rz   )r   sequence_lengthmax_len	seq_rangemaskr   r   r   sequence_mask  s   

zSSIMLoss.sequence_maskr   rs  c                 C   sH   t j|| dddd}t j|| dddd}|| || d  S )a  Min-Max normalize tensor through first dimension

        Arguments
        ---------
        x: torch.Tensor
            input tensor [B, D1, D2]
        mask: torch.Tensor
            input mask [B, D1, 1]

        Returns
        -------
        Normalized tensor
        r   )r0   rp   T)r   keepdimgꌠ9Y>)F:0yE>)ry   amaxmasked_fillamin)r   r   rs  maximumminimumr   r   r   sample_wise_min_max  s
   zSSIMLoss.sample_wise_min_maxc                 C   s   | j ||ddd}| ||}| ||}| || d|| d}| dkrAtd|  d tjd|j	d}| dk rYtd|  d	 tjd|j	d}|S )
at  
        Arguments
        ---------
        y_hat: torch.Tensor
            model prediction values [B, T, D].
        y: torch.Tensor
            target values [B, T, D].
        length: torch.Tensor
            length of each sample in a batch for masking.

        Returns
        -------
        loss: Average loss value in range [0, 1] masked by the length.
        r0   )rp  rq  rp   r   z > SSIM loss is out-of-range z, setting it 1.0rt   rJ   z, setting it 0.0)
rt  r   rz   r|  rl  itemprintry   r   r   )r   y_hatyr   rs  y_norm
y_hat_normr&  r   r   r   r      s*   
zSSIMLoss.forwardr$  )r"   r#   r$   r%   r   rt  ry   r  r|  r    r&   r   r   r   r   r%    s    
r%  c                       s   e Zd ZdZg dZ								d fd
d	ZdddZ			d ddZdd Zd!ddZ	d!ddZ
									d"ddZdd Z  ZS )#rk  aE  Creates a criterion that measures the structural similarity index error between
    each element in the input x and target y.
    Equation link: https://en.wikipedia.org/wiki/Structural_similarity
    x and y are tensors of arbitrary shapes with a total of n elements each.
    The sum operation still operates over all the elements, and divides by n.
    The division by n can be avoided if one sets reduction = sum.
    In case of 5D input tensors, complex value is returned as a tensor of size 2.

    Arguments
    ---------
    kernel_size: int
        By default, the mean and covariance of a pixel is obtained
        by convolution with given filter_size.
    kernel_sigma: float
        Standard deviation for Gaussian kernel.
    k1: float
        Coefficient related to c1 (see equation in the link above).
    k2: float
        Coefficient related to c2 (see equation in the link above).
    downsample: bool
        Perform average pool before SSIM computation (Default: True).
    reduction: str
        Specifies the reduction type
    data_range: Union[int, float]
        Maximum value range of images (usually 1.0 or 255).

    Example
    -------
    >>> loss = _SSIMLoss()
    >>> x = torch.rand(3, 3, 256, 256, requires_grad=True)
    >>> y = torch.rand(3, 3, 256, 256)
    >>> output = loss(x, y)
    >>> output.backward()
    )r.   k1k2sigmakernel	reduction         ?{Gz?Q?Tmeanr   c                    sT   t    || _|| _|d dksJ d| d|| _|| _|| _|| _|| _d S )Nrp   r0   Kernel size must be odd, got [])	r   r   r  r.   kernel_sigmar  r  
downsample
data_range)r   r.   r  r  r  r  r  r  r   r   r   r   Y  s   



z_SSIMLoss.__init__c                 C   s<   |dkr|S |dkr|j ddS |dkr|jddS td)a?  Reduce input in batch dimension if needed.

        Arguments
        ---------
        x: torch.Tensor
            Tensor with shape (B, *).
        reduction: str
            Specifies the reduction type:
            none | mean | sum (Default: mean)

        Returns
        -------
        Reduced outputs.
        noner  r   r   sumz:Unknown reduction. Expected one of {'none', 'mean', 'sum'})r  r  
ValueError)r   r   r  r   r   r   _reducev  s   z_SSIMLoss._reducer   rs   rJ   g      Nc              	   C   s  	 |d }|D ]}t |sJ dt| |j|jks)J d|j d|j |du rD| | ksCJ d|  d|  n&| |d |d	  | |d |d	  ksjJ d
|  d|  |d |d	 kr| |d ksJ d|d  d|  n,|d |d	 k r|d |   kr|d	 ksn J d|d  d|d	  d|  |d |d	 k r|d | ksJ d|d  d|  | |d	 ksJ d|d	  d|  qdS )a  Check if the input satisfies the requirements

        Arguments
        ---------
        tensors: torch.Tensor
            torch.Tensors to check
        dim_range: Tuple[int, int]
            Allowed number of dimensions. (min, max)
        data_range: Tuple[float, float]
            Allowed range of values in tensors. (min, max)
        size_range: Tuple[int, int]
            Dimensions to include in size comparison. (start_dim, end_dim + 1)

        Returns
        -------
        None
        FNr   zExpected torch.Tensor, got zExpected tensors to be on z, got z%Expected tensors with same size, got z and r0   z9Expected tensors with same size at given dimensions, got z$Expected number of dimensions to be z,Expected number of dimensions to be between z*Expected values to be greater or equal to z(Expected values to be lower or equal to )ry   	is_tensortyper   r   r   r]  r  )r   tensors	dim_ranger  
size_ranger   tr   r   r   _validate_input  sB   $ z_SSIMLoss._validate_inputc                 C   sd   t j|t jd}||d d 8 }|d }|d|d  d|d    }||  }|dS )a7  Returns 2D Gaussian kernel N(0,sigma^2)

        Arguments
        ---------
        kernel_size: int
            Size of the kernel
        sigma: float
            Std of the distribution

        Returns
        -------
        gaussian_kernel: torch.Tensor
            [1, kernel_size, kernel_size]
        rG   r0          @rp   r   )ry   ro  r9  rz   expr  )r   r.   r  coordsgr   r   r   gaussian_filter  s   &
z_SSIMLoss.gaussian_filterc                 C   sF  | d| dk s| d| dk r"td|   d|   |d }|d }| d}tj||dd|d}	tj||dd|d}
|	d }|
d }|	|
 }tj|d |dd|d| }tj|d |dd|d| }tj|| |dd|d| }d	| | || |  }d	| | || |  | }|jd
d}|jd
d}||fS )a  Calculate Structural Similarity (SSIM) index for X and Y per channel.

        Arguments
        ---------
        x: torch.Tensor
            An input tensor (N, C, H, W).
        y: torch.Tensor
            A target tensor (N, C, H, W).
        kernel: torch.Tensor
            2D Gaussian kernel.
        k1: float
            Algorithm parameter (see equation in the link above).
        k2: float
            Algorithm parameter (see equation in the link above).
            Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results.

        Returns
        -------
        Full Value of Structural Similarity (SSIM) index.
        rs   r   AKernel size can't be greater than actual input size. Input size: . Kernel size: rp   r0   r   weightstrider/   groupsr  )rs   r   r   )r   r  Fconv2dr  )r   r   r  r  r  r  c1c2
n_channelsmu_xmu_ymu_xxmu_yymu_xysigma_xxsigma_yysigma_xycsssssim_valr   r   r   _ssim_per_channel  sR   (


z_SSIMLoss._ssim_per_channelc           $      C   sf  | d}| d| dk s| d| dk r'td|   d|   |d }|d }|d }	|d	 }
|d }|d	 }tj|	|dd
|d}tj|
|dd
|d}tj||dd
|d}tj||dd
|d}|d|d }|d|d }|| ||  }|| ||  }d}|	d|
d }|d|d }|	| |
|  }|	| |
|  }tj||dd
|d| }tj||dd
|d| }tj||dd
|d| }tj||dd
|d| }tj||fdd}tj||fdd}|d ||  |d|d ||   } |d ||  |d|d ||   }!|!|  }!|!jdd}"| jdd}#|"|#fS )a  Calculate Structural Similarity (SSIM) index for Complex X and Y per channel.

        Arguments
        ---------
        x: torch.Tensor
            An input tensor (N, C, H, W, 2).
        y: torch.Tensor
            A target tensor (N, C, H, W, 2).
        kernel: torch.Tensor
            2-D gauss kernel.
        k1: float
            Algorithm parameter (see equation in the link above).
        k2: float
            Algorithm parameter (see equation in the link above).
            Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results.

        Returns
        -------
        Full Value of Complex Structural Similarity (SSIM) index.
        r0   r   rs   r   r  r  rp   ).r   ).r0   r   r  r   r   )r   r   )	r   r  r  r  powry   stackrz   r  )$r   r   r  r  r  r  r  r  r  x_realx_imagy_realy_imagmu1_realmu1_imagmu2_realmu2_imagmu1_sqmu2_sqmu1_mu2_realmu1_mu2_imagcompensationx_sqy_sqx_y_realx_y_imag	sigma1_sq	sigma2_sqsigma12_realsigma12_imagsigma12mu1_mu2cs_mapssim_mapr  r  r   r   r   _ssim_per_channel_complex,  s   
(







z#_SSIMLoss._ssim_per_channel_complexFc                 C   s$  |d dksJ d| d| j ||gdd|fd |t| }|t| }tdtt| dd	 d
 }|dkrK|rKtj||d}tj||d}| ||	|dddd
|}| dkrf| jn| j}|||||	|
d\}}|d}|d}| ||}| ||}|r||gS |S )a^  Interface of Structural Similarity (SSIM) index.
        Inputs supposed to be in range [0, data_range].
        To match performance with skimage and tensorflow set downsample = True.

        Arguments
        ---------
        x: torch.Tensor
            An input tensor (N, C, H, W) or (N, C, H, W, 2).
        y: torch.Tensor
            A target tensor (N, C, H, W) or (N, C, H, W, 2).
        kernel_size: int
            The side-length of the sliding window used in comparison. Must be an odd value.
        kernel_sigma: float
            Sigma of normal distribution.
        data_range: Union[int, float]
            Maximum value range of images (usually 1.0 or 255).
        reduction: str
            Specifies the reduction type:
            none | mean | sum. Default:mean
        full: bool
            Return cs map or not.
        downsample: bool
            Perform average pool before SSIM computation. Default: True
        k1: float
            Algorithm parameter (see equation in the link above).
        k2: float
            Algorithm parameter (see equation in the link above).
            Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results.

        Returns
        -------
        Value of Structural Similarity (SSIM) index. In case of 5D input tensors, complex value is returned
        as a tensor of size 2.
        rp   r0   r  r  )r   r)   r   )r  r  r   N   )r.   r)   )r   r  r  r  r  )r  r   r  roundr]  r   r  
avg_pool2dr  r{   r   r   r  r  r  r  )r   r   r  r.   r  r  r  fullr  r  r  fr  _compute_ssim_per_channelr  r  r  r  r   r   r   ssim  s<   0
"




z_SSIMLoss.ssimc                 C   s:   | j ||| j| j| j| j| jd| j| jd
}t	|| S )a  Computation of Structural Similarity (SSIM) index as a loss function.

        Arguments
        ---------
        x: torch.Tensor
            An input tensor (N, C, H, W) or (N, C, H, W, 2).
        y: torch.Tensor
            A target tensor (N, C, H, W) or (N, C, H, W, 2).

        Returns
        -------
        Value of SSIM loss to be minimized, i.e 1 - ssim in [0, 1] range. In case of 5D input tensors,
        complex value is returned as a tensor of size 2.
        F)
r   r  r.   r  r  r  r  r  r  r  )
r  r.   r  r  r  r  r  r  ry   	ones_like)r   r   r  scorer   r   r   r      s   z_SSIMLoss.forward)r  r  r  r  Tr  r   )r  )r  r  N)r  r  )r  r  r   r  FTr  r  )r"   r#   r$   r%   __constants__r   r  r  r  r  r  r  r    r&   r   r   r   r   rk  3  s:    #

?

Fk
Wrk  c                   @   r   )TextMelCollateWithAlignmenta`  Zero-pads model inputs and targets based on number of frames per step
    result: tuple
        a tuple of tensors to be used as inputs/targets
        (
            text_padded,
            dur_padded,
            input_lengths,
            mel_padded,
            output_lengths,
            len_x,
            labels,
            wavs
        )
    c              	   C   s  t |}tt|D ]
}|| d ||< q
tjtdd |D ddd\}}|d }tt||}|  tt|D ]}|||  d }|||d|df< q=|d d d}	td	d |D }
t	t||	|
}|  t	t||
}|  t	t||
}|  tt|}g g }}tt|D ]U}|| }|| d }|| d
 }|| d }|||ddd|df< |||d|df< |||d|df< |d||< |
|| d  |
|| d  q|dd
d}||||||||fS )a  Collate's training batch from normalized text and mel-spectrogram

        Arguments
        ---------
        batch: list
            [text_normalized, mel_normalized]

        Returns
        -------
        phoneme_padded: torch.Tensor
        input_lengths: torch.Tensor
        mel_padded: torch.Tensor
        pitch_padded: torch.Tensor
        energy_padded: torch.Tensor
        output_lengths: torch.Tensor
        labels: torch.Tensor
        wavs: torch.Tensor
        r   c                 S   r   r   r   r   r   r   r   r   6  r   z8TextMelCollateWithAlignment.__call__.<locals>.<listcomp>r   Tr   Nr0   c                 S   s   g | ]	}|d   d qS )r0   r   r   r   r   r   r   D  r   rp   r   r   r   )r   r5   rH   ry   r   r   r  r   r  r  r6   r   )r   r  r  rF   r  r  r	  phoneme_paddedphonemer  r  r  r  r  r  r  r  r  r   r   r   r   r   r   r     sZ   

z$TextMelCollateWithAlignment.__call__Nr!  r   r   r   r   r  	  s    r  c              	   C   s  t j }| | } | j}| j}|    } |   t j}| j	\}}}t j
| j	t jd}t j
||ft jd}	t j|t jddd}
t|D ]G}t j|	ddgddggd|dddddf }|	}||k}t |||}||dddd|f< |
|k}t ||| dddd|f  |}	qKt ||d}t j
| j	t jd}|dddddf dt jd }t |}tt|D ]}d||||f< |||||f  d }q||t j }t|j||d}|S )	a<  
    Monotonic alignment search algorithm, numpy works faster than the torch implementation.

    Arguments
    ---------
    value: torch.Tensor
        input alignment values [b, t_x, t_y]
    mask: torch.Tensor
        input alignment mask [b, t_x, t_y]

    Returns
    -------
    path: torch.Tensor

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import maximum_path_numpy
    >>> alignment = torch.rand(2, 5, 100)
    >>> mask = torch.ones(2, 5, 100)
    >>> hard_alignments = maximum_path_numpy(alignment, mask)
    r  r0   rs   r   constant)modeconstant_valuesN)r   rG   )npinfr   rG   cpudetachnumpyastypebool_r|   zerosint64r9  ro  reshaper5   r   r   r  reversedry   
from_numpyr   )valuers  max_neg_valr   rG   bt_xt_y	directionvx_rangejv0v1max_maskv_max
index_maskpathindexindex_ranger   r   r   maximum_path_numpyh  s@   &(
r  c                       s2   e Zd ZdZ				d	 fdd	Zdd Z  ZS )
AlignmentNetworka  Learns the alignment between the input text
    and the spectrogram with Gaussian Attention.

    query -> conv1d -> relu -> conv1d -> relu -> conv1d -> L2_dist -> softmax -> alignment
    key   -> conv1d -> relu -> conv1d - - - - - - - - - - - -^

    Arguments
    ---------
    in_query_channels: int
        Number of channels in the query network. Defaults to 80.
    in_key_channels: int
        Number of channels in the key network. Defaults to 512.
    attn_channels: int
        Number of inner channels in the attention layers. Defaults to 80.
    temperature: float
        Temperature for the softmax. Defaults to 0.0005.

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import AlignmentNetwork
    >>> aligner = AlignmentNetwork(
    ...     in_query_channels=80,
    ...     in_key_channels=512,
    ...     attn_channels=80,
    ...     temperature=0.0005,
    ... )
    >>> phoneme_feats = torch.rand(2, 512, 20)
    >>> mels = torch.rand(2, 80, 100)
    >>> alignment_soft, alignment_logprob = aligner(mels, phoneme_feats, None, None)
    >>> alignment_soft.shape, alignment_logprob.shape
    (torch.Size([2, 1, 100, 20]), torch.Size([2, 1, 100, 20]))
    r(   r   Mb@?c                    s   t    || _tjjdd| _tjjdd| _t	t
j||d dddddtj t
j|d |ddddd| _t	t
j||d dddddtj t
j|d |dddddtj t
j||ddddd| _d S )Nr   r   rp   r+   T)r-   r   r.   r/   biasr   r0   )r   r   temperaturery   r   Softmaxsoftmax
LogSoftmaxlog_softmax
Sequentialr   r1   rQ   	key_layerquery_layer)r   in_query_channelsin_key_channelsattn_channelsr	  r   r   r   r     sf   

zAlignmentNetwork.__init__c           
      C   s   |  |}| |}|dddddddf |dddddf  d }| j |jddd }|durF| |t|dddf d  }|durZ|j|	 
d td  | |}	|	|fS )a9  Forward pass of the aligner encoder.

        Arguments
        ---------
        queries: torch.Tensor
            the query tensor [B, C, T_de]
        keys: torch.Tensor
            the query tensor [B, C_emb, T_en]
        mask: torch.Tensor
            the query mask[B, T_de]
        attn_prior: torch.Tensor
            the prior attention tensor [B, 1, T_en, T_de]

        Returns
        -------
        attn: torch.Tensor
            soft attention [B, 1, T_en, T_de]
        attn_logp: torch.Tensor
            log probabilities [B, 1, T_en , T_de]
        Nrp   r0   T)ru  rv  r  )r  r  r	  r  r  ry   rh  rn  masked_fill_r   rz   r   r  )
r   querieskeysrs  
attn_priorkey_out	query_outattn_factor	attn_logpattnr   r   r   r      s   

6
zAlignmentNetwork.forward)r(   r   r(   r  r!   r   r   r   r   r    s    $=r  c                       s>   e Zd ZdZ fddZdd Z						d
dd	Z  ZS )FastSpeech2WithAlignmenta  The FastSpeech2 text-to-speech model with internal alignment.
    This class is the main entry point for the model, which is responsible
    for instantiating all submodules, which, in turn, manage the individual
    neural network layers. Certain parts are adopted from the following implementation:
    https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/models/forward_tts.py

    Simplified STRUCTURE:
    input -> token embedding -> encoder -> aligner -> duration/pitch/energy -> upsampler -> decoder -> output

    Arguments
    ---------
    enc_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in encoder
    enc_num_head: int
        number of multi-head-attention (MHA) heads in encoder transformer layers
    enc_d_model: int
        the number of expected features in the encoder
    enc_ffn_dim: int
        the dimension of the feedforward network model
    enc_k_dim: int
        the dimension of the key
    enc_v_dim: int
        the dimension of the value
    enc_dropout: float
        Dropout for the encoder
    in_query_channels: int
        Number of channels in the query network.
    in_key_channels: int
        Number of channels in the key network.
    attn_channels: int
        Number of inner channels in the attention layers.
    temperature: float
        Temperature for the softmax.
    dec_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in decoder
    dec_num_head: int
        number of multi-head-attention (MHA) heads in decoder transformer layers
    dec_d_model: int
        the number of expected features in the decoder
    dec_ffn_dim: int
        the dimension of the feedforward network model
    dec_k_dim: int
        the dimension of the key
    dec_v_dim: int
        the dimension of the value
    dec_dropout: float
        dropout for the decoder
    normalize_before: bool
        whether normalization should be applied before or after MHA or FFN in Transformer layers.
    ffn_type: str
        whether to use convolutional layers instead of feed forward network inside transformer layer.
    ffn_cnn_kernel_size_list: list of int
        conv kernel size of 2 1d-convs if ffn_type is 1dcnn
    n_char: int
        the number of symbols for the token embedding
    n_mels: int
        number of bins in mel spectrogram
    postnet_embedding_dim: int
        output feature dimension for convolution layers
    postnet_kernel_size: int
        postnet convolution kernel size
    postnet_n_convolutions: int
        number of convolution layers
    postnet_dropout: float
        dropout probability for postnet
    padding_idx: int
        the index for padding
    dur_pred_kernel_size: int
        the convolution kernel size in duration predictor
    pitch_pred_kernel_size: int
        kernel size for pitch prediction.
    energy_pred_kernel_size: int
        kernel size for energy prediction.
    variance_predictor_dropout: float
        dropout probability for variance predictor (duration/pitch/energy)

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import FastSpeech2WithAlignment
    >>> model = FastSpeech2WithAlignment(
    ...    enc_num_layers=6,
    ...    enc_num_head=2,
    ...    enc_d_model=384,
    ...    enc_ffn_dim=1536,
    ...    enc_k_dim=384,
    ...    enc_v_dim=384,
    ...    enc_dropout=0.1,
    ...    in_query_channels=80,
    ...    in_key_channels=384,
    ...    attn_channels=80,
    ...    temperature=0.0005,
    ...    dec_num_layers=6,
    ...    dec_num_head=2,
    ...    dec_d_model=384,
    ...    dec_ffn_dim=1536,
    ...    dec_k_dim=384,
    ...    dec_v_dim=384,
    ...    dec_dropout=0.1,
    ...    normalize_before=False,
    ...    ffn_type='1dcnn',
    ...    ffn_cnn_kernel_size_list=[9, 1],
    ...    n_char=40,
    ...    n_mels=80,
    ...    postnet_embedding_dim=512,
    ...    postnet_kernel_size=5,
    ...    postnet_n_convolutions=5,
    ...    postnet_dropout=0.5,
    ...    padding_idx=0,
    ...    dur_pred_kernel_size=3,
    ...    pitch_pred_kernel_size=3,
    ...    energy_pred_kernel_size=3,
    ...    variance_predictor_dropout=0.5)
    >>> inputs = torch.tensor([
    ...     [13, 12, 31, 14, 19],
    ...     [31, 16, 30, 31, 0],
    ... ])
    >>> mels = torch.rand(2, 100, 80)
    >>> mel_post, postnet_output, durations, predict_pitch, avg_pitch, predict_energy, avg_energy, mel_lens, alignment_durations, alignment_soft, alignment_logprob, alignment_mas = model(inputs, mels)
    >>> mel_post.shape, durations.shape
    (torch.Size([2, 100, 80]), torch.Size([2, 5]))
    >>> predict_pitch.shape, predict_energy.shape
    (torch.Size([2, 5, 1]), torch.Size([2, 5, 1]))
    >>> alignment_soft.shape, alignment_mas.shape
    (torch.Size([2, 100, 5]), torch.Size([2, 100, 5]))
    c!           !         s   t    || _|| _|| _t|| _t|| _t|||d| _	t
|||| d| _t
|||| d| _t
|||| d| _tjd||ddd| _tjd||ddd| _t|||||||tj|||d| _t|||||||tj|||d| _tj||d| _t|||||d	| _t||	|
|d
| _d S )NrW   r   r0   r+   Tr   rX   rK   r   )r  r  r  r	  )r   r   rc   r   rd   r   rf   r   r   re   rI   r   r   r   r   r1   r   r   r   r   rQ   r   r   r	   rP   r'   r   r  aligner)!r   ri   rc   rj   rk   rl   rm   rn   r  r  r  r	  r   r   r   r   r   r   r   r`   ra   rb   ro   r   rB   rC   rD   rE   rd   r   r   r   r   r   r   r   r     s   
&z!FastSpeech2WithAlignment.__init__c           
      C   s   t |dt |d }| |dd|dd|d\}}t|ddd |d }t |d }	|ddd}|	|||fS )a  Aligner forward pass.
        1. Compute a mask to apply to the attention map.
        2. Run the alignment network.
        3. Apply MAS (Monotonic alignment search) to compute the hard alignment map.
        4. Compute the durations from the hard alignment map.

        Arguments
        ---------
        x: torch.Tensor
            Input sequence [B, T_en, C_en].
        y: torch.Tensor
            Output sequence [B, T_de, C_de].
        x_mask: torch.Tensor
            Input sequence mask [B, 1, T_en].
        y_mask: torch.Tensor
            Output sequence mask [B, 1, T_de].

        Returns
        -------
        durations: torch.Tensor
            Durations from the hard alignment map [B, T_en].
        alignment_soft: torch.Tensor
            soft alignment potentials [B, T_en, T_de].
        alignment_logprob: torch.Tensor
            log scale alignment potentials [B, 1, T_de, T_en].
        alignment_mas: torch.Tensor
            hard alignment map [B, T_en, T_de].
        rs   rp   r0   N)	ry   rz   r  	transposer  r   
contiguousr  int)
r   r   r  rU   y_maskr   alignment_softalignment_logprobalignment_masr   r   r   r   _forward_aligner(	  s   z)FastSpeech2WithAlignment._forward_alignerNr   c                  C   s  t || jd}| d}	| |}
| |
}t|
||	 }
|d| jd|
j	d 
ddd }| j|
||d\}
}|
|	 }
d}d}d}d}|dur{t || jd}| d}| |
||	dd|dd\}}}}|dd}|dd}| |
|	 }| dkr|d}ttj|d}d}| |
|	}|| }|durt|d|}| |}|
ddd}n
| |
ddd}|
ddd}|
|}
d}| |
|	}|| }|durt|d|}| |}|
ddd}n
| |
ddd}|
ddd}|
|}
t|
|dur|n||d\}}tt|}||j}| d}	|d| jd|j	d 
ddd }|  |}t|||	 }| j!|||d^}}}| "||	 }| #|| }|||||||t|||||fS )	a  forward pass for training and inference

        Arguments
        ---------
        tokens: torch.Tensor
            batch of input tokens
        mel_spectograms: torch.Tensor
            batch of mel_spectograms (used only for training)
        pitch: torch.Tensor
            batch of pitch for each frame. If it is None, the model will infer on predicted pitches
        energy: torch.Tensor
            batch of energy for each frame. If it is None, the model will infer on predicted energies
        pace: float
            scaling factor for durations
        pitch_rate: float
            scaling factor for pitches
        energy_rate: float
            scaling factor for energies

        Returns
        -------
        mel_post: torch.Tensor
            mel outputs from the decoder
        postnet_output: torch.Tensor
            mel outputs from the postnet
        predict_durations: torch.Tensor
            predicted durations of each token
        predict_pitch: torch.Tensor
            predicted pitches of each token
        avg_pitch: torch.Tensor
            target pitches for each token if input pitch is not None
            None if input pitch is None
        predict_energy: torch.Tensor
            predicted energies of each token
        avg_energy: torch.Tensor
            target energies for each token if input energy is not None
            None if input energy is None
        mel_length:
            predicted lengths of mel spectrograms
        alignment_durations:
            durations from the hard alignment map
        alignment_soft: torch.Tensor
            soft alignment potentials
        alignment_logprob: torch.Tensor
            log scale alignment potentials
        alignment_mas: torch.Tensor
            hard alignment map
        rq   rs   r0   r   rp   rv   Nr   )$r   rd   rz   re   rf   ry   r}   r{   rc   r|   r   r   r   r&  r  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   r   ) r   r   mel_spectogramsr   r   r   r   r   r   r   r   r   r   r   alignment_durationsr#  r$  r%  r"  y_mask_invertedr   predict_durations_reverse_logr   r   r   r   r   r   r   r   r   r   r   r   r   r    Q	  s   :














	

z FastSpeech2WithAlignment.forwardr   )r"   r#   r$   r%   r   r&  r    r&   r   r   r   r   r  (  s     ,r  c                       (   e Zd ZdZ fddZdd Z  ZS )LossWithAlignmenta  Loss computation including internal aligner

    Arguments
    ---------
    log_scale_durations: bool
       applies logarithm to target durations
    ssim_loss_weight: float
       weight for the ssim loss
    duration_loss_weight: float
       weight for the duration loss
    pitch_loss_weight: float
       weight for the pitch loss
    energy_loss_weight: float
       weight for the energy loss
    mel_loss_weight: float
       weight for the mel loss
    postnet_mel_loss_weight: float
       weight for the postnet mel loss
    aligner_loss_weight: float
       weight for the alignment loss
    binary_alignment_loss_weight: float
       weight for the postnet mel loss
    binary_alignment_loss_warmup_epochs: int
       Number of epochs to gradually increase the impact of binary loss.
    binary_alignment_loss_max_epochs: int
       From this epoch on the impact of binary loss is ignored.
    c                    s   t    t | _t | _t | _t | _t | _	t | _
t | _t | _|| _|| _|| _|| _|| _|| _|| _|| _|	| _|
| _|| _d S r$  )r   r   r%  r&  r   r'  r(  r)  r*  r+  r,  ForwardSumLossaligner_lossBinaryAlignmentLossbinary_alignment_lossr-  r.  r/  r0  r1  r2  r3  aligner_loss_weightbinary_alignment_loss_weight#binary_alignment_loss_warmup_epochs binary_alignment_loss_max_epochs)r   r-  r.  r1  r2  r3  r/  r0  r1  r2  r3  r4  r   r   r   r   .
  s*   






zLossWithAlignment.__init__c           "      C   s  |\}}}}}t |jdksJ |\}	}
}}}}}}}}}}|d}|d}|d}|d}|d}| jrAt| }t|jd D ]}|dkr| |	|d|| ddf ||d|| ddf }| 	|
|d|| ddf ||d|| ddf }| 
||d|| f ||d|| f tj}| ||d|| f ||d|| f tj}| ||d|| f ||d|| f tj}qH|| |	|d|| ddf ||d|| ddf  }|| 	|
|d|| ddf ||d|| ddf  }|| 
||d|| f ||d|| f tj }|| ||d|| f ||d|| f tj }|| ||d|| f ||d|| f tj }qHd}i }| |	||}|| j |d< t|t |}|| j |d< t|t |}|| j |d< t|t |}|| j |d< t|t |}|| j |d	< t|t |}|| j |d
< |dur| |||}|| j |d< |dur|dur|| jkrd} n
t|| j dd } | ||}!|!| j |  |d< t| }||d< |S )a  Computes the value of the loss function and updates stats

        Arguments
        ---------
        predictions: tuple
            model predictions
        targets: tuple
            ground truth data
        current_epoch: int
            used to determinate the start/end of the binary alignment loss

        Returns
        -------
        loss: torch.Tensor
            the loss value
        r   rs   r   Nr&  r(  r)  r*  r+  r,  r.  r   r0  r6  ) rH   r|   r   r-  ry   r8  r   r5   r(  r)  r*  r   r9  r+  r,  r&  r.  r:  r/  r0  r1  r2  r3  r.  r1  r4  r]  r3  r0  r2  r  r   )"r   r;  r<  r=  r>  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  r   r(  r#  r$  alignment_hardrL  rF   r(  r)  r*  r+  r,  r6  rM  r&  r.  binary_loss_warmup_weightr0  r   r   r   r    T
  s   





	zLossWithAlignment.forwardr!   r   r   r   r   r,  
  s    &r,  c                       r   )r-  a  CTC alignment loss

    Arguments
    ---------
    blank_logprob: pad value

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import ForwardSumLoss
    >>> loss_func = ForwardSumLoss()
    >>> attn_logprob = torch.rand(2, 1, 100, 5)
    >>> key_lens = torch.tensor([5, 5])
    >>> query_lens = torch.tensor([100, 100])
    >>> loss = loss_func(attn_logprob, key_lens, query_lens)
    rs   c                    s4   t    tjjdd| _tjjdd| _|| _d S )Nr   r   T)zero_infinity)	r   r   ry   r   r  r  CTCLossctc_lossblank_logprob)r   r:  r   r   r   r   
  s   

zForwardSumLoss.__init__c           
   	   C   s   t jjj|d| jd}d}t|jd D ]K}t d|| d d}|| 	dddd|| ddd|| d f }| 
|d d }| j|||||d  |||d  d}	||	 }q||jd  }|S )	aN  
        Arguments
        ---------
        attn_logprob: torch.Tensor
            log scale alignment potentials [B, 1, query_lens, key_lens]
        key_lens: torch.Tensor
            mel lengths
        query_lens: torch.Tensor
            phoneme lengths

        Returns
        -------
        total_loss: torch.Tensor
        r   )inputr   r  rJ   r   r0   rp   N)r  target_lengths)ry   r   r   r   r:  r5   r|   ro  rz   r   r  r9  )
r   attn_logprobkey_lens
query_lensattn_logprob_paddedr6  bid
target_seqcurr_logprobrM  r   r   r   r      s&    
zForwardSumLoss.forward)rs   r!   r   r   r   r   r-  
  s    r-  c                       r+  )r/  a  Binary loss that forces soft alignments to match the hard alignments as
    explained in `https://arxiv.org/pdf/2108.10447.pdf`.
    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import BinaryAlignmentLoss
    >>> loss_func = BinaryAlignmentLoss()
    >>> alignment_hard = torch.randint(0, 2, (2, 100, 5))
    >>> alignment_soft = torch.rand(2, 100, 5)
    >>> loss = loss_func(alignment_hard, alignment_soft)
    c                    s   t    d S r$  )r   r   rm  r   r   r   r   7  s   zBinaryAlignmentLoss.__init__c                 C   s.   t t j||dk dd }| |  S )z
        alignment_hard: torch.Tensor
            hard alignment map [B, mel_lens, phoneme_lens]
        alignment_soft: torch.Tensor
            soft alignment potentials [B, mel_lens, phoneme_lens]
        r0   g-q=rg  )ry   rh  r   r  )r   r5  r#  log_sumr   r   r   r    :  s   zBinaryAlignmentLoss.forwardr!   r   r   r   r   r/  *  s    r/  )r   rJ   )r0   rf  )-r%   r  r  ry   torch.nn.functionalr   r   r  torch.nn.modules.lossr   0speechbrain.lobes.models.transformer.Transformerr   r   r   r   speechbrain.nnetr   r	   speechbrain.nnet.embeddingr
   speechbrain.nnet.lossesr   speechbrain.nnet.normalizationr   Moduler   r'   rI   rV   r   r   r   r   r"  re  r^  r%  rk  r  r  r  r  r,  r-  r/  r   r   r   r   <module>   sV    	1ZK   
)"y 7
X]   Y_;    l [?