o
    ½e¦i•t ã                   @   sÂ  d Z ddlZddlZddlm  mZ ddlmZ ddlm	Z	 ddl
mZmZmZmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ G d
d„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZdd„ Zd6dd„ZG dd„ dƒZ G dd„ dejƒZ!dd„ Z"d7d"d#„Z#G d$d%„ d%ejjƒZ$G d&d'„ d'e	ƒZ%G d(d)„ d)ƒZ&d*d+„ Z'G d,d-„ d-ejjƒZ(G d.d/„ d/ejƒZ)G d0d1„ d1ejƒZ*G d2d3„ d3ejƒZ+G d4d5„ d5ejƒZ,dS )8z¸
Neural network modules for the FastSpeech 2: Fast and High-Quality End-to-End Text to Speech
synthesis model
Authors
* Sathvik Udupa 2022
* Pradnya Kandarkar 2023
* Yingzhi Wang 2023
é    N)Únn)Ú_Loss)ÚPositionalEncodingÚTransformerEncoderÚget_key_padding_maskÚget_mask_from_lengths)ÚCNNÚlinear)Ú	Embedding)Úbce_loss)Ú	LayerNormc                       ó*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )ÚEncoderPreNeta?  Embedding layer for tokens

    Arguments
    ---------
    n_vocab: int
        size of the dictionary of embeddings
    blank_id: int
        padding index
    out_channels: int
        the size of each embedding vector

    Example
    -------
    >>> from speechbrain.nnet.embedding import Embedding
    >>> from speechbrain.lobes.models.FastSpeech2 import EncoderPreNet
    >>> encoder_prenet_layer = EncoderPreNet(n_vocab=40, blank_id=0, out_channels=384)
    >>> x = torch.rand(3, 5)
    >>> y = encoder_prenet_layer(x)
    >>> y.shape
    torch.Size([3, 5, 384])
    é   c                    s   t ƒ  ¡  t|||d| _d S )N)Únum_embeddingsÚembedding_dimÚblank_id)ÚsuperÚ__init__r
   Útoken_embedding)ÚselfÚn_vocabr   Úout_channels©Ú	__class__© úb/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/FastSpeech2.pyr   3   s   
ýzEncoderPreNet.__init__c                 C   s   | j  |j¡| _ |   |¡}|S )zïComputes the forward pass

        Arguments
        ---------
        x: torch.Tensor
            a (batch, tokens) input tensor

        Returns
        -------
        output: torch.Tensor
            the embedding layer output
        )r   ÚtoÚdevice)r   Úxr   r   r   Úforward;   s   
zEncoderPreNet.forward)r   ©Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r    Ú__classcell__r   r   r   r   r      s    r   c                       s4   e Zd ZdZ					d
‡ fdd„	Zdd	„ Z‡  ZS )ÚPostNeta­  
    FastSpeech2 Conv Postnet
    Arguments
    ---------
    n_mel_channels: int
       input feature dimension for convolution layers
    postnet_embedding_dim: int
       output feature dimension for convolution layers
    postnet_kernel_size: int
       postnet convolution kernel size
    postnet_n_convolutions: int
       number of convolution layers
    postnet_dropout: float
        dropout probability for postnet
    éP   r   é   ç      à?c              	      sÄ   t t| ƒ ¡  tj|||dd| _t ¡ | _t	d|d ƒD ]}| j 
tj|||dd¡ qtj|||dd| _t ¡ | _t |¡| _t |¡| _t |¡| _t |¡| _t |¡| _t |¡| _d S )NÚsame©Úin_channelsr   Úkernel_sizeÚpaddingé   )r   r'   r   r   ÚConv1dÚconv_prer   Ú
ModuleListÚconvs_intermediateÚrangeÚappendÚ	conv_postÚTanhÚtanhr   Úln1Úln2Úln3ÚDropoutÚdropout1Údropout2Údropout3)r   Ún_mel_channelsÚpostnet_embedding_dimÚpostnet_kernel_sizeÚpostnet_n_convolutionsÚpostnet_dropoutÚir   r   r   r   ^   s<   ü
üÿ	ü
zPostNet.__init__c                 C   s¢   |   |¡}|  |¡ |j¡}|  |¡}|  |¡}tt| jƒƒD ]	}| j| |ƒ}q|  	|¡ |j¡}|  |¡}|  
|¡}|  |¡}|  |¡ |j¡}|  |¡}|S )züComputes the forward pass

        Arguments
        ---------
        x: torch.Tensor
            a (batch, time_steps, features) input tensor

        Returns
        -------
        output: torch.Tensor
            the spectrogram predicted
        )r2   r:   r   Údtyper9   r>   r5   Úlenr4   r;   r?   r7   r<   r@   )r   r   rF   r   r   r   r    ˆ   s   






zPostNet.forward)r(   r   r)   r)   r*   r!   r   r   r   r   r'   M   s    ú*r'   c                       s,   e Zd ZdZ	d‡ fdd„	Zdd„ Z‡  ZS )	ÚDurationPredictoraÑ  Duration predictor layer

    Arguments
    ---------
    in_channels: int
       input feature dimension for convolution layers
    out_channels: int
       output feature dimension for convolution layers
    kernel_size: int
       duration predictor convolution kernel size
    dropout: float
       dropout probability, 0 by default
    n_units: int

    Example
    -------
    >>> from speechbrain.lobes.models.FastSpeech2 import FastSpeech2
    >>> duration_predictor_layer = DurationPredictor(in_channels=384, out_channels=384, kernel_size=3)
    >>> x = torch.randn(3, 400, 384)
    >>> mask = torch.ones(3, 400, 384)
    >>> y = duration_predictor_layer(x, mask)
    >>> y.shape
    torch.Size([3, 400, 1])
    ç        r0   c                    s|   t ƒ  ¡  tj|||dd| _tj|||dd| _tj||d| _t|ƒ| _	t|ƒ| _
t ¡ | _t |¡| _t |¡| _d S )Nr+   r,   ©Ú	n_neuronsÚ
input_size)r   r   r   r1   Úconv1Úconv2r	   ÚLinearr   r:   r;   r   ÚReLUÚrelur=   r>   r?   )r   r-   r   r.   ÚdropoutÚn_unitsr   r   r   r   Á   s&   
üü


zDurationPredictor.__init__c                 C   sn   |   |  || ¡¡}|  |¡ |j¡}|  |¡}|   |  || ¡¡}|  |¡ |j¡}|  |¡}|  	|| ¡S )a?  Computes the forward pass

        Arguments
        ---------
        x: torch.Tensor
            a (batch, time_steps, features) input tensor
        x_mask: torch.Tensor
            mask of input tensor

        Returns
        -------
        output: torch.Tensor
            the duration predictor outputs
        )
rR   rN   r:   r   rG   r>   rO   r;   r?   r	   )r   r   Úx_maskr   r   r   r    Ø   s   

zDurationPredictor.forward)rJ   r0   r!   r   r   r   r   rI   §   s
    ÿrI   c                       s0   e Zd ZdZ‡ fdd„Zdd„ Zdd„ Z‡  ZS )ÚSPNPredictora<  
    This module for the silent phoneme predictor. It receives phoneme sequences without any silent phoneme token as
    input and predicts whether a silent phoneme should be inserted after a position. This is to avoid the issue of fast
    pace at inference time due to having no silent phoneme tokens in the input sequence.

    Arguments
    ---------
    enc_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in encoder
    enc_num_head: int
        number of multi-head-attention (MHA) heads in encoder transformer layers
    enc_d_model: int
        the number of expected features in the encoder
    enc_ffn_dim: int
        the dimension of the feedforward network model
    enc_k_dim: int
        the dimension of the key
    enc_v_dim: int
        the dimension of the value
    enc_dropout: float
        Dropout for the encoder
    normalize_before: bool
        whether normalization should be applied before or after MHA or FFN in Transformer layers.
    ffn_type: str
        whether to use convolutional layers instead of feed forward network inside transformer layer
    ffn_cnn_kernel_size_list: list of int
        conv kernel size of 2 1d-convs if ffn_type is 1dcnn
    n_char: int
        the number of symbols for the token embedding
    padding_idx: int
        the index for padding
    c                    sf   t ƒ  ¡  || _|| _t|||d| _t|ƒ| _t|||||||t	j
||	|
d| _tjd|d| _d S )N©r   ©Ú
num_layersÚnheadÚd_ffnÚd_modelÚkdimÚvdimrS   Ú
activationÚnormalize_beforeÚffn_typeÚffn_cnn_kernel_size_listr0   rK   )r   r   Úenc_num_headÚpadding_idxr   Ú	encPreNetr   Ú#sinusoidal_positional_embed_encoderr   r   rQ   Úspn_encoderr	   rP   Ú
spn_linear)r   Úenc_num_layersrc   Úenc_d_modelÚenc_ffn_dimÚ	enc_k_dimÚ	enc_v_dimÚenc_dropoutr`   ra   rb   Ún_charrd   r   r   r   r     s.   
ÿÿõzSPNPredictor.__init__c                 C   sÌ   |   |¡}t |d¡ dd|jd ¡}|| }t|| jd}|  d¡}|  |¡}t ||¡| }tj	tj
|jd |jd |jddd ¡  | j|jd  dd¡}| j|||d\}}	|  |¡ d¡}
|
S )	a   forward pass for the module

        Arguments
        ---------
        tokens: torch.Tensor
            input tokens without silent phonemes
        last_phonemes: torch.Tensor
            indicates if a phoneme at an index is the last phoneme of a word or not

        Returns
        -------
        spn_decision: torch.Tensor
            indicates if a silent phoneme should be inserted after a phoneme
        é   r0   ©Úpad_idxéÿÿÿÿ©r   )Údiagonalr   ©Úsrc_maskÚsrc_key_padding_mask)re   ÚtorchÚ	unsqueezeÚrepeatÚshaper   rd   rf   ÚaddÚtriuÚonesr   Úboolrc   rg   rh   Úsqueeze)r   ÚtokensÚlast_phonemesÚtoken_featsÚsrcmaskÚsrcmask_invertedÚposÚspn_maskÚspn_token_featsÚ_Úspn_decisionr   r   r   r    ?  s2   
ÿ
ýúö
ÿzSPNPredictor.forwardc                 C   s   |   ||¡}t |¡dk}|S )a—  inference function

        Arguments
        ---------
        tokens: torch.Tensor
            input tokens without silent phonemes
        last_phonemes: torch.Tensor
            indicates if a phoneme at an index is the last phoneme of a word or not

        Returns
        -------
        spn_decision: torch.Tensor
            indicates if a silent phoneme should be inserted after a phoneme
        gš™™™™™é?)r    ry   Úsigmoid)r   r‚   rƒ   r‹   r   r   r   Úinfern  s   zSPNPredictor.infer)r"   r#   r$   r%   r   r    r   r&   r   r   r   r   rV   ò   s
    !+/rV   c                       s6   e Zd ZdZ‡ fdd„Z						ddd„Z‡  ZS )	ÚFastSpeech2aÀ  The FastSpeech2 text-to-speech model.
    This class is the main entry point for the model, which is responsible
    for instantiating all submodules, which, in turn, manage the individual
    neural network layers
    Simplified STRUCTURE: input->token embedding ->encoder ->duration/pitch/energy predictor ->duration
    upsampler -> decoder -> output
    During training, teacher forcing is used (ground truth durations are used for upsampling)

    Arguments
    ---------
    enc_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in encoder
    enc_num_head: int
        number of multi-head-attention (MHA) heads in encoder transformer layers
    enc_d_model: int
        the number of expected features in the encoder
    enc_ffn_dim: int
        the dimension of the feedforward network model
    enc_k_dim: int
        the dimension of the key
    enc_v_dim: int
        the dimension of the value
    enc_dropout: float
        Dropout for the encoder
    dec_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in decoder
    dec_num_head: int
        number of multi-head-attention (MHA) heads in decoder transformer layers
    dec_d_model: int
        the number of expected features in the decoder
    dec_ffn_dim: int
        the dimension of the feedforward network model
    dec_k_dim: int
        the dimension of the key
    dec_v_dim: int
        the dimension of the value
    dec_dropout: float
        dropout for the decoder
    normalize_before: bool
        whether normalization should be applied before or after MHA or FFN in Transformer layers.
    ffn_type: str
        whether to use convolutional layers instead of feed forward network inside transformer layer.
    ffn_cnn_kernel_size_list: list of int
        conv kernel size of 2 1d-convs if ffn_type is 1dcnn
    n_char: int
        the number of symbols for the token embedding
    n_mels: int
        number of bins in mel spectrogram
    postnet_embedding_dim: int
       output feature dimension for convolution layers
    postnet_kernel_size: int
       postnet convolution kernel size
    postnet_n_convolutions: int
       number of convolution layers
    postnet_dropout: float
        dropout probability for postnet
    padding_idx: int
        the index for padding
    dur_pred_kernel_size: int
        the convolution kernel size in duration predictor
    pitch_pred_kernel_size: int
        kernel size for pitch prediction.
    energy_pred_kernel_size: int
        kernel size for energy prediction.
    variance_predictor_dropout: float
        dropout probability for variance predictor (duration/pitch/energy)

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import FastSpeech2
    >>> model = FastSpeech2(
    ...    enc_num_layers=6,
    ...    enc_num_head=2,
    ...    enc_d_model=384,
    ...    enc_ffn_dim=1536,
    ...    enc_k_dim=384,
    ...    enc_v_dim=384,
    ...    enc_dropout=0.1,
    ...    dec_num_layers=6,
    ...    dec_num_head=2,
    ...    dec_d_model=384,
    ...    dec_ffn_dim=1536,
    ...    dec_k_dim=384,
    ...    dec_v_dim=384,
    ...    dec_dropout=0.1,
    ...    normalize_before=False,
    ...    ffn_type='1dcnn',
    ...    ffn_cnn_kernel_size_list=[9, 1],
    ...    n_char=40,
    ...    n_mels=80,
    ...    postnet_embedding_dim=512,
    ...    postnet_kernel_size=5,
    ...    postnet_n_convolutions=5,
    ...    postnet_dropout=0.5,
    ...    padding_idx=0,
    ...    dur_pred_kernel_size=3,
    ...    pitch_pred_kernel_size=3,
    ...    energy_pred_kernel_size=3,
    ...    variance_predictor_dropout=0.5)
    >>> inputs = torch.tensor([
    ...     [13, 12, 31, 14, 19],
    ...     [31, 16, 30, 31, 0],
    ... ])
    >>> input_lengths = torch.tensor([5, 4])
    >>> durations = torch.tensor([
    ...     [2, 4, 1, 5, 3],
    ...     [1, 2, 4, 3, 0],
    ... ])
    >>> mel_post, postnet_output, predict_durations, predict_pitch, avg_pitch, predict_energy, avg_energy, mel_lens = model(inputs, durations=durations)
    >>> mel_post.shape, predict_durations.shape
    (torch.Size([2, 15, 80]), torch.Size([2, 5]))
    >>> predict_pitch.shape, predict_energy.shape
    (torch.Size([2, 5, 1]), torch.Size([2, 5, 1]))
    c                    s  t ƒ  ¡  || _|	| _|| _t|ƒ| _t|
ƒ| _t|||d| _	t
||||d| _t
||||d| _t
||||d| _tjd||ddd| _tjd||ddd| _t|||||||tj|||d| _t||	||
|||tj|||d| _tj||
d| _t|||||d	| _d S )
NrW   ©r-   r   r.   rS   r0   r+   T©r-   r   r.   r/   Úskip_transposerX   rK   ©rA   rB   rC   rD   rE   )r   r   rc   Údec_num_headrd   r   rf   Ú#sinusoidal_positional_embed_decoderr   re   rI   ÚdurPredÚ	pitchPredÚ
energyPredr   r1   Ú
pitchEmbedÚenergyEmbedr   r   rQ   ÚencoderÚdecoderr	   rP   r'   Úpostnet)r   ri   rc   rj   rk   rl   rm   rn   Údec_num_layersr“   Údec_d_modelÚdec_ffn_dimÚ	dec_k_dimÚ	dec_v_dimÚdec_dropoutr`   ra   rb   ro   Ún_melsrB   rC   rD   rE   rd   Údur_pred_kernel_sizeÚpitch_pred_kernel_sizeÚenergy_pred_kernel_sizeÚvariance_predictor_dropoutr   r   r   r   ÷  sž   
!ÿÿÿüüüûûõõûzFastSpeech2.__init__Nç      ð?c              
   C   sˆ  t || jd}|  d¡}	|  |¡}
|  |
¡}t |
|¡|	 }
| d¡ | jd|
j	d ¡ 
ddd¡ ¡ }| j|
||d\}
}|
|	 }
|  |
|	¡ d¡}| ¡ dkrV| d¡}|du rdt tj |¡d¡}d}|  |
|	¡}|| }|dur‰t| d¡|ƒ}|  |¡}| 
ddd¡}n
|  | 
ddd¡¡}| 
ddd¡}|
 |¡}
d}|  |
|	¡}|| }|durÄt| d¡|ƒ}|  |¡}| 
ddd¡}n
|  | 
ddd¡¡}| 
ddd¡}|
 |¡}
t|
|durâ|n||d\}}tt |¡ƒ}| |j¡}|  d¡}	| d¡ | jd|j	d ¡ 
ddd¡ ¡ }|  |¡}t ||¡|	 }| j|||d^}}}|   |¡|	 }|  !|¡| }|||||||t |¡fS )	aõ  forward pass for training and inference

        Arguments
        ---------
        tokens: torch.Tensor
            batch of input tokens
        durations: torch.Tensor
            batch of durations for each token. If it is None, the model will infer on predicted durations
        pitch: torch.Tensor
            batch of pitch for each frame. If it is None, the model will infer on predicted pitches
        energy: torch.Tensor
            batch of energy for each frame. If it is None, the model will infer on predicted energies
        pace: float
            scaling factor for durations
        pitch_rate: float
            scaling factor for pitches
        energy_rate: float
            scaling factor for energies

        Returns
        -------
        mel_post: torch.Tensor
            mel outputs from the decoder
        postnet_output: torch.Tensor
            mel outputs from the postnet
        predict_durations: torch.Tensor
            predicted durations of each token
        predict_pitch: torch.Tensor
            predicted pitches of each token
        avg_pitch: torch.Tensor
            target pitches for each token if input pitch is not None
            None if input pitch is None
        predict_energy: torch.Tensor
            predicted energies of each token
        avg_energy: torch.Tensor
            target energies for each token if input energy is not None
            None if input energy is None
        mel_length:
            predicted lengths of mel spectrograms
        rq   rs   r0   r   rp   rv   N©Úpace)"r   rd   rz   re   rf   ry   r}   r{   rc   r|   Úpermuter€   rš   r•   r   ÚdimÚclampÚspecialÚexpm1r–   Úaverage_over_durationsr˜   r—   r™   Úupsampler   Útensorr   r   r“   r”   r›   r	   rœ   )r   r‚   Ú	durationsÚpitchÚenergyrª   Ú
pitch_rateÚenergy_rater…   r†   r„   r‡   Ú	attn_maskrŠ   Úpredict_durationsÚdur_pred_reverse_logÚ	avg_pitchÚpredict_pitchÚ
avg_energyÚpredict_energyÚ
spec_featsÚmel_lensÚoutput_mel_featsÚmemoryÚmel_postÚpostnet_outputr   r   r   r    l  s   2


ü
ÿÿ
ÿ




ý
ü
ÿøzFastSpeech2.forward©NNNr¨   r¨   r¨   r!   r   r   r   r   rŽ   ‚  s    txørŽ   c                 C   s  t j|dd ¡ }t jj |dd…dd…f d¡}t jj t j| dkddd¡}t jj t j| ddd¡}| ¡ \}}|  d¡}|dd…ddd…f  |||¡}	|dd…ddd…f  |||¡}
t  |d|
¡t  |d|	¡  	¡ }t  |d|
¡t  |d|	¡  	¡ }t  
|dk||| ¡}|S )zðAverage values over durations.

    Arguments
    ---------
    values: torch.Tensor
        shape: [B, 1, T_de]
    durs: torch.Tensor
        shape: [B, T_en]

    Returns
    -------
    avg: torch.Tensor
        shape: [B, 1, T_en]
    r0   ©r¬   Nrs   ©r0   r   rJ   rp   )ry   ÚcumsumÚlongr   Ú
functionalÚpadÚsizeÚexpandÚgatherÚfloatÚwhere)ÚvaluesÚdursÚdurs_cums_endsÚdurs_cums_startsÚvalues_nonzero_cumsÚvalues_cumsÚbsÚlengthÚ
n_formantsÚdcsÚdceÚvalues_sumsÚvalues_nelemsÚavgr   r   r   r°   ÿ  s,    ÿ
  þÿýÿr°   r¨   rJ   c                    sJ   ‡ ‡‡fdd„t tˆ ƒƒD ƒ}dd„ |D ƒ}tjjjj|d|d}||fS )aÐ  upsample encoder output according to durations

    Arguments
    ---------
    feats: torch.Tensor
        batch of input tokens
    durs: torch.Tensor
        durations to be used to upsample
    pace: float
        scaling factor for durations
    padding_value: int
        padding index

    Returns
    -------
    mel_post: torch.Tensor
        mel outputs from the decoder
    predict_durations: torch.Tensor
        predicted durations for each token
    c                    s,   g | ]}t jˆ| ˆˆ |   ¡ d d‘qS )r   rÆ   )ry   Úrepeat_interleaverÉ   )Ú.0rF   ©rÒ   Úfeatsrª   r   r   Ú
<listcomp>=  s    ÿÿzupsample.<locals>.<listcomp>c                 S   s   g | ]}|j d  ‘qS ©r   )r|   )rà   Úmelr   r   r   rã   B  s    T)Úbatch_firstÚpadding_value)r5   rH   ry   r   ÚutilsÚrnnÚpad_sequence)râ   rÒ   rª   rç   Úupsampled_melsrÀ   Úpadded_upsampled_melsr   rá   r   r±   (  s   
þ
ÿr±   c                   @   ó   e Zd ZdZdd„ ZdS )ÚTextMelCollatezEZero-pads model inputs and targets based on number of frames per stepc           !      C   sZ  t |ƒ}tt|ƒƒD ]
}|| d ||< q
tjt dd„ |D ƒ¡ddd\}}|d }tjt dd„ |D ƒ¡ddd\}}|d }	t t|ƒ|¡}
t t|ƒ|	¡}t t|ƒ|	¡}t t|ƒ|¡}t t|ƒ|	¡}|
 ¡  | ¡  | ¡  | ¡  | ¡  tt|ƒƒD ]g}|||  d }|||  d }t |||  d	 ¡}|||  d
 }t |||  d ¡}||
|d| d¡…f< |||d| d¡…f< |||d| d¡…f< |||d| d¡…f< |||d| d¡…f< qƒ|d d  d¡}t	dd„ |D ƒƒ}t t|ƒ||¡}| ¡  t t|ƒ|¡}| ¡  t t|ƒ|¡}| ¡  t t|ƒ¡}g g }}tt|ƒƒD ]V}|| }|| d }|| d }|| d }|||dd…d| d
¡…f< |||d| d¡…f< |||d| d¡…f< | d
¡||< | 
|| d ¡ | 
|| d ¡ q4dd„ |D ƒ} t | ¡} | ddd
¡}|
||||||| |||||fS )a•  Collate's training batch from normalized text and mel-spectrogram

        Arguments
        ---------
        batch: list
            [text_normalized, mel_normalized]

        Returns
        -------
        text_padded: torch.Tensor
        dur_padded: torch.Tensor
        input_lengths: torch.Tensor
        mel_padded: torch.Tensor
        pitch_padded: torch.Tensor
        energy_padded: torch.Tensor
        output_lengths: torch.Tensor
        len_x: torch.Tensor
        labels: torch.Tensor
        wavs: torch.Tensor
        no_spn_seq_padded: torch.Tensor
        spn_labels_padded: torch.Tensor
        last_phonemes_padded: torch.Tensor
        Úmel_text_pairc                 S   ó   g | ]}t |d  ƒ‘qS rä   ©rH   ©rà   r   r   r   r   rã   o  ó    z+TextMelCollate.__call__.<locals>.<listcomp>r   T©r¬   Ú
descendingc                 S   rð   )éþÿÿÿrñ   rò   r   r   r   rã   u  ró   rö   éýÿÿÿr0   rs   Nrp   c                 S   s   g | ]	}|d    d¡‘qS )rp   r0   ©rÌ   rò   r   r   r   rã   —  ó    é   é   ÚlabelÚwavc                 S   s   g | ]}|d  ‘qS )r)   r   rò   r   r   r   rã   ®  s    )Úlistr5   rH   ry   ÚsortÚ
LongTensorÚFloatTensorÚzero_rÌ   Úmaxr6   ÚTensorr«   )!r   ÚbatchÚ	raw_batchrF   Úinput_lengthsÚids_sorted_decreasingÚmax_input_lenÚno_spn_seq_lengthsÚno_spn_ids_sorted_decreasingÚmax_no_spn_seq_lenÚtext_paddedÚno_spn_seq_paddedÚlast_phonemes_paddedÚ
dur_paddedÚspn_labels_paddedÚtextÚ
no_spn_seqrƒ   ÚdurÚ
spn_labelsÚnum_melsÚmax_target_lenÚ
mel_paddedÚpitch_paddedÚenergy_paddedÚoutput_lengthsÚlabelsÚwavsÚidxrå   r´   rµ   Úlen_xr   r   r   Ú__call__N  s˜   ÿ
ÿ
ýÿ

ózTextMelCollate.__call__N©r"   r#   r$   r%   r   r   r   r   r   rî   J  s    rî   c                       s.   e Zd ZdZ		d‡ fdd„	Zdd„ Z‡  ZS )	ÚLossas  Loss Computation

    Arguments
    ---------
    log_scale_durations: bool
        applies logarithm to target durations
    ssim_loss_weight: float
        weight for ssim loss
    duration_loss_weight: float
        weight for the duration loss
    pitch_loss_weight: float
        weight for the pitch loss
    energy_loss_weight: float
        weight for the energy loss
    mel_loss_weight: float
        weight for the mel loss
    postnet_mel_loss_weight: float
        weight for the postnet mel loss
    spn_loss_weight: float
        weight for spn loss
    spn_loss_max_epochs: int
        Max number of epochs
    r¨   é   c
           
         s~   t ƒ  ¡  tƒ | _t ¡ | _t ¡ | _t ¡ | _t ¡ | _	t ¡ | _
|| _|| _|| _|| _|| _|| _|| _|| _|	| _d S ©N)r   r   ÚSSIMLossÚ	ssim_lossr   ÚMSELossÚmel_lossÚpostnet_mel_lossÚdur_lossÚ
pitch_lossÚenergy_lossÚlog_scale_durationsÚssim_loss_weightÚmel_loss_weightÚpostnet_mel_loss_weightÚduration_loss_weightÚpitch_loss_weightÚenergy_loss_weightÚspn_loss_weightÚspn_loss_max_epochs)
r   r-  r.  r1  r2  r3  r/  r0  r4  r5  r   r   r   r   Ü  s    






zLoss.__init__c              
   C   sÐ  |\}}}}}}	}
t |jƒdksJ ‚|\	}}}}}}}}}| d¡}| d¡}| d¡}| d¡}| d¡}| jr@t | ¡ ¡}t|jd ƒD ]}|dkrÕ|  ||d|| …dd…f ||d|| …dd…f ¡}|  	||d|| …dd…f ||d|| …dd…f ¡}|  
||d|	| …f ||d|	| …f  tj¡¡}|  ||d|| …f ||d|| …f  tj¡¡}|  ||d|| …f ||d|| …f  tj¡¡}qG||  ||d|| …dd…f ||d|| …dd…f ¡ }||  	||d|| …dd…f ||d|| …dd…f ¡ }||  
||d|	| …f ||d|	| …f  tj¡¡ }||  ||d|| …f ||d|| …f  tj¡¡ }||  ||d|| …f ||d|| …f  tj¡¡ }qG|  |||¡}t |t |ƒ¡}t |t |ƒ¡}t |t |ƒ¡}t |t |ƒ¡}t |t |ƒ¡}t||
ƒ}|| jkr£d| _|| j || j  || j  || j  || j  || j  || j  }||| j || j || j || j || j || j || j dœ}|S )as  Computes the value of the loss function and updates stats

        Arguments
        ---------
        predictions: tuple
            model predictions
        targets: tuple
            ground truth data
        current_epoch: int
            The count of the current epoch.

        Returns
        -------
        loss: torch.Tensor
            the loss value
        rú   rs   r   N)Ú
total_lossr&  r(  r)  r*  r+  r,  Úspn_loss)rH   r|   r   r-  ry   Úlog1prÏ   r5   r(  r)  r*  r   Úfloat32r+  r,  r&  Údivr   r5  r4  r.  r/  r0  r1  r2  r3  )r   ÚpredictionsÚtargetsÚcurrent_epochÚ
mel_targetÚtarget_durationsÚtarget_pitchÚtarget_energyÚ
mel_lengthÚphon_lenr  Úmel_outÚpostnet_mel_outÚlog_durationsÚpredicted_pitchÚaverage_pitchÚpredicted_energyÚaverage_energyrÀ   Ú	spn_predsÚlog_target_durationsrF   r(  r)  r*  r+  r,  r&  r7  r6  Úlossr   r   r   r    ú  sÎ   øö




þþþþþþþþþþ
ÿþýüûúÿø
zLoss.forward)r¨   r#  r!   r   r   r   r   r"  Ã  s    !ör"  c              	   C   sÊ   ddl m} |j|||||d |j¡}|j| |d d ||||
|d |j¡}||ƒ}||ƒ}| ¡ dks8J ‚|jd |ksAJ ‚tj	|dd}|	r[|t 
|¡ t |¡t 
|¡  }|rat|ƒ}||fS )aë  calculates MelSpectrogram for a raw audio signal

    Arguments
    ---------
    sample_rate : int
        Sample rate of audio signal.
    hop_length : int
        Length of hop between STFT windows.
    win_length : int
        Window size.
    n_fft : int
        Size of FFT.
    n_mels : int
        Number of mel filterbanks.
    f_min : float
        Minimum frequency.
    f_max : float
        Maximum frequency.
    power : float
        Exponent for the magnitude spectrogram.
    normalized : bool
        Whether to normalize by magnitude after stft.
    min_max_energy_norm : bool
        Whether to normalize by min-max
    norm : str or None
        If "slaney", divide the triangular mel weights by the width of the mel band
    mel_scale : str
        Scale to use: "htk" or "slaney".
    compression : bool
        whether to do dynamic range compression
    audio : torch.Tensor
        input audio signal

    Returns
    -------
    mel : torch.Tensor
    rmse : torch.Tensor
    r   )Ú
transforms)Ú
hop_lengthÚ
win_lengthÚn_fftÚpowerÚ
normalizedrp   r0   )Úsample_rateÚn_stftr£   Úf_minÚf_maxÚnormÚ	mel_scalerÆ   )Ú
torchaudiorN  ÚSpectrogramr   r   ÚMelScaler¬   r|   ry   rX  Úminr  Údynamic_range_compression)rT  rO  rP  rQ  r£   rV  rW  rR  rS  Úmin_max_energy_normrX  rY  ÚcompressionÚaudiorN  Úaudio_to_melÚspecrå   Úrmser   r   r   Úmel_spectogramy  s>   6ûú
ùø	"re  r0   çñhãˆµøä>c                 C   s   t  t j| |d| ¡S )z+Dynamic range compression for audio signals©r]  )ry   Úlogr­   )r   ÚCÚclip_valr   r   r   r^  Ñ  s   r^  c                       sH   e Zd ZdZ‡ fdd„Zddd„Zdejdejfd	d
„Zdd„ Z	‡  Z
S )r%  zkSSIM loss as (1 - SSIM)
    SSIM is explained here https://en.wikipedia.org/wiki/Structural_similarity
    c                    s   t ƒ  ¡  tƒ | _d S r$  )r   r   Ú	_SSIMLossÚ	loss_func©r   r   r   r   r   Û  s   
zSSIMLoss.__init__Nc                 C   s>   |du r	|j  ¡ }tj||j|jd}| d¡| d¡k }|S )a:  Create a sequence mask for filtering padding in a sequence tensor.

        Arguments
        ---------
        sequence_length: torch.Tensor
            Sequence lengths.
        max_len: int
            Maximum sequence length. Defaults to None.

        Returns
        -------
        mask: [B, T_max]
        N)rG   r   r   r0   )Údatar  ry   ÚarangerG   r   rz   )r   Úsequence_lengthÚmax_lenÚ	seq_rangeÚmaskr   r   r   Úsequence_maskà  s   

ÿzSSIMLoss.sequence_maskr   rs  c                 C   sH   t j| | d¡ddd}t j| | d¡ddd}|| || d  S )a  Min-Max normalize tensor through first dimension

        Arguments
        ---------
        x: torch.Tensor
            input tensor [B, D1, D2]
        mask: torch.Tensor
            input mask [B, D1, 1]

        Returns
        -------
        Normalized tensor
        r   )r0   rp   T)r¬   ÚkeepdimgêŒ 9Y>)Fç:Œ0âŽyE>)ry   ÚamaxÚmasked_fillÚamin)r   r   rs  ÚmaximumÚminimumr   r   r   Úsample_wise_min_max÷  s
   ÿzSSIMLoss.sample_wise_min_maxc                 C   s¶   | j || d¡d d¡}|  ||¡}|  ||¡}|  ||  d¡||  d¡¡}| ¡ dkrAtd| ¡ › dƒ tjd|j	d}| ¡ dk rYtd| ¡ › d	ƒ tjd|j	d}|S )
at  
        Arguments
        ---------
        y_hat: torch.Tensor
            model prediction values [B, T, D].
        y: torch.Tensor
            target values [B, T, D].
        length: torch.Tensor
            length of each sample in a batch for masking.

        Returns
        -------
        loss: Average loss value in range [0, 1] masked by the length.
        r0   )rp  rq  rp   r¨   z > SSIM loss is out-of-range z, setting it 1.0rt   rJ   z, setting it 0.0)
rt  rÌ   rz   r|  rl  ÚitemÚprintry   r²   r   )r   Úy_hatÚyrØ   rs  Úy_normÚ
y_hat_normr&  r   r   r   r      s*   
ÿþÿÿÿzSSIMLoss.forwardr$  )r"   r#   r$   r%   r   rt  ry   r  r|  r    r&   r   r   r   r   r%  Ö  s    
r%  c                       s   e Zd ZdZg d¢Z								d‡ fd
d„	Zddd„Z			d dd„Zdd„ Zd!dd„Z	d!dd„Z
									d"dd„Zdd„ Z‡  ZS )#rk  aE  Creates a criterion that measures the structural similarity index error between
    each element in the input x and target y.
    Equation link: https://en.wikipedia.org/wiki/Structural_similarity
    x and y are tensors of arbitrary shapes with a total of n elements each.
    The sum operation still operates over all the elements, and divides by n.
    The division by n can be avoided if one sets reduction = sum.
    In case of 5D input tensors, complex value is returned as a tensor of size 2.

    Arguments
    ---------
    kernel_size: int
        By default, the mean and covariance of a pixel is obtained
        by convolution with given filter_size.
    kernel_sigma: float
        Standard deviation for Gaussian kernel.
    k1: float
        Coefficient related to c1 (see equation in the link above).
    k2: float
        Coefficient related to c2 (see equation in the link above).
    downsample: bool
        Perform average pool before SSIM computation (Default: True).
    reduction: str
        Specifies the reduction type
    data_range: Union[int, float]
        Maximum value range of images (usually 1.0 or 255).

    Example
    -------
    >>> loss = _SSIMLoss()
    >>> x = torch.rand(3, 3, 256, 256, requires_grad=True)
    >>> y = torch.rand(3, 3, 256, 256)
    >>> output = loss(x, y)
    >>> output.backward()
    )r.   Úk1Úk2ÚsigmaÚkernelÚ	reductioné   ç      ø?ç{®Gáz„?ç¸…ëQ¸ž?TÚmeanr¨   c                    sT   t ƒ  ¡  || _|| _|d dksJ d|› dƒ‚|| _|| _|| _|| _|| _d S )Nrp   r0   úKernel size must be odd, got [ú])	r   r   r‡  r.   Úkernel_sigmarƒ  r„  Ú
downsampleÚ
data_range)r   r.   r  rƒ  r„  r  r‡  r‘  r   r   r   r   Y  s   


ÿ
z_SSIMLoss.__init__c                 C   s<   |dkr|S |dkr|j ddS |dkr|jddS tdƒ‚)a?  Reduce input in batch dimension if needed.

        Arguments
        ---------
        x: torch.Tensor
            Tensor with shape (B, *).
        reduction: str
            Specifies the reduction type:
            none | mean | sum (Default: mean)

        Returns
        -------
        Reduced outputs.
        ÚnonerŒ  r   rÆ   Úsumz:Unknown reduction. Expected one of {'none', 'mean', 'sum'})rŒ  r“  Ú
ValueError)r   r   r‡  r   r   r   Ú_reducev  s   ÿz_SSIMLoss._reduce©r   rs   ©rJ   g      ð¿Nc              	   C   sØ  	 |d }|D ]â}t  |¡sJ dt|ƒ› ƒ‚|j|jks)J d|j› d|j› ƒ‚|du rD| ¡ | ¡ ksCJ d| ¡ › d| ¡ › ƒ‚n&| ¡ |d |d	 … | ¡ |d |d	 … ksjJ d
| ¡ › d| ¡ › ƒ‚|d |d	 kr‰| ¡ |d ksˆJ d|d › d| ¡ › ƒ‚n,|d |d	 k rµ|d | ¡   kr¡|d	 ksµn J d|d › d|d	 › d| ¡ › ƒ‚|d |d	 k ré|d | ¡ ksÓJ d|d › d| ¡ › ƒ‚| ¡ |d	 kséJ d|d	 › d| ¡ › ƒ‚qdS )aø  Check if the input satisfies the requirements

        Arguments
        ---------
        tensors: torch.Tensor
            torch.Tensors to check
        dim_range: Tuple[int, int]
            Allowed number of dimensions. (min, max)
        data_range: Tuple[float, float]
            Allowed range of values in tensors. (min, max)
        size_range: Tuple[int, int]
            Dimensions to include in size comparison. (start_dim, end_dim + 1)

        Returns
        -------
        None
        FNr   zExpected torch.Tensor, got zExpected tensors to be on z, got z%Expected tensors with same size, got z and r0   z9Expected tensors with same size at given dimensions, got z$Expected number of dimensions to be z,Expected number of dimensions to be between z*Expected values to be greater or equal to z(Expected values to be lower or equal to )ry   Ú	is_tensorÚtyper   rÌ   r¬   r]  r  )r   ÚtensorsÚ	dim_ranger‘  Ú
size_ranger   Útr   r   r   Ú_validate_input  sB   ÿÿÿþÿ$ ÿÿÿ€âz_SSIMLoss._validate_inputc                 C   sd   t j|t jd}||d d 8 }|d }| d¡| d¡  d|d    ¡ }|| ¡  }| d¡S )a7  Returns 2D Gaussian kernel N(0,sigma^2)

        Arguments
        ---------
        kernel_size: int
            Size of the kernel
        sigma: float
            Std of the distribution

        Returns
        -------
        gaussian_kernel: torch.Tensor
            [1, kernel_size, kernel_size]
        ©rG   r0   ç       @rp   r   )ry   ro  r9  rz   Úexpr“  )r   r.   r…  ÚcoordsÚgr   r   r   Úgaussian_filterÎ  s   &
z_SSIMLoss.gaussian_filterc                 C   sF  |  d¡|  d¡k s|  d¡|  d¡k r"td|  ¡ › d|  ¡ › ƒ‚|d }|d }|  d¡}tj||dd|d}	tj||dd|d}
|	d }|
d }|	|
 }tj|d |dd|d| }tj|d |dd|d| }tj|| |dd|d| }d	| | || |  }d	| | || |  | }|jd
d}|jd
d}||fS )a  Calculate Structural Similarity (SSIM) index for X and Y per channel.

        Arguments
        ---------
        x: torch.Tensor
            An input tensor (N, C, H, W).
        y: torch.Tensor
            A target tensor (N, C, H, W).
        kernel: torch.Tensor
            2D Gaussian kernel.
        k1: float
            Algorithm parameter (see equation in the link above).
        k2: float
            Algorithm parameter (see equation in the link above).
            Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results.

        Returns
        -------
        Full Value of Structural Similarity (SSIM) index.
        rs   rö   úAKernel size can't be greater than actual input size. Input size: ú. Kernel size: rp   r0   r   ©ÚweightÚstrider/   Úgroupsr   )rs   rö   rÆ   )rÌ   r”  ÚFÚconv2drŒ  )r   r   r€  r†  rƒ  r„  Úc1Úc2Ú
n_channelsÚmu_xÚmu_yÚmu_xxÚmu_yyÚmu_xyÚsigma_xxÚsigma_yyÚsigma_xyÚcsÚssÚssim_valr   r   r   Ú_ssim_per_channelæ  sR   (ÿÿ

ÿ
ÿÿýÿÿýÿÿýÿz_SSIMLoss._ssim_per_channelc           $      C   sf  |  d¡}|  d¡|  d¡k s|  d¡|  d¡k r'td|  ¡ › d|  ¡ › ƒ‚|d }|d }|d }	|d	 }
|d }|d	 }tj|	|dd
|d}tj|
|dd
|d}tj||dd
|d}tj||dd
|d}| d¡| d¡ }| d¡| d¡ }|| ||  }|| ||  }d}|	 d¡|
 d¡ }| d¡| d¡ }|	| |
|  }|	| |
|  }tj||dd
|d| }tj||dd
|d| }tj||dd
|d| }tj||dd
|d| }tj||fdd}tj||fdd}|d ||  | d¡| d¡ ||   } |d ||  | d¡| d¡ ||   }!|!|  }!|!jdd}"| jdd}#|"|#fS )a¡  Calculate Structural Similarity (SSIM) index for Complex X and Y per channel.

        Arguments
        ---------
        x: torch.Tensor
            An input tensor (N, C, H, W, 2).
        y: torch.Tensor
            A target tensor (N, C, H, W, 2).
        kernel: torch.Tensor
            2-D gauss kernel.
        k1: float
            Algorithm parameter (see equation in the link above).
        k2: float
            Algorithm parameter (see equation in the link above).
            Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results.

        Returns
        -------
        Full Value of Complex Structural Similarity (SSIM) index.
        r0   rö   rs   r÷   r¥  r¦  rp   ).r   ).r0   r   r§  r¨   rÆ   )rö   r÷   )	rÌ   r”  r«  r¬  Úpowry   Ústackrz   rŒ  )$r   r   r€  r†  rƒ  r„  r¯  r­  r®  Úx_realÚx_imagÚy_realÚy_imagÚmu1_realÚmu1_imagÚmu2_realÚmu2_imagÚmu1_sqÚmu2_sqÚmu1_mu2_realÚmu1_mu2_imagÚcompensationÚx_sqÚy_sqÚx_y_realÚx_y_imagÚ	sigma1_sqÚ	sigma2_sqÚsigma12_realÚsigma12_imagÚsigma12Úmu1_mu2Úcs_mapÚssim_maprº  r¸  r   r   r   Ú_ssim_per_channel_complex,  s”   
(ÿÿ
ÿ
ÿ
ÿ
ÿ
ÿýÿ
ÿýÿ
ÿýÿ
ÿýÿÿþÿÿz#_SSIMLoss._ssim_per_channel_complexFc                 C   s$  |d dksJ d|› dƒ‚| j ||gdd|fd |t|ƒ }|t|ƒ }tdtt| ¡ dd	… ƒd
 ƒƒ}|dkrK|rKtj||d}tj||d}|  ||¡ 	| d¡ddd¡ 
|¡}| ¡ dkrf| jn| j}|||||	|
d\}}| d¡}| d¡}|  ||¡}|  ||¡}|r||gS |S )a^  Interface of Structural Similarity (SSIM) index.
        Inputs supposed to be in range [0, data_range].
        To match performance with skimage and tensorflow set downsample = True.

        Arguments
        ---------
        x: torch.Tensor
            An input tensor (N, C, H, W) or (N, C, H, W, 2).
        y: torch.Tensor
            A target tensor (N, C, H, W) or (N, C, H, W, 2).
        kernel_size: int
            The side-length of the sliding window used in comparison. Must be an odd value.
        kernel_sigma: float
            Sigma of normal distribution.
        data_range: Union[int, float]
            Maximum value range of images (usually 1.0 or 255).
        reduction: str
            Specifies the reduction type:
            none | mean | sum. Default:mean
        full: bool
            Return cs map or not.
        downsample: bool
            Perform average pool before SSIM computation. Default: True
        k1: float
            Algorithm parameter (see equation in the link above).
        k2: float
            Algorithm parameter (see equation in the link above).
            Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results.

        Returns
        -------
        Value of Structural Similarity (SSIM) index. In case of 5D input tensors, complex value is returned
        as a tensor of size 2.
        rp   r0   r  rŽ  )rû   r)   r   )r›  r‘  rö   Né   )r.   r)   )r   r€  r†  rƒ  r„  )rž  rÏ   r  Úroundr]  rÌ   r«  Ú
avg_pool2dr¤  r{   r   r¬   r×  r»  rŒ  r•  )r   r   r€  r.   r  r‘  r‡  Úfullr  rƒ  r„  Úfr†  Ú_compute_ssim_per_channelrÖ  rÕ  rº  r¸  r   r   r   Ússim“  s<   0
ÿÿ"
ýÿý

ÿ

z_SSIMLoss.ssimc                 C   s:   | j ||| j| j| j| j| jd| j| jd
}t 	|¡| S )aÞ  Computation of Structural Similarity (SSIM) index as a loss function.

        Arguments
        ---------
        x: torch.Tensor
            An input tensor (N, C, H, W) or (N, C, H, W, 2).
        y: torch.Tensor
            A target tensor (N, C, H, W) or (N, C, H, W, 2).

        Returns
        -------
        Value of SSIM loss to be minimized, i.e 1 - ssim in [0, 1] range. In case of 5D input tensors,
        complex value is returned as a tensor of size 2.
        F)
r   r€  r.   r  r  r‘  r‡  rÛ  rƒ  r„  )
rÞ  r.   r  r  r‘  r‡  rƒ  r„  ry   Ú	ones_like)r   r   r€  Úscorer   r   r   r    ê  s   öz_SSIMLoss.forward)rˆ  r‰  rŠ  r‹  TrŒ  r¨   )rŒ  )r–  r—  N)rŠ  r‹  )rˆ  r‰  r¨   rŒ  FTrŠ  r‹  )r"   r#   r$   r%   Ú__constants__r   r•  rž  r¤  r»  r×  rÞ  r    r&   r   r   r   r   rk  3  s:    #ø

û?

Fk
õWrk  c                   @   rí   )ÚTextMelCollateWithAlignmenta`  Zero-pads model inputs and targets based on number of frames per step
    result: tuple
        a tuple of tensors to be used as inputs/targets
        (
            text_padded,
            dur_padded,
            input_lengths,
            mel_padded,
            output_lengths,
            len_x,
            labels,
            wavs
        )
    c              	   C   s  t |ƒ}tt|ƒƒD ]
}|| d ||< q
tjt dd„ |D ƒ¡ddd\}}|d }t t|ƒ|¡}| ¡  tt|ƒƒD ]}|||  d }|||d| d¡…f< q=|d d  d¡}	td	d„ |D ƒƒ}
t 	t|ƒ|	|
¡}| ¡  t 	t|ƒ|
¡}| ¡  t 	t|ƒ|
¡}| ¡  t t|ƒ¡}g g }}tt|ƒƒD ]U}|| }|| d }|| d
 }|| d }|||dd…d| d¡…f< |||d| d¡…f< |||d| d¡…f< | d¡||< | 
|| d ¡ | 
|| d ¡ qœ| dd
d¡}||||||||fS )aà  Collate's training batch from normalized text and mel-spectrogram

        Arguments
        ---------
        batch: list
            [text_normalized, mel_normalized]

        Returns
        -------
        phoneme_padded: torch.Tensor
        input_lengths: torch.Tensor
        mel_padded: torch.Tensor
        pitch_padded: torch.Tensor
        energy_padded: torch.Tensor
        output_lengths: torch.Tensor
        labels: torch.Tensor
        wavs: torch.Tensor
        rï   c                 S   rð   rä   rñ   rò   r   r   r   rã   6  ró   z8TextMelCollateWithAlignment.__call__.<locals>.<listcomp>r   Trô   Nr0   c                 S   s   g | ]	}|d    d ¡‘qS )r0   rø   rò   r   r   r   rã   D  rù   rp   rú   rü   rý   )rþ   r5   rH   ry   rÿ   r   r  rÌ   r  r  r6   r«   )r   r  r  rF   r  r  r	  Úphoneme_paddedÚphonemer  r  r  r  r  r  r  r  r  rå   r´   rµ   r   r   r   r     sZ   ÿ
ÿ
øz$TextMelCollateWithAlignment.__call__Nr!  r   r   r   r   râ  	  s    râ  c              	   C   sÖ  t j }| | } | j}| j}|  ¡  ¡  ¡ } | ¡  ¡  ¡  t j¡}| j	\}}}t j
| j	t jd}t j
||ft jd}	t j|t jd dd¡}
t|ƒD ]G}t j|	ddgddggd|ddd…dd…f }|	}||k}t  |||¡}||dd…dd…|f< |
|k}t  ||| dd…dd…|f  |¡}	qKt  ||d¡}t j
| j	t jd}|dd…dd…df  d¡ t j¡d }t  |¡}tt|ƒƒD ]}d||||f< |||||f  d }qÂ|| t j¡ }t |¡j||d}|S )	a<  
    Monotonic alignment search algorithm, numpy works faster than the torch implementation.

    Arguments
    ---------
    value: torch.Tensor
        input alignment values [b, t_x, t_y]
    mask: torch.Tensor
        input alignment mask [b, t_x, t_y]

    Returns
    -------
    path: torch.Tensor

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import maximum_path_numpy
    >>> alignment = torch.rand(2, 5, 100)
    >>> mask = torch.ones(2, 5, 100)
    >>> hard_alignments = maximum_path_numpy(alignment, mask)
    rŸ  r0   rs   r   Úconstant)ÚmodeÚconstant_valuesN)r   rG   )ÚnpÚinfr   rG   ÚcpuÚdetachÚnumpyÚastypeÚbool_r|   ÚzerosÚint64r9  ro  Úreshaper5   rË   rÐ   r“  Úreversedry   Ú
from_numpyr   )Úvaluers  Úmax_neg_valr   rG   ÚbÚt_xÚt_yÚ	directionÚvÚx_rangeÚjÚv0Úv1Úmax_maskÚv_maxÚ
index_maskÚpathÚindexÚindex_ranger   r   r   Úmaximum_path_numpyh  s@   ÿþ&(
r  c                       s2   e Zd ZdZ				d	‡ fdd„	Zdd„ Z‡  ZS )
ÚAlignmentNetworkaÅ  Learns the alignment between the input text
    and the spectrogram with Gaussian Attention.

    query -> conv1d -> relu -> conv1d -> relu -> conv1d -> L2_dist -> softmax -> alignment
    key   -> conv1d -> relu -> conv1d - - - - - - - - - - - -^

    Arguments
    ---------
    in_query_channels: int
        Number of channels in the query network. Defaults to 80.
    in_key_channels: int
        Number of channels in the key network. Defaults to 512.
    attn_channels: int
        Number of inner channels in the attention layers. Defaults to 80.
    temperature: float
        Temperature for the softmax. Defaults to 0.0005.

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import AlignmentNetwork
    >>> aligner = AlignmentNetwork(
    ...     in_query_channels=80,
    ...     in_key_channels=512,
    ...     attn_channels=80,
    ...     temperature=0.0005,
    ... )
    >>> phoneme_feats = torch.rand(2, 512, 20)
    >>> mels = torch.rand(2, 80, 100)
    >>> alignment_soft, alignment_logprob = aligner(mels, phoneme_feats, None, None)
    >>> alignment_soft.shape, alignment_logprob.shape
    (torch.Size([2, 1, 100, 20]), torch.Size([2, 1, 100, 20]))
    r(   r   çü©ñÒMb@?c                    sÔ   t ƒ  ¡  || _tjjdd| _tjjdd| _t 	t
j||d dddddtj ¡ t
j|d |ddddd¡| _t 	t
j||d dddddtj ¡ t
j|d |dddddtj ¡ t
j||ddddd¡| _d S )Nrú   rÆ   rp   r+   T)r-   r   r.   r/   Úbiasr‘   r0   )r   r   Útemperaturery   r   ÚSoftmaxÚsoftmaxÚ
LogSoftmaxÚlog_softmaxÚ
Sequentialr   r1   rQ   Ú	key_layerÚquery_layer)r   Úin_query_channelsÚin_key_channelsÚattn_channelsr	  r   r   r   r   Æ  sf   
úúöúúú
ízAlignmentNetwork.__init__c           
      C   sÆ   |   |¡}|  |¡}|dd…dd…dd…df |dd…dd…df  d }| j |jddd }|durF|  |¡t |dd…df d ¡ }|durZ|j | 	¡  
d¡ tdƒ ¡ |  |¡}	|	|fS )a9  Forward pass of the aligner encoder.

        Arguments
        ---------
        queries: torch.Tensor
            the query tensor [B, C, T_de]
        keys: torch.Tensor
            the query tensor [B, C_emb, T_en]
        mask: torch.Tensor
            the query mask[B, T_de]
        attn_prior: torch.Tensor
            the prior attention tensor [B, 1, T_en, T_de]

        Returns
        -------
        attn: torch.Tensor
            soft attention [B, 1, T_en, T_de]
        attn_logp: torch.Tensor
            log probabilities [B, 1, T_en , T_de]
        Nrp   r0   T)ru  rv  ré  )r  r  r	  r“  r  ry   rh  rn  Úmasked_fill_r€   rz   rÏ   r  )
r   ÚqueriesÚkeysrs  Ú
attn_priorÚkey_outÚ	query_outÚattn_factorÚ	attn_logpÚattnr   r   r   r      s   

6ÿÿ
zAlignmentNetwork.forward)r(   r   r(   r  r!   r   r   r   r   r  £  s    $û=r  c                       s>   e Zd ZdZ‡ fdd„Zdd„ Z						d
dd	„Z‡  ZS )ÚFastSpeech2WithAlignmentaß  The FastSpeech2 text-to-speech model with internal alignment.
    This class is the main entry point for the model, which is responsible
    for instantiating all submodules, which, in turn, manage the individual
    neural network layers. Certain parts are adopted from the following implementation:
    https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/models/forward_tts.py

    Simplified STRUCTURE:
    input -> token embedding -> encoder -> aligner -> duration/pitch/energy -> upsampler -> decoder -> output

    Arguments
    ---------
    enc_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in encoder
    enc_num_head: int
        number of multi-head-attention (MHA) heads in encoder transformer layers
    enc_d_model: int
        the number of expected features in the encoder
    enc_ffn_dim: int
        the dimension of the feedforward network model
    enc_k_dim: int
        the dimension of the key
    enc_v_dim: int
        the dimension of the value
    enc_dropout: float
        Dropout for the encoder
    in_query_channels: int
        Number of channels in the query network.
    in_key_channels: int
        Number of channels in the key network.
    attn_channels: int
        Number of inner channels in the attention layers.
    temperature: float
        Temperature for the softmax.
    dec_num_layers: int
        number of transformer layers (TransformerEncoderLayer) in decoder
    dec_num_head: int
        number of multi-head-attention (MHA) heads in decoder transformer layers
    dec_d_model: int
        the number of expected features in the decoder
    dec_ffn_dim: int
        the dimension of the feedforward network model
    dec_k_dim: int
        the dimension of the key
    dec_v_dim: int
        the dimension of the value
    dec_dropout: float
        dropout for the decoder
    normalize_before: bool
        whether normalization should be applied before or after MHA or FFN in Transformer layers.
    ffn_type: str
        whether to use convolutional layers instead of feed forward network inside transformer layer.
    ffn_cnn_kernel_size_list: list of int
        conv kernel size of 2 1d-convs if ffn_type is 1dcnn
    n_char: int
        the number of symbols for the token embedding
    n_mels: int
        number of bins in mel spectrogram
    postnet_embedding_dim: int
        output feature dimension for convolution layers
    postnet_kernel_size: int
        postnet convolution kernel size
    postnet_n_convolutions: int
        number of convolution layers
    postnet_dropout: float
        dropout probability for postnet
    padding_idx: int
        the index for padding
    dur_pred_kernel_size: int
        the convolution kernel size in duration predictor
    pitch_pred_kernel_size: int
        kernel size for pitch prediction.
    energy_pred_kernel_size: int
        kernel size for energy prediction.
    variance_predictor_dropout: float
        dropout probability for variance predictor (duration/pitch/energy)

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import FastSpeech2WithAlignment
    >>> model = FastSpeech2WithAlignment(
    ...    enc_num_layers=6,
    ...    enc_num_head=2,
    ...    enc_d_model=384,
    ...    enc_ffn_dim=1536,
    ...    enc_k_dim=384,
    ...    enc_v_dim=384,
    ...    enc_dropout=0.1,
    ...    in_query_channels=80,
    ...    in_key_channels=384,
    ...    attn_channels=80,
    ...    temperature=0.0005,
    ...    dec_num_layers=6,
    ...    dec_num_head=2,
    ...    dec_d_model=384,
    ...    dec_ffn_dim=1536,
    ...    dec_k_dim=384,
    ...    dec_v_dim=384,
    ...    dec_dropout=0.1,
    ...    normalize_before=False,
    ...    ffn_type='1dcnn',
    ...    ffn_cnn_kernel_size_list=[9, 1],
    ...    n_char=40,
    ...    n_mels=80,
    ...    postnet_embedding_dim=512,
    ...    postnet_kernel_size=5,
    ...    postnet_n_convolutions=5,
    ...    postnet_dropout=0.5,
    ...    padding_idx=0,
    ...    dur_pred_kernel_size=3,
    ...    pitch_pred_kernel_size=3,
    ...    energy_pred_kernel_size=3,
    ...    variance_predictor_dropout=0.5)
    >>> inputs = torch.tensor([
    ...     [13, 12, 31, 14, 19],
    ...     [31, 16, 30, 31, 0],
    ... ])
    >>> mels = torch.rand(2, 100, 80)
    >>> mel_post, postnet_output, durations, predict_pitch, avg_pitch, predict_energy, avg_energy, mel_lens, alignment_durations, alignment_soft, alignment_logprob, alignment_mas = model(inputs, mels)
    >>> mel_post.shape, durations.shape
    (torch.Size([2, 100, 80]), torch.Size([2, 5]))
    >>> predict_pitch.shape, predict_energy.shape
    (torch.Size([2, 5, 1]), torch.Size([2, 5, 1]))
    >>> alignment_soft.shape, alignment_mas.shape
    (torch.Size([2, 100, 5]), torch.Size([2, 100, 5]))
    c!           !         s   t ƒ  ¡  || _|| _|| _t|ƒ| _t|ƒ| _t|||d| _	t
|||| d| _t
|||| d| _t
|||| d| _tjd||ddd| _tjd||ddd| _t|||||||tj|||d| _t|||||||tj|||d| _tj||d| _t|||||d	| _t||	|
|d
| _d S )NrW   r   r0   r+   Tr   rX   rK   r’   )r  r  r  r	  )r   r   rc   r“   rd   r   rf   r”   r   re   rI   r•   r–   r—   r   r1   r˜   r™   r   r   rQ   rš   r›   r	   rP   r'   rœ   r  Úaligner)!r   ri   rc   rj   rk   rl   rm   rn   r  r  r  r	  r   r“   rž   rŸ   r    r¡   r¢   r`   ra   rb   ro   r£   rB   rC   rD   rE   rd   r¤   r¥   r¦   r§   r   r   r   r   ¨  sª   
&ÿÿÿüüüûûõõûüz!FastSpeech2WithAlignment.__init__c           
      C   s   t  |d¡t  |d¡ }|  | dd¡| dd¡|d¡\}}t| d¡ dd¡ ¡ | d¡ ¡ ƒ}t  |d¡ ¡ }	| d¡ dd¡}|	|||fS )aê  Aligner forward pass.
        1. Compute a mask to apply to the attention map.
        2. Run the alignment network.
        3. Apply MAS (Monotonic alignment search) to compute the hard alignment map.
        4. Compute the durations from the hard alignment map.

        Arguments
        ---------
        x: torch.Tensor
            Input sequence [B, T_en, C_en].
        y: torch.Tensor
            Output sequence [B, T_de, C_de].
        x_mask: torch.Tensor
            Input sequence mask [B, 1, T_en].
        y_mask: torch.Tensor
            Output sequence mask [B, 1, T_de].

        Returns
        -------
        durations: torch.Tensor
            Durations from the hard alignment map [B, T_en].
        alignment_soft: torch.Tensor
            soft alignment potentials [B, T_en, T_de].
        alignment_logprob: torch.Tensor
            log scale alignment potentials [B, 1, T_de, T_en].
        alignment_mas: torch.Tensor
            hard alignment map [B, T_en, T_de].
        rs   rp   r0   N)	ry   rz   r  Ú	transposer  r   Ú
contiguousr“  Úint)
r   r   r€  rU   Úy_maskr¸   Úalignment_softÚalignment_logprobÚalignment_masr³   r   r   r   Ú_forward_aligner(	  s   ÿþz)FastSpeech2WithAlignment._forward_alignerNr¨   c                  C   sú  t || jd}|  d¡}	|  |¡}
|  |
¡}t |
|¡|	 }
| d¡ | jd|
j	d ¡ 
ddd¡ ¡ }| j|
||d\}
}|
|	 }
d}d}d}d}|dur{t || jd}|  d¡}|  |
||	 dd¡| dd¡¡\}}}}| dd¡}| dd¡}|  |
|	¡ ¡ }| ¡ dkrŽ| d¡}t tj |¡d¡}d}|  |
|	¡}|| }|dur½t| d¡|ƒ}|  |¡}| 
ddd¡}n
|  | 
ddd¡¡}| 
ddd¡}|
 |¡}
d}|  |
|	¡}|| }|durøt| d¡|ƒ}|  |¡}| 
ddd¡}n
|  | 
ddd¡¡}| 
ddd¡}|
 |¡}
t|
|dur|n||d\}}tt |¡ƒ}| |j¡}|  d¡}	| d¡ | jd|j	d ¡ 
ddd¡ ¡ }|   |¡}t ||¡|	 }| j!|||d^}}}|  "|¡|	 }|  #|¡| }|||||||t |¡||||fS )	aÿ  forward pass for training and inference

        Arguments
        ---------
        tokens: torch.Tensor
            batch of input tokens
        mel_spectograms: torch.Tensor
            batch of mel_spectograms (used only for training)
        pitch: torch.Tensor
            batch of pitch for each frame. If it is None, the model will infer on predicted pitches
        energy: torch.Tensor
            batch of energy for each frame. If it is None, the model will infer on predicted energies
        pace: float
            scaling factor for durations
        pitch_rate: float
            scaling factor for pitches
        energy_rate: float
            scaling factor for energies

        Returns
        -------
        mel_post: torch.Tensor
            mel outputs from the decoder
        postnet_output: torch.Tensor
            mel outputs from the postnet
        predict_durations: torch.Tensor
            predicted durations of each token
        predict_pitch: torch.Tensor
            predicted pitches of each token
        avg_pitch: torch.Tensor
            target pitches for each token if input pitch is not None
            None if input pitch is None
        predict_energy: torch.Tensor
            predicted energies of each token
        avg_energy: torch.Tensor
            target energies for each token if input energy is not None
            None if input energy is None
        mel_length:
            predicted lengths of mel spectrograms
        alignment_durations:
            durations from the hard alignment map
        alignment_soft: torch.Tensor
            soft alignment potentials
        alignment_logprob: torch.Tensor
            log scale alignment potentials
        alignment_mas: torch.Tensor
            hard alignment map
        rq   rs   r0   r   rp   rv   Nr©   )$r   rd   rz   re   rf   ry   r}   r{   rc   r|   r«   r€   rš   r&  r  r•   r   r¬   r­   r®   r¯   r–   r°   r˜   r—   r™   r±   r   r²   r   r   r“   r”   r›   r	   rœ   ) r   r‚   Úmel_spectogramsr´   rµ   rª   r¶   r·   r…   r†   r„   r‡   r¸   rŠ   Úalignment_durationsr#  r$  r%  r"  Úy_mask_invertedr¹   Úpredict_durations_reverse_logr»   r¼   r½   r¾   r¿   rÀ   rÁ   rÂ   rÃ   rÄ   r   r   r   r    Q	  sÒ   :


ü
ÿÿ

üûÿþ
ÿ
ÿ


ÿ


ÿ
ù	
ü
ÿôz FastSpeech2WithAlignment.forwardrÅ   )r"   r#   r$   r%   r   r&  r    r&   r   r   r   r   r  (  s     ,ør  c                       ó(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )ÚLossWithAlignmenta  Loss computation including internal aligner

    Arguments
    ---------
    log_scale_durations: bool
       applies logarithm to target durations
    ssim_loss_weight: float
       weight for the ssim loss
    duration_loss_weight: float
       weight for the duration loss
    pitch_loss_weight: float
       weight for the pitch loss
    energy_loss_weight: float
       weight for the energy loss
    mel_loss_weight: float
       weight for the mel loss
    postnet_mel_loss_weight: float
       weight for the postnet mel loss
    aligner_loss_weight: float
       weight for the alignment loss
    binary_alignment_loss_weight: float
       weight for the postnet mel loss
    binary_alignment_loss_warmup_epochs: int
       Number of epochs to gradually increase the impact of binary loss.
    binary_alignment_loss_max_epochs: int
       From this epoch on the impact of binary loss is ignored.
    c                    sš   t ƒ  ¡  tƒ | _t ¡ | _t ¡ | _t ¡ | _t ¡ | _	t ¡ | _
tƒ | _tƒ | _|| _|| _|| _|| _|| _|| _|| _|| _|	| _|
| _|| _d S r$  )r   r   r%  r&  r   r'  r(  r)  r*  r+  r,  ÚForwardSumLossÚaligner_lossÚBinaryAlignmentLossÚbinary_alignment_lossr-  r.  r/  r0  r1  r2  r3  Úaligner_loss_weightÚbinary_alignment_loss_weightÚ#binary_alignment_loss_warmup_epochsÚ binary_alignment_loss_max_epochs)r   r-  r.  r1  r2  r3  r/  r0  r1  r2  r3  r4  r   r   r   r   .
  s*   





ÿ
zLossWithAlignment.__init__c           "      C   s  |\}}}}}t |jƒdksJ ‚|\}	}
}}}}}}}}}}| d¡}| d¡}| d¡}| d¡}| d¡}| jrAt | ¡ ¡}t|jd ƒD ]}|dkrÖ|  |	|d|| …dd…f ||d|| …dd…f ¡}|  	|
|d|| …dd…f ||d|| …dd…f ¡}|  
||d|| …f ||d|| …f  tj¡¡}|  ||d|| …f ||d|| …f  tj¡¡}|  ||d|| …f ||d|| …f  tj¡¡}qH||  |	|d|| …dd…f ||d|| …dd…f ¡ }||  	|
|d|| …dd…f ||d|| …dd…f ¡ }||  
||d|| …f ||d|| …f  tj¡¡ }||  ||d|| …f ||d|| …f  tj¡¡ }||  ||d|| …f ||d|| …f  tj¡¡ }qHd}i }|  |	||¡}|| j |d< t |t |ƒ¡}|| j |d< t |t |ƒ¡}|| j |d< t |t |ƒ¡}|| j |d< t |t |ƒ¡}|| j |d	< t |t |ƒ¡}|| j |d
< |dur×|  |||¡}|| j |d< |dur|dur|| jkrêd} n
t|| j dƒd } |  ||¡}!|!| j |  |d< t| ¡ ƒ}||d< |S )a’  Computes the value of the loss function and updates stats

        Arguments
        ---------
        predictions: tuple
            model predictions
        targets: tuple
            ground truth data
        current_epoch: int
            used to determinate the start/end of the binary alignment loss

        Returns
        -------
        loss: torch.Tensor
            the loss value
        rú   rs   r   Nr&  r(  r)  r*  r+  r,  r.  r¨   r0  r6  ) rH   r|   r   r-  ry   r8  rÏ   r5   r(  r)  r*  r   r9  r+  r,  r&  r.  r:  r/  r0  r1  r2  r3  r.  r1  r4  r]  r3  r0  r2  r“  rÑ   )"r   r;  r<  r=  r>  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rÀ   r(  r#  r$  Úalignment_hardrL  rF   r(  r)  r*  r+  r,  r6  rM  r&  r.  Úbinary_loss_warmup_weightr0  r   r   r   r    T
  sæ   úó




þþþþþþþþþþÿ
ÿÿýûÿ	ÿÿþÿzLossWithAlignment.forwardr!   r   r   r   r   r,  
  s    &r,  c                       r   )r-  a¯  CTC alignment loss

    Arguments
    ---------
    blank_logprob: pad value

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import ForwardSumLoss
    >>> loss_func = ForwardSumLoss()
    >>> attn_logprob = torch.rand(2, 1, 100, 5)
    >>> key_lens = torch.tensor([5, 5])
    >>> query_lens = torch.tensor([100, 100])
    >>> loss = loss_func(attn_logprob, key_lens, query_lens)
    rs   c                    s4   t ƒ  ¡  tjjdd| _tjjdd| _|| _d S )Nrú   rÆ   T)Úzero_infinity)	r   r   ry   r   r  r  ÚCTCLossÚctc_lossÚblank_logprob)r   r:  r   r   r   r   ý
  s   

zForwardSumLoss.__init__c           
   	   C   sÒ   t jjj|d| jd}d}t|jd ƒD ]K}t  d|| d ¡ d¡}||  	ddd¡d|| …dd…d|| d …f }|  
|d ¡d }| j|||||d … |||d … d}	||	 }q||jd  }|S )	aN  
        Arguments
        ---------
        attn_logprob: torch.Tensor
            log scale alignment potentials [B, 1, query_lens, key_lens]
        key_lens: torch.Tensor
            mel lengths
        query_lens: torch.Tensor
            phoneme lengths

        Returns
        -------
        total_loss: torch.Tensor
        rÇ   )ÚinputrË   rô  rJ   r   r0   rp   N)r  Útarget_lengths)ry   r   rÊ   rË   r:  r5   r|   ro  rz   r«   r  r9  )
r   Úattn_logprobÚkey_lensÚ
query_lensÚattn_logprob_paddedr6  ÚbidÚ
target_seqÚcurr_logprobrM  r   r   r   r      s&   ÿ ÿü
zForwardSumLoss.forward)rs   r!   r   r   r   r   r-  ë
  s    r-  c                       r+  )r/  aË  Binary loss that forces soft alignments to match the hard alignments as
    explained in `https://arxiv.org/pdf/2108.10447.pdf`.
    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.FastSpeech2 import BinaryAlignmentLoss
    >>> loss_func = BinaryAlignmentLoss()
    >>> alignment_hard = torch.randint(0, 2, (2, 100, 5))
    >>> alignment_soft = torch.rand(2, 100, 5)
    >>> loss = loss_func(alignment_hard, alignment_soft)
    c                    s   t ƒ  ¡  d S r$  )r   r   rm  r   r   r   r   7  s   zBinaryAlignmentLoss.__init__c                 C   s.   t  t j||dk dd¡ ¡ }| | ¡  S )zÐ
        alignment_hard: torch.Tensor
            hard alignment map [B, mel_lens, phoneme_lens]
        alignment_soft: torch.Tensor
            soft alignment potentials [B, mel_lens, phoneme_lens]
        r0   gê-™—q=rg  )ry   rh  r­   r“  )r   r5  r#  Úlog_sumr   r   r   r    :  s   ÿþzBinaryAlignmentLoss.forwardr!   r   r   r   r   r/  *  s    r/  )r¨   rJ   )r0   rf  )-r%   rì  rè  ry   Útorch.nn.functionalr   rÊ   r«  Útorch.nn.modules.lossr   Ú0speechbrain.lobes.models.transformer.Transformerr   r   r   r   Úspeechbrain.nnetr   r	   Úspeechbrain.nnet.embeddingr
   Úspeechbrain.nnet.lossesr   Úspeechbrain.nnet.normalizationr   ÚModuler   r'   rI   rV   rŽ   r°   r±   rî   r"  re  r^  r%  rk  râ  r  r  r  r,  r-  r/  r   r   r   r   Ú<module>   sV    	1ZK   
)"y 7
X]   Y_;    l [?