o
    %ݫi"                     @   sB  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ	 ddl
mZ ddlmZ G d	d
 d
ejjZG dd dejjZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdd ZeddZG dd  d ejZG d!d" d"Zd)d%d&Zd'd( ZdS )*z
Neural network modules for the Tacotron2 end-to-end neural
Text-to-Speech (TTS) model

Authors
* Georges Abous-Rjeili 2021
* Artem Ploujnikov 2021
    )
namedtuple)sqrtN)nn)
functional)get_mask_from_lengths)GuidedAttentionLossc                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	
LinearNorma:  A linear layer with Xavier initialization

    Arguments
    ---------
    in_dim: int
        the input dimension
    out_dim: int
        the output dimension
    bias: bool
        whether or not to use a bias
    w_init_gain: linear
        the weight initialization gain type (see torch.nn.init.calculate_gain)

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.Tacotron2 import LinearNorm
    >>> layer = LinearNorm(in_dim=5, out_dim=3)
    >>> x = torch.randn(3, 5)
    >>> y = layer(x)
    >>> y.shape
    torch.Size([3, 3])
    Tlinearc                    sB   t    tjj|||d| _tjjj| jjtjj	|d d S )Nbiasgain)
super__init__torchr   Linearlinear_layerinitxavier_uniform_weightcalculate_gain)selfin_dimout_dimr   w_init_gain	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/Tacotron2.pyr   O   s   

zLinearNorm.__init__c                 C   
   |  |S )zComputes the forward pass

        Arguments
        ---------
        x: torch.Tensor
            a (batch, features) input tensor


        Returns
        -------
        output: torch.Tensor
            the linear layer output

        )r   )r   xr   r   r   forwardX   s   
zLinearNorm.forward)Tr	   __name__
__module____qualname____doc__r   r!   __classcell__r   r   r   r   r   6   s    	r   c                       s6   e Zd ZdZ						d
 fdd	Zdd	 Z  ZS )ConvNorma  A 1D convolution layer with Xavier initialization

    Arguments
    ---------
    in_channels: int
        the number of input channels
    out_channels: int
        the number of output channels
    kernel_size: int
        the kernel size
    stride: int
        the convolutional stride
    padding: int
        the amount of padding to include. If not provided, it will be calculated
        as dilation * (kernel_size - 1) / 2
    dilation: int
        the dilation of the convolution
    bias: bool
        whether or not to use a bias
    w_init_gain: linear
        the weight initialization gain type (see torch.nn.init.calculate_gain)

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.Tacotron2 import ConvNorm
    >>> layer = ConvNorm(in_channels=10, out_channels=5, kernel_size=3)
    >>> x = torch.randn(3, 10, 5)
    >>> y = layer(x)
    >>> y.shape
    torch.Size([3, 5, 5])
       NTr	   c	           	   	      sv   t    |d u r|d dksJ t||d  d }tjj|||||||d| _tjjj| jj	tjj
|d d S )N   r)   )kernel_sizestridepaddingdilationr   r   )r   r   intr   r   Conv1dconvr   r   r   r   )	r   in_channelsout_channelsr+   r,   r-   r.   r   r   r   r   r   r      s    


zConvNorm.__init__c                 C   r   )zComputes the forward pass

        Arguments
        ---------
        signal: torch.Tensor
            the input to the convolutional layer

        Returns
        -------
        output: torch.Tensor
            the output
        )r1   )r   signalr   r   r   r!      s   
zConvNorm.forward)r)   r)   Nr)   Tr	   r"   r   r   r   r   r(   j   s    %r(   c                       s0   e Zd ZdZ			d	 fdd	Zdd Z  ZS )
LocationLayera  A location-based attention layer consisting of a Xavier-initialized
    convolutional layer followed by a dense layer

    Arguments
    ---------
    attention_n_filters: int
        the number of filters used in attention

    attention_kernel_size: int
        the kernel size of the attention layer

    attention_dim: int
        the dimension of linear attention layers


    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.Tacotron2 import LocationLayer
    >>> layer = LocationLayer()
    >>> attention_weights_cat = torch.randn(3, 2, 64)
    >>> processed_attention = layer(attention_weights_cat)
    >>> processed_attention.shape
    torch.Size([3, 64, 128])

              c              	      sH   t    t|d d }td|||dddd| _t||ddd| _d S )Nr)   r*   F)r+   r-   r   r,   r.   tanhr   r   )r   r   r/   r(   location_convr   location_dense)r   attention_n_filtersattention_kernel_sizeattention_dimr-   r   r   r   r      s   
	zLocationLayer.__init__c                 C   s$   |  |}|dd}| |}|S )a.  Performs the forward pass for the attention layer

        Arguments
        ---------
        attention_weights_cat: torch.Tensor
            the concatenating attention weights

        Returns
        -------
        processed_attention: torch.Tensor
            the attention layer output

        r)   r*   )r;   	transposer<   )r   attention_weights_catprocessed_attentionr   r   r   r!      s   

zLocationLayer.forward)r6   r7   r8   r"   r   r   r   r   r5      s    r5   c                       s<   e Zd ZdZ					d fdd	Zd	d
 Zdd Z  ZS )	AttentionaC  The Tacotron attention layer. Location-based attention is used.

    Arguments
    ---------
    attention_rnn_dim: int
        the dimension of the RNN to which the attention layer
        is applied
    embedding_dim: int
        the embedding dimension
    attention_dim: int
        the dimension of the memory cell
    attention_location_n_filters: int
        the number of location filters
    attention_location_kernel_size: int
        the kernel size of the location layer

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.Tacotron2 import (
    ... Attention)
    >>> from speechbrain.lobes.models.transformer.Transformer import (
    ... get_mask_from_lengths)
    >>> layer = Attention()
    >>> attention_hidden_state = torch.randn(2, 1024)
    >>> memory = torch.randn(2, 173, 512)
    >>> processed_memory = torch.randn(2, 173, 128)
    >>> attention_weights_cat = torch.randn(2, 2, 173)
    >>> memory_lengths = torch.tensor([173, 91])
    >>> mask = get_mask_from_lengths(memory_lengths)
    >>> attention_context, attention_weights = layer(
    ...    attention_hidden_state,
    ...    memory,
    ...    processed_memory,
    ...    attention_weights_cat,
    ...    mask
    ... )
    >>> attention_context.shape, attention_weights.shape
    (torch.Size([2, 512]), torch.Size([2, 173]))
          r8   r6   r7   c                    s\   t    t||ddd| _t||ddd| _t|ddd| _t|||| _td | _	d S )NFr9   r:   r)   r
   inf)
r   r   r   query_layermemory_layervr5   location_layerfloatscore_mask_value)r   attention_rnn_dimembedding_dimr?   attention_location_n_filtersattention_location_kernel_sizer   r   r   r   )  s   
zAttention.__init__c                 C   s@   |  |d}| |}| t|| | }|d}|S )a  Computes the alignment energies

        Arguments
        ---------
        query: torch.Tensor
            decoder output (batch, n_mel_channels * n_frames_per_step)
        processed_memory: torch.Tensor
            processed encoder outputs (B, T_in, attention_dim)
        attention_weights_cat: torch.Tensor
            cumulative and prev. att weights (B, 2, max_time)

        Returns
        -------
        alignment : torch.Tensor
            (batch, max_time)
        r)   r*   )rG   	unsqueezerJ   rI   r   r9   squeeze)r   queryprocessed_memoryrA   processed_queryprocessed_attention_weightsenergiesr   r   r   get_alignment_energies@  s   


z Attention.get_alignment_energiesc           	      C   sN   |  |||}||| j}tj|dd}t|d|}|d}||fS )a7  Computes the forward pass

        Arguments
        ---------
        attention_hidden_state: torch.Tensor
            attention rnn last output
        memory: torch.Tensor
            encoder outputs
        processed_memory: torch.Tensor
            processed encoder outputs
        attention_weights_cat: torch.Tensor
            previous and cumulative attention weights
        mask: torch.Tensor
            binary mask for padded data

        Returns
        -------
        result: tuple
            a (attention_context, attention_weights) tuple
        r)   dim)	rX   masked_fillrL   Fsoftmaxr   bmmrQ   rR   )	r   attention_hidden_statememoryrT   rA   mask	alignmentattention_weightsattention_contextr   r   r   r!   _  s   
zAttention.forward)rD   rE   r8   r6   r7   )r#   r$   r%   r&   r   rX   r!   r'   r   r   r   r   rC      s    +rC   c                       s4   e Zd ZdZdddgdf fdd	Zdd Z  ZS )	Preneta*  The Tacotron pre-net module consisting of a specified number of
    normalized (Xavier-initialized) linear layers

    Arguments
    ---------
    in_dim: int
        the input dimensions
    sizes: int
        the dimension of the hidden layers/output
    dropout: float
        the dropout probability

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.Tacotron2 import Prenet
    >>> layer = Prenet()
    >>> x = torch.randn(862, 2, 80)
    >>> output = layer(x)
    >>> output.shape
    torch.Size([862, 2, 256])
    P            ?c                    sB   t    |g|d d  }tdd t||D | _|| _d S )Nc                 S   s   g | ]\}}t ||d dqS )Fr
   )r   ).0in_sizeout_sizer   r   r   
<listcomp>  s    z#Prenet.__init__.<locals>.<listcomp>)r   r   r   
ModuleListziplayersdropout)r   r   sizesrq   in_sizesr   r   r   r     s   

zPrenet.__init__c                 C   s,   | j D ]}tjt||| jdd}q|S )zComputes the forward pass for the prenet

        Arguments
        ---------
        x: torch.Tensor
            the prenet inputs

        Returns
        -------
        output: torch.Tensor
            the output
        T)ptraining)rp   r\   rq   relu)r   r    r	   r   r   r   r!     s   
zPrenet.forwardr"   r   r   r   r   re     s    re   c                       s2   e Zd ZdZ				d	 fdd	Zdd Z  ZS )
Postnetah  The Tacotron postnet consists of a number of 1-d convolutional layers
    with Xavier initialization and a tanh activation, with batch normalization.
    Depending on configuration, the postnet may either refine the MEL spectrogram
    or upsample it to a linear spectrogram

    Arguments
    ---------
    n_mel_channels: int
        the number of MEL spectrogram channels
    postnet_embedding_dim: int
        the postnet embedding dimension
    postnet_kernel_size: int
        the kernel size of the convolutions within the decoders
    postnet_n_convolutions: int
        the number of convolutions in the postnet

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.Tacotron2 import Postnet
    >>> layer = Postnet()
    >>> x = torch.randn(2, 80, 861)
    >>> output = layer(x)
    >>> output.shape
    torch.Size([2, 80, 861])
    rf   rE      c                    s   t    t | _| jtt|||dt|d d dddt	| t
d|d D ]}| jtt|||dt|d d dddt	| q-| jtt|||dt|d d dddt	| t| j| _d S )Nr)   r*   r9   r+   r,   r-   r.   r   r	   )r   r   r   rn   convolutionsappend
Sequentialr(   r/   BatchNorm1drangelenn_convs)r   n_mel_channelspostnet_embedding_dimpostnet_kernel_sizepostnet_n_convolutionsir   r   r   r     s\   

			zPostnet.__init__c                 C   s^   d}| j D ]'}|| jd k rtjt||d| jd}ntj||d| jd}|d7 }q|S )az  Computes the forward pass of the postnet

        Arguments
        ---------
        x: torch.Tensor
            the postnet input (usually a MEL spectrogram)

        Returns
        -------
        output: torch.Tensor
            the postnet output (a refined MEL spectrogram or a
            linear spectrogram depending on how the model is
            configured)
        r   r)   rh   )ru   )rz   r   r\   rq   r   r9   ru   )r   r    r   r1   r   r   r   r!     s   

zPostnet.forward)rf   rE   rx   rx   r"   r   r   r   r   rw     s    9rw   c                       sH   e Zd ZdZ			d fdd	Zejjdd Zejj	d	d
 Z
  ZS )Encodera  The Tacotron2 encoder module, consisting of a sequence of  1-d convolution banks (3 by default)
    and a bidirectional LSTM

    Arguments
    ---------
    encoder_n_convolutions: int
        the number of encoder convolutions
    encoder_embedding_dim: int
        the dimension of the encoder embedding
    encoder_kernel_size: int
        the kernel size of the 1-D convolutional layers within
        the encoder

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.Tacotron2 import Encoder
    >>> layer = Encoder()
    >>> x = torch.randn(2, 512, 128)
    >>> input_lengths = torch.tensor([128, 83])
    >>> outputs = layer(x, input_lengths)
    >>> outputs.shape
    torch.Size([2, 128, 512])

       rE   rx   c                    s   t    g }t|D ]}tt|||dt|d d dddt|}|| qt	|| _
tj|t|d dddd| _d S )Nr)   r*   rv   ry   T)batch_firstbidirectional)r   r   r~   r   r|   r(   r/   r}   r{   rn   rz   LSTMlstm)r   encoder_n_convolutionsencoder_embedding_dimencoder_kernel_sizerz   _
conv_layerr   r   r   r   G  s0   
	
zEncoder.__init__c                 C   s   | j D ]}tt||d| j}q|dd}|  }tj	j
j||dd}| j  | |\}}tj	j
j|dd\}}|S )aD  Computes the encoder forward pass

        Arguments
        ---------
        x: torch.Tensor
            a batch of inputs (sequence embeddings)

        input_lengths: torch.Tensor
            a tensor of input lengths

        Returns
        -------
        outputs: torch.Tensor
            the encoder output
        rh   r)   r*   Tr   )rz   r\   rq   rv   ru   r@   cpunumpyr   utilsrnnpack_padded_sequencer   flatten_parameterspad_packed_sequence)r   r    input_lengthsr1   outputsr   r   r   r   r!   h  s   

zEncoder.forwardc              	   C   s   |j }| jD ]}tt|||d| j}q|dd}| }t	j
jj||dd}| |\}}t	j
jj|dd\}}|S )aS  Performs a forward step in the inference context

        Arguments
        ---------
        x: torch.Tensor
            a batch of inputs (sequence embeddings)

        input_lengths: torch.Tensor
            a tensor of input lengths

        Returns
        -------
        outputs: torch.Tensor
            the encoder output
        rh   r)   r*   Tr   )devicerz   r\   rq   rv   toru   r@   r   r   r   r   r   r   r   )r   r    r   r   r1   r   r   r   r   r   infer  s   
"zEncoder.infer)r   rE   rx   )r#   r$   r%   r&   r   r   jitignorer!   exportr   r'   r   r   r   r   r   ,  s    !
"r   c                       s   e Zd ZdZ											
				d fdd	Zdd Zdd Zdd Zdd Zdd Z	e
jjdd Ze
jjdd Z  ZS )Decodera  The Tacotron decoder

    Arguments
    ---------
    n_mel_channels: int
        the number of channels in the MEL spectrogram
    n_frames_per_step: int
        the number of frames in the spectrogram for each
        time step of the decoder
    encoder_embedding_dim: int
        the dimension of the encoder embedding
    attention_dim: int
        Size of attention vector
    attention_location_n_filters: int
        the number of filters in location-based attention
    attention_location_kernel_size: int
        the kernel size of location-based attention
    attention_rnn_dim: int
        RNN dimension for the attention layer
    decoder_rnn_dim: int
        the encoder RNN dimension
    prenet_dim: int
        the dimension of the prenet (inner and output layers)
    max_decoder_steps: int
        the maximum number of decoder steps for the longest utterance
        expected for the model
    gate_threshold: float
        the fixed threshold to which the outputs of the decoders will be compared
    p_attention_dropout: float
        dropout probability for attention layers
    p_decoder_dropout: float
        dropout probability for decoder layers
    early_stopping: bool
        Whether to stop training early.

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.Tacotron2 import Decoder
    >>> layer = Decoder()
    >>> memory = torch.randn(2, 173, 512)
    >>> decoder_inputs = torch.randn(2, 80, 173)
    >>> memory_lengths = torch.tensor([173, 91])
    >>> mel_outputs, gate_outputs, alignments = layer(
    ...     memory, decoder_inputs, memory_lengths)
    >>> mel_outputs.shape, gate_outputs.shape, alignments.shape
    (torch.Size([2, 80, 173]), torch.Size([2, 173]), torch.Size([2, 173, 173]))
    rf   r)   rE   r8   r6   r7   rD   rg     rh   皙?Tc                    s   t    || _|| _|| _|| _|| _|	| _|
| _|| _	|| _
|| _|| _t|| |	|	g| _t|	| || _t|||||| _t|| |d| _t|| || | _t|| dddd| _d S )Nr)   Tsigmoidr:   )r   r   r   n_frames_per_stepr   rM   decoder_rnn_dim
prenet_dimmax_decoder_stepsgate_thresholdp_attention_dropoutp_decoder_dropoutearly_stoppingre   prenetr   LSTMCellattention_rnnrC   attention_layerdecoder_rnnr   linear_projection
gate_layer)r   r   r   r   r?   rO   rP   rM   r   r   r   r   r   r   r   r   r   r   r     sL   

zDecoder.__init__c                 C   s4   | d}|j}|j}tj|| j| j ||d}|S )zGets all zeros frames to use as first decoder input

        Arguments
        ---------
        memory: torch.Tensor
            decoder outputs

        Returns
        -------
        decoder_input: torch.Tensor
            all zeros frames
        r   dtyper   )sizer   r   r   zerosr   r   )r   r`   Br   r   decoder_inputr   r   r   get_go_frame  s   

zDecoder.get_go_framec                 C   s   | d}| d}|j}|j}tj|| j||d}tj|| j||d}tj|| j||d}tj|| j||d}	tj||||d}
tj||||d}tj|| j||d}| j	|}||||	|
|||fS )ar  Initializes attention rnn states, decoder rnn states, attention
        weights, attention cumulative weights, attention context, stores memory
        and stores processed memory

        Arguments
        ---------
        memory: torch.Tensor
            Encoder outputs

        Returns
        -------
        attention_hidden: torch.Tensor
        attention_cell: torch.Tensor
        decoder_hidden: torch.Tensor
        decoder_cell: torch.Tensor
        attention_weights: torch.Tensor
        attention_weights_cum: torch.Tensor
        attention_context: torch.Tensor
        processed_memory: torch.Tensor
        r   r)   r   )
r   r   r   r   r   rM   r   r   r   rH   )r   r`   r   MAX_TIMEr   r   attention_hiddenattention_celldecoder_hiddendecoder_cellrc   attention_weights_cumrd   rT   r   r   r   initialize_decoder_states5  sB   






z!Decoder.initialize_decoder_statesc                 C   s@   | dd}||dt|d| j d}| dd}|S )a,  Prepares decoder inputs, i.e. mel outputs

        Arguments
        ---------
        decoder_inputs: torch.Tensor
            inputs used for teacher-forced training, i.e. mel-specs

        Returns
        -------
        decoder_inputs: torch.Tensor
            processed decoder inputs

        r)   r*   r   ri   )r@   viewr   r/   r   )r   decoder_inputsr   r   r   parse_decoder_inputsr  s   zDecoder.parse_decoder_inputsc                 C   sz   | dd }| dkr|d}n| dd }| dd }|jd d| jf}|j| }| dd}|||fS )a  Prepares decoder outputs for output

        Arguments
        ---------
        mel_outputs: torch.Tensor
            MEL-scale spectrogram outputs
        gate_outputs: torch.Tensor
            gate output energies
        alignments: torch.Tensor
            the alignment tensor

        Returns
        -------
        mel_outputs: torch.Tensor
            MEL-scale spectrogram outputs
        gate_outputs: torch.Tensor
            gate output energies
        alignments: torch.Tensor
            the alignment tensor
        r   r)   ri   r*   )r@   
contiguousrZ   rQ   shaper   r   )r   mel_outputsgate_outputs
alignmentsr   r   r   r   parse_decoder_outputs  s   

zDecoder.parse_decoder_outputsc              	   C   s   t ||fd}| |||f\}}t|| j| j}t j|d|dfdd}| ||	|
||\}}||7 }t ||fd}| 	|||f\}}t|| j
| j}t j||fdd}| |}| |}|||||||||f	S )aa  Decoder step using stored states, attention and memory
        Arguments
        ---------
        decoder_input: torch.Tensor
            previous mel output
        attention_hidden: torch.Tensor
            the hidden state of the attention module
        attention_cell: torch.Tensor
            the attention cell state
        decoder_hidden: torch.Tensor
            the decoder hidden state
        decoder_cell: torch.Tensor
            the decoder cell state
        attention_weights: torch.Tensor
            the attention weights
        attention_weights_cum: torch.Tensor
            cumulative attention weights
        attention_context: torch.Tensor
            the attention context tensor
        memory: torch.Tensor
            the memory tensor
        processed_memory: torch.Tensor
            the processed memory tensor
        mask: torch.Tensor



        Returns
        -------
        mel_output: torch.Tensor
            the MEL-scale outputs
        gate_output: torch.Tensor
            gate output energies
        attention_weights: torch.Tensor
            attention weights
        ri   r)   rY   )r   catr   r\   rq   r   ru   rQ   r   r   r   r   r   )r   r   r   r   r   r   rc   r   rd   r`   rT   ra   
cell_inputrA    decoder_hidden_attention_contextdecoder_outputgate_predictionr   r   r   decode  sZ   2


zDecoder.decodec                 C   s(  |  |d}| |}tj||fdd}| |}t|}| |\}}}}	}
}}}g g g }}}t||	dd k r||t| }| 
|||||	|
|||||\	}}}}}}	}
}}||dg7 }|| g7 }||
g7 }t||	dd k s?| t|t|t|\}}}|||fS )a`  Decoder forward pass for training

        Arguments
        ---------
        memory: torch.Tensor
            Encoder outputs
        decoder_inputs: torch.Tensor
            Decoder inputs for teacher forcing. i.e. mel-specs
        memory_lengths: torch.Tensor
            Encoder output lengths for attention masking.

        Returns
        -------
        mel_outputs: torch.Tensor
            mel outputs from the decoder
        gate_outputs: torch.Tensor
            gate outputs from the decoder
        alignments: torch.Tensor
            sequence of attention weights from the decoder
        r   rY   r)   )r   rQ   r   r   r   r   r   r   r   r   r   rR   r   stack)r   r`   r   memory_lengthsr   ra   r   r   r   r   rc   r   rd   rT   r   r   r   
mel_outputgate_outputr   r   r   r!     sf   





zDecoder.forwardc                 C   s  |  |}t|}| |\}}}}}	}
}}tj|dgtj|jd}tj|dgtj|jd}tdtdtd}}}d}	 | 	|}| 
||||||	|
||||\	}}}}}}}	}
}|rq|d}|}|	}d}ntj||dfdd}tj||fdd}tj||	fdd}tt|| jtjd}|| }||7 }| jrt|dkrnt|| jkrn|}qF| |||\}}}||||fS )a=  Decoder inference

        Arguments
        ---------
        memory: torch.Tensor
            Encoder outputs
        memory_lengths: torch.Tensor
            The corresponding relative lengths of the inputs.

        Returns
        -------
        mel_outputs: torch.Tensor
            mel outputs from the decoder
        gate_outputs: torch.Tensor
            gate outputs from the decoder
        alignments: torch.Tensor
            sequence of attention weights from the decoder
        mel_lengths: torch.Tensor
            the length of MEL spectrograms
        r   r   r)   TFrY   )r   r   r   r   r   r   int32r   onesr   r   rQ   r   ler   r   r   rR   r   sumr   r   r   )r   r`   r   r   ra   r   r   r   r   rc   r   rd   rT   mel_lengthsnot_finishedr   r   r   
first_iterr   r   decr   r   r   r   i  s   




5
zDecoder.infer)rf   r)   rE   r8   r6   r7   rD   rD   rg   r   rh   r   r   T)r#   r$   r%   r&   r   r   r   r   r   r   r   r   r   r!   r   r   r'   r   r   r   r   r     s4    3>=&i
Nr   c                       sj   e Zd ZdZ											
												d fdd	ZdddZdddZdd Z  ZS )	Tacotron2a  The Tactron2 text-to-speech model, based on the NVIDIA implementation.

    This class is the main entry point for the model, which is responsible
    for instantiating all submodules, which, in turn, manage the individual
    neural network layers

    Simplified STRUCTURE: input->word embedding ->encoder ->attention     ->decoder(+prenet) -> postnet ->output

    prenet(input is decoder previous time step) output is input to decoder
    concatenated with the attention output

    Arguments
    ---------
    mask_padding: bool
        whether or not to mask pad-outputs of tacotron
    n_mel_channels: int
        number of mel channels for constructing spectrogram
    n_symbols:  int=128
        number of accepted char symbols defined in textToSequence
    symbols_embedding_dim: int
        number of embedding dimension for symbols fed to nn.Embedding
    encoder_kernel_size: int
        size of kernel processing the embeddings
    encoder_n_convolutions: int
        number of convolution layers in encoder
    encoder_embedding_dim: int
        number of kernels in encoder, this is also the dimension
        of the bidirectional LSTM in the encoder
    attention_rnn_dim: int
        input dimension
    attention_dim: int
        number of hidden representation in attention
    attention_location_n_filters: int
        number of 1-D convolution filters in attention
    attention_location_kernel_size: int
        length of the 1-D convolution filters
    n_frames_per_step: int=1
        only 1 generated mel-frame per step is supported for the decoder as of now.
    decoder_rnn_dim: int
        number of 2 unidirectional stacked LSTM units
    prenet_dim: int
        dimension of linear prenet layers
    max_decoder_steps: int
        maximum number of steps/frames the decoder generates before stopping
    gate_threshold: int
        cut off level any output probability above that is considered
        complete and stops generation so we have variable length outputs
    p_attention_dropout: float
        attention drop out probability
    p_decoder_dropout: float
        decoder drop  out probability
    postnet_embedding_dim: int
        number os postnet dfilters
    postnet_kernel_size: int
        1d size of posnet kernel
    postnet_n_convolutions: int
        number of convolution layers in postnet
    decoder_no_early_stopping: bool
        determines early stopping of decoder
        along with gate_threshold . The logical inverse of this is fed to the decoder

    Example
    -------
    >>> import torch
    >>> _ = torch.manual_seed(213312)
    >>> from speechbrain.lobes.models.Tacotron2 import Tacotron2
    >>> model = Tacotron2(
    ...    mask_padding=True,
    ...    n_mel_channels=80,
    ...    n_symbols=148,
    ...    symbols_embedding_dim=512,
    ...    encoder_kernel_size=5,
    ...    encoder_n_convolutions=3,
    ...    encoder_embedding_dim=512,
    ...    attention_rnn_dim=1024,
    ...    attention_dim=128,
    ...    attention_location_n_filters=32,
    ...    attention_location_kernel_size=31,
    ...    n_frames_per_step=1,
    ...    decoder_rnn_dim=1024,
    ...    prenet_dim=256,
    ...    max_decoder_steps=32,
    ...    gate_threshold=0.5,
    ...    p_attention_dropout=0.1,
    ...    p_decoder_dropout=0.1,
    ...    postnet_embedding_dim=512,
    ...    postnet_kernel_size=5,
    ...    postnet_n_convolutions=5,
    ...    decoder_no_early_stopping=False
    ... )
    >>> _ = model.eval()
    >>> inputs = torch.tensor([
    ...     [13, 12, 31, 14, 19],
    ...     [31, 16, 30, 31, 0],
    ... ])
    >>> input_lengths = torch.tensor([5, 4])
    >>> outputs, output_lengths, alignments = model.infer(inputs, input_lengths)
    >>> outputs.shape, output_lengths.shape, alignments.shape
    (torch.Size([2, 80, 1]), torch.Size([2]), torch.Size([2, 1, 5]))
    Trf      rE   rx   r   rD   r8   r6   r7   r)   rg   r   rh   r   Fc                    s   t    || _|| _|| _t||| _td||  }td| }| jj	j
| | t|||| _t||||	|
||||||||| | _t||||| _d S )Ng       @g      @)r   r   mask_paddingr   r   r   	Embedding	embeddingr   r   datauniform_r   encoderr   decoderrw   postnet)r   r   r   	n_symbolssymbols_embedding_dimr   r   r   rM   r?   rO   rP   r   r   r   r   r   r   r   r   r   r   decoder_no_early_stoppingstdvalr   r   r   r   =  sB   
 
zTacotron2.__init__Nc           	      C   s   |\}}}}| j rH|durHt||dd}|| j|d|d}|ddd}| |d ||d ||dddddf d |durYt	|d||d f}||||fS )	a  
        Masks the padded part of output

        Arguments
        ---------
        outputs: list
            a list of tensors - raw outputs
        output_lengths: torch.Tensor
            a tensor representing the lengths of all outputs
        alignments_dim: int
            the desired dimension of the alignments along the last axis
            Optional but needed for data-parallel training


        Returns
        -------
        mel_outputs: torch.Tensor
        mel_outputs_postnet: torch.Tensor
        gate_outputs: torch.Tensor
        alignments: torch.Tensor
            the original outputs - with the mask applied
        Nri   )max_lenr   r)   r*           g     @@)
r   r   r   expandr   permuteclonemasked_fill_r\   pad)	r   r   output_lengthsalignments_dimr   mel_outputs_postnetr   r   ra   r   r   r   parse_output  s   
zTacotron2.parse_outputc                 C   sx   |\}}}}}|j |j }}| |dd}| ||}| j|||d\}	}
}| |	}|	| }| |	||
|g||S )a  Decoder forward pass for training

        Arguments
        ---------
        inputs: tuple
            batch object
        alignments_dim: int
            the desired dimension of the alignments along the last axis
            Optional but needed for data-parallel training

        Returns
        -------
        mel_outputs: torch.Tensor
            mel outputs from the decoder
        mel_outputs_postnet: torch.Tensor
            mel outputs from postnet
        gate_outputs: torch.Tensor
            gate outputs from the decoder
        alignments: torch.Tensor
            sequence of attention weights from the decoder
        output_lengths: torch.Tensor
            length of the output without padding
        r)   r*   )r   )r   r   r@   r   r   r   r   )r   inputsr   r   targetsr   r   embedded_inputsencoder_outputsr   r   r   r   r   r   r   r!     s   

zTacotron2.forwardc                 C   sr   |  |dd}| j||}| j||\}}}}| |}	||	 }	|	d}
|d|
|
dd}|	||fS )a  Produces outputs


        Arguments
        ---------
        inputs: torch.tensor
            text or phonemes converted

        input_lengths: torch.tensor
            the lengths of input parameters

        Returns
        -------
        mel_outputs_postnet: torch.Tensor
            final mel output of tacotron 2
        mel_lengths: torch.Tensor
            length of mels
        alignments: torch.Tensor
            sequence of attention weights
        r)   r*   r   )r   r@   r   r   r   r   r   unfold)r   r   r   r   r   r   r   r   r   r   BSr   r   r   r     s   


zTacotron2.infer)Trf   r   rE   rx   r   rE   rD   r8   r6   r7   r)   rD   rg   r   rh   r   r   rE   rx   rx   FN)	r#   r$   r%   r&   r   r   r!   r   r'   r   r   r   r   r     s8    h
B
)-r   c                 C   s   |  ||S )a}  
    An inference hook for pretrained synthesizers

    Arguments
    ---------
    model: Tacotron2
        the tacotron model
    text_sequences: torch.Tensor
        encoded text sequences
    input_lengths: torch.Tensor
        input lengths

    Returns
    -------
    result: tuple
        (mel_outputs_postnet, mel_lengths, alignments) - the exact
        model output
    )r   )modeltext_sequencesr   r   r   r   r     s   r   TacotronLossz-loss mel_loss gate_loss attn_loss attn_weightc                       s<   e Zd ZdZ					d
 fdd	Zdd Zdd	 Z  ZS )Lossa  The Tacotron loss implementation

    The loss consists of an MSE loss on the spectrogram, a BCE gate loss
    and a guided attention loss (if enabled) that attempts to make the
    attention matrix diagonal

    The output of the module is a LossStats tuple, which includes both the
    total loss

    Arguments
    ---------
    guided_attention_sigma: float
        The guided attention sigma factor, controlling the "width" of
        the mask
    gate_loss_weight: float
        The constant by which the hate loss will be multiplied
    guided_attention_weight: float
        The weight for the guided attention
    guided_attention_scheduler: callable
        The scheduler class for the guided attention loss
    guided_attention_hard_stop: int
        The number of epochs after which guided attention will be completely
        turned off

    Example
    -------
    >>> import torch
    >>> _ = torch.manual_seed(42)
    >>> from speechbrain.lobes.models.Tacotron2 import Loss
    >>> loss = Loss(guided_attention_sigma=0.2)
    >>> mel_target = torch.randn(2, 80, 861)
    >>> gate_target = torch.randn(1722, 1)
    >>> mel_out = torch.randn(2, 80, 861)
    >>> mel_out_postnet = torch.randn(2, 80, 861)
    >>> gate_out = torch.randn(2, 861)
    >>> alignments = torch.randn(2, 861, 173)
    >>> targets = mel_target, gate_target
    >>> model_outputs = mel_out, mel_out_postnet, gate_out, alignments
    >>> input_lengths = torch.tensor([173,  91])
    >>> target_lengths = torch.tensor([861, 438])
    >>> loss(model_outputs, targets, input_lengths, target_lengths, 1)
    TacotronLoss(loss=tensor(4.8566), mel_loss=tensor(4.0097), gate_loss=tensor(0.8460), attn_loss=tensor(0.0010), attn_weight=tensor(1.))
    N      ?c                    sX   t    |dkrd }|| _t | _t | _t|d| _	|| _
|| _|| _|| _d S )Nr   )sigma)r   r   guided_attention_weightr   MSELossmse_lossBCEWithLogitsLossbce_lossr   guided_attention_lossgate_loss_weightguided_attention_schedulerguided_attention_hard_stop)r   guided_attention_sigmar
  r  r  r  r   r   r   r   B  s   



zLoss.__init__c                 C   s   |d |d }}d|_ d|_ |dd}|\}}	}
}|
dd}
| ||| |	| }| j| |
| }| ||||\}}|| | }t|||||S )a  Computes the loss

        Arguments
        ---------
        model_output: tuple
            the output of the model's forward():
            (mel_outputs, mel_outputs_postnet, gate_outputs, alignments)
        targets: tuple
            the targets
        input_lengths: torch.Tensor
            a (batch, length) tensor of input lengths
        target_lengths: torch.Tensor
            a (batch, length) tensor of target (spectrogram) lengths
        epoch: int
            the current epoch number (used for the scheduling of the guided attention
            loss) A StepScheduler is typically used

        Returns
        -------
        result: LossStats
            the total loss - and individual losses (mel and gate)

        r   r)   Fri   )requires_gradr   r  r
  r  get_attention_loss	LossStats)r   model_outputr   r   target_lengthsepoch
mel_targetgate_targetmel_outmel_out_postnetgate_outr   mel_loss	gate_loss	attn_lossattn_weight
total_lossr   r   r   r!   X  s"   
zLoss.forwardc           
      C   s   t jd|jd}| jdu s| jdkr||}}||fS | jduo$|| jk}|r-||}}n| j}| jdur<| |\}	}t j||jd}|| ||| }||fS )aa  Computes the attention loss

        Arguments
        ---------
        alignments: torch.Tensor
            the alignment matrix from the model
        input_lengths: torch.Tensor
            a (batch, length) tensor of input lengths
        target_lengths: torch.Tensor
            a (batch, length) tensor of target (spectrogram) lengths
        epoch: int
            the current epoch number (used for the scheduling of the guided attention
            loss) A StepScheduler is typically used

        Returns
        -------
        attn_loss: torch.Tensor
            the attention loss value
        r   )r   Nr   )r   tensorr   r  r  r  r	  )
r   r   r   r  r  zero_tensorr  r  hard_stop_reachedr   r   r   r   r    s$   




zLoss.get_attention_loss)Nr  r  NN)r#   r$   r%   r&   r   r!   r  r'   r   r   r   r   r    s    ..r  c                   @   s"   e Zd ZdZdddZdd ZdS )	TextMelCollatezZero-pads model inputs and targets based on number of frames per step

    Arguments
    ---------
    n_frames_per_step: int
        the number of output frames per step
    r)   c                 C   s
   || _ d S r   )r   )r   r   r   r   r   r     s   
zTextMelCollate.__init__c              	   C   s  t |}tt|D ]
}|| d ||< q
tjtdd |D ddd\}}|d }tt||}|  tt|D ]}|||  d }|||d|df< q=|d d d}	td	d |D }
|
| j	 dkr|
| j	|
| j	  7 }
|
| j	 dksJ t
t||	|
}|  t
t||
}|  tt|}g g }}tt|D ]@}|| }|| d }|||ddd|df< d|||dd df< |d||< ||| d
  ||| d  qdd |D }t|}||||||||fS )a  Collate's training batch from normalized text and mel-spectrogram

        Arguments
        ---------
        batch: list
            [text_normalized, mel_normalized]

        Returns
        -------
        text_padded: torch.Tensor
        input_lengths: torch.Tensor
        mel_padded: torch.Tensor
        gate_padded: torch.Tensor
        output_lengths: torch.Tensor
        len_x: torch.Tensor
        labels: torch.Tensor
        wavs: torch.Tensor
        mel_text_pairc                 S   s   g | ]}t |d  qS )r   )r   rj   r    r   r   r   rm     s    z+TextMelCollate.__call__.<locals>.<listcomp>r   T)rZ   
descendingNr)   c                 S   s   g | ]	}|d   d qS r)   )r   r#  r   r   r   rm     s    labelwavc                 S   s   g | ]}|d  qS )r*   r   r#  r   r   r   rm     s    )listr~   r   r   sort
LongTensorzero_r   maxr   FloatTensorr{   Tensor)r   batch	raw_batchr   r   ids_sorted_decreasingmax_input_lentext_paddedtextnum_melsmax_target_len
mel_paddedgate_paddedr   labelswavsidxmellen_xr   r   r   __call__  s\   


zTextMelCollate.__call__Nr%  )r#   r$   r%   r&   r   r>  r   r   r   r   r!    s    
r!  r)   h㈵>c                 C   s   t t j| |d| S )z+Dynamic range compression for audio signals)min)r   logclamp)r    Cclip_valr   r   r   dynamic_range_compression  s   rE  c                 C   sL   ddl m} |j| |||||||||	|
d|j}||}|r$t|}|S )a  calculates MelSpectrogram for a raw audio signal

    Arguments
    ---------
    sample_rate : int
        Sample rate of audio signal.
    hop_length : int
        Length of hop between STFT windows.
    win_length : int
        Window size.
    n_fft : int
        Size of FFT.
    n_mels : int
        Number of mel filterbanks.
    f_min : float
        Minimum frequency.
    f_max : float
        Maximum frequency.
    power : float
        Exponent for the magnitude spectrogram.
    normalized : bool
        Whether to normalize by magnitude after stft.
    norm : str or None
        If "slaney", divide the triangular mel weights by the width of the mel band
    mel_scale : str
        Scale to use: "htk" or "slaney".
    compression : bool
        whether to do dynamic range compression
    audio : torch.Tensor
        input audio signal

    Returns
    -------
    mel : torch.Tensor
        The computed mel spectrogram features.
    r   )
transforms)sample_rate
hop_length
win_lengthn_fftn_melsf_minf_maxpower
normalizednorm	mel_scale)
torchaudiorF  MelSpectrogramr   r   rE  )rG  rH  rI  rJ  rK  rL  rM  rN  rO  rP  rQ  compressionaudiorF  audio_to_melr<  r   r   r   mel_spectogram  s(   3rW  )r)   r?  )r&   collectionsr   mathr   r   r   torch.nnr   r\   0speechbrain.lobes.models.transformer.Transformerr   %speechbrain.nnet.loss.guidedattn_lossr   Moduler   r(   r5   rC   re   rw   r   r   r   r   r  r  r!  rE  rW  r   r   r   r   <module>   sD    (4PE 
5o     -  &  
\