o
    ½e¦iÙj  ã                   @   s¬   d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z
 ddlmZmZmZmZmZ ddlmZ G d	d
„ d
ejƒZeddƒZG dd„ dejƒZG dd„ dƒZdS )zÅ
Neural network modules for the Zero-Shot Multi-Speaker Tacotron2 end-to-end neural
Text-to-Speech (TTS) model

Authors
* Georges Abous-Rjeili 2021
* Artem Ploujnikov 2021
* Pradnya Kandarkar 2023
é    N)Ú
namedtuple)Úsqrt)Únn)Ú
functional)ÚDecoderÚEncoderÚ
LinearNormÚPostnetÚget_mask_from_lengths)ÚGuidedAttentionLossc                       sj   e Zd ZdZ											
												d‡ fdd„	Zddd„Zddd„Zdd„ Z‡  ZS )Ú	Tacotron2ax  The Tactron2 text-to-speech model, based on the NVIDIA implementation.

    This class is the main entry point for the model, which is responsible
    for instantiating all submodules, which, in turn, manage the individual
    neural network layers

    Simplified STRUCTURE: phoneme input->token embedding ->encoder -> (encoder output + speaker embedding) ->attention     ->decoder(+prenet) -> postnet ->output

    prenet(input is decoder previous time step) output is input to decoder
    concatenated with the attention output

    Arguments
    ---------
    spk_emb_size: int
        Speaker embedding size
    mask_padding: bool
        whether or not to mask pad-outputs of tacotron
    n_mel_channels: int
        number of mel channels for constructing spectrogram
    n_symbols:  int=128
        number of accepted char symbols defined in textToSequence
    symbols_embedding_dim: int
        number of embedding dimension for symbols fed to nn.Embedding
    encoder_kernel_size: int
        size of kernel processing the embeddings
    encoder_n_convolutions: int
        number of convolution layers in encoder
    encoder_embedding_dim: int
        number of kernels in encoder, this is also the dimension
        of the bidirectional LSTM in the encoder
    attention_rnn_dim: int
        input dimension
    attention_dim: int
        number of hidden representation in attention
    attention_location_n_filters: int
        number of 1-D convolution filters in attention
    attention_location_kernel_size: int
        length of the 1-D convolution filters
    n_frames_per_step: int=1
        only 1 generated mel-frame per step is supported for the decoder as of now.
    decoder_rnn_dim: int
        number of 2 unidirectional stacked LSTM units
    prenet_dim: int
        dimension of linear prenet layers
    max_decoder_steps: int
        maximum number of steps/frames the decoder generates before stopping
    gate_threshold: int
        cut off level any output probability above that is considered
        complete and stops generation so we have variable length outputs
    p_attention_dropout: float
        attention drop out probability
    p_decoder_dropout: float
        decoder drop  out probability
    postnet_embedding_dim: int
        number os postnet dfilters
    postnet_kernel_size: int
        1d size of posnet kernel
    postnet_n_convolutions: int
        number of convolution layers in postnet
    decoder_no_early_stopping: bool
        determines early stopping of decoder
        along with gate_threshold . The logical inverse of this is fed to the decoder

    Example
    -------
    >>> import torch
    >>> _ = torch.manual_seed(213312)
    >>> from speechbrain.lobes.models.Tacotron2 import Tacotron2
    >>> model = Tacotron2(
    ...    mask_padding=True,
    ...    n_mel_channels=80,
    ...    n_symbols=148,
    ...    symbols_embedding_dim=512,
    ...    encoder_kernel_size=5,
    ...    encoder_n_convolutions=3,
    ...    encoder_embedding_dim=512,
    ...    attention_rnn_dim=1024,
    ...    attention_dim=128,
    ...    attention_location_n_filters=32,
    ...    attention_location_kernel_size=31,
    ...    n_frames_per_step=1,
    ...    decoder_rnn_dim=1024,
    ...    prenet_dim=256,
    ...    max_decoder_steps=32,
    ...    gate_threshold=0.5,
    ...    p_attention_dropout=0.1,
    ...    p_decoder_dropout=0.1,
    ...    postnet_embedding_dim=512,
    ...    postnet_kernel_size=5,
    ...    postnet_n_convolutions=5,
    ...    decoder_no_early_stopping=False
    ... )
    >>> _ = model.eval()
    >>> inputs = torch.tensor([
    ...     [13, 12, 31, 14, 19],
    ...     [31, 16, 30, 31, 0],
    ... ])
    >>> input_lengths = torch.tensor([5, 4])
    >>> outputs, output_lengths, alignments = model.infer(inputs, input_lengths)
    >>> outputs.shape, output_lengths.shape, alignments.shape
    (torch.Size([2, 80, 1]), torch.Size([2]), torch.Size([2, 1, 5]))
    TéP   é”   é   é   é   é   é€   é    é   é   é   éè  ç      à?çš™™™™™¹?Fc                    sÞ   t ƒ  ¡  || _|| _|| _t ||¡| _td||  ƒ}tdƒ| }| jj	j
 | |¡ t|||ƒ| _t||||
|||	||||||| ƒ| _t||||ƒ| _t|| d ƒ| _t|| jƒ| _t| j|ƒ| _t| j|ƒ| _d S )Ng       @g      @é   )ÚsuperÚ__init__Úmask_paddingÚn_mel_channelsÚn_frames_per_stepr   Ú	EmbeddingÚ	embeddingr   ÚweightÚdataÚuniform_r   Úencoderr   Údecoderr	   ÚpostnetÚintÚms_film_hidden_sizer   Úms_film_hiddenÚ	ms_film_hÚ	ms_film_g)ÚselfÚspk_emb_sizer   r   Ú	n_symbolsÚsymbols_embedding_dimÚencoder_kernel_sizeÚencoder_n_convolutionsÚencoder_embedding_dimÚattention_rnn_dimÚattention_dimÚattention_location_n_filtersÚattention_location_kernel_sizer    Údecoder_rnn_dimÚ
prenet_dimÚmax_decoder_stepsÚgate_thresholdÚp_attention_dropoutÚp_decoder_dropoutÚpostnet_embedding_dimÚpostnet_kernel_sizeÚpostnet_n_convolutionsÚdecoder_no_early_stoppingÚstdÚval©Ú	__class__© úb/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/MSTacotron2.pyr   ¥   sV   
!ÿòü	
ÿÿ
ÿzTacotron2.__init__Nc           	      C   sÀ   |\}}}}| j rH|durHt|| d¡d}| | j| d¡| d¡¡}| ddd¡}| ¡  |d¡ | |d¡ | |dd…ddd…f d¡ |durYt 	|d|| d¡ f¡}|||||fS )	a¨  
        Masks the padded part of output

        Arguments
        ---------
        outputs: list
            a list of tensors - raw outputs
        output_lengths: torch.Tensor
            a tensor representing the lengths of all outputs
        alignments_dim: int
            the desired dimension of the alignments along the last axis
            Optional but needed for data-parallel training

        Returns
        -------
        mel_outputs: torch.Tensor
        mel_outputs_postnet: torch.Tensor
        gate_outputs: torch.Tensor
        alignments: torch.Tensor
        output_lengths: torch.Tensor
            the original outputs - with the mask applied
        Néÿÿÿÿ)Úmax_lenr   r   r   ç        g     @@)
r   r
   ÚsizeÚexpandr   ÚpermuteÚcloneÚmasked_fill_ÚFÚpad)	r.   ÚoutputsÚoutput_lengthsÚalignments_dimÚmel_outputsÚmel_outputs_postnetÚgate_outputsÚ
alignmentsÚmaskrG   rG   rH   Úparse_outputõ   s(   
ÿÿûzTacotron2.parse_outputc                 C   sä   |\}}}}}|j |j }}|  |¡ dd¡}|  ||¡}	t |  |¡¡}
|  |
¡}t 	|d¡ 
d|	jd d¡}|	| }	|  |
¡}t 	|d¡ 
d|	jd d¡}|	| }	| j|	||d\}}}|  |¡}|| }|  ||||g||¡S )a1  Decoder forward pass for training

        Arguments
        ---------
        inputs: tuple
            batch object
        spk_embs: torch.Tensor
            Speaker embeddings corresponding to the inputs
        alignments_dim: int
            the desired dimension of the alignments along the last axis
            Optional but needed for data-parallel training

        Returns
        -------
        mel_outputs: torch.Tensor
            mel outputs from the decoder
        mel_outputs_postnet: torch.Tensor
            mel outputs from postnet
        gate_outputs: torch.Tensor
            gate outputs from the decoder
        alignments: torch.Tensor
            sequence of attention weights from the decoder
        output_lengths: torch.Tensor
            length of the output without padding
        r   r   )Úmemory_lengths)r$   r"   Ú	transposer&   rQ   Úrelur+   r,   ÚtorchÚ	unsqueezeÚrepeatÚshaper-   r'   r(   r[   )r.   ÚinputsÚspk_embsrU   Úinput_lengthsÚtargetsrJ   rT   Úembedded_inputsÚencoder_outputsÚspk_embs_sharedÚ
spk_embs_hÚ
spk_embs_grV   rX   rY   rW   rG   rG   rH   Úforward$  s2   
ÿ
ÿÿ

ýzTacotron2.forwardc                 C   sÞ   |   |¡ dd¡}| j ||¡}t |  |¡¡}|  |¡}t 	|d¡ 
d|jd d¡}|| }|  |¡}t 	|d¡ 
d|jd d¡}|| }| j ||¡\}	}
}}|  |	¡}|	| }| d¡}| d||¡ dd¡}|||fS )a*  Produces outputs

        Arguments
        ---------
        inputs: torch.tensor
            text or phonemes converted
        spk_embs: torch.Tensor
            Speaker embeddings corresponding to the inputs
        input_lengths: torch.tensor
            the lengths of input parameters

        Returns
        -------
        mel_outputs_postnet: torch.Tensor
            final mel output of tacotron 2
        mel_lengths: torch.Tensor
            length of mels
        alignments: torch.Tensor
            sequence of attention weights
        r   r   r   )r"   r]   r&   ÚinferrQ   r^   r+   r,   r_   r`   ra   rb   r-   r'   r(   rL   Úunfold)r.   rc   rd   re   rg   rh   ri   rj   rk   rV   rX   rY   Úmel_lengthsrW   ÚBSrG   rG   rH   rm   a  s*   
ÿ
ÿÿ


zTacotron2.infer)Tr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   F©N)	Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r[   rl   rm   Ú__classcell__rG   rG   rE   rH   r   <   s8    ká
P
/=r   ÚTacotronLossz:loss mel_loss spk_emb_loss gate_loss attn_loss attn_weightc                       sB   e Zd ZdZ								d
‡ fdd„	Zdd„ Zdd	„ Z‡  ZS )ÚLossa|  The Tacotron loss implementation
    The loss consists of an MSE loss on the spectrogram, a BCE gate loss
    and a guided attention loss (if enabled) that attempts to make the
    attention matrix diagonal
    The output of the module is a LossStats tuple, which includes both the
    total loss

    Arguments
    ---------
    guided_attention_sigma: float
        The guided attention sigma factor, controlling the "width" of
        the mask
    gate_loss_weight: float
        The constant by which the gate loss will be multiplied
    mel_loss_weight: float
        The constant by which the mel loss will be multiplied
    spk_emb_loss_weight: float
        The constant by which the speaker embedding loss will be multiplied - placeholder for future work
    spk_emb_loss_type: str
        Type of the speaker embedding loss - placeholder for future work
    guided_attention_weight: float
        The weight for the guided attention
    guided_attention_scheduler: callable
        The scheduler class for the guided attention loss
    guided_attention_hard_stop: int
        The number of epochs after which guided attention will be completely
        turned off

    Example
    -------
    >>> import torch
    >>> _ = torch.manual_seed(42)
    >>> from speechbrain.lobes.models.MSTacotron2 import Loss
    >>> loss = Loss(guided_attention_sigma=0.2)
    >>> mel_target = torch.randn(2, 80, 861)
    >>> gate_target = torch.randn(1722, 1)
    >>> mel_out = torch.randn(2, 80, 861)
    >>> mel_out_postnet = torch.randn(2, 80, 861)
    >>> gate_out = torch.randn(2, 861)
    >>> alignments = torch.randn(2, 861, 173)
    >>> pred_mel_lens = torch.randn(2)
    >>> targets = mel_target, gate_target
    >>> model_outputs = mel_out, mel_out_postnet, gate_out, alignments, pred_mel_lens
    >>> input_lengths = torch.tensor([173,  91])
    >>> target_lengths = torch.tensor([861, 438])
    >>> spk_embs = None
    >>> loss(model_outputs, targets, input_lengths, target_lengths, spk_embs, 1)
    TacotronLoss(loss=tensor([4.8566]), mel_loss=tensor(4.0097), spk_emb_loss=tensor([0.]), gate_loss=tensor(0.8460), attn_loss=tensor(0.0010), attn_weight=tensor(1.))
    Nç      ð?c	           	         sŒ   t ƒ  ¡  |dkrd }|| _|| _|| _|| _|| _t ¡ | _	t 
¡ | _t|d| _t ¡ | _tjjdd„ d| _t ¡ | _|| _|| _d S )Nr   )Úsigmac                 S   s   dt  | |¡ S )Nry   )rQ   Úcosine_similarity)ÚxÚyrG   rG   rH   Ú<lambda>ê  s    zLoss.__init__.<locals>.<lambda>)Údistance_function)r   r   Úguided_attention_weightÚgate_loss_weightÚmel_loss_weightÚspk_emb_loss_weightÚspk_emb_loss_typer   ÚMSELossÚmse_lossÚBCEWithLogitsLossÚbce_lossr   Úguided_attention_lossÚCosineSimilarityÚcos_simr_   ÚTripletMarginWithDistanceLossÚtriplet_lossÚCosineEmbeddingLossÚcos_emb_lossÚguided_attention_schedulerÚguided_attention_hard_stop)	r.   Úguided_attention_sigmar   r‚   rƒ   r„   r€   r   r‘   rE   rG   rH   r   Ï  s(   


ÿ
ÿ

zLoss.__init__c                 C   s^  |d |d }}d|_ d|_ | dd¡}|\}	}
}}}| dd¡}|  |	|¡|  |
|¡ }| j| }| j|  ||¡ }|  ||||¡\}}t dg¡ 	|j
¡}| jdkrk|\}}|  ||¡}t t |¡t|ƒ¡ }| jdkr„|\}}|  ||t t|ƒ¡ 	|j
¡¡}| jdkr™|\}}}|dur™|  |||¡}| j| }|| | | }t||||||ƒS )	aZ  Computes the loss
        Arguments
        ---------
        model_output: tuple
            the output of the model's forward():
            (mel_outputs, mel_outputs_postnet, gate_outputs, alignments)
        targets: tuple
            the targets
        input_lengths: torch.Tensor
            a (batch, length) tensor of input lengths
        target_lengths: torch.Tensor
            a (batch, length) tensor of target (spectrogram) lengths
        spk_embs: torch.Tensor
            Speaker embedding input for the loss computation - placeholder for future work
        epoch: int
            the current epoch number (used for the scheduling of the guided attention
            loss) A StepScheduler is typically used
        Returns
        -------
        result: LossStats
            the total loss - and individual losses (mel and gate)
        r   r   FrI   Úscl_lossr   r   N)Úrequires_gradÚviewr†   r‚   r   rˆ   Úget_attention_lossr_   ÚTensorÚtoÚdevicer„   r‹   ÚdivÚsumÚlenr   Úonesr   rƒ   Ú	LossStats)r.   Úmodel_outputrf   re   Útarget_lengthsrd   ÚepochÚ
mel_targetÚgate_targetÚmel_outÚmel_out_postnetÚgate_outrY   Úpred_mel_lensÚmel_lossÚ	gate_lossÚ	attn_lossÚattn_weightÚspk_emb_lossÚtarget_spk_embsÚpreds_spk_embsÚcos_sim_scoresÚanchor_spk_embsÚpos_spk_embsÚneg_spk_embsÚ
total_lossrG   rG   rH   rl   ñ  sd   úÿ
ÿ
ÿ
ý

ÿ
úzLoss.forwardc           
      C   s¢   t jd|jd}| jdu s| jdkr||}}||fS | jduo$|| jk}|r-||}}n| j}| jdur<|  |¡\}	}t j||jd}||  |||¡ }||fS )a_  Computes the attention loss
        Arguments
        ---------
        alignments: torch.Tensor
            the alignment matrix from the model
        input_lengths: torch.Tensor
            a (batch, length) tensor of input lengths
        target_lengths: torch.Tensor
            a (batch, length) tensor of target (spectrogram) lengths
        epoch: int
            the current epoch number (used for the scheduling of the guided attention
            loss) A StepScheduler is typically used
        Returns
        -------
        attn_loss: torch.Tensor
            the attention loss value
        rK   )r™   Nr   )r_   Útensorr™   r€   r‘   r   r‰   )
r.   rY   re   r    r¡   Úzero_tensorr«   rª   Úhard_stop_reachedÚ_rG   rG   rH   r–   O  s$   



óþ
ÿzLoss.get_attention_loss)Nry   ry   ry   Nry   NN)rr   rs   rt   ru   r   rl   r–   rv   rG   rG   rE   rH   rx   œ  s    4÷"^rx   c                   @   s$   e Zd ZdZ	ddd„Zdd„ ZdS )	ÚTextMelCollatea
  Zero-pads model inputs and targets based on number of frames per step

    Arguments
    ---------
    speaker_embeddings_pickle : str
        Path to the file containing speaker embeddings
    n_frames_per_step: int
        The number of output frames per step
    r   c                 C   s   || _ || _d S rq   )r    Úspeaker_embeddings_pickle)r.   r¹   r    rG   rG   rH   r   †  s   
zTextMelCollate.__init__c              
   C   s†  t |ƒ}tt|ƒƒD ]
}|| d ||< q
tjt dd„ |D ƒ¡ddd\}}|d }t t|ƒ|¡}| ¡  tt|ƒƒD ]}|||  d }|||d| d¡…f< q=|d d  d¡}	td	d„ |D ƒƒ}
|
| j	 dkr|
| j	|
| j	  7 }
|
| j	 dksJ ‚t 
t|ƒ|	|
¡}| ¡  t 
t|ƒ|
¡}| ¡  t t|ƒ¡}g g g g f\}}}}t| jd
ƒ}t |¡}W d  ƒ n1 s¿w   Y  tt|ƒƒD ][}|| }|| d }|||dd…d| d¡…f< d||| d¡d d…f< | d¡||< | || d ¡ | || d ¡ ||| d  }| |¡ | || d  d¡d ¡ qÊt |¡}dd„ |D ƒ}t |¡}||||||||||f
S )a  Collate's training batch from normalized text and mel-spectrogram

        Arguments
        ---------
        batch: list
            [text_normalized, mel_normalized]

        Returns
        -------
        text_padded: torch.Tensor
        input_lengths: torch.Tensor
        mel_padded: torch.Tensor
        gate_padded: torch.Tensor
        output_lengths: torch.Tensor
        len_x: torch.Tensor
        labels: torch.Tensor
        wavs: torch.Tensor
        spk_embs: torch.Tensor
        spk_ids: torch.Tensor
        Úmel_text_pairc                 S   s   g | ]}t |d  ƒ‘qS )r   )rœ   ©Ú.0r|   rG   rG   rH   Ú
<listcomp>¯  s    z+TextMelCollate.__call__.<locals>.<listcomp>r   T)ÚdimÚ
descendingNr   c                 S   s   g | ]	}|d    d ¡‘qS ©r   )rL   r»   rG   rG   rH   r½   »  s    ÚrbÚlabelÚwavÚuttidr·   c                 S   s   g | ]}|d  ‘qS )r   rG   r»   rG   rG   rH   r½   ß  s    )ÚlistÚrangerœ   r_   ÚsortÚ
LongTensorÚzero_rL   Úmaxr    ÚFloatTensorÚopenr¹   ÚpickleÚloadÚappendÚsplitÚstackr—   )r.   ÚbatchÚ	raw_batchÚire   Úids_sorted_decreasingÚmax_input_lenÚtext_paddedÚtextÚnum_melsÚmax_target_lenÚ
mel_paddedÚgate_paddedrT   ÚlabelsÚwavsÚspk_embs_listÚspk_idsÚspeaker_embeddings_fileÚspeaker_embeddingsÚidxÚmelÚspk_embrd   Úlen_xrG   rG   rH   Ú__call__  st   ÿ
ÿÿÿý


özTextMelCollate.__call__NrÀ   )rr   rs   rt   ru   r   rç   rG   rG   rG   rH   r¸   {  s
    
ý	r¸   )ru   rÍ   Úcollectionsr   Úmathr   r_   r   Útorch.nnr   rQ   Ú"speechbrain.lobes.models.Tacotron2r   r   r   r	   r
   Ú%speechbrain.nnet.loss.guidedattn_lossr   ÚModuler   rž   rx   r¸   rG   rG   rG   rH   Ú<module>   s$    )  ]ÿ `