o
    ¡¿¯i°_  ã                   @   sŽ   d Z ddlZddlZddlm  mZ ddlmZ dd„ Z	G dd„ dejj
ƒZG dd	„ d	ejj
ƒZG d
d„ dejj
ƒZG dd„ dejj
ƒZdS )z"Tacotron2 decoder related modules.é    N)ÚAttForwardTAc                 C   s2   t | tjjƒrtjj | jtjj d¡¡ dS dS )zInitialize decoder parameters.ÚtanhN)Ú
isinstanceÚtorchÚnnÚConv1dÚinitÚxavier_uniform_ÚweightÚcalculate_gain)Úm© r   úa/home/ubuntu/.local/lib/python3.10/site-packages/espnet/nets/pytorch_backend/tacotron2/decoder.pyÚdecoder_init   s    ÿr   c                       s2   e Zd ZdZd	‡ fdd„	Zdd„ Zdd„ Z‡  ZS )
ÚZoneOutCella  ZoneOut Cell module.

    This is a module of zoneout described in
    `Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`_.
    This code is modified from `eladhoffer/seq2seq.pytorch`_.

    Examples:
        >>> lstm = torch.nn.LSTMCell(16, 32)
        >>> lstm = ZoneOutCell(lstm, 0.5)

    .. _`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`:
        https://arxiv.org/abs/1606.01305

    .. _`eladhoffer/seq2seq.pytorch`:
        https://github.com/eladhoffer/seq2seq.pytorch

    çš™™™™™¹?c                    s>   t t| ƒ ¡  || _|j| _|| _|dks|dk rtdƒ‚dS )a   Initialize zone out cell module.

        Args:
            cell (torch.nn.Module): Pytorch recurrent cell module
                e.g. `torch.nn.Module.LSTMCell`.
            zoneout_rate (float, optional): Probability of zoneout from 0.0 to 1.0.

        g      ð?ç        z9zoneout probability must be in the range from 0.0 to 1.0.N)Úsuperr   Ú__init__ÚcellÚhidden_sizeÚzoneout_rateÚ
ValueError)Úselfr   r   ©Ú	__class__r   r   r   )   s   	ÿÿzZoneOutCell.__init__c                 C   s    |   ||¡}|  ||| j¡}|S )aÞ  Calculate forward propagation.

        Args:
            inputs (Tensor): Batch of input tensor (B, input_size).
            hidden (tuple):
                - Tensor: Batch of initial hidden states (B, hidden_size).
                - Tensor: Batch of initial cell states (B, hidden_size).

        Returns:
            tuple:
                - Tensor: Batch of next hidden states (B, hidden_size).
                - Tensor: Batch of next cell states (B, hidden_size).

        )r   Ú_zoneoutr   )r   ÚinputsÚhiddenÚnext_hiddenr   r   r   Úforward;   s   zZoneOutCell.forwardc                    sŒ   t ˆ tƒr%tˆ ƒ}t ˆtƒstˆg| ƒ‰t‡ ‡‡‡fdd„t|ƒD ƒƒS ˆjr<ˆ jˆ  ¡ Ž  ˆ¡}|ˆ  d| ˆ  S ˆˆ  dˆ ˆ  S )Nc                    s&   g | ]}ˆ  ˆ | ˆ| ˆ| ¡‘qS r   )r   )Ú.0Úi©ÚhÚnext_hÚprobr   r   r   Ú
<listcomp>U   s   & z(ZoneOutCell._zoneout.<locals>.<listcomp>é   )r   ÚtupleÚlenÚrangeÚtrainingÚnewÚsizeÚ
bernoulli_)r   r$   r%   r&   Únum_hÚmaskr   r#   r   r   N   s   

ÿzZoneOutCell._zoneout)r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r    r   Ú__classcell__r   r   r   r   r      s
    r   c                       s*   e Zd ZdZd	‡ fdd„	Zdd„ Z‡  ZS )
ÚPrenetaÌ  Prenet module for decoder of Spectrogram prediction network.

    This is a module of Prenet in the decoder of Spectrogram prediction network,
    which described in `Natural TTS
    Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_.
    The Prenet preforms nonlinear conversion
    of inputs before input to auto-regressive lstm,
    which helps to learn diagonal attentions.

    Note:
        This module alway applies dropout even in evaluation.
        See the detail in `Natural TTS Synthesis by
        Conditioning WaveNet on Mel Spectrogram Predictions`_.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    é   é   ç      à?c              	      sp   t t| ƒ ¡  || _tj ¡ | _tj	 
|¡D ]}|dkr|n|}|  jtj tj ||¡tj ¡ ¡g7  _qdS )a  Initialize prenet module.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            n_layers (int, optional): The number of prenet layers.
            n_units (int, optional): The number of prenet units.

        r   N)r   r7   r   Údropout_rater   r   Ú
ModuleListÚprenetÚsixÚmovesr+   Ú
SequentialÚLinearÚReLU)r   ÚidimÚn_layersÚn_unitsr;   ÚlayerÚn_inputsr   r   r   r   s   s   

ÿþzPrenet.__init__c                 C   s4   t j t| jƒ¡D ]}t | j| |ƒ| j¡}q	|S )zÄCalculate forward propagation.

        Args:
            x (Tensor): Batch of input tensors (B, ..., idim).

        Returns:
            Tensor: Batch of output tensors (B, ..., odim).

        )r>   r?   r+   r*   r=   ÚFÚdropoutr;   )r   Úxr"   r   r   r   r    †   s   
zPrenet.forward)r8   r9   r:   ©r2   r3   r4   r5   r   r    r6   r   r   r   r   r7   _   s    r7   c                       s4   e Zd ZdZ					d
‡ fdd„	Zdd	„ Z‡  ZS )ÚPostnetaþ  Postnet module for Spectrogram prediction network.

    This is a module of Postnet in Spectrogram prediction network,
    which described in `Natural TTS Synthesis by
    Conditioning WaveNet on Mel Spectrogram Predictions`_.
    The Postnet predicts refines the predicted
    Mel-filterbank of the decoder,
    which helps to compensate the detail structure of spectrogram.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    é   é   r:   Tc                    s’  t t| ƒ ¡  tj ¡ | _tj 	|d ¡D ]b}|dkr|n|}	||d kr'|n|}
|rT|  jtj 
tjj|	|
|d|d d ddtj |
¡tj ¡ tj |¡¡g7  _q|  jtj 
tjj|	|
|d|d d ddtj ¡ tj |¡¡g7  _q|dkr~|n|}	|r¨|  jtj 
tjj|	||d|d d ddtj |¡tj |¡¡g7  _dS |  jtj 
tjj|	||d|d d ddtj |¡¡g7  _dS )aã  Initialize postnet module.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            n_layers (int, optional): The number of layers.
            n_filts (int, optional): The number of filter size.
            n_units (int, optional): The number of filter channels.
            use_batch_norm (bool, optional): Whether to use batch normalization..
            dropout_rate (float, optional): Dropout rate..

        r(   r   r8   F)ÚstrideÚpaddingÚbiasN)r   rL   r   r   r   r<   Úpostnetr>   r?   r+   r@   r   ÚBatchNorm1dÚTanhÚDropout)r   rC   ÚodimrD   Ún_chansÚn_filtsr;   Úuse_batch_normrF   ÚichansÚochansr   r   r   r   ¥   s€   
ú

õ
ÿ
ú
ö
ÿ
ú

öÿ
ú
÷ÿzPostnet.__init__c                 C   s*   t j t| jƒ¡D ]	}| j| |ƒ}q	|S )zæCalculate forward propagation.

        Args:
            xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax).

        Returns:
            Tensor: Batch of padded output tensor. (B, odim, Tmax).

        )r>   r?   r+   r*   rR   )r   Úxsr"   r   r   r   r    þ   s   
zPostnet.forward)rM   rN   rM   r:   TrK   r   r   r   r   rL   –   s    øYrL   c                       sl   e Zd ZdZ														
	d‡ fdd„	Zdd„ Zdd„ Z							ddd„Zdd„ Z‡  Z	S )ÚDecoderaÔ  Decoder module of Spectrogram prediction network.

    This is a module of decoder of Spectrogram prediction network in Tacotron2,
    which described in `Natural TTS
    Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_.
    The decoder generates the sequence of
    features from the sequence of the hidden states.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    r8   é   r9   rM   rN   NTr:   r   r(   c              	      sT  t t| ƒ ¡  || _|| _|| _|| _|| _|| _|| _	t
| jtƒr&d| _nd| _|dkr/|n|}tj ¡ | _tj |¡D ]$}|dkrG|| n|}tj ||¡}|dkrYt||ƒ}|  j|g7  _q=|dkrpt||||d| _nd| _|dkr„t||||	|
||d| _nd| _|r|| n|}tjj||| dd| _tj ||¡| _|  t¡ dS )	aD  Initialize Tacotron2 decoder module.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            att (torch.nn.Module): Instance of attention class.
            dlayers (int, optional): The number of decoder lstm layers.
            dunits (int, optional): The number of decoder lstm units.
            prenet_layers (int, optional): The number of prenet layers.
            prenet_units (int, optional): The number of prenet units.
            postnet_layers (int, optional): The number of postnet layers.
            postnet_filts (int, optional): The number of postnet filter size.
            postnet_chans (int, optional): The number of postnet filter channels.
            output_activation_fn (torch.nn.Module, optional):
                Activation function for outputs.
            cumulate_att_w (bool, optional):
                Whether to cumulate previous attention weight.
            use_batch_norm (bool, optional): Whether to use batch normalization.
            use_concate (bool, optional): Whether to concatenate encoder embedding
                with decoder lstm outputs.
            dropout_rate (float, optional): Dropout rate.
            zoneout_rate (float, optional): Zoneout rate.
            reduction_factor (int, optional): Reduction factor.

        TFr   r   )rC   rD   rE   r;   N)rC   rV   rD   rW   rX   rY   r;   )rQ   )r   r]   r   rC   rV   ÚattÚoutput_activation_fnÚcumulate_att_wÚuse_concateÚreduction_factorr   r   Úuse_att_extra_inputsr   r   r<   Úlstmr>   r?   r+   ÚLSTMCellr   r7   r=   rL   rR   rA   Úfeat_outÚprob_outÚapplyr   )r   rC   rV   r_   ÚdlayersÚdunitsÚprenet_layersÚprenet_unitsÚpostnet_layersÚpostnet_chansÚpostnet_filtsr`   ra   rY   rb   r;   r   rc   rF   Úiunitsre   r   r   r   r     sT   -

ü
ù
zDecoder.__init__c                 C   s   |  | d¡| jd j¡}|S )Nr   )Ú	new_zerosr.   re   r   )r   ÚhsÚinit_hsr   r   r   Ú_zero_state„  s   zDecoder._zero_statec                 C   sÚ  | j dkr|dd…| j d d| j …f }ttt|ƒƒ}|  |¡g}|  |¡g}tj dt| j	ƒ¡D ]}||  |¡g7 }||  |¡g7 }q1| 
| d¡| j¡}d}| j ¡  g g g }	}
}| dd¡D ]¯}| jrv|  |||d ||¡\}}n|  |||d |¡\}}| jdurŒ|  |¡n|}tj||gdd}| j	d ||d |d fƒ\|d< |d< tj dt| j	ƒ¡D ]}| j	| ||d  || || fƒ\||< ||< qµ| jrÞtj|d |gddn|d }|	|  |¡ | d¡| jd¡g7 }	|
|  |¡g7 }
||g7 }|}| jr|dur|| }qc|}qctj|
dd}
tj|	dd}tj|dd}| j dkr9| | d¡| jd¡}| jdurG||  |¡ }n|}| dd¡}| dd¡}|
}
| jdurg|  |¡}|  |¡}|||
|fS )aÅ  Calculate forward propagation.

        Args:
            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
            hlens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor):
                Batch of the sequences of padded target features (B, Lmax, odim).

        Returns:
            Tensor: Batch of output tensors after postnet (B, Lmax, odim).
            Tensor: Batch of output tensors before postnet (B, Lmax, odim).
            Tensor: Batch of logits of stop prediction (B, Lmax).
            Tensor: Batch of attention weights (B, Lmax, Tmax).

        Note:
            This computation is performed in teacher-forcing manner.

        r(   Nr   ©Údiméÿÿÿÿr8   )rc   ÚlistÚmapÚintru   r>   r?   r+   r*   re   rr   r.   rV   r_   ÚresetÚ	transposerd   r=   r   Úcatrb   rg   Úviewrh   ra   ÚstackrR   r`   )r   rs   ÚhlensÚysÚc_listÚz_listÚ_Úprev_outÚ
prev_att_wÚoutsÚlogitsÚatt_wsÚyÚatt_cÚatt_wÚ
prenet_outr\   r"   ÚzcsÚbefore_outsÚ
after_outsr   r   r   r    ˆ  sf   

(ÿÿý"

ÿ

zDecoder.forwardr   ç      $@Fc              
   C   sX  t | ¡ ƒdks
J ‚| d¡}| d¡g}	t| d¡| ƒ}
t| d¡| ƒ}|  |¡g}|  |¡g}tj dt | jƒ¡D ]}||  |¡g7 }||  |¡g7 }q=| 	d| j
¡}d}| j ¡  |rcd}nd}d}g g g }}}	 || j7 }| jrŠ| j||	|d |||||d\}}n| j||	|d ||||d\}}||g7 }| jdur©|  |¡n|}tj||gdd}| jd ||d |d fƒ\|d< |d< tj dt | jƒ¡D ]}| j| ||d  || || fƒ\||< ||< qÒ| jrûtj|d |gddn|d }||  |¡ d| j
d¡g7 }|t |  |¡¡d g7 }| jdur1|  |d dd…dd…df ¡}n|d dd…dd…df }| jrL|durL|| }n|}|rWt| ¡ ƒ}tt|d |kƒƒdksi||
kr›||k roqotj|dd}| jdurƒ||  |¡ }| dd¡ d¡}tj|dd}tj|dd}nqp| jdur§|  |¡}|||fS )	aØ  Generate the sequence of features given the sequences of characters.

        Args:
            h (Tensor): Input sequence of encoder hidden states (T, C).
            threshold (float, optional): Threshold to stop generation.
            minlenratio (float, optional): Minimum length ratio.
                If set to 1.0 and the length of input is 10,
                the minimum length of outputs will be 10 * 1 = 10.
            minlenratio (float, optional): Minimum length ratio.
                If set to 10 and the length of input is 10,
                the maximum length of outputs will be 10 * 10 = 100.
            use_att_constraint (bool):
                Whether to apply attention constraint introduced in `Deep Voice 3`_.
            backward_window (int): Backward window size in attention constraint.
            forward_window (int): Forward window size in attention constraint.

        Returns:
            Tensor: Output sequence of features (L, odim).
            Tensor: Output sequence of stop probabilities (L,).
            Tensor: Attention weights (L, T).

        Note:
            This computation is performed in auto-regressive manner.

        .. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654

        r8   r   r(   NT)Úlast_attended_idxÚbackward_windowÚforward_windowrv   rx   )r*   r.   Ú	unsqueezer{   ru   r>   r?   r+   re   rr   rV   r_   r|   rc   rd   r=   r   r~   rb   rg   r   Úsigmoidrh   r`   ra   ÚargmaxÚsumrR   r}   Úsqueeze)r   r$   Ú	thresholdÚminlenratioÚmaxlenratioÚuse_att_constraintr”   r•   rs   ÚilensÚmaxlenÚminlenrƒ   r„   r…   r†   r‡   r“   Úidxrˆ   rŠ   ÚprobsrŒ   r   rŽ   r\   r"   r   r   r   r   Ú	inferenceâ  s˜   &


ø
ù

(ÿÿý"
$
ÀB

zDecoder.inferencec                 C   sÒ  | j dkr|dd…| j d d| j …f }ttt|ƒƒ}|  |¡g}|  |¡g}tj dt| j	ƒ¡D ]}||  |¡g7 }||  |¡g7 }q1| 
| d¡| j¡}d}| j ¡  g }	| dd¡D ]‚}
| jrp|  |||d ||¡\}}n|  |||d |¡\}}|	|g7 }	| jdur‹|  |¡n|}tj||gdd}| j	d ||d |d fƒ\|d< |d< tj dt| j	ƒ¡D ]}| j	| ||d  || || fƒ\||< ||< q´|
}| jrÝ|durÝ|| }q]|}q]tj|	dd}	|	S )aü  Calculate all of the attention weights.

        Args:
            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
            hlens (LongTensor): Batch of lengths of each input batch (B,).
            ys (Tensor):
                Batch of the sequences of padded target features (B, Lmax, odim).

        Returns:
            numpy.ndarray: Batch of attention weights (B, Lmax, Tmax).

        Note:
            This computation is performed in teacher-forcing manner.

        r(   Nr   rv   )rc   ry   rz   r{   ru   r>   r?   r+   r*   re   rr   r.   rV   r_   r|   r}   rd   r=   r   r~   ra   r€   )r   rs   r   r‚   rƒ   r„   r…   r†   r‡   rŠ   r‹   rŒ   r   rŽ   r\   r"   r   r   r   Úcalculate_all_attentionsj  s<   


(ÿ
z Decoder.calculate_all_attentions)r8   r^   r8   r9   rM   rN   rM   NTTTr:   r   r(   )r:   r   r’   FNN)
r2   r3   r4   r5   r   ru   r    r¤   r¥   r6   r   r   r   r   r]     s8    îi]
ø 	r]   )r5   r>   r   Útorch.nn.functionalr   Ú
functionalrH   Ú*espnet.nets.pytorch_backend.rnn.attentionsr   r   ÚModuler   r7   rL   r]   r   r   r   r   Ú<module>   s   I7w