o
    i                     @   sD   d Z ddlZddlZddlmZmZ dd ZG dd dejjZ	dS )z"Tacotron2 encoder related modules.    N)pack_padded_sequencepad_packed_sequencec                 C   s2   t | tjjrtjj| jtjjd dS dS )zInitialize encoder parameters.reluN)
isinstancetorchnnConv1dinitxavier_uniform_weightcalculate_gain)m r   a/home/ubuntu/.local/lib/python3.10/site-packages/espnet/nets/pytorch_backend/tacotron2/encoder.pyencoder_init   s    r   c                       sJ   e Zd ZdZ												
d fdd	ZdddZdd Z  ZS )Encodera  Encoder module of Spectrogram prediction network.

    This is a module of encoder of Spectrogram prediction network in Tacotron2,
    which described in `Natural TTS Synthesis by Conditioning WaveNet on Mel
    Spectrogram Predictions`_. This is the encoder which converts either a sequence
    of characters or acoustic features into the sequence of hidden states.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    embed            TF      ?r   c                    s  t t|   || _|
| _|dkrtj||| _n|dkr)tjj	|||d| _nt
d| |dkrtj | _tj|D ]\}|dkrK|dkrK|n|}|	rx|  jtjtjj|||d|d d dd	tj|tj tj|g7  _q?|  jtjtjj|||d|d d dd	tj tj|g7  _q?nd
| _|dkr|dkr|n|}tjj||d |ddd| _nd
| _| t d
S )a;  Initialize Tacotron2 encoder module.

        Args:
            idim (int) Dimension of the inputs.
            input_layer (str): Input layer type.
            embed_dim (int, optional) Dimension of character embedding.
            elayers (int, optional) The number of encoder blstm layers.
            eunits (int, optional) The number of encoder blstm units.
            econv_layers (int, optional) The number of encoder conv layers.
            econv_filts (int, optional) The number of encoder conv filter size.
            econv_chans (int, optional) The number of encoder conv filter channels.
            use_batch_norm (bool, optional) Whether to use batch normalization.
            use_residual (bool, optional) Whether to use residual connection.
            dropout_rate (float, optional) Dropout rate.

        linearr   )padding_idxzunknown input_layer: r   r      F)stridepaddingbiasNT)batch_firstbidirectional)superr   __init__idimuse_residualr   r   Linearr   	Embedding
ValueError
ModuleListconvssixmovesrange
Sequentialr   BatchNorm1dReLUDropoutLSTMblstmapplyr   )selfr"   input_layer	embed_dimelayerseunitseconv_layerseconv_chanseconv_filtsuse_batch_normr#   dropout_rater   layerichansiunits	__class__r   r   r!   !   sh   






$
zEncoder.__init__Nc                 C   s   |  |dd}| jdur.tjt| jD ]}| jr&|| j| | }q| j| |}q| jdu r9|ddS t	|t
jsDt
|}t|dd| dd}| j  | |\}}t|dd\}}||fS )a  Calculate forward propagation.

        Args:
            xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax)
                or acoustic feature (B, Tmax, idim * encoder_reduction_factor). Padded
                value should be 0.
            ilens (LongTensor): Batch of lengths of each input batch (B,).

        Returns:
            Tensor: Batch of the sequences of encoder states(B, Tmax, eunits).
            LongTensor: Batch of lengths of each sequence (B,)

        r   r   NT)r   )r   	transposer(   r)   r*   r+   lenr#   r1   r   r   Tensortensorr   cpuflatten_parametersr   )r3   xsilensi_hlensr   r   r   forward   s   



zEncoder.forwardc                 C   s0   | d}t|dg}| ||d d S )zInference.

        Args:
            x (Tensor): The sequeunce of character ids (T,)
                    or acoustic feature (T, idim * encoder_reduction_factor).

        Returns:
            Tensor: The sequences of encoder states(T, eunits).

        r   )	unsqueezer   rE   sizerM   )r3   xrH   rI   r   r   r   	inference   s   
zEncoder.inference)r   r   r   r   r   r   r   TFr   r   )N)__name__
__module____qualname____doc__r!   rM   rQ   __classcell__r   r   r@   r   r      s     
^ r   )
rU   r)   r   torch.nn.utils.rnnr   r   r   r   Moduler   r   r   r   r   <module>   s   