o
    i	                     @   sP   d Z ddlZddlmZ ddlZddlmZ ddlmZ G dd dej	j
ZdS )z[Text encoder module in VITS.

This code is based on https://github.com/jaywalnut310/vits.

    N)Tuple)Encoder)make_non_pad_maskc                #       s   e Zd ZdZ										
							d(dedededededededededededededededed ef" fd!d"Zd#e	j
d$e	j
d%ee	j
e	j
e	j
e	j
f fd&d'Z  ZS ))TextEncodera  Text encoder module in VITS.

    This is a module of text encoder described in `Conditional Variational Autoencoder
    with Adversarial Learning for End-to-End Text-to-Speech`_.

    Instead of the relative positional Transformer, we use conformer architecture as
    the encoder module, which contains additional convolution layers.

    .. _`Conditional Variational Autoencoder with Adversarial Learning for End-to-End
        Text-to-Speech`: https://arxiv.org/abs/2006.04558

                conv1d   rel_posrel_selfattnswishTF   皙?        vocabsattention_dimattention_headslinear_unitsblockspositionwise_layer_typepositionwise_conv_kernel_sizepositional_encoding_layer_typeself_attention_layer_typeactivation_typenormalize_beforeuse_macaron_styleuse_conformer_convconformer_kernel_sizedropout_ratepositional_dropout_rateattention_dropout_ratec                    s   t    || _tj||| _tjj| jj	d|d  t
di ddddd|d|d	|d
|d|d|d|d|d|d|d|d|d|	d|
d|d|| _tj||d d| _dS )a  Initialize TextEncoder module.

        Args:
            vocabs (int): Vocabulary size.
            attention_dim (int): Attention dimension.
            attention_heads (int): Number of attention heads.
            linear_units (int): Number of linear units of positionwise layers.
            blocks (int): Number of encoder blocks.
            positionwise_layer_type (str): Positionwise layer type.
            positionwise_conv_kernel_size (int): Positionwise layer's kernel size.
            positional_encoding_layer_type (str): Positional encoding layer type.
            self_attention_layer_type (str): Self-attention layer type.
            activation_type (str): Activation function type.
            normalize_before (bool): Whether to apply LayerNorm before attention.
            use_macaron_style (bool): Whether to use macaron style components.
            use_conformer_conv (bool): Whether to use conformer conv layers.
            conformer_kernel_size (int): Conformer's conv kernel size.
            dropout_rate (float): Dropout rate.
            positional_dropout_rate (float): Dropout rate for positional encoding.
            attention_dropout_rate (float): Dropout rate for attention.

        r   g      idiminput_layerNr   r   r   
num_blocksr    r!   r"   r   r   r   macaron_stylepos_enc_layer_typeselfattention_layer_typer   use_cnn_modulecnn_module_kernelr       )super__init__r   torchnn	Embeddingembinitnormal_weightr   encoderConv1dproj)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   	__class__r-   U/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/gan_tts/vits/text_encoder.pyr/   !   sT   
*	
zTextEncoder.__init__x	x_lengthsreturnc                 C   s   |  |t| j }t|j|j|jdd}| 	||\}}|
dd}| || }|j|dd dd\}}||||fS )a  Calculate forward propagation.

        Args:
            x (Tensor): Input index tensor (B, T_text).
            x_lengths (Tensor): Length tensor (B,).

        Returns:
            Tensor: Encoded hidden representation (B, attention_dim, T_text).
            Tensor: Projected mean tensor (B, attention_dim, T_text).
            Tensor: Projected scale tensor (B, attention_dim, T_text).
            Tensor: Mask tensor for input tensor (B, 1, T_text).

        )devicedtyper,   r   )dim)r3   mathsqrtr   r   torA   rB   	unsqueezer7   	transposer9   splitsize)r:   r>   r?   x_mask_statsmlogsr-   r-   r=   forwardh   s   
zTextEncoder.forward)r   r   r   r	   r
   r   r   r   r   TFFr   r   r   r   )__name__
__module____qualname____doc__intstrboolfloatr/   r0   Tensorr   rP   __classcell__r-   r-   r;   r=   r      sx    	
Gr   )rT   rD   typingr   r0   -espnet.nets.pytorch_backend.conformer.encoderr   &espnet.nets.pytorch_backend.nets_utilsr   r1   Moduler   r-   r-   r-   r=   <module>   s   