o
    iu'                     @   s   d Z ddlmZ ddlZddlmZ ddlmZ G dd dej	j
ZG dd	 d	ej	j
ZG d
d dej	j
ZG dd deZdS )zStyle encoder of GST-Tacotron.    )SequenceN)check_argument_types)MultiHeadedAttentionc                       sz   e Zd ZdZ										
	ddedededededee dedededef fddZdejdejfddZ	  Z
S )StyleEncodera  Style encoder.

    This module is style encoder introduced in `Style Tokens: Unsupervised Style
    Modeling, Control and Transfer in End-to-End Speech Synthesis`.

    .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End
        Speech Synthesis`: https://arxiv.org/abs/1803.09017

    Args:
        idim (int, optional): Dimension of the input mel-spectrogram.
        gst_tokens (int, optional): The number of GST embeddings.
        gst_token_dim (int, optional): Dimension of each GST embedding.
        gst_heads (int, optional): The number of heads in GST multihead attention.
        conv_layers (int, optional): The number of conv layers in the reference encoder.
        conv_chans_list: (Sequence[int], optional):
            List of the number of channels of conv layers in the referece encoder.
        conv_kernel_size (int, optional):
            Kernel size of conv layers in the reference encoder.
        conv_stride (int, optional):
            Stride size of conv layers in the reference encoder.
        gru_layers (int, optional): The number of GRU layers in the reference encoder.
        gru_units (int, optional): The number of GRU units in the reference encoder.

    Todo:
        * Support manual weight specification in inference.

    P   
                r   @   r      r            r   idim
gst_tokensgst_token_dim	gst_headsconv_layersconv_chans_listconv_kernel_sizeconv_stride
gru_layers	gru_unitsc              	      sF   t  sJ tt|   t||||||	|
d| _t|
|||d| _dS )z&Initilize global style encoder module.)r   r   r   r   r   r   r   )ref_embed_dimr   r   r   N)r   superr   __init__ReferenceEncoderref_encStyleTokenLayerstl)selfr   r   r   r   r   r   r   r   r   r   	__class__ Q/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tts/gst/style_encoder.pyr   -   s"   
	zStyleEncoder.__init__speechreturnc                 C   s   |  |}| |}|S )zCalculate forward propagation.

        Args:
            speech (Tensor): Batch of padded target features (B, Lmax, odim).

        Returns:
            Tensor: Style token embeddings (B, token_dim).

        )r    r"   )r#   r(   ref_embs
style_embsr&   r&   r'   forwardN   s   


zStyleEncoder.forward)
r   r   r   r	   r
   r   r   r   r   r   __name__
__module____qualname____doc__intr   r   torchTensorr,   __classcell__r&   r&   r$   r'   r      sD    	
!r   c                       sd   e Zd ZdZ							dd	ed
ee dedededef fddZdejdejfddZ	  Z
S )r   a  Reference encoder module.

    This module is reference encoder introduced in `Style Tokens: Unsupervised Style
    Modeling, Control and Transfer in End-to-End Speech Synthesis`.

    .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End
        Speech Synthesis`: https://arxiv.org/abs/1803.09017

    Args:
        idim (int, optional): Dimension of the input mel-spectrogram.
        conv_layers (int, optional): The number of conv layers in the reference encoder.
        conv_chans_list: (Sequence[int], optional):
            List of the number of channels of conv layers in the referece encoder.
        conv_kernel_size (int, optional):
            Kernel size of conv layers in the reference encoder.
        conv_stride (int, optional):
            Stride size of conv layers in the reference encoder.
        gru_layers (int, optional): The number of GRU layers in the reference encoder.
        gru_units (int, optional): The number of GRU units in the reference encoder.

    r   r
   r   r   r   r   r   r   r   r   r   r   r   c              
      s$  t  sJ tt|   |d dksJ dt||ks J dg }|d d }	t|D ],}
|
dkr4dn||
d  }||
 }|tjj|||||	ddtj	|tjj
dd	g7 }q,tjj| | _|| _|| _|| _|	| _|}t|D ]}
|| d|	  | d }qr||9 }tjj|||dd
| _dS )z#Initilize reference encoder module.r   r   zkernel size must be odd.zGthe number of conv layers and length of channels list must be the same.r   F)kernel_sizestridepaddingbiasT)inplace)batch_firstN)r   r   r   r   lenranger3   nnConv2dBatchNorm2dReLU
Sequentialconvsr   r6   r7   r8   GRUgru)r#   r   r   r   r   r   r   r   rC   r8   iconv_in_chansconv_out_chansgru_in_unitsr$   r&   r'   r   u   sJ   

	zReferenceEncoder.__init__r(   r)   c                 C   sf   | d}|d}| |dd}| d}| ||d}| j  | |\}}|d }|S )zCalculate forward propagation.

        Args:
            speech (Tensor): Batch of padded target features (B, Lmax, idim).

        Returns:
            Tensor: Reference embedding (B, gru_units)

        r   r   r   )size	unsqueezerC   	transpose
contiguousviewrE   flatten_parameters)r#   r(   
batch_sizexshstime_length_r*   r&   r&   r'   r,      s   




zReferenceEncoder.forward)r   r
   r   r   r   r   r   r-   r&   r&   r$   r'   r   ^   s.    6r   c                       sX   e Zd ZdZ					ddeded	ed
edef
 fddZdejdejfddZ	  Z
S )r!   a  Style token layer module.

    This module is style token layer introduced in `Style Tokens: Unsupervised Style
    Modeling, Control and Transfer in End-to-End Speech Synthesis`.

    .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End
        Speech Synthesis`: https://arxiv.org/abs/1803.09017

    Args:
        ref_embed_dim (int, optional): Dimension of the input reference embedding.
        gst_tokens (int, optional): The number of GST embeddings.
        gst_token_dim (int, optional): Dimension of each GST embedding.
        gst_heads (int, optional): The number of heads in GST multihead attention.
        dropout_rate (float, optional): Dropout rate in multi-head attention.

    r   r   r   r	           r   r   r   r   dropout_ratec                    s^   t  sJ tt|   t||| }| dtj| t	||| || |||d| _
dS )z#Initilize style token layer module.gst_embs)q_dimk_dimv_dimn_headn_featrW   N)r   r   r!   r   r3   randnregister_parameterr>   	Parameterr   mha)r#   r   r   r   r   rW   rX   r$   r&   r'   r      s   
	zStyleTokenLayer.__init__r*   r)   c                 C   sJ   | d}t| jd|dd}|d}| |||d}|dS )zCalculate forward propagation.

        Args:
            ref_embs (Tensor): Reference embeddings (B, ref_embed_dim).

        Returns:
            Tensor: Style token embeddings (B, gst_token_dim).

        r   rJ   r   N)rK   r3   tanhrX   rL   expandra   squeeze)r#   r*   rQ   rX   r+   r&   r&   r'   r,      s
   



zStyleTokenLayer.forward)r   r   r   r	   rV   )r.   r/   r0   r1   r2   floatr   r3   r4   r,   r5   r&   r&   r$   r'   r!      s&    r!   c                   @   s   e Zd ZdZdddZdS )r   z;Multi head attention module with different input dimension.rV   c                 C   s   t jj|  || dksJ || | _|| _t j||| _t j||| _t j||| _	t j||| _
d| _t jj|d| _dS )z'Initialize multi head attention module.r   N)p)r3   r>   Moduler   d_khLinearlinear_qlinear_klinear_v
linear_outattnDropoutdropout)r#   rY   rZ   r[   r\   r]   rW   r&   r&   r'   r     s   
zMultiHeadedAttention.__init__N)rV   )r.   r/   r0   r1   r   r&   r&   r&   r'   r      s    r   )r1   typingr   r3   	typeguardr   1espnet.nets.pytorch_backend.transformer.attentionr   BaseMultiHeadedAttentionr>   rg   r   r   r!   r&   r&   r&   r'   <module>   s   Nd=