o
    iR                     @   s   d Z ddlZddlmZmZmZmZ ddlZddlm	  m
Z ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlm Z  G dd deZ!dS )z'Tacotron 2 related modules for ESPnet2.    N)DictOptionalSequenceTuple)check_argument_types)force_gatherable)AbsTTS)StyleEncoder)GuidedAttentionLossTacotron2Loss)make_pad_mask)
AttForwardAttForwardTAAttLoc)Decoder)Encoderc                _       s  e Zd ZdZ													
		
																						
													dkdededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ed-ed.ed/ed0ed1ed2ed3ed4ee d5ee d6ee d7ed8ed9ed:ed;ed<ee d=ed>ed?ed@edAe	dBe	dCedDedEe	dFedGedHe	dIe	f^ fdJdKZ
				dldLejdMejdNejdOejdPeej dQeej dReej dSedTeejeeejf ejf fdUdVZdWejdXejdYejdZejdPejdQejdRejdTeejejejf fd[d\Z						]	^				dmdLejdNeej dPeej dQeej dReej d_e	d`e	dae	dbedceddedeedTeeejf fdfdgZdhejdPejdTejfdidjZ  ZS )n	Tacotron2a  Tacotron2 module for end-to-end text-to-speech.

    This is a module of Spectrogram prediction network in Tacotron2 described
    in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_,
    which converts the sequence of characters into the sequence of Mel-filterbanks.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

                location       T         NFconcat
         r   r   @   r"      r#   r#         ?皙?      @L1+L2皙?      ?idimodim	embed_dimelayerseunitseconv_layerseconv_chanseconv_filtsatypeadimaconv_chansaconv_filtscumulate_att_wdlayersdunitsprenet_layersprenet_unitspostnet_layerspostnet_chanspostnet_filtsoutput_activationuse_batch_normuse_concateuse_residualreduction_factorspkslangsspk_embed_dimspk_embed_integration_typeuse_gst
gst_tokens	gst_headsgst_conv_layersgst_conv_chans_listgst_conv_kernel_sizegst_conv_stridegst_gru_layersgst_gru_unitsdropout_ratezoneout_rateuse_maskinguse_weighted_maskingbce_pos_weight	loss_typeuse_guided_attn_lossguided_attn_loss_sigmaguided_attn_loss_lambdac0           3         s  t  sJ t   || _|| _|d | _|| _|| _|| _|-| _	|,| _
|du r,d| _ntt|r8tt|| _ntd| dd}0|0| _t||||||||||'|0d| _| jrgt|||| |!|"|#|$|%|&d
| _d| _|dur}|dkr}|| _tj||| _d| _|dur|dkr|| _tj||| _d| _|dur|dkr|| _|| _| jdu r|}1n"| jdkr|| }1n| jd	kr|}1tj| j|| _nt| d
|	dkrt |1||
||}2n7|	dkrt!|1||
||}2| jrt"#d d| _n|	dkrt$|1||
|||}2| jrt"#d d| _nt%dt&d$i d|1d|d|2d|d|d|d|d|d|d|d| jd| jd|d|d|'d |(d!|| _'t(|)|*|+d"| _)| j	rbt*|.|/d#| _+dS dS )%a  Initialize Tacotron2 module.

        Args:
            idim (int): Dimension of the inputs.
            odim: (int) Dimension of the outputs.
            embed_dim (int): Dimension of the token embedding.
            elayers (int): Number of encoder blstm layers.
            eunits (int): Number of encoder blstm units.
            econv_layers (int): Number of encoder conv layers.
            econv_filts (int): Number of encoder conv filter size.
            econv_chans (int): Number of encoder conv filter channels.
            dlayers (int): Number of decoder lstm layers.
            dunits (int): Number of decoder lstm units.
            prenet_layers (int): Number of prenet layers.
            prenet_units (int): Number of prenet units.
            postnet_layers (int): Number of postnet layers.
            postnet_filts (int): Number of postnet filter size.
            postnet_chans (int): Number of postnet filter channels.
            output_activation (str): Name of activation function for outputs.
            adim (int): Number of dimension of mlp in attention.
            aconv_chans (int): Number of attention conv filter channels.
            aconv_filts (int): Number of attention conv filter size.
            cumulate_att_w (bool): Whether to cumulate previous attention weight.
            use_batch_norm (bool): Whether to use batch normalization.
            use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs.
            reduction_factor (int): Reduction factor.
            spks (Optional[int]): Number of speakers. If set to > 1, assume that the
                sids will be provided as the input and use sid embedding layer.
            langs (Optional[int]): Number of languages. If set to > 1, assume that the
                lids will be provided as the input and use sid embedding layer.
            spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
                assume that spembs will be provided as the input.
            spk_embed_integration_type (str): How to integrate speaker embedding.
            use_gst (str): Whether to use global style token.
            gst_tokens (int): Number of GST embeddings.
            gst_heads (int): Number of heads in GST multihead attention.
            gst_conv_layers (int): Number of conv layers in GST.
            gst_conv_chans_list: (Sequence[int]): List of the number of channels of conv
                layers in GST.
            gst_conv_kernel_size (int): Kernel size of conv layers in GST.
            gst_conv_stride (int): Stride size of conv layers in GST.
            gst_gru_layers (int): Number of GRU layers in GST.
            gst_gru_units (int): Number of GRU units in GST.
            dropout_rate (float): Dropout rate.
            zoneout_rate (float): Zoneout rate.
            use_masking (bool): Whether to mask padded part in loss calculation.
            use_weighted_masking (bool): Whether to apply weighted masking in
                loss calculation.
            bce_pos_weight (float): Weight of positive sample of stop token
                (only for use_masking=True).
            loss_type (str): Loss function type ("L1", "L2", or "L1+L2").
            use_guided_attn_loss (bool): Whether to use guided attention loss.
            guided_attn_loss_sigma (float): Sigma in guided attention loss.
            guided_attn_loss_lambda (float): Lambda in guided attention loss.

        r   Nz*there is no such an activation function. ()r   )r*   r,   r-   r.   r/   r0   r1   r?   rA   rP   padding_idx)
r*   rH   gst_token_dimrI   conv_layersconv_chans_listconv_kernel_sizeconv_stride
gru_layers	gru_unitsr   addz is not supported.r   forwardzAcumulation of attention weights is disabled in forward attention.F
forward_taz Support only location or forwardr*   r+   attr7   r8   r9   r:   r;   r<   r=   output_activation_fnr6   r?   r@   rP   rQ   rB   )rR   rS   rT   )sigmaalpha ),r   super__init__r*   r+   eosr6   rB   rG   rV   rU   rf   hasattrFgetattr
ValueErrorrZ   r   encr	   gstrC   torchnn	Embeddingsid_embrD   lid_embrE   rF   Linear
projectionr   r   loggingwarningr   NotImplementedErrorr   decr   
taco2_lossr
   	attn_loss)3selfr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rZ   dec_idimre   	__class__ri   S/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tts/tacotron2/tacotron2.pyrk   &   s  
m








	
zTacotron2.__init__texttext_lengthsfeatsfeats_lengthsspembssidslidsjoint_trainingreturnc	              	      sn  |ddd|  f }|ddd|  f }|d}	t|ddgd j}
t|D ]\}} j|
||f< q,|d }|}|}t|d |j	|j
}t|ddgdd} j|
||||||d\}}}} jdkr| j swJ d| fdd	|D }t |}|ddd|f }|ddd|f }t|d|d dd} ||||||\}}} jd
kr|| | }n jdkr|| }n jdkr|| }ntd j t| | | d} jr jdkr| fdd	|D }n|} |||}|| }|j| d |s2|j| d t|||	f|j	\}}}|||fS |||fS )aV  Calculate forward propagation.

        Args:
            text (LongTensor): Batch of padded character ids (B, T_text).
            text_lengths (LongTensor): Batch of lengths of each input batch (B,).
            feats (Tensor): Batch of padded target features (B, T_feats, odim).
            feats_lengths (LongTensor): Batch of the lengths of each target (B,).
            spembs (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
            sids (Optional[Tensor]): Batch of speaker IDs (B, 1).
            lids (Optional[Tensor]): Batch of language IDs (B, 1).
            joint_training (bool): Whether to perform joint training with vocoder.

        Returns:
            Tensor: Loss scalar value.
            Dict: Statistics to be monitored.
            Tensor: Weight value if not joint training else model outputs.

        Nr   r   constantr)   xsilensysolensr   r   r   z@Output length must be greater than or equal to reduction factor.c                    s   g | ]	}|| j   qS ri   rB   .0olenr   ri   r   
<listcomp>S  s    z%Tacotron2.forward.<locals>.<listcomp>r'   L1L2zunknown --loss-type )l1_lossmse_lossbce_lossc                    s   g | ]}| j  qS ri   r   r   r   ri   r   r   s  s    )r   )loss)maxsizern   padrZ   	enumeraterl   r   todevicedtype_forwardrB   geallnewrs   scatter	unsqueezer~   rU   rp   dictitemrV   r   updater   )r   r   r   r   r   r   r   r   r   
batch_sizer   ilr   r   r   labels
after_outsbefore_outslogitsatt_wsmax_outr   r   r   r   statsolens_inr   weightri   r   r   rc     s   










zTacotron2.forwardr   r   r   r   c                 C   s   |  ||\}}	| jr| |}
||
d }| jd ur+| |d}||d }| jd ur?| |d}||d }| j	d urJ| 
||}| ||	|S )Nr   )rq   rG   rr   r   rC   rv   viewrD   rw   rE   _integrate_with_spk_embedr}   )r   r   r   r   r   r   r   r   hshlens
style_embssid_embslid_embsri   ri   r   r     s   




zTacotron2._forward              $@	thresholdminlenratiomaxlenratiouse_att_constraintbackward_windowforward_windowuse_teacher_forcingc              	   C   s  |}|}|}t |ddgd| j}|ra|dusJ d|d|d}}|du r,dn|d}||dg }||dg }| j|||||||d\}}}}t|d |d dS | j	
|}| jrv| |d}|| }| jdur| |d}|| }| jdur| |d}|| }| jdur|d|d}}| ||d }| jj
|||||	|
|d	\}}}t|||d
S )a  Generate the sequence of features given the sequences of characters.

        Args:
            text (LongTensor): Input sequence of characters (T_text,).
            feats (Optional[Tensor]): Feature sequence to extract style (N, idim).
            spembs (Optional[Tensor]): Speaker embedding (spk_embed_dim,).
            sids (Optional[Tensor]): Speaker ID (1,).
            lids (Optional[Tensor]): Language ID (1,).
            threshold (float): Threshold in inference.
            minlenratio (float): Minimum length ratio in inference.
            maxlenratio (float): Maximum length ratio in inference.
            use_att_constraint (bool): Whether to apply attention constraint.
            backward_window (int): Backward window in attention constraint.
            forward_window (int): Forward window in attention constraint.
            use_teacher_forcing (bool): Whether to use teacher forcing.

        Returns:
            Dict[str, Tensor]: Output dict including the following items:
                * feat_gen (Tensor): Output sequence of features (T_feats, odim).
                * prob (Tensor): Output sequence of stop probabilities (T_feats,).
                * att_w (Tensor): Attention weights (T_feats, T).

        r   r   r   Nz,feats must be provided with teacher forcing.r   )feat_genatt_wr   )r   r   r   r   r   r   )r   probr   )rn   r   rl   r   
new_tensorr   longr   r   rq   	inferencerG   rr   rC   rv   r   rD   rw   rE   r   r}   )r   r   r   r   r   r   r   r   r   r   r   r   r   xyspembr   r   r   r   outs_r   h	style_embrv   rw   r   outr   r   ri   ri   r   r     sV   &




zTacotron2.inferencer   c                 C   st   | j dkr| t|}||d }|S | j dkr6t|dd|dd}tj||gdd}|S t	d)a  Integrate speaker embedding with hidden states.

        Args:
            hs (Tensor): Batch of hidden state sequences (B, Tmax, eunits).
            spembs (Tensor): Batch of speaker embeddings (B, spk_embed_dim).

        Returns:
            Tensor: Batch of integrated hidden state sequences (B, Tmax, eunits) if
                integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).

        rb   r   r   r   )dimzsupport only add or concat.)
rF   ry   rn   	normalizer   expandr   rs   catr|   )r   r   r   ri   ri   r   r     s   

 z#Tacotron2._integrate_with_spk_embed)-r   r   r   r   r   r   r   r   r   r   Tr   r   r   r   r   r   r   NTTFr   NNNr   Fr   r   r    r!   r   r   r   r#   r$   r%   TFr&   r'   Tr(   r)   )NNNF)NNNNr$   r   r   Fr   r   F)__name__
__module____qualname____doc__intstrboolr   r   floatrk   rs   Tensorr   r   rc   r   r   r   __classcell__ri   ri   r   r   r      s   	
 !"#$%&'()+,-./0123 u	

o	
	

[r   )"r   rz   typingr   r   r   r   rs   torch.nn.functionalrt   
functionalrn   	typeguardr    espnet2.torch_utils.device_funcsr   espnet2.tts.abs_ttsr   espnet2.tts.gst.style_encoderr	   -espnet.nets.pytorch_backend.e2e_tts_tacotron2r
   r   &espnet.nets.pytorch_backend.nets_utilsr   *espnet.nets.pytorch_backend.rnn.attentionsr   r   r   -espnet.nets.pytorch_backend.tacotron2.decoderr   -espnet.nets.pytorch_backend.tacotron2.encoderr   r   ri   ri   ri   r   <module>   s   