o
    it                     @   s   d Z ddlZddlmZmZmZmZ ddlZddlm	  m
Z ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' dd	l(mZ) G dd deZ*dS )z'Fastspeech related modules for ESPnet2.    N)DictOptionalSequenceTuple)check_argument_types)force_gatherable)
initialize)AbsTTS)StyleEncoder)Encoder)FeedForwardTransformerLoss)DurationPredictor)LengthRegulator)make_non_pad_maskmake_pad_mask)Postnet)PositionalEncodingScaledPositionalEncodingc                y       s4  e Zd ZdZ													
											
																													
						d|deded ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ed-ed.ed/ed0ed1ed2ed3ed4ed5ed6ed7ed8ed9ed:ed;ed<ed=ed>ed?ed@edAedBedCedDedEedFedGedHee dIee dJee dKedLedMedNedOedPe	e dQedRedSedTedUedVedWedXedYefx fdZd[Z
								d}d\ejd]ejd^eej d_eej d`eej daeej dbeej dceej ddedeedfe	ej fdgdhZ				d~diejdjejdkejdlejdmejdnejdaeej dbeej dceej doedfeejeeejf ejf fdpdqZ							ddiejdkeej dmeej daeej dbeej dceej deedredfeeejf fdsdtZduejdaejdfejfdvdwZd]ejdfejfdxdyZdUedVedWefdzd{Z  ZS )
FastSpeecha  FastSpeech module for end-to-end text-to-speech.

    This is a module of FastSpeech, feed-forward Transformer with duration predictor
    described in `FastSpeech: Fast, Robust and Controllable Text to Speech`_, which
    does not require any auto-regressive processing during inference, resulting in
    fast decoding compared with auto-regressive Transformer.

    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
        https://arxiv.org/pdf/1905.09263.pdf

                           ?conv1d   TF      皙?transformerlegacyrel_posrel_selfattnswish      Nadd
       r+   @   r,      r-   r-   xavier_uniform      ?idimodimadimaheadselayerseunitsdlayersdunitspostnet_layerspostnet_chanspostnet_filtspostnet_dropout_ratepositionwise_layer_typepositionwise_conv_kernel_sizeuse_scaled_pos_encuse_batch_normencoder_normalize_beforedecoder_normalize_beforeencoder_concat_afterdecoder_concat_afterduration_predictor_layersduration_predictor_chansduration_predictor_kernel_sizeduration_predictor_dropout_ratereduction_factorencoder_typedecoder_typetransformer_enc_dropout_rate'transformer_enc_positional_dropout_rate!transformer_enc_attn_dropout_ratetransformer_dec_dropout_rate'transformer_dec_positional_dropout_rate!transformer_dec_attn_dropout_rateconformer_rel_pos_typeconformer_pos_enc_layer_typeconformer_self_attn_layer_typeconformer_activation_typeuse_macaron_style_in_conformeruse_cnn_in_conformerconformer_enc_kernel_sizeconformer_dec_kernel_size	zero_triuspkslangsspk_embed_dimspk_embed_integration_typeuse_gst
gst_tokens	gst_headsgst_conv_layersgst_conv_chans_listgst_conv_kernel_sizegst_conv_stridegst_gru_layersgst_gru_units	init_typeinit_enc_alphainit_dec_alphause_maskinguse_weighted_maskingc=           ?         s  t  sJ t   || _|| _|d | _|| _|| _|| _|| _	|/| _
d| _| j	r,tnt}=d||fv rg|"dkrO|#dkrCd}#td |$dkrNd	}$td
 n|"dkr`|#dksYJ |$d	ks_J ntd|" tjj||| jd}>|dkrt||||||>||||=||||d| _nK|dkrtd+i d|d|d|d|d|d|>d|d|d|d|d|d|d|d|&d|#d|$d |%d!|'d"|(| _nt| d#| j
rt||0||1|2|3|4|5|6|7d$
| _d%| _|+d%ur|+dkr|+| _tj|+|| _d%| _|,d%ur|,dkr|,| _tj|,|| _d%| _|-d%ur'|-dkr'|-| _|.| _| jd%urH| jd&kr=tj| j|| _ ntj|| j || _ t!|||||d'| _"t# | _$|dkrotd||||d%|| |!|=||||d| _%nL|dkrtd+i ddd|d|d|d|dd%d|d| d|!d|d|d|d|d|&d|#d|$d |%d!|'d"|)| _%nt| d#tj||| | _&|	dkrd%n
t'|||	|
|||d(| _(| j)|8|9|:d) t*|;|<d*| _+d%S ),a  Initialize FastSpeech module.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            elayers (int): Number of encoder layers.
            eunits (int): Number of encoder hidden units.
            dlayers (int): Number of decoder layers.
            dunits (int): Number of decoder hidden units.
            postnet_layers (int): Number of postnet layers.
            postnet_chans (int): Number of postnet channels.
            postnet_filts (int): Kernel size of postnet.
            postnet_dropout_rate (float): Dropout rate in postnet.
            use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding.
            use_batch_norm (bool): Whether to use batch normalization in encoder prenet.
            encoder_normalize_before (bool): Whether to apply layernorm layer before
                encoder block.
            decoder_normalize_before (bool): Whether to apply layernorm layer before
                decoder block.
            encoder_concat_after (bool): Whether to concatenate attention layer's input
                and output in encoder.
            decoder_concat_after (bool): Whether to concatenate attention layer's input
                and output in decoder.
            duration_predictor_layers (int): Number of duration predictor layers.
            duration_predictor_chans (int): Number of duration predictor channels.
            duration_predictor_kernel_size (int): Kernel size of duration predictor.
            duration_predictor_dropout_rate (float): Dropout rate in duration predictor.
            reduction_factor (int): Reduction factor.
            encoder_type (str): Encoder type ("transformer" or "conformer").
            decoder_type (str): Decoder type ("transformer" or "conformer").
            transformer_enc_dropout_rate (float): Dropout rate in encoder except
                attention and positional encoding.
            transformer_enc_positional_dropout_rate (float): Dropout rate after encoder
                positional encoding.
            transformer_enc_attn_dropout_rate (float): Dropout rate in encoder
                self-attention module.
            transformer_dec_dropout_rate (float): Dropout rate in decoder except
                attention & positional encoding.
            transformer_dec_positional_dropout_rate (float): Dropout rate after decoder
                positional encoding.
            transformer_dec_attn_dropout_rate (float): Dropout rate in decoder
                self-attention module.
            conformer_rel_pos_type (str): Relative pos encoding type in conformer.
            conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer.
            conformer_self_attn_layer_type (str): Self-attention layer type in conformer
            conformer_activation_type (str): Activation function type in conformer.
            use_macaron_style_in_conformer: Whether to use macaron style FFN.
            use_cnn_in_conformer: Whether to use CNN in conformer.
            conformer_enc_kernel_size: Kernel size of encoder conformer.
            conformer_dec_kernel_size: Kernel size of decoder conformer.
            zero_triu: Whether to use zero triu in relative self-attention module.
            spks (Optional[int]): Number of speakers. If set to > 1, assume that the
                sids will be provided as the input and use sid embedding layer.
            langs (Optional[int]): Number of languages. If set to > 1, assume that the
                lids will be provided as the input and use sid embedding layer.
            spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
                assume that spembs will be provided as the input.
            spk_embed_integration_type: How to integrate speaker embedding.
            use_gst (str): Whether to use global style token.
            gst_tokens (int): The number of GST embeddings.
            gst_heads (int): The number of heads in GST multihead attention.
            gst_conv_layers (int): The number of conv layers in GST.
            gst_conv_chans_list: (Sequence[int]):
                List of the number of channels of conv layers in GST.
            gst_conv_kernel_size (int): Kernel size of conv layers in GST.
            gst_conv_stride (int): Stride size of conv layers in GST.
            gst_gru_layers (int): The number of GRU layers in GST.
            gst_gru_units (int): The number of GRU units in GST.
            init_type (str): How to initialize transformer parameters.
            init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the
                encoder.
            init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the
                decoder.
            use_masking (bool): Whether to apply masking for padded part in loss
                calculation.
            use_weighted_masking (bool): Whether to apply weighted masking in loss
                calculation.

        r   r   	conformerr"   r#   legacy_rel_poszFallback to conformer_pos_enc_layer_type = 'legacy_rel_pos' due to the compatibility. If you want to use the new one, please use conformer_pos_enc_layer_type = 'latest'.r$   legacy_rel_selfattnzFallback to conformer_self_attn_layer_type = 'legacy_rel_selfattn' due to the compatibility. If you want to use the new one, please use conformer_pos_enc_layer_type = 'latest'.latestzUnknown rel_pos_type: )num_embeddingsembedding_dimpadding_idxr!   )r0   attention_dimattention_headslinear_units
num_blocksinput_layerdropout_ratepositional_dropout_rateattention_dropout_ratepos_enc_classnormalize_beforeconcat_afterr<   r=   r0   rs   rt   ru   rv   rw   rx   ry   rz   r|   r}   r<   r=   macaron_stylepos_enc_layer_typeselfattention_layer_typeactivation_typeuse_cnn_modulecnn_module_kernelz is not supported.)
r0   r_   gst_token_dimr`   conv_layersconv_chans_listconv_kernel_sizeconv_stride
gru_layers	gru_unitsNr(   )r0   n_layersn_chanskernel_sizerx   )r0   r1   r   r   n_filtsr?   rx   )rg   rh   ri   )rj   rk    ),r   super__init__r0   r1   eosrH   rI   rJ   r>   r^   rr   r   r   loggingwarning
ValueErrortorchnn	EmbeddingTransformerEncoderencoderConformerEncoderr
   gstrZ   sid_embr[   lid_embr\   r]   Linear
projectionr   duration_predictorr   length_regulatordecoderfeat_outr   postnet_reset_parametersFastSpeechLoss	criterion)?selfr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   r{   encoder_input_layer	__class__r   U/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tts/fastspeech/fastspeech.pyr   /   s   



	
	


	

zFastSpeech.__init__xsilensysolensdsspembssidslidsis_inferencealphareturnc                    s    |} ||\}} jr |}||d } jd ur0 |d}||d } jd urD 	|d}||d } j
d urO ||}t||j}|	rh j||} |||
}n ||} ||}|d ur|	s jdkr| fdd|D }n|}  |}nd } ||\}} ||dd j} jd u r|}n| |dddd }|||fS )Nr   c                    s   g | ]}| j  qS r   rH   .0olenr   r   r   
<listcomp>  s    z'FastSpeech._forward.<locals>.<listcomp>r   r   )_source_maskr   r^   r   	unsqueezerZ   r   viewr[   r   r\   _integrate_with_spk_embedr   todevicer   	inferencer   rH   newr   r   sizer1   r   	transpose)r   r   r   r   r   r   r   r   r   r   r   x_maskshs_
style_embssid_embslid_embsd_masksd_outsolens_inh_maskszsbefore_outs
after_outsr   r   r   _forward  sJ   









zFastSpeech._forwardtexttext_lengthsfeatsfeats_lengths	durationsdurations_lengthsjoint_trainingc                    s  |ddd|  f }|ddd|  f }|ddd|  f }|d}t|ddgd j}t|D ]\}} j|||f< q8|d }||}}|} j||||||||	dd	\}}} jdkr|	 fdd|D }t |}|ddd|f } j
du rd} |||||||\}}|| }t| | d	} jd
kr jr|j jjd jj d  jd
kr͈ jr|j jjd jj d |
s|j| d t|||f|j\}}}|||fS |||dur|fS |fS )a  Calculate forward propagation.

        Args:
            text (LongTensor): Batch of padded character ids (B, T_text).
            text_lengths (LongTensor): Batch of lengths of each input (B,).
            feats (Tensor): Batch of padded target features (B, T_feats, odim).
            feats_lengths (LongTensor): Batch of the lengths of each target (B,).
            durations (LongTensor): Batch of padded durations (B, T_text + 1).
            durations_lengths (LongTensor): Batch of duration lengths (B, T_text + 1).
            spembs (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
            sids (Optional[Tensor]): Batch of speaker IDs (B, 1).
            lids (Optional[Tensor]): Batch of language IDs (B, 1).
            joint_training (bool): Whether to perform joint training with vocoder.

        Returns:
            Tensor: Loss scalar value.
            Dict: Statistics to be monitored.
            Tensor: Weight value if not joint training else model outputs.

        Nr   r   constantF)r   r   r   r   c                    s   g | ]	}|| j   qS r   r   r   r   r   r   r     s    z&FastSpeech.forward.<locals>.<listcomp>)l1_lossduration_lossr!   r   )encoder_alpha)decoder_alpha)loss)maxr   Fpadrr   	enumerater   r   rH   r   r   r   dictitemrI   r>   updater   embedr   datarJ   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   
batch_sizer   ilr   r   r   r   r   r   r   max_olenr   r   r   statsweightr   r   r   forward  sd   !





zFastSpeech.forwarduse_teacher_forcingc	              
   C   s   ||}	}
||}}t |	ddgd| j}	tj|	jd gtj|	jd}|	dd}}|
dur4|
d}|dur=|d}|rT|d}| j	|||||||d\}}}n| j	||||||d|d\}}}t
|d |d d	S )
a  Generate the sequence of features given the sequences of characters.

        Args:
            text (LongTensor): Input sequence of characters (T_text,).
            feats (Optional[Tensor]): Feature sequence to extract style (N, idim).
            durations (Optional[LongTensor]): Groundtruth of duration (T_text + 1,).
            spembs (Optional[Tensor]): Speaker embedding (spk_embed_dim,).
            sids (Optional[Tensor]): Speaker ID (1,).
            lids (Optional[Tensor]): Language ID (1,).
            alpha (float): Alpha to control the speed.
            use_teacher_forcing (bool): Whether to use teacher forcing.
                If true, groundtruth of duration, pitch and energy will be used.

        Returns:
            Dict[str, Tensor]: Output dict including the following items:
                * feat_gen (Tensor): Output sequence of features (T_feats, odim).
                * duration (Tensor): Duration sequence (T_text + 1,).

        r   r   r   )dtyper   N)r   r   r   r   T)r   r   r   r   r   )feat_genduration)r   r   r   r   tensorshapelongr   r   r   r   )r   r   r   r   r   r   r   r   r   xyspembdr   r   r   r   r   outsr   r   r   r   r   =  s>   




zFastSpeech.inferencer   c                 C   sz   | j dkr| t|}||d }|S | j dkr9t|dd|dd}| tj||gdd}|S t	d)aE  Integrate speaker embedding with hidden states.

        Args:
            hs (Tensor): Batch of hidden state sequences (B, T_text, adim).
            spembs (Tensor): Batch of speaker embeddings (B, spk_embed_dim).

        Returns:
            Tensor: Batch of integrated hidden state sequences (B, T_text, adim).

        r(   r   concatr   )dimzsupport only add or concat.)
r]   r   r   	normalizer   expandr   r   catNotImplementedError)r   r   r   r   r   r   r     s   

 z$FastSpeech._integrate_with_spk_embedc                 C   s"   t |t|  j}|dS )a  Make masks for self-attention.

        Args:
            ilens (LongTensor): Batch of lengths (B,).

        Returns:
            Tensor: Mask tensor for self-attention.
                dtype=torch.uint8 in PyTorch 1.2-
                dtype=torch.bool in PyTorch 1.2+ (including 1.2)

        Examples:
            >>> ilens = [5, 3]
            >>> self._source_mask(ilens)
            tensor([[[1, 1, 1, 1, 1],
                     [1, 1, 1, 0, 0]]], dtype=torch.uint8)

        )r   r   next
parametersr   r   )r   r   r   r   r   r   r     s   
zFastSpeech._source_maskc                 C   sj   |dkr	t | | | jdkr| jrt|| jjd j_| j	dkr1| jr3t|| j
jd j_d S d S d S )Npytorchr!   r   )r   rI   r>   r   r   r   r   r   r   rJ   r   )r   rg   rh   ri   r   r   r   r     s   
zFastSpeech._reset_parameters):r   r   r   r   r   r   r   r   r   r   r   r   TTTTFFr   r   r   r    r   r!   r!   r    r    r    r    r    r    r"   r#   r$   r%   TTr&   r'   FNNNr(   Fr)   r   r   r*   r   r   r   r-   r.   r/   r/   FF)NNNNNNFr/   )NNNF)NNNNNr/   F)__name__
__module____qualname____doc__intfloatstrboolr   r   r   r   Tensorr   r   r   r   r   r   r   r   __classcell__r   r   r   r   r   "   s4   	
 !"#%&'()*+,-/0123456789:;=>?@A  n	

K	

f	

G
r   )+r  r   typingr   r   r   r   r   torch.nn.functionalr   
functionalr   	typeguardr    espnet2.torch_utils.device_funcsr   espnet2.torch_utils.initializer   espnet2.tts.abs_ttsr	   espnet2.tts.gst.style_encoderr
   -espnet.nets.pytorch_backend.conformer.encoderr   r   .espnet.nets.pytorch_backend.e2e_tts_fastspeechr   r   9espnet.nets.pytorch_backend.fastspeech.duration_predictorr   7espnet.nets.pytorch_backend.fastspeech.length_regulatorr   &espnet.nets.pytorch_backend.nets_utilsr   r   -espnet.nets.pytorch_backend.tacotron2.decoderr   1espnet.nets.pytorch_backend.transformer.embeddingr   r   /espnet.nets.pytorch_backend.transformer.encoderr   r   r   r   r   r   <module>   s&   