o
    io                     @   s
  d Z ddlmZmZmZmZ ddlZddlm  m	Z
 ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlm Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m Z  ddl*m+Z+ G dd deZ,dS )z Transformer-TTS related modules.    )DictOptionalSequenceTupleN)check_argument_types)force_gatherable)
initialize)AbsTTS)StyleEncoder)GuidedMultiHeadAttentionLossTransformerLoss)make_non_pad_maskmake_pad_mask)Postnet)Prenet)Encoder)MultiHeadedAttention)Decoder)PositionalEncodingScaledPositionalEncoding)subsequent_maskc                }       s  e Zd ZdZ																	
																																													ddededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ed-ed.ed/ed0ed1ed2ed3ed4ed5ee d6ee d7ee d8ed9ed:ed;ed<ed=ee d>ed?ed@edAedBe	dCe	dDe	dEe	dFe	dGe	dHe	dIe	dJe	dKe	dLedMe	dNe	dOedPedQe	dRedSedTedUedVee dWe	dXe	f| fdYdZZ
dd[d\Z				dd]ejd^ejd_ejd`ejdaeej dbeej dceej ddedeeejeeejf ejf fdfdgZdhejdiejdjejdkejdaejdbejdcejdeeejejejf fdldmZ						n	o	dd]ejd_eej daeej dbeej dceej dpe	dqe	dre	dsedeeeejf fdtduZdjejdeejfdvdwZdxdy Zdkejdeejfdzd{Zd|ejdaejdeejfd}d~Z  ZS )TransformeraP  Transformer-TTS module.

    This is a module of text-to-speech Transformer described in `Neural Speech Synthesis
    with Transformer Network`_, which convert the sequence of tokens into the sequence
    of Mel-filterbanks.

    .. _`Neural Speech Synthesis with Transformer Network`:
        https://arxiv.org/pdf/1809.08895.pdf

                            conv1d   TFNadd
       r%   @   r&      r'   r'   皙?      ?xavier_uniform      ?      @L1encoder-decoder皙?idimodim	embed_dimeprenet_conv_layerseprenet_conv_chanseprenet_conv_filtsdprenet_layersdprenet_unitselayerseunitsadimaheadsdlayersdunitspostnet_layerspostnet_chanspostnet_filtspositionwise_layer_typepositionwise_conv_kernel_sizeuse_scaled_pos_encuse_batch_normencoder_normalize_beforedecoder_normalize_beforeencoder_concat_afterdecoder_concat_afterreduction_factorspkslangsspk_embed_dimspk_embed_integration_typeuse_gst
gst_tokens	gst_headsgst_conv_layersgst_conv_chans_listgst_conv_kernel_sizegst_conv_stridegst_gru_layersgst_gru_unitstransformer_enc_dropout_rate'transformer_enc_positional_dropout_rate!transformer_enc_attn_dropout_ratetransformer_dec_dropout_rate'transformer_dec_positional_dropout_rate!transformer_dec_attn_dropout_rate%transformer_enc_dec_attn_dropout_rateeprenet_dropout_ratedprenet_dropout_ratepostnet_dropout_rate	init_typeinit_enc_alphainit_dec_alphause_maskinguse_weighted_maskingbce_pos_weight	loss_typeuse_guided_attn_lossnum_heads_applied_guided_attnnum_layers_applied_guided_attnmodules_applied_guided_attnguided_attn_loss_sigmaguided_attn_loss_lambdac?           B         s  t  sJ t   || _|| _|d | _|| _|| _|9| _|| _	|8| _
|9| _| jrC|;dkr2|	| _n|;| _|:dkr=|| _n|:| _|<| _d| _| j	rKtnt}?|dkrjtjt||d|||||/| jd	tj||}@n
tjj||| jd}@t||||
|	|@|(|)|*|?||||d| _| jrt|| ||!|"|#|$|%|&|'d
| _d| _|dur|dkr|| _tj||| _d| _|dur|dkr|| _tj||| _d| _|dur|dkr|| _|| _ | jdur| j d	krtj| j|| _!ntj|| j || _!|dkrtjt"||||0d
tj||}And}At#||||||+|,|-|.|Ad|?||d| _$tj||| | _%tj||| _&|dkr;dn
t'|||||||1d| _(t)|5|6|7d| _*| jrZt+|=|>d| _,| j-|2|3|4d dS )an  Initialize Transformer module.

        Args:
            idim (int): Dimension of the inputs.
            odim (int): Dimension of the outputs.
            embed_dim (int): Dimension of character embedding.
            eprenet_conv_layers (int): Number of encoder prenet convolution layers.
            eprenet_conv_chans (int): Number of encoder prenet convolution channels.
            eprenet_conv_filts (int): Filter size of encoder prenet convolution.
            dprenet_layers (int): Number of decoder prenet layers.
            dprenet_units (int): Number of decoder prenet hidden units.
            elayers (int): Number of encoder layers.
            eunits (int): Number of encoder hidden units.
            adim (int): Number of attention transformation dimensions.
            aheads (int): Number of heads for multi head attention.
            dlayers (int): Number of decoder layers.
            dunits (int): Number of decoder hidden units.
            postnet_layers (int): Number of postnet layers.
            postnet_chans (int): Number of postnet channels.
            postnet_filts (int): Filter size of postnet.
            use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding.
            use_batch_norm (bool): Whether to use batch normalization in encoder prenet.
            encoder_normalize_before (bool): Whether to apply layernorm layer before
                encoder block.
            decoder_normalize_before (bool): Whether to apply layernorm layer before
                decoder block.
            encoder_concat_after (bool): Whether to concatenate attention layer's input
                and output in encoder.
            decoder_concat_after (bool): Whether to concatenate attention layer's input
                and output in decoder.
            positionwise_layer_type (str): Position-wise operation type.
            positionwise_conv_kernel_size (int): Kernel size in position wise conv 1d.
            reduction_factor (int): Reduction factor.
            spks (Optional[int]): Number of speakers. If set to > 1, assume that the
                sids will be provided as the input and use sid embedding layer.
            langs (Optional[int]): Number of languages. If set to > 1, assume that the
                lids will be provided as the input and use sid embedding layer.
            spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
                assume that spembs will be provided as the input.
            spk_embed_integration_type (str): How to integrate speaker embedding.
            use_gst (str): Whether to use global style token.
            gst_tokens (int): Number of GST embeddings.
            gst_heads (int): Number of heads in GST multihead attention.
            gst_conv_layers (int): Number of conv layers in GST.
            gst_conv_chans_list: (Sequence[int]): List of the number of channels of conv
                layers in GST.
            gst_conv_kernel_size (int): Kernel size of conv layers in GST.
            gst_conv_stride (int): Stride size of conv layers in GST.
            gst_gru_layers (int): Number of GRU layers in GST.
            gst_gru_units (int): Number of GRU units in GST.
            transformer_lr (float): Initial value of learning rate.
            transformer_warmup_steps (int): Optimizer warmup steps.
            transformer_enc_dropout_rate (float): Dropout rate in encoder except
                attention and positional encoding.
            transformer_enc_positional_dropout_rate (float): Dropout rate after encoder
                positional encoding.
            transformer_enc_attn_dropout_rate (float): Dropout rate in encoder
                self-attention module.
            transformer_dec_dropout_rate (float): Dropout rate in decoder except
                attention & positional encoding.
            transformer_dec_positional_dropout_rate (float): Dropout rate after decoder
                positional encoding.
            transformer_dec_attn_dropout_rate (float): Dropout rate in decoder
                self-attention module.
            transformer_enc_dec_attn_dropout_rate (float): Dropout rate in source
                attention module.
            init_type (str): How to initialize transformer parameters.
            init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the
                encoder.
            init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the
                decoder.
            eprenet_dropout_rate (float): Dropout rate in encoder prenet.
            dprenet_dropout_rate (float): Dropout rate in decoder prenet.
            postnet_dropout_rate (float): Dropout rate in postnet.
            use_masking (bool): Whether to apply masking for padded part in loss
                calculation.
            use_weighted_masking (bool): Whether to apply weighted masking in loss
                calculation.
            bce_pos_weight (float): Positive sample weight in bce calculation
                (only for use_masking=true).
            loss_type (str): How to calculate loss.
            use_guided_attn_loss (bool): Whether to use guided attention loss.
            num_heads_applied_guided_attn (int): Number of heads in each layer to apply
                guided attention loss.
            num_layers_applied_guided_attn (int): Number of layers to apply guided
                attention loss.
            modules_applied_guided_attn (Sequence[str]): List of module names to apply
                guided attention loss.
            guided_attn_loss_sigma (float) Sigma in guided attention loss.
            guided_attn_loss_lambda (float): Lambda in guided attention loss.

        r!   r   )	r0   r2   r8   econv_layerseconv_chanseconv_filtsrD   dropout_ratepadding_idx)num_embeddingsembedding_dimrs   )r0   attention_dimattention_headslinear_units
num_blocksinput_layerrr   positional_dropout_rateattention_dropout_ratepos_enc_classnormalize_beforeconcat_afterrA   rB   )
r0   rO   gst_token_dimrP   conv_layersconv_chans_listconv_kernel_sizeconv_stride
gru_layers	gru_unitsNr"   )r0   n_layersn_unitsrr   linearF)r1   rv   rw   rx   ry   rr   r{   self_attention_dropout_ratesrc_attention_dropout_raterz   use_output_layerr}   r~   r   )r0   r1   r   n_chansn_filtsrD   rr   )rd   re   rf   )sigmaalpha)ra   rb   rc   ).r   super__init__r0   r1   eosrI   rN   rh   rC   rg   rj   ri   rk   rs   r   r   torchnn
SequentialEncoderPrenetLinear	Embeddingr   encoderr
   gstrJ   sid_embrK   lid_embrL   rM   
projectionDecoderPrenetr   decoderfeat_outprob_outr   postnetr   	criterionr   attn_criterion_reset_parameters)Bselfr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   r}   encoder_input_layerdecoder_input_layer	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tts/transformer/transformer.pyr   .   s   
!







zTransformer.__init__c                 C   sL   |dkr	t | | | jr$t|| jjd j_t|| jjd j_d S d S )Npytorchrn   )	r   rC   r   tensorr   embedr   datar   )r   ra   rb   rc   r   r   r   r   }  s   
zTransformer._reset_parameterstexttext_lengthsfeatsfeats_lengthsspembssidslidsjoint_trainingreturnc	           "   	      sX  |ddd|  f }|ddd|  f }|d}	t|ddgd j}
t|D ]\}} j|
||f< q,|d }|}|}t|d |j	|j
}t|ddgdd} j|
||||||d\}}}|} jdkr| j sxJ d| fdd	|D }| fd
d	|D }t |}|ddd|f }|ddd|f }t|d|d dd} ||||||\}}} jdkr|| }n jdkr|| }n jdkr|| | }ntd j t| | | d} jrd jv rLg }tttt jjD ]#\}}| jj| jj ddd j!f g7 }|d  j"kr0 nqtj#|dd} $|||}|| }|j%| d d jv rg }tttt j&j'D ]#\}}| j&j'| jj ddd j!f g7 }|d  j"kr nq`tj#|dd} $|||}|| }|j%| d d jv rg }tttt j&j'D ]#\}}| j&j'| j(j ddd j!f g7 }|d  j"kr nqtj#|dd} $|||} ||  }|j%|  d  j)r|j% jj*d j+j,  j&j*d j+j, d |s'|j%| d t-|||	f|j	\}}}!|||!fS |||fS )aQ  Calculate forward propagation.

        Args:
            text (LongTensor): Batch of padded character ids (B, Tmax).
            text_lengths (LongTensor): Batch of lengths of each input batch (B,).
            feats (Tensor): Batch of padded target features (B, Lmax, odim).
            feats_lengths (LongTensor): Batch of the lengths of each target (B,).
            spembs (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
            sids (Optional[Tensor]): Batch of speaker IDs (B, 1).
            lids (Optional[Tensor]): Batch of language IDs (B, 1).
            joint_training (bool): Whether to perform joint training with vocoder.

        Returns:
            Tensor: Loss scalar value.
            Dict: Statistics to be monitored.
            Tensor: Weight value if not joint training else model outputs.

        Nr   r!   constantr+   xsilensysolensr   r   r   z@Output length must be greater than or equal to reduction factor.c                       g | ]}| j  qS r   rI   .0olenr   r   r   
<listcomp>      z'Transformer.forward.<locals>.<listcomp>c                    s   g | ]	}|| j   qS r   r   r   r   r   r   r     s    r-   L2zL1+L2zunknown --loss-type )l1_lossl2_lossbce_lossr   dim)enc_attn_lossr   )dec_attn_lossr.   )enc_dec_attn_lossrn   )encoder_alphadecoder_alpha)loss).maxsizeFpadrs   	enumerater   r   todevicedtype_forwardrI   geallnewr   scatter	unsqueezer   rg   
ValueErrordictitemrh   rk   reversedrangelenr   encoders	self_attnattnri   rj   catr   updater   decoderssrc_attnrC   r   r   r   r   )"r   r   r   r   r   r   r   r   r   
batch_sizer   ilr   r   r   labels
after_outsbefore_outslogitsolens_inmax_olenr   r   r   r   statsatt_wsidx	layer_idxr   r   r   weightr   r   r   forward  s   










zTransformer.forwardr   r   r   r   c                    s~    |} ||\}	}
 jr |}|	|d }	 jd ur0 |d}|	|d }	 jd urD 	|d}|	|d }	 j
d urO |	|}	 jdkrp|d d  jd d  jf }| fdd|D }n||}} |} |} |||	|
\}} ||dd j} ||dd} jd u r|}n| |dddd }|||fS )Nr!   rn   c                    r   r   r   r   r   r   r   r   K  r   z(Transformer._forward.<locals>.<listcomp>r   r   )_source_maskr   rN   r   r   rJ   r   viewrK   r   rL   _integrate_with_spk_embedrI   r   &_add_first_frame_and_remove_last_frame_target_maskr   r   r   r1   r   r   	transpose)r   r   r   r   r   r   r   r   x_maskshsh_masks
style_embssid_embslid_embsys_inr   y_maskszs_r   r   r   r   r   r   r   (  s<   











zTransformer._forward              $@	thresholdminlenratiomaxlenratiouse_teacher_forcingc
           #   	   C   s  |}
|}|}t |
ddgd| j}
|	r~|dusJ d|
d|d}}|du r,dn|d}|
|dg }||dg }| j|||||||d^}}g }tt	| j
jD ]}|| j
j| jjg7 }q_tj|dd}t|d |d dS |
d}| |d\}}| jr| |d}||d }| jdur| |d	}||d }| jdur| |d	}||d }| jdur|d}| ||}t|d| | j }t|d| | j }d}|dd| j}g g }}| j
|
}	 |d7 }t |d!|
j"}| j
j#||||d\}}|| $|| j| jg7 }|t%| &|d g7 }tj'||d	 d	 dd| jfdd}g } | ( D ] \}!}"t)|"t*rqd|!v rq| |"jdddd	f dg7 } qR|dkr{| }n
dd t+|| D }tt,|d	 |kdks||kr||k rqtj'|ddd-dd}| j.dur|| .| }|-dd/d}tj'|dd}nqtj|dd}t|||dS )a1  Generate the sequence of features given the sequences of characters.

        Args:
            text (LongTensor): Input sequence of characters (T_text,).
            feats (Optional[Tensor]): Feature sequence to extract style embedding
                (T_feats', idim).
            spembs (Optional[Tensor]): Speaker embedding (spk_embed_dim,).
            sids (Optional[Tensor]): Speaker ID (1,).
            lids (Optional[Tensor]): Language ID (1,).
            threshold (float): Threshold in inference.
            minlenratio (float): Minimum length ratio in inference.
            maxlenratio (float): Maximum length ratio in inference.
            use_teacher_forcing (bool): Whether to use teacher forcing.

        Returns:
            Dict[str, Tensor]: Output dict including the following items:
                * feat_gen (Tensor): Output sequence of features (T_feats, odim).
                * prob (Tensor): Output sequence of stop probabilities (T_feats,).
                * att_w (Tensor): Source attn weight (#layers, #heads, T_feats, T_text).

        r   r!   r   Nz,feats must be provided with teacher forcing.r   r   )feat_genatt_wrn   T)cachesrcc                 S   s"   g | ]\}}t j||gd dqS )r!   r   )r   r   )r   r  att_w_r   r   r   r     s    z)Transformer.inference.<locals>.<listcomp>r   )r  probr  )0r   r   r   r   
new_tensorr   longr   r   r   r   r   r   r   r   stackr   r   rN   r   rJ   r   r   rK   r   rL   r  intrI   	new_zerosr1   
init_stater   r   r   forward_one_stepr   sigmoidr   r   named_modules
isinstancer   zipsumr  r   squeeze)#r   r   r   r   r   r   r  r  r  r  xyspembr   r   r   r   outsr  r   r   r  r  r	  r
  maxlenminlenr   probsz_cacher  zatt_ws_namemr   r   r   	inferenced  s   !







 
$
0zTransformer.inferencec                 C   s>   t j||jd d|jd f|d d d df gdd}|S )Nr   r!   r   rn   r   )r   r   r  shape)r   r   r  r   r   r   r    s   0z2Transformer._add_first_frame_and_remove_last_framec                 C   s"   t |t|  j}|dS )a  Make masks for self-attention.

        Args:
            ilens (LongTensor): Batch of lengths (B,).

        Returns:
            Tensor: Mask tensor for self-attention.
                    dtype=torch.uint8 in PyTorch 1.2-
                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)

        Examples:
            >>> ilens = [5, 3]
            >>> self._source_mask(ilens)
            tensor([[[1, 1, 1, 1, 1],
                    [[1, 1, 1, 0, 0]]], dtype=torch.uint8)

        )r   r   next
parametersr   r   )r   r   r  r   r   r   r     s   
zTransformer._source_maskc                 C   s@   t |t|  j}t|d|jdd}|d|@ S )a"  Make masks for masked self-attention.

        Args:
            olens (LongTensor): Batch of lengths (B,).

        Returns:
            Tensor: Mask tensor for masked self-attention.
                dtype=torch.uint8 in PyTorch 1.2-
                dtype=torch.bool in PyTorch 1.2+ (including 1.2)

        Examples:
            >>> olens = [5, 3]
            >>> self._target_mask(olens)
            tensor([[[1, 0, 0, 0, 0],
                     [1, 1, 0, 0, 0],
                     [1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 0],
                     [1, 1, 1, 1, 1]],
                    [[1, 0, 0, 0, 0],
                     [1, 1, 0, 0, 0],
                     [1, 1, 1, 0, 0],
                     [1, 1, 1, 0, 0],
                     [1, 1, 1, 0, 0]]], dtype=torch.uint8)

        rn   )r   r   r6  )r   r   r7  r8  r   r   r   r   )r   r   r  s_masksr   r   r   r    s   zTransformer._target_maskr  c                 C   sz   | j dkr| t|}||d }|S | j dkr9t|dd|dd}| tj||gdd}|S t	d)aA  Integrate speaker embedding with hidden states.

        Args:
            hs (Tensor): Batch of hidden state sequences (B, Tmax, adim).
            spembs (Tensor): Batch of speaker embeddings (B, spk_embed_dim).

        Returns:
            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).

        r"   r!   concatrn   r   zsupport only add or concat.)
rM   r   r   	normalizer   expandr   r   r   NotImplementedError)r   r  r   r   r   r   r  5  s   

 z%Transformer._integrate_with_spk_embed)<r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   TTTTFFr!   NNNr"   Fr#   r   r   r$   r   r   r!   r'   r(   r(   r(   r(   r(   r(   r(   r)   r)   r)   r*   r+   r+   FFr,   r-   Tr   r   r.   r/   r+   )r+   r+   )NNNF)NNNNr)   r  r  F)__name__
__module____qualname____doc__r  strboolr   r   floatr   r   r   Tensorr   r   r   r   r4  r  r   r  r  __classcell__r   r   r   r   r   "   s   	
 !"#$%&'()*,-./0123456789:;<=>?@AB  
Q	

 "	
?	

 r   )-rA  typingr   r   r   r   r   torch.nn.functionalr   
functionalr   	typeguardr    espnet2.torch_utils.device_funcsr   espnet2.torch_utils.initializer   espnet2.tts.abs_ttsr	   espnet2.tts.gst.style_encoderr
   /espnet.nets.pytorch_backend.e2e_tts_transformerr   r   &espnet.nets.pytorch_backend.nets_utilsr   r   -espnet.nets.pytorch_backend.tacotron2.decoderr   r   r   -espnet.nets.pytorch_backend.tacotron2.encoderr   r   1espnet.nets.pytorch_backend.transformer.attentionr   /espnet.nets.pytorch_backend.transformer.decoderr   1espnet.nets.pytorch_backend.transformer.embeddingr   r   /espnet.nets.pytorch_backend.transformer.encoder,espnet.nets.pytorch_backend.transformer.maskr   r   r   r   r   r   <module>   s(   