o
    ¡¿¯iÓ  ã                   @   sT   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 G dd„ dej
jƒZdS )	z,Fastspeech2 related loss module for ESPnet2.é    )ÚTupleN)Úcheck_argument_types)ÚDurationPredictorLoss)Úmake_non_pad_maskc                       s   e Zd ZdZddedef‡ fdd„Zdejd	ejd
ejdejdejdejdejdejdejdejdejdeejejejejf fdd„Z	‡  Z
S )ÚFastSpeech2Lossz%Loss function module for FastSpeech2.TFÚuse_maskingÚuse_weighted_maskingc                    sn   t ƒ sJ ‚tƒ  ¡  ||ks|rJ ‚|| _|| _| jrdnd}tjj|d| _tjj	|d| _
t|d| _dS )a!  Initialize feed-forward Transformer loss module.

        Args:
            use_masking (bool): Whether to apply masking for padded part in loss
                calculation.
            use_weighted_masking (bool): Whether to weighted masking in loss
                calculation.

        ÚnoneÚmean)Ú	reductionN)r   ÚsuperÚ__init__r   r   ÚtorchÚnnÚL1LossÚl1_criterionÚMSELossÚmse_criterionr   Úduration_criterion)Úselfr   r   r   ©Ú	__class__© úP/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tts/fastspeech2/loss.pyr      s   


zFastSpeech2Loss.__init__Ú
after_outsÚbefore_outsÚd_outsÚp_outsÚe_outsÚysÚdsÚpsÚesÚilensÚolensÚreturnc                 C   sâ  | j rRt|ƒ d¡ |j¡}| |¡}|dur| |¡}| |¡}t|
ƒ |j¡}| |¡}| |¡}t|
ƒ d¡ |j¡}| |¡}| |¡}| |¡}|	 |¡}	|  ||¡}|durd||  ||¡7 }|  ||¡}|  ||¡}|  ||	¡}| j	rët|ƒ d¡ |j¡}| 
¡ |jddd 
¡  }|| d¡| d¡  }t|
ƒ |j¡}| 
¡ |jddd 
¡  }|| d¡ }| |¡ |¡ ¡ }| |¡ |¡ ¡ }| d¡}| d¡}| |¡ |¡ ¡ }| |¡ |¡ ¡ }||||fS )aW  Calculate forward propagation.

        Args:
            after_outs (Tensor): Batch of outputs after postnets (B, T_feats, odim).
            before_outs (Tensor): Batch of outputs before postnets (B, T_feats, odim).
            d_outs (LongTensor): Batch of outputs of duration predictor (B, T_text).
            p_outs (Tensor): Batch of outputs of pitch predictor (B, T_text, 1).
            e_outs (Tensor): Batch of outputs of energy predictor (B, T_text, 1).
            ys (Tensor): Batch of target features (B, T_feats, odim).
            ds (LongTensor): Batch of durations (B, T_text).
            ps (Tensor): Batch of target token-averaged pitch (B, T_text, 1).
            es (Tensor): Batch of target token-averaged energy (B, T_text, 1).
            ilens (LongTensor): Batch of the lengths of each input (B,).
            olens (LongTensor): Batch of the lengths of each target (B,).

        Returns:
            Tensor: L1 loss value.
            Tensor: Duration predictor loss value.
            Tensor: Pitch predictor loss value.
            Tensor: Energy predictor loss value.

        éÿÿÿÿNé   T)ÚdimÚkeepdimr   é   )r   r   Ú	unsqueezeÚtoÚdeviceÚmasked_selectr   r   r   r   ÚfloatÚsumÚsizeÚmul)r   r   r   r   r   r   r   r    r!   r"   r#   r$   Ú	out_masksÚduration_masksÚpitch_masksÚl1_lossÚduration_lossÚ
pitch_lossÚenergy_lossÚout_weightsÚduration_weightsÚpitch_weightsr   r   r   Úforward+   sJ   %








ÿÿ

ÿzFastSpeech2Loss.forward)TF)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Úboolr   r   ÚTensorr   r=   Ú__classcell__r   r   r   r   r      s8    þýüûúùø	÷
öõôór   )rA   Útypingr   r   Ú	typeguardr   Ú9espnet.nets.pytorch_backend.fastspeech.duration_predictorr   Ú&espnet.nets.pytorch_backend.nets_utilsr   r   ÚModuler   r   r   r   r   Ú<module>   s   