o
    i$                     @   s   d Z ddlmZ ddlmZmZmZ ddlZddlm	Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ e
eje
dkrPddlmZ nedddZG dd deZdS )z&GAN-based text-to-speech ESPnet model.    )contextmanager)AnyDictOptionalN)parse)check_argument_types)	AbsGANTTS)AbsNormalize)InversibleInterface)AbsGANESPnetModel)AbsFeatsExtractz1.6.0)autocastTc                 c   s    d V  d S )N )enabledr   r   P/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/gan_tts/espnet_model.pyr      s   
r   c                !       s  e Zd ZdZdee deeoe dee deeoe dee deeo#e def fd	d
Z											d de
jde
jde
jde
jdee
j dee
j dee
j dee
j dee
j dee
j dee
j dee
j dee
j dedeeef fddZ									d!de
jde
jde
jde
jdee
j dee
j dee
j dee
j dee
j dee
j dee
j dee
j dee
j deee
jf fddZ  ZS )"ESPnetGANTTSModelz/ESPnet model for GAN-based text-to-speech task.feats_extract	normalizepitch_extractpitch_normalizeenergy_extractenergy_normalizettsc                    sf   t  sJ t   || _|| _|| _|| _|| _|| _|| _	t
|ds(J dt
|ds1J ddS )z$Initialize ESPnetGANTTSModel module.	generatorz4generator module must be registered as tts.generatordiscriminatorz<discriminator module must be registered as tts.discriminatorN)r   super__init__r   r   r   r   r   r   r   hasattr)selfr   r   r   r   r   r   r   	__class__r   r   r      s&   

zESPnetGANTTSModel.__init__NTtexttext_lengthsspeechspeech_lengths	durationsdurations_lengthspitchpitch_lengthsenergyenergy_lengthsspembssidslidsforward_generatorreturnc                 K   s  t dh d}| jdur| ||\}}| jdur)|du r)| j|||||d\}}| jdur>|	du r>| j|||||d\}	}
| jdurK| ||\}}| jdurX| ||\}}| jdure| |	|
\}	}
W d   n1 sow   Y  t|||d}|dur|j||d | j	j
r|j||d |dur|j||d | jdur|dur|j||d | jdur|	dur|j|	|
d	 |dur|j|d
 |dur|j|d |dur|j|d | j	di |S )aB  Return generator or discriminator loss with dict format.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).
            duration (Optional[Tensor]): Duration tensor.
            duration_lengths (Optional[Tensor]): Duration length tensor (B,).
            pitch (Optional[Tensor]): Pitch tensor.
            pitch_lengths (Optional[Tensor]): Pitch length tensor (B,).
            energy (Optional[Tensor]): Energy tensor.
            energy_lengths (Optional[Tensor]): Energy length tensor (B,).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, D).
            sids (Optional[Tensor]): Speaker ID tensor (B, 1).
            lids (Optional[Tensor]): Language ID tensor (B, 1).
            forward_generator (bool): Whether to forward generator.
            kwargs: "utt_id" is among the input.

        Returns:
            Dict[str, Any]:
                - loss (Tensor): Loss scalar tensor.
                - stats (Dict[str, float]): Statistics to be monitored.
                - weight (Tensor): Weight tensor to summarize losses.
                - optim_idx (int): Optimizer index (0 for G and 1 for D).

        FNfeats_lengthsr%   r&   )r!   r"   r.   featsr1   )r#   r$   )r%   r&   r'   r(   r)   r*   )r+   )r,   )r-   r   )r   r   r   r   r   r   r   dictupdater   require_raw_speech)r   r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   kwargsr3   r1   batchr   r   r   forward:   sj   
-



	

"zESPnetGANTTSModel.forwardc                 K   s   d}| j dur|  ||\}}| jdur | j|||||d\}}| jdur1| j|||||d\}	}
i }|dur>|j||d |durI|j||d |	durT|j|	|
d |S )a  Calculate features and return them as a dict.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B, 1).
            durations (Optional[Tensor): Duration tensor.
            durations_lengths (Optional[Tensor): Duration length tensor (B,).
            pitch (Optional[Tensor): Pitch tensor.
            pitch_lengths (Optional[Tensor): Pitch length tensor (B,).
            energy (Optional[Tensor): Energy tensor.
            energy_lengths (Optional[Tensor): Energy length tensor (B,).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, D).
            sids (Optional[Tensor]): Speaker index tensor (B, 1).
            lids (Optional[Tensor]): Language ID tensor (B, 1).

        Returns:
            Dict[str, Tensor]: Dict of features.

        Nr0   r2   r4   r5   )r   r   r   r7   )r   r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r9   r3   r1   
feats_dictr   r   r   collect_feats   s<   &




	zESPnetGANTTSModel.collect_feats)
NNNNNNNNNT)	NNNNNNNNN)__name__
__module____qualname____doc__r   r   r	   r
   r   r   torchTensorboolr   strr   r;   r=   __classcell__r   r   r   r   r      s    


!	


o	
r   )T)rA   
contextlibr   typingr   r   r   rB   packaging.versionr   V	typeguardr   espnet2.gan_tts.abs_gan_ttsr   espnet2.layers.abs_normalizer	   #espnet2.layers.inversible_interfacer
   "espnet2.train.abs_gan_espnet_modelr   +espnet2.tts.feats_extract.abs_feats_extractr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s    