o
    iw0                     @   s   d Z ddlmZ ddlmZmZmZ ddlZddlm	Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ e
eje
dkrPddlmZ nedddZG dd deZdS )zText-to-speech ESPnet model.    )contextmanager)DictOptionalTupleN)parse)check_argument_types)AbsNormalize)InversibleInterface)AbsESPnetModel)AbsTTS)AbsFeatsExtractz1.6.0)autocastTc                 c   s    d V  d S )N )enabledr   r   L/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tts/espnet_model.pyr      s   
r   c                !       s   e Zd ZdZdee dee dee deeoe deeoe deeo#e def fd	d
Z										d de
jde
jde
jde
jdee
j dee
j dee
j dee
j dee
j dee
j dee
j dee
j dee
j dee
jeee
jf e
jf fddZ									d de
jde
jde
jde
jdee
j dee
j dee
j dee
j dee
j dee
j dee
j dee
j dee
j deee
jf fddZ							d!de
jdee
j dee
j dee
j dee
j dee
j dee
j dee
j deee
jf fddZ  ZS )"ESPnetTTSModelz%ESPnet model for text-to-speech task.feats_extractpitch_extractenergy_extract	normalizepitch_normalizeenergy_normalizettsc                    sB   t  sJ t   || _|| _|| _|| _|| _|| _|| _	dS )z!Initialize ESPnetTTSModel module.N)
r   super__init__r   r   r   r   r   r   r   )selfr   r   r   r   r   r   r   	__class__r   r   r      s   


zESPnetTTSModel.__init__Ntexttext_lengthsspeechspeech_lengths	durationsdurations_lengthspitchpitch_lengthsenergyenergy_lengthsspembssidslidsreturnc                 K   s  t dl | jdur| ||\}}n||}}| jdur-|du r-| j|||||d\}}| jdurB|	du rB| j|||||d\}	}
| jdurO| ||\}}| jdur\| ||\}}| jduri| |	|
\}	}
W d   n1 ssw   Y  t||||d}|dur|j|d |dur|j|d |dur|j|d |dur|j||d | jdur|dur|j||d	 | jdur|	dur|j|	|
d
 | j	j
r|j||d | j	di |S )ac  Caclualte outputs and return the loss tensor.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).
            duration (Optional[Tensor]): Duration tensor.
            duration_lengths (Optional[Tensor]): Duration length tensor (B,).
            pitch (Optional[Tensor]): Pitch tensor.
            pitch_lengths (Optional[Tensor]): Pitch length tensor (B,).
            energy (Optional[Tensor]): Energy tensor.
            energy_lengths (Optional[Tensor]): Energy length tensor (B,).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, D).
            sids (Optional[Tensor]): Speaker ID tensor (B, 1).
            lids (Optional[Tensor]): Language ID tensor (B, 1).
            kwargs: "utt_id" is among the input.

        Returns:
            Tensor: Loss scalar tensor.
            Dict[str, float]: Statistics to be monitored.
            Tensor: Weight tensor to summarize losses.

        FNfeats_lengthsr"   r#   )r   r   featsr-   r(   r)   r*   )r"   r#   r$   r%   r&   r'   )r    r!   r   )r   r   r   r   r   r   r   dictupdater   require_raw_speech)r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   kwargsr.   r-   batchr   r   r   forward4   sb   
)




	

#zESPnetTTSModel.forwardc                 K   s   | j dur|  ||\}}n||}}| jdur$| j|||||d\}}| jdur5| j|||||d\}	}
t||d}|durF|j||d |	durQ|j|	|
d |S )a  Caclualte features and return them as a dict.

        Args:
            text (Tensor): Text index tensor (B, T_text).
            text_lengths (Tensor): Text length tensor (B,).
            speech (Tensor): Speech waveform tensor (B, T_wav).
            speech_lengths (Tensor): Speech length tensor (B,).
            durations (Optional[Tensor): Duration tensor.
            durations_lengths (Optional[Tensor): Duration length tensor (B,).
            pitch (Optional[Tensor): Pitch tensor.
            pitch_lengths (Optional[Tensor): Pitch length tensor (B,).
            energy (Optional[Tensor): Energy tensor.
            energy_lengths (Optional[Tensor): Energy length tensor (B,).
            spembs (Optional[Tensor]): Speaker embedding tensor (B, D).
            sids (Optional[Tensor]): Speaker ID tensor (B, 1).
            lids (Optional[Tensor]): Language ID tensor (B, 1).

        Returns:
            Dict[str, Tensor]: Dict of features.

        Nr,   )r.   r-   r2   r3   )r   r   r   r4   r5   )r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r7   r.   r-   
feats_dictr   r   r   collect_feats   s2   
'




	zESPnetTTSModel.collect_featsc	                 K   s$  t |d}
|	d st| jddrK|du rtd| jdur)| |d d d }n|}| jdur;| |d d d }|
j|d | jjrK|
j|d	 |	d r|durY|
j|d
 | jdurt| j|d t	
t|g|d dd d }| jdur| |d d d }|dur|
j|d | jdur| j|d t	
t|g|d dd d }| jdur| |d d d }|dur|
j|d |dur|
j|d |dur|
j|d |dur|
j|d | jjdi |
|	}| jdur|ddur| j|d  d d d }|j|d |S )a_  Caclualte features and return them as a dict.

        Args:
            text (Tensor): Text index tensor (T_text).
            speech (Tensor): Speech waveform tensor (T_wav).
            spembs (Optional[Tensor]): Speaker embedding tensor (D,).
            sids (Optional[Tensor]): Speaker ID tensor (1,).
            lids (Optional[Tensor]): Language ID tensor (1,).
            durations (Optional[Tensor): Duration tensor.
            pitch (Optional[Tensor): Pitch tensor.
            energy (Optional[Tensor): Energy tensor.

        Returns:
            Dict[str, Tensor]: Dict of outputs.

        )r   use_teacher_forcinguse_gstFNz#missing required argument: 'speech'r   )r.   )r    )r"   )r-   r"   )r$   )r&   r/   r0   r1   feat_gen)feat_gen_denormr   )r4   getattrr   RuntimeErrorr   r   r5   r6   r   torch
LongTensorlenr   r   r   	inferencegetinverseclone)r   r   r    r(   r)   r*   r"   r$   r&   decode_config
input_dictr.   output_dictr?   r   r   r   rE      sx   






zESPnetTTSModel.inference)	NNNNNNNNN)NNNNNNN)__name__
__module____qualname____doc__r   r   r   r	   r   r   rB   Tensorr   r   strr9   r;   rE   __classcell__r   r   r   r   r      s    


	

k	

I	r   )T)rO   
contextlibr   typingr   r   r   rB   packaging.versionr   V	typeguardr   espnet2.layers.abs_normalizer   #espnet2.layers.inversible_interfacer	   espnet2.train.abs_espnet_modelr
   espnet2.tts.abs_ttsr   +espnet2.tts.feats_extract.abs_feats_extractr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s    