o
    %ݫixy                     @   s   d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZ eeZG d
d deZG dd deZG dd deZG dd deZdS )aR   Specifies the inference interfaces for Text-To-Speech (TTS) modules.

Authors:
 * Aku Rouhe 2021
 * Peter Plantinga 2021
 * Loren Lugosch 2020
 * Mirco Ravanelli 2020
 * Titouan Parcollet 2021
 * Abdel Heba 2021
 * Andreas Nautsch 2022, 2023
 * Pooneh Mousavi 2023
 * Sylvain de Langen 2023
 * Adel Moumen 2023
 * Pradnya Kandarkar 2023
    N)EncoderClassifier)MelSpectrogramEncoder)
Pretrained)GraphemeToPhoneme)fetch)
get_logger)text_to_sequencec                       sH   e Zd ZdZddgZ fddZdd Zdd	 Zd
d Zdd Z	  Z
S )	Tacotron2a  
    A ready-to-use wrapper for Tacotron2 (text -> mel_spec).

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments are forwarded to ``Pretrained`` parent class.

    Example
    -------
    >>> tmpdir_tts = getfixture('tmpdir') / "tts"
    >>> tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir=tmpdir_tts)
    >>> mel_output, mel_length, alignment = tacotron2.encode_text("Mary had a little lamb")
    >>> items = [
    ...   "A quick brown fox jumped over the lazy dog",
    ...   "How much wood would a woodchuck chuck?",
    ...   "Never odd or even"
    ... ]
    >>> mel_outputs, mel_lengths, alignments = tacotron2.encode_batch(items)

    >>> # One can combine the TTS model with a vocoder (that generates the final waveform)
    >>> # Initialize the Vocoder (HiFIGAN)
    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
    >>> from speechbrain.inference.vocoders import HIFIGAN
    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir=tmpdir_vocoder)
    >>> # Running the TTS
    >>> mel_output, mel_length, alignment = tacotron2.encode_text("Mary had a little lamb")
    >>> # Running Vocoder (spectrogram-to-waveform)
    >>> waveforms = hifi_gan.decode_batch(mel_output)
    modelr   c                    s4   t  j|i | t| jddg| _| jjj| _d S )Ntext_cleanersenglish_cleaners)super__init__getattrhparamsr   r
   inferselfargskwargs	__class__ M/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/inference/TTS.pyr   F   s
   
zTacotron2.__init__c                 C   s   | j || j}|t|fS zHEncodes raw text into a tensor with a customer text-to-sequence function)r   r   r   lenr   txtsequencer   r   r   text_to_seqM   s   zTacotron2.text_to_seqc                    s   t  @  fdd|D }tjj|} fdd|D }|t|ddks*J dt j| jd} 	|j
j|\}}}W d   n1 sGw   Y  |||fS )	aN  Computes mel-spectrogram for a list of texts

        Texts must be sorted in decreasing order on their lengths

        Arguments
        ---------
        texts: List[str]
            texts to be encoded into spectrogram

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        c                    *   g | ]}d t j |d  jdiqS text_sequencesr   device)torchtensorr   r$   .0itemr   r   r   
<listcomp>a       z*Tacotron2.encode_batch.<locals>.<listcomp>c                    s   g | ]	}  |d  qS )   )r   r'   r*   r   r   r+   k   s    Treverse0input lengths must be sorted in decreasing orderr#   N)r%   no_gradspeechbraindataiobatchPaddedBatchsortedr&   r$   r   r"   data)r   textsinputslensinput_lengthsmel_outputs_postnetmel_lengths
alignmentsr   r*   r   encode_batchR   s"   



zTacotron2.encode_batchc                 C   s   |  |gS )z$Runs inference for a single text strr?   )r   textr   r   r   encode_textv   s   zTacotron2.encode_textc                 C   s
   |  |S )zEncodes the input texts.r@   )r   r8   r   r   r   forwardz   s   
zTacotron2.forward)__name__
__module____qualname____doc__HPARAMS_NEEDEDr   r   r?   rB   rC   __classcell__r   r   r   r   r	   #   s     $r	   c                       sN   e Zd ZdZdgZ fddZdd Zdd Zd	d
 Zdd Z	dd Z
  ZS )MSTacotron2a  
    A ready-to-use wrapper for Zero-Shot Multi-Speaker Tacotron2.
    For voice cloning: (text, reference_audio) -> (mel_spec).
    For generating a random speaker voice: (text) -> (mel_spec).

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments are forwarded to ``Pretrained`` parent class.

    Example
    -------
    >>> tmpdir_tts = getfixture('tmpdir') / "tts"
    >>> mstacotron2 = MSTacotron2.from_hparams(source="speechbrain/tts-mstacotron2-libritts", savedir=tmpdir_tts) # doctest: +SKIP
    >>> # Sample rate of the reference audio must be greater or equal to the sample rate of the speaker embedding model
    >>> reference_audio_path = "tests/samples/single-mic/example1.wav"
    >>> input_text = "Mary had a little lamb."
    >>> mel_output, mel_length, alignment = mstacotron2.clone_voice(input_text, reference_audio_path) # doctest: +SKIP
    >>> # One can combine the TTS model with a vocoder (that generates the final waveform)
    >>> # Initialize the Vocoder (HiFIGAN)
    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
    >>> from speechbrain.inference.vocoders import HIFIGAN
    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-libritts-22050Hz", savedir=tmpdir_vocoder) # doctest: +SKIP
    >>> # Running the TTS
    >>> mel_output, mel_length, alignment = mstacotron2.clone_voice(input_text, reference_audio_path) # doctest: +SKIP
    >>> # Running Vocoder (spectrogram-to-waveform)
    >>> waveforms = hifi_gan.decode_batch(mel_output) # doctest: +SKIP
    >>> # For generating a random speaker voice, use the following
    >>> mel_output, mel_length, alignment = mstacotron2.generate_random_voice(input_text) # doctest: +SKIP
    r
   c                    s   t  j|i | dg| _| jjj| _| jj| _tj| jj	d| j
id| _	d | _| jr:tj| jjd| j
id| _d S tj| jjd| j
id| _d S )Nr   r$   )run_opts)sourcerK   )r   r   r   r   r
   r   custom_mel_spec_encoderr   from_hparamsg2pr$   spk_emb_encoderr   r   r   r   r   r   r      s"   
zMSTacotron2.__init__c                 C   s   t || j}|t|fS r   )r   r   r   r   r   r   r   __text_to_seq   s   zMSTacotron2.__text_to_seqc           	      C   s   t |\}}|| jjkrt j||| jj}|| j}| jr'| j	
|}n| j	|}|d}t|tr:|g}| |}tt|D ]}d|| ||< d||  d ||< qE|t|d}| ||S )aD  
        Generates mel-spectrogram using input text and reference audio

        Arguments
        ---------
        texts : str or list
            Input text
        audio_path : str
            Reference audio

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        r    {}r-   )
torchaudioloadr   spk_emb_sample_rate
functionalresampletor$   rM   rP   encode_waveformr?   squeeze
isinstancestrrO   ranger   joinrepeat_MSTacotron2__encode_batch)	r   r8   
audio_path
ref_signal	signal_srspk_embphoneme_seqsispk_embsr   r   r   clone_voice   s$   



zMSTacotron2.clone_voicec                 C   s   |    }|| j}t|tr|g}| |}tt|D ]}d	|| ||< d||  d ||< q|
t|d}| ||S )a  
        Generates mel-spectrogram using input text and a random speaker voice

        Arguments
        ---------
        texts : str or list
            Input text

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        rR   rS   rT   r-   )#_MSTacotron2__sample_random_speakerfloatrZ   r$   r]   r^   rO   r_   r   r`   ra   rb   )r   r8   rf   rg   rh   ri   r   r   r   generate_random_voice   s   

z!MSTacotron2.generate_random_voicec           	         s   t  H  fdd|D }t|dd dd}dd |D }tjj|}|t|ddks1J d	t j| jd
} 	|j
j||\}}}W d   n1 sOw   Y  |||fS )a  Computes mel-spectrograms for a list of texts
        Texts are sorted in decreasing order on their lengths

        Arguments
        ---------
        texts: List[str]
            texts to be encoded into spectrogram
        spk_embs: torch.Tensor
            speaker embeddings

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        c                    r    r!   )r%   r&   _MSTacotron2__text_to_seqr$   r'   r*   r   r   r+     r,   z.MSTacotron2.__encode_batch.<locals>.<listcomp>c                 S   s   | d   d S )Nr"   r   size)xr   r   r   <lambda>(  s    z,MSTacotron2.__encode_batch.<locals>.<lambda>T)keyr/   c                 S   s   g | ]
}|d    d qS )r"   r   ro   )r(   entryr   r   r   r+   ,  s    r.   r0   r#   N)r%   r1   r6   r2   r3   r4   r5   r&   r$   r   r"   r7   )	r   r8   ri   r9   r:   r;   r<   r=   r>   r   r*   r   __encode_batch  s,   

	


zMSTacotron2.__encode_batchc           
         s   t | jj| jj| jjjd}t|}|d }|d }|d }t|}d|t	
d|d < tjd|jd}t||dk D ])}tjj|| ||  t fdd	tt|| D }	tj||	fdd
}q?|S )zSamples a random speaker embedding from a pretrained GMM

        Returns
        -------
        x: torch.Tensor
            A randomly sampled speaker embedding
        )filenamerL   savedirgmm_n_components	gmm_meansgmm_covariancesr-   r   r#   c                    s   g | ]}   qS r   )sample)r(   _d_kr   r   r+   Y      z7MSTacotron2.__sample_random_speaker.<locals>.<listcomp>)dim)r   r   random_speaker_samplerrandom_speaker_sampler_source
pretrainer
collect_inr%   rV   zerosrandomrandintemptyr$   arangedistributionsmultivariate_normalMultivariateNormalstackr_   intcat)
r   speaker_gmm_local_pathrandom_speaker_gmmrx   ry   rz   countsrq   kx_kr   r}   r   __sample_random_speaker:  s&   


$z#MSTacotron2.__sample_random_speaker)rD   rE   rF   rG   rH   r   rn   rj   rm   rb   rk   rI   r   r   r   r   rJ      s     0.rJ   c                       sT   e Zd ZdZg dZ fddZdddZ	ddd	Z	dd
dZdddZ	  Z
S )FastSpeech2a  
    A ready-to-use wrapper for Fastspeech2 (text -> mel_spec).

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments are forwarded to ``Pretrained`` parent class.

    Example
    -------
    >>> tmpdir_tts = getfixture('tmpdir') / "tts"
    >>> fastspeech2 = FastSpeech2.from_hparams(source="speechbrain/tts-fastspeech2-ljspeech", savedir=tmpdir_tts) # doctest: +SKIP
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(["Mary had a little lamb."]) # doctest: +SKIP
    >>> items = [
    ...   "A quick brown fox jumped over the lazy dog",
    ...   "How much wood would a woodchuck chuck?",
    ...   "Never odd or even"
    ... ]
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(items) # doctest: +SKIP
    >>>
    >>> # One can combine the TTS model with a vocoder (that generates the final waveform)
    >>> # Initialize the Vocoder (HiFIGAN)
    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
    >>> from speechbrain.inference.vocoders import HIFIGAN
    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir=tmpdir_vocoder) # doctest: +SKIP
    >>> # Running the TTS
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(["Mary had a little lamb."]) # doctest: +SKIP
    >>> # Running Vocoder (spectrogram-to-waveform)
    >>> waveforms = hifi_gan.decode_batch(mel_outputs) # doctest: +SKIP
    )spn_predictorr
   input_encoderc                    sp   t  j|i | | jj}dg| }| jj| _| jj|dd | j  td| _	| j
dg  | _d S )N@@Fsequence_inputspeechbrain/soundchoice-g2pspn)r   r   r   lexiconr   update_from_iterableadd_unkr   rN   rO   encode_sequence_torchr   r)   spn_token_encodedr   r   r   r   r   r   r   r     s   


zFastSpeech2.__init__      ?c                 C   s`  t  }t  }t  }|D ]_}t  }	t  }
t  }| }dd |D }| |}tt|D ].}|| }|D ]}| sI|	| |
d |d q4d|
d< || d dv rZd|d< q,||	 ||
 || qt  }d}tt|D ]}|| }	| j|		 
| j}t|| 
| j}
| jjd |d|
d	 }t|d }tt|| D ]}|| | dkr|| qt  }t|jd D ]}|||   ||v r|| j qt|
| j}|| ||jd k r|jd }qvtt||
| j}|  t|D ]\}}|||dt|f< q| j||||d	S )
  Computes mel-spectrogram for a list of texts

        Arguments
        ---------
        texts: List[str]
            texts to be converted to spectrogram
        pace: float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        c                 S   s   g | ]}|  qS r   )strip)r(   wordr   r   r   r+     r   z+FastSpeech2.encode_text.<locals>.<listcomp>r   r-   z:;-,.!?r   Npace
pitch_rateenergy_rate)listsplitrO   r_   r   isspaceappendr   r   r   rZ   r$   r%   
LongTensorr   modulesr   	unsqueezenonzeroreshapetolistshaper)   r   zero_	enumerater?   )r   r8   r   r   r   phoneme_labelslast_phonemes_combinedpunc_positionslabelphoneme_labellast_phonemespunc_positionwordswords_phonemesrh   words_phonemes_seqphonemeall_tokens_with_spnmax_seq_len	token_seq	spn_preds
spn_to_addjtokens_with_spn	token_idxtokens_with_spn_tensor_paddedseq_idxseqr   r   r   rB     s   










zFastSpeech2.encode_textc                 C      g }d}|D ]}| j | | j}||jd k r |jd }|| qtt	||| j}	|	
  t|D ]\}
}||	|
dt	|f< q:| j|	|||dS a  Computes mel-spectrogram for a list of phoneme sequences

        Arguments
        ---------
        phonemes: List[List[str]]
            phonemes to be converted to spectrogram
        pace: float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies

        Returns
        -------
        tensors of output spectrograms, output lengths and alignments
        r   Nr   r   r   r   rZ   r$   r   r   r%   r   r   r   r   r?   r   phonemesr   r   r   
all_tokensr   r   r   tokens_paddedr   r   r   r   r   encode_phoneme  ,   

zFastSpeech2.encode_phonemec           
   	   C   sd   t    | jj||||d\}}}}}}	}}|dd}W d   n1 s'w   Y  ||||	fS aH  Batch inference for a tensor of phoneme sequences

        Arguments
        ---------
        tokens_padded : torch.Tensor
            A sequence of encoded phonemes to be converted to spectrogram
        pace : float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies

        Returns
        -------
        post_mel_outputs : torch.Tensor
        durations : torch.Tensor
        pitch : torch.Tensor
        energy : torch.Tensor
        r   r   r-   Nr%   r1   r   r
   	transpose
r   r   r   r   r   r|   post_mel_outputs	durationspitchenergyr   r   r   r?   ,  s&   

	zFastSpeech2.encode_batchc                 C      | j |g|||dS a  Batch inference for a tensor of phoneme sequences

        Arguments
        ---------
        text : str
            A text to be converted to spectrogram
        pace : float
            pace for the speech synthesis
        pitch_rate : float
            scaling factor for phoneme pitches
        energy_rate : float
            scaling factor for phoneme energies

        Returns
        -------
        Encoded text
        r   rB   r   rA   r   r   r   r   r   r   rC   Y     
zFastSpeech2.forwardr   r   r   )rD   rE   rF   rG   rH   r   rB   r   r?   rC   rI   r   r   r   r   r   `  s     
l
1
-r   c                       s\   e Zd ZdZddgZ fddZdddZd	d
 Z	dddZ	dddZ	dddZ
  ZS )FastSpeech2InternalAlignmenta  
    A ready-to-use wrapper for Fastspeech2 with internal alignment(text -> mel_spec).

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments are forwarded to ``Pretrained`` parent class.

    Example
    -------
    >>> tmpdir_tts = getfixture('tmpdir') / "tts"
    >>> fastspeech2 = FastSpeech2InternalAlignment.from_hparams(source="speechbrain/tts-fastspeech2-internal-alignment-ljspeech", savedir=tmpdir_tts) # doctest: +SKIP
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(["Mary had a little lamb."]) # doctest: +SKIP
    >>> items = [
    ...   "A quick brown fox jumped over the lazy dog",
    ...   "How much wood would a woodchuck chuck?",
    ...   "Never odd or even"
    ... ]
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(items) # doctest: +SKIP
    >>> # One can combine the TTS model with a vocoder (that generates the final waveform)
    >>> # Initialize the Vocoder (HiFIGAN)
    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
    >>> from speechbrain.inference.vocoders import HIFIGAN
    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir=tmpdir_vocoder) # doctest: +SKIP
    >>> # Running the TTS
    >>> mel_outputs, durations, pitch, energy = fastspeech2.encode_text(["Mary had a little lamb."]) # doctest: +SKIP
    >>> # Running Vocoder (spectrogram-to-waveform)
    >>> waveforms = hifi_gan.decode_batch(mel_outputs) # doctest: +SKIP
    r
   r   c                    sX   t  j|i | | jj}dg| }| jj| _| jj|dd | j  td| _	d S )Nr   Fr   r   )
r   r   r   r   r   r   r   r   rN   rO   r   r   r   r   r     s   


z%FastSpeech2InternalAlignment.__init__r   c                 C   s   t  }d}|D ]$}| | j|}|t|k rt|}| j| | j}	|	|	 qt
t||| j}
|
  t|D ]\}}||
|dt|f< q@| j|
|||dS )r   r   Nr   )r   _g2p_keep_punctuationsrO   r   r   r   r   rZ   r$   r   r%   r   r   r   r?   )r   r8   r   r   r   r   r   r   phonemes_with_puncr   r   r   r   r   r   r   rB     s.   
z(FastSpeech2InternalAlignment.encode_textc                 C   sl  t d|}|D ] }|dd}|dd}|dd}|dd}|||}qt d|}z||}W n tyI   td|  t  Y nw d|d	}g }	d
}
z|D ]}|dvrn|		||
 d |
d7 }
qY|	
| qYW n1 ty   td|  |D ]}|dvr||}dd |D }|		| q|	
| qY nw d|	v r|	d d|	v s|	S )zBdo grapheme to phoneme and keep the punctuations between the wordsz\w+[-':\.][-':\.\w]*\w+- ':.z[\w]+|[-!'(),.:;? ]zerror with text: rR   r   z-!'(),.:;? r-   zEDo g2p word by word because of unexpected outputs from g2p for text: c                 S   s   g | ]}|d kr|qS )rR   r   )r(   rh   r   r   r   r+     s    zGFastSpeech2InternalAlignment._g2p_keep_punctuations.<locals>.<listcomp>)refindallreplaceRuntimeErrorloggerinfoquitr`   r   extendr   
IndexErrorwarningrO   remove)r   	g2p_modelrA   special_wordsspecial_wordrmpall_r   word_phonemesr   countrh   pp_without_spacer   r   r   r     sP   



z3FastSpeech2InternalAlignment._g2p_keep_punctuationsc                 C   r   r   r   r   r   r   r   r     r   z+FastSpeech2InternalAlignment.encode_phonemec           
      C   sl   t  $ | jj||||d\}}}}}}	}}}}}}|dd}W d   n1 s+w   Y  ||||	fS r   r   r   r   r   r   r?   2  s.   
	z)FastSpeech2InternalAlignment.encode_batchc                 C   r   r   r   r   r   r   r   rC   c  r   z$FastSpeech2InternalAlignment.forwardr   )rD   rE   rF   rG   rH   r   rB   r   r   r?   rC   rI   r   r   r   r   r   p  s    

43
1
1r   )rG   r   r   r%   rU   r2   !speechbrain.inference.classifiersr   speechbrain.inference.encodersr    speechbrain.inference.interfacesr   speechbrain.inference.textr   speechbrain.utils.fetchingr   speechbrain.utils.loggerr   "speechbrain.utils.text_to_sequencer   rD   r   r	   rJ   r   r   r   r   r   r   <module>   s*    \ b  