o
    %ݫiG2                     @   sl   d Z ddlZddlmZ ddlmZ ddlmZ eeZ	G dd deZ
G dd	 d	eZG d
d deZdS )aR   Specifies the inference interfaces for Text-To-Speech (TTS) modules.

Authors:
 * Aku Rouhe 2021
 * Peter Plantinga 2021
 * Loren Lugosch 2020
 * Mirco Ravanelli 2020
 * Titouan Parcollet 2021
 * Abdel Heba 2021
 * Andreas Nautsch 2022, 2023
 * Pooneh Mousavi 2023
 * Sylvain de Langen 2023
 * Adel Moumen 2023
 * Pradnya Kandarkar 2023
    N)length_to_mask)
Pretrained)
get_loggerc                       sH   e Zd ZdZdgZ fddZdddZdd	 Zd
d Zdd Z	  Z
S )HIFIGANa  
    A ready-to-use wrapper for HiFiGAN (mel_spec -> waveform).

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments are forwarded to ``Pretrained`` parent class.

    Example
    -------
    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir=tmpdir_vocoder)
    >>> mel_specs = torch.rand(2, 80,298)
    >>> waveforms = hifi_gan.decode_batch(mel_specs)
    >>> # You can use the vocoder coupled with a TTS system
    >>>	# Initialize TTS (tacotron2)
    >>> tmpdir_tts = getfixture('tmpdir') / "tts"
    >>> from speechbrain.inference.TTS import Tacotron2
    >>>	tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir=tmpdir_tts)
    >>>	# Running the TTS
    >>>	mel_output, mel_length, alignment = tacotron2.encode_text("Mary had a little lamb")
    >>>	# Running Vocoder (spectrogram-to-waveform)
    >>>	waveforms = hifi_gan.decode_batch(mel_output)
    	generatorc                    s(   t  j|i | | jjj| _d| _d S NT)super__init__hparamsr   	inferenceinfer
first_callselfargskwargs	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/inference/vocoders.pyr	   7   s   
zHIFIGAN.__init__Nc                 C   st   | j r| jj  d| _ t  | || j}W d   n1 s$w   Y  |dur8|dur8| 	|||}|S )a^  Computes waveforms from a batch of mel-spectrograms

        Arguments
        ---------
        spectrogram: torch.Tensor
            Batch of mel-spectrograms [batch, mels, time]
        mel_lens: torch.tensor
            A list of lengths of mel-spectrograms for the batch
            Can be obtained from the output of Tacotron/FastSpeech
        hop_len: int
            hop length used for mel-spectrogram extraction
            should be the same value as in the .yaml file

        Returns
        -------
        waveforms: torch.Tensor
            Batch of mel-waveforms [batch, 1, time]
        FN)
r   r
   r   remove_weight_normtorchno_gradr   todevice
mask_noise)r   spectrogrammel_lenshop_lenwaveformr   r   r   decode_batch<   s   
zHIFIGAN.decode_batchc                 C   @   | d}t|| |jd |jd }|| d |dS aa  Mask the noise caused by padding during batch inference

        Arguments
        ---------
        waveform: torch.tensor
            Batch of generated waveforms [batch, 1, time]
        mel_lens: torch.tensor
            A list of lengths of mel-spectrograms for the batch
            Can be obtained from the output of Tacotron/FastSpeech
        hop_len: int
            hop length used for mel-spectrogram extraction
            same value as in the .yaml file

        Returns
        -------
        waveform: torch.tensor
            Batch of waveforms without padded noise [batch, 1, time]
           )r   g        squeezer   shaper   boolmasked_fill_	unsqueezer   r   r   r   maskr   r   r   r   \      

zHIFIGAN.mask_noisec                 C   sb   | j r| jj  d| _ t  | |d| j	}W d   n1 s'w   Y  |
dS )a  Computes waveforms from a single mel-spectrogram

        Arguments
        ---------
        spectrogram: torch.Tensor
            mel-spectrogram [mels, time]

        Returns
        -------
        waveform: torch.Tensor
            waveform [1, time]
        audio can be saved by:
        >>> import torchaudio
        >>> waveform = torch.rand(1, 666666)
        >>> sample_rate = 22050
        >>> torchaudio.save(str(getfixture('tmpdir') / "test.wav"), waveform, sample_rate)
        Fr   N)r   r
   r   r   r   r   r   r)   r   r   r%   )r   r   r   r   r   r   decode_spectrogramw   s   

zHIFIGAN.decode_spectrogramc                 C   
   |  |S zDecodes the input spectrogramsr    r   r   r   r   r   forward      
zHIFIGAN.forward)NN__name__
__module____qualname____doc__HPARAMS_NEEDEDr	   r    r   r-   r2   __classcell__r   r   r   r   r      s    
 r   c                       sT   e Zd ZdZdgZ fddZ			dddZd	d
 Z		dddZdd Z	  Z
S )DiffWaveVocodera7  
    A ready-to-use inference wrapper for DiffWave as vocoder.
    The wrapper allows to perform generative tasks:
        locally-conditional generation: mel_spec -> waveform

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments are forwarded to ``Pretrained`` parent class.
    	diffusionc                    s2   t  j|i | t| jdr| jjj| _d S t)Ndiffwave)r   r	   hasattrr
   r<   r   r   NotImplementedErrorr   r   r   r   r	      s   zDiffWaveVocoder.__init__NFc                 C   sf   t   | jd||| j||d}W d   n1 sw   Y  |dur1|dur1| |||}|S )a  Generate waveforms from spectrograms

        Arguments
        ---------
        mel: torch.tensor
            spectrogram [batch, mels, time]
        hop_len: int
            Hop length during mel-spectrogram extraction
            Should be the same value as in the .yaml file
            Used to determine the output wave length
            Also used to mask the noise for vocoding task
        mel_lens: torch.tensor
            Used to mask the noise caused by padding
            A list of lengths of mel-spectrograms for the batch
            Can be obtained from the output of Tacotron/FastSpeech
        fast_sampling: bool
            whether to do fast sampling
        fast_sampling_noise_schedule: list
            the noise schedules used for fast sampling
        Returns
        -------
        waveforms: torch.tensor
            Batch of mel-waveforms [batch, 1, time]

        Funconditionalscale	conditionfast_samplingfast_sampling_noise_scheduleN)r   r   r   r   r   r   )r   melr   r   rD   rE   r   r   r   r   r       s   
!

zDiffWaveVocoder.decode_batchc                 C   r!   r"   r$   r*   r   r   r   r      r,   zDiffWaveVocoder.mask_noisec                 C   sT   t   | jd||d| j||d}W d   n1 s w   Y  |dS )a  Computes waveforms from a single mel-spectrogram

        Arguments
        ---------
        spectrogram: torch.tensor
            mel-spectrogram [mels, time]
        hop_len: int
            hop length used for mel-spectrogram extraction
            same value as in the .yaml file
        fast_sampling: bool
            whether to do fast sampling
        fast_sampling_noise_schedule: list
            the noise schedules used for fast sampling

        Returns
        -------
        waveform: torch.tensor
            waveform [1, time]

        audio can be saved by:
        >>> import torchaudio
        >>> waveform = torch.rand(1, 666666)
        >>> sample_rate = 22050
        >>> torchaudio.save(str(getfixture('tmpdir') / "test.wav"), waveform, sample_rate)
        Fr   r@   N)r   r   r   r)   r   r   r%   )r   r   r   rD   rE   r   r   r   r   r-      s   
 
z"DiffWaveVocoder.decode_spectrogramc                 C   r.   r/   r0   r1   r   r   r   r2     r3   zDiffWaveVocoder.forward)NFN)FNr4   r   r   r   r   r;      s    
/
*r;   c                       sD   e Zd ZdZdgZ fddZdddZddd	Zdd
dZ  Z	S )UnitHIFIGANa  
    A ready-to-use wrapper for Unit HiFiGAN (discrete units -> waveform).

    Arguments
    ---------
    *args : tuple
        See `Pretrained`
    **kwargs : dict
        See `Pretrained`

    Example
    -------
    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
    >>> hifi_gan = UnitHIFIGAN.from_hparams(source="speechbrain/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS", savedir=tmpdir_vocoder)
    >>> codes = torch.randint(0, 99, (100, 1))
    >>> waveform = hifi_gan.decode_unit(codes)
    r   c                    s.   t  j|i | | jjj| _d| _d| _d S r   )r   r	   r
   r   r   r   r   tokenizer   r   r   r   r	   9  s   
zUnitHIFIGAN.__init__Nc                 C   s   | j r| jj  d| _ |ddk rtd| jr|d7 }|dur(|| j}t	
  | j|| j|d}W d   |S 1 sCw   Y  |S )av  Computes waveforms from a batch of discrete units

        Arguments
        ---------
        units: torch.tensor
            Batch of discrete units [batch, codes]
        spk: torch.tensor
            Batch of speaker embeddings [batch, spk_dim]

        Returns
        -------
        waveforms: torch.tensor
            Batch of mel-waveforms [batch, 1, time]
        Fr#      zPThe 'units' argument should have a length of at least 3 because of padding size.Nspk)r   r
   r   r   size
ValueErrorrH   r   r   r   r   r   r   unitsrK   r   r   r   r   r    @  s"   

zUnitHIFIGAN.decode_batchc                 C   s   | j r| jj  d| _ |ddk rtd| jr|d }|dur+|d| j	}t
  | j|d| j	|d}W d   n1 sHw   Y  |dS )a1  Computes waveforms from a single sequence of discrete units
        Arguments
        ---------
        units: torch.tensor
            codes: [time]
        spk: torch.tensor
            spk: [spk_dim]
        Returns
        -------
        waveform: torch.tensor
            waveform [1, time]
        Fr      zPThe 'units' argument should have a length of at least 4 because of padding size.r#   NrJ   )r   r
   r   r   rL   rM   rH   r)   r   r   r   r   r   r%   rN   r   r   r   decode_unitc  s   

zUnitHIFIGAN.decode_unitc                 C   s   | j ||dS )zDecodes the input unitsrJ   r0   )r   rO   rK   r   r   r   r2     s   zUnitHIFIGAN.forward)N)
r5   r6   r7   r8   r9   r	   r    rQ   r2   r:   r   r   r   r   rG   $  s    

#!rG   )r8   r   speechbrain.dataio.dataior    speechbrain.inference.interfacesr   speechbrain.utils.loggerr   r5   loggerr   r;   rG   r   r   r   r   <module>   s    { 