o
    %ݫi!                      @   s<   d Z ddlZddlmZ G dd deZG dd deZdS )aO   Specifies the inference interfaces for speech and audio encoders.

Authors:
 * Aku Rouhe 2021
 * Peter Plantinga 2021
 * Loren Lugosch 2020
 * Mirco Ravanelli 2020
 * Titouan Parcollet 2021
 * Abdel Heba 2021
 * Andreas Nautsch 2022, 2023
 * Pooneh Mousavi 2023
 * Sylvain de Langen 2023
 * Adel Moumen 2023
 * Pradnya Kandarkar 2023
    N)
Pretrainedc                   @   s.   e Zd ZdZdgZdd Zdd Zdd Zd	S )
WaveformEncoderaV  A ready-to-use waveformEncoder model

    It can be used to wrap different embedding models such as SSL ones (wav2vec2)
    or speaker ones (Xvector) etc. Two functions are available: encode_batch and
    encode_file. They can be used to obtain the embeddings directly from an audio
    file or from a batch of audio tensors respectively.

    The given YAML must contain the fields specified in the *_NEEDED[] lists.

    Arguments
    ---------
    See ``Pretrained``

    Example
    -------
    >>> from speechbrain.inference.encoders import WaveformEncoder
    >>> tmpdir = getfixture("tmpdir")
    >>> ssl_model = WaveformEncoder.from_hparams(
    ...     source="speechbrain/ssl-wav2vec2-base-libri",
    ...     savedir=tmpdir,
    ... ) # doctest: +SKIP
    >>> ssl_model.encode_file("samples/audio_samples/example_fr.wav") # doctest: +SKIP
    encoderc                 K   s<   | j |fi |}|d}tdg}| ||}|d S )ah  Encode the given audiofile into a sequence of embeddings.

        Arguments
        ---------
        path : str
            Path to audio file which to encode.
        **kwargs : dict
            Arguments forwarded to ``load_audio``

        Returns
        -------
        torch.Tensor
            The audiofile embeddings produced by this system.
        r         ?
embeddings)
load_audio	unsqueezetorchtensorencode_batch)selfpathkwargswaveformbatch
rel_lengthresults r   R/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/inference/encoders.pyencode_file1   s
   
zWaveformEncoder.encode_filec                 C   s4   |  }|| j|| j}}| j||}|S )a  Encodes the input audio into a sequence of hidden states

        The waveforms should already be in the model's desired format.

        Arguments
        ---------
        wavs : torch.Tensor
            Batch of waveforms [batch, time, channels] or [batch, time]
            depending on the model.
        wav_lens : torch.Tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.

        Returns
        -------
        torch.Tensor
            The encoded batch
        )floattodevicemodsr   )r   wavswav_lensencoder_outr   r   r   r   G   s   zWaveformEncoder.encode_batchc                 C      |  ||S zRuns the encoderr   )r   r   r   r   r   r   forwarda      zWaveformEncoder.forwardN)__name__
__module____qualname____doc__MODULES_NEEDEDr   r   r    r   r   r   r   r      s    r   c                   @   sL   e Zd ZdZddgZdddZdd	 Zd
d Zdd ZdddZ	dd Z
dS )MelSpectrogramEncodera  A MelSpectrogramEncoder class created for the Zero-Shot Multi-Speaker TTS models.

    This is for speaker encoder models using the PyTorch MelSpectrogram transform for compatibility with the
    current TTS pipeline.

    This class can be used to encode a single waveform, a single mel-spectrogram, or a batch of mel-spectrograms.

    Arguments
    ---------
    See ``Pretrained``

    Example
    -------
    >>> import torchaudio
    >>> from speechbrain.inference.encoders import MelSpectrogramEncoder
    >>> # Model is downloaded from the speechbrain HuggingFace repo
    >>> tmpdir = getfixture("tmpdir")
    >>> encoder = MelSpectrogramEncoder.from_hparams(
    ...     source="speechbrain/tts-ecapa-voxceleb",
    ...     savedir=tmpdir,
    ... ) # doctest: +SKIP

    >>> # Compute embedding from a waveform (sample_rate must match the sample rate of the encoder)
    >>> signal, fs = torchaudio.load("tests/samples/single-mic/example1.wav") # doctest: +SKIP
    >>> spk_emb = encoder.encode_waveform(signal) # doctest: +SKIP

    >>> # Compute embedding from a mel-spectrogram (sample_rate must match the sample rate of the ecoder)
    >>> mel_spec = encoder.mel_spectogram(audio=signal) # doctest: +SKIP
    >>> spk_emb = encoder.encode_mel_spectrogram(mel_spec) # doctest: +SKIP

    >>> # Compute embeddings for a batch of mel-spectrograms
    >>> spk_embs = encoder.encode_mel_spectrogram_batch(mel_spec) # doctest: +SKIP
    
normalizerembedding_model   h㈵>c                 C   s   t t j||d| S )z+Dynamic range compression for audio signals)min)r	   logclamp)r   xCclip_valr   r   r   dynamic_range_compression   s   z/MelSpectrogramEncoder.dynamic_range_compressionc                 C   s~   ddl m} |j| jj| jj| jj| jj| jj| jj	| jj
| jj| jj| jj| jjd|j}||}| jjr=| |}|S )zcalculates MelSpectrogram for a raw audio signal

        Arguments
        ---------
        audio : torch.tensor
            input audio signal

        Returns
        -------
        mel : torch.Tensor
            Mel-spectrogram
        r   )
transforms)sample_rate
hop_length
win_lengthn_fftn_melsf_minf_maxpower
normalizednorm	mel_scale)
torchaudior3   MelSpectrogramhparamsr4   r5   r6   r7   n_mel_channelsmel_fminmel_fmaxr;   mel_normalizedr=   r>   r   r   r2   )r   audior3   audio_to_melmelr   r   r   mel_spectogram   s(   
z$MelSpectrogramEncoder.mel_spectogramc                 C   s"   | | j}| j|d}| |S )z
        Encodes a single waveform

        Arguments
        ---------

        wav : torch.Tensor
            waveform

        Returns
        -------
        encoder_out : torch.Tensor
            Speaker embedding for the input waveform
        )rF   )r   r   rI   encode_mel_spectrogram)r   wavmel_specr   r   r   encode_waveform   s   
z%MelSpectrogramEncoder.encode_waveformc                 C   s8   |}t |jdkr|d}tdg}| ||}|S )a  
        Encodes a single mel-spectrograms

        Arguments
        ---------

        mel_spec : torch.Tensor
            Mel-spectrograms

        Returns
        -------
        encoder_out : torch.Tensor
            Speaker embedding for the input mel-spectrogram
           r   r   )lenshaper   r	   r
   encode_mel_spectrogram_batch)r   rL   r   r   r   r   r   r   rJ      s   
z,MelSpectrogramEncoder.encode_mel_spectrogramNc                 C   sd   |du rt j|jd | jd}|| j|| j}}t |dd}| j||}| j|}|S )ap  
        Encodes a batch of mel-spectrograms

        Arguments
        ---------

        mel_specs : torch.Tensor
            Mel-spectrograms
        lens : torch.Tensor
            Relative lengths of the mel-spectrograms

        Returns
        -------
        encoder_out : torch.Tensor
            Speaker embedding for the input mel-spectrogram batch
        Nr   )r   r*   rN   )	r	   onesrP   r   r   	transposerA   r(   r)   )r   	mel_specslensfeatsr   r   r   r   rQ      s   z2MelSpectrogramEncoder.encode_mel_spectrogram_batchc                 C   r   r   r   )r   rT   rU   r   r   r   	__forward  r!   zMelSpectrogramEncoder.__forward)r*   r+   )N)r"   r#   r$   r%   r&   r2   rI   rM   rJ   rQ   _MelSpectrogramEncoder__forwardr   r   r   r   r'   f   s    "
$
 r'   )r%   r	    speechbrain.inference.interfacesr   r   r'   r   r   r   r   <module>   s
    P