o
    ei                     @   sv   d Z ddlZddlm  mZ ddlZddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZ G dd de	ZdS )	aN   Specifies the inference interfaces for interpretability modules.

Authors:
 * Aku Rouhe 2021
 * Peter Plantinga 2021
 * Loren Lugosch 2020
 * Mirco Ravanelli 2020
 * Titouan Parcollet 2021
 * Abdel Heba 2021
 * Andreas Nautsch 2022, 2023
 * Pooneh Mousavi 2023
 * Sylvain de Langen 2023
 * Adel Moumen 2023
 * Pradnya Kandarkar 2023
    N)
Pretrained)spectral_phase)
split_path)LocalStrategyfetchc                       sT   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d ZdddZ	dddZ
  ZS )PIQAudioInterpreterah  
    This class implements the interface for the PIQ posthoc interpreter for an audio classifier.

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments are forwarded to ``Pretrained`` parent class.

    Example
    -------
    >>> from speechbrain.inference.interpretability import PIQAudioInterpreter
    >>> tmpdir = getfixture("tmpdir")
    >>> interpreter = PIQAudioInterpreter.from_hparams(
    ...     source="speechbrain/PIQ-ESC50",
    ...     savedir=tmpdir,
    ... )
    >>> signal = torch.randn(1, 16000)
    >>> interpretation, _ = interpreter.interpret_batch(signal)
    c                    s   t  j|i | d S N)super__init__)selfargskwargs	__class__ d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/inference/interpretability.pyr
   2   s   zPIQAudioInterpreter.__init__c                 C   s6   | j |}tjjj|| jjd}t	|}|||fS )z#Pre-process wavs to calculate STFTs)power)
modscompute_stftspeechbrain
processingfeaturesspectral_magnitudehparamsspec_mag_powertorchlog1p)r   wavsX_stftX_stft_powerX_stft_logpowerr   r   r   
preprocess5   s   

zPIQAudioInterpreter.preprocessc                 C   s>   | j |}|d}| j |d}|d}||||fS )z#the forward pass for the classifier)   )r   embedding_modelmean
classifiersqueezeargmax)r   r    hcat
embeddingspredictions
class_predr   r   r   classifier_forward?   s
   

z&PIQAudioInterpreter.classifier_forwardc                 C   s|   t jt |dt |dfdd}|ddd|jd ddddf }|jdkr2|d}|| }| j|}|S )z!Inverts STFT spectra given phase.r"   )dimNr$      )	r   catcos	unsqueezesinshapendimr   compute_istft)r   X_intX_stft_phaseX_stft_phase_sbX_wpsbx_int_sbr   r   r   invert_stft_with_phaseG   s   &

z*PIQAudioInterpreter.invert_stft_with_phasec                 C   s  | | j}| |\}}}t|}| |\}}}}	| jjr*| j||	\}
}}n| jj	|}
|

d}
|
jd }| jjrTt|
}
|
|ddd|ddf  }nt|
}
|
 | jj }|
|k|ddd|ddf  }t|}| ||}| jj|	d}||fS )a#  Classifies the given audio into the given set of labels.
        It also provides the interpretation in the audio domain.

        Arguments
        ---------
        wavs : torch.Tensor
            Batch of waveforms [batch, time, channels] or [batch, time]
            depending on the model. Make sure the sample rate is fs=16000 Hz.

        Returns
        -------
        x_int_sound_domain : torch.Tensor
            The interpretation in the waveform domain
        text_lab : str
            The text label for the classification
        r$   Nr   )todevicer!   r   r.   r   use_vqr   psidecoderr(   r5   use_mask_outputFsigmoidsoftplusmaxmask_thr   expm1r=   label_encoderdecode_torchr3   )r   r   r    r   r   r9   r*   r+   r,   r-   xhatz_q_xTmaxr8   thx_int_sound_domaintext_labr   r   r   interpret_batchX   s.   


 
"
z#PIQAudioInterpreter.interpret_batchNc                 C   s   t |\}}t|||tjd}t|\}}|| j}| jj	}||krCt
d|| tjj||d| j}|jddd}||}| |\}	}
|	|
|fS )aQ  Classifies the given audiofile into the given set of labels.
        It also provides the interpretation in the audio domain.

        Arguments
        ---------
        path : str
            Path to audio file to classify.
        savedir : str
            Path to cache directory.

        Returns
        -------
        x_int_sound_domain : torch.Tensor
            The interpretation in the waveform domain
        text_lab : str
            The text label for the classification
        fs_model : int
            The sampling frequency of the model. Useful to save the audio.
        )sourcesavedirlocal_strategyz(Resampling the audio from {} Hz to {} Hz)	orig_freqnew_freqr   T)r/   keepdim)r   r   r   SYMLINK
torchaudioloadr>   r?   r   sample_rateprintformat
transformsResampler&   rR   )r   pathrT   rS   flbatchfs_filefs_modeltfrP   rQ   r   r   r   interpret_file   s2   
z"PIQAudioInterpreter.interpret_filec                 C   s   |  ||S )zRuns the classification)rR   )r   r   wav_lensr   r   r   forward   s   zPIQAudioInterpreter.forwardr   )__name__
__module____qualname____doc__r
   r!   r.   r=   rR   rg   ri   __classcell__r   r   r   r   r      s    

/0r   )rm   r   torch.nn.functionalnn
functionalrD   rZ   r    speechbrain.inference.interfacesr   speechbrain.processing.NMFr   speechbrain.utils.data_utilsr   speechbrain.utils.fetchingr   r   r   r   r   r   r   <module>   s    