o
    %ݫi                     @   s   d Z ddlZddlmZ ddlm  mZ G dd dejj	Z
G dd dejj	ZG dd	 d	ejj	ZG d
d dejj	ZG dd dejj	ZG dd dejj	ZG dd dejj	ZdddZdd Zdd ZdddZdS )ak	  Multi-microphone components.

This library contains functions for multi-microphone signal processing.

Example
-------
>>> import torch
>>>
>>> from speechbrain.dataio.dataio import read_audio
>>> from speechbrain.processing.features import STFT, ISTFT
>>> from speechbrain.processing.multi_mic import Covariance
>>> from speechbrain.processing.multi_mic import GccPhat, SrpPhat, Music
>>> from speechbrain.processing.multi_mic import DelaySum, Mvdr, Gev
>>>
>>> xs_speech = read_audio(
...    'tests/samples/multi-mic/speech_-0.82918_0.55279_-0.082918.flac'
... )
>>> xs_speech = xs_speech.unsqueeze(0) # [batch, time, channels]
>>> xs_noise_diff = read_audio('tests/samples/multi-mic/noise_diffuse.flac')
>>> xs_noise_diff = xs_noise_diff.unsqueeze(0)
>>> xs_noise_loc = read_audio('tests/samples/multi-mic/noise_0.70225_-0.70225_0.11704.flac')
>>> xs_noise_loc =  xs_noise_loc.unsqueeze(0)
>>> fs = 16000 # sampling rate

>>> ss = xs_speech
>>> nn_diff = 0.05 * xs_noise_diff
>>> nn_loc = 0.05 * xs_noise_loc
>>> xs_diffused_noise = ss + nn_diff
>>> xs_localized_noise = ss + nn_loc

>>> # Delay-and-Sum Beamforming with GCC-PHAT localization
>>> stft = STFT(sample_rate=fs)
>>> cov = Covariance()
>>> gccphat = GccPhat()
>>> delaysum = DelaySum()
>>> istft = ISTFT(sample_rate=fs)

>>> Xs = stft(xs_diffused_noise)
>>> Ns = stft(nn_diff)
>>> XXs = cov(Xs)
>>> NNs = cov(Ns)
>>> tdoas = gccphat(XXs)
>>> Ys_ds = delaysum(Xs, tdoas)
>>> ys_ds = istft(Ys_ds)

>>> # Mvdr Beamforming with SRP-PHAT localization
>>> mvdr = Mvdr()
>>> mics = torch.zeros((4,3), dtype=torch.float)
>>> mics[0,:] = torch.FloatTensor([-0.05, -0.05, +0.00])
>>> mics[1,:] = torch.FloatTensor([-0.05, +0.05, +0.00])
>>> mics[2,:] = torch.FloatTensor([+0.05, +0.05, +0.00])
>>> mics[3,:] = torch.FloatTensor([+0.05, +0.05, +0.00])
>>> srpphat = SrpPhat(mics=mics)
>>> doas = srpphat(XXs)
>>> Ys_mvdr = mvdr(Xs, NNs, doas, doa_mode=True, mics=mics, fs=fs)
>>> ys_mvdr = istft(Ys_mvdr)

>>> # Mvdr Beamforming with MUSIC localization
>>> music = Music(mics=mics)
>>> doas = music(XXs)
>>> Ys_mvdr2 = mvdr(Xs, NNs, doas, doa_mode=True, mics=mics, fs=fs)
>>> ys_mvdr2 = istft(Ys_mvdr2)

>>> # GeV Beamforming
>>> gev = Gev()
>>> Xs = stft(xs_localized_noise)
>>> Ss = stft(ss)
>>> Ns = stft(nn_loc)
>>> SSs = cov(Ss)
>>> NNs = cov(Ns)
>>> Ys_gev = gev(Xs, SSs, NNs)
>>> ys_gev = istft(Ys_gev)

Authors:
 * William Aris
 * Francois Grondin

    N)versionc                       s8   e Zd ZdZd	 fdd	Zdd Zed	ddZ  ZS )

Covariancea  Computes the covariance matrices of the signals.

    Arguments
    ---------
    average : bool
        Informs the module if it should return an average
        (computed on the time dimension) of the covariance
        matrices. The Default value is True.

    Example
    -------
    >>> import torch
    >>> from speechbrain.dataio.dataio import read_audio
    >>> from speechbrain.processing.features import STFT
    >>> from speechbrain.processing.multi_mic import Covariance
    >>>
    >>> xs_speech = read_audio(
    ...    'tests/samples/multi-mic/speech_-0.82918_0.55279_-0.082918.flac'
    ... )
    >>> xs_speech = xs_speech.unsqueeze(0) # [batch, time, channels]
    >>> xs_noise = read_audio('tests/samples/multi-mic/noise_diffuse.flac')
    >>> xs_noise = xs_noise.unsqueeze(0)
    >>> xs = xs_speech + 0.05 * xs_noise
    >>> fs = 16000

    >>> stft = STFT(sample_rate=fs)
    >>> cov = Covariance()
    >>>
    >>> Xs = stft(xs)
    >>> XXs = cov(Xs)
    >>> XXs.shape
    torch.Size([1, 1001, 201, 2, 10])
    Tc                       t    || _d S N)super__init__average)selfr   	__class__ T/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/processing/multi_mic.pyr   y   s   

zCovariance.__init__c                 C   s   t j|| jd}|S )a.  This method uses the utility function _cov to compute covariance
        matrices. Therefore, the result has the following format:
        (batch, time_step, n_fft/2 + 1, 2, n_mics + n_pairs).

        The order on the last dimension corresponds to the triu_indices for a
        square matrix. For instance, if we have 4 channels, we get the following
        order: (0, 0), (0, 1), (0, 2), (0, 3), (1, 1), (1, 2), (1, 3), (2, 2), (2, 3)
        and (3, 3). Therefore, XXs[..., 0] corresponds to channels (0, 0) and XXs[..., 1]
        corresponds to channels (0, 1).

        Arguments:
        ----------
        Xs : torch.Tensor
            A batch of audio signals in the frequency domain.
            The tensor must have the following format:
            (batch, time_step, n_fft/2 + 1, 2, n_mics)
        )Xsr   )r   _covr   )r	   r   XXsr   r   r   forward~   s   zCovariance.forwardc                 C   s
  | j d }| ddddf d}| ddddf d}t||ddt||dd }t||ddt||dd }t||}|d|d |d f }|d|d |d f }	t||	fd}
|du r|
j d }tj|
ddd}
|
d|ddd}
|
S )	a\  Computes the covariance matrices (XXs) of the signals. The result will
        have the following format: (batch, time_step, n_fft/2 + 1, 2, n_mics + n_pairs).

        Arguments:
        ----------
        Xs : torch.Tensor
            A batch of audio signals in the frequency domain.
            The tensor must have the following format:
            (batch, time_step, n_fft/2 + 1, 2, n_mics)

        average : boolean
            Informs the function if it should return an average
            (computed on the time dimension) of the covariance
            matrices. Default value is True.
           .r   N      T)keepdim)	shape	unsqueezetorchmatmul	transposetriu_indicesstackmeanrepeat)r   r   n_micsXs_reXs_imRxx_reRxx_imidxXXs_reXXs_imr   n_time_framesr   r   r   r      s$   

zCovariance._cov)T)	__name__
__module____qualname____doc__r   r   staticmethodr   __classcell__r   r   r
   r   r   V   s    "r   c                       s>   e Zd ZdZ fddZ				dddZed	d
 Z  ZS )DelaySumaQ  Performs delay and sum beamforming by using the TDOAs and
    the first channel as a reference.

    Example
    -------
    >>> import torch

    >>> from speechbrain.dataio.dataio import read_audio
    >>> from speechbrain.processing.features import STFT, ISTFT
    >>> from speechbrain.processing.multi_mic import Covariance
    >>> from speechbrain.processing.multi_mic import GccPhat, DelaySum
    >>>
    >>> xs_speech = read_audio(
    ...    'tests/samples/multi-mic/speech_-0.82918_0.55279_-0.082918.flac'
    ... )
    >>> xs_speech = xs_speech. unsqueeze(0) # [batch, time, channel]
    >>> xs_noise  = read_audio('tests/samples/multi-mic/noise_diffuse.flac')
    >>> xs_noise = xs_noise.unsqueeze(0) #[batch, time, channels]
    >>> fs = 16000
    >>> xs = xs_speech + 0.05 * xs_noise
    >>>
    >>> stft = STFT(sample_rate=fs)
    >>> cov = Covariance()
    >>> gccphat = GccPhat()
    >>> delaysum = DelaySum()
    >>> istft = ISTFT(sample_rate=fs)
    >>>
    >>> Xs = stft(xs)
    >>> XXs = cov(Xs)
    >>> tdoas = gccphat(XXs)
    >>> Ys = delaysum(Xs, tdoas)
    >>> ys = istft(Ys)
    c                       t    d S r   r   r   r	   r
   r   r   r         zDelaySum.__init__FN     pu@c                 C   sT   |j d }||j}|rt||||d}nt|d}t||d}	tj||	d}
|
S )a6  This method computes a steering vector by using the TDOAs/DOAs and
        then calls the utility function _delaysum to perform beamforming.
        The result has the following format: (batch, time_step, n_fft, 2, 1).

        Arguments
        ---------
        Xs : torch.Tensor
            A batch of audio signals in the frequency domain.
            The tensor must have the following format:
            (batch, time_step, n_fft/2 + 1, 2, n_mics)
        localization_tensor : torch.Tensor
            A tensor containing either time differences of arrival (TDOAs)
            (in samples) for each timestamp or directions of arrival (DOAs)
            (xyz coordinates in meters). If localization_tensor represents
            TDOAs, then its format is (batch, time_steps, n_mics + n_pairs).
            If localization_tensor represents DOAs, then its format is
            (batch, time_steps, 3)
        doa_mode : bool
            The user needs to set this parameter to True if localization_tensor
            represents DOAs instead of TDOAs. Its default value is set to False.
        mics : torch.Tensor
            The cartesian position (xyz coordinates in meters) of each microphone.
            The tensor must have the following format (n_mics, 3). This
            parameter is only mandatory when localization_tensor represents
            DOAs.
        fs : int
            The sample rate in Hertz of the signals. This parameter is only
            mandatory when localization_tensor represents DOAs.
        c : float
            The speed of sound in the medium. The speed is expressed in meters
            per second and the default value of this parameter is 343 m/s. This
            parameter is only used when localization_tensor represents DOAs.

        Returns
        -------
        Ys : torch.Tensor
           doasmicsfsctdoastausn_fft)r   As)r   todevice	doas2taus
tdoas2taussteeringr.   	_delaysum)r	   r   localization_tensordoa_moder7   r8   r9   r>   r=   r?   Ysr   r   r   r      s   
/
zDelaySum.forwardc           
      C   s   | j d }|ddddf | }d|ddddf  | }| ddddf }| ddddf }tj|| ||  ddd	}tj|| ||  ddd	}t||fd}	|	S )
al  Perform delay and sum beamforming. The result has
        the following format: (batch, time_step, n_fft, 2, 1).

        Arguments
        ---------
        Xs : torch.Tensor
            A batch of audio signals in the frequency domain.
            The tensor must have the following format:
            (batch, time_step, n_fft/2 + 1, 2, n_mics)
        As : torch.Tensor
            The steering vector to point in the direction of
            the target source. The tensor must have the format
            (batch, time_step, n_fft/2 + 1, 2, n_mics)

        Returns
        -------
        Ys : torch.Tensor
        r   .r   Nr   r   Tdimr   )r   r   sumr   )
r   r?   r   Ws_reWs_imr    r!   Ys_reYs_imrH   r   r   r   rE   ,  s   
zDelaySum._delaysumFNNr3   )	r(   r)   r*   r+   r   r   r,   rE   r-   r   r   r
   r   r.      s    "
@r.   c                       sB   e Zd ZdZd fdd	Z				ddd	Zedd
dZ  ZS )MvdraS  Perform minimum variance distortionless response (MVDR) beamforming
    by using an input signal in the frequency domain, its covariance matrices
    and tdoas (to compute a steering vector).

        Example
        -------
        >>> import torch

        >>> from speechbrain.dataio.dataio import read_audio
        >>> from speechbrain.processing.features import STFT, ISTFT
        >>> from speechbrain.processing.multi_mic import Covariance
        >>> from speechbrain.processing.multi_mic import GccPhat, DelaySum
        >>>
        >>> xs_speech = read_audio(
        ...    'tests/samples/multi-mic/speech_-0.82918_0.55279_-0.082918.flac'
        ... )
        >>> xs_speech = xs_speech.unsqueeze(0) # [batch, time, channel]
        >>> xs_noise  = read_audio('tests/samples/multi-mic/noise_diffuse.flac')
        >>> xs_noise = xs_noise.unsqueeze(0) #[batch, time, channels]
        >>> fs = 16000
        >>> xs = xs_speech + 0.05 * xs_noise
        >>>
        >>> stft = STFT(sample_rate=fs)
        >>> cov = Covariance()
        >>> gccphat = GccPhat()
        >>> mvdr = Mvdr()
        >>> istft = ISTFT(sample_rate=fs)
        >>>
        >>> Xs = stft(xs)
        >>> Ns = stft(xs_noise)
        >>> XXs = cov(Xs)
        >>> NNs = cov(Ns)
        >>> tdoas = gccphat(XXs)
        >>> Ys = mvdr(Xs, NNs, tdoas)
        >>> ys = istft(Ys)
    #B;c                    r   r   )r   r   eps)r	   rT   r
   r   r   r   {  s   

zMvdr.__init__FNr3   c                 C   sv   |j d }||j}||j}|dur||j}|r&t||||d}	nt|d}	t|	|d}
tj|||
d}|S )a  This method computes a steering vector before using the
        utility function _mvdr to perform beamforming. The result has
        the following format: (batch, time_step, n_fft, 2, 1).

        Arguments
        ---------
        Xs : torch.Tensor
            A batch of audio signals in the frequency domain.
            The tensor must have the following format:
            (batch, time_step, n_fft/2 + 1, 2, n_mics)
        NNs : torch.Tensor
            The covariance matrices of the noise signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs)
        localization_tensor : torch.Tensor
            A tensor containing either time differences of arrival (TDOAs)
            (in samples) for each timestamp or directions of arrival (DOAs)
            (xyz coordinates in meters). If localization_tensor represents
            TDOAs, then its format is (batch, time_steps, n_mics + n_pairs).
            If localization_tensor represents DOAs, then its format is
            (batch, time_steps, 3)
        doa_mode : bool
            The user needs to set this parameter to True if localization_tensor
            represents DOAs instead of TDOAs. Its default value is set to False.
        mics : torch.Tensor
            The cartesian position (xyz coordinates in meters) of each microphone.
            The tensor must have the following format (n_mics, 3). This
            parameter is only mandatory when localization_tensor represents
            DOAs.
        fs : int
            The sample rate in Hertz of the signals. This parameter is only
            mandatory when localization_tensor represents DOAs.
        c : float
            The speed of sound in the medium. The speed is expressed in meters
            per second and the default value of this parameter is 343 m/s. This
            parameter is only used when localization_tensor represents DOAs.

        Returns
        -------
        Ys : torch.Tensor
        r4   Nr5   r:   r<   )r   NNsr?   )r   r@   rA   rB   rC   rD   rR   _mvdr)r	   r   rU   rF   rG   r7   r8   r9   r>   r=   r?   rH   r   r   r   r     s   
3
zMvdr.forwardc                 C   sx  t j|ddd\}}t|}|d dd|f }|d dd|f }|ddddf d	}	d
|ddddf d	 }
|	dd	}d|
dd	 }t ||	t ||
 }t ||
t ||	 }d
t ||t ||  }t ||d	}t ||d	 }| ddddf }| ddddf }t j|| ||  ddd}t j|| ||  ddd}t 	||fd}|S )a;  Perform minimum variance distortionless response beamforming.

        Arguments
        ---------
        Xs : torch.Tensor
            A batch of audio signals in the frequency domain.
            The tensor must have the following format:
            (batch, time_step, n_fft/2 + 1, 2, n_mics).
        NNs : torch.Tensor
            The covariance matrices of the noise signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs).
        As : torch.Tensor
            The steering vector to point in the direction of
            the target source. The tensor must have the format
            (batch, time_step, n_fft/2 + 1, 2, n_mics).
        eps : float
            A small value to avoid division by zero.

        Returns
        -------
        Ys : torch.Tensor
        Tr   return_inverserK   .r   N.r   .r   r         ?r         rJ   )
r   uniqueeiginvr   r   r   squeezerL   r   )r   rU   r?   rT   NNs_valNNs_idxNNs_inv
NNs_inv_re
NNs_inv_imAsC_reAsC_imAsT_reAsT_imNNs_inv_AsC_reNNs_inv_AsC_imalpharM   rN   r    r!   rO   rP   rH   r   r   r   rV     s6   


z
Mvdr._mvdrrS   rQ   )	r(   r)   r*   r+   r   r   r,   rV   r-   r   r   r
   r   rR   U  s    %
HrR   c                       s4   e Zd ZdZ fddZdd Zedd Z  ZS )GevaG  Generalized EigenValue decomposition (GEV) Beamforming.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> import torch
    >>>
    >>> from speechbrain.processing.features import STFT, ISTFT
    >>> from speechbrain.processing.multi_mic import Covariance
    >>> from speechbrain.processing.multi_mic import Gev
    >>>
    >>> xs_speech = read_audio(
    ...    'tests/samples/multi-mic/speech_-0.82918_0.55279_-0.082918.flac'
    ... )
    >>> xs_speech  = xs_speech.unsqueeze(0) # [batch, time, channels]
    >>> xs_noise = read_audio('tests/samples/multi-mic/noise_0.70225_-0.70225_0.11704.flac')
    >>> xs_noise = xs_noise.unsqueeze(0)
    >>> fs = 16000
    >>> ss = xs_speech
    >>> nn = 0.05 * xs_noise
    >>> xs = ss + nn
    >>>
    >>> stft = STFT(sample_rate=fs)
    >>> cov = Covariance()
    >>> gev = Gev()
    >>> istft = ISTFT(sample_rate=fs)
    >>>
    >>> Ss = stft(ss)
    >>> Nn = stft(nn)
    >>> Xs = stft(xs)
    >>>
    >>> SSs = cov(Ss)
    >>> NNs = cov(Nn)
    >>>
    >>> Ys = gev(Xs, SSs, NNs)
    >>> ys = istft(Ys)
    c                    r/   r   r0   r1   r
   r   r   r   6  r2   zGev.__init__c                 C   s   t j|||d}|S )ag  This method uses the utility function _gev to perform generalized
        eigenvalue decomposition beamforming. Therefore, the result has
        the following format: (batch, time_step, n_fft, 2, 1).

        Arguments
        ---------
        Xs : torch.Tensor
            A batch of audio signals in the frequency domain.
            The tensor must have the following format:
            (batch, time_step, n_fft/2 + 1, 2, n_mics).
        SSs : torch.Tensor
            The covariance matrices of the target signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs).
        NNs : torch.Tensor
            The covariance matrices of the noise signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs).

        Returns
        -------
        Ys : torch.Tensor
        )r   SSsrU   )ro   _gev)r	   r   rp   rU   rH   r   r   r   r   :  s   zGev.forwardc                 C   s  | | j}| | j}| jd }|jd }tj||fdd}tj|ddd\}}|dtd|f }|dt|d| f }t|}t	||\}}	|d|d df }
|d|d df }d	tj
|
d |d  d
ddd ddd| }|
|9 }
||9 }|
dd|f }|dd|f }| ddddf }| ddddf }tj
|| ||  d
dd}tj
|| ||  d
dd}t||fd
}|S )a&  Perform generalized eigenvalue decomposition beamforming. The result
        has the following format: (batch, time_step, n_fft, 2, 1).

        Arguments
        ---------
        Xs : torch.Tensor
            A batch of audio signals in the frequency domain.
            The tensor must have the following format:
            (batch, time_step, n_fft/2 + 1, 2, n_mics).
        SSs : torch.Tensor
            The covariance matrices of the target signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs).
        NNs : torch.Tensor
            The covariance matrices of the noise signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs).

        Returns
        -------
        Ys : torch.Tensor
        r   rK   Tr   rW   .r   r4   r[   r   rJ         ?N)r@   rA   r   r   catr^   ranger_   pos_defgevdrL   r   r   )r   rp   rU   r   n_mics_pairsSSs_NNsSSs_NNs_valSSs_NNs_idxVsDsF_reF_imF_normrM   rN   r    r!   rO   rP   rH   r   r   r   rq   T  s8   



zGev._gev)	r(   r)   r*   r+   r   r   r,   rq   r-   r   r   r
   r   ro     s    &ro   c                       sR   e Zd ZdZd fdd	Zdd Zeddd	Zedd
dZedd Z	  Z
S )GccPhata  Generalized Cross-Correlation with Phase Transform localization.

    Arguments
    ---------
    tdoa_max : int
        Specifies a range to search for delays. For example, if
        tdoa_max = 10, the method will restrict its search for delays
        between -10 and 10 samples. This parameter is optional and its
        default value is None. When tdoa_max is None, the method will
        search for delays between -n_fft/2 and n_fft/2 (full range).
    eps : float
        A small value to avoid divisions by 0 with the phase transformation.
        The default value is 1e-20.

    Example
    -------
    >>> import torch

    >>> from speechbrain.dataio.dataio import read_audio
    >>> from speechbrain.processing.features import STFT, ISTFT
    >>> from speechbrain.processing.multi_mic import Covariance
    >>> from speechbrain.processing.multi_mic import GccPhat, DelaySum
    >>>
    >>> xs_speech = read_audio(
    ...    'tests/samples/multi-mic/speech_-0.82918_0.55279_-0.082918.flac'
    ... )
    >>> xs_speech = xs_speech.unsqueeze(0) # [batch, time, channel]
    >>> xs_noise  = read_audio('tests/samples/multi-mic/noise_diffuse.flac')
    >>> xs_noise = xs_noise.unsqueeze(0) #[batch, time, channels]
    >>> fs = 16000
    >>> xs = xs_speech + 0.05 * xs_noise
    >>>
    >>> stft = STFT(sample_rate=fs)
    >>> cov = Covariance()
    >>> gccphat = GccPhat()
    >>> Xs = stft(xs)
    >>> XXs = cov(Xs)
    >>> tdoas = gccphat(XXs)
    NrS   c                    s   t    || _|| _d S r   )r   r   tdoa_maxrT   )r	   r   rT   r
   r   r   r     s   

zGccPhat.__init__c                 C   s2   t j|| jd}t j|| jd}t j||d}|S )a  Perform generalized cross-correlation with phase transform localization
        by using the utility function _gcc_phat and by extracting the delays (in samples)
        before performing a quadratic interpolation to improve the accuracy.
        The result has the format: (batch, time_steps, n_mics + n_pairs).

        The order on the last dimension corresponds to the triu_indices for a
        square matrix. For instance, if we have 4 channels, we get the following
        order: (0, 0), (0, 1), (0, 2), (0, 3), (1, 1), (1, 2), (1, 3), (2, 2), (2, 3)
        and (3, 3). Therefore, delays[..., 0] corresponds to channels (0, 0) and delays[..., 1]
        corresponds to channels (0, 1).

        Arguments:
        ----------
        XXs : torch.Tensor
            The covariance matrices of the input signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs).
        )r   rT   )xxsr   )r   delays)r   	_gcc_phatrT   _extract_delaysr   _interpolate)r	   r   r   r   r;   r   r   r   r     s   zGccPhat.forwardc                 C   s   | j d d d }tj| ddd\}}|ddddf }|ddddf }t|d |d  | }|| }|| }	t||	fd}
|
dd	}
ttjtd
krft	|
d |
d }
tj
j|
|d}n	tj|
d|gd}|d|ddf }|dd	}|S )aI  Evaluate GCC-PHAT for each timestamp. It returns the result in the time
        domain. The result has the format: (batch, time_steps, n_fft, n_mics + n_pairs).

        Arguments
        ---------
        XXs : torch.Tensor
            The covariance matrices of the input signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs).
        eps : float
            A small value to avoid divisions by 0 with the phase transform. The
            default value is 1e-20.

        Returns
        -------
        xxs : torch.Tensor
        r4   r   Tr   rW   .r   Nr   z1.8.0rY   rZ   )n)signal_ndimsignal_sizes)r   r   r^   sqrtr   r   r   parse__version__complexfftirfft)r   rT   	n_samplesXXs_valXXs_idxr%   r&   XXs_absXXs_re_phatXXs_im_phatXXs_phatr   r   r   r   r     s    zGccPhat._gcc_phatc           
      C   s   | j d }|du rtj|ddd}| dd|ddf }| d| dddf }t||fd}t|d\}}||j d  }||j d k}	||	  |7  < ||	  |8  < |S )aB  Extract the rounded delays from the cross-correlation for each timestamp.
        The result has the format: (batch, time_steps, n_mics + n_pairs).

        Arguments
        ---------
        xxs : torch.Tensor
            The correlation signals obtained after a gcc-phat operation. The tensor
            must have the format (batch, time_steps, n_fft, n_mics + n_pairs).
        tdoa_max : int
            Specifies a range to search for delays. For example, if
            tdoa_max = 10, the method will restrict its search for delays
            between -10 and 10 samples. This parameter is optional and its
            default value is None. When tdoa_max is None, the method will
            search for delays between -n_fft/2 and +n_fft/2 (full range).

        Returns
        -------
        delays : torch.Tensor
        r4   Nfloorrounding_mode.r   )r   r   divrt   max)
r   r   r>   slice_1slice_2
xxs_sliced_r   offsetr$   r   r   r   r     s   
zGccPhat._extract_delaysc                 C   s   | j d }t|d | |d}t| d|d}t|| |d}t| d|d}t|d | |d}t| d|d}||| d| d|  d|    }|S )a  Perform quadratic interpolation on the cross-correlation to
        improve the tdoa accuracy. The result has the format:
        (batch, time_steps, n_mics + n_pairs)

        Arguments
        ---------
        xxs : torch.Tensor
            The correlation signals obtained after a gcc-phat operation. The tensor
            must have the format (batch, time_steps, n_fft, n_mics + n_pairs).
        delays : torch.Tensor
            The rounded tdoas obtained by selecting the sample with the highest
            amplitude. The tensor must have the format
            (batch, time_steps, n_mics + n_pairs).

        Returns
        -------
        delays_frac : torch.Tensor
        r4   r   r   )r   r   fmodr   gatherra   )r   r   r>   tpy1y2y3delays_fracr   r   r   r   =  s   
$zGccPhat._interpolate)NrS   rn   r   )r(   r)   r*   r+   r   r   r,   r   r   r   r-   r   r   r
   r   r     s    (0.r   c                       s@   e Zd ZdZ				d fdd	Zdd	 Zedd
dZ  ZS )SrpPhataV	  Steered-Response Power with Phase Transform Localization.

    Arguments
    ---------
    mics : torch.Tensor
        The cartesian coordinates (xyz) in meters of each microphone.
        The tensor must have the following format (n_mics, 3).
    space : string
        If this parameter is set to 'sphere', the localization will
        be done in 3D by searching in a sphere of possible doas. If
        it set to 'circle', the search will be done in 2D by searching
        in a circle. By default, this parameter is set to 'sphere'.
        Note: The 'circle' option isn't implemented yet.
    sample_rate : int
        The sample rate in Hertz of the signals to perform SRP-PHAT on.
        By default, this parameter is set to 16000 Hz.
    speed_sound : float
        The speed of sound in the medium. The speed is expressed in meters
        per second and the default value of this parameter is 343 m/s.
    eps : float
        A small value to avoid errors like division by 0. The default value
        of this parameter is 1e-20.

    Example
    -------
    >>> import torch

    >>> from speechbrain.dataio.dataio import read_audio
    >>> from speechbrain.processing.features import STFT
    >>> from speechbrain.processing.multi_mic import Covariance
    >>> from speechbrain.processing.multi_mic import SrpPhat

    >>> xs_speech = read_audio('tests/samples/multi-mic/speech_-0.82918_0.55279_-0.082918.flac')
    >>> xs_noise = read_audio('tests/samples/multi-mic/noise_diffuse.flac')
    >>> fs = 16000

    >>> xs_speech = xs_speech.unsqueeze(0) # [batch, time, channels]
    >>> xs_noise = xs_noise.unsqueeze(0)

    >>> ss1 = xs_speech
    >>> ns1 = 0.05 * xs_noise
    >>> xs1 = ss1 + ns1

    >>> ss2 = xs_speech
    >>> ns2 = 0.20 * xs_noise
    >>> xs2 = ss2 + ns2

    >>> ss = torch.cat((ss1,ss2), dim=0)
    >>> ns = torch.cat((ns1,ns2), dim=0)
    >>> xs = torch.cat((xs1,xs2), dim=0)

    >>> mics = torch.zeros((4,3), dtype=torch.float)
    >>> mics[0,:] = torch.FloatTensor([-0.05, -0.05, +0.00])
    >>> mics[1,:] = torch.FloatTensor([-0.05, +0.05, +0.00])
    >>> mics[2,:] = torch.FloatTensor([+0.05, +0.05, +0.00])
    >>> mics[3,:] = torch.FloatTensor([+0.05, +0.05, +0.00])

    >>> stft = STFT(sample_rate=fs)
    >>> cov = Covariance()
    >>> srpphat = SrpPhat(mics=mics)

    >>> Xs = stft(xs)
    >>> XXs = cov(Xs)
    >>> doas = srpphat(XXs)
    sphere>  r3   rS   c                    sB   t    |dkrt | _|dkr	 t| j|||d| _|| _d S Nr   circle)r7   r8   r9   )r   r   r   r6   rB   r=   rT   )r	   r7   spacesample_ratespeed_soundrT   r
   r   r   r     s   
	

zSrpPhat.__init__c                 C   s8   |j d }t| j|j|}tj||| j| jd}|S )aG  Perform SRP-PHAT localization on a signal by computing a steering
        vector and then by using the utility function _srp_phat to extract the doas.
        The result is a tensor containing the directions of arrival (xyz coordinates
        (in meters) in the direction of the sound source). The output tensor
        has the format (batch, time_steps, 3).

        This localization method uses Global Coherence Field (GCF):
        https://www.researchgate.net/publication/221491705_Speaker_localization_based_on_oriented_global_coherence_field

        Arguments
        ---------
        XXs : torch.Tensor
            The covariance matrices of the input signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs).

        Returns
        -------
        doas : torch.Tensor
        r4   )r   r?   r6   rT   )	r   rD   r=   r@   rA   r   	_srp_phatr6   rT   r	   r   r>   r?   r6   r   r   r   r     s   
zSrpPhat.forwardc                 C   s$  | | j}| | j}|jd }t||}|ddddd|dddf f }|ddddd|dddf f }|ddddd|dddf f }|ddddd|dddf f }	|| ||	  }
||	 ||  }|
|
jd d}
||jd d}tj| ddd\}}|dddddddddf }|dddddddddf }||jd |jd df}||jd |jd df}t|d |d  | }|| }|| }t||
	dd}t||	dd}|| }tj
|dd	\}}||ddf dd|ddf }|S )
a  Perform srp-phat to find the direction of arrival
        of the sound source. The result is a tensor containing the directions
        of arrival (xyz coordinates (in meters) in the direction of the sound source).
        The output tensor has the format: (batch, time_steps, 3).

        Arguments
        ---------
        XXs : torch.Tensor
            The covariance matrices of the input signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs).
        As : torch.Tensor
            The steering vector that cover the all the potential directions
            of arrival. The tensor must have the format
            (n_doas, n_fft/2 + 1, 2, n_mics).
        doas : torch.Tensor
            All the possible directions of arrival that will be scanned. The
            tensor must have the format (n_doas, 3).
        eps : float
            A very small value used to avoid division by 0.

        Returns
        -------
        doas : torch.Tensor
        r   Nr   r   rI   TrW   r4   rr   )r@   rA   r   r   r   reshaper^   r   r   r   r   )r   r?   r6   rT   r   r$   As_1_reAs_1_imAs_2_reAs_2_imrM   rN   r   r   r%   r&   r   XXs_re_normXXs_im_normYs_AYs_BrH   r   doas_idxr   r   r   r     s4   
$$$$"""zSrpPhat._srp_phat)r   r   r3   rS   rn   )	r(   r)   r*   r+   r   r   r,   r   r-   r   r   r
   r   r   b  s    Er   c                       sB   e Zd ZdZ					d fdd	Zd	d
 ZedddZ  ZS )Musica	  Multiple Signal Classification (MUSIC) localization.

    Arguments
    ---------
    mics : torch.Tensor
        The cartesian coordinates (xyz) in meters of each microphone.
        The tensor must have the following format (n_mics, 3).
    space : string
        If this parameter is set to 'sphere', the localization will
        be done in 3D by searching in a sphere of possible doas. If
        it set to 'circle', the search will be done in 2D by searching
        in a circle. By default, this parameter is set to 'sphere'.
        Note: The 'circle' option isn't implemented yet.
    sample_rate : int
        The sample rate in Hertz of the signals to perform SRP-PHAT on.
        By default, this parameter is set to 16000 Hz.
    speed_sound : float
        The speed of sound in the medium. The speed is expressed in meters
        per second and the default value of this parameter is 343 m/s.
    eps : float
        A small value to avoid errors like division by 0. The default value
        of this parameter is 1e-20.
    n_sig : int
        An estimation of the number of sound sources. The default value is set
        to one source.

    Example
    -------
    >>> import torch

    >>> from speechbrain.dataio.dataio import read_audio
    >>> from speechbrain.processing.features import STFT
    >>> from speechbrain.processing.multi_mic import Covariance
    >>> from speechbrain.processing.multi_mic import SrpPhat

    >>> xs_speech = read_audio('tests/samples/multi-mic/speech_-0.82918_0.55279_-0.082918.flac')
    >>> xs_noise = read_audio('tests/samples/multi-mic/noise_diffuse.flac')
    >>> fs = 16000

    >>> xs_speech = xs_speech.unsqueeze(0) # [batch, time, channels]
    >>> xs_noise = xs_noise.unsqueeze(0)

    >>> ss1 = xs_speech
    >>> ns1 = 0.05 * xs_noise
    >>> xs1 = ss1 + ns1

    >>> ss2 = xs_speech
    >>> ns2 = 0.20 * xs_noise
    >>> xs2 = ss2 + ns2

    >>> ss = torch.cat((ss1,ss2), dim=0)
    >>> ns = torch.cat((ns1,ns2), dim=0)
    >>> xs = torch.cat((xs1,xs2), dim=0)

    >>> mics = torch.zeros((4,3), dtype=torch.float)
    >>> mics[0,:] = torch.FloatTensor([-0.05, -0.05, +0.00])
    >>> mics[1,:] = torch.FloatTensor([-0.05, +0.05, +0.00])
    >>> mics[2,:] = torch.FloatTensor([+0.05, +0.05, +0.00])
    >>> mics[3,:] = torch.FloatTensor([+0.05, +0.05, +0.00])

    >>> stft = STFT(sample_rate=fs)
    >>> cov = Covariance()
    >>> music = Music(mics=mics)

    >>> Xs = stft(xs)
    >>> XXs = cov(Xs)
    >>> doas = music(XXs)
    r   r   r3   rS   r   c                    sH   t    |dkrt | _|dkr	 t| j|||d| _|| _|| _d S r   )r   r   r   r6   rB   r=   rT   n_sig)r	   r7   r   r   r   rT   r   r
   r   r   r   l  s   



zMusic.__init__c                 C   s<   |j d }t| j|j|}tj||| j| j| j	d}|S )a  Perform MUSIC localization on a signal by computing a steering
        vector and then by using the utility function _music to extract the doas.
        The result is a tensor containing the directions of arrival (xyz coordinates
        (in meters) in the direction of the sound source). The output tensor
        has the format (batch, time_steps, 3).

        Arguments
        ---------
        XXs : torch.Tensor
            The covariance matrices of the input signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs).

        Returns
        -------
        doas : torch.Tensor
        r4   )r   r?   r6   r   rT   )
r   rD   r=   r@   rA   r   _musicr6   r   rT   r   r   r   r   r     s   
zMusic.forwardc              	   C   s  | | j}| | j}|jd }|jd }|jd }|| }tj| ddd\}	}
t|	\}}|ddd|dddd}|dt	d|df }|dt	d|df }|ddd
dddddd	d
}||jd |jd ddddd}|d }|d }t||t|| }t||t|| }t|d |d  }tj|d	d}tj|d d	dtj|d d	d }|||  d
}tj|dd| }tj|dd\}}||ddf dd|
ddf }|S )a  Perform multiple signal classification to find the
        direction of arrival of the sound source. The result
        has the format: (batch, time_steps, 3).

        Arguments
        ---------
        XXs : torch.Tensor
            The covariance matrices of the input signal. The tensor must
            have the format (batch, time_steps, n_fft/2 + 1, 2, n_mics + n_pairs).
        As : torch.Tensor
            The steering vector that covers the all the potential directions
            of arrival. The tensor must have the format.
            (n_doas, n_fft/2 + 1, 2, n_mics).
        doas : torch.Tensor
            All the possible directions of arrival that will be scanned. The
            tensor must have the format (n_doas, 3).
        n_sig : int
            The number of signals in the signal + noise subspace (default is 1).
        eps : float
            A small number to avoid div by zero errors.

        Returns
        -------
        doas : torch.Tensor
        r   r   r4   Tr   rW   .      r   rY   rZ   rr   N)r@   rA   r   r   r^   r_   svdlr   r   ru   permuter   r   rL   ra   r   )r   r?   r6   r   rT   r   n_doasn_bins	svd_ranger   r   Usr   Us_reUs_imAs_reAs_imAs_mm_Us_reAs_mm_Us_imAs_mm_Us_absAs_mm_Us_sum	As_As_absPsrH   r   r   r   r   r     s:   


"$"zMusic._music)r   r   r3   rS   r   rn   )	r(   r)   r*   r+   r   r   r,   r   r-   r   r   r
   r   r   &  s    Hr   r3   c                 C   s(   || t | |j|dd }|S )a  This function converts directions of arrival (xyz coordinates
    expressed in meters) in time differences of arrival (expressed in
    samples). The result has the following format: (batch, time_steps, n_mics).

    Arguments
    ---------
    doas : torch.Tensor
        The directions of arrival expressed with cartesian coordinates (xyz)
        in meters. The tensor must have the following format: (batch, time_steps, 3).
    mics : torch.Tensor
        The cartesian position (xyz) in meters of each microphone.
        The tensor must have the following format (n_mics, 3).
    fs : int
        The sample rate in Hertz of the signals.
    c : float
        The speed of sound in the medium. The speed is expressed in meters
        per second and the default value of this parameter is 343 m/s.

    Returns
    -------
    taus : torch.Tensor

    Example
    -------
    >>> import torch

    >>> from speechbrain.dataio.dataio import read_audio
    >>> from speechbrain.processing.multi_mic import sphere, doas2taus

    >>> xs = read_audio('tests/samples/multi-mic/speech_-0.82918_0.55279_-0.082918.flac')
    >>> xs = xs.unsqueeze(0) # [batch, time, channels]
    >>> fs = 16000
    >>> mics = torch.zeros((4,3), dtype=torch.float)
    >>> mics[0,:] = torch.FloatTensor([-0.05, -0.05, +0.00])
    >>> mics[1,:] = torch.FloatTensor([-0.05, +0.05, +0.00])
    >>> mics[2,:] = torch.FloatTensor([+0.05, +0.05, +0.00])
    >>> mics[3,:] = torch.FloatTensor([+0.05, +0.05, +0.00])

    >>> doas = sphere()
    >>> taus = doas2taus(doas, mics, fs)
    r   r   )r   r   r@   rA   r   )r6   r7   r8   r9   r=   r   r   r   rB     s   $*rB   c                 C   sF   | j t| j d  }tdd|  d d d }| dtd|f }|S )a  This function selects the tdoas of each channel and put them
    in a tensor. The result has the following format:
    (batch, time_steps, n_mics).

    Arguments
    ---------
    tdoas : torch.Tensor
       The time difference of arrival (TDOA) (in samples) for
       each timestamp. The tensor has the format
       (batch, time_steps, n_mics + n_pairs).

    Returns
    -------
    taus : torch.Tensor

    Example
    -------
    >>> import torch
    >>> from speechbrain.dataio.dataio import read_audio
    >>> from speechbrain.processing.features import STFT
    >>> from speechbrain.processing.multi_mic import Covariance
    >>> from speechbrain.processing.multi_mic import GccPhat, tdoas2taus
    >>>
    >>> xs_speech = read_audio(
    ...    'tests/samples/multi-mic/speech_-0.82918_0.55279_-0.082918.flac'
    ... )
    >>> xs_noise = read_audio('tests/samples/multi-mic/noise_diffuse.flac')
    >>> xs = xs_speech + 0.05 * xs_noise
    >>> xs = xs.unsqueeze(0)
    >>> fs = 16000
    >>>
    >>> stft = STFT(sample_rate=fs)
    >>> cov = Covariance()
    >>> gccphat = GccPhat()
    >>>
    >>> Xs = stft(xs)
    >>> XXs = cov(Xs)
    >>> tdoas = gccphat(XXs)
    >>> taus = tdoas2taus(tdoas)
    r      rs   r4   .r   )r   lenintru   )r;   n_pairs
n_channelsr=   r   r   r   rC   (  s   )rC   c                 C   s   d}t |d d }d| tjd|| jd | }|| jd }| t| jdt| j |f } t| |  }t	| |  }t
||ft|j}|t|jd t|jd t|jd t|jd }|S )a  This function computes a steering vector by using the time differences
    of arrival for each channel (in samples) and the number of bins (n_fft).
    The result has the following format: (batch, time_step, n_fft/2 + 1, 2, n_mics).

    Arguments:
    ----------
    taus : torch.Tensor
        The time differences of arrival for each channel. The tensor must have
        the following format: (batch, time_steps, n_mics).

    n_fft : int
        The number of bins resulting of the STFT. It is assumed that the
        argument "onesided" was set to True for the STFT.

    Example:
    --------f
    >>> import torch
    >>> from speechbrain.dataio.dataio import read_audio
    >>> from speechbrain.processing.features import STFT
    >>> from speechbrain.processing.multi_mic import Covariance
    >>> from speechbrain.processing.multi_mic import GccPhat, tdoas2taus, steering
    >>>
    >>> xs_speech = read_audio(
    ...    'tests/samples/multi-mic/speech_-0.82918_0.55279_-0.082918.flac'
    ... )
    >>> xs_noise = read_audio('tests/samples/multi-mic/noise_diffuse.flac')
    >>> xs = xs_speech + 0.05 * xs_noise
    >>> xs = xs.unsqueeze(0) # [batch, time, channels]
    >>> fs = 16000

    >>> stft = STFT(sample_rate=fs)
    >>> cov = Covariance()
    >>> gccphat = GccPhat()
    >>>
    >>> Xs = stft(xs)
    >>> n_fft = Xs.shape[2]
    >>> XXs = cov(Xs)
    >>> tdoas = gccphat(XXs)
    >>> taus = tdoas2taus(tdoas)
    >>> As = steering(taus, n_fft)
    g-DT!	@r   r4   r   )rA   )r   r   )r   r   arangerA   r   r   r   r   cossinr   r   )r=   r>   pi
frame_sizeomegasa_rea_imar   r   r   rD   X  s   + rD   r   c              
   C   s  d}d}d}t jdt jd}t g d|dddf< t g d	|d
ddf< |t d| t dd d  |tdddf< |t d| t dd d  |tdddf< ||tdddf< d| t d| t dd d  |tdd
df< d| t d| t dd d  |tdd
df< d| |tdd
df< t jdt jd}t 	g d|dddf< t 	g d|dddf< t 	g d|dddf< t 	g d|dddf< t 	g d|dddf< t 	g d|dddf< t 	g d|dddf< t 	g d|dddf< t 	g d|dddf< t 	g d |d!ddf< t 	g d"|d#ddf< t 	g d$|d
ddf< t 	g d%|d&ddf< t 	g d'|d(ddf< t 	g d)|d*ddf< t 	g d+|d,ddf< t 	g d-|d.ddf< t 	g d/|d0ddf< t 	g d1|d2ddf< t 	g d3|d4ddf< td| D ]}|j
d }|d }t j|dft jd}	|dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< |dddf |	d| t d| df< t j|	ddddgf |	ddddgf |	ddddgf fdd5}
t j|
dd5\}}t |}|dddf |d  |dddf  }t j|d6d7\}}t j|j
d df|jd}t j||d d8d9|dddf< ||dddf |d   |dddf< t t |d:dd}||dddf ddf ||dddf ddf  }|t t t j|d dd5d; ddd }q|S )<a  This function generates cartesian coordinates (xyz) for a set
    of points forming a 3D sphere. The coordinates are expressed in
    meters and can be used as doas. The result has the format:
    (n_points, 3).

    Arguments
    ---------
    levels_count : int
        A number proportional to the number of points that the user
        wants to generate.
            - If levels_count = 1, then the sphere will have 42 points
            - If levels_count = 2, then the sphere will have 162 points
            - If levels_count = 3, then the sphere will have 642 points
            - If levels_count = 4, then the sphere will have 2562 points
            - If levels_count = 5, then the sphere will have 10242 points
            - ...
        By default, levels_count is set to 4.

    Returns
    -------
    pts : torch.Tensor
        The list of xyz points in the sphere.

    Example
    -------
    >>> import torch
    >>> from speechbrain.processing.multi_mic import sphere
    >>> doas = sphere()
    g%?g%?gPERT!	@)   r   )dtype)r   r   r   r   N)r   r   rI      g       @r   g      @r   r   r4   r\   )   r   )r   r4   r   )r   r   r4   )r   r   r   )r   r   r   r   )r   r   r   r   )	   r   r4   )
   r4   r   )r   r   r      )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   )r   r   r   r   )r4   r   r      )r   r   r      )r   r   r      )r   r   r      )r   r   r      )r   r   r      )r   r   r      )axisT)rX   r   r   )r   rI   rs   )r   zerosfloatFloatTensorr   r   ru   r   long
LongTensorr   rt   sortr   r^   r   r   r   r   repeat_interleaver   rL   )levels_counthrr   ptstrslevels_index	trs_countsubtrs_countsubtrssubtrs_flattensubtrs_sortedr   	index_maxsubtrs_scalarunique_scalarunique_indicesunique_valuesr   r   r   r     s    00$$
((((((((((((((((((((((((:
&

8 
r   )r3   )r   )r+   r   	packagingr   $speechbrain.processing.decomposition
processingdecompositionr_   nnModuler   r.   rR   ro   r   r   r   rB   rC   rD   r   r   r   r   r   <module>   s*    Oo  ; 	 L E 
T/0A