o
    pit                     @   sd   d dl mZ d dlZd dlmZ d dlm  mZ dejdejdejfddZ	G dd	 d	ej
ZdS )
    )OptionalN	sequencesweightsreturnc                 C   s   |j dd}|jddd }tj| | dd| }t| | d }t|jdd}tj|| dd|||  d  }t|}tj||gddS )a  Helper function to compute statistics pooling

    Assumes that weights are already interpolated to match the number of frames
    in sequences and that they encode the activation of only one speaker.

    Parameters
    ----------
    sequences : (batch, features, frames) torch.Tensor
        Sequences of features.
    weights : (batch, frames) torch.Tensor
        (Already interpolated) weights.

    Returns
    -------
    output : (batch, 2 * features) torch.Tensor
        Concatenation of mean and (unbiased) standard deviation.
       dim   g:0yE>)	unsqueezesumtorchsquaresqrtcat)r   r   v1meandx2v2varstd r   X/home/ubuntu/.local/lib/python3.10/site-packages/pyannote/audio/models/blocks/pooling.py_pool   s   "
r   c                   @   s4   e Zd ZdZ	ddejdeej dejfddZdS )		StatsPoolzStatistics pooling

    Compute temporal mean and (unbiased) standard deviation
    and returns their concatenation.

    Reference
    ---------
    https://en.wikipedia.org/wiki/Weighted_arithmetic_mean

    Nr   r   r   c                    s   du r j dd} jddd}tj||gddS  dkr)d}jddnd}  \}}} \}}}	||	krEtj|d	d
tj	 fddt
|D dd}
|s^|
jddS |
S )a  Forward pass

        Parameters
        ----------
        sequences : (batch, features, frames) torch.Tensor
            Sequences of features.
        weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional
            Compute weighted mean and standard deviation, using provided `weights`.

        Note
        ----
        `sequences` and `weights` might use a different number of frames, in which case `weights`
        are interpolated linearly to reach the number of frames in `sequences`.

        Returns
        -------
        output : (batch, 2 * features) or (batch, speakers, 2 * features) torch.Tensor
            Concatenation of mean and (unbiased) standard deviation. When `weights` are
            provided with the `speakers` dimension, `output` is computed for each speaker
            separately and returned as (batch, speakers, 2 * channel)-shaped tensor.
        Nr   r   )r   
correctionr	   FTnearest)sizemodec              	      s(   g | ]}t  d d |d d f qS N)r   ).0speakerr   r   r   r   
<listcomp>x   s    z%StatsPool.forward.<locals>.<listcomp>)r   r   r   r   r   r
   r   Finterpolatestackrangesqueeze)selfr   r   r   r   has_speaker_dimension_
num_framesnum_speakersnum_weightsoutputr   r"   r   forwardL   s*   zStatsPool.forwardr   )__name__
__module____qualname____doc__r   Tensorr   r0   r   r   r   r   r   @   s    r   )typingr   r   torch.nnnntorch.nn.functional
functionalr$   r5   r   Moduler   r   r   r   r   <module>   s   "