o
    9wiO,                     @   s   d dl mZ d dlmZmZ d dlZd dlmZ d dlm  m	Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ G d	d
 d
eZdS )    )	lru_cache)OptionalUnionN)pairwise)Model)Task)
merge_dict)conv1d_num_framesconv1d_receptive_field_centerconv1d_receptive_field_sizec                       s   e Zd ZdZdZddddddZddd	Z	
		
	
			
d)deee	f de
dee dee de
de
dee f fddZede
fddZdd Zede
de
fddZd*de
de
fd d!Zd+d#e
de
fd$d%Zd&ejdejfd'd(Z  ZS ),	SSeRiouSSa^  Self-Supervised Representation for Speaker Segmentation

    wav2vec > LSTM > Feed forward > Classifier

    Parameters
    ----------
    sample_rate : int, optional
        Audio sample rate. Defaults to 16kHz (16000).
    num_channels : int, optional
        Number of channels. Defaults to mono (1).
    wav2vec: dict or str, optional
        Defaults to "WAVLM_BASE".
    wav2vec_layer: int, optional
        Index of layer to use as input to the LSTM.
        Defaults (-1) to use average of all layers (with learnable weights).
    lstm : dict, optional
        Keyword arguments passed to the LSTM layer.
        Defaults to {"hidden_size": 128, "num_layers": 4, "bidirectional": True},
        i.e. two bidirectional layers with 128 units each.
        Set "monolithic" to False to split monolithic multi-layer LSTM into multiple mono-layer LSTMs.
        This may proove useful for probing LSTM internals.
    linear : dict, optional
        Keyword arugments used to initialize linear layers
        Defaults to {"hidden_size": 128, "num_layers": 2},
        i.e. two linear layers with 128 units each.
    
WAVLM_BASE      T        )hidden_size
num_layersbidirectional
monolithicdropout   )r   r   N>     wav2vecwav2vec_layerlstmlinearsample_ratenum_channelstaskc                    s>  t  j|||d t|trcttj|r;ttj|}||jkr+t	d|j d| d|j
d |j
d }	| | _n?t|}
|
d}tjjdi || _|
d}| j| |d |d }	nt|trztjjdi || _|d |d }	|d	k rtjt|	d
d| _t| j  d
 d< t| j|}| dddd  d }|rt }|d= tjfi || _n1 d }|dkrtj d d| _t dd< dd< d= t  fddt!|D | _|d dk rd S | j"jd | j"jd rdnd }t dd t#|g| j"j$d g| j"j$d   D | _$d S )N)r   r   r    z	Expected z
Hz, found zHz.encoder_embed_dimencoder_num_layersconfig
state_dictr   T)datarequires_gradbatch_firstr   r   r   r   r   r   r   r   )pr   c                    s>   g | ]}t j|d krn d  d rdnd fi qS )r   r   r   r   r   )nnLSTM).0ir   one_layer_lstmwav2vec_dim i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/pyannote/audio/models/segmentation/SSeRiouSS.py
<listcomp>   s    
z&SSeRiouSS.__init__.<locals>.<listcomp>r   r   r   c                 S   s   g | ]
\}}t ||qS r0   )r)   Linear)r+   in_featuresout_featuresr0   r0   r1   r2      s    
r0   )%super__init__
isinstancestrhasattr
torchaudio	pipelinesgetattr_sample_rate
ValueError_params	get_modelr   torchloadpopmodelswav2vec2_modelload_state_dictdictr)   	Parameteroneswav2vec_weightsr   LSTM_DEFAULTSLINEAR_DEFAULTSsave_hyperparametersr*   r   Dropoutr   
ModuleListrangehparamsr   r   )selfr   r   r   r   r   r   r    bundlewav2vec_num_layers_checkpointr$   r   multi_layer_lstmr   lstm_out_features	__class__r-   r1   r7   Q   s   














zSSeRiouSS.__init__returnc                 C   s0   t | jtr
td| jjr| jjS t| jjS )zDimension of outputz)SSeRiouSS does not support multi-tasking.)r8   specificationstupler?   powersetnum_powerset_classeslenclasses)rS   r0   r0   r1   	dimension   s
   zSSeRiouSS.dimensionc                 C   s\   | j jd dkr| j jd }n| j jd | j jd rdnd }t|| j| _|  | _d S )Nr   r   r   r   r   r   )	rR   r   r   r)   r3   rb   
classifierdefault_activation
activation)rS   r4   r0   r0   r1   build   s   
zSSeRiouSS.buildnum_samplesc                 C   s>   |}| j jjD ]}t||j|j|jjd |jjd d}q|S )zCompute number of output frames

        Parameters
        ----------
        num_samples : int
            Number of input samples.

        Returns
        -------
        num_frames : int
            Number of output frames.
        r   kernel_sizestridepaddingdilation)	r   feature_extractorconv_layersr	   ri   rj   convrk   rl   )rS   rg   
num_frames
conv_layerr0   r0   r1   rp      s   

zSSeRiouSS.num_framesrp   c                 C   B   |}t | jjjD ]}t||j|j|jjd |jj	d d}q	|S )a
  Compute size of receptive field

        Parameters
        ----------
        num_frames : int, optional
            Number of frames in the output signal

        Returns
        -------
        receptive_field_size : int
            Receptive field size.
        r   )rp   ri   rj   rk   rl   )
reversedr   rm   rn   r   ri   rj   ro   rk   rl   )rS   rp   receptive_field_sizerq   r0   r0   r1   rt      s   

zSSeRiouSS.receptive_field_sizer   framec                 C   rr   )zCompute center of receptive field

        Parameters
        ----------
        frame : int, optional
            Frame index

        Returns
        -------
        receptive_field_center : int
            Index of receptive field center.
        r   rh   )
rs   r   rm   rn   r
   ri   rj   ro   rk   rl   )rS   ru   receptive_field_centerrq   r0   r0   r1   rv     s   

z SSeRiouSS.receptive_field_center	waveformsc                 C   s  | j jdk rdn| j j}t  | jj|d|d\}}W d   n1 s(w   Y  |du rAtj|ddtj	| j
dd }n|d }| j jd rS| |\}}nt| jD ]\}}||\}}|d | j jd k rq| |}qX| j jd dkr| jD ]	}t||}q}| | |S )	zPass forward

        Parameters
        ----------
        waveforms : (batch, channel, sample)

        Returns
        -------
        scores : (batch, frame, classes)
        r   Nr   )r   r   )dimr   r   )rR   r   rB   no_gradr   extract_featuressqueezestackFsoftmaxrK   r   	enumerater   r   
leaky_relure   rc   )rS   rw   r   outputs_r,   r   r   r0   r0   r1   forward  s.   




zSSeRiouSS.forward)Nr   NNr   r   N)r   )r   )__name__
__module____qualname____doc__WAV2VEC_DEFAULTSrL   rM   r   rH   r9   intr   r   r7   propertyrb   rf   r   rp   rt   rv   rB   Tensorr   __classcell__r0   r0   rY   r1   r   *   sP    

f
r   )	functoolsr   typingr   r   rB   torch.nnr)   torch.nn.functional
functionalr}   r;   pyannote.core.utils.generatorsr   pyannote.audio.core.modelr   pyannote.audio.core.taskr   pyannote.audio.utils.paramsr   $pyannote.audio.utils.receptive_fieldr	   r
   r   r   r0   r0   r0   r1   <module>   s   