o
    pi:-                     @   s   d dl mZ d dlmZmZ d dlZd dlmZ d dlm  m	Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ G d	d
 d
eZdS )    )	lru_cache)OptionalUnionN)pairwise)Model)Task)
merge_dict)conv1d_num_framesconv1d_receptive_field_centerconv1d_receptive_field_sizec                       s   e Zd ZdZdZddddddZddd	Z	
			
	
			
d+deee	f de
dedee dee dededee f fddZedefddZdd Zededefdd Zd,d!edefd"d#Zd-d%edefd&d'Zd(ejdejfd)d*Z  ZS ).	SSeRiouSSa  Self-Supervised Representation for Speaker Segmentation

    wav2vec > LSTM > Feed forward > Classifier

    Parameters
    ----------
    sample_rate : int, optional
        Audio sample rate. Defaults to 16kHz (16000).
    num_channels : int, optional
        Number of channels. Defaults to mono (1).
    wav2vec: dict or str, optional
        Defaults to "WAVLM_BASE".
    wav2vec_frozen: bool, optional
        Whether to freeze wav2vec weights. Defaults to False.
    wav2vec_layer: int, optional
        Index of layer to use as input to the LSTM.
        Defaults (-1) to use average of all layers (with learnable weights).
    lstm : dict, optional
        Keyword arguments passed to the LSTM layer.
        Defaults to {"hidden_size": 128, "num_layers": 4, "bidirectional": True},
        i.e. two bidirectional layers with 128 units each.
        Set "monolithic" to False to split monolithic multi-layer LSTM into multiple mono-layer LSTMs.
        This may proove useful for probing LSTM internals.
    linear : dict, optional
        Keyword arugments used to initialize linear layers
        Defaults to {"hidden_size": 128, "num_layers": 2},
        i.e. two linear layers with 128 units each.
    
WAVLM_BASE      T        )hidden_size
num_layersbidirectional
monolithicdropout   )r   r   NF>     wav2vecwav2vec_frozenwav2vec_layerlstmlinearsample_ratenum_channelstaskc	                    sZ  t  j|||d t|trcttj|r;ttj|}	||	jkr+t	d|	j d| d|	j
d |	j
d }
|	 | _n?t|}|d}tjjdi || _|d}| j| |d |d }
nt|trztjjdi || _|d |d }
|d	k rtjt|
d
d| _| j D ]}| |_qt| j  d
 d< t| j|}| ddddd  d }|rt }|d= tjfi || _n1 d }|dkrtj  d d| _!t dd< dd< d= t" fddt#|D | _|d dk rd S | j$jd | j$jd rdnd }t"dd t%|g| j$j&d g| j$j&d   D | _&d S )N)r   r    r!   z	Expected z
Hz, found zHz.encoder_embed_dimencoder_num_layersconfig
state_dictr   T)datarequires_gradbatch_firstr   r   r   r   r   r   r   r   r   )pr   c                    s>   g | ]}t j|d krn d  d rdnd fi qS )r   r   r   r   r   )nnLSTM).0ir   one_layer_lstmwav2vec_dim `/home/ubuntu/.local/lib/python3.10/site-packages/pyannote/audio/models/segmentation/SSeRiouSS.py
<listcomp>   s    
z&SSeRiouSS.__init__.<locals>.<listcomp>r   r   r   c                 S   s   g | ]
\}}t ||qS r1   )r*   Linear)r,   in_featuresout_featuresr1   r1   r2   r3      s    
r1   )'super__init__
isinstancestrhasattr
torchaudio	pipelinesgetattr_sample_rate
ValueError_params	get_modelr   torchloadpopmodelswav2vec2_modelload_state_dictdictr*   	Parameteroneswav2vec_weights
parametersr'   r   LSTM_DEFAULTSLINEAR_DEFAULTSsave_hyperparametersr+   r   Dropoutr   
ModuleListrangehparamsr   r   )selfr   r   r   r   r   r   r    r!   bundlewav2vec_num_layers_checkpointr%   paramr   multi_layer_lstmr   lstm_out_features	__class__r.   r2   r8   S   s   















zSSeRiouSS.__init__returnc                 C   s0   t | jtr
td| jjr| jjS t| jjS )zDimension of outputz)SSeRiouSS does not support multi-tasking.)r9   specificationstupler@   powersetnum_powerset_classeslenclasses)rU   r1   r1   r2   	dimension   s
   zSSeRiouSS.dimensionc                 C   s\   | j jd dkr| j jd }n| j jd | j jd rdnd }t|| j| _|  | _d S )Nr   r   r   r   r   r   )	rT   r   r   r*   r4   re   
classifierdefault_activation
activation)rU   r5   r1   r1   r2   build   s   
zSSeRiouSS.buildnum_samplesc                 C   s>   |}| j jjD ]}t||j|j|jjd |jjd d}q|S )zCompute number of output frames

        Parameters
        ----------
        num_samples : int
            Number of input samples.

        Returns
        -------
        num_frames : int
            Number of output frames.
        r   kernel_sizestridepaddingdilation)	r   feature_extractorconv_layersr	   rl   rm   convrn   ro   )rU   rj   
num_frames
conv_layerr1   r1   r2   rs      s   

zSSeRiouSS.num_framesrs   c                 C   B   |}t | jjjD ]}t||j|j|jjd |jj	d d}q	|S )a
  Compute size of receptive field

        Parameters
        ----------
        num_frames : int, optional
            Number of frames in the output signal

        Returns
        -------
        receptive_field_size : int
            Receptive field size.
        r   )rs   rl   rm   rn   ro   )
reversedr   rp   rq   r   rl   rm   rr   rn   ro   )rU   rs   receptive_field_sizert   r1   r1   r2   rw      s   

zSSeRiouSS.receptive_field_sizer   framec                 C   ru   )zCompute center of receptive field

        Parameters
        ----------
        frame : int, optional
            Frame index

        Returns
        -------
        receptive_field_center : int
            Index of receptive field center.
        r   rk   )
rv   r   rp   rq   r
   rl   rm   rr   rn   ro   )rU   rx   receptive_field_centerrt   r1   r1   r2   ry   	  s   

z SSeRiouSS.receptive_field_center	waveformsc                 C   s   | j jdk rdn| j j}| jj|d|d\}}|du r-tj|ddtj| j	dd }n|d }| j j
d r?| 
|\}}nt| j
D ]\}}||\}}|d | j j
d k r]| |}qD| j jd dkrs| jD ]	}t||}qi| | |S )	zPass forward

        Parameters
        ----------
        waveforms : (batch, channel, sample)

        Returns
        -------
        scores : (batch, frame, classes)
        r   Nr   )r   r   )dimr   r   )rT   r   r   extract_featuressqueezerC   stackFsoftmaxrL   r   	enumerater   r   
leaky_relurh   rf   )rU   rz   r   outputs_r-   r   r   r1   r1   r2   forward!  s*   




zSSeRiouSS.forward)NFr   NNr   r   N)r   )r   )__name__
__module____qualname____doc__WAV2VEC_DEFAULTSrN   rO   r   rI   r:   boolintr   r   r8   propertyre   ri   r   rs   rw   ry   rC   Tensorr   __classcell__r1   r1   r\   r2   r   *   sV    

	l
r   )	functoolsr   typingr   r   rC   torch.nnr*   torch.nn.functional
functionalr   r<   pyannote.core.utils.generatorsr   pyannote.audio.core.modelr   pyannote.audio.core.taskr   pyannote.audio.utils.paramsr   $pyannote.audio.utils.receptive_fieldr	   r
   r   r   r1   r1   r1   r2   <module>   s   