o
    9wi71                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZ zd d
lmZ d dlmZ dZW n eyb   dZY nw z
d dlmZ dZ W n eyx   dZ Y nw G dd deZ!dS )    )	lru_cache)OptionalN)make_enc_dec)pairwise)Model)Task)
merge_dict)conv1d_num_framesconv1d_receptive_field_centerconv1d_receptive_field_size)DPRNN)
pad_x_to_yTF)	AutoModelc                       s  e Zd ZdZdddddZdddZd	d
d
dddddZddiZ										d6dede	e de	e dede
de
de	e de
d ed!ef fd"d#Zed$e
fd%d&Zd'd( Zed)e
d$e
fd*d+Zd7d,e
d$e
fd-d.Zd8d0e
d$e
fd1d2Zd3ejd$ejfd4d5Z  ZS )9	ToTaToNetu4  ToTaToNet joint speaker diarization and speech separation model

                        /--------------\
    Conv1D Encoder --------+--- DPRNN --X------- Conv1D Decoder
    WavLM -- upsampling --/                 \--- Avg pool -- Linear -- Classifier


    Parameters
    ----------
    sample_rate : int, optional
        Audio sample rate. Defaults to 16kHz (16000).
    num_channels : int, optional
        Number of channels. Defaults to mono (1).
    sincnet : dict, optional
        Keyword arugments passed to the SincNet block.
        Defaults to {"stride": 1}.
    linear : dict, optional
        Keyword arugments used to initialize linear layers
        See ToTaToNet.LINEAR_DEFAULTS for default values.
    diar : dict, optional
        Keyword arguments used to initalize the average pooling in the diarization branch.
        See ToTaToNet.DIAR_DEFAULTS for default values.
    encoder_decoder : dict, optional
        Keyword arguments used to initalize the encoder and decoder.
        See ToTaToNet.ENCODER_DECODER_DEFAULTS for default values.
    dprnn : dict, optional
        Keyword arguments used to initalize the DPRNN model.
        See ToTaToNet.DPRNN_DEFAULTS for default values.
    sample_rate : int, optional
        Audio sample rate. Defaults to 16000.
    num_channels : int, optional
        Number of channels. Defaults to 1.
    task : Task, optional
        Task to perform. Defaults to None.
    n_sources : int, optional
        Number of separated sources. Defaults to 3.
    use_wavlm : bool, optional
        Whether to use the WavLM large model for feature extraction. Defaults to True.
    gradient_clip_val : float, optional
        Gradient clipping value. Required when fine-tuning the WavLM model and thus using two different optimizers.
        Defaults to 5.0.

    References
    ----------
    Joonas Kalda, Clément Pagés, Ricard Marxer, Tanel Alumäe, and Hervé Bredin.
    "PixIT: Joint Training of Speaker Diarization and Speech Separation
    from Real-world Multi-speaker Recordings"
    Odyssey 2024. https://arxiv.org/abs/2403.02288
    free    @      )fb_namekernel_size	n_filtersstride   )hidden_size
num_layers      d   gLNreluLSTM)	n_repeatsbn_chanhid_size
chunk_size	norm_typemask_actrnn_typeframes_per_second}   N>        T      @encoder_decoderlineardiardprnnsample_ratenum_channelstask	n_sources	use_wavlmgradient_clip_valc                    s  t stdtstdt j|||d t| j|}t| j|}t| j|}t| j	|}|	| _
| dddd || _|d d	krF|d
 }n|d dkrYtd|d
 d d  }ntdtdd|i| jj\| _| _| j
rtd| _d}| jjjD ]}t|jtjr||jjd 9 }q|t||d  | _t|d
 | jj j!j" f|d
 |d| jj#| _$nt|d
 f|d
 |d| jj#| _$t||d  |d  | _%tj&| j%| j%d| _'|}|d dkrt(dd t)|g| jj*d g| jj*d   D | _*|
| _+d| _,d S )Nzw'asteroid' must be installed to use ToTaToNet separation. `pip install pyannote-audio[separation]` should do the trick.z{'transformers' must be installed to use ToTaToNet separation. `pip install pyannote-audio[separation]` should do the trick.)r2   r3   r4   r.   r/   r1   r0   r   r   r   stftr   r+   zFilterbank type not recognized.r2   zmicrosoft/wavlm-larger   r   )out_chann_srcr(   )r   r   c                 S   s   g | ]
\}}t ||qS  )nnLinear).0in_featuresout_featuresr;   r;   g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/pyannote/audio/models/separation/ToTaToNet.py
<listcomp>   s    
z&ToTaToNet.__init__.<locals>.<listcomp>r   Fr;   )-ASTEROID_IS_AVAILABLEImportErrorTRANSFORMERS_IS_AVAILABLEsuper__init__r   LINEAR_DEFAULTSDPRNN_DEFAULTSENCODER_DECODER_DEFAULTSDIAR_DEFAULTSr6   save_hyperparametersr5   int
ValueErrorr   hparamsr.   encoderdecoderr   from_pretrainedwavlmfeature_extractorconv_layers
isinstanceconvr<   Conv1dr   wavlm_scalingr   feature_projection
projectionr@   r1   maskerdiarization_scaling	AvgPool1daverage_pool
ModuleListr   r/   r7   automatic_optimization)selfr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   n_feats_outdownsampling_factor
conv_layerlinaer_input_features	__class__r;   rA   rG      s   




zToTaToNet.__init__returnc                 C   s   dS )zDimension of outputr+   r;   rb   r;   r;   rA   	dimension   s   zToTaToNet.dimensionc                 C   s@   | j jd dkrtd| j| _ntd| j| _|  | _d S )Nr   r   r   r+   )rO   r/   r<   r=   rk   
classifierdefault_activation
activationrj   r;   r;   rA   build   s   zToTaToNet.buildnum_samplesc                 C   2   | j | jjd  }| j | jjd  }t|||dS )zCompute number of output frames

        Parameters
        ----------
        num_samples : int
            Number of input samples.

        Returns
        -------
        num_frames : int
            Number of output frames.
        r   r   r   r   )r]   rO   r.   r	   )rb   rp   equivalent_strideequivalent_kernel_sizer;   r;   rA   
num_frames   s   zToTaToNet.num_framesru   c                 C   rq   )a
  Compute size of receptive field

        Parameters
        ----------
        num_frames : int, optional
            Number of frames in the output signal

        Returns
        -------
        receptive_field_size : int
            Receptive field size.
        r   r   rr   )r]   rO   r.   r   )rb   ru   rs   rt   r;   r;   rA   receptive_field_size     zToTaToNet.receptive_field_sizer   framec                 C   rq   )zCompute center of receptive field

        Parameters
        ----------
        frame : int, optional
            Frame index

        Returns
        -------
        receptive_field_center : int
            Index of receptive field center.
        r   r   rr   )r]   rO   r.   r
   )rb   rx   rs   rt   r;   r;   rA   receptive_field_center  rw   z ToTaToNet.receptive_field_center	waveformsc           
      C   sV  |j d }| |}| jr8| |dj}|dd}|j| jdd}t	||}t
j||fdd}| |}n| |}||d }| |}t	||}|dd}t
j|ddd}| |}|dd}| jjd dkr|| jD ]	}	t|	|}qr| jjd dkr|d jddd}| |}||| jd}|dd}| jd ||fS )zPass forward

        Parameters
        ----------
        waveforms : (batch, channel, sample)

        Returns
        -------
        scores : (batch, frame, classes)
        sources : (batch, sample, n_sources)
        r   r+   r   )dim)	start_dimend_dimr   )shaperP   r6   rS   squeezelast_hidden_state	transposerepeat_interleaverY   r   torchcatr\   	unsqueezerQ   flattenr_   rO   r/   F
leaky_relusumrl   reshaper5   rn   )
rb   rz   bsztf_rep	wavlm_repmasksmasked_tf_repdecoded_sourcesoutputsr/   r;   r;   rA   forward5  s4   








zToTaToNet.forward)
NNNNr*   r+   Nr,   Tr-   )r+   )r   )__name__
__module____qualname____doc__rJ   rH   rI   rK   dictr   rM   r   boolfloatrG   propertyrk   ro   r   ru   rv   ry   r   Tensorr   __classcell__r;   r;   rg   rA   r   <   sp    3
		
\r   )"	functoolsr   typingr   r   torch.nnr<   torch.nn.functional
functionalr   asteroid_filterbanksr   pyannote.core.utils.generatorsr   pyannote.audio.core.modelr   pyannote.audio.core.taskr   pyannote.audio.utils.paramsr   $pyannote.audio.utils.receptive_fieldr	   r
   r   asteroid.masknnr   asteroid.utils.torch_utilsr   rC   rD   transformersr   rE   r   r;   r;   r;   rA   <module>   s2   