o
    pi2                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZ zd d
lmZ d dlmZ dZW n eyb   dZY nw z
d dlmZ dZ W n eyx   dZ Y nw G dd deZ!dS )    )	lru_cache)OptionalN)make_enc_dec)pairwise)Model)Task)
merge_dict)conv1d_num_framesconv1d_receptive_field_centerconv1d_receptive_field_size)DPRNN)
pad_x_to_yTF)	AutoModelc                       s  e Zd ZdZdddddZdddZd	d
d
dddddZddiZ											d8dede	e de	e dede
de
de	e d e
d!ed"ed#ef fd$d%Zed&e
fd'd(Zd)d* Zed+e
d&e
fd,d-Zd9d.e
d&e
fd/d0Zd:d2e
d&e
fd3d4Zd5ejd&ejfd6d7Z  ZS );	ToTaToNetu  ToTaToNet joint speaker diarization and speech separation model

                        /--------------\
    Conv1D Encoder --------+--- DPRNN --X------- Conv1D Decoder
    WavLM -- upsampling --/                 \--- Avg pool -- Linear -- Classifier


    Parameters
    ----------
    sample_rate : int, optional
        Audio sample rate. Defaults to 16kHz (16000).
    num_channels : int, optional
        Number of channels. Defaults to mono (1).
    sincnet : dict, optional
        Keyword arugments passed to the SincNet block.
        Defaults to {"stride": 1}.
    linear : dict, optional
        Keyword arugments used to initialize linear layers
        See ToTaToNet.LINEAR_DEFAULTS for default values.
    diar : dict, optional
        Keyword arguments used to initalize the average pooling in the diarization branch.
        See ToTaToNet.DIAR_DEFAULTS for default values.
    encoder_decoder : dict, optional
        Keyword arguments used to initalize the encoder and decoder.
        See ToTaToNet.ENCODER_DECODER_DEFAULTS for default values.
    dprnn : dict, optional
        Keyword arguments used to initalize the DPRNN model.
        See ToTaToNet.DPRNN_DEFAULTS for default values.
    sample_rate : int, optional
        Audio sample rate. Defaults to 16000.
    num_channels : int, optional
        Number of channels. Defaults to 1.
    task : Task, optional
        Task to perform. Defaults to None.
    n_sources : int, optional
        Number of separated sources. Defaults to 3.
    use_wavlm : bool, optional
        Whether to use the WavLM large model for feature extraction. Defaults to True.
    wavlm_frozen : bool, optional
        Whether to freeze the WavLM model. Defaults to False.
    gradient_clip_val : float, optional
        Gradient clipping value. Required when fine-tuning the WavLM model and thus using two different optimizers.
        Defaults to 5.0.

    References
    ----------
    Joonas Kalda, Clément Pagés, Ricard Marxer, Tanel Alumäe, and Hervé Bredin.
    "PixIT: Joint Training of Speaker Diarization and Speech Separation
    from Real-world Multi-speaker Recordings"
    Odyssey 2024. https://arxiv.org/abs/2403.02288
    free    @      )fb_namekernel_size	n_filtersstride   )hidden_size
num_layers      d   gLNreluLSTM)	n_repeatsbn_chanhid_size
chunk_size	norm_typemask_actrnn_typeframes_per_second}   N>        TF      @encoder_decoderlineardiardprnnsample_ratenum_channelstask	n_sources	use_wavlmwavlm_frozengradient_clip_valc                    s$  t stdtstdt j|||d t| j|}t| j|}t| j|}t| j	|}|	| _
| ddddd || _|d	 d
krG|d }n|d	 dkrZtd|d d d  }ntdtdd|i| jj\| _| _| j
rtd| _| j D ]}|
 |_q{d}| jjjD ]}t|jtjr||jjd 9 }qt||d  | _ t!|d | jj"j#j$ f|d |d| jj%| _&nt!|d f|d |d| jj%| _&t||d  |d  | _'tj(| j'| j'd| _)|}|d dkr
t*dd t+|g| jj,d g| jj,d   D | _,|| _-|
| _.d S )Nzw'asteroid' must be installed to use ToTaToNet separation. `pip install pyannote-audio[separation]` should do the trick.z{'transformers' must be installed to use ToTaToNet separation. `pip install pyannote-audio[separation]` should do the trick.)r2   r3   r4   r.   r/   r1   r0   r7   r   r   r   stftr   r+   zFilterbank type not recognized.r2   zmicrosoft/wavlm-larger   r   )out_chann_srcr(   )r   r   c                 S   s   g | ]
\}}t ||qS  )nnLinear).0in_featuresout_featuresr<   r<   ^/home/ubuntu/.local/lib/python3.10/site-packages/pyannote/audio/models/separation/ToTaToNet.py
<listcomp>   s    
z&ToTaToNet.__init__.<locals>.<listcomp>r   r<   )/ASTEROID_IS_AVAILABLEImportErrorTRANSFORMERS_IS_AVAILABLEsuper__init__r   LINEAR_DEFAULTSDPRNN_DEFAULTSENCODER_DECODER_DEFAULTSDIAR_DEFAULTSr6   save_hyperparametersr5   int
ValueErrorr   hparamsr.   encoderdecoderr   from_pretrainedwavlm
parametersrequires_gradfeature_extractorconv_layers
isinstanceconvr=   Conv1dr   wavlm_scalingr   feature_projection
projectionrA   r1   maskerdiarization_scaling	AvgPool1daverage_pool
ModuleListr   r/   r8   automatic_optimization)selfr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   n_feats_outparamdownsampling_factor
conv_layerlinaer_input_features	__class__r<   rB   rH      s   






zToTaToNet.__init__returnc                 C   s   dS )zDimension of outputr+   r<   re   r<   r<   rB   	dimension   s   zToTaToNet.dimensionc                 C   s@   | j jd dkrtd| j| _ntd| j| _|  | _d S )Nr   r   r   r+   )rP   r/   r=   r>   ro   
classifierdefault_activation
activationrn   r<   r<   rB   build   s   zToTaToNet.buildnum_samplesc                 C   2   | j | jjd  }| j | jjd  }t|||dS )zCompute number of output frames

        Parameters
        ----------
        num_samples : int
            Number of input samples.

        Returns
        -------
        num_frames : int
            Number of output frames.
        r   r   r   r   )r`   rP   r.   r	   )re   rt   equivalent_strideequivalent_kernel_sizer<   r<   rB   
num_frames   s   zToTaToNet.num_framesry   c                 C   ru   )a
  Compute size of receptive field

        Parameters
        ----------
        num_frames : int, optional
            Number of frames in the output signal

        Returns
        -------
        receptive_field_size : int
            Receptive field size.
        r   r   rv   )r`   rP   r.   r   )re   ry   rw   rx   r<   r<   rB   receptive_field_size     zToTaToNet.receptive_field_sizer   framec                 C   ru   )zCompute center of receptive field

        Parameters
        ----------
        frame : int, optional
            Frame index

        Returns
        -------
        receptive_field_center : int
            Index of receptive field center.
        r   r   rv   )r`   rP   r.   r
   )re   r|   rw   rx   r<   r<   rB   receptive_field_center$  r{   z ToTaToNet.receptive_field_center	waveformsc           
      C   sV  |j d }| |}| jr8| |dj}|dd}|j| jdd}t	||}t
j||fdd}| |}n| |}||d }| |}t	||}|dd}t
j|ddd}| |}|dd}| jjd dkr|| jD ]	}	t|	|}qr| jjd dkr|d jddd}| |}||| jd}|dd}| jd ||fS )zPass forward

        Parameters
        ----------
        waveforms : (batch, channel, sample)

        Returns
        -------
        scores : (batch, frame, classes)
        sources : (batch, sample, n_sources)
        r   r+   r   )dim)	start_dimend_dimr   )shaperQ   r6   rT   squeezelast_hidden_state	transposerepeat_interleaver\   r   torchcatr_   	unsqueezerR   flattenrb   rP   r/   F
leaky_relusumrp   reshaper5   rr   )
re   r~   bsztf_rep	wavlm_repmasksmasked_tf_repdecoded_sourcesoutputsr/   r<   r<   rB   forward=  s4   








zToTaToNet.forward)NNNNr*   r+   Nr,   TFr-   )r+   )r   )__name__
__module____qualname____doc__rK   rI   rJ   rL   dictr   rN   r   boolfloatrH   propertyro   rs   r   ry   rz   r}   r   Tensorr   __classcell__r<   r<   rk   rB   r   <   sv    5
		
br   )"	functoolsr   typingr   r   torch.nnr=   torch.nn.functional
functionalr   asteroid_filterbanksr   pyannote.core.utils.generatorsr   pyannote.audio.core.modelr   pyannote.audio.core.taskr   pyannote.audio.utils.paramsr   $pyannote.audio.utils.receptive_fieldr	   r
   r   asteroid.masknnr   asteroid.utils.torch_utilsr   rD   rE   transformersr   rF   r   r<   r<   r<   rB   <module>   s2   