o
    }oiDN                     @   s   d dl mZ d dlZd dlmZ d dlm  mZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZmZmZ d dlmZ dgZG d	d
 d
ejZG dd deeZdS )    )OrderedDictN)	typecheck)
Exportable)NeuralModule)EncodedRepresentationLengthsType
NeuralTypeSpectrogramType)	ProbsTypeMSDD_modulec                       s&   e Zd Zd fdd	Zdd Z  ZS )		ConvLayer      r   r   r   c              	      s@   t t|   ttj||||dt tj|ddd| _d S )Nin_channelsout_channelskernel_sizestridegMbP?gGz?)epsmomentum)	superr   __init__nn
SequentialConv2dReLUBatchNorm2dcnn)selfr   r   r   r   	__class__ ^/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/modules/msdd_diarizer.pyr      s   
zConvLayer.__init__c                 C   s   |  |}|S N)r   )r    featurer#   r#   r$   forward'   s   
zConvLayer.forward)r   r   r   r   )__name__
__module____qualname__r   r'   __classcell__r#   r#   r!   r$   r      s    r   c                       s   e Zd ZdZedd Zedd Zdd Z					
							d0dededede	dededede	dede
de
f fddZdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd1d*d+Ze d,d- Zd.d/ Z  ZS )2r   a  
    Multi-scale Diarization Decoder (MSDD) for overlap-aware diarization and improved diarization accuracy from clustering diarizer.
    Based on the paper: Taejin Park et. al, "Multi-scale Speaker Diarization with Dynamic Scale Weighting", Interspeech 2022.
    Arxiv version: https://arxiv.org/pdf/2203.15974.pdf

    Args:
        num_spks (int):
            Max number of speakers that are processed by the model. In `MSDD_module`, `num_spks=2` for pairwise inference.
        hidden_size (int):
            Number of hidden units in sequence models and intermediate layers.
        num_lstm_layers (int):
            Number of the stacked LSTM layers.
        dropout_rate (float):
            Dropout rate for linear layers, CNN and LSTM.
        cnn_output_ch (int):
            Number of channels per each CNN layer.
        emb_dim (int):
            Dimension of the embedding vectors.
        scale_n (int):
            Number of scales in multi-scale system.
        clamp_max (float):
            Maximum value for limiting the scale weight values.
        conv_repeat (int):
            Number of CNN layers after the first CNN layer.
        weighting_scheme (str):
            Name of the methods for estimating the scale weights.
        context_vector_type (str):
            If 'cos_sim', cosine similarity values are used for the input of the sequence models.
            If 'elem_prod', element-wise product values are used for the input of the sequence models.
    c                 C   s   t tdt tdt dS )z<
        Return definitions of module output ports.
        BTCr-   r.   r/   D)probsscale_weights)r   r   r
   r    r#   r#   r$   output_typesL   s
   

zMSDD_module.output_typesc                 C   s6   t tdt ttdt tdt tdt dS )z<
        Return  definitions of module input ports.
        r0   r-   )r-   r/   r1   r/   r,   )
ms_emb_seqlengthms_avg_embstargets)r   r   r	   tupler   r   r
   r4   r#   r#   r$   input_typesX   s   


zMSDD_module.input_typesc                 C   s   t |tjkrtjj|j |jj	d d S t |tj
tjtjfv rQ| D ]*\}}d|v r9tjj|j q(d|v rFtjj|j q(d|v rP|j	d q(d S d S )Ng{Gz?	weight_ih	weight_hhbias)typer   Lineartorchinitxavier_uniform_weightr>   datafill_GRULSTMRNNnamed_parametersorthogonal_)r    mnameparamr#   r#   r$   init_weightsf   s   zMSDD_module.init_weights            ?               ?r   conv_scale_weightcos_simnum_spkshidden_sizenum_lstm_layersdropout_ratecnn_output_chemb_dimscale_n	clamp_maxconv_repeatweighting_schemecontext_vector_typec              	      s<  t    d | _d| _d| _|| _|| _|| _|| _|	| _	d| _
d| _|| _|
| _|| _tjjdd| _tjjd| jd| _tj||| jdd|d	| _| jd
krttd|| j| j|  dfddg| _td|	d D ]}| jtd|| jdfdd qlt | _t| j	d D ]}| jtj| jdd qt|| || _t|| j| _ n| jdkrtj||dd| _!tj"#| j!j$ nt%d| j td| | j| _&| jdkrt| j| j || _'| j'(| j) n| jdkrt| j| j || _*nt%d| j t+|| _,| j&(| j) | j(| j) || _-d S )Nr   2   rP   gư>dimr   )rf   r   T)
num_layersbatch_firstbidirectionaldropoutrW   r   r   F)affineattn_scale_weight)r>   No such weighting scheme as rX   	elem_prodNo such context vector type as ).r   r   _speaker_model
batch_sizer7   r^   rY   r_   r]   ra   chanr   r[   rb   rc   rA   r   SoftmaxsoftmaxCosineSimilaritycos_distrH   lstm
ModuleListr   convrangeappendconv_bnr   r@   conv_to_linearlinear_to_weightsW_arB   eye_rD   
ValueErrorhidden_to_spksdist_to_embapplyrO   product_to_embDropoutrj   r`   )r    rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   conv_idxr!   r#   r$   r   s   sx   

	





zMSDD_module.__init__c                 C   s\  |j d | _|j d | _|j d | _|ddddd| j}|}|dd| jddd}|ddddd| j| jd| j}| j	dkrL| 
||}n| j	dkrX| ||}ntd	| j	 ||j}| jd
krs| |||}	n| jdkr| |||}	ntd| j | t|	}	| |	}
| t|
d }| |}t |}||fS )a)  
        Core model that accepts multi-scale cosine similarity values and estimates per-speaker binary label.

        Args:
            ms_emb_seq (Tensor):
                Multiscale input embedding sequence
                Shape: (batch_size, length, scale_n, emb_dim)
            length (Tensor):
                The actual length of embedding sequences without zero padding
                Shape: (batch_size,)
            ms_avg_embs (Tensor):
                Cluster-average speaker embedding vectors.
                Shape: (batch_size, scale_n, self.emb_dim, max_spks)
            targets (Tensor):
                Ground-truth labels for the finest segment.
                Shape: (batch_size, feats_len, max_spks)

        Returns:
            preds (Tensor):
                Predicted binary speaker label for each speaker.
                Shape: (batch_size, feats_len, max_spks)
            scale_weights (Tensor):
                Multiscale weights per each base-scale segment.
                Shape: (batch_size, length, scale_n, max_spks)

        r   r      rP   r   rW   rl   rm   rX   rn   ro   )shaperq   r7   r^   	unsqueezeexpandrY   permutereshaperb   conv_scale_weightsattention_scale_weightsr   todevicerc   cosine_similarityelement_wise_productrj   Frelurw   r   r   Sigmoid)r    r6   r7   r8   r9   _ms_emb_seqms_emb_seq_singlems_avg_embs_permr3   context_emblstm_outputlstm_hidden_out	spk_predspredsr#   r#   r$   
core_model   s0   $





zMSDD_module.core_modelc                 C   s   | | j| j | j| j}| | j| j | j| j| j}| d| j| j}|d d| j| j| j}|| }t| | j| j | j d| j| | j| j | j | j| j}	|	 | j| j| j| j }	| 	|	}
|
S )a3  
        Calculate element wise product values among cluster-average embedding vectors and input embedding vector sequences.
        This function is selected by assigning `self.context_vector_type = "elem_prod"`. `elem_prod` method usually takes more
        time to converge compared to `cos_sim` method.

        Args:
            scale_weights (Tensor):
                Multiscale weight vector.
                Shape: (batch_size, feats_len, scale_n, max_spks)
            ms_avg_embs_perm (Tensor):
                Tensor containing cluster-average speaker embeddings for each scale.
                Shape: (batch_size, length, scale_n, emb_dim)
            ms_emb_seq (Tensor):
                Tensor containing multi-scale speaker embedding sequences. `ms_emb_seq` is a single channel input from the
                given audio stream input.
                Shape: (batch_size, length, num_spks, emb_dim)

        Returns:
            context_emb (Tensor):
                Output of `dist_to_emb` linear layer containing context for speaker label estimation.
        r   r   r   )
r   rq   r7   rY   r_   r^   r   rA   bmmr   )r    r3   r8   r6   scale_weight_flattenms_avg_embs_flattenms_emb_seq_flattenms_emb_seq_flatten_repelemwise_productcontext_vectorsr   r#   r#   r$   r     s   
z MSDD_module.element_wise_productc                 C   s8   |  ||}t||}|| j| jd}| |}|S )a  
        Calculate cosine similarity values among cluster-average embedding vectors and input embedding vector sequences.
        This function is selected by assigning self.context_vector_type = "cos_sim".

        Args:
            scale_weights (Tensor):
                Multiscale weight vector.
                Shape: (batch_size, feats_len, scale_n, max_spks)
            ms_avg_embs_perm (Tensor):
                Tensor containing cluster-average speaker embeddings for each scale.
                Shape: (batch_size, length, scale_n, emb_dim)
            _ms_emb_seq (Tensor):
                Tensor containing multi-scale speaker embedding sequences. `ms_emb_seq` is a single channel input from the
                given audio stream input.
                Shape: (batch_size, length, num_spks, emb_dim)

        Returns:
            context_emb (Tensor):
                Output of `dist_to_emb` linear layer containing context for speaker label estimation.
        r   )rv   rA   mulviewrq   r7   r   )r    r3   r8   r   cos_dist_seqr   r   r#   r#   r$   r   &  s
   
zMSDD_module.cosine_similarityc                 C   s   |  |dd |  |dd}|ddddd}t||d| j| j| j}ttj	|ddd}|| j
| j| j| j}| |}|S )a  
        Use weighted inner product for calculating each scale weight. W_a matrix has (emb_dim * emb_dim) learnable parameters
        and W_a matrix is initialized with an identity matrix. Compared to "conv_scale_weight" method, this method shows more evenly
        distributed scale weights.

        Args:
            ms_avg_embs_perm (Tensor):
                Tensor containing cluster-average speaker embeddings for each scale.
                Shape: (batch_size, length, scale_n, emb_dim)
            ms_emb_seq (Tensor):
                Tensor containing multi-scale speaker embedding sequences. `ms_emb_seq` is input from the
                given audio stream input.
                Shape: (batch_size, length, num_spks, emb_dim)

        Returns:
            scale_weights (Tensor):
                Weight vectors that determine the weight of each scale.
                Shape: (batch_size, length, num_spks, emb_dim)
        r   r   rP   r   )dim1dim2)r   flattenr   rA   matmulr   r_   rY   sigmoiddiagonalrq   r7   rt   )r    r   r6   mat_amat_bweighted_corrr3   r#   r#   r$   r   A  s   
z#MSDD_module.attention_scale_weightsc           	      C   s   t j||gdd}|ddd}| j|| jd | jd dd}td| jd D ]}| j|| j| | j| dd}q)|	| j
| j| j| j }| |}| t|}| | |}|d	d
d
d
| j}|S )a  
        Use multiple Convnet layers to estimate the scale weights based on the cluster-average embedding and
        input embedding sequence.

        Args:
            ms_avg_embs_perm (Tensor):
                Tensor containing cluster-average speaker embeddings for each scale.
                Shape: (batch_size, length, scale_n, emb_dim)
            ms_emb_seq_single (Tensor):
                Tensor containing multi-scale speaker embedding sequences. ms_emb_seq_single is input from the
                given audio stream input.
                Shape: (batch_size, length, num_spks, emb_dim)

        Returns:
            scale_weights (Tensor):
                Weight vectors that determine the weight of each scale.
                Shape: (batch_size, length, num_spks, emb_dim)
        rP   re   r   r   T)conv_module	bn_modulefirst_layerF)
conv_inputr   r   r   r   r   )rA   catr   r   conv_forwardry   r|   rz   ra   r   rq   r7   r]   r^   r}   rj   r   
leaky_relurt   r~   r   rY   )	r    r   r   ms_cnn_input_seqconv_outr   lin_input_seq
hidden_seqr3   r#   r#   r$   r   _  s$   
zMSDD_module.conv_scale_weightsFc                 C   s~   ||}|s| ddddn|}|| j| j| j| j}|ddd}|| dddd dddd}| t	
|}|S )a  
        A module for convolutional neural networks with 1-D filters. As a unit layer batch normalization, non-linear layer and dropout
        modules are included.

        Note:
            If `first_layer=True`, the input shape is set for processing embedding input.
            If `first_layer=False`, then the input shape is set for processing the output from another `conv_forward` module.

        Args:
            conv_input (Tensor):
                Reshaped tensor containing cluster-average embeddings and multi-scale embedding sequences.
                Shape: (batch_size*length, 1, scale_n*(num_spks+1), emb_dim)
            conv_module (ConvLayer):
                ConvLayer instance containing torch.nn.modules.conv modules.
            bn_module (torch.nn.modules.batchnorm.BatchNorm2d):
                Predefined Batchnorm module.
            first_layer (bool):
                Boolean for switching between the first layer and the others.
                Default: `False`

        Returns:
            conv_out (Tensor):
                Convnet output that can be fed to another ConvLayer module or linear layer.
                Shape: (batch_size*length, 1, cnn_output_ch, emb_dim)
        r   rP   r   r   )r   r   rq   r7   r]   r^   r   r   rj   r   r   )r    r   r   r   r   r   r#   r#   r$   r     s    zMSDD_module.conv_forwardc                 C   s   |  ||||\}}||fS r%   )r   )r    r6   r7   r8   r9   r   r3   r#   r#   r$   r'     s   zMSDD_module.forwardc                 C   s   t |  j}tj|jd fd|d}tjd|| j| j|d}tjd| j| j| j	|d}td|| j	
  }t||||gS )z{
        Generate input examples for tracing etc.

        Returns (tuple):
            A tuple of input examples.
        r   {   )size
fill_valuer   r   )r   )next
parametersr   rA   fullr   randnr_   r^   rY   roundfloatr:   )r    r   input_examplelensavg_embsr9   r#   r#   r$   r     s   zMSDD_module.input_example)rP   rQ   rP   rR   rS   rT   rU   rV   r   rW   rX   )F)r(   r)   r*   __doc__propertyr5   r;   rO   intr   strr   r   r   r   r   r   r   r   r'   r   r+   r#   r#   r!   r$   r   ,   sd    

	
S;%
("
)collectionsr   rA   torch.nnr   torch.nn.functional
functionalr   nemo.core.classes.commonr   nemo.core.classes.exportabler   nemo.core.classes.moduler   nemo.core.neural_typesr   r   r   r	   nemo.core.neural_types.elementsr
   __all__Moduler   r   r#   r#   r#   r$   <module>   s   