o
    wiT2                    @   sJ  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d d	lmZmZ d d
l m!Z! d dl"m#Z# d dl$m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZD d dlEmFZF d dlGmHZHmIZI d dlJmKZKmLZLmMZM d dlNmOZO d dlPmQZQ zd dlRmSZS W n eTy   d dlUmVZV eVd'ddZSY nw g d ZWG d!d" d"eFe/ZXG d#d$ d$ejYjZZ[G d%d& d&eZ\dS )(    N)OrderedDict)Path)mode)AnyDictListOptionalTupleUnion)instantiate)LightningModuleTrainer)rank_zero_only)
DictConfig	open_dict)
Annotation)DiarizationErrorRate)tqdm)AudioToSpeechMSDDInferDatasetAudioToSpeechMSDDTrainDataset)score_labels)MultiBinaryAccuracy)ClusteringDiarizer)ExportableEncDecModel)_MODEL_CONFIG_YAML_SPEAKER_MODEL
_VAD_MODELget_available_model_names)NeuralDiarizerInferenceConfig)EncDecSpeakerLabelModel)WaveformFeaturizer)	audio_rttm_mapget_embs_and_timestampsget_id_tup_dictget_scale_mapping_argmatget_uniq_id_list_from_manifestlabels_to_pyannote_objectmake_rttm_with_overlapparse_scale_configsrttm_to_labels)ModelPT)PretrainedModelInfo	typecheck)AudioSignalLengthsType
NeuralType)	ProbsType)logging)autocast)contextmanagerc                 c   s    dV  dS )zauto-casting context managerN )enabledr4   r4   d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/asr/models/msdd_models.pyr2   G   s   
r2   )EncDecDiarLabelModelClusterEmbeddingNeuralDiarizerc                       s,  e Zd ZdZedee fddZdJdede	f fdd	Z
d
d Zdd Zdd Zdd Z	dKdedededefddZdeeeef  fddZdeeeef  fddZdeeeef  fd d!Zd"d# Zd$d% Zedeeeef  fd&d'Zedeeef fd(d)Zd*ejd+ejd,ejdejfd-d.Z e! d*ejd/ejd,ejdejfd0d1Z"e! d2ejd3ejd4ejd,ejde#ejejeje#ejejf f f
d5d6Z$d7d8 Z%e& d9d: Z'd;e(d<e)fd=d>Z*dLd;e(d<e)d@e)fdAdBZ+dLdCe(d@e)fdDdEZ,dLdCeeeejf  d@e)fdFdGZ-dHdI Z.  Z/S )Mr7   a%  
    Encoder decoder class for multiscale diarization decoder (MSDD). Model class creates training,
    validation methods for setting up data performing model forward pass.

    This model class expects config dict for:
        * preprocessor
        * msdd_model
        * speaker_model
    returnc                 C   s    g }t dddd}|| |S )
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        diar_msdd_telephoniczthttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/diar_msdd_telephonic/versions/1.0.1/files/diar_msdd_telephonic.nemozqFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:diar_msdd_telephonic)pretrained_model_namelocationdescription)r+   append)clsresultmodelr4   r4   r6   list_available_models[   s   
z*EncDecDiarLabelModel.list_available_modelsNcfgtrainerc                    sD  |r|nd| _ || _| j r!|   |j|j | _| jj| _d| _nd| _d| _t j	| j|d | jj
jjj}t|tsBt|dkrFtdt|| j_| jj| jj_| jj| _t| jj| _td| jjj | _t| jj| _|dur|   | | nt|j| j_|  d t!| jj"| _"t# | _$t# | _%t# | _&dS )z
        Initialize an MSDD model and the specified speaker embedding model. In this init function,
        training and validation datasets are prepared.
        NF   T)rE   rF   zRwindow_length_in_sec should be a list containing multiple segment (window) lengthsrE   )'_trainercfg_msdd_model_init_segmentation_info	num_nodesnum_devices
world_sizeemb_batch_sizepairwise_infersuper__init__diarizerspeaker_embeddings
parameterswindow_length_in_sec
isinstanceintlen
ValueErrorscale_nmsdd_moduler   from_config_dictpreprocessor_cfgwindow_strideframe_per_secr7   msdd_init_speaker_modeladd_speaker_model_configspeaker_model_cfg_speaker_modelsave_hyperparametersr   lossr   _accuracy_test_accuracy_train_accuracy_valid)selfrE   rF   rU   	__class__r4   r6   rQ   o   s8   


zEncDecDiarLabelModel.__init__c                 C   sP   t | t| jjj}||_|j`|j`W d   dS 1 s!w   Y  dS )a  
        Add config dictionary of the speaker model to the model's config dictionary. This is required to
        save and load speaker model with MSDD model.

        Args:
            cfg (DictConfig): DictConfig type variable that conatains hyperparameters of MSDD model.
        N)r   copyra   re   rE   rd   train_dsvalidation_ds)rk   rE   cfg_cpr4   r4   r6   rc      s   
"z-EncDecDiarLabelModel.add_speaker_model_configc                 C   s4   | j j| _t| jjjj| jjjj| jjjj| _	dS )zGInitialize segmentation settings: window, shift and multiscale weights.N)
rI   rR   _diarizer_paramsr(   rS   rT   rU   shift_length_in_secmultiscale_weightsmultiscale_args_dictrk   r4   r4   r6   rJ      s   




z,EncDecDiarLabelModel._init_segmentation_infoc                 C   s   | j jjj}| j j| _tj std}n| j	r!t| j	j
}nd}|dur>|dr>tj||d| j_td| n7|drTtj||d| _td| n!|ttvrdtd| d}td	| tj||d
| j_| j jjj| _dS )a<  
        Initialize speaker embedding model with model name or path passed through config. Note that
        speaker embedding model is loaded to `self.msdd` to enable multi-gpu and multi-node training.
        In addition, speaker embedding model is also saved with msdd model when `.ckpt` files are saved.
        cpuN.nemo)map_locationz&Speaker Model restored locally from {}.ckptzCrequested {} model name not available in pretrained models, insteadtitanet_large$Loading pretrained {} model from NGC
model_namery   )rI   rR   rS   
model_pathrr   torchcudais_availabledevicerH   global_rankendswithr   restore_fromra   re   r1   infoformatload_from_checkpointr   warningfrom_pretrainedrT   _speaker_params)rk   r   rank_idr4   r4   r6   rb      s.   



z(EncDecDiarLabelModel._init_speaker_modelc                 C   s   t |d |ddd d}d|v r!|d d u r!td|  d S t|j|j| j|j|| j	j
j|jd| jjd	}|j| _|}|j}|d }tjjj||||d	dd|d
d|dddS )Nsample_rate
int_valuesF)r   r   	augmentormanifest_filepathJCould not load dataset as `manifest_filepath` was None. Provided config : )	r   emb_dirru   soft_label_thres
featurizerr_   rN   rO   r   
batch_size	drop_lastnum_workersr   
pin_memorydatasetr   
collate_fnr   shuffler   r   )r    getr1   r   r   r   r   ru   r   rI   r]   r_   rN   rH   r   
collectiondata_collectionmsdd_train_collate_fnr   utilsdata
DataLoader)rk   configr   r   
collate_dsr   r   r4   r4   r6   __setup_dataloader_from_config   s<   


z3EncDecDiarLabelModel.__setup_dataloader_from_configFr   emb_dictemb_seqclus_label_dictc                 C   s   | dd}d|v r|d d u rtd|  d S t|d ||||j|j| jjjd|d	}|j	| _
|}|j}	|d }
tjjj||
|	| dd|| dd	| d
ddS )Nr   Fr   r   )	r   r   r   r   r   seq_eval_moder_   use_single_scale_clusrO   r   r   r   r   r   r   )r   r1   r   r   r   r   r^   r]   r_   r   r   msdd_infer_collate_fnr   r   r   r   )rk   r   r   r   r   rO   r   r   r   r   r   r4   r4   r6   $__setup_dataloader_from_config_infer   s8   


z9EncDecDiarLabelModel.__setup_dataloader_from_config_infertrain_data_configc                 C      | j |d| _d S N)r   )3_EncDecDiarLabelModel__setup_dataloader_from_config	_train_dl)rk   r   r4   r4   r6   setup_training_data     z(EncDecDiarLabelModel.setup_training_dataval_data_layer_configc                 C   r   r   )r   _validation_dl)rk   r   r4   r4   r6   setup_validation_data  r   z*EncDecDiarLabelModel.setup_validation_datatest_data_configc                 C   s,   | j r| j|| j| j| j| j d| _d S d S )N)r   r   r   r   rO   )rO   9_EncDecDiarLabelModel__setup_dataloader_from_config_inferemb_sess_test_dictemb_seq_testclus_test_label_dict_test_dlrk   r   r4   r4   r6   setup_test_data"  s   z$EncDecDiarLabelModel.setup_test_datac                 C   s   dS )zu
        MSDD does not use multiple_test_data template. This function is a placeholder for preventing error.
        Nr4   r   r4   r4   r6   setup_multiple_test_data,  s   z-EncDecDiarLabelModel.setup_multiple_test_datac                 C   s   | j d ur| j S d S N)r   rv   r4   r4   r6   test_dataloader2  s   
z$EncDecDiarLabelModel.test_dataloaderc              	   C   sl   t | jdrt| jjd}nt }td|tdt tdt tdt tdt tdt tdt d	S )
N_sample_rate)freq)BT)r   )r   Cr   D)r   r   )r   r   r   r   r   r   featuresfeature_lengthms_seg_timestampsms_seg_countsclus_label_indexscale_mappingtargets)hasattrr]   r-   r   r/   r.   r0   )rk   audio_eltyper4   r4   r6   input_types6  s   





z EncDecDiarLabelModel.input_typesc                 C   s   t tdt tdt dS )Nr   )r   r   r   r   )probsscale_weights)r   r/   r0   rv   r4   r4   r6   output_typesF  s
   

z!EncDecDiarLabelModel.output_typesembsr   r   c                    s   |d j d |j d  }tj||d dd fddtdt D }g }t|D ]/}g }t D ]}	|| |	 }
||| |	 |
ddf  q8t|	ddd}|| q0t|}|S )	a/	  
        Reshape the given tensor and organize the embedding sequence based on the original sequence counts.
        Repeat the embeddings according to the scale_mapping information so that the final embedding sequence has
        the identical length for all scales.

        Args:
            embs (Tensor):
                Merged embeddings without zero-padding in the batch. See `ms_seg_counts` for details.
                Shape: (Total number of segments in the batch, emb_dim)
            scale_mapping (Tensor):
                The element at the m-th row and the n-th column of the scale mapping matrix indicates
                the (m+1)-th scale segment index which has the closest center distance with (n+1)-th segment
                in the base scale.

                Example:
                    scale_mapping_argmat[2][101] = 85
                In the above example, it means that 86-th segment in the 3rd scale (python index is 2)
                is mapped with 102-th segment in the base scale. Thus, the longer segments bound to have more
                repeating numbers since multiple base scale segments (since the base scale has the shortest length)
                fall into the range of the longer segments. At the same time, each row contains N numbers of
                indices where N is number of segments in the base-scale (i.e., the finest scale).
                Shape: (batch_size, scale_n, self.diar_window_length)
            ms_seg_counts (Tensor):
                Cumulative sum of the number of segments in each scale. This information is needed to reconstruct
                the multi-scale input matrix during forward propagating.

                Example: `batch_size=3, scale_n=6, emb_dim=192`
                    ms_seg_counts =
                     [[8,  9, 12, 16, 25, 51],
                      [11, 13, 14, 17, 25, 51],
                      [ 9,  9, 11, 16, 23, 50]]

                In this function, `ms_seg_counts` is used to get the actual length of each embedding sequence without
                zero-padding.

        Returns:
            ms_emb_seq (Tensor):
                Multi-scale embedding sequence that is mapped, matched and repeated. The longer scales are
                less repeated, while shorter scales are more frequently repeated following the scale mapping tensor.
        r   dimc                       g | ]
}||   qS r4   r4   .0irZ   split_emb_tupr4   r6   
<listcomp>|      z7EncDecDiarLabelModel.get_ms_emb_seq.<locals>.<listcomp>NrG      )
shaper   splitviewtolistrangerX   r@   stackpermute)rk   r   r   r   r   batch_emb_listms_emb_seq_list	batch_idx
feats_listscale_index
repeat_matrepp
ms_emb_seqr4   r   r6   get_ms_emb_seqO  s   +  
z#EncDecDiarLabelModel.get_ms_emb_seqr   c                    s~  |d j d |j d  }tj||d dd fddtdt D }g }t|D ]p}|| }	t|| }
t|	d|
 ||  }g }t D ]E}g }t| jj	D ]2}|| |k
  }t|szt| jjjjj|j}ntj|| | | dd}|| qZ|t| qPt|}|| q0t|dddd	}|  |j}|jrJ d
|S )aA  
        Calculate the cluster-average speaker embedding based on the ground-truth speaker labels
        (i.e., cluster labels).

        Args:
            embs (Tensor):
                Merged embeddings without zero-padding in the batch. See `ms_seg_counts` for details.
                Shape: (Total number of segments in the batch, emb_dim)
            clus_label_index (Tensor):
                Merged ground-truth cluster labels from all scales with zero-padding. Each scale's
                index can be retrieved by using segment index in `ms_seg_counts`.
                Shape: (batch_size, maximum total segment count among the samples in the batch)
            ms_seg_counts (Tensor):
                Cumulative sum of the number of segments in each scale. This information is needed
                to reconstruct multi-scale input tensors during forward propagating.

                Example: `batch_size=3, scale_n=6, emb_dim=192`
                    .. code:: python

                        ms_seg_counts =
                            [
                                [ 8,  9, 12, 16, 25, 51],
                                [11, 13, 14, 17, 25, 51],
                                [ 9,  9, 11, 16, 23, 50]
                            ]

                    Counts of merged segments: (121, 131, 118)
                    embs has shape of (370, 192)
                    clus_label_index has shape of (3, 131)

                Shape: (batch_size, scale_n)

        Returns:
            ms_avg_embs (Tensor):
                Multi-scale cluster-average speaker embedding vectors. These embedding vectors are used
                as reference for each speaker to predict the speaker label for the given multi-scale
                embedding sequences.
                Shape: (batch_size, scale_n, emb_dim, self.num_spks_per_model)
        r   r   r   c                    r   r4   r4   r   r   r4   r6   r     r   zCEncDecDiarLabelModel.get_cluster_avg_embs_model.<locals>.<listcomp>NrG      r   zVms_avg_embs.requires_grad = True. ms_avg_embs should be detached from the torch graph.)r   r   r   r   r   r   rX   sumrI   max_num_of_spksclonedetachanyzerosra   re   r^   decoder	emb_sizestor   meanr@   r   r   floatrequires_grad)rk   r   r   r   r   r   r   ms_avg_embs_listr   oracle_clus_idxmax_seq_lenclus_label_index_batchsession_avg_emb_set_listr   spk_set_listidx_whereavg_embsession_avg_emb_setms_avg_embsr4   r   r6   get_cluster_avg_embs_model  s6   + 

z/EncDecDiarLabelModel.get_cluster_avg_embs_modelprocessed_signalprocessed_signal_lenr   c              	   C   s  |j }t| j|  }| jjj}t| j	d d d | j
 }g g g }	}
}t|}|jd }t|D ]s}t| jD ]b}|| | }t|| | d| D ]M\}\}}t|  t|  }}t||| }t||tj|}|| dd||f |ddd|| f< || |	||  qQq=|
|| d  q6t||}t|	|}t|
|}|dkrtg d}}t|}nt| jj t|d| }t||d }||f}||||fS )aw  
        Load acoustic feature from audio segments for each scale and save it into a torch.tensor matrix.
        In addition, create variables containing the information of the multiscale subsegmentation information.

        Note: `self.emb_batch_size` determines the number of embedding tensors attached to the computational graph.
        If `self.emb_batch_size` is greater than 0, speaker embedding models are simultaneosly trained. Due to the
        constrant of GPU memory size, only a subset of embedding tensors can be attached to the computational graph.
        By default, the graph-attached embeddings are selected randomly by `torch.randperm`. Default value of
        `self.emb_batch_size` is 0.

        Args:
            processed_signal (Tensor):
                Zero-padded Feature input.
                Shape: (batch_size, feat_dim, the longest feature sequence length)
            processed_signal_len (Tensor):
                The actual legnth of feature input without zero-padding.
                Shape: (batch_size,)
            ms_seg_timestamps (Tensor):
                Timestamps of the base-scale segments.
                Shape: (batch_size, scale_n, number of base-scale segments, self.num_spks_per_model)
            ms_seg_counts (Tensor):
                Cumulative sum of the number of segments in each scale. This information is needed to reconstruct
                the multi-scale input matrix during forward propagating.
                Shape: (batch_size, scale_n)

        Returns:
            ms_mel_feat (Tensor):
                Feature input stream split into the same length.
                Shape: (total number of segments, feat_dim, self.frame_per_sec * the-longest-scale-length)
            ms_mel_feat_len (Tensor):
                The actual length of feature without zero-padding.
                Shape: (total number of segments,)
            seq_len (Tensor):
                The length of the input embedding sequences.
                Shape: (total number of segments,)
            detach_ids (tuple):
                Tuple containing both detached embeding indices and attached embedding indices
        
scale_dictr   Nr   )r   minrN   r   itemr]   r^   r   rW   ru   r`   r   r   r   rZ   	enumerater   r   r   float32r@   r   tensorarangemanual_seedrH   current_epochrandperm)rk   r  r  r   r   r   _emb_batch_sizefeat_dimmax_sample_countms_mel_feat_len_listsequence_lengths_listms_mel_feat_listtotal_seg_countr   r   	scale_idxscale_seg_numksttend	_featuresms_mel_featms_mel_feat_lenseq_lenattacheddetached
detach_idsr4   r4   r6   get_ms_mel_feat  s<   .


$",
z$EncDecDiarLabelModel.get_ms_mel_featc                 C   s   | j ||||d\}}||fS )z
        Wrapper function for inference case. This `forward_infer` is only used during inference, where `forward`
        is used for training and validation.
        r   lengthr  r   )ra   )rk   input_signalinput_signal_lengthemb_vectorsr   predsr   r4   r4   r6   forward_infer  s   
z"EncDecDiarLabelModel.forward_inferc                 C   sR  | j jj||d\}}	| ||	||\}
}}}t = | j j  | j jj|
|d  ||d  d\}}t|
j	d |j	d 
|j}| ||d ddf< W d   n1 s[w   Y  | j j  t|d dkr| j jj|
|d  ||d  d\}}|||d ddf< | |||}| ||||}| j ||||d\}}||fS )z9Function to compute forward pass for training/validation.)r1  r0  rG   )audio_signalr0  r   Nr/  )ra   re   r]   r.  r   no_gradevalforward_for_exportr   r   r   r   r   trainrX   r   r  )rk   r   r   r   r   r   r   r   r  r  r6  audio_signal_lensequence_lengthsr-  logitsembs_dr   embs_ar   r  r4  r   r4   r4   r6   forward)  s4   


 	

zEncDecDiarLabelModel.forwardbatchr   c              	   C   s   |\}}}}}}}	t dd | D }
| j|||||||	d\}}| j||	|
d}| ||	|
 t j  | j \}}}| j	d|dd | j	d| j
jd	 d
 dd | j	d|dd | j  d|iS )z"Function to compute training step.c                 S      g | ]}|d  qS r   r4   r   xr4   r4   r6   r   P      z6EncDecDiarLabelModel.training_step.<locals>.<listcomp>r   r   labelstarget_lensrg   T	sync_distlearning_rater   lrtrain_f1_acc)r   r  r   r@  rg   ri   r   empty_cachecomputelog
_optimizerparam_groupsreset)rk   rA  r   r   r   r   r   r   r   r   r<  r4  _rg   f1_accr4   r4   r6   training_stepM  s(   



z"EncDecDiarLabelModel.training_stepr   dataloader_idxc              	   C   s   |\}}}}}}	}
t dd |D }| j||||||	|
d\}}| j||
|d}| ||
| | j \}}}| jd|dd | jd|dd ||d	S )
z$Function to compute validation step.c                 S   rB  rC  r4   rD  r4   r4   r6   r   h  rF  z8EncDecDiarLabelModel.validation_step.<locals>.<listcomp>r   rG  val_lossTrJ  
val_f1_accrY  rZ  )r   r  r@  rg   rj   rP  rQ  )rk   rA  r   rX  r   r   r   r   r   r   r   r<  r4  rU  rg   rV  r4   r4   r6   validation_stepe  s&   

z$EncDecDiarLabelModel.validation_stepoutputsc                 C   s\   t dd |D  }| j \}}}| j  | jd|dd | jd|dd ||dS )Nc                 S   rB  )rY  r4   rD  r4   r4   r6   r   ~  rF  zCEncDecDiarLabelModel.multi_validation_epoch_end.<locals>.<listcomp>rY  TrJ  rZ  r[  )r   r   r   rj   rP  rT  rQ  )rk   r]  rX  val_loss_meanrV  rU  r4   r4   r6   multi_validation_epoch_end}  s   
z/EncDecDiarLabelModel.multi_validation_epoch_endc                 C   sL   t dd |D  }| j \}}}| j  | jd|dd ||dS )Nc                 S   rB  )	test_lossr4   rD  r4   r4   r6   r     rF  z=EncDecDiarLabelModel.multi_test_epoch_end.<locals>.<listcomp>test_f1_accTrJ  )r`  ra  )r   r   r   rh   rP  rT  rQ  )rk   r]  rX  test_loss_meanrV  rU  r4   r4   r6   multi_test_epoch_end  s   
z)EncDecDiarLabelModel.multi_test_epoch_endc                 C   sH   | j  \}}}t| j j }tt| j jj	}|| }||fS )aL  
        Calculate F1 score and accuracy of the predicted sigmoid values.

        Returns:
            f1_score (float): F1 score of the estimated diarized speaker label sequences.
            simple_acc (float): Accuracy of predicted speaker labels:
                (total # of correct labels)/(total # of sigmoid values)
        )
rh   rP  r   r   trueboolprodr  r   r   )rk   f1_scorerU  num_correcttotal_count
simple_accr4   r4   r6   compute_accuracies  s
   	z'EncDecDiarLabelModel.compute_accuraciesr   )F)r   )0__name__
__module____qualname____doc__classmethodr   r+   rD   r   r   rQ   rc   rJ   rb   r   dictr   r   r
   r   r   r   r   r   r   propertystrr/   r   r   r   Tensorr   r7  r  r	   r.  r5  r,   r@  listrW   rW  r\  r_  rc  rk  __classcell__r4   r4   rl   r6   r7   P   s    
,	"#
"

9G N

#"
r7   c                       s   e Zd ZdZdededee f fddZdd Zd	e	d
e	fddZ
dee de	eef fddZde	dede	d
e	fddZdedefddZdd Zdd Zdd Zdd Z  ZS ) r8   a  
    This class is built for calculating cluster-average embeddings, segmentation and load/save of
    the estimated cluster labels.

    The methods in this class is used for the inference of MSDD models.

    Args:
        cfg_diar_infer (DictConfig):
            Config dictionary from diarization inference YAML file
        cfg_msdd_model (DictConfig):
            Config dictionary from MSDD model checkpoint file

    Class Variables:
        self.cfg_diar_infer (DictConfig):
            Config dictionary from diarization inference YAML file
        cfg_msdd_model (DictConfig):
            Config dictionary from MSDD model checkpoint file
        self._speaker_model (class `EncDecSpeakerLabelModel`):
            This is a placeholder for class instance of `EncDecSpeakerLabelModel`
        self.scale_window_length_list (list):
            List containing the window lengths (i.e., scale length) of each scale.
        self.scale_n (int):
            Number of scales for multi-scale clustering diarizer
        self.base_scale_index (int):
            The index of the base-scale which is the shortest scale among the given multiple scales
    cfg_diar_inferrI   speaker_modelc                    sb   t    || _|| _|| _t| jjjjj	| _
t| j
| _t| j
d | _t| j| jd| _d S )NrG   )rE   rx  )rP   rQ   rw  	_cfg_msddre   ru  rR   rS   rT   rU   scale_window_length_listrX   rZ   base_scale_indexr   clus_diar_model)rk   rw  rI   rx  rl   r4   r6   rQ     s   
zClusterEmbedding.__init__c                 C   s:   | j jjjj| _| | jjj| jjj	\| _
| _| _}dS )za
        Launch clustering diarizer to prepare embedding vectors and clustering results.
        N)rw  rR   
clusteringrT   max_num_speakersrun_clustering_diarizerry  test_dsr   r   r   r   r   )rk   rU  r4   r4   r6   prepare_cluster_embs_infer  s   z+ClusterEmbedding.prepare_cluster_embs_inferbase_clus_label_dictsession_scale_mapping_dictc                 C   s   dd t | jD }| D ]j\}}tdd || D }||| j |< t | jd D ]K}g }|| jd |jd ks@J dt|| }	t |	d D ]$}
|
|| v r_t||| |
k }nt	|dkrgdn|d }|
| qL||| |< q,q|S )	a  
        In multi-scale speaker diarization system, clustering result is solely based on the base-scale
        (the shortest scale). To calculate cluster-average speaker embeddings for each scale that are longer
        than the base-scale, this function assigns clustering results for the base-scale to the longer scales
        by measuring the distance between subsegment timestamps in the base-scale and non-base-scales.

        Args:
            base_clus_label_dict (dict):
                Dictionary containing clustering results for base-scale segments. Indexed by `uniq_id` string.
            session_scale_mapping_dict (dict):
                Dictionary containing multiscale mapping information for each session. Indexed by `uniq_id` string.

        Returns:
            all_scale_clus_label_dict (dict):
                Dictionary containing clustering labels of all scales. Indexed by scale_index in integer format.

        c                 S   s   i | ]}|i qS r4   r4   r   r   r4   r4   r6   
<dictcomp>      zAClusterEmbedding.assign_labels_to_longer_segs.<locals>.<dictcomp>c                 S   rB  rC  r4   rD  r4   r4   r6   r     rF  zAClusterEmbedding.assign_labels_to_longer_segs.<locals>.<listcomp>rG   r   z]The number of base scale labels does not match the segment numbers in uniq_scale_mapping_dictr   )r   rZ   itemsnparrayr{  r   maxr   rX   r@   )rk   r  r  all_scale_clus_label_dictuniq_iduniq_scale_mapping_dictbase_scale_clus_labelr   new_clus_label	max_indexseg_idxseg_clus_labelr4   r4   r6   assign_labels_to_longer_segs  s$   z-ClusterEmbedding.assign_labels_to_longer_segsclus_labelsemb_scale_seq_dictc           
      C   s   dd || j   D }|D ].}| d }t| d dd }dd | dd	 D \}}|| |||g q|d | d jd }	||	fS )
aS  
        Retrieve base scale clustering labels from `emb_scale_seq_dict`.

        Args:
            clus_labels (list):
                List containing cluster results generated by clustering diarizer.
            emb_scale_seq_dict (dict):
                Dictionary containing multiscale embedding input sequences.
        Returns:
            base_clus_label_dict (dict):
                Dictionary containing start and end of base scale segments and its cluster label.
                Indexed by `uniq_id`.
            emb_dim (int):
                Embedding dimension in integer.
        c                 S      i | ]}|g qS r4   r4   r   keyr4   r4   r6   r    r  z=ClusterEmbedding.get_base_clus_label_dict.<locals>.<dictcomp>r   r   rU  c                 S   s   g | ]	}t t|d qS )r   )roundr  rD  r4   r4   r6   r         z=ClusterEmbedding.get_base_clus_label_dict.<locals>.<listcomp>rG   r   )r{  keysr   rW   r@   r   )
rk   r  r  r  liner  labelr%  r&  emb_dimr4   r4   r6   get_base_clus_label_dict  s   z)ClusterEmbedding.get_base_clus_label_dictspeaker_mapping_dictc                    s"  t   _ fdd  D }| \}}||}  D ]g}	 |	  D ]^\}
}t|tkr@t	t
|}n|}||	 |
 }t|}t|}t|j}|D ]}|||k }tj|dd|dd|f< qZ|durdd ||
  D }nd}||d||	 |
< q-q%||fS )a  
        MSDD requires cluster-average speaker embedding vectors for each scale. This function calculates
        an average embedding vector for each cluster (speaker) and each scale.

        Args:
            emb_scale_seq_dict (dict):
                Dictionary containing embedding sequence for each scale. Keys are scale index in integer.
            clus_labels (list):
                Clustering results from clustering diarizer including all the sessions provided
                in input manifest files.
            speaker_mapping_dict (dict):
                Speaker mapping dictionary in case RTTM files are provided. This is mapping between
                integer based speaker index and speaker ID tokens in RTTM files.
                Example:
                    {'en_0638': {'speaker_0': 'en_0638_A', 'speaker_1': 'en_0638_B'},
                     'en_4065': {'speaker_0': 'en_4065_B', 'speaker_1': 'en_4065_A'}, ...,}
            session_scale_mapping_dict (dict):
                Dictionary containing multiscale mapping information for each session. Indexed by `uniq_id` string.

        Returns:
            emb_sess_avg_dict (dict):
                Dictionary containing speaker mapping information and cluster-average speaker embedding vector.
                Each session-level dictionary is indexed by scale index in integer.
            output_clus_label_dict (dict):
                Subegmentation timestamps in float type and Clustering result in integer type.
                Indexed by `uniq_id` keys.
        c                    s*   i | ]}|d d  j d   D qS )c                 S   r  r4   r4   r  r4   r4   r6   r  4  r  zDClusterEmbedding.get_cluster_avg_embs.<locals>.<dictcomp>.<dictcomp>rG   )rZ   r  r  r  rk   r4   r6   r  3  s    z9ClusterEmbedding.get_cluster_avg_embs.<locals>.<dictcomp>r   r   Nc                 S   s   i | ]\}}||qS r4   r4   )r   rttm_keyclus_keyr4   r4   r6   r  L  s    )mappingavg_embs)rX   r  rZ   r  r  r  typeru  r   r  r  r  setrt  r   r~  r   )rk   r  r  r  r  emb_sess_avg_dictoutput_clus_label_dictr  r  r   r  _emb_tensor
emb_tensorclus_label_listspk_setlabel_arrayr  spk_idxselected_embsinv_mapr4   r  r6   get_cluster_avg_embs  s4   
z%ClusterEmbedding.get_cluster_avg_embsr   r   c                 C   s^  || j j_|| j j_| jjj| _tj	| jd| _
tj| j
dd | j jjj| j_| j jjjj| jjd< | j jjj| jjj_| jj}t|trKt|n| }tj|dd}td| jjd   td|  | jj| j jd	}|d
ur}|\}}}nd\}}t| jj| jj| _| | j}	| |}
|  |}| !|
|||	\}}|	|
d< ||
||fS )a  
        If no pre-existing data is provided, run clustering diarizer from scratch. This will create
        scale-wise speaker embedding sequence, cluster-average embeddings, scale mapping and base scale
        clustering labels. Note that speaker embedding `state_dict` is loaded from the `state_dict`
        in the provided MSDD checkpoint.

        Args:
            manifest_filepath (str):
                Input manifest file for creating audio-to-RTTM mapping.
            emb_dir (str):
                Output directory where embedding files and timestamp files are saved.

        Returns:
            emb_sess_avg_dict (dict):
                Dictionary containing cluster-average embeddings for each session.
            emb_scale_seq_dict (dict):
                Dictionary containing embedding tensors which are indexed by scale numbers.
            base_clus_label_dict (dict):
                Dictionary containing clustering results. Clustering results are cluster labels
                for the base scale segments.
        
pred_rttmsTexist_okrt      )indentzMultiscale Weights: zClustering Parameters: )r   N)NNsession_scale_mapping)"rw  rR   r   out_dirr|  rr   _out_dirospathjoinout_rttm_dirmakedirsr}  rT   _cluster_paramsrS   rt   ru   rV   r   rq  jsondumpsr1   r   diarizer   r"   $multiscale_embeddings_and_timestamps_embs_and_timestampsget_scale_mapload_emb_scale_seq_dictload_clustering_labelsr  )rk   r   r   cluster_paramsclustering_params_strscoresmetricr  rU  r  r  r  r  r  r4   r4   r6   r  S  s<   






z(ClusterEmbedding.run_clustering_diarizerc                 C   s*   i }|  D ]\}}t|}|||< q|S )a  
        Save multiscale mapping data into dictionary format.

        Args:
            embs_and_timestamps (dict):
                Dictionary containing embedding tensors and timestamp tensors. Indexed by `uniq_id` string.
        Returns:
            session_scale_mapping_dict (dict):
                Dictionary containing multiscale mapping information for each session. Indexed by `uniq_id` string.
        )r  r$   )rk   embs_and_timestampsr  r  uniq_embs_and_timestampsscale_mapping_dictr4   r4   r6   r    s
   
zClusterEmbedding.get_scale_mapc                 C   sD   t j|dd| j d}t j|}|std| d ||fS )a  
        Check whether the laoded clustering label file is including clustering results for all sessions.
        This function is used for inference mode of MSDD.

        Args:
            out_dir (str):
                Path to the directory where clustering result files are saved.
        Returns:
            file_exists (bool):
                Boolean that indicates whether clustering result file exists.
            clus_label_path (str):
                Path to the clustering label output file.
        speaker_outputssubsegments_scalez_cluster.labelzClustering label file z does not exist.)r  r  r  r{  existsr1   r   )rk   r  clus_label_pathfile_existsr4   r4   r6   check_clustering_labels  s   z(ClusterEmbedding.check_clustering_labelsc                 C   sT   |  |\}}td|  t|}| }W d   |S 1 s#w   Y  |S )ar  
        Load clustering labels generated by clustering diarizer. This function is used for inference mode of MSDD.

        Args:
            out_dir (str):
                Path to the directory where clustering result files are saved.
        Returns:
            emb_scale_seq_dict (dict):
                List containing clustering results in string format.
        z Loading cluster label file from N)r  r1   r   open	readlines)rk   r  r  r  fr  r4   r4   r6   r    s   


z'ClusterEmbedding.load_clustering_labelsc           
   	   C   s   t | jjjjj}dd tt|D }tt|D ]E}tj	
|ddd| d}td| d|  t|d	}t|}W d
   n1 sIw   Y  | D ]\}}	|	||< qR|||< q|S )a  
        Load saved embeddings generated by clustering diarizer. This function is used for inference mode of MSDD.

        Args:
            out_dir (str):
                Path to the directory where embedding pickle files are saved.
        Returns:
            emb_scale_seq_dict (dict):
                Dictionary containing embedding tensors which are indexed by scale numbers.
        c                 S   s   i | ]}|d qS r   r4   r  r4   r4   r6   r    r  z<ClusterEmbedding.load_emb_scale_seq_dict.<locals>.<dictcomp>r  
embeddingsr  z_embeddings.pklz'Loading embedding pickle file of scale:z at rbN)ru  rw  rR   rS   rT   rU   r   rX   r  r  r  r1   r   r  pklloadr  )
rk   r  window_len_listr  r   pickle_path
input_filer   r  valr4   r4   r6   r    s   

z(ClusterEmbedding.load_emb_scale_seq_dict)rl  rm  rn  ro  r   r   r   rQ   r  r   r  r   rs  rW   rq  r  r  r  r  r  r  r  rv  r4   r4   rl   r6   r8     s4    	%
?@r8   c                       s  e Zd ZdZdeeef f fddZdd Ze	de
fdd	ZdHde
defddZdeeef fddZdeeee eej f  dejfddZdee
 dee deej deej fddZdd Ze deeeeeeef     fddZdejdejdedee deejejef f
d d!Zd"eej d#ee d$ed%deejejejf fd&d'Zd"eej dee deejejejf fd(d)Z e deeej eej eej f fd*d+Z!deej d,e"deeeeef   fd-d.Z#e$	/	0	1dId2e
d3e
d4ee
 d5e%fd6d7Z&	8	9	0	0	0	1dJd:e
d;ed<ed=ee d>ee d?ee
 d5e%dee'ee' f fd@dAZ(dBe
d=ee d>ee dCe)j*d;ed<ed5e%dd0fdDdEZ+e$dee, fdFdGZ-  Z.S )Kr9   a.  
    Class for inference based on multiscale diarization decoder (MSDD). MSDD requires initializing
    clustering results from clustering diarizer. Overlap-aware diarizer requires separate RTTM
    generation and evaluation modules to check the effect of overlap detection in speaker diarization.
    rE   c                    s  t    || _|jjjdd| _|jjjdd| _|jjjdd| _	|jjjdd| _
|jjjdd| _|jjjd	d
| _|jjjdg d| _| | |jjjj| _| | j|| j_t|| jj| jd| _| jjj| _|jjjd| j| _d S )Nuse_speaker_model_from_ckptTuse_clus_as_mainFmax_overlap_spksr   num_spks_per_modeluse_adaptive_thresmax_pred_lengthr   diar_eval_settings))      ?T)r  F)g        F)rw  rI   rx  overlap_infer_spk_limit)rP   rQ   r^   rR   
msdd_modelrT   r   r  r  r  r  r  r  r  _init_msdd_modeldiar_window_length$transfer_diar_params_to_model_paramsrE   r8   re   clustering_embeddingr   clustering_max_spksr  )rk   rE   rl   r4   r6   rQ     s*   




zNeuralDiarizer.__init__c                 C   sd   |j j|jj _|j j|jj_|j j|jj_|j jjj|jj_	|j jjj
|jj_
|j jjj|j_|jS )z
        Transfer the parameters that are needed for MSDD inference from the diarization inference config files
        to MSDD model config `msdd_model.cfg`.
        )rR   r  rE   r   r  r   r  rT   infer_batch_sizer   r   r}  r~  r^   r   )rk   r  rE   r4   r4   r6   r  
  s   z3NeuralDiarizer.transfer_diar_params_to_model_params	save_pathc                 C   s   | j j| _d}t L}tj|t}tj|t	}tj||}| jj
|d | jjr:tj|t}| jj| | jj| | j| | jj||d W d   dS 1 sZw   Y  dS )a   
        Saves model instances (weights and configuration) into EFF archive.
        You can use "restore_from" method to fully restore instance from .nemo file.

        .nemo file is an archive (tar.gz) with the following:
            model_config.yaml - model configuration in .yaml format.
                                You can deserialize this into cfg argument for model's constructor
            model_wights.chpt - model checkpoint

        Args:
            save_path: Path to .nemo file where model instance should be saved
        zmsdd_model.nemo)path2yaml_file)filename
source_dirN)r  r|  	clus_diartempfileTemporaryDirectoryr  r  r  r   r   to_config_filehas_vad_modelr   
_vad_modelsave_tore   r  +_NeuralDiarizer__make_nemo_file_from_folder)rk   r  _NEURAL_DIAR_MODELtmpdirconfig_yaml
spkr_modelneural_diar_model	vad_modelr4   r4   r6   r    s   

"zNeuralDiarizer.save_tomsdd._speaker_model.prefixr:   c                 C   sr   | j  }g }| D ]}||v r|| qi }|D ]}||d}|| ||< qt| j jj}|	| |S )a  
        MSDD model file contains speaker embedding model and MSDD model. This function extracts standalone
        speaker model and save it to `self.spk_emb_state_dict` to be loaded separately for clustering diarizer.

        Args:
            ext (str):
                File-name extension of the provided model path.
        Returns:
            standalone_model_path (str):
                Path to the extracted standalone model without speaker embedding extractor model.
         )
r  
state_dictr  r@   replacer   r\   rE   rd   load_state_dict)rk   r  model_state_dictspk_emb_module_namesnamespk_emb_state_dictorg_namere   r4   r4   r6    extract_standalone_speaker_model5  s   


z/NeuralDiarizer.extract_standalone_speaker_modelc                 C   s   |j jj}|drtd|  tj||jd| _n7|dr3td|  tj	||jd| _n |t
tvrBtd| d td	| tj||jd
| _| jr]|  | _dS d| _dS )z}
        Initialized MSDD model with the provided config. Load either from `.nemo` file or `.ckpt` checkpoint files.
        rx   zUsing local nemo file from )restore_pathry   rz   zUsing local checkpoint from )checkpoint_pathry   z
requested z7 model name not available in pretrained models, insteadr|   r}   N)rR   r  r   r   r1   r   r7   r   r   r   r   r   r   r   r  r  re   )rk   rE   r   r4   r4   r6   r  P  s    




zNeuralDiarizer._init_msdd_model	data_listc                    s   t  }|D ]}||d 7 }qtt|}tttt|t| tdd |D }t	||}|D ]6\}} fdd|D }	t|j
dkrK|d}|| jkrS|}q2|j
d }
|d|
|	f  |  7  < q2||d  }|S )a  
        This module puts together the pairwise, two-speaker, predicted results to form a finalized matrix
        that has dimension of `(total_len, n_est_spks)`. The pairwise results are evenutally averaged.
        For example, in 4 speaker case (speaker 1, 2, 3, 4), the sum of the pairwise results
        (1, 2), (1, 3), (1, 4) are then divided by 3 to take average of the sigmoid values.

        Args:
            data_list (list):
                List containing data points from `test_data_collection` variable. `data_list`
                has sublists `data` as follows:
                    data[0]: `target_spks` tuple
                        Examples: (0, 1, 2)
                    data[1]: Tensor containing estimaged sigmoid values.
                    [[0.0264, 0.9995],
                        [0.0112, 1.0000],
                        ...,
                        [1.0000, 0.0512]]

        Returns:
            sum_pred (Tensor):
                Tensor containing the averaged sigmoid values for each speaker.
        r   c                 S   s   g | ]	}|d  j d  qS )rG   )r   )r   sessr4   r4   r6   r     r  z/NeuralDiarizer.get_pred_mat.<locals>.<listcomp>c                       g | ]} | qS r4   r4   rD  	digit_mapr4   r6   r     rF  r   NrG   )tuplerX   r  rq  zipsortedr   r  r   r   r   squeezer  rw   r  )rk   r  all_tupsr   
n_est_spks	total_lensum_pred_dim_tuppred_matdim_tup_endr4   r  r6   get_pred_math  s"   


"zNeuralDiarizer.get_pred_matuniq_id_listtest_data_collection
preds_listc           	         sZ   t |||}dd |D  | D ]\}}| |}|d |< q fdd|D }|S )a}  
        Merge multiple sequence inference outputs into a session level result.

        Args:
            uniq_id_list (list):
                List containing `uniq_id` values.
            test_data_collection (collections.DiarizationLabelEntity):
                Class instance that is containing session information such as targeted speaker indices,
                audio filepaths and RTTM filepaths.
            preds_list (list):
                List containing tensors filled with sigmoid values.

        Returns:
            output_list (list):
                List containing session-level estimated prediction matrix.
        c                 S   r  r4   r4   r   r  r4   r4   r6   r    r  z<NeuralDiarizer.get_integrated_preds_list.<locals>.<dictcomp>r   c                    r  r4   r4   r&  output_dictr4   r6   r     rF  z<NeuralDiarizer.get_integrated_preds_list.<locals>.<listcomp>)r#   r  r"  	unsqueeze)	rk   r#  r$  r%  session_dictr  r  r  output_listr4   r'  r6   get_integrated_preds_list  s   
z(NeuralDiarizer.get_integrated_preds_listc                 C   s"   |j | j_ |j| j_|j| j_dS )zcAssign dictionaries containing the clustering results from the class instance `cluster_embeddings`.N)r   r  r   r   )rk   cluster_embeddingsr4   r4   r6   get_emb_clus_infer  s   

z!NeuralDiarizer.get_emb_clus_inferc                    sR   j   dj_j   \ }}tjjjj	j
} fdd|D S )a  
        Launch diarization pipeline which starts from VAD (or a oracle VAD stamp generation),
        initialization clustering and multiscale diarization decoder (MSDD). Note that the result of MSDD
        can include multiple speakers at the same time. Therefore, RTTM output of MSDD needs to be based on
        `make_rttm_with_overlap()` function that can generate overlapping timestamps.
        `self.run_overlap_aware_eval()` function performs DER evaluation.
        Tc                    s   g | ]}  |qS r4   )run_overlap_aware_eval)r   	thresholdr%  rk   r4   r6   r     s    z*NeuralDiarizer.diarize.<locals>.<listcomp>)r  r  r  rO   r.  run_pairwise_diarizationru  r^   rR   rT   sigmoid_threshold)rk   targets_listsignal_lengths_list
thresholdsr4   r1  r6   r    s   
	zNeuralDiarizer.diarizesignalsr3  diar_window_indexc              	   C   s  t |}tjtj|jd }t dd | jj	| D }t
t|jD ]}|| j t|d | j |jd }	}
|
|	 }|	|jd k r||	|
 }||	|
ddddf t| j|jd || j  }}||j| k}t|rt j|| dd|dddd|f< || jk rt j|t | j| |jd |jd |jgdd}q't | j|jd |jd |j}d}q'|||fS )a$  
        This function is only used when `split_infer=True`. This module calculates cluster-average embeddings
        for the given short range. The range length is set by `self.diar_window_length`, and each cluster-average
        is only calculated for the specified range. Note that if the specified range does not contain some speakers
        (e.g. the range contains speaker 1, 3) compared to the global speaker sets (e.g. speaker 1, 2, 3, 4) then
        the missing speakers (e.g. speakers 2, 4) are assigned with zero-filled cluster-average speaker embedding.

        Args:
            signals (Tensor):
                Zero-padded Input multi-scale embedding sequences.
                Shape: (length, scale_n, emb_vectors, emb_dim)
            emb_vectors (Tensor):
                Cluster-average multi-scale embedding vectors.
                Shape: (length, scale_n, emb_vectors, emb_dim)
            diar_window_index (int):
                Index of split diarization wondows.
            test_data_collection (collections.DiarizationLabelEntity)
                Class instance that is containing session information such as targeted speaker indices,
                audio filepath and RTTM filepath.

        Returns:
            return emb_vectors_split (Tensor):
                Cluster-average speaker embedding vectors for each scale.
            emb_seq (Tensor):
                Zero-padded multi-scale embedding sequences.
            seq_len (int):
                Length of the sequence determined by `self.diar_window_length` variable.
        r   c                 S   rB  rC  r4   rD  r4   r4   r6   r     rF  z4NeuralDiarizer.get_range_average.<locals>.<listcomp>rG   Nr   r   )r   
zeros_liker  r  splitextbasename
audio_filer  r  r   r   rX   target_spksr  r  r   r   r   catr   r   r   )rk   r7  r3  r8  r$  emb_vectors_splitr  clus_label_tensorr  r%  r&  r*  target_clus_label_tensorr   
seg_lengthtarget_clus_label_boolr4   r4   r6   get_range_average  sD   
$
 

z NeuralDiarizer.get_range_average
test_batch_test_data_collectionr   rw   c                 C   s   |\}}}}g g g }}	}
t t |jd | j  }t| j| j| | _t|jd D ]1}|| || || }}}t|D ]}| 	||||\}}}|
| |	
| |

| qCq/t ||}t |	|}	t |
|}
||	|
fS )a  
        This function is only used when `get_range_average` function is called. This module calculates
        cluster-average embeddings for the given short range. The range length is set by `self.diar_window_length`,
        and each cluster-average is only calculated for the specified range.

        Args:
            test_batch: (list)
                List containing embedding sequences, length of embedding sequences, ground truth labels
                (if exists) and initializing embedding vectors.
            test_data_collection: (list)
                List containing test-set dataloader contents. test_data_collection includes wav file path,
                RTTM file path, clustered speaker indices.

        Returns:
            sess_emb_vectors (Tensor):
                Tensor of cluster-average speaker embedding vectors.
                Shape: (batch_size, scale_n, emb_dim, 2*num_of_spks)
            sess_emb_seq (Tensor):
                Tensor of input multi-scale embedding sequences.
                Shape: (batch_size, length, scale_n, emb_dim)
            sess_sig_lengths (Tensor):
                Tensor of the actucal sequence length without zero-padding.
                Shape: (batch_size)
        rG   r   )r   ceilr  r   r  rW   r  r  r   rD  r@   r   r   )rk   rE  rF  r   _signalssignal_lengths_targets_emb_vectorssess_emb_vectorssess_emb_seqsess_sig_lengthssplit_countr$  r7  r3  r$  r8  r?  r   r*  r4   r4   r6   get_range_clus_avg_emb  s$    



z%NeuralDiarizer.get_range_clus_avg_embc                 C   s  |\}}}}| j jjjjrftt|jd | j	 
 }| j||| jjd\}}	}
t  | jj|	|
|dd\}}W d   n1 sDw   Y  |t||| j	 d}|ddd|jd ddf }nt  | jj|||dd\}}W d   n1 sw   Y  t|jd | j| _t|jd | j|jd }t|jd | j|jd }||ddd|jd ddf< |||fS )a  
        Launch forward_infer() function by feeding the session-wise embedding sequences to get pairwise
        speaker prediction values. If split_infer is True, the input audio clips are broken into short
        sequences then cluster average embeddings are calculated for inference. Split-infer might result in
        an improved results if calculating clustering average on the shorter tim-espan can help speaker assignment.

        Args:
            test_batch: (list)
                List containing embedding sequences, length of embedding sequences, ground truth labels (if exists)
                and initializing embedding vectors.
            test_data_collection: (list)
                List containing test-set dataloader contents. test_data_collection includes wav file path,
                RTTM file path, clustered speaker indices.

        Returns:
            preds (Tensor):
                Tensor containing predicted values which are generated from MSDD model.
            targets (Tensor):
                Tensor containing binary ground-truth values.
            signal_lengths (Tensor):
                The actual Session length (number of steps = number of base-scale segments) without zero padding.
        rG   )r   N)r1  r2  r3  r   r   r   r   )r^   rR   r  rT   split_inferr   rG  r  r   r  rW   rP  r   r2   r5  reshaperX   r  r  r   )rk   rE  r$  r7  rI  rJ  r3  rO  rL  rM  rN  _predsr   r4  r   r4   r4   r6   
diar_infer:  s4    
" 
zNeuralDiarizer.diar_inferc                 C   sn  | j j| _| j| jjj | j  dg}g g g }}}t| jjjj}dd | jj	D }t
t| j D ]W\}}|\}	}
}}||d |
jd   | |||d |d  \}}}
| jjjjjrn| j|||
 |tt|d |tt|d |tt|
d q8| jjjjjr| j \}}td|dd	|d | |||}|||fS )
a  
        Setup the parameters needed for batch inference and run batch inference. Note that each sample is
        pairwise speaker input. The pairwise inference results are reconstructed to make session-wise
        prediction results.

        Returns:
            integrated_preds_list: (list)
                List containing the session-wise speaker predictions in torch.tensor format.
            targets_list: (list)
                List containing the ground-truth labels in matrix format filled with  0 or 1.
            signal_lengths_list: (list)
                List containing the actual length of each sequence in session.
        r   c                 S   s   g | ]}|qS r4   r4   )r   dr4   r4   r6   r     s    z;NeuralDiarizer.run_pairwise_diarization.<locals>.<listcomp>r   rG   zTest Inference F1 score. .4fz, simple Acc. )r  r  r  r   rE   r  r8  r%   r   r   r  r   r   r@   r   rT  r^   rR   rT   r   rh   extendru  r   r   rk  r1   r   r,  )rk   cumul_sample_countr%  r4  r5  r#  r$  sidxrE  r7  rI  rJ  r3  r4  r   rg  rj  integrated_preds_listr4   r4   r6   r2  m  s.   



z'NeuralDiarizer.run_pairwise_diarizationr0  c                 C   s   t d|dd| j d| j d g }| jjjj}t|}t	| j
D ].\}\}}t|| jj||d| j| j| j| j| jd
\}	}
t||	|
||| jjd}|| q#t d	 |S )
a5  
        Based on the predicted sigmoid values, render RTTM files then evaluate the overlap-aware diarization results.

        Args:
            preds_list: (list)
                List containing predicted pairwise speaker labels.
            threshold: (float)
                A floating-point threshold value that determines overlapped speech detection.
                    - If threshold is 1.0, no overlap speech is detected and only detect major speaker.
                    - If threshold is 0.0, all speakers are considered active at any time step.
        z     [Threshold: rW  z] [use_clus_as_main=z] [diar_window=]T)r0  infer_overlapr  r  r  r  r  )collarignore_overlapverbosez  
)r1   r   r  r  r  rE   r  r   r!   r  r  r'   r   r  r  r  r  r   r^   r`  r@   )rk   r%  r0  r]  r   rttm_mapr$  r^  r_  all_referenceall_hypothesisoutputr4   r4   r6   r/    s@   

z%NeuralDiarizer.run_overlap_aware_evalvad_multilingual_marblenetNFr~   vad_model_namery   r`  c                 C   s0   t |rt jnt j tj||||d}| |S )aa  
        Instantiate a `NeuralDiarizer` to run Speaker Diarization.

        Args:
            model_name (str): Path/Name of the neural diarization model to load.
            vad_model_name (str): Path/Name of the voice activity detection (VAD) model to load.
            map_location (str): Optional str to map the instantiated model to a device (cpu, cuda).
                By default, (None), it will select a GPU if available, falling back to CPU otherwise.
            verbose (bool): Enable verbose logging when loading models/running diarization.
        Returns:
            `NeuralDiarizer`
        )diar_model_pathvad_model_pathry   r`  )r1   setLevelINFOWARNINGr   init_config)rA   r~   rf  ry   r`  rE   r4   r4   r6   r     s   zNeuralDiarizer.from_pretrained@   rG   audio_filepathr   r   max_speakersnum_speakersr  c              
   C   s   |r	t j|dd tj|dc}t j|d}	|dddd|ddd	g}
t|	d
}|ddd |
D  W d   n1 s@w   Y  | j|	||||||d |	| j	j
j_|   t| dt|j d}W d   t|S 1 suw   Y  t|S )a  
        Run the `NeuralDiarizer` inference pipeline.

        Args:
            audio_filepath (str, list): Audio path to run speaker diarization on.
            max_speakers (int): If known, the max number of speakers in the file(s).
            num_speakers (int): If known, the exact number of speakers in the file(s).
            batch_size (int): Batch size when running inference.
            num_workers (int): Number of workers to use in data-loading.
            out_dir (str): Path to store intermediate files during inference (default temp directory).
        Returns:
            `pyannote.Annotation` for each audio path, containing speaker labels and segment timestamps.
        Tr  )dirzmanifest.jsonr   Ninfer-)rn  offsetdurationr  textrp  rttm_filepathuem_filepathw
c                 s   s    | ]}t |V  qd S r   )r  r  rD  r4   r4   r6   	<genexpr>
  s    z*NeuralDiarizer.__call__.<locals>.<genexpr>)manifest_pathro  rp  r  r   r   r`  z/pred_rttms/z.rttm)r  r  r  r  r  r  r  write_initialize_configsr  rE   r  r   r  r)   r   stemr&   )rk   rn  r   r   ro  rp  r  r`  r  r|  metar  pred_labels_clusr4   r4   r6   __call__  sB   

  zNeuralDiarizer.__call__r|  r  c                 C   sd   || j _|| j _|| j j_|| j j_|| j _|d u| j jjj_	|r(|| j jjj_
| | j| j  d S r   )r^   r   r   rR   r   r  r`  r}  rT   oracle_num_speakersr~  r  r  )rk   r|  ro  rp  r  r   r   r`  r4   r4   r6   r~    s   


z"NeuralDiarizer._initialize_configsc                 C   s   t  S )r;   )r7   rD   )rA   r4   r4   r6   rD   0  s   z$NeuralDiarizer.list_available_models)r  )re  NF)rm  rG   NNNF)/rl  rm  rn  ro  r
   r   r   rQ   r  r   rs  r  r   r  r  r   r	   rW   r   rt  r"  r   r,  r.  r7  r   r   r   r  rD  r   rP  rT  r2  r  r/  rp  re  r   r   r  r  r  r~  r+   rD   rv  r4   r4   rl   r6   r9     s    **
(
J
-
3,(
.	
;	
r9   r   )]rn   r  r  pickler  r  collectionsr   pathlibr   
statisticsr   typingr   r   r   r   r	   r
   numpyr  r   hydra.utilsr   lightning.pytorchr   r   lightning.pytorch.utilitiesr   	omegaconfr   r   pyannote.corer   pyannote.metrics.diarizationr   r   -nemo.collections.asr.data.audio_to_diar_labelr   r    nemo.collections.asr.metrics.derr   -nemo.collections.asr.metrics.multi_binary_accr   nemo.collections.asr.modelsr   %nemo.collections.asr.models.asr_modelr   /nemo.collections.asr.models.clustering_diarizerr   r   r   r   3nemo.collections.asr.models.configs.diarizer_configr   (nemo.collections.asr.models.label_modelsr   1nemo.collections.asr.parts.preprocessing.featuresr    .nemo.collections.asr.parts.utils.speaker_utilsr!   r"   r#   r$   r%   r&   r'   r(   r)   nemo.core.classesr*   nemo.core.classes.commonr+   r,   nemo.core.neural_typesr-   r.   r/   nemo.core.neural_types.elementsr0   
nemo.utilsr1   torch.cuda.ampr2   ImportError
contextlibr3   __all__r7   nnModuler8   r9   r4   r4   r4   r6   <module>   sd    ,	    W  D