o
    }oiY                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZ d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dl m!Z!m"Z"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9T d dl:m;Z; dgZ<G dd de6e*e,Z=dS )    N)Counter)ceil)DictListOptionalUnion)instantiate)Trainer)
DictConfig	OmegaConf	open_dict)	roc_curve)Accuracy)tqdm)AudioPairToLabelDatasetAudioToSpeechLabelDatasetcache_datastore_manifests)&get_concat_tarred_speech_label_datasetget_tarred_speech_label_dataset)convert_to_config_list)ExportableEncDecModel)VerificationMixin)WaveformFeaturizer)process_augmentations)TopKClassificationAccuracy)ASRSpeechLabel)ModelPT)PretrainedModelInfo)*)loggingEncDecSpeakerLabelModelc                	       s  e Zd ZdZedee fddZdOdede	f fdd	Z
ed
d Zdee fddZdeeeef  fddZdeeeef  fddZdeeeef  fddZdd Zedeeeef  fddZedeeeef  fddZdd Zd d! Zd"d# ZdPd&ed'efd(d)ZdPd&ed'efd*d+ZdPd&ed'efd,d-Z dPd&ed'efd.d/Z!dQd&efd0d1Z"dQd&efd2d3Z#dQd&efd4d5Z$dQd&efd6d7Z%e&' d8d9 Z(e&' d:d; Z)e*j+d<dfd=ed>e,d?ed@efdAdBZ-dCdD Z.e&' dRdFdGZ/e&' dSdKdLZ0e&' dTdMdNZ1  Z2S )Ur    a  
    Encoder decoder class for speaker label models.
    Model class creates training, validation methods for setting up data
    performing model forward pass.
    Expects config dict for

        * preprocessor

        * Jasper/Quartznet Encoder

        * Speaker Decoder
    returnc                 C   s   g }t dddd}|| t dddd}|| t dd	d
d}|| t dddd}|| t dddd}|| |S )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        speakerverification_speakernetzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/speakerverification_speakernet/versions/1.16.0/files/speakerverification_speakernet.nemoz{For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:speakerverification_speakernet)pretrained_model_namelocationdescription
ecapa_tdnnzahttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/ecapa_tdnn/versions/1.16.0/files/ecapa_tdnn.nemozgFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:ecapa_tdnntitanet_largez_https://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_large/versions/v1/files/titanet-l.nemozuFor details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_largelangid_ambernetzdhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/langid_ambernet/versions/1.12.0/files/ambernet.nemozwFor details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/langid_ambernettitanet_smallzjFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:titanet_smallzchttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_small/versions/1.19.0/files/titanet-s.nemo)r#   r%   r$   )r   append)clsresultmodel r.   \/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/models/label_models.pylist_available_modelsG   s@   




z-EncDecSpeakerLabelModel.list_available_modelsNcfgtrainerc                    s  d _ d _d  _d  _|jj}d|v r/d|jv r-|jjdkr(|dg }d _n|jj}nd }|d ur:|j|j	  _ t
 j||d  jrO fdd	 jD }d|v rt|j}d
|jdi v rt|d t| d|j_W d    n1 s{w   Y  d|jv r||j_d |_t|j _t| _ntddi}t| _t| _tdgd _t|j _t|j _t|j _t|dddd _tddddd _t j dr j j!d urt j j! _"d S d  _"d S )N   FlossweightautoT)r1   r2   c                    s$   g | ]}t  jt j|  qS r.   )sumlabels_occurrencelen).0iselfr.   r/   
<listcomp>   s   $ z4EncDecSpeakerLabelModel.__init__.<locals>.<listcomp>angular_target_z=nemo.collections.common.losses.cross_entropy.CrossEntropyLoss)top_kmacro
multiclass)num_classesrA   averagetask   spec_augment)#
world_sizecal_labels_occurrence_trainr8   labelsdecoderrD   r4   r5   	num_nodesnum_devicessuper__init__copydeepcopygetr   
set_structr   r?   r   	eval_losscreater   	_accuracyr    from_config_dictpreprocessorencoderr   _macro_accuracy_pair_macro_accuracyhasattr_cfgrH   spec_augmentation)r=   r1   r2   rD   r5   cfg_eval_losstmp_loss_cfg	__class__r<   r/   rP   u   sV   








z EncDecSpeakerLabelModel.__init__c                 C   s   t  }| dd }|d u rtd d S t| d }tj|D ]}t|d t	|| dd | dd dd}|
|j q tt|}tdt| d	 |S )
Nmanifest_filepathz;No manifest_filepath was provided, no labels got extracted!)manifest_filepathsmin_durationmax_durationT)manifests_filesrf   rg   index_by_file_idzTotal number of z( labels found in all the manifest files.)setrS   r   warningr   	itertoolschainfrom_iterabler   r   updateuniq_labelslistsortedr9   )data_layer_configrK   rd   re   
collectionr.   r.   r/   extract_labels   s$   



z&EncDecSpeakerLabelModel.extract_labelsconfigc           
      C   s  d|v rt |d }nd }t|d |dd|d}|dd}|ddrrd|v r0|d d u s:d	|v rD|d	 d u rDtd
|  d S |rP|dd|d  nd}|ddrdt|||| j| jd}nt|||| j| jd}d}nTd	|v r|d	 d u rtd|  d S |ddrt	}td nt
}||d	 |d ||dd |dd |dd|dd |dd|ddd	}|jr|j| _t|dr|j}n|jd j}|d }	tjjj||	||dd||dd|ddd S )!N	augmentorsample_rate
int_valuesF)rx   ry   rw   shuffle	is_tarredtarred_audio_filepathsrd   znCould not load dataset as `manifest_filepath` was None or `tarred_audio_filepaths` is None. Provided config : 	shuffle_n   
batch_sizer   	is_concat)
featurizerrv   r}   global_rankrI   zJCould not load dataset as `manifest_filepath` was None. Provided config : is_audio_pairzGUsing AudioPairToLabelDataset, where Angular loss will not be computed.rK   rg   rf   trim_silencechannel_selectornormalize_audiocal_labels_occurrence)	rd   rK   r   rg   rf   trimr   r   r   fixed_seq_collate_fn	drop_lastnum_workers
pin_memory)datasetr   
collate_fnr   rz   r   r   )r   r   rS   r   rk   r   r   rI   r   r   r   r8   r]   r   datasetstorchutilsdata
DataLoader)
r=   rv   rw   r   rz   r}   r   data_clsr   r   r.   r.   r/   __setup_dataloader_from_config   s   









z6EncDecSpeakerLabelModel.__setup_dataloader_from_configtrain_data_layer_configc                 C   s  | j r!t|d t| d|d< W d    n1 sw   Y  | || _| j|d< d|vr4d|d< | j|d| _| jd urt| jdrt	| jj
tjjjr| jd urut	| jjtrut| jjtt| jj
| j |d   | j_d S | jd u rtd d S d S d S d S d S )	NTr   rK   rz   rv   r   r   zModel Trainer was not set before constructing the dataset, incorrect number of training batches will be used. Please set the trainer and rebuild the dataset.)rJ   r   rT   r   ru   rK   6_EncDecSpeakerLabelModel__setup_dataloader_from_config	_train_dlr]   
isinstancer   r   r   r   IterableDataset_trainerlimit_train_batchesfloatintr   r9   rI   r   rk   )r=   r   r.   r.   r/   setup_training_data  s6   





z+EncDecSpeakerLabelModel.setup_training_dataval_data_layer_configc                 C   s6   | ddrddg|d< n| j|d< | j|d| _d S )Nr   F01rK   r   )rS   rK   r   _validation_dl)r=   r   r.   r.   r/   setup_validation_data>  s   
z-EncDecSpeakerLabelModel.setup_validation_datatest_data_layer_paramsc                 C   s\   t | dr|ddrddg|d< n| j|d< |dd| _| j|d	| _|d
d | _d S )Nr   r   Fr   r   rK   embedding_dirz./r   rd   )r]   rS   rK   r   r   _test_dltest_manifest)r=   r   r.   r.   r/   setup_test_dataE  s   

z'EncDecSpeakerLabelModel.setup_test_datac                 C   s   | j d ur| j S d S N)r   r<   r.   r.   r/   test_dataloaderP  s   
z'EncDecSpeakerLabelModel.test_dataloaderc                 C   s>   t | jdrt| jjd}nt }td|ttdt dS )N_sample_rate)freq)BTr   input_signalinput_signal_length)r]   rY   AudioSignalr   
NeuralTypetupleLengthsType)r=   audio_eltyper.   r.   r/   input_typesT  s   z#EncDecSpeakerLabelModel.input_typesc                 C   s   t dt t dt dS )N)r   D)logitsembs)r   
LogitsTypeAcousticEncodedRepresentationr<   r.   r.   r/   output_types_  s   

z$EncDecSpeakerLabelModel.output_typesc                 C   s$   | j ||d\}}| j||d}|S )Naudio_signallengthencoder_outputr   )rZ   rL   )r=   r   r   encodedoutputr.   r.   r/   forward_for_exportf  s   z*EncDecSpeakerLabelModel.forward_for_exportc                 C   s   | j ||d\}}| jd ur| jr| j||d}| j||d}t|tr)|\}}n|d }}| j||d}t|trB|\}	}
|	|
fS |d }	}
|	|
fS )N)r   r   )
input_specr   r   r   )rY   r_   trainingrZ   r   r   rL   )r=   r   r   processed_signalprocessed_signal_lenencoder_outputsr   r   decoder_outputsr   r   r.   r.   r/   forwardk  s    





zEncDecSpeakerLabelModel.forwardc                 C   s0  t |dkr7|\}}}}}}| j||d\}}	| j||d\}}
| d d }t|	|
}tjj||}n |\}}}}| j||d}t|t	rN|\}}n|}| j
||d}| d| | d| jjd d	  | d
| jj | j||d | j }| j  t|D ]\}}| d| | qd|iS )Nr~   r         ?rG   r   rK   r4   learning_rater   lrglobal_steptraining_batch_accuracy_top_)r9   r   r   r   cosine_similaritynn
functionalmse_lossr   r   r4   log
_optimizerparam_groupsr2   r   rW   computereset	enumerate)r=   batch	batch_idxaudio_signal_1audio_signal_len_1audio_signal_2audio_signal_len_2rK   _
audio_emb1
audio_emb2loss_labels
cosine_simr4   r   audio_signal_lenr   r   rA   r;   top_ir.   r.   r/   training_step  s,   



z%EncDecSpeakerLabelModel.training_stepr   valdataloader_idxtagc              
   C   s`  t |dkr| ||||S |\}}}}| j||d}	t|	tr%|	\}
}n|	}
| j|
|d}| j|
|d}| jj| jj}}| j	j
|
|d | j	 }| d|| d|| d|| d|| d	|i}	|d
krt| jjttfrt | jjdkr| j| |	 |	S | j|	 |	S t| jjttfrt | jjdkr| j| |	 |	S | j|	 |	S )Nr~   r   r   predstarget_loss_correct_counts_total_counts_acc_micro_top_k_acc_macro_statsr   r3   )r9   pair_evaluation_stepr   r   r   rU   rW   correct_counts_ktotal_counts_kr[   ro   _final_stater2   val_dataloadersrq   validation_step_outputsr*   test_dataloaderstest_step_outputs)r=   r   r   r   r   r   r   rK   r   r   r   
loss_value	acc_top_kcorrect_countstotal_countsstatsr.   r.   r/   evaluation_step  s:   







"	"z'EncDecSpeakerLabelModel.evaluation_stepc                 C   s  |\}}}}}	}
| j ||d\}
}| j ||d\}
}|	 d d }t||}tjj||}tjd| |gdd}| j||	d}| jj	| jj
}}| jj||	d | j }| d	|| d
|| d|| d|| d|| d|| d|	i}|dkrt| jjttfrt| jjdkr| j| | |S | j| |S t| jjttfrt| jjdkr| j| | |S | j| |S )Nr   r   rG   r3   dimr   r   r   r   r   r   r   _scores_labelsr   )r   r   r   r   r   r   r   stackrW   r   r   r\   ro   r   r   r2   r   rq   r   r9   r   r*   r   r   )r=   r   r   r   r   r   r   r   r   rK   r   r   r   r   r   r   r   r  r  r  r  r   r.   r.   r/   r     s<   








"	"z,EncDecSpeakerLabelModel.pair_evaluation_stepc              
      sB  t  fdd|D  }t  fdd|D   }t  fdd|D    }t||dd\}}}	d| }
z|t	t
|
|  d }W n tyj } ztd|  d	}W Y d }~nd }~ww t  fd
d|D jdd}t  fdd|D jdd}|| j_|| j_| j }t  fdd|D jdd| j_t  fdd|D jdd| j_t  fdd|D jdd| j_t  fdd|D jdd| j_| j }| j  | j    d|  d|i}t| jj|D ]\}}||  d| < q||  d<   d|d|iS )Nc                       g | ]	}|  d  qS r   r.   r:   xr   r.   r/   r>         zEEncDecSpeakerLabelModel.pair_multi_eval_epoch_end.<locals>.<listcomp>c                    r  )r	  r.   r  r  r.   r/   r>     r  c                    r  )r
  r.   r  r  r.   r/   r>     r  r3   )y_truey_score	pos_labeld   z&Got ValueError while calculating EER: g      Y@c                    r  r   r.   r  r  r.   r/   r>     r  r   axisc                    r  r   r.   r  r  r.   r/   r>     r  c                       g | ]}|  d  d qS r   r   r.   r  r  r.   r/   r>          c                    r  r   r3   r.   r  r  r.   r/   r>     r  c                    r  r   rG   r.   r  r  r.   r/   r>     r  c                    r  r      r.   r  r  r.   r/   r>     r  r   _eer_acc_micro_top_
_acc_macror   )r   r  meancatcpunumpylongr   np	nanargminabsolute
ValueErrorr   rk   r7   rW   r   r   r   r\   tpfptnfnr   ziprA   )r=   outputsr   r   	loss_meanscoresrK   fprtpr
thresholdsfnreerer  r  topk_scoresmacro_accuracy_scoretensorboard_logsrA   scorer.   r  r/   pair_multi_eval_epoch_end  s:    $   
$$$$


z1EncDecSpeakerLabelModel.pair_multi_eval_epoch_endc                    s  |rt dd |D std  d| d|  i S   d|d v r+| || S t fdd|D  }t fd	d|D jdd
}t fdd|D jdd
}|| j_	|| j_
| j }t fdd|D jdd
| j_t fdd|D jdd
| j_t fdd|D jdd
| j_t fdd|D jdd
| j_| j }| j  | j    d|i}	t| jj|D ]\}
}||	  d|
 < q||	  d<   d|d|	iS )Nc                 S   s   g | ]}t |qS r.   )boolr  r.   r.   r/   r>         zFEncDecSpeakerLabelModel.multi_evaluation_epoch_end.<locals>.<listcomp>z?Not all outputs are dictionaries. Cannot aggregate results for z dataset in dataloader z. Outputs: r	  r   c                    r  r  r.   r  r  r.   r/   r>     r  c                    r  r  r.   r  r  r.   r/   r>     r  r  c                    r  r  r.   r  r  r.   r/   r>     r  c                    r  r  r.   r  r  r.   r/   r>   $  r  c                    r  r  r.   r  r  r.   r/   r>   %  r  c                    r  r  r.   r  r  r.   r/   r>   &  r  c                    r  r  r.   r  r  r.   r/   r>   '  r  r   r"  r#  r   )allr   rk   r?  r   r  r$  r7   rW   r   r   r   r[   r-  r.  r/  r0  r   r1  rA   )r=   r2  r   r   r3  r  r  r;  r<  r=  rA   r>  r.   r  r/   multi_evaluation_epoch_end  s2     
$$$$


z2EncDecSpeakerLabelModel.multi_evaluation_epoch_endc                 C      |  |||dS Nr   r  r=   r   r   r   r.   r.   r/   validation_step4     z'EncDecSpeakerLabelModel.validation_stepc                 C      |  ||dS rE  rC  r=   r2  r   r.   r.   r/   multi_validation_epoch_end7     z2EncDecSpeakerLabelModel.multi_validation_epoch_endc                 C   rD  NtestrF  rG  r.   r.   r/   	test_step:  rI  z!EncDecSpeakerLabelModel.test_stepc                 C   rJ  rO  rK  rL  r.   r.   r/   multi_test_epoch_end=  rN  z,EncDecSpeakerLabelModel.multi_test_epoch_endc                 C   s   t |\}}| jjdd}||krtjj|||d}|jd }| j	}t
|g}tj||tjdtj|g|d}}| j}	|   | j||d\}
}| j|	d |	d	u rZ|   ~~||
fS )
z
        Args:
            path2audio_file: path to an audio wav file

        Returns:
            emb: speaker embeddings (Audio representations)
            logits: logits corresponding of final layer
        rx   >  orig_sr	target_srr   devicedtyperX  r   modeT)sfreadr^   train_dsrS   librosacoreresampleshaperX  r)  arrayr   tensorfloat32r   freezer   trainunfreeze)r=   path2audio_fileaudiosrrV  audio_lengthrX  r   r   r\  r   embr.   r.   r/   
infer_file@  s$   

z"EncDecSpeakerLabelModel.infer_filec           
      C   s   |j d }| j}t|g}tj||tjdtj|g|d}}| j}|   | j	||d\}}	| j
|d |du r>|   ~~|	|fS )z
        Args:
            segment: segment of audio file

        Returns:
            emb: speaker embeddings (Audio representations)
            logits: logits corresponding of final layer
        r   rW  rZ  r   r[  T)rc  rX  r)  rd  r   re  rf  r   rg  r   rh  ri  )
r=   segmentsegment_lengthrX  rk  r   r   r\  r   rn  r.   r.   r/   infer_segment`  s   

z%EncDecSpeakerLabelModel.infer_segmentr3   rj  segment_durationnum_segmentsrandom_seedc                 C   s  t |\}}| jjdd}||krtjj|||d}|jd }|| }	|	|kr+|}	g }
t	j
| t	j
jd||	 d |d}|D ] }||||	  }| |\}}|jdd}|
t|d  qBt|
dd d }| jd d	d
}|d
urt|}|| }|S td |}|S )a  
        Returns label of path2audio_file from classes the model was trained on.
        Args:
            path2audio_file (str): Path to audio wav file.
            segment_duration (float): Random sample duration in seconds.
            num_segments (int): Number of segments of file to use for majority vote.
            random_seed (int): Seed for generating the starting position of the segment.

        Returns:
            label: label corresponding to the trained model
        rx   rS  rT  r   r3   )sizer  r_  rK   NzGlabels are not saved to model, hence only outputting the label id index)r]  r^  r^   r_  rS   r`  ra  rb  rc  r)  randomseedrandintrr  argmaxr*   r   r   most_commonrq   r   info)r=   rj  rs  rt  ru  rk  rl  rV  rm  durationlabel_id_liststartsstartr   r   label_id
m_label_idtrained_labelslabelr.   r.   r/   	get_label}  s2   

z!EncDecSpeakerLabelModel.get_labelc                 C   s   | j |d\}}|S )z
        Returns the speaker embeddings for a provided audio file.

        Args:
            path2audio_file: path to an audio wav file

        Returns:
            emb: speaker embeddings (Audio representations)
        )rj  )ro  )r=   rj  rn  r   r.   r.   r/   get_embedding  s   z%EncDecSpeakerLabelModel.get_embeddingffffff?c           	      C   s   |  | }|  | }|tj| }|tj| }t||t||t|| d  }|d d }||krCtd dS td dS )a  
        Verify if two audio files are from the same speaker or not.

        Args:
            path2audio_file1: path to audio wav file of speaker 1
            path2audio_file2: path to audio wav file of speaker 2
            threshold: cosine similarity score used as a threshold to distinguish two embeddings (default = 0.7)

        Returns:
            True if both audio files are from same speaker, False otherwise
        r   r3   rG   z& two audio files are from same speakerTz, two audio files are from different speakersF)r  squeezer   linalgnormdotr   r|  )	r=   path2audio_file1path2audio_file2	thresholdembs1embs2XYsimilarity_scorer.   r.   r/   verify_speakers  s   (

z'EncDecSpeakerLabelModel.verify_speakers    rS  cudac              	   C   s  t |tu r1t }tj|jd}tj|jd}| dd |D | | dd |D | nt	d| j
||||d\}	}
}
}
| j
||||d\}}
}
}
t|	|}	t||}t|	tjj|	dd	jdd	}	t|tjj|dd	jdd	}|	jdd	}|jd
d	}t|| t||dd
d t|dd
d|  d  }|d d
 }||k}|  |  S )aZ  
        Verify if audio files from the first and second manifests are from the same speaker or not.

        Args:
            audio_files_pairs: list of tuples with audio_files pairs to be verified
            threshold: cosine similarity score used as a threshold to distinguish two embeddings (default = 0.7)
            batch_size: batch size to perform batch inference
            sample_rate: sample rate of audio files in manifest file
            device: compute device to perform operations.

        Returns:
            True if both audio pair is from same speaker, False otherwise
        ztmp_manifest1.jsonztmp_manifest2.jsonc                 S      g | ]}|d  qS r   r.   r:   pr.   r.   r/   r>     rA  zAEncDecSpeakerLabelModel.verify_speakers_batch.<locals>.<listcomp>c                 S   r  )r3   r.   r  r.   r.   r/   r>     rA  zQaudio_files_pairs must be of type list of tuples containing a pair of audio files)r   rx   rX  r3   r  rG   r   r   )typerq   tempfileTemporaryDirectoryospathjoinnamepath2audio_files_to_manifestr,  batch_inferencer   Tensortodivr  r  	unsqueezematmulr  permutecleanupr&  r'  )r=   audio_files_pairsr  r   rx   rX  tmp_dirmanifest_filepath1manifest_filepath2r  r   r  r  r  similarity_scoresdecisionr.   r.   r/   verify_speakers_batch  s4     6z-EncDecSpeakerLabelModel.verify_speakers_batchc                    sV  | j }|   |   |  | jd d }|durt|}||d|d}| || _| j|d< | j|d g }g }	g }
t	 D ]9}dkrPfdd	|D }|\}}}}| j
||d
\}}||   |
|   |	|   qAtt fdd|
}
| j|d |du r|   t|t|	t|
}}	}
|	||
|fS )a  
        Perform batch inference on EncDecSpeakerLabelModel.
        To perform inference on single audio file, once can use infer_model, get_label or get_embedding

        To map predicted labels, one can do
            `arg_values = logits.argmax(axis=1)`
            `pred_labels = list(map(lambda t : trained_labels[t], arg_values))`

        Args:
            manifest_filepath: Path to manifest file
            batch_size: batch size to perform batch inference
            sample_rate: sample rate of audio files in manifest file
            device: compute device to perform operations.

        Returns:
            The variables below all follow the audio file order in the manifest file.
            embs: embeddings of files provided in manifest file
            logits: logits of final layer of EncDecSpeakerLabel Model
            gt_labels: labels from manifest file (needed for speaker enrollment and testing)
            trained_labels: Classification labels sorted in the order that they are mapped by the trained model

        r_  rK   Nr   )rd   rx   r   r   r   r  c                    s   g | ]}|  qS r.   )r  r  rZ  r.   r/   r>   <  s    z;EncDecSpeakerLabelModel.batch_inference.<locals>.<listcomp>r   c                    s    j j|  S r   )r   id2label)t)
dataloaderr.   r/   <lambda>D  s    z9EncDecSpeakerLabelModel.batch_inference.<locals>.<lambda>r[  T)r   rg  evalr  r^   rq   ru   rK   r   r   r   extendr&  r'  maprh  ri  r)  asarray)r=   rd   r   rx   rX  r\  r  	dl_configr   r   	gt_labels
test_batchr   r   rK   r   logitrn  r.   )r  rX  r/   r    s@   

"z'EncDecSpeakerLabelModel.batch_inferencer   )r   r   r  )r  )r  r  rS  r  )r  rS  r  )3__name__
__module____qualname____doc__classmethodr   r   r0   r
   r	   rP   staticmethodru   r   r   r   r   r   r   r   r   propertystrr   r   r   r   r   r   r   r  r   r?  rC  rH  rM  rQ  rR  r   no_gradro  rr  r)  infr   r  r  r  r  r  __classcell__r.   r.   rb   r/   r    9   s`    -A
O#
%'#$


.3)>rQ   rl   r  r  collectionsr   mathr   typingr   r   r   r   r`  r'  r)  	soundfiler]  r   hydra.utilsr   lightning.pytorchr	   	omegaconfr
   r   r   sklearn.metricsr   torchmetricsr   r   (nemo.collections.asr.data.audio_to_labelr   r   r   0nemo.collections.asr.data.audio_to_label_datasetr   r   /nemo.collections.asr.data.audio_to_text_datasetr   %nemo.collections.asr.models.asr_modelr   (nemo.collections.asr.parts.mixins.mixinsr   1nemo.collections.asr.parts.preprocessing.featuresr   0nemo.collections.asr.parts.preprocessing.perturbr   nemo.collections.common.metricsr   7nemo.collections.common.parts.preprocessing.collectionsr   nemo.core.classesr   nemo.core.classes.commonr   nemo.core.neural_types
nemo.utilsr   __all__r    r.   r.   r.   r/   <module>   s@   