o
    }oi$]                     @   sJ  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 dgZ1dZ2dZ3dZ4dd Z5G dd dej6j7e+eZ8dS )    N)deepcopy)AnyListOptionalUnion)rank_zero_only)
DictConfig	OmegaConf)tqdm)score_labels)EncDecClassificationModel)EncDecSpeakerLabelModel)DiarizationMixin)audio_rttm_mapget_embs_and_timestampsget_uniqname_from_filepathparse_scale_configsperform_clustering)segments_manifest_to_subsegments_manifestvalidate_vad_manifestwrite_rttm2manifest)generate_overlap_vad_seqgenerate_vad_segment_tableget_vad_stream_statusprepare_manifest)Model)SaveRestoreConnector)loggingmodel_utilsClusteringDiarizerzmodel_config.yamlzvad_model.nemozspeaker_model.nemoc                 C   s   |   }ttdd |S )z/lists available pretrained model names from NGCc                 S   s   | j S N)pretrained_model_name)x r#   c/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/models/clustering_diarizer.py<lambda>>   s    z+get_available_model_names.<locals>.<lambda>)list_available_modelslistmap)
class_nameavailable_modelsr#   r#   r$   get_available_model_names;   s   r+   c                	       s  e Zd ZdZd/deeef f fddZedd Z	dd	 Z
d/d
dZdd Zdd Zdd Zd0dededefddZdd ZdededefddZd1d ee d!efd"d#Zed$efd%d&Ze		d2d'ed(ee d)eej fd*d+Zed,efd-d.Z  Z S )3r   a  
    Inference model Class for offline speaker diarization.
    This class handles required functionality for diarization : Speech Activity Detection, Segmentation,
    Extract Embeddings, Clustering, Resegmentation and Scoring.
    All the parameters are passed through config file
    Ncfgc                    s   t    t|trt|}t|}|| _| jj| _	d| _
| j	js6| jjjjd ur6| jjjj| _|   i | _| | | jjjj| _| j	jj| _d S )NF)super__init__
isinstancer   r   #convert_model_config_to_dict_configmaybe_update_config_version_cfgdiarizer_diarizer_paramshas_vad_model
oracle_vadvad
model_path
parameters_vad_params_init_vad_model$multiscale_embeddings_and_timestamps_init_speaker_modelspeaker_embeddings_speaker_params
clustering_cluster_params)selfr,   speaker_model	__class__r#   r$   r.   I   s   





zClusteringDiarizer.__init__c                 C   s   d S r    r#   )clsr#   r#   r$   r&   c   s   z(ClusteringDiarizer.list_available_modelsc                 C   s   | j jjj}|drtj|| j jd| _t	
d| n"|ttvr.t	d| d}t	
d| tj|| j jd| _| jj| _| jj| _d| _d	S )
zT
        Initialize VAD model with model name or path passed through config
        .nemomap_locationz VAD model loaded locally from {}Crequested {} model name not available in pretrained models, insteadvad_telephony_marblenet$Loading pretrained {} model from NGC
model_namerI   TN)r2   r3   r7   r8   endswithr   restore_fromdevice
_vad_modelr   infoformatr+   warningfrom_pretrainedr:   window_length_in_sec_vad_window_length_in_secshift_length_in_sec_vad_shift_length_in_secr5   )rB   r8   r#   r#   r$   r;   g   s    



z"ClusteringDiarizer._init_vad_modelc                 C   s  |dur| j jdu rtj r|td| _n`|| _n\| j jjj	}|dur?|
dr?tj|| j jd| _td| n:|
drWtj|| j jd| _td| n"|ttvrgtd| d}td	| tj|| j jd
| _t| jjjj| jjjj| jjjj| _dS )zb
        Initialize speaker embedding model with model name or path passed through config
        NcudarG   rH   z&Speaker Model restored locally from {}z.ckptrJ   
ecapa_tdnnrL   rM   )r2   rQ   torchr[   is_availableto_speaker_modelr3   r>   r8   rO   r   rP   r   rS   rT   load_from_checkpointr+   rU   rV   r   r4   r9   rW   rY   multiscale_weightsmultiscale_args_dict)rB   rC   r8   r#   r#   r$   r=   }   s6   




z&ClusteringDiarizer._init_speaker_modelc              
   C   s@   || j j| j dddg| j| jd| j jd	}| jj|d d S )N
batch_sizeTinferF)	manifest_filepathsample_raterd   
vad_streamlabelsrW   rY   trim_silencenum_workers)test_data_config)r2   rg   getrX   rZ   rk   rR   setup_test_data)rB   manifest_vad_inputvad_dl_configr#   r#   r$   _setup_vad_test_data   s   
z'ClusteringDiarizer._setup_vad_test_datac                 C   s2   || j j| j ddd | j jd}| j| d S )Nrd   F)rf   rg   rd   rj   ri   rk   )r2   rg   rm   rk   r`   rn   )rB   manifest_filespk_dl_configr#   r#   r$   _setup_spkr_test_data   s   
z(ClusteringDiarizer._setup_spkr_test_datac              
      s<  t j jdd t j  j  t j j	 }t|d }|| }d}g }t
|dddD ]}t|d }|t| q0t|}	tt j d	d j d
D ]\}
} fdd|D }tj jjj  j|d |d d}tj|dd}|dddf }|	|
 dkr|d|  }n|	|
 dkr|||  }n|	|
 dkr||d }n|}|t|7 }tj j||
 d }t
|ddd}tt|D ]}|d||  qW d   n1 sw   Y  W d   n1 sw   Y  ~|	|
 dks
|	|
 dkrd}qT j j!s j _" j	}nt#$d t% j j j! j j& j j	 j'j(d}| _"d}t#$d t) j t*t+frH j n j + }t, j"|| j'j( jd}i } j-D ]2}tj.tj||d rt/ j-| ||< tj||d || d< q_t#0d | d! q_t1| j2  j2 _3dS )"aw  
        Run voice activity detection.
        Get log probability of voice activity detection and smoothes using the post processing parameters.
        Using generated frame level predictions generated manifest file for later speaker embedding extraction.
        input:
        manifest_file (str) : Manifest file containing path to audio file and label as infer

        Tignore_errors   r   rutf-8encodingaudio_filepathr7   descleavedisablec                       g | ]	}|  jjqS r#   )r_   rR   rQ   .0r"   rB   r#   r$   
<listcomp>       z/ClusteringDiarizer._run_vad.<locals>.<listcomp>   input_signalinput_signal_lengthdimNstartnextendz.frameaz	{0:0.4f}
singlez6Generating predictions with overlapping input segments)frame_pred_dirsmoothing_methodoverlaprW   rY   rk   g{Gz?z\Converting frame level prediction to speech/no-speech segment in start and end times format.)vad_pred_dirpostprocessing_paramsframe_length_in_secrk   out_dirz.txtrttm_filepathzno vad file found for z! due to zero or negative duration)4shutilrmtree_vad_dirosmakedirsrR   evalintrX   rZ   openjsonloadsappendr   r   	enumerater
   test_dataloaderverboser]   ampautocastrQ   typesoftmaxlenpathjoinrangewriterT   r:   	smoothingr   r   rS   r   r   r2   rk   r/   r   dictr   AUDIO_RTTM_MAPexistsr   rU   r   _vad_out_file_speaker_manifest_path)rB   rr   	time_unittrunctrunc_lall_lendatalinefilestatusi
test_batch	log_probsprobspredto_saveoutpathfoutfr   smoothing_pred_dir
vad_paramstable_out_dirAUDIO_VAD_RTTM_MAPkeyr#   r   r$   _run_vad   s   




"
zClusteringDiarizer._run_vad windowshift	scale_tagc                 C   sT   t j| jd| d| _td|dd d| j  t| j	| j||d| _d S )Nsubsegmentsz.jsonz)Subsegmentation for embedding extraction:_ z, )segments_manifest_filesubsegments_manifest_filer   r   )
r   r   r   _speaker_dirsubsegments_manifest_pathr   rS   replacer   r   )rB   r   r   r   r#   r#   r$   _run_segmentation  s   z$ClusteringDiarizer._run_segmentationc                 C   s   | j r=d| _d| _| jj}| jr-td td || j| j| j	j
| jjd}t|}ntd | | | | n(| jjjdurK| jjj| _n| jjratj| jd| _t| j| j| _ntd	t| j| jd
 dS )z
        Checks for type of speech activity detection from config. Choices are NeMo VAD,
        external vad manifest and oracle VAD (generates speech activity labels from provided RTTM files)
        T2   z0Split long audio file to avoid CUDA memory issuez>Try smaller split_duration if you still have CUDA memory issue)inputrW   split_durationrk   r   z_If you encounter CUDA memory issue, try splitting manifest entry by split_duration to avoid it.Nzoracle_vad_manifest.jsonzgOnly one of diarizer.oracle_vad, vad.model_path or vad.external_vad_manifest must be passed from config)vad_manifest)r5   _auto_split_split_durationr4   rf   r   rS   debugrX   r2   rk   r   r   rU   rq   r   r7   external_vad_manifestr   r6   r   r   r   r   r   r   
ValueErrorr   )rB   ro   configr#   r#   r$   "_perform_speech_activity_detection$  s8   



z5ClusteringDiarizer._perform_speech_activity_detectionrr   	scale_idx
num_scalesc              	      sh  t d  | i  _ j  i  _tdg}t	 j
 d|d  d| dd j dD ]M} fd	d
|D }|\}}}}	tj jjj*  jj||d\}
}|jd }|d|}tj||  fdd}W d   n1 sxw   Y  ~q1t|dddd}t| D ]U\}}| }t|}t|d }| jv rt j| || ddf j|< n|| dd j|< | jvrg  j|< |d }||d  } j| ||g qW d   n1 sw   Y   jjr2t j!" j#d}t j!$|st j%|dd t|}t j!"||}|d  _&t'( jt j&d t d)| dS dS )z
        This method extracts speaker embeddings from segments passed through manifest_file
        Optionally you may save the intermediate speaker embeddings for debugging or any use.
        z%Extracting embeddings for Diarizationr   [r   /z] extract embeddingsTr}   c                    r   r#   )r_   r`   rQ   r   r   r#   r$   r   ^  r   z:ClusteringDiarizer._extract_embeddings.<locals>.<listcomp>r   r   r   Nrx   ry   rz   r|   offsetduration
embeddingsexist_okz_embeddings.pklwbzSaved embedding files to {})*r   rS   rt   r   r`   r   time_stampsr]   emptyr
   r   r   r   r   rQ   r   forwardshapeviewcatcpudetachr   r   	readlinesstripr   r   r   r   r?   save_embeddingsr   r   r   r   r   r   _embeddings_filepkldumprT   )rB   rr   r   r   all_embsr   audio_signalaudio_signal_lenri   slicesr   embs	emb_shapemanifestr   r   dic	uniq_namer   r   embedding_dirprefixnamer#   r   r$   _extract_embeddingsL  s\   






(



z&ClusteringDiarizer._extract_embeddingsr   paths2audio_filesrd   c                 C   s  | j j| _tj| j jd| _tj| jr#t	d t
j| jdd t| j tj| js6t| j tj| jd| _tj| jd| _|rN|| j_|rmt|tu ritj| jd| j _| || j j ntdt| j j| _tj| jd	}tj|dd
 |   | jd  }|D ]$\}\}}| j||d| d | | j|t| | j | j!g| j"|< qt#| j"| j}t$|| j|| j%| j&j'| j(d\}	}
t)d*tj+| j j t,| j|	|
| j j-| j j.| j(dS )a  
        Diarize files provided through paths2audio_files or manifest file
        input:
        paths2audio_files (List[str]): list of paths to file containing audio file
        batch_size (int): batch_size considered for extraction of speaker embeddings and VAD computation
        speaker_outputsz.Deleting previous clustering diarizer outputs.Tru   vad_outputszvad_out.jsonzpaths2audio_filepath.jsonzMpaths2audio_files must be of type list of paths to file containing audio file
pred_rttmsr   
scale_dict_scale)r   )embs_and_timestampsr   out_rttm_dirclustering_paramsrQ   r   z!Outputs are saved in {} directory)collarignore_overlapr   )/r4   r   _out_dirr   r   r   r   r   r   rU   r   r   r   mkdirr   r   r2   rd   r   r'   rf   path2audio_files_to_manifestr   r   r   r   rc   itemsr   r  r   r   r   r   r<   r   r   rA   r`   rQ   r   rS   rT   abspathr   r  r  )rB   r  rd   r  scalesr   r   r   r  all_referenceall_hypothesisr#   r#   r$   diarize  s\   


zClusteringDiarizer.diarize	save_pathc                 C   s   t  :}tj|t}tj|t}| j|d | jr)tj|t	}| j
| | j| tj||d W d   dS 1 sAw   Y  dS )a  
        Saves model instance (weights and configuration) into EFF archive or .
         You can use "restore_from" method to fully restore instance from .nemo file.

        .nemo file is an archive (tar.gz) with the following:
            model_config.yaml - model configuration in .yaml format. You can deserialize this into cfg argument for model's constructor
            model_wights.chpt - model checkpoint

        Args:
            save_path: Path to .nemo file where model instance should be saved
        )path2yaml_file)filename
source_dirN)tempfileTemporaryDirectoryr   r   r   _MODEL_CONFIG_YAML_SPEAKER_MODELto_config_filer5   
_VAD_MODELrR   save_tor`   r   _make_nemo_file_from_folder)rB   r   tmpdirconfig_yaml
spkr_model	vad_modelr#   r#   r$   r*    s   
"zClusteringDiarizer.save_torestore_pathoverride_config_pathrI   c           	   
   C   s  t  }t {}zktj||d t | |d u r"t j|t	}n|}t
|}t jt j|tr?t j|t|jj_n
td| j d t j|t|jj_||_t
|d | |d}td| j d| d W t | nt | w W d    |S 1 sw   Y  |S )N)	path2file
out_folderzModel zt does not contain a VAD model. A VAD model or manifest file withspeech segments need for diarization with this modelT)r,   z  was successfully restored from .)r   getcwdr$  r%  r   _unpack_nemo_filechdirr   r   r&  r	   loadr   r)  r3   r7   r8   r   rS   __name__r'  r>   restore_map_location
set_struct)	rF   r0  r1  rI   cwdr,  r-  confinstancer#   r#   r$   rP     s0   	




zClusteringDiarizer.restore_fromreturnc                 C   s   | j jS r    )r2   r   r   r#   r#   r$   r     s   zClusteringDiarizer.verboser    )r   )Nr   )NN)!r9  
__module____qualname____doc__r   r   r   r.   classmethodr&   r;   r=   rq   rt   r   floatstrr   r   r   r  r   r  r   r*  r   r]   rQ   rP   propertyboolr   __classcell__r#   r#   rD   r$   r   A   s8    

#[(5N&)9r   r   pickler   r   r$  copyr   typingr   r   r   r   r]   lightning.pytorch.utilitiesr   	omegaconfr   r	   r
    nemo.collections.asr.metrics.derr   1nemo.collections.asr.models.classification_modelsr   (nemo.collections.asr.models.label_modelsr   (nemo.collections.asr.parts.mixins.mixinsr   .nemo.collections.asr.parts.utils.speaker_utilsr   r   r   r   r   r   r   r   *nemo.collections.asr.parts.utils.vad_utilsr   r   r   r   nemo.core.classesr   +nemo.core.connectors.save_restore_connectorr   
nemo.utilsr   r   __all__r&  r)  r'  r+   nnModuler   r#   r#   r#   r$   <module>   s4   (
