o
    9wiۃ                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZmZ d dlmZmZ d d	lmZ d d
l m!Z! d dl"m#Z# d dl$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3 d dl4m5Z5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z; e<ej=Z>e<ej=Z?G dd de)Z@ddeAdeAdeAfddZBeCdkrd dlDZDeDEeB dS dS )    N)Counter)DictLiteralOptionalSequenceTextTupleUnion)pyplot)SegmentSlidingWindowFeature)SpeakerDiarizationProtocol)ScopeSubset)MLFlowLoggerTensorBoardLogger)track)BaseWaveformTransform)Metric)Problem
ResolutionSpecifications)SegmentationTask)	DiarizationErrorRateFalseAlarmRateMissedDetectionRateOptimalDiarizationErrorRate$OptimalDiarizationErrorRateThresholdOptimalFalseAlarmRateOptimalMissedDetectionRateOptimalSpeakerConfusionRateSpeakerConfusionRate)binary_cross_entropymse_lossnll_loss)	permutate)Powersetc                $       s  e Zd ZdZ																d3dedeeedf  d	ed
ee	 dee	 de
deeeeef f deee  dee de	dee	 de
dee ded deeee eeef f dee	 ded f" fddZd4 fdd	Zdd Zde	d ed	efd!d"Zd#ejfd$d%Z	d4d&ejd'ejdeej d#ejfd(d)Z	d4d&ejd'ejdeej d#ejfd*d+Zd,e	fd-d.Zd#eeee eeef f fd/d0Zd,e	fd1d2Z  ZS )5SpeakerDiarizationuO  Speaker diarization

    Parameters
    ----------
    protocol : SpeakerDiarizationProtocol
        pyannote.database protocol
    cache : str, optional
        As (meta-)data preparation might take a very long time for large datasets,
        it can be cached to disk for later (and faster!) re-use.
        When `cache` does not exist, `Task.prepare_data()` generates training
        and validation metadata from `protocol` and save them to disk.
        When `cache` exists, `Task.prepare_data()` is skipped and (meta)-data
        are loaded from disk. Defaults to a temporary path.
    duration : float, optional
        Chunks duration. Defaults to 2s.
    max_speakers_per_chunk : int, optional
        Maximum number of speakers per chunk (must be at least 2).
        Defaults to estimating it from the training set.
    max_speakers_per_frame : int, optional
        Maximum number of (overlapping) speakers per frame.
        Setting this value to 1 or more enables `powerset multi-class` training.
        Default behavior is to use `multi-label` training.
    weigh_by_cardinality: bool, optional
        Weigh each powerset classes by the size of the corresponding speaker set.
        In other words, {0, 1} powerset class weight is 2x bigger than that of {0}
        or {1} powerset classes. Note that empty (non-speech) powerset class is
        assigned the same weight as mono-speaker classes. Defaults to False (i.e. use
        same weight for every class). Has no effect with `multi-label` training.
    warm_up : float or (float, float), optional
        Use that many seconds on the left- and rightmost parts of each chunk
        to warm up the model. While the model does process those left- and right-most
        parts, only the remaining central part of each chunk is used for computing the
        loss during training, and for aggregating scores during inference.
        Defaults to 0. (i.e. no warm-up).
    balance: Sequence[Text], optional
        When provided, training samples are sampled uniformly with respect to these keys.
        For instance, setting `balance` to ["database","subset"] will make sure that each
        database & subset combination will be equally represented in the training samples.
    weight: str, optional
        When provided, use this key as frame-wise weight in loss function.
    batch_size : int, optional
        Number of training samples per batch. Defaults to 32.
    num_workers : int, optional
        Number of workers used for generating training samples.
        Defaults to multiprocessing.cpu_count() // 2.
    pin_memory : bool, optional
        If True, data loaders will copy tensors into CUDA pinned
        memory before returning them. See pytorch documentation
        for more details. Defaults to False.
    augmentation : BaseWaveformTransform, optional
        torch_audiomentations waveform transform, used by dataloader
        during training.
    vad_loss : {"bce", "mse"}, optional
        Add voice activity detection loss.
        Cannot be used in conjunction with `max_speakers_per_frame`.
    metric : optional
        Validation metric(s). Can be anything supported by torchmetrics.MetricCollection.
        Defaults to AUROC (area under the ROC curve).

    References
    ----------
    Hervé Bredin and Antoine Laurent
    "End-To-End Speaker Segmentation for Overlap-Aware Resegmentation."
    Proc. Interspeech 2021

    Zhihao Du, Shiliang Zhang, Siqi Zheng, and Zhijie Yan
    "Speaker Embedding-aware Neural Diarization: an Efficient Framework for Overlapping
    Speech Diarization in Meeting Scenarios"
    https://arxiv.org/abs/2203.09767

    N       @F            protocolcachedurationmax_speakers_per_chunkmax_speakers_per_frameweigh_by_cardinalitywarm_upbalanceweight
batch_sizenum_workers
pin_memoryaugmentationvad_loss)bcemsemetricmax_num_speakerslossc                    s   t  j||||
|||||d	 t|tstd|d u r'|d ur'|}td |d ur0td |d urH|dk r@td| d|d urHtd|| _|| _|| _	|| _
|	| _|| _d S )	N)r-   r1   r4   r5   r6   r7   r;   r,   z>SpeakerDiarization task requires a SpeakerDiarizationProtocol.zL`max_num_speakers` has been deprecated in favor of `max_speakers_per_chunk`.z-`loss` has been deprecated and has no effect.   z5`max_speakers_per_frame` must be 1 or more (you used z).z?`vad_loss` cannot be used jointly with `max_speakers_per_frame`)super__init__
isinstancer   
ValueErrorwarningswarnr.   r/   r0   r2   r3   r8   )selfr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r;   r<   r=   	__class__ r/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/pyannote/audio/tasks/segmentation/speaker_diarization.pyr@      sJ   



zSpeakerDiarization.__init__c              
      s  t  | | jd u r-| jd d tdk}g }d| jdd}tt	|d |dD ]}| jd	 t	| jd	 d
 |kd  }| jd t	| jd d
 |kd  }|D ]U}|d }	|d |d  }
|t	|d |	k|d |
k d  }t
|	|
| j d| j D ]&}|| j }|t	|d |k|d |k d  }|tt|d  qqWq-ttt|  \}}t|t|}}t|}|| }|| }t|t| }t||D ]-\}}|dkrtd|dd q|dkrtd|dd qtd|dd| d qtd|t	|dkd d  | _td| j d| j d | jd urG| j| jkrGtd| j d| j dt| jd u rQtjntjtj| j| j | j!d d! t"| jD | jd"d#| _#d S )$Naudio-metadatasubsettrainz*Estimating maximum number of speakers per gzs chunk in the training setr   descriptionannotations-segmentsfile_idzannotations-regionsstartr-   endg      ?file_label_idxz   - z7.2%z( of all chunks contain no speech at all.r>   z contain 1 speaker or lessz	 contain z speakers or less   g
ףp=
?z$Setting `max_speakers_per_chunk` to za. You can override this value (or avoid this estimation step) by passing `max_speakers_per_chunk=z` to the task constructor.z`max_speakers_per_frame` (z1) must be smaller than `max_speakers_per_chunk` ()c                 S   s   g | ]	}d |d  qS )zspeaker#r>   rH   ).0irH   rH   rI   
<listcomp>*  s    z,SpeakerDiarization.setup.<locals>.<listcomp>T)problem
resolutionr-   min_durationr1   classespowerset_max_classespermutation_invariant)$r?   setupr.   prepared_dataSubsetsindexr-   r   npwherearangeappendlenuniqueziplistr   itemsarrayargsortcumsumsumprintmaxr/   rB   r   r   MULTI_LABEL_CLASSIFICATIONMONO_LABEL_CLASSIFICATIONr   FRAMEr\   r1   rangespecifications)rE   stagetrainingnum_unique_speakersprogress_descriptionrQ   annotationsannotated_regionsregionregion_start
region_endregion_annotationswindow_start
window_endwindow_annotationsnum_speakerscountssorting_indicesratioskratiorF   rH   rI   r`      s   









zSpeakerDiarization.setupc                 C   s*   | j jrtt| j j| j j| j_d S d S N)rw   powersetr&   rh   r]   r^   modelrE   rH   rH   rI   setup_loss_func/  s   
z"SpeakerDiarization.setup_loss_funcrQ   
start_timec                    s  |  |}t| jd | d  }| d}t||| }t }| jjj|||d\|d< }	| jd | jd d |k }
|
|
d |jk |
d	 |j	k@  }| jj
j}d
| jj
j }t|d |j	|j	 | }tdt|| t}t|d	 |j|j	 | }t|| t}tt|| }t|}|| jkr	 | jt|| jjj }tj||ftjd}dd t|D }t|||| D ]\}}}|| }d|||d |f< qt|| jj
|d|d< | jd |   fdd jj D |d< ||d d< |S )a  Prepare chunk

        Parameters
        ----------
        file_id : int
            File index
        start_time : float
            Chunk start time
        duration : float
            Chunk duration.

        Returns
        -------
        sample : dict
            Dictionary containing the chunk data with the following keys:
            - `X`: waveform
            - `y`: target as a SlidingWindowFeature instance where y.labels is
                   in meta.scope space.
            - `meta`:
                - `scope`: target scope (0: file, 1: database, 2: global)
                - `database`: database index
                - `file`: file index
        rJ   scope
_label_idx)r-   XrP   rQ   rR   rS         ?r   )dtypec                 S   s   i | ]\}}||qS rH   rH   )rW   idxlabelrH   rH   rI   
<dictcomp>}      z4SpeakerDiarization.prepare_chunk.<locals>.<dictcomp>r>   )labelsyc                    s   i | ]}| | qS rH   rH   )rW   keymetadatarH   rI   r     r   metafile)!get_fileScopesra   r   dictr   audiocroprS   rR   receptive_fieldstepr-   rd   maximumroundastypeintminimumrk   ri   rh   r.   
num_frameshparamssample_ratezerosuint8	enumeraterj   r   r   names)rE   rQ   r   r-   r   label_scopelabel_scope_keychunksample_r|   chunk_annotationsr   halfrR   	start_idxrS   end_idxr   
num_labelsr   r   mappingr   mapped_labelrH   r   rI   prepare_chunk6  sJ   




z SpeakerDiarization.prepare_chunkreturnc                 C   s   g }|D ]I}|d j }t|d j}|| jkr2tjtj|dd dd}|dd|d| j f }n|| jk rGtj|dd| j| ffdd}n	 || qt	
t|S )a  

        Parameters
        ----------
        batch : list
            List of samples to collate.
            "y" field is expected to be a SlidingWindowFeature.

        Returns
        -------
        y : torch.Tensor
            Collated target tensor of shape (num_frames, self.max_speakers_per_chunk)
            If one chunk has more than `self.max_speakers_per_chunk` speakers, we keep
            the max_speakers_per_chunk most talkative ones. If it has less, we pad with
            zeros (artificial inactive speakers).
        r   r   )axisN)r   r   constant)mode)datarh   r   r.   rd   rn   rp   padrg   torch
from_numpystack)rE   batch
collated_ybr   r   indicesrH   rH   rI   	collate_y  s    


zSpeakerDiarization.collate_ypermutated_predictiontargetc                 C   sX   | j jr!| jrt| jjjdnd}t|tj|dd||d}|S t	||
 |d}|S )a  Permutation-invariant segmentation loss

        Parameters
        ----------
        permutated_prediction : (batch_size, num_frames, num_classes) torch.Tensor
            Permutated speaker activity predictions.
        target : (batch_size, num_frames, num_speakers) torch.Tensor
            Speaker activity.
        weight : (batch_size, num_frames, 1) torch.Tensor, optional
            Frames weight.

        Returns
        -------
        seg_loss : torch.Tensor
            Permutation-invariant segmentation loss
              ?Ndim)class_weightr3   r3   )rw   r   r0   r   	clamp_minr   cardinalityr$   argmaxr"   float)rE   r   r   r3   r   seg_lossrH   rH   rI   segmentation_loss  s    
z$SpeakerDiarization.segmentation_lossc                 C   sd   t j|ddd\}}t j| ddd\}}| jdkr$t|||d}|S | jdkr0t|||d}|S )a  Voice activity detection loss

        Parameters
        ----------
        permutated_prediction : (batch_size, num_frames, num_classes) torch.Tensor
            Speaker activity predictions.
        target : (batch_size, num_frames, num_speakers) torch.Tensor
            Speaker activity.
        weight : (batch_size, num_frames, 1) torch.Tensor, optional
            Frames weight.

        Returns
        -------
        vad_loss : torch.Tensor
            Voice activity detection loss.
        rU   T)r   keepdimFr9   r   r:   )r   rr   r   r8   r"   r#   )rE   r   r   r3   vad_predictionr   
vad_targetr=   rH   rH   rI   voice_activity_detection_loss  s   

z0SpeakerDiarization.voice_activity_detection_loss	batch_idxc              	   C   s  |d }|d }t jt j|dddd}|| jk}|| }|| }| s'dS | |}|j\}}	}
t| dd}||t j||	d| jj	d}t
| jd | j |	 }d	|ddd|f< t
| jd | j |	 }d	|dd|	| df< | jjr| jj|}t||\}}
| jj| }| j|||d
}nt||\}}
| j|||d
}| jjd|ddddd | jdu rd	}n!| jjr| j|||d
}n| j|||d
}| jjd|ddddd || }t |rdS | jjd|ddddd d|iS )a8  Compute permutation-invariant segmentation loss

        Parameters
        ----------
        batch : (usually) dict of torch.Tensor
            Current batch.
        batch_idx: int
            Batch index.

        Returns
        -------
        loss : {str: torch.tensor}
            {"loss": loss}
        r   r   r>   r   Nr3   devicer   r)   r   zloss/train/segmentationFTon_stepon_epochprog_barloggerzloss/train/vadz
loss/trainr=   )r   rp   anyr.   r   shapegetattrgetonesr   r   r1   r-   rw   r   to_multilabelr%   to_powersetr   r   logr8   r   isnan)rE   r   r   r   waveformr   keep
predictionr4   r   r   
weight_keyr3   warm_up_leftwarm_up_right
multilabelpermutated_targetpermutated_target_powersetr   r   r8   r=   rH   rH   rI   training_step  s   


		
	z SpeakerDiarization.training_stepc                 C   s@   | j jrtdtdtdtddS t t t t	 t
 dS )z1Returns diarization error rate and its componentsr   )r   DiarizationErrorRate/ConfusionDiarizationErrorRate/MissDiarizationErrorRate/FalseAlarm)r   zDiarizationErrorRate/Thresholdr   r   r   )rw   r   r   r!   r   r   r   r   r    r   r   r   rH   rH   rI   default_metric  s   z!SpeakerDiarization.default_metricc           #   
   C   s  |d }|d }|  |}|j\}}}t| dd}	||	tj||d| j jd}
t| jd | j	 | }d|
ddd|f< t| jd | j	 | }d|
dd|| df< | j
jrz| j j|}t||\}}| j j| }| j|||
d	}nt||\}}| j|||
d	}| j jd
|ddddd | jdu rd}n!| j
jr| j|||
d	}n| j|||
d	}| j jd|ddddd || }| j jd|ddddd | j
jr| j t|dd||| f ddt|dd||| f dd n%| j t|dd||| f ddt|dd||| f dd | j j| j jddddd | j jdksBt| j jd dksB|dkrDdS | j
jrX|   }|  }n|   }|  }t| jd}tt|}t|| }t j!d| |ddd\}}t"j#||dk< t$|jdkr|ddddt"j%f }|t"&|jd 9 }t'|D ]{}|| }|| }||d d |f }|| }|(| |)dt$| |*d|jd  |+ ,d |- ,d ||d d |f } || }!| j.d|dddd | j.|| |dddd | (|! | *dd | )dt$| | + ,d qt /  | j j0D ]+}"t1|"t2rI|"j34d|| j j q5t1|"t5r_|"j3j6|"j7|d| j j dd q5t 8| dS )zCompute validation loss and metric

        Parameters
        ----------
        batch : dict of torch.Tensor
            Current batch.
        batch_idx: int
            Batch index.
        r   r   r3   Nr>   r   r   r)   r   zloss/val/segmentationFTr   zloss/val/vadzloss/valrU   	   )      )nrowsncolsfigsizesqueezer   r   r   )coloralphalwgg?samplessamples_epochz.png)run_idfigureartifact_file)9r   r   r   r   r   r   r   r   r1   r-   rw   r   r   r%   r   r   r   r   r8   r   validation_metric	transposelog_dictcurrent_epochmathlog2cpunumpyminr4   ceilsqrtpltsubplotsrd   nanrh   newaxisrf   rv   plotset_xlimset_ylim	get_xaxisset_visible	get_yaxisaxvspantight_layoutloggersrA   r   
experiment
add_figurer   
log_figurer  close)#rE   r   r   r   r   r   r4   r   r   r   r3   r   r   r   r   r   r   r   r8   r=   r   y_prednum_samplesr  r  figaxes
sample_idxrow_idxcol_idxax_refsample_yax_hypsample_y_predr   rH   rH   rI   validation_step  s  

					





z"SpeakerDiarization.validation_step)Nr(   NNFr)   NNr*   NFNNNNNr   )__name__
__module____qualname____doc__r   r   r	   strr   r   boolr   r   r   r   r   r   r   r@   r`   r   r   r   Tensorr   r   r   r   r   r6  __classcell__rH   rH   rF   rI   r'   =   s    K
	
BgW2
/
%w
r'   testpyannote/segmentationr+   rK   r   c                    sZ  ddl m}m} ddlm} ddlm} ddlm} ddl	m
} ddlm}	 |dd	\}
| }|| d
| id} tt| | }| Zj| jt|d}jddd ddtt dtt f fdd}|||
d}|D ]'}j |d d |d }|	|||d}|d }||||d}| qnW d   n1 sw   Y  |jdd}dS )zEvaluate a segmentation modelr   )
FileFinderget_protocol)Progress)	Inference)get_devices)DiscreteDiarizationErrorRate)binarizer>   )needsr   )preprocessors)total
Processingr   N	completedrJ  c                    s   j  | | d d S )N)rL  )update)rL  rJ  	file_taskprogressrH   rI   progress_hookx  s   zmain.<locals>.progress_hookr   urirN   
annotation)hook	annotated)uemT)display)NN)pyannote.databaserA  rB  rich.progressrC  pyannote.audiorD  pyannote.audio.pipelines.utilsrE  pyannote.audio.utils.metricrF  pyannote.audio.utils.signalrG  rk   r   add_tasknamerh   r   r   rM  advancereport)r+   rK   r   rA  rB  rC  rD  rE  rF  rG  r   r;   files	main_taskrQ  	inferencer   	reference
hypothesisrV  r   rH   rN  rI   maind  s2   "rg  __main__)r?  r@  )Fr  rC   collectionsr   typingr   r   r   r   r   r   r	   r  rd   r   torch.nn.functional
matplotlibr
   r  pyannote.corer   r   pyannote.database.protocolr   #pyannote.database.protocol.protocolr   r   pytorch_lightning.loggersr   r   rY  r   /torch_audiomentations.core.transforms_interfacer   torchmetricsr   pyannote.audio.core.taskr   r   r   (pyannote.audio.tasks.segmentation.mixinsr   pyannote.audio.torchmetricsr   r   r   r   r   r   r   r    r!   pyannote.audio.utils.lossr"   r#   r$    pyannote.audio.utils.permutationr%   pyannote.audio.utils.powersetr&   rk   __args__rb   r   r'   r;  rg  r7  typerrunrH   rH   rH   rI   <module>   sF   $,

      -$