o
    pi)b                     @   s~  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZmZmZmZmZm Z m!Z! d d	l"m#Z# d d
l$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 e6e/j7Z8e6e.j7Z9G dd deZ:dde;de;de;fddZ<e=dkrd dl>Z>e>?e< dS dS )    N)Counter)DictLiteralOptionalSequenceTextUnion)MLFlowLoggerTensorBoardLogger)pyplot)Problem
ResolutionSpecifications)SegmentationTask)DetectionErrorRateDiarizationErrorRateDiarizationPrecisionDiarizationRecallFalseAlarmRateMissedDetectionRateSpeakerConfusionRate)nll_loss)	permutate)Powerset)SegmentSlidingWindowFeature)SpeakerDiarizationProtocol)ScopeSubset)track)BaseWaveformTransform)Metricc                       s4  e Zd ZdZ													d*dedeeedf  d	ed
ee	 de	dee
e  dee de	dee	 dedee deee
e eeef f dee	 ded f fddZd+ fdd	Zdd Zde	ded	efddZd ejfd!d"Zd#e	fd$d%Zd eee
e eeef f fd&d'Zd#e	fd(d)Z  ZS ),SpeakerDiarizationu	  Speaker diarization

    Parameters
    ----------
    protocol : SpeakerDiarizationProtocol
        pyannote.database protocol
    cache : str, optional
        As (meta-)data preparation might take a very long time for large datasets,
        it can be cached to disk for later (and faster!) re-use.
        When `cache` does not exist, `Task.prepare_data()` generates training
        and validation metadata from `protocol` and save them to disk.
        When `cache` exists, `Task.prepare_data()` is skipped and (meta)-data
        are loaded from disk. Defaults to a temporary path.
    duration : float, optional
        Chunks duration. Defaults to 2s.
    max_speakers_per_chunk : int, optional
        Maximum number of speakers per chunk (must be at least 2).
        Defaults to estimating it from the training set.
    max_speakers_per_frame : int, optional
        Maximum number of (overlapping) speakers per frame. Defaults to 2.
    balance: Sequence[Text], optional
        When provided, training samples are sampled uniformly with respect to these keys.
        For instance, setting `balance` to ["database","subset"] will make sure that each
        database & subset combination will be equally represented in the training samples.
    weight: str, optional
        When provided, use this key as frame-wise weight in loss function.
    batch_size : int, optional
        Number of training samples per batch. Defaults to 32.
    num_workers : int, optional
        Number of workers used for generating training samples.
        Defaults to multiprocessing.cpu_count() // 2.
    pin_memory : bool, optional
        If True, data loaders will copy tensors into CUDA pinned
        memory before returning them. See pytorch documentation
        for more details. Defaults to False.
    augmentation : BaseWaveformTransform, optional
        torch_audiomentations waveform transform, used by dataloader
        during training.
    metric : optional
        Validation metric(s). Can be anything supported by torchmetrics.MetricCollection.
        Defaults to DiarizationErrorRate and its components.

    References
    ----------
    Alexis Plaquet and Hervé Bredin
    "Powerset multi-class cross entropy loss for neural speaker diarization"
    Proc. Interspeech 2023

    Hervé Bredin and Antoine Laurent
    "End-To-End Speaker Segmentation for Overlap-Aware Resegmentation."
    Proc. Interspeech 2021
    N      $@       Fprotocolcachedurationmax_speakers_per_chunkmax_speakers_per_framebalanceweight
batch_sizenum_workers
pin_memoryaugmentationmetricmax_num_speakersloss)bcemsec              
      s   t  j||||	|
|||d t|tstd|d u r&|d ur&|}td |d ur/td |dk r;td| d|| _|| _|| _	|| _
d S )N)r(   r-   r.   r/   r0   r1   r'   z>SpeakerDiarization task requires a SpeakerDiarizationProtocol.zL`max_num_speakers` has been deprecated in favor of `max_speakers_per_chunk`.z-`loss` has been deprecated and has no effect.   z5`max_speakers_per_frame` must be 1 or more (you used z).)super__init__
isinstancer   
ValueErrorwarningswarnr)   r*   r+   r,   )selfr&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   	__class__ i/home/ubuntu/.local/lib/python3.10/site-packages/pyannote/audio/tasks/segmentation/speaker_diarization.pyr8   p   s:   



zSpeakerDiarization.__init__c              
      s  t  | | jd u r-| jd d tdk}g }d| jdd}tt	|d |dD ]}| jd	 t	| jd	 d
 |kd  }| jd t	| jd d
 |kd  }|D ]U}|d }	|d |d  }
|t	|d |	k|d |
k d  }t
|	|
| j d| j D ]&}|| j }|t	|d |k|d |k d  }|tt|d  qqWq-ttt|  \}}t|t|}}t|}|| }|| }t|t| }t||D ]-\}}|dkrtd|dd q|dkrtd|dd qtd|dd| d qtd|t	|dkd d  | _td| j d| j d | j| jkrAtd| j d| j dttjtj| j| jd d! t | jD | jd"d#| _!d S )$Naudio-metadatasubsettrainz*Estimating maximum number of speakers per gzs chunk in the training setr   descriptionannotations-segmentsfile_idzannotations-regionsstartr(   endg      ?file_label_idxz   - z7.2%z( of all chunks contain no speech at all.r6   z contain 1 speaker or lessz	 contain z speakers or lessr$   g
ףp=
?z$Setting `max_speakers_per_chunk` to za. You can override this value (or avoid this estimation step) by passing `max_speakers_per_chunk=z` to the task constructor.z`max_speakers_per_frame` (z1) must be smaller than `max_speakers_per_chunk` ()c                 S   s   g | ]	}d |d  qS )zspeaker#r6   r@   ).0ir@   r@   rA   
<listcomp>  s    z,SpeakerDiarization.setup.<locals>.<listcomp>T)problem
resolutionr(   min_durationclassespowerset_max_classespermutation_invariant)"r7   setupr)   prepared_dataSubsetsindexr(   r   npwherearangeappendlenuniqueziplistr   itemsarrayargsortcumsumsumprintmaxr*   r:   r   r   MONO_LABEL_CLASSIFICATIONr   FRAMErS   rangespecifications)r=   stagetrainingnum_unique_speakersprogress_descriptionrI   annotationsannotated_regionsregionregion_start
region_endregion_annotationswindow_start
window_endwindow_annotationsnum_speakerscountssorting_indicesratioskratior>   r@   rA   rW      s   









zSpeakerDiarization.setupc                 C   s   t t| jj| jj| j_d S N)r   r_   rm   rT   rU   modelpowersetr=   r@   r@   rA   setup_loss_func  s   
z"SpeakerDiarization.setup_loss_funcrI   
start_timec                    s  |  |}t| jd | d  }| d}t||| }t }| jj||\|d< }	| jd | \}
}| jd |
| }||d |jk |d |j	k@  }| jj
j}d	| jj
j }t|d |j	|j	 | }td
t|| t}t|d |j|j	 | }t|| t}tt|| }t|}|| jkr	 | jt|| jjj }tj||ftjd}dd t|D }t|||| D ]\}}}|| }d|||d |f< qt|| jj
|d|d< | jd |   fdd jj D |d< ||d d< |S )a  Prepare chunk

        Parameters
        ----------
        file_id : int
            File index
        start_time : float
            Chunk start time
        duration : float
            Chunk duration.

        Returns
        -------
        sample : dict
            Dictionary containing the chunk data with the following keys:
            - `X`: waveform
            - `y`: target as a SlidingWindowFeature instance where y.labels is
                   in meta.scope space.
            - `meta`:
                - `scope`: target scope (0: file, 1: database, 2: global)
                - `database`: database index
                - `file`: file index
        rB   scope
_label_idxXzaudio-segments-idsrH   rJ   rK         ?r   )dtypec                 S   s   i | ]\}}||qS r@   r@   )rN   idxlabelr@   r@   rA   
<dictcomp>T      z4SpeakerDiarization.prepare_chunk.<locals>.<dictcomp>r6   )labelsyc                    s   i | ]}| | qS r@   r@   )rN   keymetadatar@   rA   r   _  r   metafile)!get_fileScopesrX   r   dictr   audiocroprK   rJ   receptive_fieldstepr(   r[   maximumroundastypeintminimumrb   r`   r_   r)   
num_frameshparamssample_ratezerosuint8	enumeratera   r   r   names)r=   rI   r   r(   r   label_scopelabel_scope_keychunksample_start_idend_idrr   chunk_annotationsr   halfrJ   	start_idxrK   end_idxr   
num_labelsr   r   mappingr   mapped_labelr@   r   rA   prepare_chunk  sH   




z SpeakerDiarization.prepare_chunkreturnc                 C   s   g }|D ]I}|d j }t|d j}|| jkr2tjtj|dd dd}|dd|d| j f }n|| jk rGtj|dd| j| ffdd}n	 || qt	
t|S )a  

        Parameters
        ----------
        batch : list
            List of samples to collate.
            "y" field is expected to be a SlidingWindowFeature.

        Returns
        -------
        y : torch.Tensor
            Collated target tensor of shape (num_frames, self.max_speakers_per_chunk)
            If one chunk has more than `self.max_speakers_per_chunk` speakers, we keep
            the max_speakers_per_chunk most talkative ones. If it has less, we pad with
            zeros (artificial inactive speakers).
        r   r   )axisN)r   r   constant)mode)datar_   r   r)   r[   re   rg   padr^   torch
from_numpystack)r=   batch
collated_ybr   r{   indicesr@   r@   rA   	collate_yd  s    


zSpeakerDiarization.collate_y	batch_idxc              	   C   sZ  |d }|d }t jt j|dddd}|| jk}|| }|| }| s'dS | |}|j\}}	}
t| dd}||t j||	d| jj	d}| jj
|}t||\}}
| jj
| }t|t j|dd|d	}| jjd
|ddddd | js| j }t|tr|n|g}|D ]}|  q| j| |D ]}| jj|ddd |  qd|iS )a8  Compute permutation-invariant segmentation loss

        Parameters
        ----------
        batch : (usually) dict of torch.Tensor
            Current batch.
        batch_idx: int
            Batch index.

        Returns
        -------
        loss : {str: torch.tensor}
            {"loss": loss}
        r   r   r6   dimNr,   devicer,   z
loss/trainFTon_stepon_epochprog_barloggerg      @norm)gradient_clip_valgradient_clip_algorithmr3   )r   rg   anyr)   r   shapegetattrgetonesr   r   to_multilabelr   to_powersetfloatr   argmaxlogautomatic_optimization
optimizersr9   rb   	zero_gradmanual_backwardclip_gradientsr   )r=   r   r   targetwaveformr{   keep
predictionr-   r   r   
weight_keyr,   
multilabelpermutated_targetpermutated_target_powersetr3   r   	optimizerr@   r@   rA   training_step  s^   

	


z SpeakerDiarization.training_stepc                 C   s0   t dtdtdtdtdtdtddS )z1Returns diarization error rate and its componentsr   )r   zDiarizationErrorRate/ConfusionzDiarizationErrorRate/MisszDiarizationErrorRate/FalseAlarmzDiarizationErrorRate/PrecisionzDiarizationErrorRate/Recallz'DiarizationErrorRate/DetectionErrorRate)r   r   r   r   r   r   r   r   r@   r@   rA   default_metric  s   z!SpeakerDiarization.default_metricc              	   C   s  |d }|d }|  |}|j\}}}t| dd}	||	tj||d| j jd}
| j j|}t	||\}}| j j
| }t|tj|dd|
d	}| j jd
|ddddd | j t|ddt|dd | j j| j jddddd | j jdkst| j jd dks|dkrdS |   }|  }t| jd|jd }tt|}t|| }tjd| |ddd\}}tj||dk< t|jdkr|ddddtj f }|t!|jd 9 }t"|D ]d}|| }|| }||d d |f }|| }|#| |$dt| |%d|jd  |& 'd |( 'd ||d d |f }|| }|#| |%dd |$dt| |& 'd qt)  | j j*D ]+}t+|t,ri|j-.d|| j j qUt+|t/r|j-j0|j1|d| j j dd qUt2| dS )zCompute validation loss and metric

        Parameters
        ----------
        batch : dict of torch.Tensor
            Current batch.
        batch_idx: int
            Batch index.
        r   r   r,   Nr6   r   r   r   r   zloss/valFTr   r$   r   	   )      )nrowsncolsfigsizesqueezegg?samplessamples_epochz.png)run_idfigureartifact_file)3r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   validation_metric	transposelog_dictcurrent_epochmathlog2cpunumpyminr-   ceilsqrtpltsubplotsr[   nanr_   newaxisr]   rl   plotset_xlimset_ylim	get_xaxisset_visible	get_yaxistight_layoutloggersr9   r
   
experiment
add_figurer	   
log_figurer   close)r=   r   r   r   r   r   r-   r   r   r   r,   r   r   r   r3   r   y_prednum_samplesr   r   figaxes
sample_idxrow_idxcol_idxax_refsample_yax_hypsample_y_predr   r@   r@   rA   validation_step  s   
	



z"SpeakerDiarization.validation_step)Nr#   Nr$   NNr%   NFNNNNr   )__name__
__module____qualname____doc__r   r   r   strr   r   r   r   boolr    r!   r   r   r8   rW   r   r   r   Tensorr   r   r   r"  __classcell__r@   r@   r>   rA   r"   :   sn    8
	
7aV.U
r"   testpyannote/segmentationr&   rC   r   c                    sZ  ddl m} ddlm} ddlm} ddlm} ddlm	}m
} ddlm}	 |dd	\}
| }|| d
| id} tt| | }|	 Zj| jt|d}jddd ddtt dtt f fdd}|||
d}|D ]'}j |d d |d }||||d}|d }||||d}| qnW d   n1 sw   Y  |jdd}dS )zEvaluate a segmentation modelr   )	Inference)get_devices)DiscreteDiarizationErrorRate)binarize)
FileFinderget_protocol)Progressr6   )needsr   )preprocessors)total
Processingg      ?N	completedr6  c                    s   j  | | d d S )N)r8  )update)r8  r6  	file_taskprogressr@   rA   progress_hook  s   zevaluate.<locals>.progress_hookr   urirF   
annotation)hook	annotated)uemT)display)NN)pyannote.audior-  pyannote.audio.pipelines.utilsr.  pyannote.audio.utils.metricr/  pyannote.audio.utils.signalr0  pyannote.databaser1  r2  rich.progressr3  rb   r   add_tasknamer_   r   r   r9  advancereport)r&   rC   r   r-  r.  r/  r0  r1  r2  r3  r   r1   files	main_taskr=  	inferencer   	reference
hypothesisrB  r   r@   r:  rA   evaluate  s2   "rS  __main__)r+  r,  )@r   r;   collectionsr   typingr   r   r   r   r   r   r  r[   r   lightning.pytorch.loggersr	   r
   
matplotlibr   r  pyannote.audio.core.taskr   r   r   (pyannote.audio.tasks.segmentation.mixinsr   pyannote.audio.torchmetricsr   r   r   r   r   r   r   pyannote.audio.utils.lossr    pyannote.audio.utils.permutationr   pyannote.audio.utils.powersetr   pyannote.corer   r   pyannote.database.protocolr   #pyannote.database.protocol.protocolr   r   rI  r   /torch_audiomentations.core.transforms_interfacer    torchmetricsr!   rb   __args__rY   r   r"   r'  rS  r#  typerrunr@   r@   r@   rA   <module>   s@    $	

    I#