o
    pi'                     @   s   d dl mZmZmZmZmZmZ d dlZd dl	m
Z
mZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ G d	d
 d
eZdS )    )DictOptionalSequenceTextTupleUnionN)Problem
ResolutionSpecifications)SegmentationTask)SegmentSlidingWindowFeature)Protocol)BaseWaveformTransform)Metricc                       s   e Zd ZdZ										ddedeeedf  d	ed
eee	eef f dee
e  dee dedee dedee deee
e eeef f f fddZdeded	efddZ  ZS )VoiceActivityDetectiona	  Voice activity detection

    Voice activity detection (or VAD) is the task of detecting speech regions
    in a given audio recording.

    It is addressed as a binary (0 or 1) sequence labeling task. A frame is
    marked as "speech" (1) as soon as at least one speaker is active.

    Parameters
    ----------
    protocol : Protocol
        pyannote.database protocol
    cache : str, optional
        As (meta-)data preparation might take a very long time for large datasets,
        it can be cached to disk for later (and faster!) re-use.
        When `cache` does not exist, `Task.prepare_data()` generates training
        and validation metadata from `protocol` and save them to disk.
        When `cache` exists, `Task.prepare_data()` is skipped and (meta)-data
        are loaded from disk. Defaults to a temporary path.
    duration : float, optional
        Chunks duration. Defaults to 2s.
    warm_up : float or (float, float), optional
        Use that many seconds on the left- and rightmost parts of each chunk
        to warm up the model. While the model does process those left- and right-most
        parts, only the remaining central part of each chunk is used for computing the
        loss during training, and for aggregating scores during inference.
        Defaults to 0. (i.e. no warm-up).
    balance: Sequence[Text], optional
        When provided, training samples are sampled uniformly with respect to these keys.
        For instance, setting `balance` to ["database","subset"] will make sure that each
        database & subset combination will be equally represented in the training samples.
    weight: str, optional
        When provided, use this key to as frame-wise weight in loss function.
    batch_size : int, optional
        Number of training samples per batch. Defaults to 32.
    num_workers : int, optional
        Number of workers used for generating training samples.
        Defaults to multiprocessing.cpu_count() // 2.
    pin_memory : bool, optional
        If True, data loaders will copy tensors into CUDA pinned
        memory before returning them. See pytorch documentation
        for more details. Defaults to False.
    augmentation : BaseWaveformTransform, optional
        torch_audiomentations waveform transform, used by dataloader
        during training.
    metric : optional
        Validation metric(s). Can be anything supported by torchmetrics.MetricCollection.
        Defaults to AUROC (area under the ROC curve).
    N       @            Fprotocolcachedurationwarm_upbalanceweight
batch_sizenum_workers
pin_memoryaugmentationmetricc                    sP   t  j||||||	|
||d	 || _|| _ttjtj| j	| j
| jdgd| _d S )N)r   r   r   r   r   r   r   r   speech)problem
resolutionr   min_durationr   classes)super__init__r   r   r
   r   BINARY_CLASSIFICATIONr	   FRAMEr   r#   r   specifications)selfr   r   r   r   r   r   r   r   r   r   r   	__class__ n/home/ubuntu/.local/lib/python3.10/site-packages/pyannote/audio/tasks/segmentation/voice_activity_detection.pyr&   V   s,   zVoiceActivityDetection.__init__file_id
start_timec                    s  |  |}t||| }t }| jj||\|d< }| jd | jd d |k }||d |jk |d |jk@  }	| jj	j
}
d| jj	j }t|	d |j|j | }tdt||
 t}t|	d |j|j | }t||
 t}| jt|| jjj }tj|dftjd	}t||D ]\}}d|||d df< qt|| jj	d
gd|d< | jd |   fdd jjD |d< ||d d< |S )a6  Prepare chunk for voice activity detection

        Parameters
        ----------
        file_id : int
            File index
        start_time : float
            Chunk start time
        duration : float
            Chunk duration.

        Returns
        -------
        sample : dict
            Dictionary containing the chunk data with the following keys:
            - `X`: waveform
            - `y`: target as a SlidingWindowFeature instance
            - `meta`:
                - `database`: database index
                - `file`: file index
        Xzannotations-segmentsr/   startendg      ?r      )dtyper    )labelsyzaudio-metadatac                    s   i | ]}| | qS r-   r-   ).0keymetadatar-   r.   
<dictcomp>   s    z8VoiceActivityDetection.prepare_chunk.<locals>.<dictcomp>metafile)get_filer   dictmodelaudiocropprepared_datar3   r2   receptive_fieldstepr   npmaximumroundastypeintminimum
num_frameshparamssample_ratezerosuint8zipr   r5   names)r*   r/   r0   r   r>   chunksample_annotationschunk_annotationsrF   halfr2   	start_idxr3   end_idxrM   r7   r-   r:   r.   prepare_chunk~   s:   


z$VoiceActivityDetection.prepare_chunk)
Nr   r   NNr   NFNN)__name__
__module____qualname____doc__r   r   r   strfloatr   r   r   rK   boolr   r   r   r&   r\   __classcell__r-   r-   r+   r.   r   #   sH    5
	
(r   )typingr   r   r   r   r   r   numpyrG   pyannote.audio.core.taskr   r	   r
   (pyannote.audio.tasks.segmentation.mixinsr   pyannote.corer   r   pyannote.databaser   /torch_audiomentations.core.transforms_interfacer   torchmetricsr   r   r-   r-   r-   r.   <module>   s    