o
    9wi6                     @   sT  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlZddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z3 ddl4m5Z5 G dd de)Z6G dd de)Z2G dd de)Z7dS )z"Voice activity detection pipelines    N)deepcopy)partial)
MethodType)CallableOptionalTextUnion)
AnnotationSlidingWindowFeature)SpeakerDiarizationProtocol)DetectionErrorRate DetectionPrecisionRecallFMeasure)CategoricalInteger
LogUniformUniform)Trainer)SGD)BaseWaveformTransform)	Inference)GraduallyUnfreeze)	AudioFile)Pipeline)PipelineAugmentationPipelineInferencePipelineModelget_augmentationget_inference	get_model)VoiceActivityDetection)Binarizec                   @   s&   e Zd ZdZededefddZdS )OracleVoiceActivityDetectionz(Oracle voice activity detection pipelinefilereturnc                 C   s   | d    }|jdddS )a  Return groundtruth voice activity detection

        Parameter
        ---------
        file : AudioFile
            Must provide a "annotation" key.

        Returns
        -------
        hypothesis : `pyannote.core.Annotation`
            Speech regions
        
annotationstringspeech)	generatormodality)get_timelinesupportto_annotation)r"   r&    r,   n/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/pyannote/audio/pipelines/voice_activity_detection.pyapply>   s   z"OracleVoiceActivityDetection.applyN)__name__
__module____qualname____doc__staticmethodr   r	   r.   r,   r,   r,   r-   r!   ;   s    r!   c                	       s   e Zd ZdZ			ddededeedf f fdd	Zd
d Z	dd Z
dd ZdZddedee defddZdeeef fddZdd Z  ZS )r   a*  Voice activity detection pipeline

    Parameters
    ----------
    segmentation : Model, str, or dict, optional
        Pretrained segmentation (or voice activity detection) model.
        Defaults to "pyannote/segmentation".
        See pyannote.audio.pipelines.utils.get_model for supported format.
    fscore : bool, optional
        Optimize (precision/recall) fscore. Defaults to optimizing detection
        error rate.
    use_auth_token : str, optional
        When loading private huggingface.co models, set `use_auth_token`
        to True or to a string containing your hugginface.co authentication
        token that can be obtained by running `huggingface-cli login`
    inference_kwargs : dict, optional
        Keywords arguments passed to Inference.

    Hyper-parameters
    ----------------
    onset, offset : float
        Onset/offset detection thresholds
    min_duration_on : float
        Remove speech regions shorter than that many seconds.
    min_duration_off : float
        Fill non-speech regions shorter than that many seconds.
    pyannote/segmentationFNsegmentationfscoreuse_auth_tokenc                    s   t    || _|| _t||d}dd |d< t|fi || _|jjr+d | _	| _
ntdd| _	tdd| _
tdd| _tdd| _d S )N)r7   c                 S   s   t j| dddS )NTaxiskeepdims)npmax)scoresr,   r,   r-   <lambda>}   s    z1VoiceActivityDetection.__init__.<locals>.<lambda>pre_aggregation_hook      ?        g      ?)super__init__r5   r6   r   r   _segmentationspecificationspowersetonsetoffsetr   min_duration_onmin_duration_off)selfr5   r6   r7   inference_kwargsmodel	__class__r,   r-   rD   n   s   
zVoiceActivityDetection.__init__c                 C   s2   | j dkrdddddS | j dkrddd	S t )
Nr4   g%C?gT㥛 ?g rh?gx&?rH   rI   rJ   rK   zpyannote/segmentation-3.0.0rB   )rJ   rK   )r5   NotImplementedErrorrL   r,   r,   r-   default_parameters   s   

z)VoiceActivityDetection.default_parametersc                 C   s   dgS )NSPEECHr,   rS   r,   r,   r-   classes   s   zVoiceActivityDetection.classesc                 C   s   t | j| j| j| jd| _dS )z2Initialize pipeline with current set of parametersrQ   N)r    rH   rI   rJ   rK   	_binarizerS   r,   r,   r-   
initialize   s   z!VoiceActivityDetection.initializezcache/segmentation/inferencer"   hookr#   c                 C   s   | j ||d}| jr&| j|v r|| j }n| j|t|ddd}||| j< n| j|t|ddd}|d| | |}|d |_|dd | D S )a  Apply voice activity detection

        Parameters
        ----------
        file : AudioFile
            Processed file.
        hook : callable, optional
            Callback called after each major steps of the pipeline as follows:
                hook(step_name,      # human-readable name of current step
                     step_artefact,  # artifact generated by current step
                     file=file)      # file being processed
            Time-consuming steps call `hook` multiple times with the same `step_name`
            and additional `completed` and `total` keyword arguments usable to track
            progress of current step.

        Returns
        -------
        speech : Annotation
            Speech regions.
        )rY   r5   Nuric                 S   s   i | ]}|d qS )rU   r,   ).0labelr,   r,   r-   
<dictcomp>   s    z0VoiceActivityDetection.apply.<locals>.<dictcomp>)	
setup_hooktrainingCACHED_SEGMENTATIONrE   r   rW   rZ   rename_labelslabels)rL   r"   rY   segmentationsr&   r,   r,   r-   r.      s   



zVoiceActivityDetection.applyc                 C      | j r	tdddS tdddS z'Return new instance of detection metricrB   F)collarskip_overlapr6   r   r   rS   r,   r,   r-   
get_metric      z!VoiceActivityDetection.get_metricc                 C      | j rdS dS Nmaximizeminimizer6   rS   r,   r,   r-   get_direction      z$VoiceActivityDetection.get_direction)r4   FNN)r/   r0   r1   r2   r   boolr   r   rD   rT   rV   rX   r`   r   r   r   r	   r.   r   r   ri   rp   __classcell__r,   r,   rO   r-   r   Q   s&    
 
.r   c                       sj   e Zd ZdZ			ddedee def fdd	Zd
e	de
fddZdeeef fddZdd Z  ZS )AdaptiveVoiceActivityDetectiona<  Adaptive voice activity detection pipeline

    Let M be a pretrained voice activity detection model.

    For each file f, this pipeline starts by applying the model to obtain a first set of
    speech/non-speech labels.

    Those (automatic, possibly erroneous) labels are then used to fine-tune M on the very
    same file f into a M_f model, in a self-supervised manner.

    Finally, the fine-tuned model M_f is applied to file f to obtain the final (and
    hopefully better) speech/non-speech labels.

    During fine-tuning, frames where the pretrained model M is very confident are weighted
    more than those with lower confidence: the intuition is that the model will use these
    high confidence regions to adapt to recording conditions (e.g. background noise) and
    hence will eventually be better on the parts of f where it was initially not quite
    confident.

    Conversely, to avoid overfitting too much to those high confidence regions, we use
    data augmentation and freeze all but the final few layers of the pretrained model M.

    Parameters
    ----------
    segmentation : Model, str, or dict, optional
        Pretrained segmentation model.
        Defaults to "hbredin/VoiceActivityDetection-PyanNet-DIHARD".
    augmentation : BaseWaveformTransform, or dict, optional
        torch_audiomentations waveform transform, used during fine-tuning.
        Defaults to no augmentation.
    fscore : bool, optional
        Optimize (precision/recall) fscore.
        Defaults to optimizing detection error rate.

    Hyper-parameters
    ----------------
    num_epochs : int
        Number of epochs (where one epoch = going through the file once).
    batch_size : int
        Batch size.
    learning_rate : float
        Learning rate.

    See also
    --------
    pyannote.audio.pipelines.utils.get_inference
    -hbredin/VoiceActivityDetection-PyanNet-DIHARDNFr5   augmentationr6   c                    sN   t    t|| _t|| _|| _tdd| _t	g d| _
tdd| _d S )Nr   
   )                   gư>ry   )rC   rD   r   	inferencer   rw   r6   r   
num_epochsr   
batch_sizer   learning_rate)rL   r5   rw   r6   rO   r,   r-   rD     s   


z'AdaptiveVoiceActivityDetection.__init__r"   r#   c           
   	      sd  t     d< tj d ddd d< tdddddd}|  d	< jd
kr2 d	 S tjt d d d ddd d< G  fdddt	}t
| jjdjjd}tjj}||_fdd}t|||_t }tjddtjd dgd|d}|| W d    n1 sw   Y  t|jjjjd}	|	  d< | S )Nsegry   Tr9   vadrA   rB   rQ   r$   r   
confidencec                       s   e Zd ZdZ fddZdS )z;AdaptiveVoiceActivityDetection.apply.<locals>.DummyProtocolDummyProtocolc                 3   s     V  d S rr   r,   rS   r"   r,   r-   
train_iterQ  s   
zFAdaptiveVoiceActivityDetection.apply.<locals>.DummyProtocol.train_iterN)r/   r0   r1   namer   r,   r   r,   r-   r   N  s    r   )durationweightr   rw   c                    s   t |   jdS )N)lr)r   
parametersr   )rN   rS   r,   r-   configure_optimizers_  s   zBAdaptiveVoiceActivityDetection.apply.<locals>.configure_optimizersgpu)epochs_per_stageF)
max_epochsacceleratordevices	callbacksenable_checkpointingdefault_root_dir)devicer   )dictr   r<   r=   r   instantiater   minabsr   VoiceActivityDetectionTaskr   r   rw   r   rN   taskr   r   tempfileTemporaryDirectoryr   r   fitr   r   )
rL   r"   vad_pipeliner   vad_task	vad_modelr   r   trainerr   r,   )r"   rL   r-   r.   .  s\   


z$AdaptiveVoiceActivityDetection.applyc                 C   rd   re   rh   rS   r,   r,   r-   ri   x  rj   z)AdaptiveVoiceActivityDetection.get_metricc                 C   rk   rl   ro   rS   r,   r,   r-   rp     rq   z,AdaptiveVoiceActivityDetection.get_direction)rv   NF)r/   r0   r1   r2   r   r   r   rs   rD   r   r	   r.   r   r   r   ri   rp   rt   r,   r,   rO   r-   ru      s    2Jru   )8r2   r   copyr   	functoolsr   typesr   typingr   r   r   r   numpyr<   pyannote.corer	   r
   pyannote.database.protocolr   pyannote.metrics.detectionr   r   pyannote.pipeline.parameterr   r   r   r   pytorch_lightningr   torch.optimr   /torch_audiomentations.core.transforms_interfacer   pyannote.audior   pyannote.audio.core.callbackr   pyannote.audio.core.ior   pyannote.audio.core.pipeliner   pyannote.audio.pipelines.utilsr   r   r   r   r   r   pyannote.audio.tasksr   r   pyannote.audio.utils.signalr    r!   ru   r,   r,   r,   r-   <module>   s2     