o
    9wi&                     @   s   d dl Z d dlmZmZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d	lmZ d d
lmZ dddZG dd dZG dd deZG dd deZdS )    N)CallableTextUnion)Optional)Model)	AudioFile)VoiceActivityDetection)PipelineModel)
AnnotationSlidingWindowFeature)Segment)Vad      ?Zd;?c                 C   s   t j }tjtjtjt}tj|dd |d u r-tj	|dd}tj|}ntj|}tj
|s@td| tj
|rStj|sSt| dt|d }tj||d}||d	d	d
}	t|t | d}
|
|	 |
S )NT)exist_okassetszpytorch_model.binzModel file not found at z! exists and is not a regular filerb)use_auth_tokeng?)onsetoffsetmin_duration_onmin_duration_off)segmentationdevice)torchhub_get_torch_homeospathdirnameabspath__file__makedirsjoinexistsFileNotFoundErrorisfileRuntimeErroropenreadr   from_pretrainedVoiceActivitySegmentationr   instantiate)r   	vad_onset
vad_offsetr   model_fp	model_dirmain_dirmodel_bytes	vad_modelhyperparametersvad_pipeline r6   S/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/whisperx/vads/pyannote.pyload_vad_model   s*   

r8   c                       sh   e Zd ZdZddddddedfdedee ded	ed
ededef fddZdedefddZ	  Z
S )Binarizea  Binarize detection scores using hysteresis thresholding, with min-cut operation
    to ensure not segments are longer than max_duration.

    Parameters
    ----------
    onset : float, optional
        Onset threshold. Defaults to 0.5.
    offset : float, optional
        Offset threshold. Defaults to `onset`.
    min_duration_on : float, optional
        Remove active regions shorter than that many seconds. Defaults to 0s.
    min_duration_off : float, optional
        Fill inactive regions shorter than that many seconds. Defaults to 0s.
    pad_onset : float, optional
        Extend active regions by moving their start time by that many seconds.
        Defaults to 0s.
    pad_offset : float, optional
        Extend active regions by moving their end time by that many seconds.
        Defaults to 0s.
    max_duration: float
        The maximum length of an active segment, divides segment at timestamp with lowest score.
    Reference
    ---------
    Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of
    RNN-based Voice Activity Detection", InterSpeech 2015.

    Modified by Max Bain to include WhisperX's min-cut operation
    https://arxiv.org/abs/2303.00747

    Pyannote-audio
    r   N        infr   r   r   r   	pad_onset
pad_offsetmax_durationc                    s<   t    || _|p|| _|| _|| _|| _|| _|| _d S N)	super__init__r   r   r<   r=   r   r   r>   )selfr   r   r   r   r<   r=   r>   	__class__r6   r7   rA   S   s   


zBinarize.__init__scoresreturnc                    sD  |j j\}}|j  fddt|D }t }t|j jD ]\}}|jdu r(|n|j| }|d }	|d | jk}
|d g}|	g}|	}t	|dd |dd D ]|\}}|
r||	 }|| j
krt|d }|t||d  }|| }t|	| j || j }||||f< || }	||d d }||d d }n|| jk rt|	| j || j }||||f< |}	d}
g }g }|| || qO|| jkr|}	d}
qO|
rt|	| j || j }||||f< q| jd	ks| jd	ks| jd	kr| j
td
k rtd|j| jd}| jdkr t| D ]\}}|j| jk r|||f= q|S )zBinarize detection scores
        Parameters
        ----------
        scores : SlidingWindowFeature
            Detection scores.
        Returns
        -------
        active : Annotation
            Binarized scores.
        c                    s   g | ]} | j qS r6   )middle).0iframesr6   r7   
<listcomp>y   s    z%Binarize.__call__.<locals>.<listcomp>Nr         FTr:   r;   z+This would break current max_duration param)collar)datashapesliding_windowranger
   	enumerateTlabelsr   zipr>   lennpargminr   r<   r=   r   appendr   floatNotImplementedErrorsupportr   list
itertracksduration)rB   rE   
num_framesnum_classes
timestampsactivekk_scoreslabelstart	is_activecurr_scorescurr_timestampstycurr_durationsearch_aftermin_score_div_idxmin_score_tregionsegmenttrackr6   rJ   r7   __call__k   sd   
"



 
zBinarize.__call__)__name__
__module____qualname____doc__r\   r   rA   r   r
   rv   __classcell__r6   r6   rC   r7   r9   2   s2    "r9   c                	       sV   e Zd Z			ddededeedf f fddZdd	ed
e	e
 defddZ  ZS )r+   pyannote/segmentationFNr   fscorer   c                    s   t  jd|||d| d S )N)r   r}   r   r6   )r@   rA   )rB   r   r}   r   inference_kwargsrC   r6   r7   rA      s   z"VoiceActivitySegmentation.__init__filehookrF   c                 C   sR   | j ||d}| jr"| j|v r|| j }|S | |}||| j< |S | |}|S )a  Apply voice activity detection

        Parameters
        ----------
        file : AudioFile
            Processed file.
        hook : callable, optional
            Hook called after each major step of the pipeline with the following
            signature: hook("step_name", step_artefact, file=file)

        Returns
        -------
        speech : Annotation
            Speech regions.
        )r   )
setup_hooktrainingCACHED_SEGMENTATION_segmentation)rB   r   r   segmentationsr6   r6   r7   apply   s   




zVoiceActivitySegmentation.apply)r|   FNr?   )rw   rx   ry   r	   boolr   r   rA   r   r   r   r
   r   r{   r6   r6   rC   r7   r+      s    
$
r+   c                       sX   e Zd Zd fdd	ZdefddZedd Ze			dd
ede	e fddZ
  ZS )PyannoteNc                    s,   t d t |d  t|||d| _d S )Nz7>>Performing voice activity detection using Pyannote...r-   )r   r/   )printr@   rA   r8   r5   )rB   r   r   r/   kwargsrC   r6   r7   rA      s   zPyannote.__init__audioc                 K   s
   |  |S r?   )r5   )rB   r   r   r6   r6   r7   rv      s   
zPyannote.__call__c                 C   s   t | dS )Nr   )r   
from_numpy	unsqueeze)r   r6   r6   r7   preprocess_audio   s   zPyannote.preprocess_audior   r   r   c                 C   s~   |dksJ t |||d}|| } g }|  D ]}|t|j|jd qt|dkr1td g S |s7J dt	||||S )Nr   )r>   r   r   UNKNOWNzNo active speech found in audiozsegments_list is empty.)
r9   get_timeliner[   SegmentXri   endrX   r   r   merge_chunks)segments
chunk_sizer   r   binarizesegments_listspeech_turnr6   r6   r7   r      s   zPyannote.merge_chunks)NN)r   N)rw   rx   ry   rA   r   rv   staticmethodr   r\   r   r   r{   r6   r6   rC   r7   r      s    
r   )r   r   NN)r   typingr   r   r   r   numpyrY   r   pyannote.audior   pyannote.audio.core.ior   pyannote.audio.pipelinesr   pyannote.audio.pipelines.utilsr	   pyannote.corer
   r   r   whisperx.diarizer   whisperx.vads.vadr   r8   r9   r+   r   r6   r6   r6   r7   <module>   s$    
  
-