o
    ui                     @   s:   d dl Z d dlZd dlZd dlZdZdZG dd dZdS )    N   i>  c                   @   s6   e Zd ZdZddedfddZdd Zd	d
 ZdS )	SileroVADz:
    Voice Activity Detection (VAD) using Silero-VAD.
    F
silero_vadcpuc              
      s   zG|j dkrddg}ndg}tjj d|f fdd	}|tj_tjj|s&dnd|d	d
|s.dndd\}} tj_|| _|\}}	}	}	}	|| _W dS  t	yZ }
 zt
d|
 d}
~
ww )a  
        Initialize the VAD object.

        Args:
            local (bool, optional): Whether to load the model locally. Defaults to False.
            model (str, optional): The VAD model name to load. Defaults to "silero_vad".
            device (torch.device, optional): The device to run the model on. Defaults to 'cpu'.

        Returns:
            None

        Raises:
            RuntimeError: If loading the model fails.
        cudaCUDAExecutionProviderCPUExecutionProviderNc                    s    | |f||d| d S )N)sess_options	providers )selfpath_or_bytesr	   r
   kwargsoriginal_initr   </home/ubuntu/sommelier/podcast-pipeline/models/silero_vad.pypatched_init0   s   z(SileroVAD.__init__.<locals>.patched_initzsnakers4/silero-vadzvad/silero-vadFTgithublocal)repo_or_dirmodelforce_reloadonnxsourcezFailed to load VAD model: )typeonnxruntimeInferenceSession__init__torchhubload	vad_modelget_speech_timestamps	ExceptionRuntimeError)r   r   r   devicer
   r   r!   utilsr"   _er   r   r   r      s,   




	zSileroVAD.__init__c                    s   |du st |tjtfstd| j|| jd}fdd|D   s&g S dd t dd  dd D g  fd	d
dt d   fddD }|S )aZ  
        Segment speech from an audio segment and return a list of timestamps.

        Args:
            audio_segment (np.ndarray): The audio segment to be segmented.
            start_time (int): The start time of the audio segment in frames.
            end_time (int): The end time of the audio segment in frames.
            sampling_rate (int): The sampling rate of the audio segment.

        Returns:
            list: A list of timestamps, each containing the start and end times of speech segments in frames.

        Raises:
            ValueError: If the audio segment is invalid.
        NzInvalid audio segment)sampling_ratec                    s$   g | ]}|d    |d   fqS )startendr   ).0ts)
start_timer   r   
<listcomp>]   s    z,SileroVAD.segment_speech.<locals>.<listcomp>c                 S   s    g | ]\}}|d  |d  qS r      r   r,   r*   r+   r   r   r   r/   d   s    r1   c                    s   | |ks | d  |  d  d k r | |g d S | | s%d S | | t| | }| | }| | |d | d S )Nr1   r   r   )appendindexmax)start_index	end_indexmax_interval_indexsplit_index)adjusted_timestamps	intervalsr)   segmentssplit_timestampsr   r   r>   k   s   


z2SileroVAD.segment_speech.<locals>.split_timestampsr   c                    s(   g | ]\}} | d   | d gqS r0   r   r2   )r;   r   r   r/      s    )	
isinstancenpndarraylist
ValueErrorr"   r!   ziplen)r   audio_segmentr.   end_timer)   speech_timestampsmerged_timestampsr   )r;   r<   r)   r=   r>   r.   r   segment_speechF   s(   

zSileroVAD.segment_speechc              	   C   s@  |d }|d }g }d}t  }d}| D ]\}	}
t|
d }t|
d }||kr*q|}t|| }t|| }|
d |vrE||
d  || tkra|t|d|||
d d |d	7 }q||| }t	j
||td
}| |t|t t|t tD ]\}}|t|d|t |t |
d d |d	7 }qq|S )a  
        Process the audio based on the given speaker diarization dataframe.

        Args:
            speakerdia (pd.DataFrame): The diarization dataframe containing start, end, and speaker info.
            audio (dict): A dictionary containing the audio waveform and sample rate.

        Returns:
            list: A list of dictionaries containing processed audio segments with start, end, and speaker.
        sample_ratewaveformr   r*   r+   speaker   )r5   r*   r+   rM   r1   )orig_sr	target_sr)setiterrowsfloatintaddVAD_THRESHOLDr4   strzfilllibrosaresampleSAMPLING_RATErJ   )r   
speakerdiaaudior)   
audio_dataoutlast_endspeakers_seencount_idr5   rowr*   r+   start_frame	end_frame
temp_audiotemp_audio_resampledstart_frame_subend_frame_subr   r   r   vad   s\   


zSileroVAD.vadN)	__name__
__module____qualname____doc__r   r%   r   rJ   rj   r   r   r   r   r      s
    /?r   )rY   r   numpyr@   r   rV   r[   r   r   r   r   r   <module>   s   