o
    if                  
   @   s   d Z ddlZddlmZ ddlZddlmZ ddlm	Z	m
Z
 dZzddlZW n  eyF Z zede  ed ed	e dZ[ww G d
d dZG dd de	ZdS )zSilero Voice Activity Detection (VAD) implementation for Pipecat.

This module provides a VAD analyzer based on the Silero VAD ONNX model,
which can detect voice activity in audio streams with high accuracy.
Supports 8kHz and 16kHz sample rates.
    N)Optional)logger)VADAnalyzer	VADParamsg      @zException: zAIn order to use Silero VAD, you need to `pip install pipecat-ai`.zMissing module(s): c                   @   s@   e Zd ZdZdddZdefddZdd	d
ZdefddZdS )SileroOnnxModelzONNX runtime wrapper for the Silero VAD model.

    Provides voice activity detection using the pre-trained Silero VAD model
    with ONNX runtime for efficient inference. Handles model state management
    and input validation for audio processing.
    Tc                 C   s`   t  }d|_d|_|rdt  v rt j|dg|d| _nt j||d| _|   ddg| _dS )zInitialize the Silero ONNX model.

        Args:
            path: Path to the ONNX model file.
            force_onnx_cpu: Whether to force CPU execution provider.
           CPUExecutionProvider)	providerssess_options)r
   @  >  N)	onnxruntimeSessionOptionsinter_op_num_threadsintra_op_num_threadsget_available_providersInferenceSessionsessionreset_statessample_rates)selfpathforce_onnx_cpuopts r   L/home/ubuntu/.local/lib/python3.10/site-packages/pipecat/audio/vad/silero.py__init__*   s   
zSileroOnnxModel.__init__src                 C   s|   t |dkrt |d}t |dkrtd|  || jvr+td| j d|t |d  dkr:td||fS )	z)Validate and preprocess input audio data.r   r      z*Too many dimensions for input audio chunk zSupported sampling rates: z (or multiple of 16000)g     @?@zInput audio chunk is too short)npndimexpand_dims
ValueErrordimr   shape)r   xr   r   r   r   _validate_input?   s   
zSileroOnnxModel._validate_inputr   c                 C   s:   t jd|dfdd| _t j|dfdd| _d| _d| _dS )zReset the internal model states.

        Args:
            batch_size: Batch size for state initialization. Defaults to 1.
        r      float32dtyper   N)r   zeros_state_context_last_sr_last_batch_size)r   
batch_sizer   r   r   r   O   s   
zSileroOnnxModel.reset_statesc           
      C   sN  |  ||\}}|dkrdnd}t|d |kr&tdt|d  dt|d }|dkr3dnd	}| js=| | | jrJ| j|krJ| | | jrW| j|krW| | t| jd
 sitj||fdd| _tj	| j|fd
d}|dv r|| j
tj|ddd}| jd|}|\}}	|	| _
nt |d| df | _|| _|| _|S )z*Process audio input through the VAD model.r         zProvided number of samples is z< (Supported values: 256 for 8000 sample rate, 512 for 16000)r   @       r   r(   r)   )axis)r   r   int64)inputstater   N.)r&   r   r$   r"   r/   r   r.   r-   r+   concatenater,   arrayr   run)
r   r%   r   num_samplesr0   context_size
ort_inputsort_outsoutr9   r   r   r   __call__Z   s6   


zSileroOnnxModel.__call__N)T)r   )	__name__
__module____qualname____doc__r   intr&   r   rB   r   r   r   r   r   "   s    

r   c                       sh   e Zd ZdZddddee dee f fddZdef fdd	Zd
efddZ	d
e
fddZ  ZS )SileroVADAnalyzera  Voice Activity Detection analyzer using the Silero VAD model.

    Implements VAD analysis using the pre-trained Silero ONNX model for
    accurate voice activity detection. Supports 8kHz and 16kHz sample rates
    with automatic model state management and periodic resets.
    Nsample_rateparamsrJ   rK   c                   s   t  j||d td d}d}zddl}t|||}W n= ty^   ddl	m
} z|||
}|}W d   n1 sBw   Y  W n ty[   t|||}Y nw Y nw t|dd	| _d| _td
 dS )zInitialize the Silero VAD analyzer.

        Args:
            sample_rate: Audio sample rate (8000 or 16000 Hz). If None, will be set later.
            params: VAD parameters for detection thresholds and timing.
        rI   zLoading Silero VAD model...zsilero_vad.onnxzpipecat.audio.vad.datar   N)	resourcesT)r   zLoaded Silero VAD)superr   r   debugimportlib_resourcesstrfilesjoinpathBaseException	importlibrL   r   r   _model_last_reset_time)r   rJ   rK   
model_namepackage_pathimpresourcesmodel_file_pathf	__class__r   r   r      s,   
	zSileroVADAnalyzer.__init__c                    s0   |dkr|dkrt d| dt | dS )zSet the sample rate for audio processing.

        Args:
            sample_rate: Audio sample rate (must be 8000 or 16000 Hz).

        Raises:
            ValueError: If sample rate is not 8000 or 16000 Hz.
        r   r   z?Silero VAD sample rate needs to be 16000 or 8000 (sample rate: )N)r"   rM   set_sample_rate)r   rJ   r\   r   r   r_      s
   	
z!SileroVADAnalyzer.set_sample_ratereturnc                 C   s   | j dkrdS dS )zGet the number of audio frames required for VAD analysis.

        Returns:
            Number of frames required (512 for 16kHz, 256 for 8kHz).
        r   r1   r2   )rJ   )r   r   r   r   num_frames_required   s   z%SileroVADAnalyzer.num_frames_requiredc              
   C   s   z6t |t j}t j|t jdt jd }| || jd }t }|| j }|t	kr4| j
  || _|W S  tyQ } ztd|  W Y d}~dS d}~ww )zCalculate voice activity confidence for the given audio buffer.

        Args:
            buffer: Audio buffer to analyze.

        Returns:
            Voice confidence score between 0.0 and 1.0.
        r)   g      @r   z'Error analyzing audio with Silero VAD: N)r   
frombufferint16astyper(   rU   rJ   timerV   _MODEL_RESET_STATES_TIMEr   	Exceptionr   error)r   bufferaudio_int16audio_float32new_confidence	curr_time	diff_timeer   r   r   voice_confidence   s   	

z"SileroVADAnalyzer.voice_confidence)rC   rD   rE   rF   r   rG   r   r   r_   ra   floatrp   __classcell__r   r   r\   r   rH      s    &%rH   )rF   re   typingr   numpyr   logurur   pipecat.audio.vad.vad_analyzerr   r   rf   r   ModuleNotFoundErrorro   rh   rg   r   rH   r   r   r   r   <module>   s"   
`