o
    }oi6                     @   s   d dl mZmZ d dlmZ d dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZ G dd deZG d	d
 d
eZG dd deZ	ddejdededeeef fddZdedededededeeef fddZdS )    )ABCabstractmethod)TupleN)EncDecClassificationModel)normalize_volume)loggingc                   @   s<   e Zd ZdZedejdedede	ejeef fddZ
dS )	AudioTrimmerz3Interface for silence trimming implementations
    audiosample_rateaudio_idreturnc                 C   s   t )a  Trim starting and trailing silence from the input audio.
           Args:
               audio: Numpy array containing audio samples. Float [-1.0, 1.0] format.
               sample_rate: Sample rate of input audio.
               audio_id: String identifier (eg. file name) used for logging.

           Returns numpy array with trimmed audio, and integer sample indices representing the start and end
           of speech within the original audio array.
        )NotImplementedError)selfr	   r
   r    r   k/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/parts/preprocessing/audio_trimming.py
trim_audio   s   zAudioTrimmer.trim_audioN)__name__
__module____qualname____doc__r   nparrayintstrr   r   r   r   r   r   r      s    .r   c                   @   sp   e Zd Z							dded	ed
edededededdfddZddejdede	de
ejeef fddZdS )EnergyAudioTrimmer2         ?         皙?Tdb_thresholdref_amplitudespeech_frame_thresholdtrim_win_lengthtrim_hop_lengthpad_secondsvolume_normr   Nc                 C   sj   |dksJ |dksJ |dksJ |dksJ |dksJ || _ || _|| _|| _|| _|| _|| _dS )a  Energy/power based silence trimming using Librosa backend.
           Args:
               db_threshold: Audio frames at least db_threshold decibels below ref_amplitude will be
                 considered silence.
               ref_amplitude: Amplitude threshold for classifying speech versus silence.
               speech_frame_threshold: Start and end of speech will be detected where there are at least
                 speech_frame_threshold consecutive audio frames classified as speech. Setting this value higher
                 is more robust to false-positives (silence detected as speech), but setting it too high may result
                 in very short speech segments being cut out from the audio.
               trim_win_length: Length of audio frames to use when doing speech detection. This does not need to match
                 the win_length used any other part of the code or model.
               trim_hop_length: Stride of audio frames to use when doing speech detection. This does not need to match
                 the hop_length used any other part of the code or model.
               pad_seconds: Audio duration in seconds to keep before and after each speech segment.
                 Set this to at least 0.1 to avoid cutting off any speech audio, with larger values
                 being safer but increasing the average silence duration left afterwards.
               volume_norm: Whether to normalize the volume of audio before doing speech detection.
        r   N)r!   r"   r#   r$   r%   r&   r'   )r   r!   r"   r#   r$   r%   r&   r'   r   r   r   __init__.   s   
zEnergyAudioTrimmer.__init__ r	   r
   r   c           
      C   s   | j r	t|dd}tjj|| j| j| j| jd}t	|| j
|d\}}|s.|s.tg ddfS tjj|| jd}tjj|| jd}t|||jd || jd\}}||| }	|	||fS )Nr   r	   volume_level)refframe_length
hop_lengthtop_db	is_speechr#   r   r   r.   start_sample
end_sample
max_sampler
   r&   )r'   r   librosaeffects_signal_to_frame_nonsilentr"   r$   r%   r!   "get_start_and_end_of_speech_framesr#   r   r   coreframes_to_samplespad_sample_indicesshaper&   )
r   r	   r
   r   speech_framesstart_frame	end_framer4   r5   trimmed_audior   r   r   r   X   s2   


zEnergyAudioTrimmer.trim_audio)r   r   r   r   r   r    Tr)   )r   r   r   r   floatboolr(   r   r   r   r   r   r   r   r   r   r   -   s4    	
.*r   c                   @   s   e Zd Z										d#d
edededededededededdfddZdej	dej	fddZ
dedededeeef fddZd$dej	ded edeej	eef fd!d"ZdS )%VadAudioTrimmervad_multilingual_marblenet>        ?cpur         r    T
model_namevad_sample_ratevad_thresholddevicer#   r$   r%   r&   r'   r   Nc
           
      C   s   |dksJ |dksJ |dksJ |dksJ |dksJ || _ tj|d | j | _|| _|| _|| _|| _	|| _
| j	d | _|| _|	| _dS )a!  Voice activity detection (VAD) based silence trimming.

           Args:
               model_name: NeMo VAD model to load. Valid configurations can be found with
                 EncDecClassificationModel.list_available_models()
               vad_sample_rate: Sample rate used for pretrained VAD model.
               vad_threshold: Softmax probability [0, 1] of VAD output, above which audio frames will be classified
                 as speech.
               device: Device "cpu" or "cuda" to use for running the VAD model.
               trim_win_length: Length of audio frames to use when doing speech detection. This does not need to match
                 the win_length used any other part of the code or model.
               trim_hop_length: Stride of audio frames to use when doing speech detection. This does not need to match
                 the hop_length used any other part of the code or model.
               pad_seconds: Audio duration in seconds to keep before and after each speech segment.
                 Set this to at least 0.1 to avoid cutting off any speech audio, with larger values
                 being safer but increasing the average silence duration left afterwards.
               volume_norm: Whether to normalize the volume of audio before doing speech detection.
        r   )rM      N)rP   r   from_pretrainedevalto	vad_modelrN   rO   r#   r$   r%   
trim_shiftr&   r'   )
r   rM   rN   rO   rP   r#   r$   r%   r&   r'   r   r   r   r(   |   s   
zVadAudioTrimmer.__init__r	   c           
      C   s   |j d | jk rtg S tjj|| j| jd }|j d | jg }t	j
|t	j| jd}t	j
|t	j| jd}| j||d}t	j|dd}|   }|d d df }|| jk}	|	S )Nr   )r-   r.   )dtyperP   )input_signalinput_signal_length)dimr   )r>   r$   r   r   r7   utilframer%   	transposetorchtensorfloat32rP   int32rU   softmaxdetachrJ   numpyrO   )
r   r	   audio_framesaudio_frame_lengthsaudio_signalaudio_signal_len	log_probsprobsspeech_probsr?   r   r   r   _detect_speech   s    


zVadAudioTrimmer._detect_speechr4   r5   r
   c                 C   s*   || j  }t|| }t|| }||fS )N)rN   r   )r   r4   r5   r
   sample_rate_ratior   r   r   _scale_sample_indices   s   
z%VadAudioTrimmer._scale_sample_indicesr)   r   c                 C   s  || j kr|}n	tj||| j d}| jrt|dd}| j|d}t|| j|d\}}|s6|s6t	g ddfS |dkr=d}ntj
j|| jd}|| j7 }||jd krX|jd }	ntj
j|| jd}	|	| j7 }	|| j kru| j||	|d\}}	t||	|jd || jd	\}}	|||	 }
|
||	fS )
N)orig_sr	target_srr   r*   )r	   r0   r   r2   )r4   r5   r
   r3   )rN   r7   resampler'   r   rm   r:   r#   r   r   r;   r<   r%   rV   r>   ro   r=   r&   )r   r	   r
   r   	vad_audior?   r@   rA   r4   r5   rB   r   r   r   r      s@   







zVadAudioTrimmer.trim_audio)	rG   rH   rI   rJ   r   rK   rL   r    TrC   )r   r   r   r   r   rD   rE   r(   r   r   rm   r   ro   r   r   r   r   r   rF   {   sD    	

2".rF   r)   r1   r#   r   r   c           	      C   s   | j d }d}td|| d D ]}|| }t| || r"|} nqd}t||d dD ]}|| }t| || r?|} nq-|du sH|du rStd| d dS ||fS )a  Finds the speech frames corresponding to the start and end of speech for an utterance.
       Args:
           is_speech: [num_frames] boolean array with true entries labeling speech frames.
           speech_frame_threshold: The number of consecutive speech frames required to classify the speech boundaries.
           audio_id: String identifier (eg. file name) used for logging.

       Returns integers representing the frame indices of the start (inclusive) and end (exclusive) of speech.
    r   Nr   rZ   z+Could not find start or end of speech for '')r   r   )r>   rangeallr   warning)	r1   r#   r   
num_framesr@   ihigh_irA   low_ir   r   r   r:     s&   
r:   r4   r5   r6   r
   r&   c                 C   s8   t || }| | } || }td| } t||}| |fS )a  Shift the input sample indices by pad_seconds in front and back within [0, max_sample]
       Args:
           start_sample: Start sample index
           end_sample: End sample index
           max_sample: Maximum sample index
           sample_rate: Sample rate of audio
           pad_seconds: Amount to pad/shift the indices by.

       Returns the sample indices after padding by the input amount.
    r   )r   maxmin)r4   r5   r6   r
   r&   pad_samplesr   r   r   r=   &  s   

r=   rC   )abcr   r   typingr   r7   re   r   r_   nemo.collections.asr.modelsr   2nemo.collections.tts.parts.utils.tts_dataset_utilsr   
nemo.utilsr   r   r   rF   r   r   r   r:   rD   r=   r   r   r   r   <module>   sF   N 

%
