o
    1i*                     @  s   U d Z ddlmZ ddlZddlZddlmZ ddlmZ ddl	Z	ddl
Z
ddlmZ eeZeG dd dZddlZe Zi Zd	ed
< dd Zd1ddZd2ddZ	d3d4d d!Z	d3d5d"d#Z	$	%d6d7d*d+Zd8d/d0ZdS )9zVAD-aware audio segmentation using Silero-VAD.

Produces speech segments between 3-30s with good duration distribution.
Cuts only at silence boundaries to avoid mid-word splits.
    )annotationsN)	dataclass)Path)	VADConfigc                   @  s4   e Zd ZU ded< ded< ded< ed
ddZd	S )Segmentfloatstart_send_storch.Tensoraudioreturnc                 C  s   | j | j S N)r	   r   )self r   4/home/ubuntu/bench-codecs/codecbench/pipeline/vad.py
duration_s   s   zSegment.duration_sN)r   r   )__name__
__module____qualname____annotations__propertyr   r   r   r   r   r      s   
 r   zdict[int, tuple]_vad_modelsc                  C  s   t  } | tvr<t* | tvr*tjjdddd\}}||ft| < W d   t|  S W d   t|  S 1 s7w   Y  t|  S )zGet or create a per-thread VAD model instance.

    Silero-VAD has internal RNN state that isn't thread-safe.
    Each thread gets its own model to avoid state corruption.
    zsnakers4/silero-vad
silero_vadT)repo_or_dirmodel
trust_repoN)	threading	get_identr   	_vad_locktorchhubload)tidr   utilsr   r   r   	_load_vad'   s"   


r$   wavr
   srintcfgr   r   
list[dict]c              
   C  sj   t  \}}|d }|dkrtj| |d} d}| jdkr |  } || |||j|j|jt	|j
d d}|S )z1Run Silero-VAD and return speech timestamp dicts.r   i>     i  )sampling_rate	thresholdmin_silence_duration_msspeech_pad_msmin_speech_duration_ms)r$   
torchaudio
functionalresamplendimsqueezer,   r-   r.   r'   min_speech_duration_s)r%   r&   r(   r   r#   get_ts
timestampsr   r   r   get_speech_timestamps:   s"   

	r8   r7   total_sampleslist[tuple[int, int]]c                 C  s|  t |j| }t |j| }| sg S g }| d d }| d d }tdt| D ]B}	| |	 d }
| |	 d }|}|| }||krB|}q'|| }||krR|||f n|dkret|| ||}|||f |
}|}q'|| }||kry|||f g }|D ]>\}}|| }||kr|||f q}|}||k rt |tdd }t|| |}|| |kr|||f |}||k sq}|S )a+  Merge VAD speech chunks into segments within [min_s, max_s] range.

    Strategy: greedily accumulate consecutive speech chunks. When accumulated
    duration would exceed max_segment_s, cut at the last silence boundary.
    Random target durations give good distribution across [min_s, max_s].
    r   startendr*   g333333?g      ?)	r'   min_segment_smax_segment_srangelenappendminrandomuniform)r7   r9   r&   r(   min_samplesmax_samplessegmentscurrent_startcurrent_endichunk_start	chunk_endproposed_endproposed_durseg_durextended	final_durresultr;   r<   durpostargetseg_endr   r   r   _merge_segments_to_targetV   sP   rW   
audio_path
Path | strVADConfig | Nonelist[Segment]c              	   C  sl  |du rt  }tt| \}}|jd dkr|jddd}||jkr/tj|||j}|j}|	 }|jd }|| }t
d|| t|||}tdd |D | }t
d	||d
| t|d  t||||}	g }
|	D ]\}}|dd||f }|
t|| || |d qndd |
D }|rt
dt|
t|t|t|t| t| |
S t
d|  |
S )u   Full VAD pipeline: load audio → detect speech → create segments.

    Returns list of Segment objects with 2-30s speech audio.
    Nr   r*   Tdimkeepdimz Audio loaded: %.1f s, %d samplesc                 s  s     | ]}|d  |d  V  qdS )r<   r;   Nr   ).0tsr   r   r   	<genexpr>   s    z segment_audio.<locals>.<genexpr>z7VAD detected %.1f s speech out of %.1f s total (%.0f%%)d   g{Gz?r   r	   r   c                 S  s   g | ]}|j qS r   )r   )r_   sr   r   r   
<listcomp>   s    z!segment_audio.<locals>.<listcomp>zCCreated %d segments: %.1f-%.1f s (mean %.1f s, total %.1f s usable)z$No valid speech segments found in %s)r   r0   r!   strshapemeansample_rater1   r2   r4   loggerinfor8   summaxrW   rA   r   r@   rB   warning)rX   r(   r%   r&   wav_1dr9   total_durationr7   
speech_dur
seg_boundsrG   r;   r<   segment_wav	durationsr   r   r   segment_audio   sH   


ru   c              
   C  s  |du rt  }| jdkr| d} | jd dkr| jddd} ||jkr0tj| ||j} |j}| 	 }|jd }|| }|j
}|j}|j}||krSt|||||}	nt|||}	t|	|||}
g }|
D ]\}}|t|| || | dd||f d qd|S )zASame as segment_audio but from an in-memory tensor [1, T] or [T].Nr*   r   Tr\   rc   )r   r3   	unsqueezerg   rh   ri   r0   r1   r2   r4   chunk_threshold_schunk_size_schunk_overlap_s_chunked_vadr8   rW   rA   r   )r%   r&   r(   ro   r9   r   rw   rx   	overlap_sr7   rr   rG   r;   r<   r   r   r   segment_tensor   s6   




r|        r@       @ro   rx   r   r{   c                   sT  ddl m}m} jd }t| }t| }	||	 }
g }d}||k r@t|| |}|||f ||
7 }||kr<n||k s%t|dkrLtS t	
dt||||  dfd
d tt|d}dgt| }||dd% fddt|D }||D ]}|| }| ||< qW d   n1 sw   Y  t||	}|S )u  Run VAD on audio chunks in parallel, merge results.

    Splits long audio into overlapping chunks, processes each in its own thread
    (each with its own Silero model), then merges timestamps with overlap dedup.
    For 2500s audio with 300s chunks: 36s → ~4s (9x speedup).
    r   )ThreadPoolExecutoras_completedr*   z?Chunked VAD: %d chunks of %.0fs (%.1fs overlap) for %.1fs audiorK   r'   rL   r   r)   c                   sF   | | }t | }|D ]}|d  | 7  < |d  | 7  < q|S )Nr;   r<   )r8   )rK   rL   	chunk_wavr`   t)r(   r&   ro   r   r   
_vad_chunk  s   z _chunked_vad.<locals>._vad_chunk   Nvad)max_workersthread_name_prefixc                   s$   i | ]\}\}}  |||qS r   )submit)r_   rJ   r;   r<   )r   poolr   r   
<dictcomp>+  s    
z _chunked_vad.<locals>.<dictcomp>)rK   r'   rL   r'   r   r)   )concurrent.futuresr   r   rg   r'   rB   rA   r@   r8   rj   debug	enumeraterR   _merge_overlapping_timestamps)ro   r&   r(   rx   r{   r   r   r9   chunk_samplesoverlap_samplesstep_sampleschunksrT   r<   r   all_timestampsfuture_to_idxfutureidxmergedr   )r   r(   r   r&   ro   r   rz      sB   



rz   chunk_timestampslist[list[dict]]r   c           	      C  s   | sg S t | d }tdt| D ]H}| | }|sq|s"|| q|r*|d d nd}|r4|d d nd}|}|D ]}|d |krH|| q:|d |krX|||d d q:q|S )a4  Merge timestamps from overlapping chunks, deduplicating the overlap regions.

    For each pair of adjacent chunks, timestamps in the overlap zone are resolved
    by keeping the earlier chunk's timestamps up to the midpoint of the overlap,
    and the later chunk's timestamps from the midpoint onward.
    r   r*   r<   r;   )r;   r<   )listr?   r@   extendrA   )	r   r   rR   rJ   next_tslast_end_prevfirst_start_nextcutoffr   r   r   r   r   8  s*   

r   )r%   r
   r&   r'   r(   r   r   r)   )
r7   r)   r9   r'   r&   r'   r(   r   r   r:   r   )rX   rY   r(   rZ   r   r[   )r%   r
   r&   r'   r(   rZ   r   r[   )r}   r~   )ro   r
   r&   r'   r(   r   rx   r   r{   r   r   r)   )r   r   r   r'   r   r)   )__doc__
__future__r   loggingrC   dataclassesr   pathlibr   r   r0   codecbench.pipeline.configr   	getLoggerr   rj   r   r   Lockr   r   r   r$   r8   rW   ru   r|   rz   r   r   r   r   r   <module>   s4    



E8/?