o
    5iL                     @   sl   d Z ddlZddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ eG dd dZG dd	 d	ZdS )
a  
Audio Polisher - Segment boundary cleanup for transcription quality.

Two-phase polishing:
  Phase 1 (original): Remove CUT SPEECH ARTIFACTS from VAD boundaries
    - Detects burst -> gap -> speech pattern and removes the burst
    - Silence at boundaries preserved as natural padding
  Phase 2 (new): TIGHTEN boundaries to actual speech onset/offset
    - Energy-based detection of where speech actually starts/ends
    - Cuts to those points with configurable silence margin (~50ms)
    - Ensures clean, crisp boundaries for TTS training data
    - Prevents segments starting/ending mid-voice-energy

Combined result: artifact-free + precisely-bounded audio segments.
No ML models - pure signal processing, runs in <10ms/segment.
    N)	dataclass)OptionalTupleList)Pathc                   @   s   e Zd ZU dZeed< eed< eed< dZeed< dZ	eed< dZ
eed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< edd Zdd ZdS )PolishResultz/Result of audio polishing for a single segment.
input_pathoutput_pathwas_modified        start_trimmed_msend_trimmed_mscleanstart_qualityend_qualityoriginal_duration_mspolished_duration_mssnr_dbrms_dbpeak_dbF
is_clippedvolume_adjustedvolume_gain_dbc                 C   s   | j | j S N)r   r   )self r   ;/home/ubuntu/maya3_transcribe/src/backend/audio_polisher.pytotal_trimmed_ms,   s   zPolishResult.total_trimmed_msc                 C   s   t | jj}| js d| d| jdd| jdd| j d| j 
S g }| jdkr6|	d	| jdd
| j d | j
dkrJ|	d| j
dd
| j d | jrX|	d| jdd d|}d| d| jdd| jdd| d| jddS )Nz  z	: SKIP | z.0fz	ms | SNR:z.1fzdB | start:z end:r   zstart -zms ()zend -zgain +dBz | z: POLISHED | zms -> zms | z | SNR:)r   r   namer
   r   r   r   r   r   appendr   r   r   joinr   )r   r    partschangesr   r   r   summary0   s8   


zPolishResult.summaryN)__name__
__module____qualname____doc__str__annotations__boolr   floatr   r   r   r   r   r   r   r   r   r   r   propertyr   r%   r   r   r   r   r      s(   
 
r   c                   @   s   e Zd ZdZ												
												d+ddZdd Zdd Zdd Zdd Zdd Z	dd  Z
d!d" Zd,d$d%Zd-d'd(Zd-d)d*Zd&S ).AudioPolisherz
    Two-phase audio segment polisher for transcription quality.

    Phase 1: Conservative artifact removal (burst-gap-speech pattern detection)
    Phase 2: Energy-based boundary tightening (speech onset/offset + silence margin)
          $@      @      i@     b@      .@      Y@ffffff?333333?      @      >@     r@      2      (@Gz?T      I@flacc                 C   s   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _d S r   )frame_mshop_msmax_start_trim_msartifact_search_ms
min_gap_msmax_end_trim_msmin_burst_msmax_burst_msenergy_gap_factorzcr_artifact_thresholdmin_dynamic_range_dbmin_silence_pad_msmin_remaining_msmin_trim_ms
fade_in_msfade_out_mstarget_rms_dbsnr_boost_threshold_dbmax_gain_dbclipping_thresholdtighten_boundaries_enabledsilence_margin_msoutput_format)r   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   tighten_boundariesrU   rV   r   r   r   __init__M   s.   
zAudioPolisher.__init__c              
   C   s  t | j| d }t | j| d }tdt|| | d }t|}t|}t|}t|D ]P}	|	| }
t|
| t|}||
| }t	t
|d }dt|d  ||	< t|dkrxtttt|dt|  ||	< |
|d  | ||	< q2|||fS )z5Compute per-frame energy (dB) and zero-crossing rate.           绽|=)intr@   rA   maxlennpzerosrangeminsqrtmeanlog10sumabsdiffsign)r   audiosrframe_sampleshop_samplesn_framesste_dbzcrframe_timesistartendframermsr   r   r   _compute_frames   s$   




zAudioPolisher._compute_framesc                 C   sf  | j d }t||k d }t|dk rdS t|d}t|d}|| }|| jk r-dS || j|  }	tdt|}
||d|
  }t||	k rLdS || |	k }g }d}d}t	t|D ]0}|| rk|skd	}|}q^|| s|rd}|||  |||   d }|| j
kr||||f q^|r||d
  |||   d }|| j
kr||t||f |sdS |d }|d dkrdS |d t|krdS |d|d  }t|dkrt|| nd}||d  }|| }|d | jks|d | jk rd|d | jkrdfS dfS |d }|| jk r|| jkrdS || jkr#|dfS |d | j
d kr1|dfS dS )ar  
        Detect CUT SPEECH artifact at segment start.

        ONLY acts when: burst (cut speech) -> gap (silence) -> real speech.
        Trims at gap START = removes burst, keeps gap as silence padding.

        Does NOT trim if segment starts with silence (that's good padding).
        Does NOT trim if segment starts with continuous speech (natural start).
        rY   r      )Nr      U   NFTrZ   )Nmarginal_keptr~   r   r[   artifact_trimmedg      ?)rC   ra   wherer`   
percentilerJ   rH   rd   allrc   rD   r!   rf   rB   rM   rK   rI   )r   rq   rr   rs   search_limitsinfsldr
gap_threshn_checkfirst_frames_energybelowgapsin_gapgsrt   durgpremzcrgap_start_framettgap_msr   r   r   _detect_leading_artifact   sl   




z&AudioPolisher._detect_leading_artifactc                 C   sZ  t |d}t |d}|| }|| jk rdS || j|  }||k}t|}	td|	}
t ||
 d  r8dS t |d }t|dkrGdS |d }td|}|dkr||| | }t |}|d	k r|}t	|d
 ddD ]	}|| rx|}qo || ||  d }| j
|  kr| jkrn dS || }|| d }|| jkr|| jkr|ddfS dS )z
        Detect CUT SPEECH artifact at segment end.

        ONLY acts when: sustained speech -> silence gap -> isolated burst at end.
        r{   r|   )Nr   Frz   Nr   r}   
   g333333?rZ   rY   r   F)Nr   T)ra   r   rJ   rH   r`   rd   r   r   rf   rc   rF   rG   rE   rM   )r   rq   rs   	total_durr   r   r   threshabovenr   ai
last_abovelookback	preceding	sustainedburst_startbi	burst_durr   actual_trimr   r   r   _detect_trailing_artifact   sB   




z'AudioPolisher._detect_trailing_artifactc           
      C   s\   |d }d}d}t |D ]\}}||kr)|d7 }||kr(|| d }	||	   S qd}qdS )as  
        Find precise speech onset: first sustained energy rise above noise floor.

        Uses hysteresis: speech starts when energy exceeds threshold for 3+
        consecutive frames (~15ms at 5ms hop). Prevents transient noise spikes
        from being misidentified as speech onset.

        Returns: onset time in seconds, or None if no clear onset found.
        r8   rz   r   rZ   N)	enumerate)
r   rq   rs   noise_floor_dbr   consec_neededconsecrt   eonset_framer   r   r   _find_speech_onset  s   
z AudioPolisher._find_speech_onsetc           	      C   sv   |d }d}d}t t|d ddD ]&}|| |kr6|d7 }||kr5t|| d t|d }||   S qd}qdS )z
        Find precise speech offset: last sustained energy above noise floor.

        Mirrors onset detection, scanning backward.
        Returns: offset time in seconds, or None if no clear offset found.
        r8   rz   r   rZ   r}   N)rc   r`   rd   )	r   rq   rs   r   r   r   r   rt   offset_framer   r   r   _find_speech_offset-  s   z!AudioPolisher._find_speech_offsetc                 C   s   t || }t|d}| |||}| |||}|du s"|du r$dS | jd }	td||	 }
t|||	 }|
d }|| d }|dk rK|dk rKdS ||
 d }|| jk rXdS t	|
| }t	|| }||| ||fS )a  
        Tighten segment boundaries to actual speech onset/offset.

        After artifact removal, precisely locates where speech energy begins
        and ends, then cuts to those points with silence_margin_ms padding.
        Ensures segments don't start/end mid-voice-energy.

        Returns: (tightened_audio, start_trim_ms, end_trim_ms) or
                 (None, 0, 0) if no tightening needed
        r   N)Nr   r   g     @@r   rY   r9   )
r`   ra   r   r   r   rU   r_   rd   rL   r^   )r   rl   rm   rq   rs   r   noise_flooronsetoffset
margin_sec	new_startnew_end
start_trimend_trim
new_dur_msssesr   r   r   _tighten_to_speechB  s&   

z AudioPolisher._tighten_to_speechc                 C   s   t |dt |d S )z&Estimate SNR from energy distribution.Z   r   )ra   r   )r   rq   r   r   r   _measure_snri  s   zAudioPolisher._measure_snrFc                 C   s   |  }|r-| jdkr-t| j| d }|dkr-|t|k r-|d|  tdd|9  < |rW| jdkrWt| j| d }|dkrW|t|k rW|| d  tdd|9  < |S )z9Apply smooth fade-in/fade-out to prevent click artifacts.r   rY   NrZ   )copyrN   r^   r`   ra   linspacerO   )r   rl   rm   fade_infade_outoutr   r   r   r   _apply_fadem  s    zAudioPolisher._apply_fadeNc           )      C   s  t |\}}t|jdkr|jdd}t|| d }tt|d }tt|}dt	|d  }	dt	|d  }
|| j
k}|| jk rVt||d|||	|
|dS | ||\}}}| |}| |||\}}| ||t|| \}}}|d	ur|d nd
}|d	urt|| | d nd
}|| | | jk rd\}}d\}}|dkrdnd}|dkrdnd}d}d}d
}|| jk r|	| jk rt| j|	 | j}|dkrd}nd
}| }|d	urt|| nd}|d	urt|| nt|}||| }|d	up|d	u}| jrPt|| d | jkrP| ||\}}}| ||||\} }!}"| d	urP| }||!7 }||"7 }|dkrD|!dkrDd}|dkrP|"dkrPd}| j|||dk|d}|r{d|d  }#||# }tt|}$|$dkr{|d|$  }|p|dkp|dkp|}%|%st||d||||||	|
|dS t|| d }&|d	u r|rtj|dd tj|t|j}nt|j }'t|j!pd| j" }(t#t|j$|' d|(  }t %||| t||d||||||&||	|
|||dS )a%  
        Polish a single audio segment (two-phase).

        Phase 1: Remove cut speech artifacts (burst-gap-speech pattern)
        Phase 2: Tighten boundaries to actual speech onset/offset + silence margin

        Already-clean segments pass through unchanged (was_modified=False).
        rZ   )axisrY   r[   r\   r]   F)r   r	   r
   r   r   r   r   r   Nr   NN)r   r   r   r~   g      ?Tr      boundary_tightened)r   r   r   r=   )r   r	   r
   r   r   r   r   r   r   r   r   )exist_ok.	_polished)r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   )&sfreadr`   shaperf   ra   re   r_   ri   rg   rS   rL   r   ry   r   r   r   rQ   rP   rd   rR   r   r^   rT   r   r   osmakedirspathr"   r   r    stemsuffixrV   r*   parentwrite))r   
audio_pathr	   
output_dirrl   rm   orig_msrms_valpeakr   r   r   rq   rr   ftr   s_secs_qe_sece_qneeds_fade_outs_mse_msvol_adjvol_gainpolr   r   phase1_modifiedste2_ft2	tightenedt_startt_endr   mx	needs_modpol_msr   extr   r   r   polishz  s   	



  




zAudioPolisher.polishc                 C   s   |du rt j|d}h d}g }|D ]}|t|d|  qt|dd d}|r3|d| }tdt| d	 g }d
}|D ]}	| j	t
|	|d}
||
 |
jrZ|d7 }t|
  qCtd| dt| dt||  d |S )z)Polish all audio segments in a directory.Npolished>   .m4a.mp3.ogg.wav.flac*c                 S   s   | j S r   )r    )xr   r   r   <lambda>   s    z0AudioPolisher.polish_directory.<locals>.<lambda>)keyz[Polisher] Processing z segments...r   )r   rZ   z[Polisher] Done: /z modified, z already clean)r   r   r"   extendr   globsortedprintr`   r   r*   r!   r
   r%   )r   	input_dirr   	max_filesextsfilesr   resultsmodifiedfrr   r   r   polish_directory  s,   


zAudioPolisher.polish_directory)r0   r1   r2   r3   r4   r3   r0   r5   r6   r7   r8   r9   r:   r0   r1   r0   r;   r<   r<   r=   Tr>   r?   )FFr   )r&   r'   r(   r)   rX   ry   r   r   r   r   r   r   r   r   r  r   r   r   r   r/   E   sH    	
3M4'

~r/   )r)   r   numpyra   	soundfiler   dataclassesr   typingr   r   r   pathlibr   r   r/   r   r   r   r   <module>   s    +