o
    zi#                     @   sL  d Z ddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlZdddddd	d
dZddddddZ		d,dejdedededef
ddZdejdedededef
ddZdejdedejfddZd-dejdededejfd d!Zd"ed#efd$d%Zed&kre Zejd'd(d) ejd*d+d) e Zeej ej! dS dS ).z
Data preprocessing for LeWM TTS.
- Loads Modi dataset from HuggingFace Arrow format
- Segments long clips into 3-15 second chunks using silence detection
- Extracts mel spectrograms
- Saves processed dataset
    N)load_from_disk)Pathi]  i      P   i.  g      ?sample_raten_fft
hop_lengthn_melsf_minf_maxpowerg      @g      (@g       @333333?)min_durationmax_durationtarget_durationsilence_threshold_dbmin_silence_lenwaveformsrthreshold_dbr   returnc                 C   s  t d| }t d| }dt| | |  }g }t|D ]&}|| }	| |	|	|  }
tt|
d d }|dt|d   qt|}||k }g }t |d }d}d}t	|D ](\}}|ri|si|}d	}q\|s|r|| |kr|| d }|| }|| d}q\|S )
z+Find silence points in audio for splitting.g?{Gz?      g|=   Fr   T)
intlenrangenpsqrtmeanappendlog10array	enumerate)r   r   r   r   frame_lengthhopn_framesrms_dbistartframerms
is_silencesilence_points
min_frames
in_silencesilence_starts	mid_frame
mid_sample r7   #/home/ubuntu/lewm-tts/preprocess.pydetect_silence_points(   s6   

r9   textconfigc                 C   s4  t | | }||d kr||d kr| |fgS g S t| ||d |d d}dg| t | g }g }d}t|dd dD ]h\}	}
|
| | }||d	 kr| ||
 }t || }|d |  krh|d d
 krpn n|| n2||d d
 krt|d | }tdt ||D ]}||||  }t || |d kr|| q|
}q<|t | k r| |d }t || |d kr|| n|rt|d |g}t || |d d
 kr||d< g }tdd |D }t	|}d}|D ](}t || }t
dtt || }||||  }||7 }||| f q|S )zDSegment a long audio clip into shorter chunks at silence boundaries.r   r   r   r   )r   r   r   r   Nr   g333333?c                 s   s    | ]}t |V  qd S )N)r   ).0r4   r7   r7   r8   	<genexpr>   s    z segment_audio.<locals>.<genexpr>)r   r9   r&   r#   r   r   r    concatenatesumlistmaxstrip)r   r   r:   r;   total_durationr0   split_pointssegmentscurrent_startr+   pointcurrent_durationchunkchunk_durationmax_samplesjsub	remainingcombinedresultstotal_samples
text_charschar_idxseg
proportionn_charsseg_textr7   r7   r8   segment_audioQ   s`   
$
rY   c              	   C   sV   t jj|d |d |d |d |d |d |d d}|| }ttj|d	d
}|S )zExtract log-mel spectrogram.r   r   r	   r
   r   r   r   r   gh㈵>)min)
torchaudio
transformsMelSpectrogramtorchlogclamp)r   r;   mel_transformmellog_melr7   r7   r8   extract_mel   s   
rd   c           	      C   s   d|d  }t | }td| }tdt|| |D ]}t ||||  |kr, nqtt||| D ]}t ||| | |krG nq6td| }td|| }tt| || }| || S )z"Trim leading and trailing silence.
   r   r   r   g?)r    absr   r   r   rB   rZ   )	r   r   r   	thresholdabs_wavwindowr,   endpadr7   r7   r8   trim_silence   s   
rm   
input_pathoutput_pathc                 C   s  t d|  d t| }t|}|d }|d }|jddd |jddd g }d}d}tt|D ]}	||	 }
|
d }tj|d	 tjd
}|d }|
d }t	||}t
|||t}|D ]\\}}|scq\t|| }||7 }||dd }t|d}tt||| t|t}||dd }t|dt| ||t|t||t|d|jd d |d7 }q\|	d d dkrt d|	d  dt| d| d|d dd	 q2|d }t|ddd}tj||d d!d" W d#   n1 sw   Y  tt|t|d d!t|d$}|d% }t|d}tj||d!d& W d#   n	1 s*w   Y  t d'| d|d dd( t d)|  t d*|  d#S )+zMain preprocessing pipeline.zLoading dataset from z...audiomelsT)parentsexist_okg        r   r%   )dtypesampling_rater:   06dz.wavz.pt   r<   )id
audio_pathmel_pathr:   duration
mel_framesr   d   z  Processed /z
 samples, z segments, i  z.2fzh totalzmanifest.jsonwzutf-8)encodingFr   )ensure_asciiindentN)rb   segmentationtotal_segmentstotal_duration_hoursoriginal_sampleszconfig.json)r   z
Done! z hoursz
Manifest: zConfig: )printr   r   mkdirr   r   r    r%   float32rm   rY   
SEG_CONFIGr^   
from_numpy	unsqueezer[   savestrrd   
MEL_CONFIGsqueezer#   roundshapeopenjsondump)rn   ro   ds
output_dir	audio_dirmel_dirmanifestrD   idxr+   sample
audio_datar   r   r:   rF   seg_wavrX   r{   ry   
wav_tensorrb   rz   manifest_pathfr;   config_pathr7   r7   r8   preprocess_dataset   st   


	0r   __main__z--inputz/home/ubuntu/modi_dataset)defaultz--outputz$/home/ubuntu/lewm-tts/processed_data)r   r   )re   )"__doc__osr   numpyr    r^   r[   datasetsr   pathlibr   argparser   r   ndarrayr   floatrA   r9   r   dictrY   Tensorrd   rm   r   __name__ArgumentParserparseradd_argument
parse_argsargsinputoutputr7   r7   r7   r8   <module>   sV    

 )H Q