o
    im                     @   s   d dl Z d dlZeje   d dlZd dlmZ d dlm	Z	 d dl
mZ d dlZd dlmZ d dlmZ dd Zd	d
 Zedkr}dZdZg dZdZdde de ddddZee	ddde  Zede de d e  dS dS )    N)ProcessPoolExecutor)files)Path)ArrowWriter)tqdmc           	      C   s   g g }}t  }t| d}|D ]6}|d}t|d  }t|j	}|dk s/|dkr0q|
t|||d |
| |t| q|||fS )Nz*.wavz.normalized.txtrg?   )
audio_pathtextduration)setlistrglobwith_suffixopenreadstripsfinfor   appendstrupdate)		audio_dir
sub_result	durations	vocab_setaudio_listsline	text_pathr
   r    r   Z/home/ubuntu/.local/lib/python3.10/site-packages/f5_tts/train/datasets/prepare_libritts.pydeal_with_audio_dir   s   



r!   c                     s  g } g }t  }ttd g ttD ]}ttjt	|} fdd|
 D  qttdD ]}| \}}}| | || || q2   tjt s\tt  tdt d tt dd}	t| d	d
D ]}
|	|
 qs|	  W d    n1 sw   Y  tt dddd}tjd|i|dd W d    n1 sw   Y  tt dd}t|D ]	}||d  qW d    n1 sw   Y  tdt dt|   tdt dt|  tdt dt|d dd d S )N)max_workersc                    s&   g | ]}|  r t|qS r   )is_dirr   submitr!   ).0r   executorfuturesr   r    
<listcomp>-   s    zmain.<locals>.<listcomp>)totalz
Saving to z ...z
/raw.arrow)pathzWriting to raw.arrow ...)descz/duration.jsonwzutf-8)encodingr   F)ensure_asciiz
/vocab.txt
z
For z, sample count: zFor z, vocab size is: z, total i  z.2fz hours)r   r   r"   r   SUB_SETr   osr+   joindataset_diriterdirlenresultextendr   shutdownexistssave_dirmakedirsprintr   writefinalizer   jsondumpsorteddataset_namesum)r7   duration_listtext_vocab_setsubsetdataset_pathfuturer   r   r   writerr   fvocabr   r&   r    main"   sF   



$rM   __main__$   char)ztrain-clean-100ztrain-clean-360ztrain-other-500z<SOME_PATH>/LibriTTS	LibriTTS__ztrain-clean- ztrain-other-f5_ttsz../../z/data/z
Prepare for z, will save to r0   )r2   sysr+   r   getcwdr@   concurrent.futuresr   importlib.resourcesr   pathlibr   	soundfiler   datasets.arrow_writerr   r   r!   rM   __name__r"   	tokenizerr1   r4   r3   replacerC   r   joinpathr;   r=   r   r   r   r    <module>   s,    /&
