o
    i                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
m
Z
 d dlmZ e Zg dZd	d
 Zdd ZedkrhdZdZdZde Zeeddde  Zede de d e  dS dS )    N)ProcessPoolExecutor)files)Path)ArrowWriter)tqdm)repetition_found)u   اu   いu   てc           
   	      s   g g t  }}}d}|  D ]f}|jdkrut|dR}t|}|d  t fddtD s6t ddrB|d	7 }	 W d    q|d
 }|	d}	|	
 rf|t|	 |d || |t  W d    n1 spw   Y  q||||fS )Nr   z.jsonrtextc                 3   s    | ]}| v V  qd S )N ).0fr	   r
   [/home/ubuntu/.local/lib/python3.10/site-packages/f5_tts/train/datasets/prepare_emilia_v2.py	<genexpr>   s    z*process_audio_directory.<locals>.<genexpr>   )length   durationz.mp3)
audio_pathr	   r   )setiterdirsuffixopenjsonloadany
en_filtersr   with_suffixexistsappendstrupdatelist)
	audio_dir
sub_result	durations	vocab_setbad_case_enfiler   objr   
audio_filer
   r   r   process_audio_directory   s*   

"

r+   c                  C   s  t dv sJ g g t } }}d}ttd}g }tt}| D ]}| r/||	t
| q t|t|dD ]}| \}	}
}}| |	 ||
 || ||7 }q8|  tjt sgtt  tt dd}t| ddD ]}|| qv|  W d    n1 sw   Y  tt d	d
dd}tjd|i|dd W d    n1 sw   Y  tt dd
}t|D ]	}||d  qW d    n1 sw   Y  tdt dt|   tdt dt|  tdt dt|d dd td| d d S )N)pinyincharr   )max_workers)totalz
/raw.arrow)pathzWriting to raw.arrow ...)descz/duration.jsonwzutf-8)encodingr   F)ensure_asciiz
/vocab.txt
zFor z, sample count: z, vocab size is: z, total i  z.2fz hourszBad en transcription case: ) 	tokenizerr   r   r.   r   dataset_dirr   is_dirr   submitr+   r   lenresultextendr!   shutdownosr0   r   save_dirmakedirsr   writefinalizer   r   dumpsortedprintdataset_namesum)r;   duration_listtext_vocab_settotal_bad_case_enexecutorfuturesdataset_pathsub_dirfuturer$   r%   r&   r'   writerliner   vocabr
   r
   r   main,   sH   





 rS   __main__    r-   z+/home/ubuntu/emilia-dataset/Emilia-YODAS/EN
Emilia_EN_f5_ttsz../../z/data/zPrepare for z, will save to r5   )r   r>   concurrent.futuresr   importlib.resourcesr   pathlibr   datasets.arrow_writerr   r   f5_tts.model.utilsr   r   out_enr   r+   rS   __name__r.   r6   r7   rF   r    joinpathr?   rE   r
   r
   r
   r   <module>   s*   *

