o
    i                     @   s  d dl Z d dlZeje   d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZ h dZd	d
gZh dZg dZdd Zdd ZedkrdZdZdZddgZdZdde de Ze e	d!dde  Z"e#de de" d e  dS dS )     N)ProcessPoolExecutor)files)Path)ArrowWriter)tqdm)convert_char_to_pinyinrepetition_found>   ZH_B00041_S06226ZH_B00042_S09204ZH_B00065_S09430ZH_B00065_S09431ZH_B00066_S09327ZH_B00066_S09328   い   て>G   EN_B00013_S00913EN_B00042_S00120EN_B00055_S04111EN_B00059_S00092EN_B00059_S00950EN_B00059_S03913EN_B00059_S06227EN_B00059_S08397EN_B00060_S05389EN_B00060_S07290EN_B00061_S00693EN_B00061_S01494EN_B00061_S02400EN_B00061_S03375EN_B00061_S05386EN_B00061_S06983EN_B00061_S07060EN_B00061_S08286EN_B00061_S09504EN_B00061_S09694EN_B00062_S04187EN_B00062_S08995EN_B00063_S02859EN_B00063_S04297EN_B00063_S04614EN_B00063_S06860EN_B00064_S01262EN_B00064_S05954EN_B00065_S00497EN_B00065_S05444EN_B00065_S05725EN_B00065_S09873EN_B00065_S09922EN_B00066_S06544EN_B00067_S05066EN_B00067_S05623EN_B00068_S06467EN_B00069_S04036EN_B00069_S07628EN_B00070_S04089EN_B00070_S04343EN_B00071_S07665EN_B00072_S01739EN_B00072_S08620EN_B00073_S06399EN_B00073_S09236EN_B00074_S09659EN_B00075_S01547EN_B00076_S01262EN_B00076_S02943EN_B00076_S06944EN_B00076_S07135EN_B00076_S09127EN_B00078_S05127EN_B00079_S02901EN_B00079_S04698EN_B00082_S06192EN_B00083_S03875EN_B00084_S02463EN_B00085_S05618EN_B00087_S00432EN_B00087_S03811EN_B00089_S00946EN_B00089_S07349EN_B00091_S01238EN_B00092_S03643EN_B00096_S08653EN_B00100_S03759EN_B00104_S01666EN_B00106_S08060EN_B00111_S04300)u   اr   r   c              
      s  |  d}g g }}t }d}d}t|d}| }t||j dD ]}	t|	}
|
d  |
d dkr_|
d d	d
 t	v sNt
 fddtD sNt rS|d
7 }q# tdddd |
d dkr|
d d	d
 tv st
 fddtD st ddr|d
7 }q#tdkrt gtdd  |
d }|t| j|
d   |d || |t  q#W d    n1 sw   Y  |||||fS )Nz.jsonlr   rdesctextlanguagezhwav/   c                 3       | ]}| v V  qd S N .0fr[   rc   X/home/ubuntu/.local/lib/python3.10/site-packages/f5_tts/train/datasets/prepare_emilia.py	<genexpr>{       z&deal_with_audio_dir.<locals>.<genexpr>u   ，u   ！u   ？),!?enc                 3   ra   rb   rc   rd   rg   rc   rh   ri      rj      )lengthpinyin)	polyphoneduration)
audio_pathr[   rs   )with_suffixsetopen	readlinesr   stemjsonloadssplitout_zhany
zh_filtersr   	translatestr	maketransout_en
en_filters	tokenizerr   rr   appendparentupdatelist)	audio_diraudio_jsonl
sub_result	durations	vocab_setbad_case_zhbad_case_enrf   lineslineobjrs   rc   rg   rh   deal_with_audio_diro   sD   


4
 
r   c                     s^  t dv sJ g } g }t }d}d}ttd g tD ]}ttjt	|} fdd|
 D  qttdD ]" \}}}	}
}| | || ||	 ||
7 }||7 }q:   tjt sntt  tdt d tt d	d
}t| ddD ]}|| q|  W d    n1 sw   Y  tt dddd}tjd|i|dd W d    n1 sw   Y  tt dd}t|D ]	}||d  qW d    n1 sw   Y  tdt dt|   tdt dt|  tdt dt|d dd dtv rtd|  d tv r-td!| d d S d S )"N)rq   charr   )max_workersc                    s&   g | ]}|  r t|qS rc   )is_dirr   submitr   )re   r   executorfuturesrc   rh   
<listcomp>   s    zmain.<locals>.<listcomp>)totalz
Saving to z ...z
/raw.arrow)pathzWriting to raw.arrow ...rY   z/duration.jsonwzutf-8)encodingrs   F)ensure_asciiz
/vocab.txt
z
For z, sample count: zFor z, vocab size is: z, total i  z.2fz hoursZHzBad zh transcription case: ENzBad en transcription case: )r   rv   r   r   langsr   osr   joindataset_diriterdirr   lenresultextendr   shutdownexistssave_dirmakedirsprintr   writefinalizerw   rz   dumpsorteddataset_namesum)r   duration_listtext_vocab_settotal_bad_case_zhtotal_bad_case_enlangdataset_pathr   r   r   r   r   writerr   rf   vocabrc   r   rh   main   sZ   





 

r   __main__    rq   Tr   r   z<SOME_PATH>/Emilia_Dataset/rawEmilia__f5_ttsz../../z/data/z
Prepare for z, will save to r   )$r   sysr   r   getcwdrz   concurrent.futuresr   importlib.resourcesr   pathlibr   datasets.arrow_writerr   r   f5_tts.model.utilsr   r   r}   r   r   r   r   r   __name__r   r   rr   r   r   r   r   r   joinpathr   r   rc   rc   rc   rh   <module>   s6   J$<
