o
    §µiÑ7  ã                   @   sî   d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlm  m	Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ dd„ Zd	d
„ Zdd„ Z															d(dd„Zdd„ Zd)dd „Zd*d"d#„Zd$d%„ Zd&d'„ ZdS )+é    N)ÚPath)Útqdm)ÚECAPA_TDNN_SMALL)ÚMelSpec)Úconvert_char_to_pinyinc           
      C   sØ   t | ƒ}| ¡ }| ¡  g }|D ]Y}t| ¡  d¡ƒdkr*| ¡  d¡\}}}}}	n$t| ¡  d¡ƒdkrN| ¡  d¡\}}}}tj tj 	| ¡d|d ¡}	tj 
|¡s_tj tj 	| ¡|¡}| |||||	f¡ q|S )Nú|é   é   Úwavsú.wav)ÚopenÚ	readlinesÚcloseÚlenÚstripÚsplitÚosÚpathÚjoinÚdirnameÚisabsÚappend)
ÚmetalstÚfÚlinesÚmetainfoÚlineÚuttÚprompt_textÚ
prompt_wavÚgt_textÚgt_wav© r"   úJ/home/ubuntu/.local/lib/python3.10/site-packages/f5_tts/eval/utils_eval.pyÚget_seedtts_testset_metainfo   s   r$   c                 C   s¨   t | ƒ}| ¡ }| ¡  g }|D ]A}| ¡  d¡\}}}}	}
}| d¡\}}}tj ||||d ¡}|	 d¡\}}}tj ||||	d ¡}| |	||d| |f¡ q|S )Nú	ú-ú.flacú )	r   r   r   r   r   r   r   r   r   )r   Úlibrispeech_test_clean_pathr   r   r   r   Úref_uttÚref_durÚref_txtÚgen_uttÚgen_durÚgen_txtÚ
ref_spk_idÚref_chaptr_idÚ_Úref_wavÚ
gen_spk_idÚgen_chaptr_idÚgen_wavr"   r"   r#   Ú#get_librispeech_test_clean_metainfo$   s   r7   c                 C   sj   t  dd„ | D ƒ¡ ¡ }g }| D ]}tj|d||jd  fdd}| |¡ qt  |¡}| ddd¡}|S )Nc                 S   s   g | ]}|j d  ‘qS )éÿÿÿÿ)Úshape)Ú.0Úmelr"   r"   r#   Ú
<listcomp>;   s    z$padded_mel_batch.<locals>.<listcomp>r   r8   )Úvalueé   é   )	ÚtorchÚ
LongTensorÚamaxÚFÚpadr9   r   ÚstackÚpermute)Úref_melsÚmax_mel_lengthÚpadded_ref_melsr;   Úpadded_ref_melr"   r"   r#   Úpadded_mel_batch:   s   
rK   ç      ð?ÚpinyinTéÀ]  é   éd   é   Úvocosçš™™™™™¹?Fr?   éÈ   é   é(   c           /         s„  g }|| | }|| | }dgˆ  }‡ fdd„t dƒD ƒ\}}}}}}t||||||	d}t| ddD ]X\}}}}}t |¡\} }!t t t | ¡¡¡}"|"|
k rX| |
 |" } | j	d d	ksgJ d
|› dƒ‚|!|krvtj
 |!|¡}#|#| ƒ} t|d  d¡ƒdkr…|d }|| g}$|dkr•t|$|d}%n|$}%|| ƒ}&|& d¡}&|&j	d }'|rËt |¡\}(})|)|kr½tj
 |)|¡}#|#|(ƒ}(|'t|(j	d | | ƒ }*nt| d¡ƒ}+t| d¡ƒ},|'t|'|+ |, | ƒ }*|dksíJ dƒ‚||*  krø|ksn J d|› d|*| | › d|› d|› d	ƒ‚t |*| || d  ˆ  ¡}-||-  |¡ ||-  |"¡ ||-  |&¡ ||-  |'¡ ||-  |*¡ ||-  |%¡ ||-  |*7  < ||- |kr| ||- ||- t||- ƒ||- ||- ||- f¡ d||-< g g g g g g f\||-< ||-< ||-< ||-< ||-< ||-< q4t|ƒD ]#\}-}.|.dkr´| ||- ||- t||- ƒ||- ||- ||- f¡ q’t d¡ t |¡ |S )Nr   c                 3   s"    | ]}d d„ t ˆ ƒD ƒV  qdS )c                 S   s   g | ]}g ‘qS r"   r"   ©r:   r2   r"   r"   r#   r<   a   ó    z2get_inference_prompt.<locals>.<genexpr>.<listcomp>N)ÚrangerW   ©Únum_bucketsr"   r#   Ú	<genexpr>`   s   € 
ÿz'get_inference_prompt.<locals>.<genexpr>é   )Ún_fftÚ
hop_lengthÚ
win_lengthÚn_mel_channelsÚtarget_sample_rateÚmel_spec_typezProcessing prompts...)Údescr8   iˆ  zEmpty prompt wav: z, or torchaudio backend issue.zutf-8r?   r(   rM   )Ú	polyphonez*infer_batch_size should be greater than 0.zAudio z has duration zs out of range [z, z].iš  )rY   r   r   Ú
torchaudioÚloadr@   ÚsqrtÚmeanÚsquarer9   Ú
transformsÚResampler   Úencoder   ÚsqueezeÚintÚmathÚfloorr   ÚextendrK   Ú	enumerateÚrandomÚseedÚshuffle)/r   ÚspeedÚ	tokenizerre   rb   r^   r`   ra   r_   rc   Ú
target_rmsÚuse_truth_durationÚinfer_batch_sizer[   Úmin_secsÚmax_secsÚprompts_allÚ
min_tokensÚ
max_tokensÚbatch_accumÚuttsÚref_rms_listrG   Úref_mel_lensÚtotal_mel_lensÚfinal_text_listÚmel_spectrogramr   r   r   r    r!   Ú	ref_audioÚref_srÚref_rmsÚ	resamplerÚtextÚ	text_listÚref_melÚref_mel_lenÚgt_audioÚgt_srÚtotal_mel_lenÚref_text_lenÚgen_text_lenÚbucket_iÚbucket_framesr"   rZ   r#   Úget_inference_promptH   s²   

ÿú	


$ÿ
úÿ
ù€

úÿ€

r—   c                 C   sP  t | ƒ}| ¡ }| ¡  g }t|ƒD ]`}t| ¡  d¡ƒdkr,| ¡  d¡\}}}	}
}nt| ¡  d¡ƒdkrB| ¡  d¡\}}}	}
tj 	tj 
||d ¡¡sPqtj 
||d ¡}tj |	¡sjtj 
tj | ¡|	¡}	| ||	|
f¡ qt|ƒ}|dkr‚|d |fgS t|ƒ| d }g }t|ƒD ]}| || ||| |d | … f¡ q|S )Nr   r   r	   r   r?   r   )r   r   r   r   r   r   r   r   r   Úexistsr   r   r   r   rY   )r   Úgen_wav_dirÚgpusr   r   Ú	test_set_r   r   r   r   r    r!   r6   Únum_jobsÚwav_per_jobÚtest_setÚir"   r"   r#   Úget_seed_tts_testÔ   s.   (r    c                 C   sJ  t | ƒ}| ¡ }| ¡  g }t|ƒD ]]}| ¡  d¡\}	}
}}}}|r7| d¡\}}}tj ||||d ¡}ntj 	tj ||d ¡¡sKt
d|› ƒ‚tj ||d ¡}|	 d¡\}}}tj ||||	d ¡}| |||f¡ qt|ƒ}|dkr|d |fgS t|ƒ| d }g }t|ƒD ]}| || ||| |d | … f¡ q|S )Nr%   r&   r'   r   zGenerated wav not found: r?   r   )r   r   r   r   r   r   r   r   r   r˜   ÚFileNotFoundErrorr   r   rY   )r   r™   rš   r)   Úeval_ground_truthr   r   r›   r   r*   r+   r,   r-   r.   r/   r4   r5   r2   r6   r0   r1   r3   rœ   r   rž   rŸ   r"   r"   r#   Úget_librispeech_test÷   s.   (r£   Ú c                 C   sd   | dkrddl m} |tj |d¡dd}|S | dkr0ddlm} |d	kr'd
n|}||ddd}|S )NÚzhr   )Ú	AutoModelzparaformer-zhT)ÚmodelÚdisable_updateÚen)ÚWhisperModelr¤   zlarge-v3ÚcudaÚfloat16)ÚdeviceÚcompute_type)Úfunasrr¦   r   r   r   Úfaster_whisperrª   )ÚlangÚckpt_dirr¦   r§   rª   Ú
model_sizer"   r"   r#   Úload_asr_model  s   ûûr´   c                 C   s´  | \}}}}|dkrdd l }tj |¡ n|dkr!t|ƒtjd< ntdƒ‚t||d}ddl	m
} |tj
 }g }	ddlm}
 t|ƒD ]•\}}}|dkr`|j|d	d
d}|d d }| |d¡}n|dkr||j|ddd\}}d}|D ]	}|d |j }qr|}|}|D ]}| |d¡}| |d¡}q‚| dd¡}| dd¡}|dkr¶d dd„ |D ƒ¡}d dd„ |D ƒ¡}n|dkrÂ| ¡ }| ¡ }|
||ƒ}|j}|	 t|ƒj|||dœ¡ qB|	S )Nr¥   r   r©   ÚCUDA_VISIBLE_DEVICESzWlang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now.)r²   )Úpunctuation)Úprocess_wordsi,  T)ÚinputÚbatch_size_sÚdisable_pbarrŒ   zzh-cnr   )Ú	beam_sizeÚlanguager¤   r(   z  c                 S   ó   g | ]}|‘qS r"   r"   ©r:   Úxr"   r"   r#   r<   _  rX   zrun_asr_wer.<locals>.<listcomp>c                 S   r½   r"   r"   r¾   r"   r"   r#   r<   `  rX   )ÚwavÚtruthÚhypoÚwer)Úzhconvr@   r«   Ú
set_deviceÚstrr   ÚenvironÚNotImplementedErrorr´   Ú
zhon.hanzir¶   ÚstringÚjiwerr·   r   ÚgenerateÚconvertÚ
transcriberŒ   Úreplacer   ÚlowerrÃ   r   r   Ústem)ÚargsÚrankr±   rž   r²   rÄ   Ú	asr_modelr¶   Úpunctuation_allÚwer_resultsr·   r6   r   rÁ   ÚresrÂ   Úsegmentsr2   ÚsegmentÚ	raw_truthÚraw_hypor¿   ÚmeasuresrÃ   r"   r"   r#   Úrun_asr_wer2  s^   ÿ

üÿ	rÝ   c              	   C   s~  | \}}}d|› }t ddd d}tj|ddd„ d}|j|d	 d
d tj ¡ r+dnd
}|r4| |¡}| ¡  g }t|ƒD ]~\}	}
}t |	¡\}}t |
¡\}}|r]| |¡}| |¡}|dkrttj	j
|dd}|rp| |¡}||ƒ}|dkr‹tj	j
|dd}|r‡| |¡}||ƒ}t ¡  ||ƒ}||ƒ}W d   ƒ n1 s¢w   Y  t ||¡d  ¡ }| t|	ƒj|dœ¡ q>|S )Nzcuda:rO   Úwavlm_large)Úfeat_dimÚ	feat_typeÚconfig_pathTc                 S   s   | S )Nr"   )ÚstorageÚlocr"   r"   r#   Ú<lambda>  s    zrun_sim.<locals>.<lambda>)Úweights_onlyÚmap_locationr§   F)Ústricti€>  )Ú	orig_freqÚnew_freqr   )rÀ   Úsim)r   r@   rg   Úload_state_dictr«   Úis_availableÚevalr   rf   rk   rl   Úno_gradrC   Úcosine_similarityÚitemr   r   rÑ   )rÒ   rÓ   rž   r²   r­   r§   Ú
state_dictÚuse_gpuÚsim_resultsr6   r   rÁ   Úwav1Úsr1Úwav2Úsr2Ú	resample1Ú	resample2Úemb1Úemb2rê   r"   r"   r#   Úrun_sim|  sJ   








þþÿrü   )rL   rM   TrN   rO   rO   rP   rQ   rR   rS   Fr?   rT   rU   rV   )F)r¤   )rp   r   rt   rÊ   Úpathlibr   r@   Útorch.nn.functionalÚnnÚ
functionalrC   rf   r   Úf5_tts.eval.ecapa_tdnnr   Úf5_tts.model.modulesr   Úf5_tts.model.utilsr   r$   r7   rK   r—   r    r£   r´   rÝ   rü   r"   r"   r"   r#   Ú<module>   sJ    
ð 
#
%J