o
    ikF                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddlZ	ddl
Z
ddlmZ ddlm  mZ ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ dd	l m Z  dd
l!m"Z" ej#$ej#%ej#&e' d ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/ e0dj1Z1e
2d dd Z3d!ddZ4dd Z5	d"ddZ6G dd dZ7dd Z8e9d kre8  dS dS )#ae   Example Usage
torchrun --nproc_per_node=1 benchmark.py --output-dir $log_dir --batch-size $batch_size --enable-warmup --split-name $split_name --model-path $CKPT_DIR/$model/model_1200000.pt --vocab-file $CKPT_DIR/$model/vocab.txt --vocoder-trt-engine-path $vocoder_trt_engine_path --backend-type $backend_type --tllm-model-dir $TRTLLM_ENGINE_DIR || exit 1
    N)load_dataset)hf_hub_download)trt_dtype_to_torch)logger)Session
TensorInfo)
DataLoaderDistributedSampler)tqdm)Vocosz/../../../../src/)padded_mel_batch)get_vocos_mel_spectrogram)convert_char_to_pinyinget_tokenizerlist_str_to_idxz(model_repo_f5_tts.f5_tts.1.f5_tts_trtllmc                  C   s  t jdd} | jdtdg ddd | jdd	td
d | jdd	tdd | jdd	tdd | jdd	tdd | jdd	tdd | jdtddd | jdtd dd | jddtdd | jdd tdd | jd d!d" | jd#d!d" | jd$d!d%d& | jd'td(d)d*gd+d |  }|S ),Nzextract speech code)descriptionz--split-namewenetspeech4tts)r   test_zhtest_en	test_hardzhuggingface dataset split name)typedefaultchoiceshelpz--output-dirTzdir to save result)requiredr   r   z--vocab-filez
vocab filez--model-pathz"model path, to load text embeddingz--tllm-model-dirztllm model dirz--batch-sizez%batch size (per-device) for inferencez--num-workersr   zworkers for dataloader)r   r   r   z
--prefetchzprefetch for dataloaderz	--vocodervocoszvocoder name)r   r   r   z--vocoder-trt-engine-pathzvocoder trt engine pathz--enable-warmup
store_true)actionz--remove-input-paddingz
--use-perfzuse nvtx to record performance)r   r   z--backend-typetritontrtpytorchzbackend type)argparseArgumentParseradd_argumentstrint
parse_args)parserargs r)   Z/home/ubuntu/.local/lib/python3.10/site-packages/f5_tts/runtime/triton_trtllm/benchmark.pyget_args@   sj   r+   cudaFc                 C   s  |r	t jjd d}d}g g g g g g f\}}}}	}
}t| D ]\}}|d |d |d }}}|| |||  |d d |d d	 }}t |d
 }t 	t 
t |}|| ||k rn|| | }||kr~tj||}||}n|}|rt jjd|  |d}t|d
}|rt jj  |jd }|jd
 dksJ || |	| |
t|dt|dt|d    qt|}t |	}t|dd}t||}|rt jj  ||||||
dS )Ndata_collator]  皙?idprompt_texttarget_textprompt_audioarraysampling_rater   zmel_spectrogram r,   d      zutf-8T)	polyphone)idsref_rms_listref_mel_batchref_mel_len_batchtext_pad_sequence"estimated_reference_target_mel_len)torchr,   nvtx
range_push	enumerateappend
from_numpy	unsqueezefloatsqrtmeansquare
torchaudio
transformsResampletor   squeeze	range_popshaper%   lenencoder   
LongTensorr   r   )batchvocab_char_mapdeviceuse_perftarget_sample_rate
target_rmsr:   r;   ref_mel_listref_mel_len_listr?   reference_target_texts_listiitemitem_idr1   r2   ref_audio_orgref_srref_rms	resampler	ref_audioref_melref_mel_lenr<   r=   pinyin_listr>   r)   r)   r*   r-   x   s~   		









&

r-   c                  C   sp   t tjdd} t tjdd}t tjdd}td|d||   tj| t	
d | ||fS )	N
WORLD_SIZEr8   
LOCAL_RANKr   RANKz'Inference on multiple gpus, this gpu {}z, rank {}, world_size {}nccl)r%   osenvirongetprintformatr@   r,   
set_devicedistinit_process_group)
world_size
local_rankrankr)   r)   r*   init_distributed   s   

rx   r    c                 C   s   | dkrp|d urt |d}|S |r#td|  | d}| d}ntd d}	t|	|dd	}t|	|d
d	}t|}tj|ddd}
ddlm} t	|j
|rbdd |j
j  D }|
| ||
 | |}|S | dkrxtd|S )Nr   )engine_pathzLoad vocos from local path z/config.yamlz/pytorch_model.binz8Download Vocos from huggingface charactr/vocos-mel-24khzzcharactr/vocos-mel-24khzzconfig.yaml)repo_id	cache_dirfilenamezpytorch_model.bincpuT)map_locationweights_onlyr   )EncodecFeaturesc                 S   s   i | ]	\}}d | |qS )zfeature_extractor.encodec.r)   ).0keyvaluer)   r)   r*   
<dictcomp>   s    z load_vocoder.<locals>.<dictcomp>bigvganzBigVGAN is not implemented yet)VocosTensorRTrp   r   r   from_hparamsr@   loadvocos.feature_extractorsr   
isinstancefeature_extractorencodec
state_dictitemsupdateload_state_dictevalrN   NotImplementedError)vocoder_nameis_local
local_pathrW   hf_cache_dirvocoder_trt_engine_pathvocoderconfig_path
model_pathr{   r   r   encodec_parametersr)   r)   r*   load_vocoder   s4   




r   c                   @   s   e Zd ZdddZdd ZdS )r   ./vocos_vocoder.planNc                 C   s   t t jj}t j|dd td|  || _t|d}| }W d    n1 s-w   Y  t	
|| _|d urA|| _d S tj j| _d S )Nry   )	namespacezLoading vocoder engine from rb)r   LoggerWARNINGinit_libnvinfer_pluginsr   inforz   openreadr   from_serialized_enginesessionr@   r,   current_streamcuda_streamstream)selfrz   r   
TRT_LOGGERfengine_bufferr)   r)   r*   __init__   s   
$zVocosTensorRT.__init__c                 C   sd   |  }d|i}| jtdtjj|jg}dd |D }| j||| j	}|s,J d|d }|S )Nmelc                 S   s,   i | ]}|j tjt|jt|jd dqS )r,   )dtyperW   )namer@   emptytuplerQ   r   r   )r   tr)   r)   r*   r     s     z(VocosTensorRT.decode.<locals>.<dictcomp>z(Runtime execution failed for vae sessionwaveform)

contiguousr   infer_shapesr   r   DataTypeFLOATrQ   runr   )r   melsinputsoutput_infooutputsoksamplesr)   r)   r*   decode  s   zVocosTensorRT.decode)r   N)__name__
__module____qualname__r   r   r)   r)   r)   r*   r      s    

r   c            ,         s  t   tj jdd tj sJ t \} }}td| }t	 j
d\} j}ttj|d}t|}W d    n1 sDw   Y   jdkrYt|d| j|d}n8 jd	krd
dlm}	 d
dlm}
 |d }t|d |d |d |d |d |d |d |d d}|	|
| j}t j| jd}td jdddd }|jddd j rŇfddt!d D }t"#|| d!krt$| |d"}nd }t% j&|d j' j( fd#d$d%}t)} j*ra|D ]p}|d& +||d' +|}}|d( +|}|d) }t,-|d
d
d
t.||j/d!  d
d
f} jdkr1|j0|||| j1d*}q jd	kr`tj2||d+}t3  |j0|||d,d-d.d/\}}W d    n	1 s[w   Y  q|d
krmt4|d0d1d2}d
}d
}d
} j r~tj5 6  t77 }|D ];} j rtjj89d3 |d& +||d' +|}}|d( +|}|d) }t,-|d
d
d
t.||j/d!  d
d
f} j rtjj8:   jdkr|j0|||| j1 j d4\}} n: jd	krtj2||d+}t3   t77 }!|j0||||d,d-d.d5\}}t77 |! } W d    n	1 sw   Y  || 7 }t77 }"d6}#d7}$t;|D ]}\}%}&|&||% ||% d d f <d
}&|&=d
d8d!+tj>}' jd9krl j rZtjj89d: |?|'@ }( j rktjj8:  n	||'Ad
@ }(|d; |% |#k r|(|d; |%  |# }(|d< |% })tBC j d=|) d>|(|$ ||(j/d! |$ 7 }q)|t77 |" 7 }|d
kr|D| t)|d<   qt77 | }|d
kr|E  || }*d?|*d@dA}+|+dB|dCdD7 }+|+dE|dF dGdH7 }+|+dI|dCdJ|dF dGdH7 }+|+dK|dCdJ|dF dGdH7 }+|+dL|dCdJ|dF dGdH7 }+|+dM j& dA7 }+tF|+ t j dNdO}|G|+ W d    n	1 sCw   Y  tHI  tHJ  d S )PNT)exist_okzcuda:customzconfig.jsonr   F)
debug_modetllm_model_dirr   
vocab_sizer    r   )
load_model)DiTpretrained_confighidden_sizenum_hidden_layersnum_attention_headsff_multtext_dimtext_mask_paddingconv_layerspe_attn_head)dimdepthheadsr   r   r   r   r   )r   rW   r   zyuekai/seed_tts)splittrust_remote_codec                 S   sN   | d d j d }dt| d t| d   }|| }|| d d  | d< | S )	Nr3   r4   r   r8   r2   r1   r5   estimated_duration)rQ   rR   )exampleprompt_audio_lenscale_factorr   r)   r)   r*   add_estimated_durationE  s
   z$main.<locals>.add_estimated_durationr   )reversec                    s   g | ]}  d gqS )   )select)r   r^   )datasetr)   r*   
<listcomp>P  s    zmain.<locals>.<listcomp>   r8   )num_replicasrw   c                    s   t |  jdS )N)rX   )r-   rX   )x)r(   rV   r)   r*   <lambda>a  s    zmain.<locals>.<lambda>)
batch_sizesamplershufflenum_workersprefetch_factor
collate_fnr<   r=   r>   r?   )remove_input_padding)rW       g       @r6   )condtextdurationstepscfg_strengthsway_sampling_coef
Processingwavs)totaldescunitzdata sample)r   rX   )r   r   r   lensr   r   r   r/   r.      r   zvocoder decoder;   r:   /z.wavzRTF: z.4f
ztotal_duration: z.3fz	 seconds
(i  z.2fz hours)
z
DiT time: z
 seconds (zVocoder time: ztotal decoding time: zbatch size: z/rtf.txtw)Kr+   rm   makedirs
output_dirr@   r,   is_availablerx   rW   r   
vocab_filer   r   pathjoinjsonr   backend_typeF5TTSr   f5_tts.infer.utils_inferr   f5_tts.modelr   dictr   r   r   r   
split_namemapsortrX   rangedatasetsconcatenate_datasetsr	   r   r   r   prefetchrR   enable_warmuprN   FpadmaxrQ   sampler   tensorinference_moder
   cudartcudaProfilerStarttimerA   rB   rP   rC   rF   permutefloat32r   r~   rO   rK   saver   closerp   writers   barrierdestroy_process_group),ru   rv   rw   rW   r   r   r   tllm_model_configmodelr   r   r   pt_model_configr   r   dataset_list_shortr   
dataloadertotal_stepsrU   ref_melsref_mel_lenstext_pad_seqtotal_mel_lenscond_pad_seq_	generatedprogress_bardecoding_timevocoder_timetotal_durationtotal_decoding_time	cost_time
start_timevocoder_start_timerZ   rY   r^   gengen_mel_specgenerated_waveuttrtfsr)   )r(   r   rV   r*   main  sZ  





&



&

	"

rB  __main__)r,   F)r   Fry   r,   NN):__doc__r!   	importlibr	  rm   sysr  r  tensorrtr   r@   torch.distributeddistributedrs   torch.nn.functionalnn
functionalr  rK   r   huggingface_hubr   tensorrt_llm._utilsr   tensorrt_llm.loggerr   tensorrt_llm.runtime.sessionr   r   torch.utils.datar   r	   r
   r   r   r  rD   dirnameabspath__file__f5_tts.eval.utils_evalr   f5_tts.model.modulesr   f5_tts.model.utilsr   r   r   import_moduler  manual_seedr+   r-   rx   r   r   rB  r   r)   r)   r)   r*   <module>   sL   "

8O
" H
