o
    }oi{                  
   @   sb  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZmZ d dlZedZdZzd dlmZ d dlmZ W n" eyg Z zed	eej d
e  dZW Y dZ[ndZ[ww dZzd dlmZ d dlmZ d dlm Z m!Z! W n" ey Z zedeej d
e  dZW Y dZ[ndZ[ww dZ"zd dl#m$Z$ W n" ey Z zedeej d
e  dZ"W Y dZ[ndZ[ww dZ%zd dl&m'Z' W n" ey Z zedeej d
e  dZ%W Y dZ[ndZ[ww G dd deZ(eG dd dZ)eG dd dZ*dd Z+de,dee- de,fdd Z.	!		"	"							#	#	#	$	%												d/deee) ee* f fd&d'Z/	#	"	#	$	%					d0deee) ee* f fd(d)Z0d*d+ Z1d,d- Z2ed.krz
e1 Z3e2e3 W dS  e(y Z ze4e  edZ[w e j5y Z ze4e  edZ[ww dS )1    N)	dataclass)Path)DictListOptionalTupleNeMoT)DeployPyTriton)NemoQueryLLMz8Cannot import Triton, deployment will not be available. z: F)CommonInferenceParams)NemoQueryLLMPyTorch)MegatronLLMDeployMegatronLLMDeployableNemo2zCannot import MegatronLLMDeploy* classes, or NemoQueryLLMPyTorch, or CommonInferenceParams, in-framework inference will not be available. Reason: )TensorRTLLMzBCannot import the TensorRTLLM exporter, it will not be available. )vLLMExporterz;Cannot import the vLLM exporter, it will not be available. c                   @   s   e Zd ZdS )
UsageErrorN)__name__
__module____qualname__ r   r   L/home/ubuntu/.local/lib/python3.10/site-packages/tests/export/nemo_export.pyr   @   s    r   c                   @   s.   e Zd ZU dZee ed< dZee ed< dS )FunctionalResultNregular_passdeployed_pass)r   r   r   r   r   bool__annotations__r   r   r   r   r   r   D   s   
 r   c                   @   s6   e Zd ZU eed< eed< eed< eed< eed< dS )AccuracyResultaccuracyaccuracy_relaxeddeployed_accuracydeployed_accuracy_relaxedevaluation_timeN)r   r   r   floatr   r   r   r   r   r   J   s   
 r   c                 C   sV  d}d}d}d}g }	g }
t |d}t|}t }|D ]}|d }|d   }|	| | d urtrPt	| t
rP| j|gtdddddd	d
}|d j}n| j|gdddd||d}|d d   }|
| ||krt|d7 }||ks||s||rt|dkrt|dkrq|d7 }|d urtrt	|tr|j|gddddd}|d d d d d dd    }n|j|gdddd|d}|d d   }||kr|d7 }||ks||s||rt|dkrt|dkrq|d7 }qt }W d    n	1 sw   Y  t|t|	 |t|	 |t|	 |t|	 || dS )Nr   rtext_before_last_word	last_wordg?           F)temperaturetop_ktop_pnum_tokens_to_generatereturn_log_probs)promptsinference_params)input_textsmax_output_lenr)   r*   r(   task_ids	lora_uids)r-   
max_lengthr)   r*   r(   choicestext)r-   r0   r)   r*   r(   task_id)r   r   r   r    r!   )openjsonloadtime	monotonicstriplowerappendin_framework_supported
isinstancer   generater   generated_textforward
startswithlenr   	query_llmr   )modelnqr1   r2   test_data_pathcorrect_answerscorrect_answers_deployedcorrect_answers_relaxed correct_answers_deployed_relaxedall_expected_outputsall_actual_outputsfilerecords
eval_startrecordpromptexpected_outputmodel_outputdeployed_outputeval_endr   r   r   get_accuracy_with_lambadaS   s   


	
*	
R



rY   	streamingexpected_outputsreturnc                 C   s`   | rt |dkr
dS |d }t |t |krdS tt |D ]}|| || d vr- dS qdS )Nr   FT)rE   range)rZ   model_outputsr[   ir   r   r   check_model_outputs   s   ra         r&   r'         ?c#           3      C   s  |!d u ri }!|"d u ri }"t | r|tj kr)td|| |tj  dS t |jddd |rMtd td td td td|| | d }#d }$d}%|rot | ri|}#d	}%d
g}$|rhtd ntd dS d }&d }'d }(d })|rt | r|g}&g d}'d}(dg})|rtd ntd dS |rt }*|*j	d,||||||
| t
jd|" n/t||&dd}*|r|*j||||
||d n|*j	d,|||||
|
| |||%|(|)||	|| d|! |r|*jd
|#d |*j||||||$|'||d	}+t|+}+t },t
jrd|,_t||+|std d|,_d}-|r3|(s3|s3|s3t|ddd}.|.j|||||d}-d }/d }0d}1|rvt|*| dd}0|0  |0  td| d}/|/j||d d!d"|'d#}1t|1}1t
jrvd|,_t||1|svtd$ d|,_|s|,jdks|,jdkrtd td%| td td&| td td'|+ td td(|1 td td td)|- td d }2|rtd* t|*|/|$|'|}2|r|0  |s|rt | |,|2fS t!d+|)-NzUPath: {0} and model: {1} with {2} tps won't be tested since available # of gpus = {3})NNT)parentsexist_ok n################################################## NEW TEST ##################################################z4Path: {0} and model: {1} with {2} tps will be testedr   i    0z---- PTuning enabled.z8---- PTuning could not be enabled and skipping the test.)ri   z-1ri   bfloat16attn_qkvz---- LoRA enabled.z5---- LoRA could not be enabled and skipping the test.)nemo_checkpoint	model_dir
model_typetensor_parallel_sizepipeline_parallel_sizemax_model_lengpu_memory_utilizationF)
load_model)hf_model_pathmax_batch_sizetensor_parallelism_sizemax_input_lenmax_num_tokensrn   )nemo_checkpoint_pathrn   rv   pipeline_parallelism_sizerw   max_seq_lenru   use_parallel_embeddingmax_prompt_embedding_table_sizeuse_lora_pluginlora_target_modulesrx   use_embedding_sharingfp8_quantizedfp8_kvcache)	task_name!prompt_embeddings_checkpoint_path)	r/   r0   r)   r*   r(   r1   r2   rZ   stop_words_listz.Model outputs don't match the expected result.)rs   use_python_runtime)r/   r0   r)   r*   r(   @  rG   triton_model_name	http_portlocalhost:8000url
model_namer&   r'   rd   )r-   r0   r)   r*   r(   r2   z7Deployed model outputs don't match the expected result.z--- Prompt: z--- Expected keywords: z--- Output: z--- Output deployed: z--- Output with C++ runtime:  Start model accuracy testing ..."Checkpoint {0} could not be found.r   )"r   existstorchcudadevice_countprintformatmkdirr   exportargsrr   r   export_hf_modeladd_prompt_tablerC   listr   functional_testr   ra   LOGGERwarningr	   deployrunr
   rF   r   rY   stopshutilrmtree	Exception)3r   rn   r-   r[   checkpoint_pathrm   use_vllmuse_huggingfaceru   r   rw   r0   rx   r|   ptuningp_tuning_checkpointloralora_checkpointtp_sizepp_sizer)   r*   r(   run_accuracydebugrZ   r   test_cpp_runtimetest_deploymentrI   save_enginer   r   trt_llm_export_kwargsvllm_export_kwargsr   r1   r}   lora_ckpt_listr2   r~   r   exporteroutputfunctional_result
output_cppexporter_cpprH   nmoutput_deployedaccuracy_resultr   r   r   run_inference   sb  %	








r   c                 C   s   t | rq|	r td td td td td||  tj||||d}t|| dd}|  |  t	d| d}|j
|||||d	}|d
 d d }t|}td| d }|ritd t||d d |
}|  d |fS td|)Nrg   rh   z'Path: {0} and model: {1} will be tested)enable_flash_decodelegacy_ckptr   r   r   r   )r-   r)   r*   r(   r3   r4   r   r5   z
 --------- Output: r   r   )r   r   r   r   r   get_deployabler	   r   r   r   rF   r   rY   r   r   )r   r-   r   num_gpusr0   r)   r*   r(   r   r   rI   r   r   deployed_modelr   rH   r   r   r   r   r   run_in_framework_inference  sB   

r   c               
   C   s  t jt jdd} | jdtdd | jdtdd | jdtd	dd
 | jdtd | jdtd	d | jdtddd
 | jdtd | jdtdd | jdtdd | jdtdd | jdtd | jdtdd | jdtd | jdtdd | jdtd | jdtdd | jdtd	d | jd td!d | jd"td#d | jd$tdd | jd%td&d | jd'dd(d) | jd*tdd | jd+tdd | jd,tdd | jd-dd(d) | jd.td d | jd/tdd | jd0tdd | jd1tdd | jd2tdd | jd3tdd | jd4tdd5d6 | jd7d8d9td:d; | jd<d=d>td?d; | jd@dAd>tdBd; | jdCi tjdDd; | jdEi tjdFd; | 	 }d^dGtdHtdIt
dJtt
 fdKdL}t|j dMkr`d n|j|_|dN|j|_|dO|j|_|dP|j|_|dQ|j|_|dR|j|_|dS|j|_|dT|j|_|dU|j|_|dV|j|_|dW|j|_|dX|j|_|dY|j|_|dZ|jdd[|_|d\|jdd[|_|d]|j|_|S )_Nz5Deploy nemo models to Triton and benchmark the models)formatter_classdescriptionz--model_nameT)typerequiredz--model_typeFz	--min_tpsr&   )r   defaultr   z	--max_tps)r   z--pps)r   r   z--checkpoint_dirz/tmp/nemo_checkpoint/z--model_dirz--max_batch_sizerb   z--max_input_len   z--max_output_lenrc   z--max_num_tokensz--use_parallel_embeddingFalsez--p_tuning_checkpointz	--ptuningz--lora_checkpointz--loraz--top_kz--top_pr'   z--temperaturerd   z--run_accuracyz--accuracy_thresholdg      ?z--streaming
store_true)r   actionz--test_cpp_runtimez--test_deploymentz--functional_testz--debugz--test_data_pathz--save_enginez
--use_vllmz--use_huggingfacez--enable_flash_decodez--in_frameworkz--legacy_ckptzFLoad checkpoint saved with TE < 1.14 (only for in-framework inference))r   r   helpz-gmuz--gpu_memory_utilizationgffffff?z+GPU memory utilization percentage for vLLM.)r   r   r   z-fp8z--export_fp8_quantizedautoz7Enables exporting to a FP8-quantized TRT LLM checkpointz-kv_fp8z--use_fp8_kv_cachez1Enables exporting with FP8-quantizatized KV-cachez--trt_llm_export_kwargsz4Extra keyword arguments passed to TensorRTLLM.exportz--vllm_export_kwargsz5Extra keyword arguments passed to vLLMExporter.exportnamesoptionalr\   c                 S   sb   |  }ddg}ddg}|dkrdS ||v rdS ||v rdS |r&|dkr&d S td	|  d
| d)Ntrue1falseri   rg   FTr   z%Invalid boolean value for argument --z: '')r=   r   )r   r   r   true_stringsfalse_stringsr   r   r   str_to_bool  s   zget_args.<locals>.str_to_boolnoner   r   r   r   r   r   r   r   r   r   r|   in_frameworkexport_fp8_quantized)r   use_fp8_kv_cacher   )F)argparseArgumentParserArgumentDefaultsHelpFormatteradd_argumentstrintr"   r8   loads
parse_argsr   r   rn   r=   r   r   r   r   r   r   r   r   r   r   r|   r   r   r   r   )parserr   r   r   r   r   get_args  s    r   c                 C   s,  | j s| jststd| j rtstd| jrtstd| j r+| js'| jr+td| jr4t	s4td| j
r@| jd u r@td| jd u rI| j| _| j rV| j| jkrVtd| jr_ttj i }| jsm| jd u rmtdd	d
g}ddg}| j}|| jkr1| jrt| j|| j|| j| j| j| j| j
| j| j| j| jd||< ntdIi d| jd| jd|d|d| jd| jd| j d| j d|d| j!d| j"d| j#d| jd| j$d| j%d| jd| j&d| jd | j'd!| jd"| jd#| jd$| j
d%| jd&| j(d'| jd(| j)d)| jd*| j*d+| j+d,| j,d-| j-d.| j.||< |d/ }|| jks~d0}d0}d1}t/d2 | joC| jd3k}|0 D ]\}	}
|
\}}|rWt/d4 d5}d6t1t2 fd7d8}t/d9|	  | j3r|d urt/d:||j4  t/d;||j5  |j4d1krd<}|j5d1krd<}| j
r|d urt/d=|j6d> t/d?|j7d> t/d@|j8d> t/dA|j9d> t/dB|j:dC |r|j9| j;k s|s|j7| j;k rd<}qHt/dD | j3rt/dE|  | j
rt/dF|  |d<krtdG|d<krtdH| j; d S )JNz9TensorRT-LLM engine is not supported in this environment.z1vLLM engine is not supported in this environment.z<In-framework inference is not supported in this environment.zAThe vLLM integration currently does not support P-tuning or LoRA.zWDeployment tests are not available because Triton is not supported in this environment.z8Accuracy testing requires the --test_data_path argument.zvLLM doesn't support changing tensor parallel group size without relaunching the process. Use the same value for --min_tps and --max_tps.z7When using custom checkpoints, --model_dir is required.zThe capital of France iszLargest animal in the sea isParisz
blue whale)r   r-   r   r   r0   r)   r*   r(   r   r   rI   r   r   r   rn   r-   r[   r   rm   r   r   r   r   ru   rw   r0   rx   r|   r   r   r   r   r)   r*   r(   r   r   rZ   r   r   rI   r   r   r   r   r      PASSFz'============= Test Summary ============r&   z'---------------------------------------Tbc                 S   s   | d u rdS | r
dS dS )NzN/Ar   FAILr   )r   r   r   r   optional_bool_to_pass_fail]  s   z7run_inference_tests.<locals>.optional_bool_to_pass_failz!Tensor Parallelism:              z!Functional Test:                 z!Deployed Functional Test:        r   z!Model Accuracy:                  z.4fz!Relaxed Model Accuracy:          z!Deployed Model Accuracy:         z!Deployed Relaxed Model Accuracy: z!Evaluation Time [s]:             z.2fz'=======================================zFunctional: zAcccuracy: zFunctional test failedzModel accuracy is below r   )<r   r   trt_llm_supportedr   vllm_supportedr?   r   r   r   triton_supportedr   rI   max_tpsmin_tpsr   r   setLevelloggingDEBUGrm   r   r   r   checkpoint_dirr0   r)   r*   r(   r   r   r   rn   r   ppsru   rw   rx   r|   r   r   rZ   r   r   r   r   r   r   r   itemsr   r   r   r   r   r   r   r   r    r!   accuracy_threshold)r   
result_dicr-   r[   tpsfunctional_test_resultaccuracy_test_resultprint_separatordeployed_tests_onlynum_tpsresultsr   r   r   r   r   r   run_inference_tests  s2  



	
 !$
8

r  __main__)rb   Frc   rc   NFFNFNr&   r&   r&   r'   rd   FTFNFFNFFFNN)
r&   rc   r&   r'   rd   FTNTF)6r   r8   r   r   r:   dataclassesr   pathlibr   typingr   r   r   r   r   	getLoggerr   r   nemo.deployr	   nemo.deploy.nlpr
   r   er   r   r   r?   /megatron.core.inference.common_inference_paramsr   r   &nemo.deploy.nlp.megatronllm_deployabler   r   r   nemo.export.tensorrt_llmr   r   nemo.export.vllm_exporterr   r   r   r   rY   r   r   ra   r   r   r   r  r   errorArgumentErrorr   r   r   r   <module>   s   
i$
 ~
= k 
