o
    wiD                  
   @   s"  d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlmZ dZ	zd dl
mZ d dlmZmZ d dlmZ W n eyL Z zdZ	W Y dZ[ndZ[ww dd	d
Z				dddZ																						dddZdd Zdd Zdd Zedkre Zee dS dS )    N)Path)MegatronLLMDeployableT)DeployPyTriton)NemoQueryLLMNemoQueryLLMPyTorch)TensorRTLLMFc                 C   s  |d u rt dd}d}d}d}g }	g }
t|d}t|}t }|D ]}|d }|d   }| j|gdddd||d}|d d   }|		| |
	| ||kr\|d7 }||ksj|
|sj|
|r{t|dkrwt|dkrwq%|d7 }|d ur|j|gdddd|d	}|d d   }||kr|d7 }||ks|
|s|
|rt|dkrt|dkrq%|d7 }q%t }W d    n1 sw   Y  |t|	 }|t|	 }|t|	 }|t|	 }|| }|||||fS )
Nztest_data_path cannot be None.r   rtext_before_last_word	last_word   g?)input_textsmax_output_lentop_ktop_ptemperaturetask_ids	lora_uids)promptsr   r   r   r   task_id)	Exceptionopenjsonloadtimeperf_counterstriplowerforwardappend
startswithlen	query_llm)modelnqr   r   test_data_pathtrtllm_correcttrtllm_deployed_correcttrtllm_correct_relaxedtrtllm_deployed_correct_relaxedall_expected_outputsall_trtllm_outputsfilerecords
eval_startrecordpromptexpected_outputtrtllm_outputtrtllm_deployed_outputeval_endtrtllm_accuracytrtllm_accuracy_relaxedtrtllm_deployed_accuracy trtllm_deployed_accuracy_relaxedevaluation_time r9   U/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/tests/deploy/nemo_deploy.pyget_accuracy_with_lambada#   s   
	


9r;   r   c                 C   sV   t ||}t|| dd}|  |  td| d}	|	j|d}
td|
 |  dS )N@  r"   triton_model_name	http_portlocalhost:8000url
model_name)r   zOutput: NNNNN)r   r   deployrunr   r!   printstop)rC   r/   checkpoint_pathn_gpumax_batch_sizemax_input_lenr   r"   nmr#   output_deployedr9   r9   r:   run_in_framework_inference}   s   
	
rO                       ?c           (      C   s  t | rG|tj krtd|| |tj  dS t |jddd |rAtd td td td td|| | d }d }d}|rct | r]|}d	}d
g}|r\td ntd dS d }d }d } d }!|rt | r|g}g d}d} dg}!|rtd ntd dS t||dd}"|"j	||||||	||| |!|
d|d |r|"j
d
|d |"j||	|||||||d	}#| s|st|||	dd d }$d }%d}&|rt|"| dd}%|%  |%  td| d}$|$j||	ddd|d }&|rtd td!| td td"|# td td td#|& td |r6td$ t|"|$|||}'|r,|%  |s4t| |'S |r=|%  |sEt| dS td%|)&NzVPath: {0} and model: {1} with {2} gpus won't be tested since available # of gpus = {3}rD   T)parentsexist_ok zn################################################## NEW TEST ##################################################z5Path: {0} and model: {1} with {2} gpus will be testedr   i    0z---- PTuning enabled.z8---- PTuning could not be enabled and skipping the test.)rW   z-1rW   bfloat16attn_qkvz---- LoRA enabled.z5---- LoRA could not be enabled and skipping the test.F
load_model<   )nemo_checkpoint_path
model_typetensor_parallelism_sizepipeline_parallelism_sizerL   r   rK   max_prompt_embedding_table_sizeuse_lora_pluginlora_target_modulesmax_num_tokensopt_num_tokensuse_embedding_sharing)	task_name!prompt_embeddings_checkpoint_path)	r   r   r   r   r   r   r   	streamingstop_words_list)engine_pathr/   r   debugr<   r=   r@   rA   r   rR   rS   )r   r   r   r   r   r   z--- Prompt: z--- Output: z--- Output deployed: z Start model accuracy testing ...z"Checkpoint {0} could not be found.)r   existstorchcudadevice_countrG   formatmkdirr   exportadd_prompt_tabler   test_cpp_runtimer   rE   rF   r   r!   r;   rH   shutilrmtreer   )(rC   r^   r/   rI   trt_llm_model_dirrJ   rK   rf   rL   r   rd   ptuningp_tuning_checkpointloralora_checkpointtp_sizepp_sizer   r   r   run_accuracyrl   ri   rj   test_deploymentr$   save_enginerh   r   ra   lora_ckpt_listr   rb   rc   trt_llm_exporteroutputr#   rM   rN   resultr9   r9   r:   run_trt_llm_inference   s   	




r   c                 C   sF   t | dd}|j||dddd}|r!td td| td d S d S )	NTrZ   r   rR   rS   )r   r   r   r   r   rV   z&--- Output deployed with cpp runtime: )r   r   rG   )rk   r/   r   rl   r   r   r9   r9   r:   ru   N  s   
ru   c               	   C   s  t jt jdd} | jdtdd | jdtdd | jdtd	d
 | jdtd | jdtddd | jdtd | jdtdd
 | jdtdd
 | jdtdd
 | jdtd | jdtd | jdddd | jdtd | jdddd | jdtd	d
 | jdtd	d
 | jd td	d
 | jd!td"d
 | jd#td$d
 | jd%td&d
 | jd'ddd | jd(td&d
 | jd)ddd | jd*ddd | jd+td d
 | jd,d-d.d d/g d0d1d2 | jd3td&d
 |  S )4Nz5Deploy nemo models to Triton and benchmark the models)formatter_classdescriptionz--model_nameT)typerequiredz--model_typeFz
--min_gpusr   )r   defaultz
--max_gpus)r   z--checkpoint_dirz/tmp/nemo_checkpoint/)r   r   r   z--trt_llm_model_dirz--max_batch_sizerP   z--max_input_len   z--max_output_lenrQ   z--max_num_tokensz--p_tuning_checkpointz	--ptuning
store_true)r   actionz--lora_checkpointz--loraz	--tp_sizez	--pp_sizez--top_kz--top_prR   z--temperaturerS   z--run_accuracyFalsez--streamingz--test_deploymentz--debugz!--ci_upload_test_results_to_cloudz--test_data_pathz-bz	--backend?TensorRT-LLM)r   vLLMzIn-Frameworkz'Different options to deploy nemo model.)nargsconstr   choiceshelpz--save_engine)argparseArgumentParserArgumentDefaultsHelpFormatteradd_argumentstrintfloat
parse_args)parserr9   r9   r:   get_argsc  s  	r   c              	   C   sF  | j dkr	d| _ nd| _ | jdkrd| _nd| _| jdkr!d| _nd| _| jr0| jd u r0tdi }ddg}| j}| jd u rB| j| _|| jkr| j dkrt	d.i d| j
d	| jd
|d| jd| jd|d| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| j d| jd | j||< nt| j
|| j|| j| j| jd!||< |d" }|| jksGd#}d}td$ | D ].\}}|d% d ur|d& d ur|rtd' td(j |g|R   d}|d& d)k rd*}qtd+ td,|  |d*kr!td-d S )/NTrueTFz$test_data_path param cannot be None.zThe capital of France iszLargest animal in the sea isztensorrt-llmrC   r^   r/   rI   rx   rJ   rK   rL   r   rd   ry   rz   r{   r|   r}   r~   r   r   r   r   rl   ri   r   r$   r   )rC   r/   rI   rJ   rK   rL   r      PASSz'============= Test Summary ============r   r   z'---------------------------------------zNumber of GPUS:                  {}
Model Accuracy:                  {:.4f}
Relaxed Model Accuracy:          {:.4f}
Deployed Model Accuracy:         {:.4f}
Deployed Relaxed Model Accuracy: {:.4f}
Evaluation Time [s]:             {:.2f}g      ?FAILz'=======================================zTEST: zModel accuracy is below 0.5r9   )!r   r   r   r$   r   min_gpusmax_gpusbackendr   r   rC   r^   checkpoint_dirrx   rK   rL   r   rd   ry   rz   r{   r|   r}   r~   r   r   r   rl   ri   rO   rG   itemsrq   )args
result_dicprompt_templaten_gpustest_resultprint_separatoriresultsr9   r9   r:   run_inference_tests  s   





	




*
r   __main__)N)r   NNN)r   rP   FrQ   rQ   NFNFNNNr   rR   rS   FTFNFNF)r   r   rv   r   pathlibr   rn   &nemo.deploy.nlp.megatronllm_deployabler   run_export_testsnemo.deployr   nemo.deploy.nlpr   r   nemo.export.tensorrt_llmr   r   er;   rO   r   ru   r   r   __name__r   r9   r9   r9   r:   <module>   sl   
^
$
 4 ^