o
    ٷi<                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlZd dlmZm	Z	m
Z
mZmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ ed	Zdd
dZdd Zedkrpe Zeej  ee dS dS )    N)datetime)	Precisioncreate_onnxruntime_sessionget_ort_environment_variablesprepare_environmentsetup_logger)DEFAULT_TOLERANCEMODEL_CLASSESPRETRAINED_GPT2_MODELS
Gpt2Helper)version)QuantizeHelper)
AutoConfig)__version__ c                 C   s6  t  }|jdddtddt d |jddtd	tt d
dt  d |jddtt	j
dddd |jddtt	j
dddd |jdddtdd |jdddddd |jdddddd |jdd  |jd!td"dg d#d$d% |jd&ddd'd |jdd( |jd)d*ttjttd+d, |jd-ddd.d |jdd/ |jd0d1d2td3gd4d5 |jd6d2td3gd7d5 |jd8d9d2tg d:d;d5 |jd<d=dd d>d? |jd@dtdAdBd |jdCdddD |jddE |jdFdddD |jddG |jdHdddD |jddI |jdJdddD |jddK || }|S )LNz-mz--model_name_or_pathTz;Model path, or pretrained model name selected in the list: z, )requiredtypehelpz--model_classFGPT2LMHeadModelz!Model type selected in the list: )r   r   defaultchoicesr   z--cache_dir.cache_modelsz%Directory to cache pre-trained models)r   r   r   r   z
--onnx_dironnx_modelszDirectory to store onnx modelsz--test_timesd   z8Number of repeat times to get average inference latency.)r   r   r   r   z-vz--validate_onnx
store_truezValidate ONNX model)r   actionr   z-oz--optimize_onnxz'Use optimizer.py to optimize onnx model)optimize_onnxz--stager   )r         a6  Stage in generation: 1 (initial decoder), 2 (decoder), 0 (both). 1 - decode the first token when past_sequence_length is zero; 2 - decode the remaining tokens when past_sequence_length is not zero; 0 - one onnx model for both stages 1 and 2. Note that we will optimize 1 and 2 differently for best performance.)r   r   r   r   r   z	--use_gpuzuse GPU for inference)use_gpuz-pz--precisionzfPrecision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization)r   r   r   r   z--torchscriptzuse Torchscript)torchscriptz-bz--batch_sizes+r   z
batch size)nargsr   r   r   z--sequence_lengthsz!sequence lengths (excluding past)z-sz--past_sequence_lengths)          @         zpast sequence lengthsz-rz--result_csvz$CSV file for saving summary results.)r   r   r   z--thread_numzThreads to usez--include_copy_output_latency)r   r   )include_copy_output_latencyz	--verbose)verbosez--output_torch_latency)output_torch_latencyz--disable_io_binding)disable_io_binding)argparseArgumentParseradd_argumentstrjoinr
   listr	   keysospathintset_defaultsr   FLOAT32
parse_args)argvparserargs r?   g/home/ubuntu/.local/lib/python3.10/site-packages/onnxruntime/transformers/models/gpt2/benchmark_gpt2.pyparse_arguments!   s   
			
rA   c           !      C   s  t tt dk rtdtd|   | jtjkr&| j	r"| j
s&J d| jtjkr3| j
r3J d| jdkrB| jdgksBJ dt| jdkrOtjd	d
n| j ttj  | j}| j}t||| j
 t| j d }t}tj| j| j|d}|j| j||d}t | j
rdnd}|!| |j"dk}|j#|| j| jd	|d}	|	d }
t| j d }|j$|||
| j%|||d | j	s| jtj&kr|	| jtjkrt'| jnd }
|j	|	d |
| jtjk|j(j)|j(j*|d	| jd | jtjkrtd t+,|
|	d | t+-|}td |	d }
| jr|j|||||d}t.|
| j
d| j| j%d}|d u r0d S |/t0| j1t0| jt0| j2|| j}|3||| jtjk}| j4p[d5t67 8d}t9|ddde}g d }t:j;||d!}|<  | j1D ]I}| j2D ]A}| jD ]9}|dkr|dkr|dksJ t=d"||| |j>||||j)|j*|j"|j?|| jtjk||d#}|/||||| j}z| j@s| jAr|B||| jC\}}tD|D ],\}}tE|tFrt=d$| d%tG| d&|d jH  qt=d$| d'|jH  qnd }d }| jIr|J||| jC\}}n|jK||||| jCd| jLd(\}}| j@rb|}| jIsDg }|D ]}|M|N O  q7|jP||| jtQ| j tQ| j d)rbtd*tQ| j  d+ td,||||| jIrod-nd|rud.nd | j| j| jtR | j
| j| j	| j|||| jI|r|d/nd0|d/d } |S|  W q tTy   tjUd1d	d2 Y    W d    d S w q}qwW d    n	1 sw   Y  td3|  |S )4Nz3.1.0z/This tool requires transformers 3.1.0 or later.z
Arguments:z'fp16 requires --optimize_onnx --use_gpuzquantization only supports CPUr   r   z<past_sequence_lengths shall be 0 for stage==1 (init decoder)T)logical)r!   	cache_dir)configrC   zcuda:0cpu   )has_past
new_folderrawr   )has_position_idshas_attention_maskfp32)auto_mixed_precisionstagezquantizing model...int8zfinished quantizing modelF)enable_all_optimizationnum_threadsr,   zbenchmark_result_{}.csvz%Y%m%d-%H%M%Sar   )modenewline)
model_namemodel_classrN   environment_variablesgpu	precision	optimizerr!   
batch_sizesequence_lengthpast_sequence_lengthr.   torch_latencyonnxruntime_latency)
fieldnameszMRunning test for batch_size=%d sequence_length=%d past_sequence_length=%d ...)float16rJ   rK   ztorch output z is tuple of size z, shape z shape )return_numpyr+   )rV   rtolatolz:Pytorch and ONNX Runtime outputs are all close (tolerance=z).zZbatch_size=%d, sequence_length=%d, past_sequence_length=%d, onnxruntime_latency=%.2f %s %sz(disable_io_binding)z, torch_latency={torch_latency}z.2fNone	Exception)exc_infozResults are saved to file )Vr   parsetransformers_versionRuntimeErrorloggerinforY   r   FLOAT16r   r    INT8rN   past_sequence_lengthstorchset_num_threads
thread_numpsutil	cpu_countprint
__config__parallel_inforC   onnx_dirr   r	   rV   r   r   from_pretrainedmodel_name_or_pathr!   deviceton_layerget_onnx_pathsexport_onnxr,   r:   r2   rD   num_attention_headshidden_sizer   quantize_onnx_modelquantize_torch_modelr   get_output_shapesmaxbatch_sizessequence_lengthsget_output_buffers
result_csvformatr   nowstrftimeopencsv
DictWriterwriteheaderdebugget_dummy_inputs
vocab_sizevalidate_onnxr-   pytorch_inference
test_times	enumerate
isinstancetuplelenshaper.   onnxruntime_inference$onnxruntime_inference_with_binded_ior+   appendrE   numpycompare_outputsr   r   writerowrf   error)!r>   rC   
output_dirrV   
gpt2helperrD   modelr{   use_external_data_formatonnx_model_pathsonnx_model_pathuse_paddingsessionmax_output_shapesoutput_bufferscsv_filenamecsv_filecolumn_names
csv_writerr[   r\   r]   dummy_inputsoutput_shapesoutputsr^   ivalueort_outputsort_latencycopy_outputsoutputrowr?   r?   r@   main   s  
"







"



   r   __main__)N)!r/   r   loggingr6   r   rs   rp   benchmark_helperr   r   r   r   r   gpt2_helperr   r	   r
   r   	packagingr   quantize_helperr   transformersr   r   ri   	getLoggerrk   rA   r   __name__r>   r,   r?   r?   r?   r@   <module>   s.   

  s
