o
    ٷiFU                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddlZddl	m
Z
 ddlmZ ddlmZmZmZ ddlmZmZmZ dd	lmZ G d
d dZd)dedefddZd)dedefddZG dd dZdefddZdefddZ	d)dejdejdB fddZdd Z dd  Z!e"d!kre! Z#e$d"e#  e#j%du re#j&d#krd$nd%e#_%e#j'rej() sJ e#j*d&krd'e v sJ d(e#_+n
e#j,rJ e#j+rJ e#j,se#j+ree# dS e e# dS dS )*z]
Benchmark performance of SAM2 encoder with ORT or PyTorch. See benchmark_sam2.sh for usage.
    N)Mapping)datetime)SAM2ImageDecoder)SAM2ImageEncoder)decoder_shape_dictencoder_shape_dictload_sam2_model)InferenceSessionSessionOptionsget_available_providers)CudaSessionc                +   @   s   e Zd Zddddddddddddejdddddd	dfd
edededejdededededededededededededededededef*dd Z	d!d" Z
d#eeee f fd$d%Zd#eeejf fd&d'Zd(S ))
TestConfigimage_encoderCPUExecutionProvidermax-autotune      FT     
model_type	onnx_pathsam2_dirdevice	component
batch_sizeheightwidth
num_labels
num_points	num_masksmulti_mask_outputuse_tf32enable_cuda_graphprefer_nhwcwarm_upenable_nvtx_profileenable_ort_profileenable_torch_profilerepeatsverbosec                 C   s   |dv sJ |	dkr|	dksJ |
dkr|
dksJ || _ || _|| _|| _|| _|| _|| _|	| _|
| _|| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _| jdkru| jdkrq| jdkswJ dd S d S )Nsam2_hiera_tinysam2_hiera_smallsam2_hiera_largesam2_hiera_base_plus   i   r   r   z7Only image size 1024x1024 is allowed for image encoder.)r   r   r   r   providertorch_compile_moder   r   r   r   r   r   r    r   r!   r"   dtyper#   r$   r%   r&   r'   r(   r)   )selfr   r   r   r   r   r0   r1   r   r   r   r   r   r   r    r!   r"   r2   r#   r$   r%   r&   r'   r(   r)    r4   g/home/ubuntu/.local/lib/python3.10/site-packages/onnxruntime/transformers/models/sam2/benchmark_sam2.py__init__   s>   
zTestConfig.__init__c                 C   s
   t |  S N)varsr3   r4   r4   r5   __repr__V   s   
zTestConfig.__repr__returnc                 C   s6   | j dkrt| j| j| jS t| j| j| j| j| jS )Nr   )	r   r   r   r   r   r   r   r   r   r9   r4   r4   r5   
shape_dictY   s   
zTestConfig.shape_dictc                 C   s   | j }| jdkrdtj| jd| j| j|| jdiS tjdddd|| jdtjddd	d	|| jdtjdddd|| jdtj	d
d| j
| jdf|| jdtj	d
d| j
| jftj| jdtj| j
ddd|| jdtj| j
|| jdtj| j| jgtj| jddS )Nr   image   )r2   r   r          @      r   r      )image_features_0image_features_1image_embeddingspoint_coordspoint_labelsinput_maskshas_input_masksoriginal_image_size)r2   r   torchrandnr   r   r   r   randrandintr   r   int32zerosonestensor)r3   r2   r4   r4   r5   random_inputs_   s    
"zTestConfig.random_inputsN)__name__
__module____qualname__rL   float32strr   intboolr6   r:   r   listr<   TensorrT   r4   r4   r4   r5   r      s    	

;r   configr;   c                 C   s   | j rtdt|   | jdkr=t| jtrtj	 n| jj
}t|| j}t| j|d< | jr5d|d< | j|fdg}ndg}t| j||d}|S )Nzcreate session for CUDAExecutionProviderr!   r   r#   r   )	providers)r)   printr8   r0   
isinstancer   rY   rL   cudacurrent_deviceindexr   get_cuda_provider_optionsr"   rZ   r!   r#   r	   r   )r^   session_options	device_idprovider_optionsr`   ort_sessionr4   r4   r5   create_ort_sessiont   s   
rk   c                 C   s,   t | |}t|| j| j}||   |S r7   )rk   r   r   r"   allocate_buffersr<   )r^   rg   rj   cuda_sessionr4   r4   r5   create_session   s   
rn   c                   @   s(   e Zd ZdZddefddZdd ZdS )	OrtTestSessionz;A wrapper of ORT session to test relevance and performance.Nr^   c                 C   s   t ||| _| | _d S r7   )rn   rj   rT   	feed_dict)r3   r^   rg   r4   r4   r5   r6      s   zOrtTestSession.__init__c                 C   s   | j | jS r7   )rj   inferrp   r9   r4   r4   r5   rq      s   zOrtTestSession.inferr7   )rU   rV   rW   __doc__r   r6   rq   r4   r4   r4   r5   ro      s    ro   rm   c                 C   s"   t   }| |}t   }|| S r7   )timerq   )rm   
input_dictstart_endr4   r4   r5   measure_latency   s   
rx   c                 C   sL  | j j}|dk}|r!tjdjdkr!| jr!dtjjj_	dtjj
_	|o(| jtjk}|  }t i tj|| j|dM t| j| j| j d}| jdkr`|rc| jdkrctj|jj| jdd	d
|j_|  d }t|j| j | jd}t|}|r| jdkrtd| j d t| jD ]	}	||\}
}}q|r| jrdd l }ddlm!} |"  td |#d ||dd W d    n1 sw   Y  |$  |r$| j%r$tj&j'tj&j(j)tj&j(j*gdd&}td tj&+d || W d    n1 sw   Y  W d    n	1 sw   Y  t|, j-ddd |.d | j/dkr9	 W d    W d    d S td| j/ d t00 }t| j/D ]}	||\}
}}|r\tj1  qKn|d |d |d |d |d  |d! |d" |d# f}t2|| j3d$}|r| jdkrtj|j| jdd	d
|_t| jD ]
}	|| \}}}q|r| jrdd l }ddlm!} |"  td% |#d ||d&di W d    n	1 sw   Y  |$  |r7| j%r7tj&j'tj&j(j)tj&j(j*gdd'}td' tj&+d( ||  W d    n	1 sw   Y  W d    n	1 s"w   Y  t|, j-ddd |.d) | j/dkrL	 W d    W d    d S td| j/ d t00 }t| j/D ]}	|| \}}}|rotj1  q^t00 }|| | j/ W  d    W  d    S 1 sw   Y  W d    d S 1 sw   Y  d S )*Nrc   r      T)device_typer2   enabled)r   r   noneF)mode	fullgraphdynamicr=   )r   r2   zBRunning warm up. It will take a while since torch compile mode is .cudartz#Start nvtx profiling on encoder ...one_run)r%   )
activitiesrecord_shapesz$Start torch profiling on encoder ...encodercuda_time_total
   )sort_by	row_limitztorch_image_encoder.jsonzStart z runs of performance tests...rD   rE   rF   rG   rH   rI   rJ   rK   )multimask_outputz"Start nvtx profiling on decoder...r%   z$Start torch profiling on decoder ...decoderztorch_image_decoder.json)4r   typerL   rc   get_device_propertiesmajorr!   backendsmatmul
allow_tf32cudnnr2   rX   rT   inference_modeautocastr   r   r   r   r1   compiler   forwardr<   rM   tor   ra   ranger$   r%   nvtxr   cudaProfilerStartannotatecudaProfilerStopr'   profilerprofileProfilerActivityCPUCUDArecord_functionkey_averagestableexport_chrome_tracer(   rs   synchronizer   r    )r^   rz   is_cudaenabled_auto_cast
ort_inputs
sam2_modelimage_shapeimgsam2_encoderrv   _image_features_0_image_features_1_image_embeddingsr   r   profru   torch_inputssam2_decoder_masks_iou_predictions_low_res_masksrw   r4   r4   r5   	run_torch   s   
"



-


h
Vr   args
csv_writerc                 C   s  | j }| j}| j}|rtj }td|}d}nd}td}d}d}tjtjtj	d}t
d+i d| jd	| jd
| jd| jd|d| jd| jd| jd|ddd|d|| j d| jd| jd| jd| jd| jd| jd| jdd}	| jdkr(t }
| j|
_|	jrd|
_d|
_d|
_t|	|
}|	  }zt!|	jD ]}t"||}qW n t#y } zt$d|	d |  W Y d }~d S d }~ww |	jrdd l%}dd!lm&} |'  |(d" |)|}W d    n1 sw   Y  |*  |	jr|j+,  |dkrd S g }t!|D ]}t"||}|-| qt./|}~nGt0 3 zt1|	}W n$ t#yW } zt$d|	d |  W Y d }~W d    d S d }~ww W d    n	1 scw   Y  |dkrod S | jd# |rxdnd }i d| jd| jd| jd$|d|d|	jd|	j2d| jd| jd| jd%| j3d&|	j4d'|	j5d(|	j6d)| jd|	jd|| j| j||d*}|d ur|7| t$t8|	  t$|  d S ),Nrc   r_   r   cpuFr   fp32fp16bf16r   r   r   r   r0   r   r   r   r   r!   Tr"   r2   r#   r(   r$   r%   r&   r'   r1   r)   ort   zFailed to run config=z. Exception: r   r   :use_gpur    r   r   r   intra_op_num_threads)r%   r1   engineaverage_latencyr4   )9r   use_cuda_graphr(   rL   rc   rd   r   rX   float16bfloat16r   r   r   r   r   r   r   r   r2   r#   r$   r%   r&   r'   r1   r   r
   r   enable_profilinglog_severity_levellog_verbosity_levelrn   rT   r   rx   	Exceptionra   r   r   r   r   rq   r   rj   end_profilingappend
statisticsmeanno_gradr   r!   r   r   r   r   writerowr8   )r   r   r   r"   r(   rh   r   r0   dtypesr^   sess_optionssessionrt   rv   er   r   latency_listlatencyr   r   rowr4   r4   r5   run_test  s&  

	









	


r   c                 C   s   | j rdnd}d|| jt d}t|ddd}g d}tj||d	}|	  t
| | W d    d S 1 s;w   Y  d S )
Ngpur   zbenchmark_sam_{}_{}_{}.csvz%Y%m%d-%H%M%Sa )r}   newline)r   r   r2   r   r"   r#   r!   r   r   r   r    r   r   r   r   r$   r(   r%   r1   r   r   )
fieldnames)r   formatr   r   nowstrftimeopencsv
DictWriterwriteheaderr   )r   featurescsv_filenamecsv_filecolumn_namesr   r4   r4   r5   run_perf_test  s   "r   c                  C   s  t jdd} | jddddgddd | jd	dg d
ddd | jddddd | jdd | jddddd | jdd | jddtg dddd | jddtddd | jddtdd d | jd!dtdd"d | jd#dtd$d%d | jd&dtd'd(d | jd)dtd*d*d+gd,d- | jd.dddd/d0 | jd1dddd2d0 | jd3dddd4d0 | jd5dddd6d0 | jd7dddd8d0 | jd9dtd:g d;d<d- | jd=dtd>d?d | jd@dtdAdBd | jdCdtd g dDdEd- |  }|S )FNz,Benchmark SMA2 for ONNX Runtime and PyTorch.)descriptionz--componentFr   image_decoderzDcomponent to benchmark. Choices are image_encoder and image_decoder.)requiredchoicesdefaulthelpz--dtyper   r   zData type for inference.z	--use_gpu
store_truezUse GPU for inference.)r   actionr   )r   z--use_cuda_graphzUse cuda graph in onnxruntime.)r   z--intra_op_num_threads)r   r   rC   r   ry      r   z&intra_op_num_threads for onnxruntime. )r   r   r   r   r   z--batch_sizer   z
batch size)r   r   r   r   z--heightr   zimage heightz--widthzimage widthz	--repeatsr   z8number of repeats for performance test. Default is 1000.z	--warm_upr   z)number of runs for warm up. Default is 5.z--enginer   rL   zengine for inference)r   r   r   r   r   z--multimask_outputz:Export mask_decoder or image_decoder with multimask_output)r   r   r   r   z--prefer_nhwcz;Use prefer_nhwc=1 provider option for CUDAExecutionProviderz--enable_nvtx_profilezVEnable nvtx profiling. It will add an extra run for profiling before performance test.z--enable_ort_profilezEnable ORT profiling.z--enable_torch_profilezYEnable PyTorch profiling. It will add an extra run for profiling before performance test.z--model_typer-   r*   zsam2 model namez
--sam2_dirz./segment-anything-2z6The directory of segment-anything-2 git root directoryz--onnx_pathz6./sam2_onnx_models/sam2_hiera_large_image_encoder.onnxzpath of onnx modelz--torch_compile_mode)zreduce-overheadr   zmax-autotune-no-cudagraphsr|   z4torch compile mode. none will disable torch compile.)argparseArgumentParseradd_argumentset_defaultsrZ   rY   
parse_args)parserr   r4   r4   r5   _parse_arguments  s  				r   __main__z
arguments:r   r   r|   r   r_   Fr7   )-rr   r   r   r   rs   collections.abcr   r   rL   r   r   r   r   
sam2_utilsr   r   r   onnxruntimer	   r
   r   *onnxruntime.transformers.io_binding_helperr   r   rk   rn   ro   rx   r   	Namespacer   r   r   r   rU   r   ra   r1   r   r   rc   is_availabler   r'   r%   r4   r4   r4   r5   <module>   s\   Z 
~% )



