o
    ٷi--                     @  sF  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlm	Z
 d dlZd dlmZ d dlmZmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
l m!Z! d dl"Z#e$dZ%d)ddZ&d)ddZ'dd Z(		d*d+dd Z)d,d#d$Z*g fd,d%d&Z+e,d'krd(Z-ej.-e- e/e- e+  dS dS )-    )annotationsN)setup_logger)get_rankget_size)add_io_bindings_as_ortvaluesconvert_inputs_for_ort%get_merged_sample_with_past_kv_inputsget_sample_inputsget_sample_with_past_kv_inputsverify_ort_inputs)setup_torch_model)make_dynamic_cache)
AutoConfig)__version__)DynamicCache argsargparse.Namespaceconfigr   c                 C  s"   | j rdnd\}}|j}|||fS )N)      )r   r   )use_past_kvmax_position_embeddings)r   r   past_sequence_lengthcurr_sequence_lengthmax_sequence_length r   f/home/ubuntu/.local/lib/python3.10/site-packages/onnxruntime/transformers/models/llama/llama_parity.pyget_sequence_lengths%   s   
r   c                 C  s   t  }d}t| |\}}}| jr#t|| j||||| j| jd|d
}|S | jr5t|| j||| jd|d}|S t	|| j||dd}|S )N   T)seq_lenpast_seq_lenmax_seq_lenuse_fp16use_buffer_sharereturn_dict
world_size)r#   r%   r&   )r%   )
r   r   mergedr   devicer#   r$   r   r
   r	   )r   r   r&   
batch_sizer   sequence_lengthr   inputsr   r   r   
get_inputs+   s<   r,   c                 C  s   t | tttfr
| S t | trtdd | D S t | tr$dd | D S t | tr0dd | D S t | tr>dd |  D S t | t	j
rH|  S t| d	rQ|  S t | tretttt| j| jd
dS tdt|  )Nc                 s  s    | ]}t |V  qd S )Ntorch_deepcopy.0vr   r   r   	<genexpr>R   s    z!torch_deepcopy.<locals>.<genexpr>c                 S  s   g | ]}t |qS r   r-   r/   r   r   r   
<listcomp>T       z"torch_deepcopy.<locals>.<listcomp>c                 S  s   h | ]}t |qS r   r-   r/   r   r   r   	<setcomp>V   r4   z!torch_deepcopy.<locals>.<setcomp>c                 S  s   i | ]	\}}|t |qS r   r-   )r0   kr1   r   r   r   
<dictcomp>X   s    z"torch_deepcopy.<locals>.<dictcomp>cloneF)strictz(torch_deepcopy not implemented for type )
isinstanceintfloatstrtuplelistsetdictitemsnpndarraycopyhasattrr8   r   r   r.   zip	key_cachevalue_cacheNotImplementedErrortype)valuer   r   r   r.   N   s"   





r.   locationr=   use_auth_tokenboolkv_cache_ortvaluesrA   pytorch_modelNone | torch.nn.ModuleNone | AutoConfigc                 C  sd  |}|d u rt | ||| jrtjntj| jd\}}t| |}d|v r4tt	tdkr4t
|d |d< t|}| jdkrBtj  t }	|di |j   }
| jdkr^tj  t }td||	  d | jrz|d urz~tj  t| |\}}}t|| j||d}| j  d}|d	kr|d
| jif}tj| jt  |gd}t!||}| jdkrt"||| jt#| j| j|d\}}|$  t }	|%| |&  t }|' d }~nt }	|(d |}t }|d }td||	  d d| jv sd| jv rdnd}t)j*|
|||d}t+d|  |s0t+dt),|
|   |S )Ntorch_dtyper(   past_key_valuesz4.45cpuzPyTorch took z s)r$   r!   r"   ExecutionProviderCUDAExecutionProvider	device_id)sess_options	providers)
ort_inputsr(   rZ   r$   rP   r   zONNX Runtime took int4int8g      4@g      ?)rtolatolz,Are PyTorch and ONNX Runtime results close? z
Max diff: r   )-r   r#   torchfloat16float32r(   r,   pvVersiontransformers_versionr   r.   execution_providercudasynchronizetimelogitsdetachrW   numpyloggerinfo	small_gpuempty_cacher   r   r$   upperrankortInferenceSessiononnx_model_pathSessionOptionsr   r   r;   synchronize_inputsrun_with_iobindingsynchronize_outputscopy_outputs_to_cpurunrC   allclosewarningmax)r   rM   rN   rP   rQ   r   py_modelr+   inputs_after_deepcopy
start_time
pt_outputsend_timer   _r   ep	ort_model
io_bindingort_outputstolparityr   r   r   verify_parityd   s   	









	
 r   argv	list[str]c                 C  sj  t  }|jddddd |jdddtjdd	d
 |jdddtjddd
 |jddddg ddd |jddddd |jdd |jddddd |jdd |jdd dd!d |jdd" |jd#dd$d |jdd% |jd&d'dg d(d)d* |jd+dtd,d-d. |jd/dd0d | g kr| n|| }|j	d1v s|j	d2kr|j
dkrd3|_	|S d4|_	|S )5Nz-mz--model_nameFzModel name in Hugging Face)requiredhelpz-tz--torch_model_directory.zMPath to folder containing PyTorch model and associated files if saved on disk)r   defaultr   z-oz--onnx_model_pathTzSPath to ONNX model (with external data files saved in the same folder as the model)z-epz--execution_providerrW   )rW   ri   rocmz(Execution provider to verify parity with)r   r   choicesr   z-vz	--verbose
store_truezPrint verbose logs)actionr   )verbosez-pz--use_past_kvzfUse past key and past value as inputs to the model. Necessary for decoder_with_past_model.onnx models.)r   z-gz--use_buffer_sharezWUse if model has GroupQueryAttention and you want to enable past-present buffer sharing)r$   z--mergedz2Use merged model (i.e. decoder_merged_model.onnx).)r'   z-fpz--precision)r^   r_   fp16fp32zPrecision of model)r   r   r   z--cache_dirz./model_cachezQmodel cache dir to override default HF cache dir to avoid overflood the /home dir)r   rK   r   r   z--small_gpuzhLoad the llama in GPU every time for parity_check if it's running in a machine which GPU memory < 36GB. >   r   r_   r^   r   r   )argparseArgumentParseradd_argumentospathjoinset_defaultsr=   
parse_args	precisionrh   )r   parserr   r   r   r   get_args   s   

		r   c                 C  s  t | }t|j td|  t }t|d|jdk ||_t|d|j	dkr*dnd|  t|dt
|j |jtjdk}|rH|jn|j}i }|jsYt|||| d S d  }}|jsst||||jrjt
jnt
j|jd	\}}d
|_t||||||d}d|_t||||||d d S )NzArguments: r#   r   device_namerW   zcuda:r(   r   rT   F)rQ   r   T)r   r   r   ro   rp   r   setattrr   rt   rh   rb   r(   r   torch_model_directoryr   r   r   
model_namer'   r   rq   r   r#   rc   rd   r   )r   r   rt   rN   rM   rP   r   llamar   r   r   main,  s8   
 
	r   __main__r   )r   r   r   r   )NN)r   r   rM   r=   rN   rO   rP   rA   rQ   rR   r   rS   )r   r   )0
__future__r   r   loggingr   rk   rn   rC   packaging.versionversionre   rb   benchmark_helperr   dist_settingsr   r   llama_inputsr   r   r   r	   r
   r   llama_torchr   (models.torch_export_patches.cache_helperr   transformersr   r   rg   transformers.cache_utilsr   onnxruntimeru   	getLoggerro   r   r,   r.   r   r   r   __name__seedrandommanual_seedr   r   r   r   <module>   s@    


#
dd'

