o
    ٷi%a                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZmZmZmZ d dlZeeZddd	Zd
d Z dd Z!dd Z"dd Z#dd Z$dd Z%edkre%  dS dS )    )annotationsN)setup_logger)add_io_bindings_as_tensorsget_initial_inputs_and_outputs)
AutoConfigAutoModelForCausalLMAutoTokenizerBitsAndBytesConfigargsargparse.Namespacec                 C  s  | j dv rd }| jdkr9| jdkr9tdddtjd}tj| jdkr$| jn| j	| j
| j| j| jdd|| jd	id
	}n[z&tj| jdkrD| jn| j	| j
| j| j| jd| jdkrVdndd| j}W n4 ty } z(td| tj| jdkrv| jn| j	| j
| j| j| jddd| j}W Y d }~nd }~ww |  | j dkrt|}|S t }| jdkrdd| jifnd}tj| j||gd}|S )N   pt-eager
pt-compileint4cudaTnf4)load_in_4bitbnb_4bit_use_double_quantbnb_4bit_quant_typebnb_4bit_compute_dtype flash_attention_280GB)	cache_dirtorch_dtypeuse_auth_tokentrust_remote_code	use_cacheattn_implementationquantization_config
max_memorysdpa)r   r   r   r   r   r   z&Try to load a model using eager mode: eagerr   CUDAExecutionProvider	device_idCPUExecutionProvider)sess_options	providers)benchmark_typeonnx_precisiondevicer	   torchfloat16r   from_pretrainedhf_dir_path
model_namer   r   authtrustr$   totarget_device	ExceptionprintevalcompileortSessionOptionsInferenceSessiononnx_model_path)r
   model
bnb_configer&   ep r@   g/home/ubuntu/.local/lib/python3.10/site-packages/onnxruntime/transformers/models/llama/benchmark_e2e.py	get_model8   st   
	



rB   c           
   	   C  s"  | j dkr t  |di |}W d    n1 sw   Y  d }| j dv r4| jdkr3tj| j nt|||| j| j	}|
  t }t|D ]8}| j dv ryt  |di |}| jdkritj| j W d    n1 ssw   Y  qJ|| |  qJt }|| | }	|	|fS )Nr   r   cpur@   )r(   r+   no_gradr*   r   synchronizer3   r   use_fp16use_buffer_sharesynchronize_inputstimeperf_counterrangerun_with_iobindingsynchronize_outputs)
r
   r<   runsinputsoutputs
io_bindingstart_endavgr@   r@   rA   run_inferencex   s4   








rV   c           	   	   C  sF   t   t||||| j| j| j| j\}}t| || j||\}}||fS N)clear_cacher   r3   rF   rG   enginerV   warmup_runs)	r
   r<   config	tokenizerprompt_lengthpromptrO   rP   rS   r@   r@   rA   prepare_model_for_inference   s   r_   c                   C  s   t   tj  d S rW   )gccollectr+   r   empty_cacher@   r@   r@   rA   rX      s   rX   c                 C  sv   t j| ddddddddd	|d
  dd|d
  dd	| dd| dddgd}|j|dd td| d d S )Nz
Batch SizezPrompt LengthzPrompt Processing Latency (ms)z"Prompt Processing Throughput (tps)zSampling Latency (ms)zSampling Throughput (tps)z"First Token Generated Latency (ms)z&First Token Generated Throughput (tps)Average Latency of First    z Tokens Generated (ms)Average Throughput of First z Tokens Generated (tps)zWall-Clock Latency (s)zWall-Clock Throughput (tps))columnsF)indexzResults saved in !)pd	DataFrameto_csvloggerinfo)resultsfilename
gen_lengthdfr@   r@   rA   save_results   s(   

rr   c               
   C  s  t  } | jddtdg dd | jddtdd	d
 | jdddddd | jdddddd | jddttjdddd | jdtddd | jddddd | jd d!dtjdd"d#d$d%d& | jd'ddd(d | jd)ddd*df | jd+d,d-d. | jd/d0d1d. | jd2d3dtd4g d5d6d7 | jd8d9td:d;d | jd<d=ttj	
 rd>nd?d?d>gd@ | jdAdBtdCdD | jdEdFtdGdD | jdHdItdJdD | jdKtdLdD |  }tj|j t|j dM|jv rt|dN|j  dO |jdPkr|jdQ|jif|_|jdMkr|jsJ dR|jdS|_|jdS|_t|dT|j |jdUv s8|jdVkr:|jd?kr:d4ndW|_|jd?krIdX|j n|j}|jdWkrUtjntj}|jdMkr`dMndY}t|dZ| t|d[| t|d\| t|d]|jdWk |jo|dMk|_|S )^Nz-btz--benchmark-typeT)r   r   r8   )typerequiredchoicesz-mz--model-nameFz<Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf'))rs   rt   helpz-az--auth
store_truez5Use Hugging Face authentication token to access model)defaultactionrv   z-tz--trustzeWhether or not to allow for custom models defined on the Hugging Face Hub in their own modeling filesz-cz--cache-dir.model_cachezPath to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(model_name, cache_dir=cache_dir)`.)rs   rx   rv   z--hf-dir-pathr   zPath to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(folder_path)`.z-oz--onnx-model-pathzPath to ONNX model)rt   rv   z-fz--prompts-filemodelsllamazprompts.jsonzsJSON file containing entries in the format 'prompt length: prompt' where prompt length = tokenized length of prompt)rt   rx   rv   z--use_buffer_sharez3Use when GroupQueryAttention (GQA) is in ONNX modelz--anomaly-filteringzUse this flag to filter anomaly accelerator times for tokens generated.               This may give more accurate latency and throughput metrics for tokens generated.               Wall-clock metrics are still reported with anomaly times though.z-bz--batch-sizesz1 2)rx   z-sz--prompt-lengthsz16 64 256 1024z-pz--precisionfp32)r   int8fp16r~   zePrecision for model. For ONNX models, the model's precision should be set before running this script.)rt   rs   rx   ru   rv   z-gz--generation-length   z Number of new tokens to generatez-dz--devicer   rC   )rs   rx   ru   z-idz--device-idr   )rs   rx   z-wz--warmup-runs   z-nz
--num-runsd   z--seedrd   r8   execution_providerExecutionProviderr#   r$   z,Please specify a path to `--onnx-model-path` r)   >   r~   r   r   r   zcuda:ptr3   r   rY   rF   )argparseArgumentParseradd_argumentstrospathjoinintr+   r   is_available
parse_argsnprandomseedmanual_seedr(   setattrr*   upperr   r$   r;   batch_sizessplitprompt_lengths	precisionr,   float32rG   )parserr
   r3   r   rY   r@   r@   rA   get_args   s   
*r   c            9        s  t  } td t| j d }t| j}tj|dd d}W d    n1 s(w   Y  t	j
| jdkr7| jn| j| j| j| jd}tj
| jdkrL| jn| j| j| j| jd}t| }g }t| j| jD ]u\}}t|t|}}td| d|  t  || j }	||vrttd	| d
| j d| j d| d| d| d| d| d| j d| d|| g| }
||g}ztd t| |||||
\}}t| || j||\}}|d }|||  }td| d td|||   d |||g td t  t| |||||
\}}|d   }|j!d }|j"}t#|dr,|j$n|j%|j& }t'j(|| j)t'j*d}g }g }t+, }||	krt| |d||\}}|-| t+, }|d  j!d dkr|d! .dd }|j/dd"0d|j12|d|j1}t'3|d  d|4 }n|d  d d dd d f }t'j5|dd"}||B |j6k}|7||j68|dg}t+, } |-| |  t'j9||gdd"}|d7 }||d< t'9|d! | :t'j;8|dgd|d!< d#|v rt'j<|d# dd"d$ 8|dd |d#< |d  j!d dkr|d  d d d dd d f = |d < |d  >  | j?d%kr,|d& |d&< nh| j@stA|jBD ]}!|d'|! d( |d)|! d(< |d'|! d* |d)|! d*< q5|d! j!d }"tA|jBD ]2}!t'j(|||"|| j)| jCd}#t'j(|||"|| j)| jCd}$|Dd'|! d(|#= d'|! d*|$= i qa||	ksIt+, }%|Ed$ | jFrd+ tG|tH|}&tItJ fd,d|}tH|}'td-|&|'  d.  d/d  d0 t.|tH| }(|(d })|d|(  }*td1|) d td2|* d |d$ }+|+d },|d|+  }-td3|, d td4|- d | jd5 }.t.|d |. tH|d |.  }/|/d }0|d|/  }1td6|. d7|0 d td8|. d7|1 d t.|tH| }2|2d }3|d|2  }4td6| j d7|3 d td8| j d7|4 d |%| }5||| j |5  }6td9|5 d: td;||| j |5   d td< ||)|*|,|-|0|1|3|4|5|6g
 |-| W qe tKy }7 ztd=| d| d>|7  W Y d }7~7qed }7~7ww d?| j? d@tLjLM dAdB}8tN||8| j d S )CNFc                 S  s   dd |   D S )Nc                 S  s   i | ]	\}}t ||qS r@   )r   ).0kvr@   r@   rA   
<dictcomp>j  s    z*main.<locals>.<lambda>.<locals>.<dictcomp>)items)dr@   r@   rA   <lambda>j  s    zmain.<locals>.<lambda>)object_hookr   )r   r   r   zRunning batch size = z, prompt length = z2
                                A prompt of size z was not found in 'zv'. There are a couple of solutions to fix this.
                                1) You can change one of the keys in 'z' to be z).
                                    If za < actual prompt's length, the benchmark E2E tool will repeat the first word in the prompt until zB = actual prompt's length.
                                    If zm > actual prompt's length, the benchmark E2E tool will automatically trim the actual prompt's length so that zd = actual prompt's length.
                                2) You can add a new key-value entry in 'z' of the form 'z,': 'your prompt goes here'.
                zMeasuring prompt processing...i  z&Average Latency of Prompt Processing: z msz)Average Throughput of Prompt Processing: z tpszMeasuring token generation...	input_idshead_dim)r*   dtype   logitsattention_mask)dimposition_idsr   r   past_key_valueszpresent.z.keyzpast_key_values.z.value
   c                   s   |   k S rW   r@   )acc_timeanomaly_threshold_factor
min_time_sr@   rA   r     s    zFiltered out z$ anomaly accelerator times that are zx greater than z ms...zAverage Latency of Sampling: z Average Throughput of Sampling: z"Latency of First Token Generated: z%Throughput of First Token Generated: rd   rc   z Tokens Generated: re   zWall-Clock Latency: z szWall-Clock Throughput: zAdding results to CSVz$Could not benchmark at batch size = z - 
benchmark__e2e_z%Y-%m-%d_%H:%M:%Sz.csv)Or   r   rl   rm   __dict__openprompts_filejsonloadr   r-   r.   r/   r   r0   r1   r   rB   	itertoolsproductr   r   r   rX   generation_lengthNotImplementedErrortextwrapdedentr_   rV   num_runsextendcloneshapenum_key_value_headshasattrr   hidden_sizenum_attention_headsr+   zerosr3   boolrI   rJ   appendsum	unsqueezerepeat
vocab_sizeviewgathersqueezeargmaxeos_token_idmasked_fillreshapecatr2   int64max
contiguouszero_rY   rG   rK   num_hidden_layersr   updatepopanomaly_filteringminlenlistfilterr4   datetimenowrr   )9r
   size_to_promptfr[   r\   r<   all_csv_metrics
batch_sizer]   
max_lengthr^   csv_metricsrO   rP   accelerator_prompt_latency_saccelerator_prompt_latency_msaccelerator_prompt_thrptall_token_idscurrent_length	num_heads	head_sizehas_eosaccelerator_timessampling_timeswall_clock_start_timeaccelerator_time_latency_ssampling_start_timeprompt_end_indicesidxsnext_token_logitsnext_tokenstokens_to_addsampling_end_timeinew_sequence_lengthpresent_keypresent_valuewall_clock_end_time	orig_sizenew_sizeavg_sampling_latency_savg_sampling_latency_msavg_sampling_thrptfirst_token_latency_sfirst_token_latency_msfirst_token_thrpthalfwayhalfway_token_latency_shalfway_token_latency_mshalfway_token_thrptall_token_latency_sall_token_latency_msall_token_thrptwall_clock_latency_swall_clock_thrptr>   ro   r@   r   rA   mainb  s  








&& 
O
	
 
(r  __main__)r
   r   )&
__future__r   r   r   r`   r   r   loggingr   r   rI   numpyr   pandasri   r+   benchmark_helperr   llama_inputsr   r   transformersr   r   r   r	   onnxruntimer8   	getLogger__name__rl   rB   rV   r_   rX   rr   r   r  r@   r@   r@   rA   <module>   s<   

@	 % ~
