o
    ٷiC                     @   sp   d dl Z d dlZd dlZd dlmZ d dlZejej	ej
ejdZdd ZG dd dZ						dd
dZdS )    N)AutoTokenizer)ztorch.int32ztorch.int64ztorch.float32ztorch.float16c                 C   s8   ddl m} ||  | | |  |jj d S )Nr   )cudart)cudar   
cudaMemcpydata_ptrelement_sizenelementcudaMemcpyKindcudaMemcpyDeviceToDevice)dstsrcr    r   j/home/ubuntu/.local/lib/python3.10/site-packages/onnxruntime/transformers/models/phi2/inference_example.pycuda_memcpy   s   r   c                   @   sb   e Zd Zdd Zdd Zdd Zdejded	efd
dZ		dddZ
dddZdd Zdd ZdS )ORTGeneratorc                 C   s:   || _ d| _d| _d| _d| _d| _d| _d| _i | _d S )N    P   i   r   F)	onnx_decoder_path	num_heads	head_size
num_layersmax_sequence_length	device_iduse_cuda_graphuse_traced_inputsstatic_inputs_map)selfdecoder_pathr   r   r   __init__"   s   
zORTGenerator.__init__c                 C   s  || j v rd S td}td| j}i }tj|dftj|d|d< tjdgtj|d|d< tj|dg tj|d|d< tjdgtj|d|d	< || j| j	| j
f}t| jD ] }tj||tjd
}|d| | d| |  i qVtj|ddftj|d|d< || j |< d S )Ncpur      )dtypedevice	input_idsr   step	seqlens_ktotal_sequence_lengthr"   r!   	past_key_past_value_   logits)r   torchr"   r   zerosint32tensorint64r   r   r   ranger   float16update
contiguousclone)r   
batch_size
cpu_devicecuda_device	static_iocache_shapeicacher   r   r   append_static_inputs-   s   

,z!ORTGenerator.append_static_inputsc              	   C   s2  | j rtjntj| _tj|d | jtjd}tj|d | jtjd}|j\}}| j	o5|| j
v o5| jo5| j | _| jsEtjdg| jtjdn| j
| d }| js\tj|dg | jtjdn| j
| d }t||ddtj | jstjdgtdtjdn| j
| d	 }||d< | | d
}	| jr| |	d< | j	r| |	d< | |	d	< |	d= | jr| jnd}
| jrd|| j|
| jfn|| j|
| jf}| jst| jD ]0}tj|| j| jd}| js|	d| | d| |  in|	d| | i qn,t| jD ]&}|	d| | j
| d|   d| | j
| d|   i qtj||d| j| jd}d| i}| js| jrWd|| j|| jfn|| j|| jf}t| jD ]0}tj|| j| jd}| js|d| | d| | in|d| | i qd|	|fS )Nr#   r'   attention_maskr   r$   r%   r    r   r&   )r#   r>      r(   r)   past_r*   r+   present_key_present_value_present_)use_fp16r,   r2   float32torch_dtyper/   r"   r.   shaper   r   use_buffer_share	packed_kvr   r0   r   sumsubtor4   use_stepr   r   r   r1   r   r-   r3   r5   )r   encodings_dictr#   r>   r6   sequence_lengthr$   r%   total_seq_lengthinputspast_seq_length
past_shaper;   pastr+   outputspresent_shapepresentr   r   r   get_initial_inputs_and_outputsD   s   
	,z+ORTGenerator.get_initial_inputs_and_outputsmodelrQ   rU   c           
   	   C   s  |  }d }| D ](\}}|j||jj|jjdkrdn|jjtt|j t	|j
| d |j}q
| D ]T}|j}	| jrgd|	v rg||	dd }|j|	|jj|jj| jrZtjntjt	|j
| d q7||	 }|j|	|j|jdkrwdn|j| jrtjntjt	|j
| d q7|S )Nr   r   )namedevice_typer   element_typerG   
buffer_ptrrW   rT   )
io_bindingitems
bind_inputr"   typeindexpt_to_npreprr!   tuplerG   r   get_outputsrZ   rH   replacebind_outputrD   npr2   rE   )
r   rY   rQ   rU   r^   r"   kvoutputrZ   r   r   r   apply_io_binding   sD   		zORTGenerator.apply_io_bindingTFc           	      C   s   || _ t }d|_d|_|| _| j dkrd| j | jdfnd}tj| j||gd| _t	 | _
tj r<td| j ntd| _|| _|| _|| _|| _tjd	d
d| _d| j_d S )N   r   CUDAExecutionProvider)r   enable_cuda_graphCPUExecutionProvider)sess_options	providersr   r   zmicrosoft/phi-2T)trust_remote_codez[PAD])r   ortSessionOptionslog_verbosity_levellog_severity_levelr   InferenceSessionr   sess
RunOptionsror,   r   is_availabler"   rD   rH   rI   rM   r   from_pretrained	tokenizer	pad_token)	r   r   rD   rH   rI   rM   r   rr   epr   r   r   create_session   s$   

$zORTGenerator.create_sessionc              
   C   s  |  |\}}|d  }|j\}}	|	}
tj|| jtjd}|r"g }d}|
|k r&| | j||}|r7t		 }|
  |rc| jrG| jdd | j|| j | jr`| jd| jr]t|nd d}n| j|| j |  |r|t		 }|||  |d d d dd d f }tj|dd	}||B | jjk}||| jj|d
g}tj||gdd	}t|rnq|
d
7 }
|tj|d< | jrt| j| d |d  | j| d |d< | jrtj|
d
 g| jtj d|d< | jrt| j| d |d  | j| d |d< | jrR|d }|| |d
 tj|d< |
|d d< | jrQt| j| d |d  | j| d |d< |d d | j| d d< | j| d |d< nt|d | |d
gd
tj|d< |d jd
 d
kr|d d d d d
d d f ! |d< | jr| j| d |d< |d "  | j#s"t$| j%D ]-}| j&s|d|  |d| < |d|  |d| < q|d|  |d| < q|d jd
 }| j&rd|| j'|| j(fn|| j'|| j(f}t$| j%D ]2}tj|| j| j)d}| j&s|*d| |! d| | ! in|*d| |! i q|
|k s)|rRt+d| d|	 d||	   t+dd|d   ddt,-|d
d    d d S | jj.|dd}|S )Nr#   r'   Tgpu_graph_idz-1Fr+   )dimr    r$   r%   r&   r   r>   rA   r(   rB   r)   rC   r@   r?   zBatch size: z, Sequence length: z, Token num: zPrompt letency: i  zms, Token latency: ms)skip_special_tokens)/rX   r5   rG   r,   r-   r"   boolrm   rz   timesynchronize_inputsr   r|   add_run_config_entryrun_with_iobindingr   strsynchronize_outputsappendargmaxr   eos_token_idmasked_fillreshapecatallrL   r.   r   r   rM   r/   r0   r4   zero_rH   r1   r   rI   r   r   rF   r3   printri   meanbatch_decode)r   rN   
max_lengthcuda_graph_annotation	benchmarkrQ   rU   all_token_idsr6   rO   current_lengthhas_eoslatency
prompt_runr^   startendnext_token_logitsnext_tokenstokens_to_addprevious_seqlens_kr;   new_sequence_lengthrV   rW   textsr   r   r   generate_impl   s   


&h0zORTGenerator.generate_implc                 C   s   | j j|dd}| |||S )NT)padding)r   batch_encode_plusr   )r   promptr   r   rN   r   r   r   generatea  s   zORTGenerator.generatec                 C   sx   |\}}|| }i }t jdd||ft jd |d< t j||ft jd |d< | j|||dd | j|||dd d S )	Nr   iX  )r!   r#   r>   F)r   T)r,   randintr.   tolistonesr   )r   prompt_shape	token_numr   r6   rO   r   rN   r   r   r   generate_benchmarkf  s    zORTGenerator.generate_benchmarkN)TTFFF)F)__name__
__module____qualname__r   r=   rX   ru   ry   dictrm   r   r   r   r   r   r   r   r   r   !   s    ])

r   FTc                    s   t |   |||||  fdd}dg}	|s||	 |r=d}
dD ]} | dD ]}||f} j||
|d q-q$d S d S )Nc                    sZ   t | }r j|d  j| d|d}tt |D ]}td| |  td||  qd S )N)r6      )r   r   zPrompt: zTexts: )lenr=   r   r1   r   )r   example_batch_sizer   r;   	generatorr   r   r   
simple_run  s   zrun_phi2.<locals>.simple_runzV```python
    def print_prime(n):
    """
    Print all primes between 1 and n
    """r   )r    r?   rn      )   i   )r   )r   r   r=   r   )onnx_model_pathrH   r   rI   rD   rM   r   run_benchmarkr   r   r   r6   rO   r   r   r   r   run_phi2u  s"   

r   )FTFFF)r   numpyri   r,   transformersr   onnxruntimeru   r.   r0   rE   r2   rc   r   r   r   r   r   r   r   <module>   s(     Z