o
    }oi                     @   s   d dl Z d dlZzd dlZW n ey    d dlZed Y nw d dlZd dlZd dl	Z
d dlZd dlZd dlmZ d dlZd dlZd dlmZ d dlmZ d dlmZmZ d dlmZmZmZ d dlmZ d dlm Z  d d	l!m"Z"m#Z# d d
l$m%Z% dd Z&G dd dZ'G dd de'Z(dS )    Nz;The package `decord` was not installed in this environment.)Image)logger)str_dtype_to_trttorch_dtype_to_trt)ModelRunnerSession
TensorInfo)
functional)
transforms)AutoProcessorCLIPImageProcessor)TRTLLM_ENGINE_DIRc                 C   sL   | t jkrtjS | t jkrtjS | t jkrtjS | t jkr tjS td|  )Nz%s is not supported)trtfloat16torchfloat32int32bfloat16	TypeErrordtype r   N/home/ubuntu/.local/lib/python3.10/site-packages/nemo/export/multimodal/run.pytrt_dtype_to_torch-   s   



r   c                   @   s   e Zd Zd;ddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
ed<ddZdd Z	d=ddZdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd=d&d'Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Z		4	4d>d5d6Zd7d8 Zd9d: ZdS )?MultimodalModelRunnervisionc                 C   sJ  || _ t | _| jtj  }tj| d| | _tj	tj
 | _tj| j ttj|dd}t|}W d    n1 sFw   Y  |d d | _|d d | _|d d | _|d dd | _|d dd | _d	| _|d
kr~| | | | | tj|t | jdks| jdks| jdkr| | d S d S )Nzcuda:%dzconfig.jsonrbuilder_config
model_type	precision
num_frames
image_size   r   litavilavita)modalitytensorrt_llmmpi_rankruntime_rankr   cudadevice_count
set_devicedeviceStreamcurrent_devicestream
set_streamopenospathjoinjsonloadr   vision_precisionmodality_precisiongetr    r!   profiling_iterationsinit_image_encoderinit_tokenizerinit_llmr   init_vision_preprocessor)selfvisual_engine_dirllm_engine_dirr&   	device_idfconfigr   r   r   __init__<   s.   



zMultimodalModelRunner.__init__c                    sJ  t jt j|drFddlm} ||| _| jj| j_	| j
dkrD| jd| j_| jd| j_| jd| j_| jd| j_d S d S dd	lm} |t j|d
G dd d G  fddd}| | _ | j_ | j_ | j_d| j_| j
dkrd| j_d| j_d| j_d| j_d S d S )Nztokenizer_config.jsonr   )AutoTokenizerr%   z<extra_id_4>z<extra_id_5>z<extra_id_8>z<extra_id_9>)SentencePieceProcessorztokenizer.modelc                   @   s   e Zd Zdd Zdd ZdS )z8MultimodalModelRunner.init_tokenizer.<locals>.return_objc                 S   s
   || _ d S N	input_ids)r@   rK   r   r   r   rF   k   s   
zAMultimodalModelRunner.init_tokenizer.<locals>.return_obj.__init__c                 S   s   |dv r| j S td| d)NrK   z'return_obj' has no item '')rK   AttributeError)r@   namer   r   r   __getitem__n   s   zDMultimodalModelRunner.init_tokenizer.<locals>.return_obj.__getitem__N)__name__
__module____qualname__rF   rO   r   r   r   r   
return_obji   s    rS   c                       s:   e Zd Zd
 fdd	Zd
ddZfddZdd	 ZdS )zBMultimodalModelRunner.init_tokenizer.<locals>.HFTokenizerInterfaceNc                    s$    |}|dkrt|} |S )Npt)encoder   tensor)r@   xreturn_tensorskwargsoutrS   spr   r   rU   w   s   

zIMultimodalModelRunner.init_tokenizer.<locals>.HFTokenizerInterface.encodec                 [   s   | j ||fi |S rI   )rU   )r@   rW   rX   rY   r   r   r   __call__}   s   zKMultimodalModelRunner.init_tokenizer.<locals>.HFTokenizerInterface.__call__c                    s     | S rI   )decodetolistr@   rW   rY   )r\   r   r   r^      s   zIMultimodalModelRunner.init_tokenizer.<locals>.HFTokenizerInterface.decodec                 [   s   | j |fi |S rI   )r^   r`   r   r   r   batch_decode   s   zOMultimodalModelRunner.init_tokenizer.<locals>.HFTokenizerInterface.batch_decoderI   )rP   rQ   rR   rU   r]   r^   ra   r   r[   r   r   HFTokenizerInterfaceu   s
    
rb   rightr#   )r3   r4   existsr5   transformersrG   from_pretrained	tokenizer	eos_token	pad_tokenr   convert_tokens_to_idsim_start_id	im_end_idvid_start_id
vid_end_idsentencepiecerH   eos_ideos_token_idbos_idbos_token_idpad_idpad_token_idpadding_sidepiece_to_id)r@   rB   rG   rH   rb   r   r[   r   r=   Y   s2   

z$MultimodalModelRunner.init_tokenizerc                 C   sp   t j|d}td|  t|d}| }W d    n1 s#w   Y  td|  t|| _	d S )Nzvisual_encoder.engineLoading engine from rbCreating session from engine )
r3   r4   r5   r   infor2   readr   from_serialized_enginevisual_encoder_session)r@   rA   vision_encoder_pathrD   engine_bufferr   r   r   r<      s   
z(MultimodalModelRunner.init_image_encoderc                 C   s   t tj|dd}t|| _W d    n1 sw   Y  | jd d }| jdkr:tj	|d t
jdd| _d S | jd	ksD| jd
krXddlm} |j	|d t
jdd| _d S td| j )Nznemo_config.yamlr   mm_cfgvision_encoderr#   rf   T)torch_dtypetrust_remote_coder$   r%   r   )SiglipImageProcessorzInvalid model type: )r2   r3   r4   r5   yaml	safe_loadnemo_configr   r   rf   r   r   image_processorre   r   
ValueError)r@   visual_encoder_dirrD   vision_configr   r   r   r   r?      s   
z.MultimodalModelRunner.init_vision_preprocessorc                 C   s6   t j|t d| jd| _| jjj| _| jjj	| _
d S )NF)rank
debug_moder0   )r   from_dirr'   r(   r0   modelsession_model_configmodel_configmappingruntime_mapping)r@   rB   r   r   r   r>      s   zMultimodalModelRunner.init_llmc                    st  ddl m} t trO| | j}|dkrdd D }n|t|t}tjdtd |t	d}fdd|D }t||k rN||d g|t|  7 }nKt tj
r| j}|dkrdd	d  D }n6t| jd }tjd jd d |t	d} fd
d|D }t||k r||d g|t|  7 }n| j}tjdtjd}|j|ddd }|tj| j}|dS )Nr   )VideoReaderc                 S   s    g | ]}t | d qS RGBr   	fromarrayasnumpyconvert.0framer   r   r   
<listcomp>        z:MultimodalModelRunner.video_preprocess.<locals>.<listcomp>   )numr   c                    s$   g | ]}t  |  d qS r   r   r   idx)vrr   r   r      s   $ c                 S   s   g | ]
}t |d qS r   r   r   r   r   r   r   r   r          c                    s    g | ]}t  | d qS r   r   r   )
video_pathr   r   r      r   zopenai/clip-vit-large-patch14)r   rT   rX   pixel_values)decordr   
isinstancestrr    minlennplinspaceintndarrayshaper   r   rf   r   r   
preprocesstor'   _utilsstr_dtype_to_torchr8   	unsqueeze)r@   r   r   r    framesindices	processormedia_tensorsr   )r   r   r   video_preprocess   s:   

z&MultimodalModelRunner.video_preprocessc                 C   s   | j j}| j j}| j j}| j j}|dkjdd  }|  }d}t|D ]$}	|d | }
|	|
d | |	|
d d |	|
d | |d7 }q(|d | }|	|d | |	|d d |	|d | |
|d  tj|tjdd}|S )Nr   Fas_tupler      r   )rg   rk   rl   rm   rn   nonzerosqueezer_   rangeinsertpopr   rV   longr   )r@   rK   r    rk   rl   rm   rn   image_token_indicesoffsetir   vid_idxr   r   r   insert_tokens_by_index   s(   
z,MultimodalModelRunner.insert_tokens_by_indexc                 C   s@  |s
t | j  |st | j  | jdkr{| ||\}}| ||d |d  | j}	| 	|	}
|
d }t
dd |D }|dkrXt|dkrX||jd |jd  7 }n||jd 7 }t|g| tj}| ||||\}	}|	|||fS | jdks| jdkr,g }t|D ]\}}| ||\}}q|d}| || j\}}}|||g | ||d |d  | j}	| |	|}	| 	|	}|d }t
dd |D }|dd	|jd	 }tj||gdd
}tj| d d	 tjd|j}|dkr||jd |jd  7 }ntdt|g| tj}| ||||\}	}|	|||fS | ||\}}| j|dddj}|d d urx| j|dddj}| jdkrh|jd |jd  |jd |jd   }n|jd |jd  |jd  }nd }|jd |jd  }t|g| tj}|  ||||\}	}|	|||fS )Nr$   r   c                 S      g | ]}|j d  qS r   r   r   idsr   r   r   r   	      z4MultimodalModelRunner.preprocess.<locals>.<listcomp>r   r#   r%   c                 S   r   r   r   r   r   r   r   r   #  r   r   dimr   zCBatch size greater than 1 is not supported for LITA and VITA modelsrT   T)rX   padding
video-neva   )!profilerstartr&   
capitalizestopr   get_visual_featurestokenizer_image_tokenrg   split_prompt_by_imagessumr   r   r   	IntTensorr   r   setup_fake_prompts_vila	enumerater   preprocess_lita_visualr   extendr   viewcatonessizer   r-   r   rK   setup_fake_prompts)r@   warmup
pre_promptpost_promptimageattention_mask
batch_sizevisual_featuresvisual_attsrK   batch_split_promptsfirst_batch_split_promptslengthinput_lengthsptuning_argsvisual_inputr   img	im_tokens
vid_tokensnum_sample_framesbatch_splitspre_input_idspost_input_idsr   r   r   r      sp   



$
& z MultimodalModelRunner.preprocess8c           	         s    fdd| dD }dd }g }d}t|dkr6t|d dkr6|d d  jkr6d}||d d  |||g|d  D ]}|||d   q@tj|tjd}d|||k< |d	| d	}|S )
Nc                    s   g | ]} |j qS r   rJ   )r   chunkrg   r   r   r   M  r   z?MultimodalModelRunner.tokenizer_image_token.<locals>.<listcomp>z<image>c                 S   s&   dd t | |gt|  D d d S )Nc                 S   s   g | ]	}|D ]}|qqS r   r   )r   sublisteler   r   r   r   P      zYMultimodalModelRunner.tokenizer_image_token.<locals>.insert_separator.<locals>.<listcomp>r   )zipr   )Xsepr   r   r   insert_separatorO  s   &zEMultimodalModelRunner.tokenizer_image_token.<locals>.insert_separatorr   r   r   r   )
splitr   rs   appendr   r   rV   r   r   expand)	r   promptrg   image_token_indexprompt_chunksr
  rK   r   rW   r   r  r   r   K  s   .z+MultimodalModelRunner.tokenizer_image_tokenc                 C   s   g }|D ]H}|dkj ddd}d}g }|D ]}||kr)|||| d |d }q|t|k r@|||d  d dd |D }|| q|S )Nr   Fr   r   c                 S   s   g | ]
}|  d kr|qS r   )numel)r   r  r   r   r   r   p  r   z@MultimodalModelRunner.split_prompt_by_images.<locals>.<listcomp>)r   r   r  r   r   )r@   rV   r   batchzero_indices	start_idxsplitsr   r   r   r   r   a  s   
z,MultimodalModelRunner.split_prompt_by_imagesNc                    s  |st d ||||||\} }}|rd S t d jj}t|d g|d< jj|d |d ||jj	d ur@jj	njj
d |	|
||d|ddt d t dkr{ fddt|D fddt|D }t d |S t d d S )	NGenerateLLMr   F)sampling_configprompt_tablemax_new_tokensend_idrt   top_ktop_ptemperaturerepetition_penalty	num_beamsoutput_sequence_lengths	lora_uidsreturn_dictc                    s2   g | ]}j j|d d  | d f ddqS )NT)skip_special_tokens)rg   ra   )r   	batch_idx)r   
output_idsr@   r   r   r     s    z2MultimodalModelRunner.generate.<locals>.<listcomp>c                    s$   g | ]  fd dt D qS )c                    s   g | ]
}  |   qS r   )strip)r   beam_idx)r&  output_beams_listr   r   r     r   z=MultimodalModelRunner.generate.<locals>.<listcomp>.<listcomp>)r   )r   )r!  r*  )r&  r   r     s    )r   r   r   rg   rq   r   stackr   generateru   all_special_idsr   r'   r(   r   )r@   r   r   r   decoder_input_idsr  r   r   r   r  r  r  r   r!  r#  rK   r   r   r  stripped_textr   )r   r!  r*  r'  r@   r   r,  u  sP   





zMultimodalModelRunner.generatec           
         s   d  tj| ji}|d ur||d< tdt| j jg}|d ur/|tdt	j
j|j | j|} fdd|D }| j||| jj}|sNJ d| j  |d }tj| d d tjd  j}	||	fS )	Ninputr   c                    .   i | ]}|j tjt|jt|j jd qS )r   r-   rN   r   emptytupler   r   r   r-   r   tr   r   r   
<dictcomp>       z=MultimodalModelRunner.get_visual_features.<locals>.<dictcomp>3Runtime execution failed for vision encoder sessionoutputr   r   )r   r'   r   r   r8   r   r   r   r  r   DataTypeINT32r~   infer_shapesrunr0   cuda_streamsynchronizer   r   r   r   r-   )
r@   r   r   r   tensor_infovisual_output_infovisual_outputsokimage_embeds
image_attsr   r8  r   r     s    

$z)MultimodalModelRunner.get_visual_featuresc                 C   s   t | dr|jd | jkr||jd d|jd }t| jj| jj|jd |jd   }||jd |jd }|d urD|||g}n||g}tj	|dd
 tj}| |||}||fS )Nr    r   r   r   r   )hasattrr   r    r   r   aranger   
vocab_sizereshaper   
contiguousr   r   ptuning_setup)r@   r   r   r   r   fake_prompt_idrK   r   r   r   r   r     s   z(MultimodalModelRunner.setup_fake_promptsc                 C   sL  | j dks
| j dkr|d d}dd |D }||d g }| jj}|dkr3t|t|ks3J dg }|dkr|d g}| j dkrzt|D ]1\}	}
t|||
jd  }||
jd 7 }|	d}|
| t||	d krx|
||	d   qGn| j dks| j dkrt|D ]9\}	}t|||jd  }||jd }||jd 7 }|	d}|
| t||	d kr|
||	d   qnE|dkr| j dkrt|D ]5\}	}
|
|d  t|||
jd  }||
jd 7 }|	d}|
| t|dkr|
|d  qtj|dd	 tj}||d
}| |||}||fS )Nr#   r%   r   c                 S   s   g | ]}| d qS r  )r   r6  r   r   r   r     r   zAMultimodalModelRunner.setup_fake_prompts_vila.<locals>.<listcomp>r   z[Unexpected number of visual features. Please check #<image> in prompt and the #image files.r$   r   r   )r   r   r   rK  r   r   r   rJ  r   r   r  rL  r   rM  r   r   rN  )r@   r   r   split_input_idsr   squeeze_img_tokensreshape_img_tokensfake_prompt_counterrK   r   visual_featurerO  visual_fr   r   r   r   r     s`   



	



z-MultimodalModelRunner.setup_fake_prompts_vilac                 C   s\  |j \}}}}|}d|d d v rJ|d d d dkrJt||d d d }ttd|d |t}	|d d |	df }
t|d	d
}|
||fS d|d d v r|d d d dkrd}ttd|j d d || t}|d d |df }tj	|dddd}t
j||d}tj	|d|d}t|d	d
}|||d fS td|d d d  )Nvisual_token_formatr   r#   im_vid_start_endsample_framesr   r   .zb t s d -> b t dmeanlita_video_archtemporal_spatial_poolr   zb t (h w) d -> (b t) d h w   )hw)kernel_sizez(b t) d h w -> b (t h w) d)bzInvalid visual token format: )r   r   r   roundr   astyper   einopsreduce	rearrangeF
avg_pool2dr   )r@   r   rE   r`  r7  sdr    num_image_framesr   im_featuresvid_features	pool_sizeselected_framess_tokenst_tokensr   r   r   r   $  s(   
(z,MultimodalModelRunner.preprocess_lita_visualc                 C   s  | j j| jj }| jdks| jdkrtj|dd}|d urWtj|jd gtj	d
 }||jd |jd  |jd f}|jd |ksHJ d|
 jtj| j jd}ntd|g
 }tdg
 }| j jr{tjt|gtj	d
 }ntj|jtj	d
 }|||gS )	Nr#   r%   r   r   r   r   r   z0Prompt table dimensions do not match hidden size)r   hidden_sizer   tp_sizer   r   r   rV   r   r   r*   r   r   r'   r   r   r   r4  zerosremove_input_paddingr   )r@   r  rK   r   rq  task_vocab_sizetasksr   r   r   rN  H  s*   
$
z#MultimodalModelRunner.ptuning_setupc           	      C   s   |j dd  \}}t|}t|}||kr|S ||krDtj|d|||d }|| d }|| }||d d d d ||d d f< |S tj|d|||d }|| d }|| }||d d d d d d ||f< |S )Nzc -> b c h w)r`  r]  r^  r   )r   r   r   Tensorrc  repeatclone)	r@   imagesbackground_colorheightwidthr`  resultpaste_start	paste_endr   r   r   expand2square_ptd  s    
  z&MultimodalModelRunner.expand2square_ptc                 C   s   d }t |tr9tjd tj|d}|d ur.ttdt	|d |
t}||}ntdd |D }nt |tjrGtj|tjd}| |||S )Nr   )urir   r   c                 S   s   g | ]	}t | qS r   )r   rV   r   )r   rD   r   r   r   r     r  z4MultimodalModelRunner.load_video.<locals>.<listcomp>r   )r   r   r   bridge
set_bridger   r   ra  r   r   rb  r   	get_batchr   r   r   rV   r   preprocess_frames)r@   rE   r   r   r    r   video_readerr   r   r   r   
load_videow  s   
"z MultimodalModelRunner.load_videoc                 C   sN   t |d}|d d dkr| |tdd |jD }|j|ddd	 }|S )
Nzt h w c -> t c h wdataimage_aspect_ratiopadc                 s       | ]	}t |d  V  qdS    Nr   r   rW   r   r   r   	<genexpr>      z:MultimodalModelRunner.preprocess_frames.<locals>.<genexpr>rT   r   r   )rc  re  r  r5  
image_meanr   )r@   r   rE   r   processed_framesr   r   r   r    s
   z'MultimodalModelRunner.preprocess_framesc                 C   sx   d|d d v r4|d d d dkr4|d d }||kr|S t tt|| }t tt|| S |d d d S )NrV  r   r#   rW  r  r    rX  )r   r   ceilfloatra  )r@   rE   vid_len
max_frames	subsampler   r   r   get_num_sample_frames  s   z+MultimodalModelRunner.get_num_sample_framesc                 C   s   d }t |tr(tt|}| ||}| ||||dj| j	t
jd}|S t |tjr?| |||dj| j	t
jd}|S )Nr   r   )r   r   r   r   r   r  r  r   r   r-   r   r   r   r   )r@   r   r   r   r   r  r   r   r   r   process_lita_video  s"   


z(MultimodalModelRunner.process_lita_videoc                 C   s   t |tr |d urttj||d}nt|d}n|}|d d d }t|}|	|}|d d dkrV| 
|tdd	 |jD }|j|d
dd d }|S |j|d
dd d }|S )Nr   r   r   	crop_sizer  r  r  c                 s   r  r  r  r  r   r   r   r    r  z6MultimodalModelRunner.process_image.<locals>.<genexpr>rT   r   r   r   )r   r   r   r2   r3   r4   r5   r   r5  resizer  r  r   )r@   
image_filer   r   image_folderr   r  r   r   r   process_image  s   

z#MultimodalModelRunner.process_imagec                    s:   fdd|D  t  fdd D rtj dd  S )Nc                    s    g | ]}  | j jd qS rI   )r  r   r   )r   r   r@   r   r   r     r   z:MultimodalModelRunner.process_vila_img.<locals>.<listcomp>c                 3   s     | ]}|j  d  j kV  qdS )r   Nr   r  )
new_imagesr   r   r    s    z9MultimodalModelRunner.process_vila_img.<locals>.<genexpr>r   r   )allr   r+  )r@   r{  r   )r  r@   r   process_vila_img  s   z&MultimodalModelRunner.process_vila_imgc                 C   s  d }d }| j dkr?| j}tj}ttj||ftjjdt	 t
ddg}|||d}|d u r6d}d}	d| d}
nO| j d	krX| |}|d u rOd
}d}	d| d}
n6| j dv r| j dksg| j dkrtd}	|d u rod}|d }
n| j dkrd}	|d u rd}|d }
ntd| j  | j dks| j dkr| | j|| j}| j dkr|g| }| |}|	g| }	|
g| }
| j dvr| dkr||dddd }n
||ddd }|| j}d }||	|
|||fS )Nneva)interpolation)      ?r  r  r   zHi! What is in this image?z%<extra_id_0>System

<extra_id_1>User

z
<extra_id_1>Assistant
r   zHi! What is in this video?z<extra_id_0>System
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.

<extra_id_1>Userz
<extra_id_1>Assistant
<extra_id_2>quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,correctness:4,coherence:4,complexity:4,verbosity:4
)r$   r#   r%   r$   r#   zA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: z5<image>
 Please elaborate what you see in the images?z ASSISTANT:r%   a"  <|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. <|start_header_id|>user<|end_header_id|>

z/<|start_header_id|>assistant<|end_header_id|>

Invalid model type    r   )r   r!   r   r   r
   ComposeResizeInterpolationModeBICUBICToTensor	Normalizer   r   r   RuntimeErrorr  r   r   r  r   r  rM  r-   )r@   
input_text	raw_imager   r   r   r!   r   	transformr   r   r.  r   r   r   setup_inputs  sj   













z"MultimodalModelRunner.setup_inputsFc                 C   s   |  |||\}}}}}}| j||||||d||||||	|
d |r%| jnd}t|D ]}| j||||||d||||||	|
d}q+| jdkrP| ||||	|| |S )NT)	r   r   r   r  r  r  r   r!  r#  r   Fr   r  r,  r;   r   r)   print_result)r@   r  input_imager  r   r  r  r  r   r!  r#  run_profilingcheck_accuracyr   r   processed_imager.  r   	num_iters_output_textr   r   r   r@    sP   
zMultimodalModelRunner.runc           
         sL  |s|sd S t d  jdkrt d|  t d|d   |dkr> j|d d ddd	 }t d
t| d |rot|d D ](}|| ||d  ksbt d| d|d  d J d|d d  v snJ qF|r fdd}	t d t d j d|	 j   t d|	d  t d|	d  t d d S )Nz9---------------------------------------------------------nougatz
[Q] z
[A] r   r   F)add_special_tokensrK   z
Generated z tokenszOutput z and z do not matchrobotc                    s   dt |   j S )Ni  )r   elapsed_time_in_secr;   )rN   r  r   r   <lambda>k  s    z4MultimodalModelRunner.print_result.<locals>.<lambda>zLatencies per batch (msec)zTRT z encoder: %.1fzTRTLLM LLM generate: %.1fr  zMultimodal generate: %.1fr  )	r   r{   r   rg   r   r   lowerr&   r   )
r@   r  r  r   r!  r  r  r'  r   msec_per_batchr   r  r   r  V  s,   


"z"MultimodalModelRunner.print_resultc                 C   sR   g d}| j |v r|}|S | j dks| j dkr!t|d}|S td| j  )N)r   r#   r%   r  r$   r   r  )r   r   r2   r   r  )r@   input_mediamedia_modelmediar   r   r   load_test_medias  s   
z%MultimodalModelRunner.load_test_media)r   )r  rI   )NFF)rP   rQ   rR   rF   r=   r<   r?   r>   r   r   r   staticmethodr   r   r,  r   r   r   r   rN  r  r  r  r  r  r  r  r  r@  r  r  r   r   r   r   r   :   sB    
:
'O#
I9$
`
9r   c                       s   e Zd Z fddZdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
													d dededededededefddZ  ZS )!SpeechllmModelRunnerc                    sF   t  ||| | jdksJ tj|d}| || _| | dS )a  
        perception_engine_dir: path to the perception engine directory
                               it should contain:
                               config.json nemo_config.yaml
                               perception_encoder.engine : tensorrt engine
                               feature_extractor.ts  : torchscript model
        llm_engine_dir: path to the LLM engine directory
        salmzfeature_extractor.tsN)	superrF   r   r3   r4   r5   init_speech_preprocessorfeature_extractorinit_modality_encoder)r@   perception_engine_dirrB   r&   feature_extractor_path	__class__r   r   rF     s
   	zSpeechllmModelRunner.__init__c                 C   s   d}t |D ]}|dr|} nq|dusJ d| t j||}td|  t|d}| }W d   n1 sAw   Y  td|  t	
|| _dS )z
        Initialize the modality encoder session from the prebuilt engine directory
        Args:
            engine_dir: str, path to the engine directory
        Nz.enginezEngine file not found in rx   ry   rz   )r3   listdirendswithr4   r5   r   r{   r2   r|   r   r}   modality_encoder_session)r@   
engine_direngine_filefileencoder_pathrD   r   r   r   r   r    s   

z*SpeechllmModelRunner.init_modality_encoderc                 C   s   t j|}|  |S rI   )r   jitr7   eval)r@   r  r  r   r   r   r    s   z-SpeechllmModelRunner.init_speech_preprocessorc                 C   s8   t j|t jd}t j|t jd}| ||\}}||fS )a  
        Args:
            input_signal: audio signal in numpy array
            input_signal_length: length of the audio signal in numpy array

        Returns:
            processed_signal: torch.tensor [B, 80, T]
            processed_signal_length [B]
        r   )r   rV   r   r   r  )r@   input_signalinput_signal_lengthprocessed_signalprocessed_signal_lengthr   r   r   process_audio  s   
z"SpeechllmModelRunner.process_audioc                 C   s   |\}}|  ||\}}|| j}|| j}|du rd}t|tr(|g| }t||ks0J dg| }|}	d}
d}|||	|||
|fS )a-  
        Args:
            input_text: str or List[str] or None
            input_media: Tuple[np.array, np.array]
                input_signal: audio signal in numpy array [b, -1]
                input_signal_length: length of the audio signal in numpy array [b]
            batch_size: int

        Nz,Q: what's the transcription of the audio? A: )r  r   r-   r   r   r   )r@   r  r  r   r  r  r  r  r   r   r.  r   r   r   r   r    s*   



z!SpeechllmModelRunner.setup_inputsc                 C   sD   t j|tjd\}}tj|gtjd}tjt|gtjd}||fS )z
        Args:
            input_media_path: str, path to the audio file
        Returns:
            input_signal: np.array [1, -1]
            input_signal_length: np.array [1]
        r   )sfr|   r   r   arrayr   r   )r@   input_media_pathwaveformsample_rater  r  r   r   r   r    s   z$SpeechllmModelRunner.load_test_mediac           	         s   |dur||d< g }|  D ]\}}|t|t|j|j q j|} fdd|D } j|| j	j
}|s?J d j	  |S )z
        Do inference on the modality encoder engine
        Args:
            modality_features: dict {'input1': torch.tensor, 'input2': torch.tensor, ..}
            attention_mask: None
        Returns:
        Nr   c                    r1  r2  r3  r6  r  r   r   r9    r:  zFSpeechllmModelRunner.get_modality_encoder_features.<locals>.<dictcomp>r;  )itemsr  r   r   r   r   r  r?  r@  r0   rA  rB  )	r@   modality_featuresr   rC  keyrV   output_infooutputsrF  r   r  r   get_modality_encoder_features  s   	

z2SpeechllmModelRunner.get_modality_encoder_featuresc                    sb  |s
t j  |st j  jdks!J dj |d |d tjd}	||}|d |d }}	
|j}

|j}g }g }|	  }	jj}t|D ]*}|	| }t||| }t|
| ||| g}||7 }|t| || q[t|  fdd	|D }tj|tjd
}tj|tjd
}|||}||||fS )a2  
        Args:
            warmup: bool
            pre_prompt: List[str]
            post_prompt: List[str]
            processed_features: Tuple[torch.tensor, torch.tensor]
                processed_signal: torch.tensor [B, 80, T]
                processed_signal_length: torch.tensor [B]
            attention_mask: None
            batch_size: int
        Returns:
            input_ids: torch.tensor [B, L]
            input_lengths: torch.tensor [B]
            ptuning_args: List[torch.tensor]
            encoded_features: torch.tensor [B, L, D]
        r  r  r   r   )r  r  encodedencoded_lengthc                    s.   g | ]}t j|d  t| fdjjdqS )r   constant)constant_values)r   r  r   rg   ru   r   
max_lengthr@   r   r   r   4  r:  z3SpeechllmModelRunner.preprocess.<locals>.<listcomp>r   )r   r   r&   r   r   r   r   r   r   r  rg   rK   cpunumpyr   rK  r   r   rJ  concatenater  r   maxrV   rN  )r@   r   r   r   processed_featuresr   r   encoded_outputsencoded_featuresr  r   r   r   rK   fake_id_startr   feat_lenfeat_fake_idscur_input_idsr   r   r  r   r     s>   zSpeechllmModelRunner.preprocessN   r                 ?Fr  r   r  r  r  r   r!  c                 C   s   |du r|dur|dusJ ||f}|  |||\}}}}}}}||f}| j||||||d||||||	d |
r;| jnd}t|D ]}| j||||||d||||||	d}qA| jdkre| ||||	|
| |S )a%  
        Args:
            input_text: str or List[str] or None
            input_media: Tuple[np.array, np.array] or None
                input_signal: audio signal in numpy array [b, -1]
                input_signal_length: length of the audio signal in numpy array [b]
            max_new_tokens: int
            batch_size: int
            top_k: int
            top_p: float
            temperature: float
            repetition_penalty: float
            num_beams: int
            run_profiling: bool
            check_accuracy: bool
        NT)r   r   r   r  r  r  r   r!  r   Fr   r  )r@   r  r  r  r   r  r  r  r   r!  r  r  r  r  r#  r   r   r  r  r.  r   processed_mediar  r  r  r   r   r   r@  >  s`   !

zSpeechllmModelRunner.run)Nr  r   r   r  r  r  r   FFNNN)rP   rQ   rR   rF   r  r  r  r  r  r  r   r   r  r@  __classcell__r   r   r  r   r    sJ    #<	
r  ))r6   r3   r   	Exceptionloggingwarningrc  r  r   	soundfiler  tensorrtr   r'   tensorrt_llm.profilerr   r   r   PILr   r   tensorrt_llm._utilsr   r   tensorrt_llm.runtimer   r   r   torch.nnr	   rf  torchvisionr
   re   r   r   nemo.export.utils.constantsr   r   r   r  r   r   r   r   <module>   sB         K