o
    }oim                      @   sV  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dl	Z
d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* ddl+m,Z, e
-e
j-j.Z/													dUde0de0de0de0de0de1d e1d!e1d"e1d#e1d$e0d%e0d&ee0 d'e1d(ee0 fd)d*Z2										dVde0d+e0de1d e1d!e1d"e1d#e1d$e0d%e0d&ee0 d'e1d(ee0 fd,d-Z3d.gd.d d/iifd0d1Z4d2d3gd4d5gd/d6d7d d/id/d6d8d d/id9fd:d;Z5ej6dddd<fd=d>Z7	dWde0de0de0d?e1fd@dAZ8	dWde0de0d?e1fdBdCZ9	D	dXde0dEe0de0d"e1fdFdGZ:	H	dYde0d+e0dIe0d?e1fdJdKZ;		dZde0de0de0d?e1fdLdMZ<dNe0dOe0fdPdQZ=	H											d[de0dRe0dIe0d?e1de1d e1d!e1d"e1d#e1d$e0d%e0d&ee0 d'e1d(ee0 fdSdTZ>dS )\    N)Path)time)List)	OmegaConf)Image)check_max_num_tokens)BuildConfigBuilder)build)Mapping)MLLaMAForCausalLM)PluginConfig)	AutoModelAutoProcessorMllamaForConditionalGeneration)AudioPerceptionModule)	typecheck)TensorRTLLM)load_nemo_model   )convert_mllama_nemo_to_hfnevallama      bfloat16@   	model_dirvisual_checkpoint_pathllm_checkpoint_path
model_typellm_model_typetensor_parallelism_sizemax_input_lenmax_output_lenmax_batch_sizemax_multimodal_lendtypeuse_lora_pluginlora_target_modulesmax_lora_ranklora_ckpt_listc                 C   sH   t | |dd}|j|du r|n||||||| ||	|
d|||dd dS )z"Build TRTLLM engine by nemo exportF)r   r+   
load_modelN)nemo_checkpoint_pathr    r"   r#   r$   max_seq_lenr%   max_prompt_embedding_table_sizer'   r,   r(   r)   r*   use_mcore_path)r   export)r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   trt_llm_exporter r3   P/home/ubuntu/.local/lib/python3.10/site-packages/nemo/export/multimodal/build.pybuild_trtllm_engine/   s"   
r5   hf_model_pathc                 C   s   |dk r
t d d}t }d|_d|_|jdd d|_d|_|| }tdd|||dd|jdd	d

\}}||||d|||ddd
}t	j
||d}t|D ]}t|||d}tj|||d}t||}||  qLdS )z"Build mllama TRTLLM engine from HF   zeTensorRT LLM may hit a runtime issue with batch size is smaller than 4 on some models. Force set to 4auto   )tokens_per_blockTNr   F)
max_num_tokensopt_num_tokensr.   r%   r#   max_beam_widthremove_input_paddingenable_context_fmhar:   multiple_profiles)
r#   r$   max_encoder_input_lenr%   r=   r.   r;   r<   strongly_typedbuilder_opt)plugin_config)
world_sizeranktp_size)mapping)printr   gpt_attention_plugingemm_pluginenable_paged_kv_cacher>   use_paged_context_fmhar   context_fmhar   	from_dictranger   r   from_hugging_facebuild_trtllmsave)r   r6   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   rD   r.   r;   r<   
build_dictbuild_configrF   rH   modelenginer3   r3   r4   build_mllama_trtllm_engineT   s\   

rX   inputbatchc              	   C   sJ   t tjjd tj| ddd tjj	| || dd|dg|d d	S )
zExport visual wrapper to ONNXExporting onnx/onnxTexist_okz/onnx/visual_encoder.onnx   outputopset_versioninput_namesoutput_namesdynamic_axesN
loggerlogtrtLoggerINFOosmakedirstorchonnxr1   )visual_wrapperrY   
output_dirrc   re   r3   r3   r4   export_visual_wrapper_onnx   s   
rr   processed_signalprocessed_signal_lengthencodedencoded_lengthr   )r      )r   r   )rs   rt   ru   rv   c              	   C   sH   t tjjd tj| ddd tjj	| || dd|||d dS )	z!Export perception wrapper to ONNXr[   r\   Tr]   z/onnx/perception_encoder.onnxr_   ra   Nrf   )perception_wrapperrY   rq   rc   rd   re   r3   r3   r4   export_perception_wrapper_onnx   s   
ry   visual_encoderc	           $      C   s  d||f }	d||f }
d|df }d|df }t |d}t|| W d   n1 s.w   Y  ttjjd|  tt}|	d	t
tjj> }| }t|d
d | d}|dure||d< |durm||d< t jdi |}|j}t|t}t |	d;}|| tj|	sttjjd|	  t|jD ]}ttjj|| qttjjd|	  W d   n1 sw   Y  t| d d}d	}t|t
|d }|}|d}t |t!sJ dt |d t
rttjjd|  |g||_"| } }}nHt#|dkr2t |d t!r2|\}}}ttjjd| d| d|  n"t#|dkrMt |d t$rMttjjd|  nt%d| t |d t$rt|j&D ]8}||}|j'} |d |  }|d	 |  }|d |  }ttjj|  d| d| d|  |(| ||| qan|(|j'|g||g||g| |)| t* }!|+||}"t* }#|"du rt,d|
 ttjjd|
|#|! f  t |
d}|-|" W d   n	1 sw   Y  t.|| dS ) zBuild TRT engine from onnxz%s/onnx/%s.onnxz%s/%s.enginez%s/%szconfig.jsonznemo_config.yamlwNzBuilding TRT engine for %sr   .)	precisionr    
image_size
num_framesrbzFailed parsing %szSucceeded parsing %sr\   rw   r   zinput_sizes must be a listzProcessed input sizes    z"Processed min/opt/max input sizes /zinvalid input sizes: z min/opt/max input sizes zFailed building %szSucceeded building %s in %d swbr3   )/openyamldumprg   rh   ri   rj   rk   r	   create_networkintNetworkDefinitionCreationFlagEXPLICIT_BATCHcreate_optimization_profilestrsplitcreate_builder_configtrt_builder_config
OnnxParserparsereadrl   pathabspathERRORrP   
num_errors	get_errorshutilrmtreemax	get_input
isinstancelistshapelendict
ValueError
num_inputsname	set_shapeadd_optimization_profiler   build_serialized_networkRuntimeErrorwritesave_config)$r    input_sizesrq   vision_max_batch_sizer'   r   r   nemo_config	part_name	onnx_fileengine_fileconfig_filenemo_config_filefbuildernetworkprofileconfig_argsconfig_wrapperconfigparserrV   errornBSnMinBSnOptBSnMaxBSinputTmin_sizeopt_sizemax_sizei
input_namet0engine_stringt1r3   r3   r4   build_trt_engine   s   


$
&$	

r   r   c                 C   s  t j r
t dnd}tj|r[tj|d}t|d}t	
|}W d   n1 s.w   Y  ztj|d}t j||d}	W n7 tyZ   tj|d}t j||d}	Y n!w t }
t|
}t||\}	}}W d   n1 svw   Y  |d	 d
 }G dd dt jj}G dd dt jj}tj|d t jddd}|j}|j}|j}|d	 d dkrt jt jj|d |d ddt j t jj|d |d ddj|d}d}tdddD ]"}|| |	| d| d ||	| d| d |d qn|d	 d d kr5t jj|d |d dd}d}||	| d ||	| d |d no|d	 d d!krt j| t j|d d" t jj|d d" |d ddt j t jj|d |d ddj|d}d}d#D ]#}|| |	| d| d ||	| d| d |d qtnt d$|d	 d  d}|||||}| d%ks| d&kr|j!}| d%kr|d	 d% d' }n|j"j!}| d(kr|d	 d% d' }t j#d)d||||d*}t$||| t%| d||g||||| d%ks | d(kr|nd|d+ dS ),zBuild neva visual enginecudacpumodel_config.yamlrNmodel_weights.ckptmap_locationzmp_rank_00/model_weights.ckptmm_cfgvision_encoderc                   @   s   e Zd Zdd Zdd ZdS )z*build_neva_engine.<locals>.DownSampleBlockc                 S   sX   |}t |jd d  }}||jd ||d}| |}||jd d|jd }|S )Nr   g      ?r   r}   )r   r   reshapeflat_square)selfx
vit_embedshr{   r3   r3   r4   forwardF  s   
z2build_neva_engine.<locals>.DownSampleBlock.forwardc                 S   s  |  \}}}}|d dkr/tj|tj|d||f|jd|jgdd }|  \}}}}|d dkrVtj|tj||d|f|jd|jgdd }|  \}}}}|||t	|d t	|d }|
dddd }||t	|d t	|d t	|d }|S )Nrw   r   r'   )dimr   r   r7   )sizern   catzerosr'   todevice
contiguousviewr   permute)r   r   nr{   r   cr3   r3   r4   r   N  s   22 (z6build_neva_engine.<locals>.DownSampleBlock.flat_squareN)__name__
__module____qualname__r   r   r3   r3   r3   r4   DownSampleBlockD  s    r   c                       $   e Zd Z fddZdd Z  ZS )z/build_neva_engine.<locals>.VisionEncoderWrapperc                       t    || _|| _d S Nsuper__init__encoder	connectorr   r   r   	__class__r3   r4   r   ]     

z8build_neva_engine.<locals>.VisionEncoderWrapper.__init__c                 S   s&   | j |dd}|jd }| |}|S )NTpixel_valuesoutput_hidden_states)r   hidden_statesr   )r   imagesvision_xr3   r3   r4   r   b  s   

z7build_neva_engine.<locals>.VisionEncoderWrapper.forwardr   r   r   r   r   __classcell__r3   r3   r   r4   VisionEncoderWrapper[      r   from_pretrainedTeagertorch_dtypetrust_remote_codeattn_implementationmm_mlp_adapter_type
mlp2x_geluhidden_sizebiasr   Omodel.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projectorr   r   rw   r|   .weight.biasweightr  linearmlp_downsampler7   )r   rw   r7   zUnknown projector type: litavilasample_framesvitar   r'   r   )r   r   r   )&rn   r   is_availabler   rl   r   isdirjoinr   r   	safe_loadloadFileNotFoundErrortempfileTemporaryDirectoryr   r   nnModuler   r  r   vision_modelr   r  
SequentialLinearGELUr   rP   load_state_dict	LayerNormr   r   vision_configemptyrr   r   )r    r   r   r   r   config_pathr   r   weights_pathmp0_weightstemp	temp_path_r(  r   r   r   r   	hf_configr'   vision_connector
key_prefixlayerlita_num_frameswrapperr   dummy_imager3   r3   r4   build_neva_engine(  s   
	


r7  c              	   C   s  t j r
t dnd}t|0}t|d}zt j	|d|d}W n t
y9   t j	|d|d}Y nw W d   n1 sDw   Y  |d d	 }G d
d dt jj}tj|d t jddd}	|	j}
|	j}|j}|d d dksxJ t jj|d |d dd}d}||| d ||| d |d ||
|||}|jj}|d d }t jd|d||||d}t|||  td|d||g| ||||d dS )zBuild video neva visual enginer   r   z./model_config.yamlz./model_weights.ckptr   z./mp_rank_00/model_weights.ckptNr   r   c                       r   )z5build_video_neva_engine.<locals>.VisionEncoderWrapperc                    r   r   r   r   r   r3   r4   r     r   z>build_video_neva_engine.<locals>.VisionEncoderWrapper.__init__c                 S   st   |j \}}}}}||| |||}| j|dd}|jd }|d d dd f }|||d|j d }| |}|S )NTr   r   r   r}   )r   r   r   r   r   )r   r   br   r   r   r{   r   r3   r3   r4   r     s   

z=build_video_neva_engine.<locals>.VisionEncoderWrapper.forwardr   r3   r3   r   r4   r     r   r   r  Tr  r  r  r  r	  r
  r  r  r  r  datar   r   r   r  
video-neva)r   r   )rn   r   r  r   tarfiler   r   r  extractfiler  KeyErrorr   r!  r   r  r   r"  r   r  r$  r&  r   r(  r   r)  rr   r   )r   r   r   r   tarr   r,  r(  r   r   r   r0  r'   r1  r2  r5  r   r   dummy_videor3   r3   r4   build_video_neva_engine  sV   	

r@  salmperception_checkpoint_pathc              	   C   s,  |dksJ d| dd }t j| st |  ||}|j}tjddtjd}tjdgtj	d}|||d\}	}
|

tj	}
| d	 }||||f G d
d dtjj}||j|j|j}t||	|
f|  td|d }g ddgd|ddg|gd|ddg|gdg}t||| |tjddd dS )zBuild perception enginerA  Invalid model type c                 S   sP   d}t tj| |}d}ttj| |}t|d}|| |  |S )Nr   r   )cfg)	rn   r  rl   r   r  r   r   r&  eval)rB  weightsperception_state_dictr   
perceptionr3   r3   r4   load_perception_model  s   

z6build_perception_engine.<locals>.load_perception_modelr   i  r   )input_signallengthz/feature_extractor.tsc                       s,   e Zd Z fddZe dd Z  ZS )z2build_perception_engine.<locals>.PerceptionWrapperc                    s    t    || _|| _|| _d S r   )r   r   r   modality_adapterproj)r   r   rL  rM  r   r3   r4   r   -  s   

z;build_perception_engine.<locals>.PerceptionWrapper.__init__c                 S   sJ   | j ||d\}}| j||d\}}| |dd}|tj}||fS )N)audio_signalrK  r   rw   )r   rL  rM  	transposer   rn   int32)r   rs   rt   ru   encoded_lenr3   r3   r4   r   3  s
   z:build_perception_engine.<locals>.PerceptionWrapper.forward)r   r   r   r   r   disable_checksr   r   r3   r3   r   r4   PerceptionWrapper+  s    rS  rw   )r   P   r   )rs   rt   rT  i   i   Nperception_encoder)r'   r   r   )rl   r   existsrm   preprocessorrn   randnfloat32tensorrP  r   r1   r   r!  r   rL  rM  ry   r   r   float16)r   rB  r    r%   rI  perception_modelfeature_extractorrJ  input_signal_lengthrs   rt   	dump_pathrS  rH  opt_batch_sizeshapesr3   r3   r4   build_perception_engine	  s>   



rb  (meta-llama/Llama-3.2-11B-Vision-Instructprocessor_namec                 C   s  t j|ddd}|j}G dd dtjj}||j|j}t|}t	
dddg}	||	dd	|}
t|td
d |
 D | dd |
D dd |
D d dd |
 D gd }|d  |d< |d  D ]\}}|g|dd  |d |< qhtd|| || dS )zBuild mllama visual enginer8   )r  
device_mapc                       r   )z7build_mllama_visual_engine.<locals>.MLLaMAVisionWrapperc                    r   r   )r   r   r"  output_proj)r   r"  rf  r   r3   r4   r   ]  r   z@build_mllama_visual_engine.<locals>.MLLaMAVisionWrapper.__init__c                 S   s   |  |||j}| |}|S r   )r"  last_hidden_staterf  )r   r   aspect_ratio_idsaspect_ratio_maskoutr3   r3   r4   r   b  s   
z?build_mllama_visual_engine.<locals>.MLLaMAVisionWrapper.forwardr   r3   r3   r   r4   MLLaMAVisionWrapper[  r   rk  RGBi   i
  pt)r   return_tensorsc                 S   s   g | ]\}}|qS r3   r3   ).0r/  valuer3   r3   r4   
<listcomp>o  s    z.build_mllama_visual_engine.<locals>.<listcomp>c                 S   s   g | ]}|qS r3   r3   ro  keyr3   r3   r4   rq  q  s    c                 S   s   i | ]}|d diqS )r   rZ   r3   rr  r3   r3   r4   
<dictcomp>r  s    z.build_mllama_visual_engine.<locals>.<dictcomp>)rc   re   c                 S   s   i | ]
\}}|t |jqS r3   )r   r   )ro  kvr3   r3   r4   rt  t  s    r   r   rw   r   Nmllama)r   r  r'   rn   r   r!  r"  multi_modal_projectorr   r   newr   rr   tupleitemscopyr   )r   r6   rd  r   hf_modelmodel_dtyperk  r5  	processorimageinputsra  ru  rv  r3   r3   r4   build_mllama_visual_engineQ  s&   
r  c                 C   sH   g d}||v rt || || dS |dkrt| || dS td| )zBuild visual engine)r   r  r  r  r:  rC  N)r7  r@  r   )r   r   r    r   
model_listr3   r3   r4   build_visual_engine{  s   r  	lora_ckptrq   c              	   C   sZ  t jt j| drtt j| d}nt jt j| ddr.tt j| dd}ntdt j| d}t j|sCtdi }| D ]\}}d|vrU|||< qIt j|d}t	 B}t j|d}	t
||	 t|d}
|
j|	dd |
j|dd W d	   n1 sw   Y  W d	   |S W d	   |S 1 sw   Y  |S )
zExtrace lora from checkpointr   
mp_rank_00z#Imcompatible lora checkpoint formatr   mm_projectorzllm_lora.nemor{   )arcnameN)rl   r   rV  r  rn   r  r   r{  r  r  rS   r;  r   add)r  rq   model_weightmodel_configllm_lora_weightru  rv  llm_lora_pathtmp_dirllm_weight_pathr>  r3   r3   r4   extract_lora_ckpt  s8   


r  checkpoint_pathc              
   C   s   t ||\}}t|}|tj}|| t 3}tj	
|d}|| ~~ttj	
| d||d ttj	
| d|||||||	 W d   dS 1 sPw   Y  dS )zBuild mllama enginehf_checkpointvisual_engine)r   
llm_engineN)r   r   r   rn   r   r&  r  r  rl   r   r  save_pretrainedr  rX   )r   r  rd  r   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   new_state_dictr   r}  r  r6   r3   r3   r4   build_mllama_engine  s0   


"r  )Nr   r   r   r   r   r   r   r   NNr   N)
r   r   r   r   r   r   NNr   N)r   )rA  r   )rc  r   )r   r   )rc  r   r   r   r   r   r   r   NNr   N)?rl   r   r;  r  pathlibr   r   typingr   tensorrtri   rn   r   	omegaconfr   PILr   tensorrt_llm._commonr   tensorrt_llm.builderr   r	   tensorrt_llm.commands.buildr
   rR   tensorrt_llm.mappingr   tensorrt_llm.modelsr   tensorrt_llm.pluginr   transformersr   r   r   Anemo.collections.multimodal.speech_llm.modules.perception_modulesr   nemo.core.classes.commonr   nemo.export.tensorrt_llmr   .nemo.export.trt_llm.nemo_ckpt_loader.nemo_filer   	converterr   rj   rk   rg   r   r   r5   rX   rr   ry   r   r   r7  r@  rb  r  r  r  r  r3   r3   r3   r4   <module>   s  	

(	

E

 
h
 
Q
K
-

&	
