o
    }oiI                     @   sT  d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z
d dlZd dlZd dlmZmZ d dlmZ d dlmZmZmZmZ d dlmZ erJd dlZejdd	 Zd
ZeZzd dlmZ W n e ym   e!d dZY nw d
Z"zd dl#Z#W n e y   e!d dZ"Y nw d
Z$zd dlZW n e%y   e!d dZ$Y nw G dd deZ&dS )    N)Path)TYPE_CHECKINGDictListOptionalUnion)	AutoModelAutoTokenizer)ITritonDeployable)get_example_inputsget_model_device_typeis_nemo2_checkpointvalidate_fp8_network)loggingc                    s    fdd}|S )zNo op decoratorc                     s    | i |S N )argskwargsfuncr   Q/home/ubuntu/.local/lib/python3.10/site-packages/nemo/export/onnx_llm_exporter.pywrapper%   s   znoop_decorator.<locals>.wrapperr   )r   r   r   r   r   noop_decorator!   s   r   T)batchzPyTriton is not available.Fzonnxruntime is not available.ztensorrt is not availablec                   @   s  e Zd ZdZ				dDdedeejj dede	fdd	Z
d
d Zdd Z						dEdededededee dee dede	fddZ						dEdededededee dee deejef de	fddZ						dFded e	d!ed"ed#ed$ed% d&dfd'd(ZdGd+d,Zd-d.d/ee d&dfd0d1ZdHd2d3ZdId4eeef d5ee fd6d7Zed8d9 Zed:d; Zed<d= Zed>d? Zed@dA Z e!d4e"j#fdBdCZ$dS )JOnnxLLMExporteraW  
    Exports models to ONNX and run fast inference.

    Example:
        from nemo.export.onnx_llm_exporter import OnnxLLMExporter

        onnx_llm_exporter = OnnxLLMExporter(
            onnx_model_dir="/path/for/onnx_model/files",
            model_name_or_path="/path/for/model/files",
        )

        onnx_llm_exporter.export(
            input_names=["input_ids", "attention_mask", "dimensions"],
            output_names=["embeddings"],
        )

        output = onnx_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"])
        print("output: ", output)
    NTonnx_model_dirmodelmodel_name_or_pathload_runtimec                 C   s   || _ || _tt|d | _|| _|| _d| _d| _d| _	d| _
ttj r*dnd| _d| _| jdurR|dur>tdt| j rRt| jrNtd|   |rZ|   dS dS )a  
        Initializes the ONNX Exporter.

        Args:
            onnx_model_dir (str): path for storing the ONNX model files.
            model (Optional[torch.nn.Module]): torch model.
            tokenizer (HF or NeMo tokenizer): tokenizer class.
            model_name_or_path (str): a path for ckpt or HF model ID
            load_runtime (bool): load ONNX runtime if there is any exported model available in
                                 the onnx_model_dir folder.
        z
model.onnxNcudacpuz2A model was also passed but it will be overridden.z,NeMo 2.0 checkpoint will be supported later.)r   r   strr   onnx_model_pathr   	tokenizermodel_input_namesmodel_output_namesonnx_runtime_sessioncalibration_datatorchdevicer   is_availablequant_max_batch_size
ValueErroris_dirr   NotImplementedError_load_hf_model_load_runtime)selfr   r   r#   r   r   r   r   r   __init__Z   s*   

zOnnxLLMExporter.__init__c                 C   sr   t r5t| j r7t| j| _dd | j D | _dd | j	 D | _
tjt| jd dd| _d S d S d S )Nc                 S      g | ]}|j qS r   name).0inputr   r   r   
<listcomp>       z1OnnxLLMExporter._load_runtime.<locals>.<listcomp>c                 S   r3   r   r4   )r6   outputr   r   r   r8      r9   r#   Ttrust_remote_code)use_onnxruntimer   r"   existsonnxruntimeInferenceSessionr&   
get_inputsr$   get_outputsr%   r	   from_pretrainedr   r#   r1   r   r   r   r0      s   zOnnxLLMExporter._load_runtimec                 C   s,   t j| jdd | _tj| jdd| _d S )NTr;   )r   rC   r   evalr   r	   r#   rD   r   r   r   r/      s   zOnnxLLMExporter._load_hf_model   fp32Finput_namesoutput_namesexample_inputsopsetdynamic_axes_inputdynamic_axes_outputexport_dtypeverbosec	           	   
   C   s&   | j ||||||||d |   dS )a  
        Performs ONNX conversion from a PyTorch model.

        Args:
            input_names (list): input parameter names of the model that ONNX will export will use.
            output_names (list): output parameter names of the model that ONNX will export will use.
            example_inputs (dict): example input for the model to build the engine.
            opset (int): ONNX opset version. Default is 20.
            dynamic_axes_input (dict): Variable length axes for the input.
            dynamic_axes_output (dict): Variable length axes for the output.
            export_dtype (str): Export dtype, fp16 or fp32.
            verbose (bool): Enable verbose or not.
        )rH   rJ   rI   rK   rL   rM   rN   rO   N)_export_to_onnxr0   )	r1   rH   rI   rJ   rK   rL   rM   rN   rO   r   r   r   export   s   
zOnnxLLMExporter.exportc	           
      C   s  |d u r	t | j}d|v rtdg|d jd  |d< t|tr*tjtjd| }| j	
| t| jjddd tjt| j	|d tjj| j	|f| j||i ||||d	 W d    n1 sdw   Y  td
| j  t| jd }	|	jdd | j|	 d S )N
dimensions   	input_idsr   )fp16rG   Tparentsexist_ok)device_typedtype)r   r   frH   rI   dynamic_axesrO   opset_versionz2Successfully exported PyTorch model to ONNX model r#   )rX   )r   r#   r(   tensorshape
isinstancer!   float16float32r   tor   r   mkdirautocastr   onnxrQ   r"   r   infosave_pretrained)
r1   rH   rI   rJ   rK   rL   rM   rN   rO   existing_directory_pathr   r   r   rP      s0   


zOnnxLLMExporter._export_to_onnxrU   layer_names_onlytrt_model_dir$override_layernorm_precision_to_fp32override_layers_to_fp32	trt_dtypeprofiling_verbositytrt_builder_flagsztrt.BuilderFlagreturnc                 C   sV  t d| j d ttjj}t|}	|	dttj	j
> }
|	 }t|
|}|| jsHt d t|jD ]
}t || q;dS |r~|D ]1}|	 }t|
jD ] }|
|}|j|j||j d ||j d ||j d d qW|| qL|d	krt d
 |tjj n|dkrt d |tjj |tjj t|
 |rt d | |
 |rt d | |
| ztj j!tj j"tj j#d| |_$W n t%y   d}t&|w t d|j$  |dur|D ]}|| q|	'|
|}|du rt(dt)|}|j*ddd |d }|+| t d| j d| d dS )a7  Performs TensorRT conversion from an ONNX model.

        Args:
            trt_model_dir: path to store the TensorRT model.
            profiles: TensorRT profiles.
            override_layernorm_precision_to_fp32 (bool): whether to convert layers to fp32 or not.
            override_layers_to_fp32 (List): Layer names to be converted to fp32.
            trt_dtype (str): "fp16" or "fp32".
            profiling_verbosity (str): Profiling verbosity. Default is "layer_names_only".
            trt_builder_flags (List[trt.BuilderFlag]): TRT specific flags.
        z%Building TRT engine from ONNX model ()rS   zONNX model could not be parsedNr      )minoptmaxrU   zSetting Build Flag FP16fp8zSetting Build Flag FP8 and FP16z;Overriding TensorRT network LayerNorm precision to float32.z"Overriding some layers to float32.)detailedrj   nonez"Unknown profiling verbosity value.zSetting Profiling Verbosity to zSFailed to serialize the TensorRT Engine. Please check the TensorRT logs for detailsTrV   z
model.planz"Successfully exported ONNX model (z) to TRT engine (),r   rg   r"   trtLoggerWARNINGBuildercreate_networkintNetworkDefinitionCreationFlagEXPLICIT_BATCHcreate_builder_config
OnnxParserparse_from_filewarningrange
num_errorserror	get_errorcreate_optimization_profile
num_inputs	get_input	set_shaper5   add_optimization_profileset_flagBuilderFlagFP16FP8r   %_override_layernorm_precision_to_fp32_override_layers_to_fp32ProfilingVerbosityDETAILEDLAYER_NAMES_ONLYNONEro   KeyErrorr,   build_serialized_network	Exceptionr   rd   write_bytes)r1   rk   profilesrl   rm   rn   ro   rp   
trt_loggerbuildernetworkconfigparserr   profileoptimization_profilei	in_tensor	error_msgflagengine_stringtrt_model_pathr   r   r   export_onnx_to_trt   sx   










z"OnnxLLMExporter.export_onnx_to_trtlayer
trt.ILayerc                 C   s   t j|_|dt j d S )Nr   )rz   rb   	precisionset_output_type)r1   r   r   r   r   !_override_layer_precision_to_fp32F  s   z1OnnxLLMExporter._override_layer_precision_to_fp32r   trt.INetworkDefinitionfp32_layer_patternsc                    s  t |jD ]}||  jtfdd|D r jtjtjhv r j	tj
jhv r<td j	 d| d d qt fddt  jD r^tj _td| d d	 j	 d
 t  jD ]'} |tjtjhv r |tj td| d d	 j	 d| d	 qcqd S )Nc                 3   s    | ]}  |V  qd S r   )
startswith)r6   pattern)
layer_namer   r   	<genexpr>N  s    z;OnnxLLMExporter._override_layers_to_fp32.<locals>.<genexpr>zSkipping overriding z layer  z dtypec                 3   s(    | ]}  |jtjtjhv V  qd S r   )r   rZ   rz   rb   ra   )r6   	input_idxr   r   r   r   U  s
    
zSetting layer z (type: z) precision to FP32z) output type z to FP32)r   
num_layers	get_layerr5   anyr   rz   rb   ra   type	LayerTypeCASTr   rg   r   num_outputsget_output_typer   )r1   r   r   r   jr   )r   r   r   r   J  s*   
  &z(OnnxLLMExporter._override_layers_to_fp32c           
         s  i }t |D ]E\}  jtjjkr.t fddt jD }|r. dj	tj
kr.tj
 _ jtjjkrKttd _ jtjjkrK|| < |   q| D ]\}}d}d}t|| || D ]w}	||	  jtjjkrt|    jtjjkrttd _ jtjjkr|    jtjjkrttd _ jtjjkr|    jtjjkrttd _ jtjjkr|    jtjjkrttd _ jtjjkr|   qaqPdS )	zSet the precision of LayerNorm subgraphs to FP32 to preserve accuracy.

        - https://nvbugs/4478448 (Mistral)
        - https://nvbugs/3802112 (T5)

        Args:
            network: tensorrt.INetworkDefinition
        c                    s&   g | ]}  |o |tjkqS r   )output_type_is_setr   rz   rb   )r6   or   r   r   r8   q  s    zIOnnxLLMExporter._override_layernorm_precision_to_fp32.<locals>.<listcomp>r   IElementWiseLayer      IUnaryLayerN)	enumerater   rz   r   IDENTITYallr   r   r   rZ   rb   r   ELEMENTWISEgetattr	__class__opElementWiseOperationPOWr   itemsr   REDUCESUMUNARYUnaryOperationSQRTDIVPROD)
r1   r   pow_opslayer_indexall_fp32_indexSTART_OFFSET
END_OFFSETr   r   r   r   r   b  sV   







z5OnnxLLMExporter._override_layernorm_precision_to_fp32inputsrR   c                 C   sj   | j du rtd dS t|tr)d| jv r|du rtdt| |}||d< | j 	| j
|}|d S )a  Run inference for a given input.

        Args:
            inputs (Union[List, Dict]): Input for the model. If list, it should be a list of strings.
                If dict, it should be a dictionary with keys as the model input names.
            dimensions (Optional[List]): The dimensions parameter of the model. Required if the model
                was exported to accept dimensions parameter and inputs is given as a list of strings.

        Returns:
            np.ndarray: Model output.
        NzPONNX Runtime is not available. Please install the onnxruntime-gpu and try again.rR   z-Dimensions should be provided for list input.r   )r&   warningswarnr`   r   r$   r,   dictr#   runr%   )r1   r   rR   r:   r   r   r   forward  s   


zOnnxLLMExporter.forwardc                 C      | j S )zReturns the model)r   rD   r   r   r   	get_model     zOnnxLLMExporter.get_modelc                 C   r   )zReturns the tokenizer)r#   rD   r   r   r   get_tokenizer  r   zOnnxLLMExporter.get_tokenizerc                 C   r   )zReturns the model input names)r$   rD   r   r   r   get_model_input_names  r   z%OnnxLLMExporter.get_model_input_namesc                 C      t d)zGet triton input(This function will be implemented later.r.   rD   r   r   r   get_triton_input     z OnnxLLMExporter.get_triton_inputc                 C   r   )zGet triton outputr   r   rD   r   r   r   get_triton_output  r   z!OnnxLLMExporter.get_triton_outputc                 K   r   )zPyTriton inference functionr   r   )r1   r   r   r   r   triton_infer_fn  r   zOnnxLLMExporter.triton_infer_fn)NNNT)NrF   NNrG   F)NFNrU   rj   N)r   r   rq   N)r   r   rq   Nr   )%__name__
__module____qualname____doc__r!   r   r(   nnModuleboolr2   r0   r/   listr   r   rQ   r   rZ   rP   r   r   r   r   r   r   r   propertyr   r   r   r   r   r   npndarrayr   r   r   r   r   r   E   s    

,
	
)	
-	

_
 =




r   )'r   pathlibr   typingr   r   r   r   r   numpyr   r(   wrapttransformersr   r	   nemo.deployr
   nemo.export.utilsr   r   r   r   
nemo.utilsr   tensorrtrz   	decoratorr   use_pytritonr   pytriton.decoratorsr   r   r=   r?   use_trtImportErrorr   r   r   r   r   <module>   sJ   
	


