o
    }oi3p                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZmZ d d	lmZ ed
\ZZerld dlmZ d dlm Z m!Z!m"Z"m#Z#m$Z$ ed\Z%Z&ed\Z'Z(ed\Z)Z(e* Z+dd Z,d<ddZ-dd Z.dd Z/G dd de0Z1G dd dZ2d d! Z3d"d# Z4d=d*d+Z5G d,d- d-Z6d.d/ Z7			d>d?d:d;Z8dS )@    )annotationsN)OrderedDict)	getLogger)Path)
MethodType)AnyDictListSequenceTupleUnion)add_casts_around_normsreplace_for_export)safe_import
polygraphy)bytes_from_path)CreateConfigProfileengine_bytes_from_networkengine_from_bytesnetwork_from_onnx_pathtensorrttorch_tensorrtzcuda.cudartc                   C  s<   t jtjt jtjt jtjt jtjt jtjt jtjt jtjiS )z)
    Map of TRT dtype -> Torch dtype
    )	trtint32torchfloat32float16bfloat16int64int8bool r"   r"   V/home/ubuntu/.local/lib/python3.10/site-packages/nemo/export/tensorrt_lazy_compiler.pytrt_to_torch_dtype_dict/   s   r$   input_shapeSequence[int]dynamic_batchsizeSequence[int] | Nonec                 C  sP   ddd}|r|| |d }|| |d }|| |d	 }n|  } }}|||fS )zb
    Given a sample input shape, calculate min/opt/max shapes according to dynamic_batchsize.
    r%   r&   	scale_numintc                 S  s   g | }||d< |S )Nr   r"   )r%   r)   scale_shaper"   r"   r#   scale_batch_sizeC   s   z,get_profile_shapes.<locals>.scale_batch_sizer         N)r%   r&   r)   r*   r"   )r%   r'   r,   min_input_shapeopt_input_shapemax_input_shaper"   r"   r#   get_profile_shapes>   s   

r2   c                 C  s|   i }| s|S | D ]3}|D ].}g }|| }t t|d D ]}|d | |d | kr/|| qt|dkr:|||< qq|S )z
    This method calculates dynamic_axes to use in onnx.export().
    Args:
       profiles: [[min,opt,max],...] list of profile dimensions
    r   r.   )rangelenappend)profilesdynamic_axesprofilekeyaxesvalsir"   r"   r#   get_dynamic_axesR   s    
r=   c                 C  s6   | d }|dkrt d| t| dkr| d S dS )z[
    Error reporting method for CUDA calls.
    Args:
     cuda_ret: CUDA return code.
    r   zCUDA ERROR: r-   N)RuntimeErrorr4   )cuda_reterrr"   r"   r#   cuassertg   s   rA   c                   @  s   e Zd ZdZdS )
ShapeErrorzM
    Exception class to report errors from setting TRT plan input shapes
    N)__name__
__module____qualname____doc__r"   r"   r"   r#   rB   u   s    rB   c                   @  s4   e Zd ZdZdddZdd Zdd Zdd
dZdS )	TRTEnginezK
    An auxiliary class to implement running of TRT optimized engines

    Nc                 C  s  || _ |ptd| _| jd| j   tt| j | _t | _d| _	| j
 | _g | _g | _g | _d| _i | _t }t| jjD ]6}| j| }| j|tjjkrY| j| qA| j|tjjkrw| j| || j| }| j| qA| jd| j  d| j d| j  dS )z
        Loads serialized engine, creates execution context and activates it
        Args:
          plan_path: path to serialized TRT engine.
          logger: optional logger object
        trt_compilezLoading TensorRT engine: Nr   zLoaded TensorRT engine: z
.
Inputs: z

Outputs: )	plan_pathr   loggerinfor   r   enginer   tensorscuda_graph_instancecreate_execution_contextcontextinput_namesoutput_namesdtypescur_profileinput_tabler$   r3   num_io_tensorsget_tensor_moder   TensorIOModeINPUTr5   OUTPUTget_tensor_dtype)selfrI   rJ   
dtype_dictidxbindingdtyper"   r"   r#   __init__   s2   
zTRTEngine.__init__c                 C  s~   | j }t| jD ]4\}}t||}|| jvs"t| j| j|kr<tj|| j	| |d
 }|| j|< |||  qdS )zx
        Allocates outputs to run TRT engine
        Args:
            device: GPU device to allocate memory on
        )r`   deviceN)rP   	enumeraterR   listget_tensor_shaperM   shaper   emptyrS   
contiguousset_tensor_addressdata_ptr)r\   rb   ctxr<   r_   rf   tr"   r"   r#   allocate_buffers   s   
zTRTEngine.allocate_buffersc                   s   j }j j} fdd}	 z|  W n( ty7   jd |j }||kr+ |_ j| Y n ty>    w q  }t|dksLJ dS )z
        Sets input bindings for TRT engine according to feed_dict
        Args:
           feed_dict: a dictionary [str->Tensor]
           stream: CUDA stream to use
        c                    sT   j D ]$} j|  d }|d ur'| }|j} | |  | |  qd S N)rQ   getrU   rh   rf   set_input_shaperi   rj   )r_   rl   rf   rk   	feed_dictr\   r"   r#   try_set_inputs   s   
z,TRTEngine.set_inputs.<locals>.try_set_inputsTr-   r   N)	rL   rP   rT   rB   num_optimization_profilesset_optimization_profile_async	Exceptioninfer_shapesr4   )r\   rr   streamelast_profilers   next_profileleftr"   rq   r#   
set_inputs   s(   	zTRTEngine.set_inputsFc                 C  s   |rO| j durtt| j | tt| | jS | j|}|s&tdtt|tj	j
 | j| tt|}tt|d| _ | jd | jS | j|}tt| |sbtd| jS )z
        Runs TRT engine.
        Args:
            stream: CUDA stream to run on
            use_cuda_graph: use CUDA graph. Note: requires all inputs to be the same GPU memory between calls.
        NzERROR: inference failed.r   zCUDA Graph captured!)rN   rA   cudartcudaGraphLaunchcudaStreamSynchronizerP   execute_async_v3
ValueErrorcudaStreamBeginCapturecudaStreamCaptureMode cudaStreamCaptureModeThreadLocalcudaStreamEndCapturecudaGraphInstantiaterJ   rK   rM   )r\   rx   use_cuda_graphnoerrorgraphr"   r"   r#   infer   s.   
zTRTEngine.inferrn   )F)rC   rD   rE   rF   ra   rm   r}   r   r"   r"   r"   r#   rG   }   s    
 $rG   c                 C  s   t | tjr| S t|  S )zI
    Creates a new tensor from d, returns d if d is already a tensor
    )
isinstancer   Tensortensorcuda)dr"   r"   r#   make_tensor   s   r   c                 C  sp   i }| D ]1}|| }|dur5t |tst |tr/tt|D ]}t|| || d| < qqt|||< q|S )z;
    Simulates list/tuple unrolling during ONNX export
    N_)r   rd   tupler3   r4   r   )rQ   input_exampleunrolled_inputnamevalr<   r"   r"   r#   unroll_input   s   r   retList[torch.Tensor]output_listsList[List[int]]return3Tuple[Union[torch.Tensor, List[torch.Tensor]], ...]c           
      C  s  t  }d}tt|D ]}|| }t|dkst|dksJ t|dks+|d dkr9g || | R }|d }q|d dkrUg || |||d   R }||d  }q|d dkrt  }t| }tt|d |dD ]M}|| }	t|	dkst|	dksJ t|	dks|	d dkr|d }g || | R }ql|	d dkr||	d  }g || |||	d   R }qltdg || || |ddd R } |S q|S )a)  
    Implements parsing of 'output_lists' arg of trt_compile().

    Args:
      ret: plain list of Tensors

      output_lists: list of output group sizes: to form some Lists/Tuples out of 'ret' List, this will be a list
                    of group dimensions, like [[], [5], [-1]] for returning Tensor, list of 5 items and dynamic list.
        Format: [[group_n] | [], ...]
          [] or group_n == 0 : next output from ret is a scalar
          group_n > 0  :       next output from ret is a list of group_n length
          group_n == -1:       next output is a dynamic list. This entry can be at any
                               position in output_lists, but can appear only once.
    Returns:
       Tuple of Union[torch.Tensor, List[torch.Tensor]], according to the grouping in output_lists

    r   r-   zTwo -1 lists in outputN)r   r3   r4   r   )
r   r   groupscurr<   gl
rev_groupsrcurrlrglr"   r"   r#   parse_groups  s:   
 $r   c                   @  s^   e Zd ZdZ														dddZdd	 Zd
d Zdd Zdd Zdd Z	dS )TrtCompilerz
    This class implements:
      - TRT lazy persistent export
      - Running TRT with optional fallback to Torch
        (for TRT engines with limited profiles)
    fp16onnxNFc                 C  s  ddg}||vrt d| d| dg d}||vr&t d| d| d|| _|| _|| _|du| _|p7g | _|p<g | _|
pAg | _|| _|pIi | _	|	pNi | _
d| _|| _|| _d	| _|patd
| _t|j| _|du rv| jjdd }i | _| jjdurtt| jjD ]}| jj| d  }|durt|}|| j| jj| d  < q|| _|j| _|durtj| jrtj| j|k rt | j dS dS dS dS )a  
        Initialization method:
         Tries to load persistent serialized TRT engine
         Saves its arguments for lazy TRT build on first forward() call
        Args:
            model: Model to "wrap".
            plan_path : Path where to save persistent serialized TRT engine.
            precision: TRT builder precision o engine model. Should be 'fp32'|'tf32'|'fp16'|'bf16'.
            method: One of 'onnx'|'torch_trt'.
                    Default is 'onnx' (torch.onnx.export()->TRT). This is the most stable and efficient option.
                    'torch_trt' may not work for some nets. Also AMP must be turned off for it to work.
            input_names: Optional list of input names. If None, will be read from the function signature.
            output_names: Optional list of output names. Note: If not None, patched forward() will return a dictionary.
            output_lists: Optional list of output group sizes: when forward() returns Lists/Tuples, this will be a list
                          of their dimensions, like [[], [5], [-1]] for Tensor, list of 5 items and dynamic list.
            export_args: Optional args to pass to export method. See onnx.export() and Torch-TensorRT docs for details.
            build_args: Optional args to pass to TRT builder. See polygraphy.Config for details.
            input_profiles: Optional list of profiles for TRT builder and ONNX export.
                            Each profile is a map of the form : {"input id" : [min_shape, opt_shape, max_shape], ...}.
            dynamic_batchsize: A sequence with three elements to define the input batch size range for the model to be
                               converted. Should be a sequence like [MIN_BATCH, OPT_BATCH, MAX_BATCH].
            [note]: If neither input_profiles nor dynamic_batchsize specified, static shapes will be used.
            use_cuda_graph: Use CUDA Graph for inference. Note: inputs have to be the same GPU memory between calls!
            timestamp: Optional timestamp to rebuild TRT engine (e.g. if config file changes).
            fallback: Allow to fall back to Pytorch when TRT inference fails (e.g, shapes exceed max profile).
        r   	torch_trtz)trt_compile(): 'method' should be one of z, got: .)fp32tf32r   bf16z,trt_compile(): 'precision' should be one of NFrH   r-   )!r   rI   	precisionmethodreturn_dictrR   r   r6   r'   export_args
build_argsrL   r   fallbackdisabledr   rJ   inspectgetfullargspecforwardargspecargsdefaultsr3   r4   r   rQ   old_forwardospathexistsgetmtimeremove)r\   modelrI   r   r   rQ   rR   r   r   r   input_profilesr'   r   	timestampr   forward_overriderJ   method_valsprecision_valsr<   r   r"   r"   r#   ra   J  sJ   .





(zTrtCompiler.__init__c                 C  s,   i }t |D ]\}}| j| }|||< q|S rn   )rc   rQ   )r\   r   
trt_inputsr<   inp
input_namer"   r"   r#   _inputs_to_dict  s
   

zTrtCompiler._inputs_to_dictc              
   C  s   z:t | j| j| _i }| jjD ]}|dr"|| jvr"|dd }n|}|||< q|| j_| jd| jj  W dS  tyV } z| jd|  W Y d}~dS d}~ww )zO
        Loads TRT plan from disk and activates its execution context.
        __r.   NzEngine loaded, inputs:z$Exception while loading the engine:
)	rG   rI   rJ   rL   rQ   
startswithrU   rK   rv   )r\   rU   r   	orig_namery   r"   r"   r#   _load_engine  s   
 zTrtCompiler._load_enginec              
   C  sV  | j }|| t|dkr|| | | jdu r| js|j}| j|_z4|   | jdu rX|	 }t
  | || W d   n1 sHw   Y  |   | jdusXJ W n$ ty} } z| jrq| jd|  d| _n|W Y d}~nd}~ww | js| js| D ]}~qt
j  ||_zj| jdurtY t
j }	t
jj|	d}
| jt| j||
j | jj|	d |
t
j  | jj|
j| jd}| j st!|" }| j#rt$|| j#}n
t|dkr|d }|W  d   W S 1 sw   Y  W n$ ty" } z| jr| jd| d	 n|W Y d}~nd}~ww | j|i |S )
af  
        Main forward method:
         Builds TRT engine if not available yet.
         Tries to run TRT engine
         If exception thrown and self.callback==True: falls back to original Pytorch

        Args: Passing through whatever args wrapped module's forward() has
        Returns: Passing through wrapped module's forward() return value(s)

        r   NzFailed to build engine: T)rb   )r   r-   zException: z
Falling back to Pytorch ...)%r   updater4   r   rL   r   r   r   r   copyr   no_grad_build_and_saverv   r   rJ   rK   
parametersr   empty_cachelock_smcurrent_deviceStreamr}   r   rQ   cuda_streamrm   wait_streamcurrent_streamr   r   r   rd   valuesr   r   )r\   r   argvkwargsr   new_forwardr   ry   paramrb   rx   r   r"   r"   r#   r     sp   





"zTrtCompiler.forwardc           	      C  s   g }| j D ]"}t }| D ]\}}|j||d |d |d d q|| q| j }| jdk|d< | jdkr>d|d< n	| jd	krGd|d	< | j	d
| d| j
  t|tjjgd}t|tdd|i|dS )z[
        Builds TRT engine from ONNX file at onnx_path and saves to self.plan_path
        r   r-   r.   )minoptmaxr   r   r   Tr   zBuilding TensorRT engine for z: )flagsr6   )configNr"   )r6   r   itemsaddr5   r   r   r   rJ   rK   rI   r   r   OnnxParserFlagNATIVE_INSTANCENORMr   r   )	r\   	onnx_pathr6   r8   pidr   r   networkr"   r"   r#   _onnx_to_trt  s   
 



zTrtCompiler._onnx_to_trtc              
     s  j durdS j}d}t| t| jdkrVtjg}jdkr)|tj	 njdkr4|tj
 t| }dd fdd|D }tj|d	f||d
|}nj  rtjdkrftdt dkrptdi | D ]6\}}	 fdd}
t|	tst|	trtt|	D ]}|
| d| |	|  qqvt|	tjr|
||	 qvg_tj_tjdkr|dji t p}|ddrd}n
ttj|  }t!t"|d }j#$d| ddj% dj d|   tj&j'||f|f|j%d| t(r(ddl)m*}m+}m,} |||dd}||| j#$d -|}W d   n	1 s>w   Y  |rQt.j/d 0| dS dS )!z
        If TRT engine is not ready, exports model to ONNX,
        builds TRT engine and saves serialized TRT engine to the disk.
        Args:
             input_example: passed to onnx.export()
        Nr   r   r   c                 S  s    t | |\}}}tj|||dS )N)	min_shape	opt_shape	max_shape)r2   r   Input)r%   r'   r/   r0   r1   r"   r"   r#   get_torch_trt_input2  s   z8TrtCompiler._build_and_save.<locals>.get_torch_trt_inputc                   s   g | ]	} |j jqS r"   )rf   r'   ).0r<   )r   r\   r"   r#   
<listcomp>8  s    z/TrtCompiler._build_and_save.<locals>.<listcomp>r   )
arg_inputsenabled_precisionsr   zEERROR: Both dynamic_batchsize and input_profiles set for TrtCompiler!   z&dynamic_batchsize has to have len ==3 c                   sR   |j }t|dkr'|dd  } d g| d g| d g|g| < d S d S )Nr   r-   r.   )rf   r4   )r   r   sh)dbsr8   r"   r#   add_profileJ  s
   0z0TrtCompiler._build_and_save.<locals>.add_profiler   r7   dynamoFz
model.onnxzExporting to z:
zoutput_names=z
input_names=z
export args: )rQ   rR   )fold_constantsonnx_from_path	save_onnxi $ )size_thresholdzExport to ONNX successful.wb)1rL   r   r   r   r   r   r   r   r5   r   r   rd   r   r   convert_method_to_trt_enginer'   r4   r6   r   r   r   r   r3   r   r=   r7   r   tempfileTemporaryDirectoryro   r   rQ   keysstrr   rJ   rK   rR   r   exportpolygraphy_importedpolygraphy.backend.onnx.loaderr   r   r   r   openrI   write)r\   r   r   r   engine_bytesr   inputs	tt_inputsr   r   r   r<   tmpdirrQ   r   r   r   r   
onnx_modelr"   )r   r   r8   r\   r#   r     s   







zTrtCompiler._build_and_save)r   r   NNNNNNNFNFNN)
rC   rD   rE   rF   ra   r   r   r   r   r   r"   r"   r"   r#   r   B  s,    
XFr   c                 O  s   | j | ||S )zk
    Patch function to replace original model's forward() with.
    Redirects to TrtCompiler.forward()
    )_trt_compilerr   )r\   r   r   r"   r"   r#   trt_forwardz  s   r  r   torch.nn.Module	base_pathr  r   Dict[str, Any] | None	submoduleUnion[str, List[str]] | NonerJ   
Any | Nonec                   s  dddddd}|  pi  | trttrttj rttj|r:t	tj
|}d v r6tt	 d |}| d<  fdd	}fd
d|durmt|trS|g}|D ]}| |\}	}
|t|	|
|d |  qU| S || | | S pytdd | S )aJ  
    Instruments model or submodule(s) with TrtCompiler and replaces its forward() with TRT hook.
    Note: TRT 10.3 is recommended for best performance. Some nets may even fail to work with TRT 8.x
    Args:
      model: module to patch with TrtCompiler object.
      base_path: TRT plan(s) saved to f"{base_path}[.{submodule}].plan" path.
                 dirname(base_path) must exist, base_path does not have to.
                 If base_path does point to existing file (e.g. associated checkpoint),
                 that file becomes a dependency - its mtime is added to args["timestamp"].
      args: Optional dict : unpacked and passed to TrtCompiler() - see TrtCompiler above for details.
      submodule: Optional hierarchical id(s) of submodule to patch, e.g. ['image_decoder.decoder']
                  If None, TrtCompiler patch is applied to the whole model.
                  Otherwise, submodule (or list of) is being patched.
      logger: Optional logger for diagnostics.
    Returns:
      Always returns same model passed in as argument. This is for ease of use in configs.
    r   r      obey)builder_optimization_levelprecision_constraints)r   r   r   r   c                   sF   t | ds!| j| _t| |d fdi }|| _tt| | _d S d S )Nr  z.planrJ   )hasattrr   orig_forwardr   r  r   r  )r   r   wrapper)r   rJ   r"   r#   wrap  s   
ztrt_compile.<locals>.wrapc                   sJ   | d}|dkr!|d | }t| |} ||d d  } | |S | |fS )Nr   r   r-   )findgetattr)parentr  r^   parent_name)find_subr"   r#   r&    s   


ztrt_compile.<locals>.find_subNr   rH   zSTensorRT and/or polygraphy packages are not available! trt_compile() has no effect.)r   trt_importedr	  r   r   is_availabler   r   r   r*   r   r   r   r  r#  r   warning)r   r  r   r  rJ   default_argsr   r!  sr$  subr"   )r   r&  rJ   r#   rH     s4   



rH   )r%   r&   r'   r(   )r   r   r   r   r   r   )NNN)r   r  r  r  r   r  r  r  rJ   r  r   r  )9
__future__r   r   r   r  	threadingcollectionsr   loggingr   pathlibr   typesr   typingr   r   r	   r
   r   r   r   nemo.utils.export_utilsr   r   nemo.utils.import_utilsr   r   r	  polygraphy.backend.commonr   polygraphy.backend.trtr   r   r   r   r   r   r'  r   r   r~   Lockr   r$   r2   r=   rA   rv   rB   rG   r   r   r   r   r  rH   r"   r"   r"   r#   <module>   sL    
|
2  :