o
    Ơi2                     @   s  d dl Zd dlZd dlmZmZmZmZ d dlZ	d dl
Z
d dlmZ d dlmZ d dlm  m  m  mZ d dlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZ dd	lm Z  dd
l!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) e*e+Z,ee Z-dddde)j.dddddddddfdej/de0de0dej/fddZ1ej2G dd dZ3dej4de-d edefd!d"Z5d ede3fd#d$Z6d%eege3f defd&d'Z7ej2dd(G d)d* d*Z8dS )+    N)AnyCallableOptionalSequence)SplitResult   )TRTInterpreterTRTInterpreterResult)LowerSetting)LowerPassManagerBuilder)PassFuncvalidate_inference)TimingCacheManager)TRTSplitterTRTSplitterSetting)
acc_tracer)	TRTModule)LowerPrecision
   i   i   F Tg?modulemin_acc_module_sizemax_batch_sizereturnc                 C   sJ   |r|st dt||||||||	|
|||||d}tj|d}|| |S )a  
    Takes in original module, input and lowering setting, run lowering workflow to turn module
    into lowered module, or so called TRTModule.

    Args:
        module: Original module for lowering.
        input: Input for module.
        max_batch_size: Maximum batch size (must be >= 1 to be set, 0 means not set)
        min_acc_module_size: Minimal number of nodes for an accelerated submodule
        max_workspace_size: Maximum size of workspace given to TensorRT.
        explicit_batch_dimension: Use explicit batch dimension in TensorRT if set True, otherwise use implicit batch dimension.
        lower_precision: lower_precision config given to TRTModule.
        verbose_log: Enable verbose log for TensorRT if set True.
        timing_cache_prefix: Timing cache file name for timing cache used by fx2trt.
        save_timing_cache: Update timing cache with current timing cache data if set to True.
        cuda_graph_batch_size: Cuda graph batch size, default to be -1.
        dynamic_batch: batch dimension (dim=0) is dynamic.
        use_experimental_fx_rt: Uses the next generation TRTModule which supports both Python and TorchScript based execution (including in C++).
    Returns:
        A torch.nn.Module lowered by TensorRT.
    zThe experimental unifed runtime only supports explicit batch. Please make sure to set explicit_batch_dimension=True when use_experimental_fx_rt=True)r   r   max_workspace_sizeexplicit_batch_dimensionlower_precisionverbose_logtiming_cache_prefixsave_timing_cachecuda_graph_batch_sizedynamic_batchis_atenuse_experimental_rtcorrectness_atolcorrectness_rtollower_setting)
ValueErrorr
   Lowerercreate)r   inputr   r   r   r   r   r   r   r    r!   r"   r#   use_experimental_fx_rtr%   r&   r(   lowerer r/   K/home/ubuntu/.local/lib/python3.10/site-packages/torch_tensorrt/fx/lower.pycompile   s,   '
r1   c                   @   s8   e Zd ZU eed< eed< edd ZdefddZ	dS )	LowerTrtInterpreterr(   timing_cache_managerc                 C   s   t |j|j}t||S N)r   r   r    r2   )clsr(   r3   r/   r/   r0   r+   a   s   
zLowerTrtInterpreter.creater   c           
   
   C   sF  | j jsJ dtd| d| j j  d }| j jr$| j | d}d }| jrXz| j|}td W n! tyW } ztd| dt	|  d }W Y d }~nd }~ww t
|| j j| j j| j j| j jrktjjntjjd}|j| j j| j j| j j| j j||| j jrtjjntjj| j jd	}|j}	|	r| jr| j||	 |S )
Nz$Can't find input specs for lowering!zsplit_name=z, input_specs=z.jsonzTiming cache is used!zCannot load timing cache for z: )input_specsr   explicit_precisionlogger_level)r   r   r   strict_type_constraintsalgorithm_selectortiming_cacheprofiling_verbositytactic_sources)r(   r6   loggerinfoalgo_selectorr3   get_timing_cache_trt	Exceptionwarningstrr   r   r7   r   trtLoggerVERBOSEWARNINGrunr   r   r   r9   verbose_profileProfilingVerbosityDETAILEDLAYER_NAMES_ONLYr=   serialized_cacheupdate_timing_cache)
selfmodr,   
split_namer@   
cache_dataeinterpreterinterp_resultr;   r/   r/   r0   __call__h   sT   


zLowerTrtInterpreter.__call__N)
__name__
__module____qualname__r
   __annotations__r   classmethodr+   r	   rW   r/   r/   r/   r0   r2   \   s   
 
r2   modelinputsr(   c                 C   s>   t  }|j |_|j|_|j|_t| ||d}|  | S )N)settings)r   r   use_implicit_batch_dimr   r$   r   node_support_previewgenerate_split_results)r]   r^   r(   splitter_settingsplitterr/   r/   r0   default_split_function   s   
re   c                 C   s
   t | S r4   )r2   r+   r'   r/   r/   r0   create_lower_trt_interpreter   s   
rf   create_trt_interpreterc              
      s*   dt jdtdtdtdt jf
 fdd}|S )NrQ   r,   r(   module_namer   c              	      s    |}|| ||}|j rRddl}ddlm} ddlm} | }	|	|j	  |	
 }
W d   n1 s8w   Y  ||
||j|j|dtj  d}|S t|j|j|j|jd}|S )zq
        Create a module transformation pass which lowers an `fx.GraphModule` into a
        `TRTModule`
        r   N)Device)TorchTensorRTModulezcuda:)nameinput_binding_namesoutput_binding_namestarget_device)engineinput_namesoutput_namesr!   )r$   iotorch_tensorrt._Deviceri   *torch_tensorrt.dynamo._TorchTensorRTModulerj   BytesIOwritero   	serializegetvaluerp   rq   torchcudacurrent_devicer   r!   )rQ   r,   r(   rh   rU   
interp_resrr   ri   rj   engine_bytes
engine_str
trt_modulerg   r/   r0   
lower_pass   s2   

z&default_lower_pass.<locals>.lower_pass)nnModuleInputr
   rD   )rg   r   r/   r   r0   default_lower_pass   s   &r   )frozenc                   @   sx   e Zd ZU dZeed< eeefde	de
de
dd fddZ				dd
ejdedee dee
egef  dejf
ddZd	S )r*   a,  Lowers a module using fx2trt.

    This is a composable class to facilitate fx2trt. A normal fx2trt process
    composes of the following passes to transform an `fx.GraphModule`:

        1. trace - use torch.fx to trace the module so we can get the graph
            representation of the model.
        2. split - the graph module is split into several submodules,
            running either via TensorRT, or via regular CUDA.

    For each split that need to run via TRT, the following passes are
    invoked:

        3. `TRTInterpreter` - build the TRT engine for the submodule that
            can be supported through `TRTInterpreter`.
        4. Wraps the executable TRT engine into `TRTModule`, which is an `nn.Module`.
        5. The converted submodule is then set back onto the top-level module

    lower_pass_manager_builderr(   interpreter_builder
split_funcr   c                    sF    j s| t  fdd|t|ddS | t dd |t|ddS )z!Instantiate a `Lowerer` instance.c                    s   t j| | j jdS )N)ast_rewriter_allow_listleaf_module_list)r   tracer   r   r   r^   r'   r/   r0   <lambda>   s    z Lowerer.create.<locals>.<lambda>)r(   
trace_funcr   
lower_func)r   c                 S   s   t | |S r4   )aten_tracer	opt_tracer   r/   r/   r0   r     s    )r#   r   r   )r5   r(   r   r   r/   r'   r0   r+      s"   
zLowerer.createNr   r^   additional_inputsfp16_conversion_fnc                    sN   j jj}j}t||ddtjdtdtjf fdd}|||S )N)atolrtolr   r^   r   c                    s|   |    jjjtjkr&|   d u rdd  n t fdd|D }jr1j	|}nj
|}|| }|S )Nc                 S   s    | d ur| j tjkr|  S | S r4   )dtypery   float32half)xr/   r/   r0   r   /  s    z4Lowerer.__call__.<locals>.do_lower.<locals>.<lambda>c                 3   s    | ]} |V  qd S r4   r/   ).0r   conversion_fnr/   r0   	<genexpr>5  s    z5Lowerer.__call__.<locals>.do_lower.<locals>.<genexpr>)evalr   r(   r   r   FP16r   tupler#   build_aten2trt_lower_pipelinebuild_trt_lower_pipeline)r   r^   pmlower_resultr   r   r(   rP   r   r0   do_lower   s$   
z"Lowerer.__call__.<locals>.do_lower)r   r(   r%   r&   r   r   r   r   )rP   r   r^   r   r   r   r   r   r/   r   r0   rW     s   &
zLowerer.__call__)NN)rX   rY   rZ   __doc__r   r[   r\   rf   re   r
   r   r+   r   r   r   r   rW   r/   r/   r/   r0   r*      s8   
 &r*   )9dataclassesdcloggingtypingr   r   r   r   tensorrtrE   ry   torch.fxfxtorch.nnr   4torch_tensorrt.fx.tracer.dispatch_tracer.aten_tracertracerdispatch_tracerr   torch.fx.passes.splitter_baser   fx2trtr   r	   r(   r
   !passes.lower_pass_manager_builderr   passes.pass_utilsr   r   tools.timing_cache_utilsr   tools.trt_splitterr   r   tracer.acc_tracerr   r   r   utilsr   	getLoggerrX   r>   r   r   r   intr1   	dataclassr2   GraphModulere   rf   r   r*   r/   r/   r/   r0   <module>   s|    

@A


,