o
    ƠiH                 7   @  s   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlm	Z	m
Z
mZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z& d dl'm(Z(m)Z) d dl*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZG eHeIZJe	ddde jKe jLe jMe jNe jOe jPe jQe jRe jSe jTe jUe jVe jWe jXdde jYe jZe j[e j\e j]e j^e j_e j`e jae jbe jce jde jee jfe jge jhe jie jje jke jle jme jne joe jpe jqe jre jse jtd.ddWdXZu	ddde jKe jLe jMe jNe jOe jPe jQe jRe jSe jTe jUe jVe jWe jXdde jYe jZe j[e j\e j]e j^e j_e j`e jae jbe jce jde jee jfe jge jhe jie jje jke jle jme jne joe jpe jqe jre jve jwe jxe jye jze j{e j|e jte jsdY5ddfdgZ}e8de- dfddhddrdsZ~	ddde jKe jLe jMe jNe jOe jPe jQe jRe jSe jTe jUe jVe jWe jXdde jYe jZe j[e j\e j]e j^e j_e j`e jae jbe jce jde jee jfe jge jhe jie jje jke jle jme jne joe jpe jqe jrdt,ddvdwZeddzd{Zddd}d~ZdS )    )annotationsN)Any
CollectionListOptionalSequenceSetTupleUnion)ExportedProgram)Target)Device)EngineCapabilitydtype)needs_cross_compile)Input)	_defaultspartitioning)DryRunTrackerPerSubgraphDatadryrun_stats_displayparse_non_trt_nodes)BaseEngineCacheDiskEngineCache)!replace_execute_engine_no_op_node)CompilationSettingsUnsupportedOperatorExceptionconvert_moduleinterpret_module_to_resultrepair_double_inputs)DYNAMO_CONVERTERS)DebuggerConfig)fn_supports_debugger)get_decompositionspost_loweringpre_export_lowering)resource_partition)deallocate_moduleget_cpu_memory_usageget_flat_args_with_checkget_output_metadataparse_graph_ioprepare_inputsto_torch_deviceto_torch_tensorrt_device).
arg_inputskwarg_inputsdevicedisable_tf32assume_dynamic_shape_supportsparse_weightsenabled_precisionsengine_capabilitynum_avg_timing_itersworkspace_sizedla_sram_sizedla_local_dram_sizedla_global_dram_sizetruncate_doublerequire_full_compilationmin_block_sizetorch_executed_opstorch_executed_modulespass_through_build_failuresmax_aux_streamsversion_compatibleoptimization_leveluse_python_runtimeuse_fast_partitioner"enable_experimental_decompositionsdryrunhardware_compatibletiming_cache_pathlazy_engine_initcache_built_enginesreuse_cached_enginesengine_cache_direngine_cache_sizecustom_engine_cacheuse_explicit_typinguse_fp32_accrefit_identical_engine_weightsstrip_engine_weightsimmutable_weightsenable_weight_streamingtiling_optimization_levell2_limit_for_tilingoffload_module_to_cpuuse_distributed_mode_traceenable_resource_partitioningcpu_memory_budgetexported_programr   inputs!Optional[Sequence[Sequence[Any]]]r/   r0   Optional[dict[Any, Any]]r1   *Optional[Union[Device, torch.device, str]]r2   boolr3   r4   r5   GUnion[Set[Union[torch.dtype, dtype]], Tuple[Union[torch.dtype, dtype]]]r6   r   r7   intr8   r9   r:   r;   r<   r=   r>   r?   Optional[Collection[Target]]r@   Optional[List[str]]rA   rB   Optional[int]rC   rD   rE   rF   rG   rH   rI   rJ   strrK   rL   rM   rN   rO   rP   Optional[BaseEngineCache]rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   kwargsr   returntorch.fx.GraphModulec       .   <        s  t  dkst  d dkrtdt  dt  d |0ddr,tjdtd	d
 d|0 v rG|t	j
ur;td|0d }tjdtd	d
 d|0 v r`tjdtd	d
 |(r[td|0d  }(d|0 v rytjdtd	d
 |(rttd|0d  }(|&r|(rtdt|	}	|dur|rtd| d |$rt dkst fddtjtjtjtjhD std  d|%rtd |)r|$std|du r|du r|du rtd|dur|durtd |p|}|du ri }t|tjjs|g}t|}1t|}2t|}d!d"  D  i d# r nt	j d$|d%|d&|d'|d(|dur(|nt! d)|d*|d+|d,|d-dd.|d/|d0|
d1|d2|d3|i d4|d5|	d6|d7|d8|d9|d:|d;|d<|d=|d>| d?|&d@|'dA|(dBdCdD|)dE|*|+|-|.|/dF}3dG}4|3" D ]\}5}6|5|4v r|6rd|3|5< tdH|5 dI qt#dPi |3}7t$dJ|7 t%| |7} | &t'|} | ( }8tdKt)|8j*  t+|8|7}8tdLt)|8j*  |,rt,| ( ddM t$dN ntj-. \}9}:|9|:d	 k rtdO t/|8|1|2|7};|;S )Qa&#  Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows

    Takes an exported program and a set of settings to configure the compiler
    and it will convert methods to AOT graphs which call equivalent TensorRT engines

    Arguments:
        exported_program (torch.export.ExportedProgram): Source module, running torch.export on a ``torch.nn.Module``
        inputs (Tuple[Any, ...]): List of specifications of input shape, dtype and memory layout for inputs to the module. This argument is required. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using
            torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum
            to select device type.

                .. code-block:: py

                    inputs=[
                        torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1
                        torch_tensorrt.Input(
                            min_shape=(1, 224, 224, 3),
                            opt_shape=(1, 512, 512, 3),
                            max_shape=(1, 1024, 1024, 3),
                            dtype=torch.int32
                            format=torch.channel_last
                        ), # Dynamic input shape for input #2
                        torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings
                    ]

    Keyword Arguments:
        arg_inputs (Tuple[Any, ...]): Same as inputs. Alias for better understanding with kwarg_inputs.
        kwarg_inputs (dict[Any, ...]): Optional, kwarg inputs to the module forward function.
        device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on ::

            device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True)

        disable_tf32 (bool): Force FP32 layers to use traditional as FP32 format vs the default behavior of rounding the inputs to 10-bit mantissas before multiplying, but accumulates the sum using 23-bit mantissas
        assume_dynamic_shape_support (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False
        sparse_weights (bool): Enable sparsity for convolution and fully connected layers.
        enabled_precisions (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
        capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
        num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
        workspace_size (int): Maximum size of workspace given to TensorRT
        dla_sram_size (int): Fast software managed RAM used by DLA to communicate within a layer.
        dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations
        dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution
        truncate_double (bool): Truncate weights provided in double (float64) to float32
        require_full_compilation (bool): Require modules to be compiled end to end or return an error as opposed to returning a hybrid graph where operations that cannot be run in TensorRT are run in PyTorch
        min_block_size (int): The minimum number of contiguous TensorRT convertible operations in order to run a set of operations in TensorRT
        torch_executed_ops (Collection[Target]): Set of aten operators that must be run in PyTorch. An error will be thrown if this set is not empty but ``require_full_compilation`` is True
        torch_executed_modules (List[str]): List of modules that must be run in PyTorch. An error will be thrown if this list is not empty but ``require_full_compilation`` is True
        pass_through_build_failures (bool): Error out if there are issues during compilation (only applicable to torch.compile workflows)
        max_aux_stream (Optional[int]): Maximum streams in the engine
        version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines)
        optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level.
        use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization
        use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance
        enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT.
        dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs
        hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
        timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
        lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime.
        cache_built_engines (bool): Whether to save the compiled TRT engines to storage
        reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage
        engine_cache_dir (Optional[str]): Directory to store the cached TRT engines
        engine_cache_size (Optional[int]): Maximum hard-disk space (bytes) to use for the engine cache, default is 1GB. If the cache exceeds this size, the oldest engines will be removed by default
        custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored.
        use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation which respects the precisions set in the Pytorch model. This is useful when users have mixed precision graphs.
        use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions.
        refit_identical_engine_weights (bool): Refit engines with identical weights. This is useful when the same model is compiled multiple times with different inputs and the weights are the same. This will save time by reusing the same engine for different inputs.
        strip_engine_weights (bool): Strip engine weights from the serialized engine. This is useful when the engine is to be deployed in an environment where the weights are not required.
        immutable_weights (bool): Build non-refittable engines. This is useful for some layers that are not refittable. If this argument is set to true, `strip_engine_weights` and `refit_identical_engine_weights` will be ignored.
        enable_weight_streaming (bool): Enable weight streaming.
        tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
        l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
        use_distributed_mode_trace (bool):  Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
        enable_resource_partitioning (bool): Enable resource-aware partitioning. This is useful when the model is large and the CPU memory is limited.
        cpu_memory_budget (Optional[int]): The maximum amount of CPU memory to use for the compilation. If the compilation requires more memory than this budget, the compilation will fail.
        **kwargs: Any,
    Returns:
        torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT

    Linuxr   64bitznCross compile for windows is only supported on x86-64 Linux architecture, current platform: platform.system()=z, platform.architecture()[0]=debugF`debug` is deprecated. Please use `with torch_tensorrt.dynamo.Debugger(...)` to wrap your compilation call to enable debugging functionality.   
stackleveltruncate_long_and_double}Provided configuration for "truncate_double" and deprecated API "truncate_long_and_double", please only use "truncate_double"Compiler option "truncate_long_and_double" is deprecated in favor of "truncate_double" as int64 is now natively supported, this option will be removed in the next versionrefitzw`refit` is deprecated. Please set `immutable_weights=False` to build a refittable engine whose weights can be refitted.>Use flag `immutable_weights` only. Flag `refit` is deprecated.make_refittable`make_refittable` is deprecated. Please set `immutable_weights=False` to build a refittable engine whose weights can be refittedHUse flag `immutable_weights` only. Flag `make_refittable` is deprecated.P`immutable_weights` must be False when `refit_identical_engine_weights` is True.N/Detected torch_executed_modules was non-empty: =
This feature is unimplemented in Torch-TRT Dynamo currently.   c                 3      | ]}| v V  qd S N .0xr5   r   S/home/ubuntu/.local/lib/python3.10/site-packages/torch_tensorrt/dynamo/_compiler.py	<genexpr>  
    
z,cross_compile_for_windows.<locals>.<genexpr>duse_explicit_typing was set to True, however found that enabled_precisions was also specified (saw: e, expected: dtype.f32, dtype.f4). enabled_precisions should not be used when use_explicit_typing=True4  FP32 accumulation for matmul layers is enabled. This option should only be enabled if the model already has FP16 weights and has no effect if it has FP32 weights.                      This flag inserts casts around matmul layers and ensures TensorRT executes the matmul layers in FP16 with FP32 accumulation.ZWhen enable_weight_streaming is enabled, it requires use_explicit_typing to be set to TrueA'arg_inputs', 'kwarg_inputs' and 'inputs' should not all be None.>'arg_inputs' and 'inputs' should not be used at the same time.c                 S     h | ]}t |qS r   r   _fromr   pr   r   r   	<setcomp>-      z,cross_compile_for_windows.<locals>.<setcomp>r5   r1   r3   r8   r>   r?   rA   rB   rC   rD   rE   r<   rF   r7   rG   r=   r2   r4   r6   r9   r:   r;   rH   rI   rJ   rK   rL   rM   rS   rT   rU    enable_cross_compile_for_windowsTrV   rW   )rX   rZ   r[   r\   )rE   rK   rL   rM   zarg: zR is not supported for cross compilation for windows feature, hence it is disabled.Compilation Settings: %s
Input graph: Lowered Input graph: delete_moduleThe PyTorch model was moved to the CPU to allocate all GPU memory to TensorRT. To retain the model on the GPU, set offload_module_to_cpu=FalseRemaining GPU memory may not be enough to compile the TensorRT engine for this model resulting in an OOM error, Consider setting offload_module_to_cpu=Truer   )0platformsystemarchitectureRuntimeErrorgetwarningswarnDeprecationWarningkeysr   TRUNCATE_DOUBLE
ValueErrorr   r   loggerwarninglenanytorchfloat32r   f32float4_e2m1fn_x2f4AssertionErrorro   
isinstancecollectionsabcr   r,   r.   ENABLED_PRECISIONSsetitemsr   infor%   run_decompositionsr#   modulerh   graphr$   r'   cudamem_get_infocompile_module)<r]   r^   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   rj   trt_arg_inputstrt_kwarg_inputscompilation_optionsunsupported_settingskeyvaluesettingsgmremaining_memorytotal_memorytrt_gmr   r   r   cross_compile_for_windows<   s   





 !"#$%&'.


r   )5r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   enable_autocastautocast_low_precision_typeautocast_excluded_nodesautocast_excluded_opsautocast_max_output_thresholdautocast_max_depth_of_reductionautocast_calibration_dataloaderr\   r[   r   r   #Optional[Union[torch.dtype, dtype]]r   Collection[str]r   Collection[Target]r   floatr   r   %Optional[torch.utils.data.DataLoader]c       5   A        s  |7 ddrtjdtdd |7 ddstjdtdd d|7 v r7|tjur+td	|7d }tjd
tdd d|7 v rPtjdtdd |(rKtd|7d  }(d|7 v ritjdtdd |(rdtd|7d  }(|&rq|(rqtdd|7 v r|7d rtdt	|	}	|dur|rt
d| d |.rd}$t
d |$rt dkst fddtjtjtjtjhD std  d|/durt|/tjtfstdt|/ |/tjtjhvr|/tjtjhvrtd|/ |%rt
d  |)r|$std!|du r|du r|du rtd"|dur|durtd#|p|}|du r'i }t|tjjs2|g}t|}8t|}9t |}d$d%  D  d}:|sM| rY|#durT|#nt!|!|"}:i d& r` ntj"d'|d(|d)|d*|d+|durw|nt# d,|d-|d.|d/|d0|d1|d2|d3|
d4|d5|d6|i d7|d8|	d9|d:|d;|d<|d=|d>|d?|d@|dA| d|$dB|%dC|&dD|'dE|(dd|)|*|+|,|-|.|/|0|1|2|3|4|6|5dF};t
dGt$  dH t%dQi |;}<t
&dI|< t'| |<} | (t)|} | * }=t
dJt+|=j,  t-|=|<}=t
dKt$  dH t
dLt+|=j,  |,rMt.|=ddM t.| * ddM t
&dN t
dOt$  dH ntj/0 \}>}?|>|?d k r`t
dP t1|=|8|9|<|:}@|@S )RaR)  Compile an ExportedProgram module for NVIDIA GPUs using TensorRT

    Takes a existing TorchScript module and a set of settings to configure the compiler
    and will convert methods to JIT Graphs which call equivalent TensorRT engines

    Converts specifically the forward method of a TorchScript Module

    Arguments:
        exported_program (torch.export.ExportedProgram): Source module, running torch.export on a ``torch.nn.Module``
        inputs (Optional[Sequence[Sequence[Any]]]): List of specifications of input shape, dtype and memory layout for inputs to the module. This argument is required. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using
            torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum
            to select device type.

                .. code-block:: py

                    inputs=[
                        torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1
                        torch_tensorrt.Input(
                            min_shape=(1, 224, 224, 3),
                            opt_shape=(1, 512, 512, 3),
                            max_shape=(1, 1024, 1024, 3),
                            dtype=torch.int32
                            format=torch.channel_last
                        ), # Dynamic input shape for input #2
                        torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings
                    ]

    Keyword Arguments:
        arg_inputs (Optional[Sequence[Sequence[Any]]]): Same as inputs. Alias for better understanding with kwarg_inputs.
        kwarg_inputs (Optional[dict[Any, Any]]): kwarg inputs to the module forward function.
        device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on ::

            device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True)

        disable_tf32 (bool): Force FP32 layers to use traditional as FP32 format vs the default behavior of rounding the inputs to 10-bit mantissas before multiplying, but accumulates the sum using 23-bit mantissas
        assume_dynamic_shape_support (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False
        sparse_weights (bool): Enable sparsity for convolution and fully connected layers.
        enabled_precisions (Union[Set[Union[torch.dtype, dtype]], Tuple[Union[torch.dtype, dtype]]]): The set of datatypes that TensorRT can use when selecting kernels
        engine_capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
        num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
        workspace_size (int): Maximum size of workspace given to TensorRT
        dla_sram_size (int): Fast software managed RAM used by DLA to communicate within a layer.
        dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations
        dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution
        truncate_double (bool): Truncate weights provided in double (float64) to float32
        require_full_compilation (bool): Require modules to be compiled end to end or return an error as opposed to returning a hybrid graph where operations that cannot be run in TensorRT are run in PyTorch
        min_block_size (int): The minimum number of contiguous TensorRT convertible operations in order to run a set of operations in TensorRT
        torch_executed_ops (Optional[Collection[Target]]): Set of aten operators that must be run in PyTorch. An error will be thrown if this set is not empty but ``require_full_compilation`` is True
        torch_executed_modules (Optional[List[str]]): List of modules that must be run in PyTorch. An error will be thrown if this list is not empty but ``require_full_compilation`` is True
        pass_through_build_failures (bool): Error out if there are issues during compilation (only applicable to torch.compile workflows)
        max_aux_streams (Optional[int]): Maximum streams in the engine
        version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines)
        optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level.
        use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization
        use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance
        enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT.
        dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs
        hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
        timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
        lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime.
        cache_built_engines (bool): Whether to save the compiled TRT engines to storage
        reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage
        engine_cache_dir (str): Directory to store the cached TRT engines
        engine_cache_size (int): Maximum hard-disk space (bytes) to use for the engine cache, default is 1GB. If the cache exceeds this size, the oldest engines will be removed by default
        custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored.
        use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation which respects the precisions set in the Pytorch model. This is useful when users have mixed precision graphs.
        use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions.
        refit_identical_engine_weights (bool): Refit engines with identical weights. This is useful when the same model is compiled multiple times with different inputs and the weights are the same. This will save time by reusing the same engine for different inputs.
        strip_engine_weights (bool): Strip engine weights from the serialized engine. This is useful when the engine is to be deployed in an environment where the weights are not required.
        immutable_weights (bool): Build non-refittable engines. This is useful for some layers that are not refittable. If this argument is set to true, `strip_engine_weights` and `refit_identical_engine_weights` will be ignored.
        enable_weight_streaming (bool): Enable weight streaming.
        tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
        l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
        offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
        use_distributed_mode_trace (bool):  Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
        enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
        autocast_low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
        autocast_excluded_nodes (Collection[str]): The set of regex patterns to match user-specified node names that should remain in FP32. Default is [].
        autocast_excluded_ops (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].
        autocast_max_output_threshold (float): Maximum absolute value for node outputs, nodes with outputs greater than this value will remain in FP32. Default is 512.
        autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. This helps prevent excessive accuracy loss in operations particularly sensitive to reduced precision, as higher-depth reductions may amplify computation errors in low precision formats. If not provided, infinity will be used. Default is None.
        autocast_calibration_dataloader (Optional[torch.utils.data.DataLoader]): The dataloader to use for autocast calibration. Default is None.
        enable_resource_partitioning (bool): Enable resource-aware partitioning. This is useful when the model is large and the CPU memory is limited.
        cpu_memory_budget (Optional[int]): The maximum amount of CPU memory to use for the compilation. If the compilation requires more memory than this budget, the compilation will fail.
        **kwargs: Any,
    Returns:
        torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
    ro   Frp   rq   rr   rQ   zi`use_explicit_typing` is deprecated. This setting will be removed and you should enable autocast instead.rt   ru   rv   rw   v`refit` is deprecated. Please set `immutable_weights=False` to build a refittable engine whose weights can be refittedrx   ry   rz   r{   r|   r   Please use torch_tensorrt.dynamo.cross_compile_for_windows() if you want to cross compile the module in Linux for inferencing in Windows.Nr}   r~   Tz9Autocast is enabled, setting use_explicit_typing to True.r   c                 3  r   r   r   r   r   r   r   r   y  r   zcompile.<locals>.<genexpr>r   r   zVautocast_low_precision_type must be a torch.dtype or torch_tensorrt._enums.dtype, got zeautocast_low_precision_type must be one of torch.float16, torch.bfloat16, dtype.f16, dtype.bf16, got r   r   r   r   c                 S  r   r   r   r   r   r   r   r     r   zcompile.<locals>.<setcomp>r5   r1   r3   r8   r>   r?   rA   rB   rC   rD   rE   r<   rF   r7   rG   r=   r2   r4   r6   r9   r:   r;   rH   rI   rJ   rK   rL   rM   rR   rS   rT   rU   )rV   rW   rX   rY   rZ   r   r   r   r   r   r   r   r[   r\   z"CPU memory usage before lowering: z MBr   r   z&CPU memory usage after post_lowering: r   r   r   z$CPU memory usage after CPU offload: r   r   )2r   r   r   r   r   r   r   r   r   r   r   r   ro   r   r   r   r   r   r   r   r   r   r   typefloat16bfloat16f16bf16r   r   r   r,   r.   r   r   r   r(   r   r   r%   r   r#   r   rh   r   r$   r'   r   r   r   )Ar]   r^   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r   r   r   r   r   r   r   r\   r[   rj   r   r   engine_cacher   r   r   r   r   r   r   r   r   compile  s   









 !"#$%&'6


r   )_debugger_configr   sample_arg_inputsSequence[Input]sample_kwarg_inputsr   r   r   r   Optional[DebuggerConfig]c          #   	     s  t dd |   D rtd t }|du ri }t| t	| |j
\}}||_||_||_|jr?|jdkr?td |dksK||jk r[|js[t| d| d	|j  | S td
| d| d d>dd}	|	| sutd t| dd}
t| dd}d?dd}d}|jrztd tj| |j|j
|j||kd\}}W n tjjjjy   tjd d!d" d!}d|_Y nw |jstd# tj| |j|j
|jd$\}}|jrt||jd%}|j |_!|js|j"#t$| i }|j%j&D ]}d&|j'vrq|||j'< q||
|| i }t(| D ]}|)d'rt*| | q|+ D ]L\}}t||}t,|tjj-j.s4q|jr[d&|vr[|j"#t$| td(t/|t/|j% |0t1|j2 q||vrht3d)| d*t4|}t5|dksuJ d+d,g}|D ]&  || j6vr fd-d.|D }||| j6 < td/| d0  nq{t7 }||_8t5d1d. |j%j&D |_9t:|}|dusJ td2t/|d3d. |D t/|j% |j;rt<|||t1|j2|}t=|| | j>d7  _>|j?@| tjAB  |jsktC|||||d4}|||< |rk|jDrD|jEr#|jFd5krt3d6|G  n!|jFd5kr-t3d7tHjIJ|jKd8}tHjL|d!d9 |jG||jFd: |jMrktNtHjIJ|jKd;d<}|O|P  W d   n	1 sfw   Y  qt=| | |Q D ]\}}tR||| |jSr|jTst||}|U  qvtV|j%j&d= }|jWD ]} | D ]}!|!jX}"d&t/|"vrqt||"Yd! qq|rd!|_tZ||j |S )@aw  Compile a traced FX module

    Includes: Partitioning + Conversion Phases

    Args:
        module: FX GraphModule to convert
        arg_inputs: Inputs to the module
        kwarg_inputs: kwargs to the module
        settings: Compilation settings
        engine_cache: Engine cache instance to store/load compiled engines
    Returns:
        Compiled FX GraphModule
    c                 s  s    | ]}|j V  qd S r   )requires_grad)r   vr   r   r   r   +  s    z!compile_module.<locals>.<genexpr>zZThe model may be in training mode, which may affect the performance of the compiled model!Nr   z^It is recommended to run `dryrun` mode with `min_block_size=1`, for the most thorough analysisr   z6 supported operations detected in subgraph containing zV computational nodes. Skipping this subgraph, since min_block_size was detected to be zDetected support for z operators out of z in subgraph.r   rl   rk   rb   c                 S  sN   | j jD ] }|jdkr$|js$d|jvr$td|j d|j d  dS qdS )NoutputvalzNode z of op type zI does not have metadata. This could sometimes lead to undefined behavior.FT)r   nodesopmetar   r   name)r   noder   r   r   contains_metadataT  s   z)compile_module.<locals>.contains_metadatazSome nodes do not have metadata (shape and dtype information). This could lead to problems sometimes if the graph has PyTorch and TensorRT segments._in_spec	_out_specin_specr   out_spectarget_moduleNonec                 S  s$   | dur| |_ |dur||_dS dS )z
        Applies input and output specs to the target module.

        Args:
            in_spec: The input spec to apply
            out_spec: The output spec to apply
            target_module: The module to apply specs to
        N)r   r   )r   r   r   r   r   r   preserve_module_specsi  s
   
z-compile_module.<locals>.preserve_module_specsFz/Partitioning the graph via the fast partitioner)r>   r?   r=   skip_fusionziPartitioning failed on the subgraph with fast partition. See trace above. Retrying with global partition.Texc_infoz1Partitioning the graph via the global partitioner)r>   r?   r=   )r\   _run_on_acc_frozen_paramzSubmodule in PyTorch: %s
 %sznode_name: z0 does not exist in the submodule node dictionaryr   tensor_metac                   s   g | ]
} |v r|  qS r   r   )r   metadatar   r   r   
<listcomp>  s    z"compile_module.<locals>.<listcomp>zUpdated metadata for node: z) with its corresponding submodule outputsc                 S  s   g | ]	}|j d v r|qS ))call_functioncall_methodcall_module)r   )r   r   r   r   r   r     s
    
z.Converting submodule: %s
 Input shapes: %s
 %sc                 S  s   g | ]}|j qS r   )shape)r   inputr   r   r   r     s    )r   r   r   	cudagraphzProfiling with TREX can only be enabled when using the C++ runtime. Python runtime profiling only support cudagraph visualization.zProfiling with Cudagraph can only be enabled when using the Python runtime. C++ runtime profiling only support TREX/Perfetto visualization.engine_visualization_profile)exist_ok)profiling_results_dirprofile_formatzengine_layer_info.jsonw)r   rl   rk   rb   )r   r   r   r   r   rl   rk   r   )[r   
state_dictvaluesr   r   r   
CONVERTERSset_compilation_settingsr   get_graph_converter_supportr?   total_ops_in_graphsupported_ops_in_graphcompilation_settingsrH   r>   r   ro   getattrrF   fast_partitionr=   r   fxpassessplitter_baseFxNetSplitterInternalErrorerrorglobal_partitionr[   r&   r\   unsupported_operatorsunsupported_opsto_run_in_torchextendr   r   r   r   dir
startswithdelattrnamed_childrenr   graph_moduleGraphModulerh   tor-   r1   r   r*   r   r   r   subgraph_namesubgraph_op_countconstruct_submodule_inputsr<   r   r+   tensorrt_graph_countper_subgraph_dataappendr   empty_cacher   save_engine_profilerE   r
  enable_profilingospathjoinlogging_dirmakedirssave_layer_infoopenwriteget_layer_infor   setattrrK   r   setup_enginelistargstargetset_output_tensors_as_unownedr   )#r   r   r   r   r   r   dryrun_trackernum_supported_ops	total_opsr   original_in_specoriginal_out_specr   fast_partitioner_failedpartitioned_modulesupported_opssubmodule_node_dictr   trt_modulesattrr   _	submodulemetadata_listmetadata_keysmeta_val_listsubgraph_datasubmodule_inputs
trt_moduler2  foutput_nodeargr   r>  r   r   r   r     s  

















	
	




r   ),r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   bytesc       ,   ;   
     sf  |. ddrtjdtdd d|. v r)|tjurtd|.d }tjdtdd d	|. v rBtjd
tdd |(r=td|.d	  }(d|. v r[tjdtdd |(rVtd|.d  }(|&rc|(rctdd|. v rq|.d rqtdt	|	}	|dur|rt
d| d |$rt dkst fddtjtjtjtjhD std  d|%rt
d |)r|$std|du r|du r|du rtd|dur|durtd|p|}|du ri }t|tjjs|g}t|}/t|}0t|}dd  D  d}1|s| r|#dur	|#nt|!|"}1i d  r ntjd!|d"|d#|d$|d%|dur,|nt d&|d'|d(|d)|d*|d+|d,|d-|
d.|d/|d0|i d1|d2|	d3|d4|d5|d6|d7|d8|d9|d:|d;| d<|$d=|%d>|&d?|'d@|(dd|)|*|+|,|-dA}2tdOi |2}3t
 dB|3 t!| |3} | "t#|} | $ }4t
dCt%|4j&  t'|4|3}4t
dDt%|4j&  |,rt(| $ ddE t
 dF ntj)* \}5}6|5|6d k rt
dG t+| t,|/|0dH }7z
t-|4|7|3|1dI}8W n1 t.y   t
j/dJ|4 dKdLdM Y n t0y- }9 zt
j/dN|9 dLdM W Y d}9~9nd}9~9ww |8j1}:|:S )PaU"  Convert an ExportedProgram to a serialized TensorRT engine

    Converts an ExportedProgram to a serialized TensorRT engine given a dictionary of conversion settings

    Arguments:
        exported_program (torch.export.ExportedProgram): Source module, running torch.export on a ``torch.nn.Module``
        inputs (Optional[Sequence[Sequence[Any]]]): List of specifications of input shape, dtype and memory layout for inputs to the module. This argument is required. Input Sizes can be specified as torch sizes, tuples or lists. dtypes can be specified using
            torch datatypes or torch_tensorrt datatypes and you can use either torch devices or the torch_tensorrt device type enum
            to select device type.

                .. code-block:: py

                    inputs=[
                        torch_tensorrt.Input((1, 3, 224, 224)), # Static NCHW input shape for input #1
                        torch_tensorrt.Input(
                            min_shape=(1, 224, 224, 3),
                            opt_shape=(1, 512, 512, 3),
                            max_shape=(1, 1024, 1024, 3),
                            dtype=torch.int32
                            format=torch.channel_last
                        ), # Dynamic input shape for input #2
                        torch.randn((1, 3, 224, 244)) # Use an example tensor and let torch_tensorrt infer settings
                    ]

    Keyword Arguments:
        arg_inputs (Optional[Sequence[Sequence[Any]]]): Same as inputs. Alias for better understanding with kwarg_inputs.
        kwarg_inputs (Optional[dict[Any, Any]]): kwarg inputs to the module forward function.
        device (Union(torch_tensorrt.Device, torch.device, dict)): Target device for TensorRT engines to run on ::

            device=torch_tensorrt.Device("dla:1", allow_gpu_fallback=True)

        disable_tf32 (bool): Force FP32 layers to use traditional as FP32 format vs the default behavior of rounding the inputs to 10-bit mantissas before multiplying, but accumulates the sum using 23-bit mantissas
        assume_dynamic_shape_support (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False
        sparse_weights (bool): Enable sparsity for convolution and fully connected layers.
        enabled_precisions (Union[Set[Union[torch.dtype, dtype]], Tuple[Union[torch.dtype, dtype]]]): The set of datatypes that TensorRT can use when selecting kernels
        engine_capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
        num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
        workspace_size (int): Maximum size of workspace given to TensorRT
        dla_sram_size (int): Fast software managed RAM used by DLA to communicate within a layer.
        dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations
        dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution
        truncate_double (bool): Truncate weights provided in double (float64) to float32
        require_full_compilation (bool): Require modules to be compiled end to end or return an error as opposed to returning a hybrid graph where operations that cannot be run in TensorRT are run in PyTorch
        min_block_size (int): The minimum number of contiguous TensorRT convertible operations in order to run a set of operations in TensorRT
        torch_executed_ops (Optional[Collection[Target]]): Set of aten operators that must be run in PyTorch. An error will be thrown if this set is not empty but ``require_full_compilation`` is True
        torch_executed_modules (Optional[List[str]]): List of modules that must be run in PyTorch. An error will be thrown if this list is not empty but ``require_full_compilation`` is True
        pass_through_build_failures (bool): Error out if there are issues during compilation (only applicable to torch.compile workflows)
        max_aux_streams (Optional[int]): Maximum streams in the engine
        version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines)
        optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level.
        use_python_runtime: (bool): Return a graph using a pure Python runtime, reduces options for serialization
        use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance
        enable_experimental_decompositions (bool): Use the full set of operator decompositions. These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT.
        dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs
        hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
        timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
        lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime.
        cache_built_engines (bool): Whether to save the compiled TRT engines to storage
        reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage
        engine_cache_dir (str): Directory to store the cached TRT engines
        engine_cache_size (int): Maximum hard-disk space (bytes) to use for the engine cache, default is 1GB. If the cache exceeds this size, the oldest engines will be removed by default
        custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored.
        use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation which respects the precisions set in the Pytorch model. This is useful when users have mixed precision graphs.
        use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions.
        refit_identical_engine_weights (bool): Refit engines with identical weights. This is useful when the same model is compiled multiple times with different inputs and the weights are the same. This will save time by reusing the same engine for different inputs.
        strip_engine_weights (bool): Strip engine weights from the serialized engine. This is useful when the engine is to be deployed in an environment where the weights are not required.
        immutable_weights (bool): Build non-refittable engines. This is useful for some layers that are not refittable. If this argument is set to true, `strip_engine_weights` and `refit_identical_engine_weights` will be ignored.
        enable_weight_streaming (bool): Enable weight streaming.
        tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
        l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
        offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
        use_distributed_mode_trace (bool):  Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model.
        **kwargs: Any,
    Returns:
        bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
    ro   Frp   rq   rr   rt   ru   rv   rw   r   rx   ry   rz   r{   r|   r   r   Nr}   r~   r   c                 3  r   r   r   r   r   r   r   r     r   zDconvert_exported_program_to_serialized_trt_engine.<locals>.<genexpr>r   r   r   r   r   r   c                 S  r   r   r   r   r   r   r   r   =  r   zDconvert_exported_program_to_serialized_trt_engine.<locals>.<setcomp>r5   r1   r3   r8   r>   r?   rA   rB   rC   rD   rE   r<   rF   r7   rG   r=   r2   r4   r6   r9   r:   r;   rH   rI   rJ   rK   rL   rM   rQ   rR   rS   rT   rU   )rV   rW   rX   rY   rZ   r   r   r   r   r   r   r   )r^   r   r   zConversion of module z. not currently fully supported or convertible!Tr   z,While interpreting the module got an error: r   )2r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ro   r   r   r   r   r,   r.   r   r   r   r   r   r%   r   r#   r   rh   r   r$   r'   r   r   r)   r<  r   r   r  	Exceptionserialized_engine);r]   r^   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   rj   r   r   r   r   r   r   r   r   flattened_input_listinterpreter_resulterX  r   r   r   1convert_exported_program_to_serialized_trt_engineN  s   






 !"#$%&'.





r\  	file_pathr   c                 C  sF   |st dddlm} || dd}tj|| td|  dS )z
    Save cross compiled exported program to disk.

    Arguments:
        module (torch.fx.GraphModule): Cross compiled Torch-TensorRT module
        file_path (str): the file path where the exported program will be saved to disk
    ;File path cannot be empty. Please provide a valid file pathr   )exportT)cross_compile_modulez-successfully saved the module for windows at N)r   torch_tensorrt.dynamo._exporterr_  r   saver   ro   )r   r]  r_  exp_programr   r   r   $save_cross_compiled_exported_program  s   rd   c              
   C  s   | st dt dkst dkrt dztd|  d tj| }W t|S  t	yK } ztj
d|  d| dd	 t d
|  dd}~ww )a%  
    Load an ExportedProgram file in Windows which was previously cross compiled in Linux

    Arguments:
        file_path (str): Path to file on the disk

    Raises:
        ValueError: If the api is not called in windows or there is no file or the file is a valid ExportedProgram file
    r^  WindowsAMD64zMcross runtime compiled model for windows can only be loaded in Windows systemzLoading the provided file z using torch.export.load()z: via torch.export.load() failed with the following error: Tr   zcross_load the file zL doesn't correspond to a valid ExportedProgram. Please verify the file path.N)r   r   r   machiner   ro   r   r_  loadrW  r   r   )r]  rc  r[  r   r   r   $load_cross_compiled_exported_program  s(   


rj  r   )dr]   r   r^   r_   r/   r_   r0   r`   r1   ra   r2   rb   r3   rb   r4   rb   r5   rc   r6   r   r7   rd   r8   rd   r9   rd   r:   rd   r;   rd   r<   rb   r=   rb   r>   rd   r?   re   r@   rf   rA   rb   rB   rg   rC   rb   rD   rg   rE   rb   rF   rb   rG   rb   rH   rb   rI   rb   rJ   rh   rK   rb   rL   rb   rM   rb   rN   rh   rO   rd   rP   ri   rQ   rb   rR   rb   rS   rb   rT   rb   rU   rb   rV   rb   rW   rh   rX   rd   rY   rb   rZ   rb   r[   rb   r\   rg   rj   r   rk   rl   )rr]   r   r^   r_   r/   r_   r0   r`   r1   ra   r2   rb   r3   rb   r4   rb   r5   rc   r6   r   r7   rd   r8   rd   r9   rd   r:   rd   r;   rd   r<   rb   r=   rb   r>   rd   r?   re   r@   rf   rA   rb   rB   rg   rC   rb   rD   rg   rE   rb   rF   rb   rG   rb   rH   rb   rI   rb   rJ   rh   rK   rb   rL   rb   rM   rb   rN   rh   rO   rd   rP   ri   rQ   rb   rR   rb   rS   rb   rT   rb   rU   rb   rV   rb   rW   rh   rX   rd   rY   rb   rZ   rb   r   rb   r   r   r   r   r   r   r   r   r   rg   r   r   r\   rg   r[   rb   rj   r   rk   rl   )r   rl   r   r   r   r`   r   r   r   ri   r   r   rk   rl   )`r]   r   r^   r_   r/   r_   r0   r`   r1   ra   r2   rb   r3   rb   r4   rb   r5   rc   r6   r   r7   rd   r8   rd   r9   rd   r:   rd   r;   rd   r<   rb   r=   rb   r>   rd   r?   re   r@   rf   rA   rb   rB   rg   rC   rb   rD   rg   rE   rb   rF   rb   rG   rb   rH   rb   rI   rb   rJ   rh   rK   rb   rL   rb   rM   rb   rN   rh   rO   rd   rP   ri   rQ   rb   rR   rb   rS   rb   rT   rb   rU   rb   rV   rb   rW   rh   rX   rd   rY   rb   rZ   rb   rj   r   rk   rV  )r   rl   r]  rh   rk   r   )re  )r]  rh   rk   r   )
__future__r   collections.abcr   loggingr1  r   r   typingr   r   r   r   r   r   r	   r
   r   torch.exportr   torch.fx.noder   torch_tensorrt._Devicer   torch_tensorrt._enumsr   r   torch_tensorrt._featuresr   torch_tensorrt._Inputr   torch_tensorrt.dynamor   r   $torch_tensorrt.dynamo._DryRunTrackerr   r   r   r   #torch_tensorrt.dynamo._engine_cacher   r   ra  r    torch_tensorrt.dynamo.conversionr   r   r   r   r   3torch_tensorrt.dynamo.conversion._ConverterRegistryr    r  +torch_tensorrt.dynamo.debug._DebuggerConfigr!   .torch_tensorrt.dynamo.debug._supports_debuggerr"   torch_tensorrt.dynamo.loweringr#   r$   r%   8torch_tensorrt.dynamo.partitioning._resource_partitionerr&   torch_tensorrt.dynamo.utilsr'   r(   r)   r*   r+   r,   r-   r.   	getLogger__name__r   DEVICEDISABLE_TF32ASSUME_DYNAMIC_SHAPE_SUPPORTSPARSE_WEIGHTSr   ENGINE_CAPABILITYNUM_AVG_TIMING_ITERSWORKSPACE_SIZEDLA_SRAM_SIZEDLA_LOCAL_DRAM_SIZEDLA_GLOBAL_DRAM_SIZEr   REQUIRE_FULL_COMPILATIONMIN_BLOCK_SIZEPASS_THROUGH_BUILD_FAILURESMAX_AUX_STREAMSVERSION_COMPATIBLEOPTIMIZATION_LEVELUSE_PYTHON_RUNTIMEUSE_FAST_PARTITIONER"ENABLE_EXPERIMENTAL_DECOMPOSITIONSDRYRUNHARDWARE_COMPATIBLETIMING_CACHE_PATHLAZY_ENGINE_INITCACHE_BUILT_ENGINESREUSE_CACHED_ENGINESENGINE_CACHE_DIRENGINE_CACHE_SIZECUSTOM_ENGINE_CACHEUSE_EXPLICIT_TYPINGUSE_FP32_ACCREFIT_IDENTICAL_ENGINE_WEIGHTSSTRIP_ENGINE_WEIGHTSIMMUTABLE_WEIGHTSENABLE_WEIGHT_STREAMINGTILING_OPTIMIZATION_LEVELL2_LIMIT_FOR_TILINGOFFLOAD_MODULE_TO_CPUUSE_DISTRIBUTED_MODE_TRACEENABLE_RESOURCE_PARTITIONINGCPU_MEMORY_BUDGETr   ENABLE_AUTOCASTAUTOCAST_LOW_PRECISION_TYPEAUTOCAST_EXCLUDED_NODESAUTOCAST_EXCLUDED_OPSAUTOCAST_MAX_OUTPUT_THRESHOLDAUTOCAST_MAX_DEPTH_OF_REDUCTIONAUTOCAST_CALIBRATION_DATALOADERr   r   r\  rd  rj  r   r   r   r   <module>   s   ((
  T   
  =  _