o
    Ơi@2                  	   @   s  d dl mZmZ d dlmZmZmZmZmZm	Z	 d dl
Zd dlZd dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC eG dd	 d	ZDh d
ZEeFejGdseEHd deDdeDdeeIeeJ f fddZKdS )    )	dataclassfield)Any
CollectionOptionalSetTupleUnionN)Target)Device)EngineCapabilitydtype)/ASSUME_DYNAMIC_SHAPE_SUPPORTAUTOCAST_CALIBRATION_DATALOADERAUTOCAST_EXCLUDED_NODESAUTOCAST_EXCLUDED_OPSAUTOCAST_LOW_PRECISION_TYPEAUTOCAST_MAX_DEPTH_OF_REDUCTIONAUTOCAST_MAX_OUTPUT_THRESHOLDCACHE_BUILT_ENGINESCPU_MEMORY_BUDGETDISABLE_TF32DLA_GLOBAL_DRAM_SIZEDLA_LOCAL_DRAM_SIZEDLA_SRAM_SIZEDRYRUNENABLE_AUTOCAST ENABLE_CROSS_COMPILE_FOR_WINDOWS"ENABLE_EXPERIMENTAL_DECOMPOSITIONSENABLE_RESOURCE_PARTITIONINGENABLE_WEIGHT_STREAMINGENABLED_PRECISIONSENGINE_CAPABILITYHARDWARE_COMPATIBLEIMMUTABLE_WEIGHTSL2_LIMIT_FOR_TILINGLAZY_ENGINE_INITMAX_AUX_STREAMSMIN_BLOCK_SIZENUM_AVG_TIMING_ITERSOFFLOAD_MODULE_TO_CPUOPTIMIZATION_LEVELPASS_THROUGH_BUILD_FAILURESREFIT_IDENTICAL_ENGINE_WEIGHTSREQUIRE_FULL_COMPILATIONREUSE_CACHED_ENGINESSPARSE_WEIGHTSSTRIP_ENGINE_WEIGHTSTILING_OPTIMIZATION_LEVELTIMING_CACHE_PATHTRUNCATE_DOUBLEUSE_DISTRIBUTED_MODE_TRACEUSE_EXPLICIT_TYPINGUSE_FAST_PARTITIONERUSE_FP32_ACCUSE_PYTHON_RUNTIMEVERSION_COMPATIBLEWORKSPACE_SIZEdefault_devicec                   @   s  e Zd ZU dZedd dZee ed< e	Z
eed< eZeed< eedZee ed< eZeed	< eZee ed
< eZeed< eZee ed< eZee ed< eZeed< e Z!eed< e"Z#eed< ee$dZ%e&ed< e'Z(eed< e)Z*eed< e+Z,eed< e-Z.eed< edd dZ/e0ed< e1Z2eed< e3Z4eed< e5Z6eed< e7Z8eed< e9Z:e;ee<f ed< e=Z>eed< e?Z@e<ed< eAZBeed< eCZDeed < eEZFeed!< eGZHeed"< eIZJeed#< eKZLeed$< eMZNeed%< eOZPeed&< eQZReed'< eSZTeed(< eUZVe<ed)< eWZXeed*< eYZZeed+< e[Z\eed,< e]Z^eed-< e_Z`ee ed.< ed/d dZaee< ed0< ed1d dZbee ed2< ecZdeeed3< efZgee ed4< ehZieejjkjljm ed5< enZoeed6< epZqeed7< d8ere<esf fd9d:Ztd;ere<esf d8d<fd=d>Zud<S )?CompilationSettingsap  Compilation settings for Torch-TensorRT Dynamo Paths

    Args:
        enabled_precisions (Set[dtype]): Available kernel dtype precisions
        debug (bool): Whether to print out verbose debugging information
        workspace_size (int): Workspace TRT is allowed to use for the module (0 is default)
        min_block_size (int): Minimum number of operators per TRT-Engine Block
        torch_executed_ops (Collection[Target]): Collection of operations to run in Torch, regardless of converter coverage
        pass_through_build_failures (bool): Whether to fail on TRT engine build errors (True) or not (False)
        max_aux_streams (Optional[int]): Maximum number of allowed auxiliary TRT streams for each engine
        version_compatible (bool): Provide version forward-compatibility for engine plan files
        optimization_level (Optional[int]): Builder optimization 0-5, higher levels imply longer build time,
            searching for more optimization options. TRT defaults to 3
        use_python_runtime (Optional[bool]): Whether to strictly use Python runtime or C++ runtime. To auto-select a runtime
            based on C++ dependency presence (preferentially choosing C++ runtime if available), leave the
            argument as None
        truncate_double (bool): Whether to truncate float64 TRT engine inputs or weights to float32
        use_fast_partitioner (bool): Whether to use the fast or global graph partitioning system
        enable_experimental_decompositions (bool): Whether to enable all core aten decompositions
            or only a selected subset of them
        device (Device): GPU to compile the model on
        require_full_compilation (bool): Whether to require the graph is fully compiled in TensorRT.
            Only applicable for `ir="dynamo"`; has no effect for `torch.compile` path
        assume_dynamic_shape_support (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False
        disable_tf32 (bool): Whether to disable TF32 computation for TRT layers
        sparse_weights (bool): Whether to allow the builder to use sparse weights
        engine_capability (trt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
        num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
        dla_sram_size (int): Fast software managed RAM used by DLA to communicate within a layer.
        dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations
        dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution
        dryrun (Union[bool, str]): Toggle "Dryrun" mode, which runs everything through partitioning, short of conversion to
            TRT Engines. Prints detailed logs of the graph structure and nature of partitioning. Optionally saves the
            output to a file if a string path is specified
        hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
        timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
        cache_built_engines (bool): Whether to save the compiled TRT engines to storage
        reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage
        use_strong_typing (bool): This flag enables strong typing in TensorRT compilation which respects the precisions set in the Pytorch model. This is useful when users have mixed precision graphs.
        use_fp32_acc (bool): This option inserts cast to FP32 nodes around matmul layers and TensorRT ensures the accumulation of matmul happens in FP32. Use this only when FP16 precision is configured in enabled_precisions.
        refit_identical_engine_weights (bool): Whether to refit the engine with identical weights
        strip_engine_weights (bool): Whether to strip the engine weights
        immutable_weights (bool): Build non-refittable engines. This is useful for some layers that are not refittable. If this argument is set to true, `strip_engine_weights` and `refit_identical_engine_weights` will be ignored
        enable_weight_streaming (bool): Enable weight streaming.
        enable_cross_compile_for_windows (bool): By default this is False means TensorRT engines can only be executed on the same platform where they were built.
            True will enable cross-platform compatibility which allows the engine to be built on Linux and run on Windows
        tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
        l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
        use_distributed_mode_trace (bool):  Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
        enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
        autocast_low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
        autocast_excluded_nodes (Collection[str]): The set of regex patterns to match user-specified node names that should remain in FP32. Default is [].
        autocast_excluded_ops (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].
        autocast_max_output_threshold (float): Maximum absolute value for node outputs, nodes with outputs greater than this value will remain in FP32. Default is 512.
        autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. This helps prevent excessive accuracy loss in operations particularly sensitive to reduced precision, as higher-depth reductions may amplify computation errors in low precision formats. If not provided, infinity will be used. Default is None.
        autocast_calibration_dataloader (Optional[torch.utils.data.DataLoader]): The dataloader to use for autocast calibration. Default is None.
    c                   C      t S N)r!    r@   r@   S/home/ubuntu/.local/lib/python3.10/site-packages/torch_tensorrt/dynamo/_settings.py<lambda>x       zCompilationSettings.<lambda>)default_factoryenabled_precisionsworkspace_sizemin_block_sizetorch_executed_opspass_through_build_failuresmax_aux_streamsversion_compatibleoptimization_leveluse_python_runtimetruncate_doubleuse_fast_partitioner"enable_experimental_decompositionsdevicerequire_full_compilationdisable_tf32assume_dynamic_shape_supportsparse_weightsc                   C   r>   r?   )r"   r@   r@   r@   rA   rB      rC   engine_capabilitynum_avg_timing_itersdla_sram_sizedla_local_dram_sizedla_global_dram_sizedryrunhardware_compatibletiming_cache_pathlazy_engine_initcache_built_enginesreuse_cached_enginesuse_explicit_typinguse_fp32_accrefit_identical_engine_weightsstrip_engine_weightsimmutable_weightsenable_weight_streaming enable_cross_compile_for_windowstiling_optimization_levell2_limit_for_tilinguse_distributed_mode_traceoffload_module_to_cpuenable_autocastautocast_low_precision_typec                   C   r>   r?   )r   r@   r@   r@   rA   rB      rC   autocast_excluded_nodesc                   C   r>   r?   )r   r@   r@   r@   rA   rB      rC   autocast_excluded_opsautocast_max_output_thresholdautocast_max_depth_of_reductionautocast_calibration_dataloaderenable_resource_partitioningcpu_memory_budgetreturnc                    s4   ddl m  | j } fdd|d D |d< |S )Nr   ConverterRegistryc                    s$   h | ]}t |tr|n |qS r@   )
isinstancestrqualified_name_or_str).0oprv   r@   rA   	<setcomp>   s    z3CompilationSettings.__getstate__.<locals>.<setcomp>rH   )3torch_tensorrt.dynamo.conversion._ConverterRegistryrw   __dict__copyselfstater@   rv   rA   __getstate__   s   


z CompilationSettings.__getstate__r   Nc                 C   s   | j | d S r?   )r   updater   r@   r@   rA   __setstate__   s   z CompilationSettings.__setstate__)v__name__
__module____qualname____doc__r   rE   r   r   __annotations__r;   rF   intr(   rG   setrH   r   r
   r,   rI   boolr'   rJ   r   r:   rK   r+   rL   r9   rM   r4   rN   r7   rO   r   rP   r<   rQ   r   r.   rR   r   rS   r   rT   r0   rU   rV   r   r)   rW   r   rX   r   rY   r   rZ   r   r[   r	   ry   r#   r\   r3   r]   r&   r^   r   r_   r/   r`   r6   ra   r8   rb   r-   rc   r1   rd   r$   re   r    rf   r   rg   r2   rh   r%   ri   r5   rj   r*   rk   r   rl   r   rm   rn   ro   r   rp   floatr   rq   r   rr   torchutilsdata
DataLoaderr   rs   r   rt   dictr   r   r   r@   r@   r@   rA   r=   <   sv   
 :r=   >   rS   rU   rl   rJ   rV   re   rE   rL   rK   r\   ri   ro   rn   rf   rh   rm   rp   rc   rr   rq   INCLUDE_REFITrd   set_aset_bru   c                 C   sL   t  }tD ]}t| |t||kr|| qt|dkr"dt  fS d|fS )Nr   TF)r    _SETTINGS_TO_BE_ENGINE_INVARIANTgetattraddlen)r   r   incompatible_settingsfr@   r@   rA   settings_are_compatible   s   

r   )Ldataclassesr   r   typingr   r   r   r   r   r	   tensorrttrtr   torch.fx.noder
   torch_tensorrt._Devicer   torch_tensorrt._enumsr   r   torch_tensorrt.dynamo._defaultsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r   hasattrSerializationFlagr   r   ry   r   r@   r@   r@   rA   <module>   s*     3 
