
Commit bdb06d8

fix: Add enabled/disabled sets for decompositions

- Add sets to selectively enable or disable decompositions in Torch
- Add new runtime argument `enable_experimental_decompositions` to enable all core ATen decompositions, or a pre-selected subset thereof
- Improve documentation of compilation settings overall

1 parent 0527edd · commit bdb06d8

File tree

5 files changed: +238 additions, −15 deletions
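Before the per-file diffs, a minimal usage sketch of the new argument. This is illustrative only: it assumes the `torch_tensorrt.dynamo.compile` entry point modified in this commit, and the model and input shapes are placeholders.

import torch
import torch_tensorrt

model = MyModel().eval().cuda()  # placeholder module
inputs = [torch.randn(1, 3, 224, 224).cuda()]  # placeholder input

# Default: only the curated subset of Torch decompositions is applied
trt_module = torch_tensorrt.dynamo.compile(model, inputs)

# Opt in to all core ATen decompositions (minus any explicitly disabled ones)
trt_module = torch_tensorrt.dynamo.compile(
    model, inputs, enable_experimental_decompositions=True
)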

py/torch_tensorrt/dynamo/_defaults.py

Lines changed: 1 addition & 0 deletions

@@ -10,3 +10,4 @@
 OPTIMIZATION_LEVEL = None
 USE_PYTHON_RUNTIME = None
 TRUNCATE_LONG_AND_DOUBLE = False
+ENABLE_EXPERIMENTAL_DECOMPOSITIONS = False

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 23 additions & 0 deletions

@@ -12,11 +12,33 @@
     OPTIMIZATION_LEVEL,
     USE_PYTHON_RUNTIME,
     TRUNCATE_LONG_AND_DOUBLE,
+    ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
 )
 
 
 @dataclass
 class CompilationSettings:
+    """Compilation settings for Torch-TensorRT Dynamo paths
+
+    Args:
+        precision (torch.dtype): Model layer precision
+        debug (bool): Whether to print out verbose debugging information
+        workspace_size (int): Workspace TRT is allowed to use for the module (0 is default)
+        min_block_size (int): Minimum number of operators per TRT engine block
+        torch_executed_ops (Sequence[str]): Sequence of operations to run in Torch, regardless of converter coverage
+        pass_through_build_failures (bool): Whether to fail on TRT engine build errors (True) or not (False)
+        max_aux_streams (Optional[int]): Maximum number of allowed auxiliary TRT streams for each engine
+        version_compatible (bool): Provide version forward-compatibility for engine plan files
+        optimization_level (Optional[int]): Builder optimization level 0-5; higher levels imply longer build time,
+            searching for more optimization options. TRT defaults to 3
+        use_python_runtime (Optional[bool]): Whether to strictly use the Python runtime or the C++ runtime. To
+            auto-select a runtime based on C++ dependency presence (preferring the C++ runtime if available),
+            leave the argument as None
+        truncate_long_and_double (bool): Truncate int64/float64 TRT engine inputs or weights to int32/float32
+        enable_experimental_decompositions (bool): Whether to enable all core ATen decompositions
+            or only a selected subset of them
+    """
+
     precision: torch.dtype = PRECISION
     debug: bool = DEBUG
     workspace_size: int = WORKSPACE_SIZE
@@ -28,3 +50,4 @@ class CompilationSettings:
     optimization_level: Optional[int] = OPTIMIZATION_LEVEL
     use_python_runtime: Optional[bool] = USE_PYTHON_RUNTIME
     truncate_long_and_double: bool = TRUNCATE_LONG_AND_DOUBLE
+    enable_experimental_decompositions: bool = ENABLE_EXPERIMENTAL_DECOMPOSITIONS
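Since `CompilationSettings` is a dataclass whose fields all have module-level defaults, a settings object can be constructed with only the overrides of interest. A sketch, assuming the import path matches the file above:

from torch_tensorrt.dynamo._settings import CompilationSettings

# Unspecified fields fall back to the defaults in _defaults.py
settings = CompilationSettings(
    debug=True,
    min_block_size=5,
    enable_experimental_decompositions=True,
)
print(settings.enable_experimental_decompositions)  # True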

py/torch_tensorrt/dynamo/backend/backends.py

Lines changed: 1 addition & 1 deletion

@@ -55,7 +55,7 @@ def aot_torch_tensorrt_aten_backend(
         gm,
         sample_inputs,
         fw_compiler=make_boxed_compiler(custom_backend),
-        decompositions=get_decompositions(),
+        decompositions=get_decompositions(settings.enable_experimental_decompositions),
     )
 
 
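This one-line change is the switch between the two decomposition tables. A sketch of the effect, assuming `get_decompositions` is importable as the backend file implies (the private module path is an assumption):

from torch_tensorrt.dynamo.lowering._decompositions import get_decompositions

# Curated subset of Torch decompositions plus Torch-TensorRT's custom replacements
default_table = get_decompositions(False)

# All core ATen decompositions (minus disabled ones) plus the custom replacements
experimental_table = get_decompositions(True)

# Both are Dict[torch._ops.OpOverload, Callable] mappings handed to AOTAutograd
print(len(default_table), len(experimental_table))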

py/torch_tensorrt/dynamo/compile.py

Lines changed: 4 additions & 1 deletion

@@ -31,6 +31,7 @@
     OPTIMIZATION_LEVEL,
     USE_PYTHON_RUNTIME,
     TRUNCATE_LONG_AND_DOUBLE,
+    ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
 )
 
 
@@ -64,6 +65,7 @@ def compile(
     version_compatible=VERSION_COMPATIBLE,
     optimization_level=OPTIMIZATION_LEVEL,
     use_python_runtime=USE_PYTHON_RUNTIME,
+    enable_experimental_decompositions=ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
     **kwargs,
 ):
     if debug:
@@ -73,7 +75,7 @@ def compile(
         "The Dynamo backend is an experimental feature, for which only the "
         + "following arguments are supported: "
         + "{enabled_precisions, debug, workspace_size, min_block_size, "
-        + "torch_executed_ops, pass_through_build_failures}"
+        + "torch_executed_ops, pass_through_build_failures, enable_experimental_decompositions}"
     )
 
     if not isinstance(inputs, collections.abc.Sequence):
@@ -111,6 +113,7 @@ def compile(
         "optimization_level": optimization_level,
         "use_python_runtime": use_python_runtime,
         "truncate_long_and_double": truncate_long_and_double,
+        "enable_experimental_decompositions": enable_experimental_decompositions,
     }
 
     settings = CompilationSettings(**compilation_options)

py/torch_tensorrt/dynamo/lowering/_decompositions.py

Lines changed: 209 additions & 13 deletions

@@ -1,10 +1,187 @@
+from typing import Callable, Dict, Set
 import torch
-from torch._decomp import register_decomposition, core_aten_decompositions
+from torch._decomp import (
+    register_decomposition,
+    core_aten_decompositions,
+    get_decompositions as get_torch_decompositions,
+)
 
+aten = torch.ops.aten
 
-DECOMPOSITIONS = {**core_aten_decompositions()}
+_core_aten_decompositions: Dict[
+    torch._ops.OpOverload, Callable
+] = core_aten_decompositions()
+enabled_decompositions: Set[torch._ops.OpOverload] = {
+    aten._adaptive_avg_pool2d_backward,
+    aten.addcdiv,
+    aten.addcdiv_,
+    aten.addcmul,
+    aten.addcmul_,
+    aten.addr,
+    aten.aminmax,
+    aten.arange.default,
+    aten.arange.start,
+    aten.avg_pool2d_backward,
+    aten.binary_cross_entropy,
+    aten.binary_cross_entropy_backward,
+    aten.binary_cross_entropy_with_logits,
+    aten.celu,
+    aten.col2im,
+    aten.count_nonzero,
+    aten.cudnn_batch_norm,
+    aten.cudnn_batch_norm_backward,
+    aten.deg2rad,
+    aten.detach,
+    aten.diag_embed,
+    aten.diagonal_backward,
+    aten.dot,
+    aten.elu,
+    aten.elu_backward,
+    aten._embedding_bag,
+    aten.embedding_dense_backward,
+    aten._euclidean_dist.default,
+    aten.expand_as,
+    aten.eye,
+    aten.fill,
+    aten.frac,
+    aten._fused_moving_avg_obs_fq_helper,
+    aten.gelu,
+    aten.gelu_backward,
+    aten.glu_backward,
+    aten.grid_sampler_2d,
+    aten.hardshrink,
+    aten.hardshrink_backward,
+    aten.hardsigmoid,
+    aten.hardsigmoid_backward,
+    aten.hardswish,
+    aten.hardswish_,
+    aten.hardswish_backward,
+    aten.hardtanh,
+    aten.hardtanh_,
+    aten.hardtanh_backward,
+    aten.heaviside,
+    aten.huber_loss,
+    aten.huber_loss_backward,
+    aten.im2col,
+    aten.index_add,
+    aten.index_add_,
+    aten.index_copy,
+    aten.index_copy_,
+    aten.index_fill,
+    aten.index_fill_,
+    aten.index_select,
+    aten.isneginf,
+    aten.isposinf,
+    aten.l1_loss,
+    aten.leaky_relu,
+    aten.leaky_relu_,
+    aten.leaky_relu_backward,
+    aten.lerp,
+    aten.linspace,
+    aten.logaddexp,
+    aten.logaddexp2,
+    aten.logit,
+    aten.logit_backward,
+    aten.log_sigmoid_backward,
+    aten.log_sigmoid_forward,
+    aten._log_softmax,
+    aten._log_softmax_backward_data,
+    aten.logspace,
+    aten.logsumexp.default,
+    aten.masked_fill,
+    aten.masked_fill_,
+    aten.max_pool2d_with_indices_backward,
+    aten.mish,
+    aten.mse_loss,
+    aten.mse_loss_backward,
+    aten.mv,
+    aten.mvlgamma,
+    aten.nansum,
+    aten.nan_to_num,
+    aten.narrow,
+    # TODO: Disable the below operators once freezing is done
+    aten.native_batch_norm,
+    aten.native_batch_norm_backward,
+    aten._native_batch_norm_legit,
+    aten._native_batch_norm_legit_functional,
+    aten._native_batch_norm_legit_no_training,
+    aten.native_dropout_backward,
+    aten.native_group_norm,
+    aten.native_group_norm_backward,
+    aten.native_layer_norm,
+    aten.native_layer_norm_backward,
+    aten.new_empty,
+    aten.new_full,
+    aten.new_ones,
+    aten.new_zeros,
+    aten.nll_loss_backward,
+    aten.nll_loss_forward,
+    aten.norm,
+    aten.ones,
+    aten.ones_like,
+    aten._prelu_kernel,
+    aten._prelu_kernel_backward,
+    aten._reshape_alias,
+    aten.rad2deg,
+    aten.renorm,
+    aten.renorm_,
+    aten.rot90,
+    aten.rsub.Scalar,
+    aten.rsub.Tensor,
+    aten.select_backward,
+    aten.select_scatter,
+    aten.sgn,
+    aten.sigmoid_backward,
+    aten.silu,
+    aten.silu_,
+    aten.silu_backward,
+    aten.sinc,
+    aten.slice_backward,
+    aten.smooth_l1_loss,
+    aten.smooth_l1_loss_backward,
+    aten.soft_margin_loss,
+    aten.soft_margin_loss_backward,
+    aten._softmax,
+    aten._softmax_backward_data,
+    aten.softplus,
+    aten.softplus_backward,
+    aten.softshrink,
+    aten.softshrink_backward,
+    aten.special_entr,
+    aten.special_log_ndtr,
+    aten.special_xlog1py,
+    aten.stack,
+    aten.t,
+    aten.tanh_backward,
+    aten.threshold,
+    aten.threshold_backward,
+    aten.trace,
+    aten.transpose.int,
+    aten.tril.default,
+    aten.triu.default,
+    aten.unfold,
+    aten.unfold_backward,
+    aten.unfold_copy,
+    aten.upsample_bilinear2d,
+    aten.upsample_bilinear2d.vec,
+    aten.upsample_nearest2d_backward,
+    aten.xlogy,
+    aten.zero,
+    aten.zero_,
+    aten.zeros,
+    aten.zeros_like,
+}
+disabled_decompositions: Set[torch._ops.OpOverload] = {}
 
-aten = torch.ops.aten
+TORCH_DECOMPOSITIONS: Dict[torch._ops.OpOverload, Callable] = get_torch_decompositions(
+    enabled_decompositions
+)
+TORCH_EXPERIMENTAL_DECOMPOSITIONS: Dict[torch._ops.OpOverload, Callable] = {
+    decomp: _core_aten_decompositions[decomp]
+    for decomp in _core_aten_decompositions
+    if decomp not in disabled_decompositions
+}
+CUSTOM_DECOMPOSITIONS: Dict[torch._ops.OpOverload, Callable] = {}
 
 
 def replace_inplace_op(aten_op, outplace_op):
@@ -13,7 +190,7 @@ def replace_inplace_op(aten_op, outplace_op):
     https://github.com/pytorch/pytorch/blob/3344d79e3f732dadd5c85b99a7aa1a022f187929/torch/_decomp/decompositions.py#L3355-L3361
     """
 
-    @register_decomposition(aten_op, registry=DECOMPOSITIONS)
+    @register_decomposition(aten_op, registry=CUSTOM_DECOMPOSITIONS)
     def inplace_op(*args, **kwargs):
         out = outplace_op(*args, **kwargs)
         return args[0].copy_(out)
@@ -36,32 +213,32 @@ def inplace_op(*args, **kwargs):
 replace_inplace_op(aten.scatter_reduce_, aten.scatter_reduce)
 
 
-@register_decomposition(aten.std, registry=DECOMPOSITIONS)
+@register_decomposition(aten.std, registry=CUSTOM_DECOMPOSITIONS)
 def std_replacement(*args, **kwargs) -> torch.Tensor:
     return torch.sqrt(torch.var(*args, **kwargs))
 
 
-@register_decomposition(aten.rsqrt, registry=DECOMPOSITIONS)
+@register_decomposition(aten.rsqrt, registry=CUSTOM_DECOMPOSITIONS)
 def rsqrt_replacement(*args, **kwargs) -> torch.Tensor:
     return torch.reciprocal(torch.sqrt(*args, **kwargs))
 
 
-@register_decomposition(aten._unsafe_view, registry=DECOMPOSITIONS)
+@register_decomposition(aten._unsafe_view, registry=CUSTOM_DECOMPOSITIONS)
 def unsafe_view_replacement(x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
     return torch.reshape(x, *args, **kwargs)
 
 
-@register_decomposition(torch.ops.aten.lift_fresh_copy, registry=DECOMPOSITIONS)
+@register_decomposition(torch.ops.aten.lift_fresh_copy, registry=CUSTOM_DECOMPOSITIONS)
 def lift_fresh_copy_replacement(x: torch.Tensor) -> torch.Tensor:
     return x
 
 
-@register_decomposition(aten.alias, registry=DECOMPOSITIONS)
+@register_decomposition(aten.alias, registry=CUSTOM_DECOMPOSITIONS)
 def alias_replacement(x: torch.Tensor) -> torch.Tensor:
     return x
 
 
-@register_decomposition(torch.ops.aten.addmm, registry=DECOMPOSITIONS)
+@register_decomposition(torch.ops.aten.addmm, registry=CUSTOM_DECOMPOSITIONS)
 def addmm_replacement(
     input_: torch.Tensor, mat1: torch.Tensor, mat2: torch.Tensor, *, beta=1, alpha=1
 ) -> torch.Tensor:
@@ -70,12 +247,31 @@ def addmm_replacement(
     )
 
 
-@register_decomposition(torch.ops.aten.reciprocal.default, registry=DECOMPOSITIONS)
+@register_decomposition(
+    torch.ops.aten.reciprocal.default, registry=CUSTOM_DECOMPOSITIONS
+)
 def reciprocal_replacement(
     input_: torch.Tensor,
 ) -> torch.Tensor:
     return torch.div(1, input_)
 
 
-def get_decompositions():
-    return DECOMPOSITIONS
+def get_decompositions(
+    enable_experimental_decompositions: bool = False,
+) -> Dict[torch._ops.OpOverload, Callable]:
+    if enable_experimental_decompositions:
+        duplicate_registrations = set(
+            TORCH_EXPERIMENTAL_DECOMPOSITIONS.keys()
+        ).intersection(set(CUSTOM_DECOMPOSITIONS.keys()))
+        assert (
+            not duplicate_registrations
+        ), f"Detected duplicate decompositions on: {duplicate_registrations}"
+        return {**TORCH_EXPERIMENTAL_DECOMPOSITIONS, **CUSTOM_DECOMPOSITIONS}
+    else:
+        duplicate_registrations = set(TORCH_DECOMPOSITIONS.keys()).intersection(
+            set(CUSTOM_DECOMPOSITIONS.keys())
+        )
+        assert (
+            not duplicate_registrations
+        ), f"Detected duplicate decompositions on: {duplicate_registrations}"
+        return {**TORCH_DECOMPOSITIONS, **CUSTOM_DECOMPOSITIONS}
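Both branches of `get_decompositions` follow the same pattern: assert that the Torch-provided table and the custom registry are disjoint, then combine them with a dict merge. A sketch of that invariant (the names come from the diff above; importing them from the private module is an assumption):

from torch_tensorrt.dynamo.lowering._decompositions import (
    CUSTOM_DECOMPOSITIONS,
    TORCH_DECOMPOSITIONS,
    get_decompositions,
)

table = get_decompositions(enable_experimental_decompositions=False)

# The merged table is exactly the union of the two registries; the assert
# inside get_decompositions fires if a custom decomposition ever shadows
# a Torch-provided one.
assert set(table) == set(TORCH_DECOMPOSITIONS) | set(CUSTOM_DECOMPOSITIONS)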
