diff --git a/.github/workflows/build-test-linux-aarch64-jetpack.yml b/.github/workflows/build-test-linux-aarch64-jetpack.yml index 3fb608eb30..097e8b1f2c 100644 --- a/.github/workflows/build-test-linux-aarch64-jetpack.yml +++ b/.github/workflows/build-test-linux-aarch64-jetpack.yml @@ -1,17 +1,16 @@ name: Build and test Linux aarch64 wheels for Jetpack on: - # TODO: Uncomment this when we have a stable release - # pull_request: - # push: - # branches: - # - main - # - nightly - # - release/* - # tags: - # # NOTE: Binary build pipelines should only get triggered on release candidate builds - # # Release candidate tags look like: v1.11.0-rc1 - # - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + pull_request: + push: + branches: + - main + - nightly + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ workflow_dispatch: jobs: diff --git a/.github/workflows/build_wheels_linux_aarch64.yml b/.github/workflows/build_wheels_linux_aarch64.yml index 8cc9837f3a..b8b13f8897 100644 --- a/.github/workflows/build_wheels_linux_aarch64.yml +++ b/.github/workflows/build_wheels_linux_aarch64.yml @@ -264,7 +264,7 @@ jobs: if [[ ${{ inputs.is-jetpack }} == false ]]; then ${CONDA_RUN} python setup.py bdist_wheel else - ${CONDA_RUN} python setup.py bdist_wheel --jetpack --plat-name=linux_tegra_aarch64 + ${CONDA_RUN} python setup.py bdist_wheel --jetpack fi - name: Repair Manylinux_2_28 Wheel shell: bash -l {0} @@ -337,8 +337,8 @@ jobs: needs: build name: upload-wheel-${{ matrix.python_version }}-${{ matrix.desired_cuda }}-${{ matrix.gpu_arch_type }}-${{ inputs.is-jetpack }} uses: pytorch/test-infra/.github/workflows/_binary_upload.yml@main - # for jetpack builds, only upload to pytorch index for nightly builds - if: ${{ inputs.is-jetpack == false || (github.event_name == 'push' && startsWith(github.event.ref, 'refs/heads/nightly')) }} + # for jetpack builds, do not upload to pytorch nightly index, only upload to https://pypi.jetson-ai-lab.io/ manually for each release + if: ${{ inputs.is-jetpack == false }} with: repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} diff --git a/MODULE.bazel b/MODULE.bazel index 1b66e2c900..3a497a2e8a 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -90,10 +90,9 @@ http_archive( http_archive( name = "torch_l4t", build_file = "@//third_party/libtorch:BUILD", - sha256 = "6eff643c0a7acda92734cc798338f733ff35c7df1a4434576f5ff7c66fc97319", strip_prefix = "torch", type = "zip", - urls = ["https://pypi.jetson-ai-lab.dev/jp6/cu126/+f/6ef/f643c0a7acda9/torch-2.7.0-cp310-cp310-linux_aarch64.whl"], + urls = ["https://pypi.jetson-ai-lab.io/jp6/cu126/+f/62a/1beee9f2f1470/torch-2.8.0-cp310-cp310-linux_aarch64.whl"], ) # Download these tarballs manually from the NVIDIA website diff --git a/docsrc/getting_started/jetpack.rst b/docsrc/getting_started/jetpack.rst index edfe1ae52e..f032685b68 100644 --- a/docsrc/getting_started/jetpack.rst +++ b/docsrc/getting_started/jetpack.rst @@ -90,14 +90,14 @@ Build Environment Setup .. code-block:: sh # Can only install the torch and torchvision wheel from the JPL repo which is built specifically for JetPack 6.2 - python -m pip install torch==2.7.0 torchvision==0.22.0 --index-url=https://pypi.jetson-ai-lab.dev/jp6/cu126/ + python -m pip install torch==2.8.0 torchvision==0.23.0 --index-url=https://pypi.jetson-ai-lab.io/jp6/cu126 Building the Wheel ================== .. 
code-block:: sh - python setup.py bdist_wheel + python setup.py bdist_wheel --jetpack Installation ============ diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh index 1f1a2120a9..6f48965741 100755 --- a/packaging/pre_build_script.sh +++ b/packaging/pre_build_script.sh @@ -42,8 +42,8 @@ curl -L https://github.com/bazelbuild/bazelisk/releases/download/v1.26.0/bazelis pip uninstall -y torch torchvision if [[ ${IS_JETPACK} == true ]]; then - # install torch 2.7 for jp6.2 - pip install torch==2.7.0 --index-url=https://pypi.jetson-ai-lab.dev/jp6/cu126/ + # install torch 2.8 for jp6.2 + pip install torch==2.8.0 --index-url=https://pypi.jetson-ai-lab.io/jp6/cu126/ else TORCH=$(grep "^torch>" py/requirements.txt) INDEX_URL=https://download.pytorch.org/whl/${CHANNEL}/${CU_VERSION} diff --git a/py/torch_tensorrt/_enums.py b/py/torch_tensorrt/_enums.py index e0a78e1a0b..637843eaeb 100644 --- a/py/torch_tensorrt/_enums.py +++ b/py/torch_tensorrt/_enums.py @@ -8,6 +8,7 @@ import tensorrt as trt import torch from torch_tensorrt._features import ENABLED_FEATURES, needs_torch_tensorrt_runtime +from torch_tensorrt._utils import is_tensorrt_version_supported class dtype(Enum): @@ -199,8 +200,6 @@ def _from( return dtype.i8 elif t == trt.DataType.FP8: return dtype.f8 - elif t == trt.DataType.FP4: - return dtype.fp4 elif t == trt.DataType.INT32: return dtype.i32 elif t == trt.DataType.INT64: @@ -214,6 +213,8 @@ def _from( elif t == trt.DataType.BF16: return dtype.bf16 else: + if is_tensorrt_version_supported("10.8.0") and t == trt.DataType.FP4: + return dtype.fp4 raise TypeError( f"Provided an unsupported data type as a data type for translation (support: bool, int, half, float, bfloat16), got: {t}" ) @@ -409,11 +410,11 @@ def to( return trt.DataType.BOOL elif self == dtype.bf16: return trt.DataType.BF16 - elif self == dtype.f4: - return trt.DataType.FP4 elif use_default: return trt.DataType.FLOAT else: + if is_tensorrt_version_supported("10.8.0") and self == dtype.f4: + return trt.DataType.FP4 raise TypeError("Unsupported tensorrt dtype") elif t == np.dtype: diff --git a/py/torch_tensorrt/_utils.py b/py/torch_tensorrt/_utils.py index 9c76257dee..f038ba85d2 100644 --- a/py/torch_tensorrt/_utils.py +++ b/py/torch_tensorrt/_utils.py @@ -24,3 +24,26 @@ def check_cross_compile_trt_win_lib() -> bool: target_lib = ".*libnvinfer_builder_resource_win.so.*" return any(re.match(target_lib, lib) for lib in loaded_libs) return False + + +def is_tensorrt_version_supported(min_version: str = "10.8.0") -> bool: + """ + Check if the installed TensorRT version supports the specified minimum version. + Args: + min_version (str): Minimum required TensorRT version (default: "10.8.0" for FP4 support) + Returns: + bool: True if TensorRT version is >= min_version, False otherwise + Example: + >>> if is_tensorrt_version_supported("10.8.0"): + ... # Use FP4 features + ... 
pass + """ + try: + from importlib import metadata + + from packaging.version import Version + + return bool(Version(metadata.version("tensorrt")) >= Version(min_version)) + except (ImportError, ValueError): + # If tensorrt is not installed or version cannot be determined + return False diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index fe9a01b06c..f81f7cab32 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -7,6 +7,7 @@ import numpy as np import torch from torch.fx.node import Argument, Node, Target +from torch_tensorrt._utils import is_tensorrt_version_supported from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl @@ -620,40 +621,41 @@ def aten_ops_quantize_op( ) -try: - import modelopt.torch.quantization as mtq # noqa: F401 +if is_tensorrt_version_supported("10.8.0"): + try: + import modelopt.torch.quantization as mtq # noqa: F401 - assert torch.ops.tensorrt.dynamic_block_quantize_op.default -except Exception as e: - _LOGGER.warning( - "Unable to import quantize op. Please install modelopt library (https://github.com/NVIDIA/TensorRT-Model-Optimizer?tab=readme-ov-file#installation) to add support for compiling quantized models" - ) -else: + assert torch.ops.tensorrt.dynamic_block_quantize_op.default + except Exception as e: + _LOGGER.warning( + "Unable to import quantize op. Please install modelopt library (https://github.com/NVIDIA/TensorRT-Model-Optimizer?tab=readme-ov-file#installation) to add support for compiling quantized models" + ) + else: - @dynamo_tensorrt_converter( - torch.ops.tensorrt.dynamic_block_quantize_op.default, - supports_dynamic_shapes=True, - ) - def aten_ops_dynamic_block_quantize_op( - ctx: ConversionContext, - target: Target, - args: Tuple[Argument, ...], - kwargs: Dict[str, Argument], - name: str, - ) -> Union[TRTTensor, Sequence[TRTTensor]]: - return impl.dynamic_block_quantize.quantize( - ctx, - target, - SourceIR.ATEN, - name, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6], + @dynamo_tensorrt_converter( + torch.ops.tensorrt.dynamic_block_quantize_op.default, + supports_dynamic_shapes=True, ) + def aten_ops_dynamic_block_quantize_op( + ctx: ConversionContext, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + name: str, + ) -> Union[TRTTensor, Sequence[TRTTensor]]: + return impl.dynamic_block_quantize.quantize( + ctx, + target, + SourceIR.ATEN, + name, + args[0], + args[1], + args[2], + args[3], + args[4], + args[5], + args[6], + ) @dynamo_tensorrt_converter(torch.ops.aten.squeeze.dim, supports_dynamic_shapes=True) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 896bf37b42..53835ba1d5 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -32,7 +32,7 @@ ConverterRegistry, DynamoConverterImplSignature, ) - +from torch_tensorrt._utils import is_tensorrt_version_supported from ..types import Shape, TRTDataType, TRTLayer, TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -448,31 +448,35 @@ def create_constant( if torch_value is not None: if torch_value.dtype == torch.uint8: - if ( - target_quantized_type is None - or 
target_quantized_type != trt.DataType.FP4 - ): - # Iconstant layer does not support Uint8, it only support that FP4 data packed in uint8 + if is_tensorrt_version_supported("10.8.0"): + if ( + target_quantized_type is None + or target_quantized_type != trt.DataType.FP4 + ): + # Iconstant layer does not support Uint8, it only support that FP4 data packed in uint8 + raise ValueError( + "Currently supported target_quantized_type for uint8 is FP4, got {target_quantized_type=}" + ) + shape[-1] = shape[-1] * 2 + weights = to_trt_weights( + ctx, + torch_value, + name, + "CONSTANT", + "CONSTANT", + dtype=trt.DataType.FP4, + count=torch_value.numel() * 2, + ) + constant = ctx.net.add_constant( + shape, + weights, + ) + constant.name = name + return constant.get_output(0) + else: raise ValueError( - "Currently supported target_quantized_type for uint8 is FP4, got {target_quantized_type=}" + "Currently FP4 is only supported in TensorRT 10.8.0 and above" ) - shape[-1] = shape[-1] * 2 - weights = to_trt_weights( - ctx, - torch_value, - name, - "CONSTANT", - "CONSTANT", - dtype=trt.DataType.FP4, - count=torch_value.numel() * 2, - ) - constant = ctx.net.add_constant( - shape, - weights, - ) - constant.name = name - return constant.get_output(0) - # Record the weight in ctx for refit and cpu memory reference # Convert the torch.Tensor to a trt.Weights object diff --git a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py index 79611c7552..1442c2b17b 100644 --- a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py @@ -12,14 +12,14 @@ dynamo_tensorrt_converter, ) from torch_tensorrt.dynamo.conversion.converter_utils import load_tensorrt_llm -from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( - tensorrt_fused_nccl_all_gather_op, - tensorrt_fused_nccl_reduce_scatter_op, -) _LOGGER: logging.Logger = logging.getLogger(__name__) if load_tensorrt_llm(): + from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( + tensorrt_fused_nccl_all_gather_op, + tensorrt_fused_nccl_reduce_scatter_op, + ) @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op) def fused_nccl_gather( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py b/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py index f76a84dea5..e935992bda 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py @@ -5,6 +5,7 @@ import torch from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch.fx.node import Target +from torch_tensorrt._utils import is_tensorrt_version_supported from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import ( @@ -13,260 +14,259 @@ from torch_tensorrt.fx.converters.converter_utils import set_layer_name from torch_tensorrt.fx.types import TRTTensor +if is_tensorrt_version_supported("10.8.0"): -def quantize( - ctx: ConversionContext, - target: Target, - source_ir: Optional[SourceIR], - name: str, - input_tensor: TRTTensor, - block_size: int, - amax: Union[np.ndarray, torch.Tensor], - num_bits: int, - exponent_bits: int, - scale_num_bits: int, - scale_exponent_bits: int, -) -> TRTTensor: - """ - Adds quantize and dequantize ops (QDQ) which quantize to FP4 
based - on the output_type set and dequantizes them back. - """ - if len(input_tensor.shape) not in (2, 3): - raise ValueError( - f"dynamic_block_quantize converter received an input of {input_tensor.shape} shape. Supported shapes: 2D or 3D" - ) - with unset_fake_temporarily(): - axis = -1 - global_scale = _calculate_global_scale(ctx, name, amax) - if ".weight_quantizer" in name: - output = _static_double_quantize( - ctx, - target, - source_ir, - name, - input_tensor, - global_scale, - axis, - ) - elif ".input_quantizer" in name: - output = _dynamic_double_quantize( - ctx, - target, - source_ir, - name, - input_tensor, - global_scale, - axis, - ) - else: + def quantize( + ctx: ConversionContext, + target: Target, + source_ir: Optional[SourceIR], + name: str, + input_tensor: TRTTensor, + block_size: int, + amax: Union[np.ndarray, torch.Tensor], + num_bits: int, + exponent_bits: int, + scale_num_bits: int, + scale_exponent_bits: int, + ) -> TRTTensor: + """ + Adds quantize and dequantize ops (QDQ) which quantize to FP4 based + on the output_type set and dequantizes them back. + """ + if len(input_tensor.shape) not in (2, 3): raise ValueError( - f"quantizer received an input of {name}. Supported values: weight_quantizer | input_quantizer" + f"dynamic_block_quantize converter received an input of {input_tensor.shape} shape. Supported shapes: 2D or 3D" ) - return output - + with unset_fake_temporarily(): + axis = -1 + global_scale = _calculate_global_scale(ctx, name, amax) + if ".weight_quantizer" in name: + output = _static_double_quantize( + ctx, + target, + source_ir, + name, + input_tensor, + global_scale, + axis, + ) + elif ".input_quantizer" in name: + output = _dynamic_double_quantize( + ctx, + target, + source_ir, + name, + input_tensor, + global_scale, + axis, + ) + else: + raise ValueError( + f"quantizer received an input of {name}. Supported values: weight_quantizer | input_quantizer" + ) + return output -def _dynamic_double_quantize( - ctx: ConversionContext, - target: Target, - source_ir: Optional[SourceIR], - name: str, - input_tensor: TRTTensor, - global_scale: torch.Tensor, - axis: int = -1, - block_size: int = 16, - output_type: trt.DataType = trt.DataType.FP4, - scale_type: trt.DataType = trt.DataType.FP8, -) -> TRTTensor: - """ - quantize input tensor to fp4 - Parameters: + def _dynamic_double_quantize( ctx: ConversionContext, target: Target, - source_ir: Optional[SourceIR] - name: str - input_tensor : TRTTensor (On GPU) - The input TRTTensor. - global_scale : Tensor (On GPU) - The global per-tensor scaling factor. It should contain only 1 element. - axis : int - The axis to quantize. Default is -1 (the last axis). - block_size : int - The block size for quantization. Default is 16. - output_type : trt.DataType - The data type for quantized data. Default is FP4. - scale_type : trt.DataType - The data type for block scale. Default is FP8. + source_ir: Optional[SourceIR], + name: str, + input_tensor: TRTTensor, + global_scale: torch.Tensor, + axis: int = -1, + block_size: int = 16, + output_type: trt.DataType = trt.DataType.FP4, + scale_type: trt.DataType = trt.DataType.FP8, + ) -> TRTTensor: + """ + quantize input tensor to fp4 + Parameters: + ctx: ConversionContext, + target: Target, + source_ir: Optional[SourceIR] + name: str + input_tensor : TRTTensor (On GPU) + The input TRTTensor. + global_scale : Tensor (On GPU) + The global per-tensor scaling factor. It should contain only 1 element. + axis : int + The axis to quantize. Default is -1 (the last axis). 
+ block_size : int + The block size for quantization. Default is 16. + output_type : trt.DataType + The data type for quantized data. Default is FP4. + scale_type : trt.DataType + The data type for block scale. Default is FP8. - """ - global_scale = get_trt_tensor(ctx, global_scale, name + "_global_scale") + """ + global_scale = get_trt_tensor(ctx, global_scale, name + "_global_scale") - if input_tensor.dtype not in [ - trt.DataType.HALF, - trt.DataType.FLOAT, - trt.DataType.BF16, - ]: - raise ValueError( - f"Currently supported input tensor type is float16 | float32 | bfloat16, got Unsupported dtype: {input_tensor.dtype}" + if input_tensor.dtype not in [ + trt.DataType.HALF, + trt.DataType.FLOAT, + trt.DataType.BF16, + ]: + raise ValueError( + f"Currently supported input tensor type is float16 | float32 | bfloat16, got Unsupported dtype: {input_tensor.dtype}" + ) + # dynamic quantize input tensor to fp4 + dynamic_quantize_layer = ctx.net.add_dynamic_quantize( + input_tensor, + axis, + block_size, + output_type, + scale_type, ) - # dynamic quantize input tensor to fp4 - dynamic_quantize_layer = ctx.net.add_dynamic_quantize( - input_tensor, - axis, - block_size, - output_type, - scale_type, - ) - dynamic_quantize_layer.set_input(1, global_scale) - set_layer_name( - dynamic_quantize_layer, target, name + "_dynamic_quantize", source_ir - ) - quantized_data_in_fp4 = dynamic_quantize_layer.get_output(0) - quantized_scale_in_fp8 = dynamic_quantize_layer.get_output(1) - - return _double_dequantize( - ctx, - target, - source_ir, - name, - quantized_data_in_fp4, - quantized_scale_in_fp8, - global_scale, - axis, - input_tensor.dtype, - ) + dynamic_quantize_layer.set_input(1, global_scale) + set_layer_name( + dynamic_quantize_layer, target, name + "_dynamic_quantize", source_ir + ) + quantized_data_in_fp4 = dynamic_quantize_layer.get_output(0) + quantized_scale_in_fp8 = dynamic_quantize_layer.get_output(1) + return _double_dequantize( + ctx, + target, + source_ir, + name, + quantized_data_in_fp4, + quantized_scale_in_fp8, + global_scale, + axis, + input_tensor.dtype, + ) -def _double_dequantize( - ctx: ConversionContext, - target: Target, - source_ir: Optional[SourceIR], - name: str, - quantized_data_in_fp4: TRTTensor, - quantized_scale_in_fp8: TRTTensor, - global_scale: torch.Tensor, - axis: int = -1, - output_type: trt.DataType = trt.DataType.FLOAT, -) -> TRTTensor: - """ - double dequantize will first dequantize scale from fp8 to orignal dtype(default is float32) - and then dequantize data from fp4 to orignal dtype(default is float32) - Parameters: + def _double_dequantize( ctx: ConversionContext, target: Target, - source_ir: Optional[SourceIR] - name: str - quantized_data_in_fp4: TRTTensor - quantized_scale_in_fp8: TRTTensor - global_scale: torch.Tensor - axis: int - output_type: trt.DataType - """ - # dequantize scale from fp8 to orignal dtype(default is float32) - dequantize_scale_layer = ctx.net.add_dequantize( - quantized_scale_in_fp8, global_scale, output_type - ) - dequantize_scale_layer.axis = axis - dequantize_scale_layer.to_type = output_type - set_layer_name( - dequantize_scale_layer, target, name + "_dequantize_scale", source_ir - ) - dequantized_scale = dequantize_scale_layer.get_output(0) - - # dequantize quantized_data_in_fp4 from fp4 to orignal dtype(default is float32) - dequantize_data_layer = ctx.net.add_dequantize( - quantized_data_in_fp4, dequantized_scale, output_type - ) - dequantize_data_layer.axis = axis - dequantize_data_layer.to_type = output_type - 
set_layer_name(dequantize_data_layer, target, name + "_dequantize_data", source_ir) - dequantized_data = dequantize_data_layer.get_output(0) - return dequantized_data + source_ir: Optional[SourceIR], + name: str, + quantized_data_in_fp4: TRTTensor, + quantized_scale_in_fp8: TRTTensor, + global_scale: torch.Tensor, + axis: int = -1, + output_type: trt.DataType = trt.DataType.FLOAT, + ) -> TRTTensor: + """ + double dequantize will first dequantize scale from fp8 to orignal dtype(default is float32) + and then dequantize data from fp4 to orignal dtype(default is float32) + Parameters: + ctx: ConversionContext, + target: Target, + source_ir: Optional[SourceIR] + name: str + quantized_data_in_fp4: TRTTensor + quantized_scale_in_fp8: TRTTensor + global_scale: torch.Tensor + axis: int + output_type: trt.DataType + """ + # dequantize scale from fp8 to orignal dtype(default is float32) + dequantize_scale_layer = ctx.net.add_dequantize( + quantized_scale_in_fp8, global_scale, output_type + ) + dequantize_scale_layer.axis = axis + dequantize_scale_layer.to_type = output_type + set_layer_name( + dequantize_scale_layer, target, name + "_dequantize_scale", source_ir + ) + dequantized_scale = dequantize_scale_layer.get_output(0) + # dequantize quantized_data_in_fp4 from fp4 to orignal dtype(default is float32) + dequantize_data_layer = ctx.net.add_dequantize( + quantized_data_in_fp4, dequantized_scale, output_type + ) + dequantize_data_layer.axis = axis + dequantize_data_layer.to_type = output_type + set_layer_name( + dequantize_data_layer, target, name + "_dequantize_data", source_ir + ) + dequantized_data = dequantize_data_layer.get_output(0) + return dequantized_data -def _static_double_quantize( - ctx: ConversionContext, - target: Target, - source_ir: Optional[SourceIR], - name: str, - weights_tensor: torch.Tensor, - global_scale: torch.Tensor, - axis: int, -) -> TRTTensor: - """ - Parameters: + def _static_double_quantize( ctx: ConversionContext, target: Target, source_ir: Optional[SourceIR], name: str, - weights_tensor : Tensor (On GPU) - The input tensor for weights. - global_scale : Tensor (On GPU) - The global per-tensor scaling factor. It should contain only 1 element. - axis: int - The axis to quantize. Default is -1 (the last axis). - Returns: - quantized data tensor in fp4 - """ + weights_tensor: torch.Tensor, + global_scale: torch.Tensor, + axis: int, + ) -> TRTTensor: + """ + Parameters: + ctx: ConversionContext, + target: Target, + source_ir: Optional[SourceIR], + name: str, + weights_tensor : Tensor (On GPU) + The input tensor for weights. + global_scale : Tensor (On GPU) + The global per-tensor scaling factor. It should contain only 1 element. + axis: int + The axis to quantize. Default is -1 (the last axis). 
+ Returns: + quantized data tensor in fp4 + """ - import modelopt.core.torch.quantization.qtensor.nvfp4_tensor as nvfp4_tensor - - if weights_tensor.dtype == torch.float16: - original_dtype = trt.DataType.HALF - elif weights_tensor.dtype == torch.float32: - original_dtype = trt.DataType.FLOAT - elif weights_tensor.dtype == torch.bfloat16: - original_dtype = trt.DataType.BF16 - else: - raise ValueError( - f"Currently supported weights tensor type is float16 | float32 | bfloat16, got Unsupported dtype: {weights_tensor.dtype}" - ) - block_scale_fp8 = nvfp4_tensor.NVFP4QTensor.get_weights_scaling_factor( - weights_tensor, - 16, - global_scale, - )[0] - weights_tensor_fp4 = nvfp4_tensor.NVFP4QTensor.quantize( - weights_tensor, - 16, - block_scale_fp8, - global_scale, - )[0]._quantized_data + import modelopt.core.torch.quantization.qtensor.nvfp4_tensor as nvfp4_tensor - block_scale_fp8 = get_trt_tensor( - ctx, - block_scale_fp8, - name + "_block_scale_fp8", - target_quantized_type=trt.DataType.FP8, - ) - global_scale = get_trt_tensor(ctx, global_scale, name + "_global_scale") - weights_tensor_fp4 = get_trt_tensor( - ctx, - weights_tensor_fp4, - name + "_weights_fp4", - target_quantized_type=trt.DataType.FP4, - ) + if weights_tensor.dtype == torch.float16: + original_dtype = trt.DataType.HALF + elif weights_tensor.dtype == torch.float32: + original_dtype = trt.DataType.FLOAT + elif weights_tensor.dtype == torch.bfloat16: + original_dtype = trt.DataType.BF16 + else: + raise ValueError( + f"Currently supported weights tensor type is float16 | float32 | bfloat16, got Unsupported dtype: {weights_tensor.dtype}" + ) + block_scale_fp8 = nvfp4_tensor.NVFP4QTensor.get_weights_scaling_factor( + weights_tensor, + 16, + global_scale, + )[0] + weights_tensor_fp4 = nvfp4_tensor.NVFP4QTensor.quantize( + weights_tensor, + 16, + block_scale_fp8, + global_scale, + )[0]._quantized_data - dequantized_data = _double_dequantize( - ctx, - target, - source_ir, - name, - weights_tensor_fp4, - block_scale_fp8, - global_scale, - axis, - original_dtype, - ) - return dequantized_data + block_scale_fp8 = get_trt_tensor( + ctx, + block_scale_fp8, + name + "_block_scale_fp8", + target_quantized_type=trt.DataType.FP8, + ) + global_scale = get_trt_tensor(ctx, global_scale, name + "_global_scale") + weights_tensor_fp4 = get_trt_tensor( + ctx, + weights_tensor_fp4, + name + "_weights_fp4", + target_quantized_type=trt.DataType.FP4, + ) + dequantized_data = _double_dequantize( + ctx, + target, + source_ir, + name, + weights_tensor_fp4, + block_scale_fp8, + global_scale, + axis, + original_dtype, + ) + return dequantized_data -def _calculate_global_scale( - ctx: ConversionContext, - name: str, - amax: torch.Tensor, -) -> torch.Tensor: - # calculate global scale (the global per-tensor scaling factor, should only contain 1 element) - assert len(amax.shape) == 0, "amax should be a scalar" - global_scale = amax / 6 / 448 - global_scale.masked_fill_(global_scale == 0, 1.0) - return global_scale + def _calculate_global_scale( + ctx: ConversionContext, + name: str, + amax: torch.Tensor, + ) -> torch.Tensor: + # calculate global scale (the global per-tensor scaling factor, should only contain 1 element) + assert len(amax.shape) == 0, "amax should be a scalar" + global_scale = amax / 6 / 448 + global_scale.masked_fill_(global_scale == 0, 1.0) + return global_scale diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py index fff4473b47..516c371e48 100644 --- 
a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py @@ -8,7 +8,6 @@ from .accumulate_fp32_matmul import accumulate_fp32_matmul from .complex_graph_rewrite import complex_graph_detection from .constant_folding import constant_fold -from .fuse_distributed_ops import fuse_distributed_ops from .fuse_prims_broadcast import fuse_prims_broadcast from .pass_manager import DynamoPassManager from .remove_assert_nodes import remove_assert_nodes @@ -35,6 +34,8 @@ ] if not is_tegra_platform(): + from .fuse_distributed_ops import fuse_distributed_ops + post_lowering_pass_list.append(fuse_distributed_ops) ATEN_POST_LOWERING_PASSES = DynamoPassManager.build_from_passlist( diff --git a/pyproject.toml b/pyproject.toml index d390e8b4a9..17f547e976 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "pyyaml>=6.0", "cffi>=1.15.1", "torch>=2.9.0.dev,<2.10.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", - "torch>=2.7.0,<2.8.0; platform_machine == 'aarch64' and 'tegra' in platform_release", + "torch>=2.8.0,<2.9.0; platform_machine == 'aarch64' and 'tegra' in platform_release", "pybind11==2.6.2", ] build-backend = "setuptools.build_meta" @@ -52,18 +52,13 @@ keywords = [ ] dependencies = [ "torch>=2.9.0.dev,<2.10.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", - "torch>=2.7.0,<2.8.0; platform_machine == 'aarch64' and 'tegra' in platform_release", + "torch>=2.8.0,<2.9.0; platform_machine == 'aarch64' and 'tegra' in platform_release", "tensorrt>=10.12.0,<10.13.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", - "tensorrt-cu12>=10.12.0,<10.13.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", "tensorrt-cu12-bindings>=10.12.0,<10.13.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", "tensorrt-cu12-libs>=10.12.0,<10.13.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", "tensorrt>=10.3.0,<10.4.0;platform_machine == 'aarch64' and 'tegra' in platform_release", - "tensorrt-cu12>=10.3.0,<10.4.0; platform_machine == 'aarch64' and 'tegra' in platform_release", - "tensorrt-cu12-bindings>=10.3.0,<10.4.0; platform_machine == 'aarch64' and 'tegra' in platform_release", - "tensorrt-cu12-libs>=10.3.0,<10.4.0; platform_machine == 'aarch64' and 'tegra' in platform_release", - "packaging>=23", "numpy; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", @@ -104,8 +99,8 @@ test = [ [project.optional-dependencies] torchvision = [ - "torchvision>=0.23.0.dev,<0.24.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", - "torchvision>=0.22.0,<0.23.0; platform_machine == 'aarch64' and 'tegra' in platform_release", + "torchvision>=0.24.0.dev,<0.25.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", + "torchvision>=0.23.0,<0.24.0; platform_machine == 'aarch64' and 'tegra' in platform_release", ] quantization = ["nvidia-modelopt[all]>=0.27.1"] @@ -128,11 +123,11 @@ index-strategy = "unsafe-best-match" [tool.uv.sources] torch = [ { index = "pytorch-nightly-cu129", marker = "platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in 
platform_release)" }, - # { index = "jetson-containers", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release" }, + { index = "jetson-containers", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release" }, ] torchvision = [ { index = "pytorch-nightly-cu129", marker = "platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)" }, - # { index = "jetson-containers", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release" }, + { index = "jetson-containers", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release" }, ] [[tool.uv.index]] @@ -140,10 +135,10 @@ name = "pytorch-nightly-cu129" url = "https://download.pytorch.org/whl/nightly/cu129" explicit = false -# [[tool.uv.index]] -# name = "jetson-containers" -# url = "https://pypi.jetson-ai-lab.dev/jp6/cu126/+simple" -# explicit = false +[[tool.uv.index]] +name = "jetson-containers" +url = "https://pypi.jetson-ai-lab.io/jp6/cu126" +explicit = false [[tool.uv.index]] name = "nvidia" diff --git a/setup.py b/setup.py index f829602f1a..aa4fe4bc6d 100644 --- a/setup.py +++ b/setup.py @@ -79,7 +79,7 @@ def load_dep_info(): dir_path = os.path.join(str(get_root_dir()), "py") IS_AARCH64 = platform.uname().processor == "aarch64" -IS_JETPACK = True if "tegra" in platform.uname().release else False +IS_JETPACK = False PY_ONLY = False NO_TS = False @@ -154,22 +154,6 @@ def load_dep_info(): IS_SBSA = True if IS_AARCH64 and not IS_JETPACK else False -if IS_JETPACK and "bdist_wheel" in sys.argv: - needs_append_plat_name = True - for i, arg in enumerate(sys.argv): - if ( - arg == "--plat-name" - and i + 1 < len(sys.argv) - and sys.argv[i + 1] == "linux_tegra_aarch64" - ): - needs_append_plat_name = False - break - if arg == "--plat-name=linux_tegra_aarch64": - needs_append_plat_name = False - break - if needs_append_plat_name: - sys.argv.append("--plat-name=linux_tegra_aarch64") - BAZEL_EXE = None if not PY_ONLY: BAZEL_EXE = which("bazelisk")
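
Illustrative usage sketch (not part of the patch): the new torch_tensorrt._utils.is_tensorrt_version_supported helper is what the diff above uses to gate FP4 handling in _enums.py, aten_ops_converters.py, and converter_utils.py on TensorRT >= 10.8.0. Below is a minimal, hypothetical example of the same gating pattern, assuming tensorrt and the patched torch_tensorrt are installed; the function name fp4_trt_dtype_or_fallback is invented for illustration and does not exist in the codebase.

import tensorrt as trt

from torch_tensorrt._utils import is_tensorrt_version_supported


def fp4_trt_dtype_or_fallback() -> "trt.DataType":
    # trt.DataType.FP4 is only available in TensorRT 10.8.0 and above, so the
    # attribute is touched only behind the version check; otherwise fall back
    # to FLOAT, mirroring the pattern used in _enums.py above.
    if is_tensorrt_version_supported("10.8.0"):
        return trt.DataType.FP4
    return trt.DataType.FLOAT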