diff --git a/.github/workflows/build-test-linux-aarch64-jetpack.yml b/.github/workflows/build-test-linux-aarch64-jetpack.yml index 3fb608eb30..097e8b1f2c 100644 --- a/.github/workflows/build-test-linux-aarch64-jetpack.yml +++ b/.github/workflows/build-test-linux-aarch64-jetpack.yml @@ -1,17 +1,16 @@ name: Build and test Linux aarch64 wheels for Jetpack on: - # TODO: Uncomment this when we have a stable release - # pull_request: - # push: - # branches: - # - main - # - nightly - # - release/* - # tags: - # # NOTE: Binary build pipelines should only get triggered on release candidate builds - # # Release candidate tags look like: v1.11.0-rc1 - # - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + pull_request: + push: + branches: + - main + - nightly + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ workflow_dispatch: jobs: diff --git a/.github/workflows/build_wheels_linux_aarch64.yml b/.github/workflows/build_wheels_linux_aarch64.yml index 8cc9837f3a..b8b13f8897 100644 --- a/.github/workflows/build_wheels_linux_aarch64.yml +++ b/.github/workflows/build_wheels_linux_aarch64.yml @@ -264,7 +264,7 @@ jobs: if [[ ${{ inputs.is-jetpack }} == false ]]; then ${CONDA_RUN} python setup.py bdist_wheel else - ${CONDA_RUN} python setup.py bdist_wheel --jetpack --plat-name=linux_tegra_aarch64 + ${CONDA_RUN} python setup.py bdist_wheel --jetpack fi - name: Repair Manylinux_2_28 Wheel shell: bash -l {0} @@ -337,8 +337,8 @@ jobs: needs: build name: upload-wheel-${{ matrix.python_version }}-${{ matrix.desired_cuda }}-${{ matrix.gpu_arch_type }}-${{ inputs.is-jetpack }} uses: pytorch/test-infra/.github/workflows/_binary_upload.yml@main - # for jetpack builds, only upload to pytorch index for nightly builds - if: ${{ inputs.is-jetpack == false || (github.event_name == 'push' && startsWith(github.event.ref, 'refs/heads/nightly')) }} + # for jetpack builds, do not upload to pytorch nightly index, only upload to https://pypi.jetson-ai-lab.io/ manually for each release + if: ${{ inputs.is-jetpack == false }} with: repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} diff --git a/MODULE.bazel b/MODULE.bazel index 1b66e2c900..3a497a2e8a 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -90,10 +90,9 @@ http_archive( http_archive( name = "torch_l4t", build_file = "@//third_party/libtorch:BUILD", - sha256 = "6eff643c0a7acda92734cc798338f733ff35c7df1a4434576f5ff7c66fc97319", strip_prefix = "torch", type = "zip", - urls = ["https://pypi.jetson-ai-lab.dev/jp6/cu126/+f/6ef/f643c0a7acda9/torch-2.7.0-cp310-cp310-linux_aarch64.whl"], + urls = ["https://pypi.jetson-ai-lab.io/jp6/cu126/+f/62a/1beee9f2f1470/torch-2.8.0-cp310-cp310-linux_aarch64.whl"], ) # Download these tarballs manually from the NVIDIA website diff --git a/docsrc/getting_started/jetpack.rst b/docsrc/getting_started/jetpack.rst index edfe1ae52e..f032685b68 100644 --- a/docsrc/getting_started/jetpack.rst +++ b/docsrc/getting_started/jetpack.rst @@ -90,14 +90,14 @@ Build Environment Setup .. code-block:: sh # Can only install the torch and torchvision wheel from the JPL repo which is built specifically for JetPack 6.2 - python -m pip install torch==2.7.0 torchvision==0.22.0 --index-url=https://pypi.jetson-ai-lab.dev/jp6/cu126/ + python -m pip install torch==2.8.0 torchvision==0.23.0 --index-url=https://pypi.jetson-ai-lab.io/jp6/cu126 Building the Wheel ================== .. 
code-block:: sh - python setup.py bdist_wheel + python setup.py bdist_wheel --jetpack Installation ============ diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh index 1f1a2120a9..6f48965741 100755 --- a/packaging/pre_build_script.sh +++ b/packaging/pre_build_script.sh @@ -42,8 +42,8 @@ curl -L https://github.com/bazelbuild/bazelisk/releases/download/v1.26.0/bazelis pip uninstall -y torch torchvision if [[ ${IS_JETPACK} == true ]]; then - # install torch 2.7 for jp6.2 - pip install torch==2.7.0 --index-url=https://pypi.jetson-ai-lab.dev/jp6/cu126/ + # install torch 2.8 for jp6.2 + pip install torch==2.8.0 --index-url=https://pypi.jetson-ai-lab.io/jp6/cu126/ else TORCH=$(grep "^torch>" py/requirements.txt) INDEX_URL=https://download.pytorch.org/whl/${CHANNEL}/${CU_VERSION} diff --git a/py/torch_tensorrt/_enums.py b/py/torch_tensorrt/_enums.py index e0a78e1a0b..637843eaeb 100644 --- a/py/torch_tensorrt/_enums.py +++ b/py/torch_tensorrt/_enums.py @@ -8,6 +8,7 @@ import tensorrt as trt import torch from torch_tensorrt._features import ENABLED_FEATURES, needs_torch_tensorrt_runtime +from torch_tensorrt._utils import is_tensorrt_version_supported class dtype(Enum): @@ -199,8 +200,6 @@ def _from( return dtype.i8 elif t == trt.DataType.FP8: return dtype.f8 - elif t == trt.DataType.FP4: - return dtype.fp4 elif t == trt.DataType.INT32: return dtype.i32 elif t == trt.DataType.INT64: @@ -214,6 +213,8 @@ def _from( elif t == trt.DataType.BF16: return dtype.bf16 else: + if is_tensorrt_version_supported("10.8.0") and t == trt.DataType.FP4: + return dtype.fp4 raise TypeError( f"Provided an unsupported data type as a data type for translation (support: bool, int, half, float, bfloat16), got: {t}" ) @@ -409,11 +410,11 @@ def to( return trt.DataType.BOOL elif self == dtype.bf16: return trt.DataType.BF16 - elif self == dtype.f4: - return trt.DataType.FP4 elif use_default: return trt.DataType.FLOAT else: + if is_tensorrt_version_supported("10.8.0") and self == dtype.f4: + return trt.DataType.FP4 raise TypeError("Unsupported tensorrt dtype") elif t == np.dtype: diff --git a/py/torch_tensorrt/_utils.py b/py/torch_tensorrt/_utils.py index 9c76257dee..f038ba85d2 100644 --- a/py/torch_tensorrt/_utils.py +++ b/py/torch_tensorrt/_utils.py @@ -24,3 +24,26 @@ def check_cross_compile_trt_win_lib() -> bool: target_lib = ".*libnvinfer_builder_resource_win.so.*" return any(re.match(target_lib, lib) for lib in loaded_libs) return False + + +def is_tensorrt_version_supported(min_version: str = "10.8.0") -> bool: + """ + Check if the installed TensorRT version supports the specified minimum version. + Args: + min_version (str): Minimum required TensorRT version (default: "10.8.0" for FP4 support) + Returns: + bool: True if TensorRT version is >= min_version, False otherwise + Example: + >>> if is_tensorrt_version_supported("10.8.0"): + ... # Use FP4 features + ... 
pass + """ + try: + from importlib import metadata + + from packaging.version import Version + + return bool(Version(metadata.version("tensorrt")) >= Version(min_version)) + except (ImportError, ValueError): + # If tensorrt is not installed or version cannot be determined + return False diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index fe9a01b06c..f81f7cab32 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -7,6 +7,7 @@ import numpy as np import torch from torch.fx.node import Argument, Node, Target +from torch_tensorrt._utils import is_tensorrt_version_supported from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl @@ -620,40 +621,41 @@ def aten_ops_quantize_op( ) -try: - import modelopt.torch.quantization as mtq # noqa: F401 +if is_tensorrt_version_supported("10.8.0"): + try: + import modelopt.torch.quantization as mtq # noqa: F401 - assert torch.ops.tensorrt.dynamic_block_quantize_op.default -except Exception as e: - _LOGGER.warning( - "Unable to import quantize op. Please install modelopt library (https://github.com/NVIDIA/TensorRT-Model-Optimizer?tab=readme-ov-file#installation) to add support for compiling quantized models" - ) -else: + assert torch.ops.tensorrt.dynamic_block_quantize_op.default + except Exception as e: + _LOGGER.warning( + "Unable to import quantize op. Please install modelopt library (https://github.com/NVIDIA/TensorRT-Model-Optimizer?tab=readme-ov-file#installation) to add support for compiling quantized models" + ) + else: - @dynamo_tensorrt_converter( - torch.ops.tensorrt.dynamic_block_quantize_op.default, - supports_dynamic_shapes=True, - ) - def aten_ops_dynamic_block_quantize_op( - ctx: ConversionContext, - target: Target, - args: Tuple[Argument, ...], - kwargs: Dict[str, Argument], - name: str, - ) -> Union[TRTTensor, Sequence[TRTTensor]]: - return impl.dynamic_block_quantize.quantize( - ctx, - target, - SourceIR.ATEN, - name, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6], + @dynamo_tensorrt_converter( + torch.ops.tensorrt.dynamic_block_quantize_op.default, + supports_dynamic_shapes=True, ) + def aten_ops_dynamic_block_quantize_op( + ctx: ConversionContext, + target: Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + name: str, + ) -> Union[TRTTensor, Sequence[TRTTensor]]: + return impl.dynamic_block_quantize.quantize( + ctx, + target, + SourceIR.ATEN, + name, + args[0], + args[1], + args[2], + args[3], + args[4], + args[5], + args[6], + ) @dynamo_tensorrt_converter(torch.ops.aten.squeeze.dim, supports_dynamic_shapes=True) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 896bf37b42..53835ba1d5 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -32,7 +32,7 @@ ConverterRegistry, DynamoConverterImplSignature, ) - +from torch_tensorrt._utils import is_tensorrt_version_supported from ..types import Shape, TRTDataType, TRTLayer, TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) @@ -448,31 +448,35 @@ def create_constant( if torch_value is not None: if torch_value.dtype == torch.uint8: - if ( - target_quantized_type is None - or 
target_quantized_type != trt.DataType.FP4 - ): - # Iconstant layer does not support Uint8, it only support that FP4 data packed in uint8 + if is_tensorrt_version_supported("10.8.0"): + if ( + target_quantized_type is None + or target_quantized_type != trt.DataType.FP4 + ): + # Iconstant layer does not support Uint8, it only support that FP4 data packed in uint8 + raise ValueError( + "Currently supported target_quantized_type for uint8 is FP4, got {target_quantized_type=}" + ) + shape[-1] = shape[-1] * 2 + weights = to_trt_weights( + ctx, + torch_value, + name, + "CONSTANT", + "CONSTANT", + dtype=trt.DataType.FP4, + count=torch_value.numel() * 2, + ) + constant = ctx.net.add_constant( + shape, + weights, + ) + constant.name = name + return constant.get_output(0) + else: raise ValueError( - "Currently supported target_quantized_type for uint8 is FP4, got {target_quantized_type=}" + "Currently FP4 is only supported in TensorRT 10.8.0 and above" ) - shape[-1] = shape[-1] * 2 - weights = to_trt_weights( - ctx, - torch_value, - name, - "CONSTANT", - "CONSTANT", - dtype=trt.DataType.FP4, - count=torch_value.numel() * 2, - ) - constant = ctx.net.add_constant( - shape, - weights, - ) - constant.name = name - return constant.get_output(0) - # Record the weight in ctx for refit and cpu memory reference # Convert the torch.Tensor to a trt.Weights object diff --git a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py index 79611c7552..1442c2b17b 100644 --- a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py @@ -12,14 +12,14 @@ dynamo_tensorrt_converter, ) from torch_tensorrt.dynamo.conversion.converter_utils import load_tensorrt_llm -from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( - tensorrt_fused_nccl_all_gather_op, - tensorrt_fused_nccl_reduce_scatter_op, -) _LOGGER: logging.Logger = logging.getLogger(__name__) if load_tensorrt_llm(): + from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( + tensorrt_fused_nccl_all_gather_op, + tensorrt_fused_nccl_reduce_scatter_op, + ) @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op) def fused_nccl_gather( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py b/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py index f76a84dea5..e935992bda 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py @@ -5,6 +5,7 @@ import torch from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch.fx.node import Target +from torch_tensorrt._utils import is_tensorrt_version_supported from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import ( @@ -13,260 +14,259 @@ from torch_tensorrt.fx.converters.converter_utils import set_layer_name from torch_tensorrt.fx.types import TRTTensor +if is_tensorrt_version_supported("10.8.0"): -def quantize( - ctx: ConversionContext, - target: Target, - source_ir: Optional[SourceIR], - name: str, - input_tensor: TRTTensor, - block_size: int, - amax: Union[np.ndarray, torch.Tensor], - num_bits: int, - exponent_bits: int, - scale_num_bits: int, - scale_exponent_bits: int, -) -> TRTTensor: - """ - Adds quantize and dequantize ops (QDQ) which quantize to FP4 
based - on the output_type set and dequantizes them back. - """ - if len(input_tensor.shape) not in (2, 3): - raise ValueError( - f"dynamic_block_quantize converter received an input of {input_tensor.shape} shape. Supported shapes: 2D or 3D" - ) - with unset_fake_temporarily(): - axis = -1 - global_scale = _calculate_global_scale(ctx, name, amax) - if ".weight_quantizer" in name: - output = _static_double_quantize( - ctx, - target, - source_ir, - name, - input_tensor, - global_scale, - axis, - ) - elif ".input_quantizer" in name: - output = _dynamic_double_quantize( - ctx, - target, - source_ir, - name, - input_tensor, - global_scale, - axis, - ) - else: + def quantize( + ctx: ConversionContext, + target: Target, + source_ir: Optional[SourceIR], + name: str, + input_tensor: TRTTensor, + block_size: int, + amax: Union[np.ndarray, torch.Tensor], + num_bits: int, + exponent_bits: int, + scale_num_bits: int, + scale_exponent_bits: int, + ) -> TRTTensor: + """ + Adds quantize and dequantize ops (QDQ) which quantize to FP4 based + on the output_type set and dequantizes them back. + """ + if len(input_tensor.shape) not in (2, 3): raise ValueError( - f"quantizer received an input of {name}. Supported values: weight_quantizer | input_quantizer" + f"dynamic_block_quantize converter received an input of {input_tensor.shape} shape. Supported shapes: 2D or 3D" ) - return output - + with unset_fake_temporarily(): + axis = -1 + global_scale = _calculate_global_scale(ctx, name, amax) + if ".weight_quantizer" in name: + output = _static_double_quantize( + ctx, + target, + source_ir, + name, + input_tensor, + global_scale, + axis, + ) + elif ".input_quantizer" in name: + output = _dynamic_double_quantize( + ctx, + target, + source_ir, + name, + input_tensor, + global_scale, + axis, + ) + else: + raise ValueError( + f"quantizer received an input of {name}. Supported values: weight_quantizer | input_quantizer" + ) + return output -def _dynamic_double_quantize( - ctx: ConversionContext, - target: Target, - source_ir: Optional[SourceIR], - name: str, - input_tensor: TRTTensor, - global_scale: torch.Tensor, - axis: int = -1, - block_size: int = 16, - output_type: trt.DataType = trt.DataType.FP4, - scale_type: trt.DataType = trt.DataType.FP8, -) -> TRTTensor: - """ - quantize input tensor to fp4 - Parameters: + def _dynamic_double_quantize( ctx: ConversionContext, target: Target, - source_ir: Optional[SourceIR] - name: str - input_tensor : TRTTensor (On GPU) - The input TRTTensor. - global_scale : Tensor (On GPU) - The global per-tensor scaling factor. It should contain only 1 element. - axis : int - The axis to quantize. Default is -1 (the last axis). - block_size : int - The block size for quantization. Default is 16. - output_type : trt.DataType - The data type for quantized data. Default is FP4. - scale_type : trt.DataType - The data type for block scale. Default is FP8. + source_ir: Optional[SourceIR], + name: str, + input_tensor: TRTTensor, + global_scale: torch.Tensor, + axis: int = -1, + block_size: int = 16, + output_type: trt.DataType = trt.DataType.FP4, + scale_type: trt.DataType = trt.DataType.FP8, + ) -> TRTTensor: + """ + quantize input tensor to fp4 + Parameters: + ctx: ConversionContext, + target: Target, + source_ir: Optional[SourceIR] + name: str + input_tensor : TRTTensor (On GPU) + The input TRTTensor. + global_scale : Tensor (On GPU) + The global per-tensor scaling factor. It should contain only 1 element. + axis : int + The axis to quantize. Default is -1 (the last axis). 
+ block_size : int + The block size for quantization. Default is 16. + output_type : trt.DataType + The data type for quantized data. Default is FP4. + scale_type : trt.DataType + The data type for block scale. Default is FP8. - """ - global_scale = get_trt_tensor(ctx, global_scale, name + "_global_scale") + """ + global_scale = get_trt_tensor(ctx, global_scale, name + "_global_scale") - if input_tensor.dtype not in [ - trt.DataType.HALF, - trt.DataType.FLOAT, - trt.DataType.BF16, - ]: - raise ValueError( - f"Currently supported input tensor type is float16 | float32 | bfloat16, got Unsupported dtype: {input_tensor.dtype}" + if input_tensor.dtype not in [ + trt.DataType.HALF, + trt.DataType.FLOAT, + trt.DataType.BF16, + ]: + raise ValueError( + f"Currently supported input tensor type is float16 | float32 | bfloat16, got Unsupported dtype: {input_tensor.dtype}" + ) + # dynamic quantize input tensor to fp4 + dynamic_quantize_layer = ctx.net.add_dynamic_quantize( + input_tensor, + axis, + block_size, + output_type, + scale_type, ) - # dynamic quantize input tensor to fp4 - dynamic_quantize_layer = ctx.net.add_dynamic_quantize( - input_tensor, - axis, - block_size, - output_type, - scale_type, - ) - dynamic_quantize_layer.set_input(1, global_scale) - set_layer_name( - dynamic_quantize_layer, target, name + "_dynamic_quantize", source_ir - ) - quantized_data_in_fp4 = dynamic_quantize_layer.get_output(0) - quantized_scale_in_fp8 = dynamic_quantize_layer.get_output(1) - - return _double_dequantize( - ctx, - target, - source_ir, - name, - quantized_data_in_fp4, - quantized_scale_in_fp8, - global_scale, - axis, - input_tensor.dtype, - ) + dynamic_quantize_layer.set_input(1, global_scale) + set_layer_name( + dynamic_quantize_layer, target, name + "_dynamic_quantize", source_ir + ) + quantized_data_in_fp4 = dynamic_quantize_layer.get_output(0) + quantized_scale_in_fp8 = dynamic_quantize_layer.get_output(1) + return _double_dequantize( + ctx, + target, + source_ir, + name, + quantized_data_in_fp4, + quantized_scale_in_fp8, + global_scale, + axis, + input_tensor.dtype, + ) -def _double_dequantize( - ctx: ConversionContext, - target: Target, - source_ir: Optional[SourceIR], - name: str, - quantized_data_in_fp4: TRTTensor, - quantized_scale_in_fp8: TRTTensor, - global_scale: torch.Tensor, - axis: int = -1, - output_type: trt.DataType = trt.DataType.FLOAT, -) -> TRTTensor: - """ - double dequantize will first dequantize scale from fp8 to orignal dtype(default is float32) - and then dequantize data from fp4 to orignal dtype(default is float32) - Parameters: + def _double_dequantize( ctx: ConversionContext, target: Target, - source_ir: Optional[SourceIR] - name: str - quantized_data_in_fp4: TRTTensor - quantized_scale_in_fp8: TRTTensor - global_scale: torch.Tensor - axis: int - output_type: trt.DataType - """ - # dequantize scale from fp8 to orignal dtype(default is float32) - dequantize_scale_layer = ctx.net.add_dequantize( - quantized_scale_in_fp8, global_scale, output_type - ) - dequantize_scale_layer.axis = axis - dequantize_scale_layer.to_type = output_type - set_layer_name( - dequantize_scale_layer, target, name + "_dequantize_scale", source_ir - ) - dequantized_scale = dequantize_scale_layer.get_output(0) - - # dequantize quantized_data_in_fp4 from fp4 to orignal dtype(default is float32) - dequantize_data_layer = ctx.net.add_dequantize( - quantized_data_in_fp4, dequantized_scale, output_type - ) - dequantize_data_layer.axis = axis - dequantize_data_layer.to_type = output_type - 
set_layer_name(dequantize_data_layer, target, name + "_dequantize_data", source_ir) - dequantized_data = dequantize_data_layer.get_output(0) - return dequantized_data + source_ir: Optional[SourceIR], + name: str, + quantized_data_in_fp4: TRTTensor, + quantized_scale_in_fp8: TRTTensor, + global_scale: torch.Tensor, + axis: int = -1, + output_type: trt.DataType = trt.DataType.FLOAT, + ) -> TRTTensor: + """ + double dequantize will first dequantize scale from fp8 to orignal dtype(default is float32) + and then dequantize data from fp4 to orignal dtype(default is float32) + Parameters: + ctx: ConversionContext, + target: Target, + source_ir: Optional[SourceIR] + name: str + quantized_data_in_fp4: TRTTensor + quantized_scale_in_fp8: TRTTensor + global_scale: torch.Tensor + axis: int + output_type: trt.DataType + """ + # dequantize scale from fp8 to orignal dtype(default is float32) + dequantize_scale_layer = ctx.net.add_dequantize( + quantized_scale_in_fp8, global_scale, output_type + ) + dequantize_scale_layer.axis = axis + dequantize_scale_layer.to_type = output_type + set_layer_name( + dequantize_scale_layer, target, name + "_dequantize_scale", source_ir + ) + dequantized_scale = dequantize_scale_layer.get_output(0) + # dequantize quantized_data_in_fp4 from fp4 to orignal dtype(default is float32) + dequantize_data_layer = ctx.net.add_dequantize( + quantized_data_in_fp4, dequantized_scale, output_type + ) + dequantize_data_layer.axis = axis + dequantize_data_layer.to_type = output_type + set_layer_name( + dequantize_data_layer, target, name + "_dequantize_data", source_ir + ) + dequantized_data = dequantize_data_layer.get_output(0) + return dequantized_data -def _static_double_quantize( - ctx: ConversionContext, - target: Target, - source_ir: Optional[SourceIR], - name: str, - weights_tensor: torch.Tensor, - global_scale: torch.Tensor, - axis: int, -) -> TRTTensor: - """ - Parameters: + def _static_double_quantize( ctx: ConversionContext, target: Target, source_ir: Optional[SourceIR], name: str, - weights_tensor : Tensor (On GPU) - The input tensor for weights. - global_scale : Tensor (On GPU) - The global per-tensor scaling factor. It should contain only 1 element. - axis: int - The axis to quantize. Default is -1 (the last axis). - Returns: - quantized data tensor in fp4 - """ + weights_tensor: torch.Tensor, + global_scale: torch.Tensor, + axis: int, + ) -> TRTTensor: + """ + Parameters: + ctx: ConversionContext, + target: Target, + source_ir: Optional[SourceIR], + name: str, + weights_tensor : Tensor (On GPU) + The input tensor for weights. + global_scale : Tensor (On GPU) + The global per-tensor scaling factor. It should contain only 1 element. + axis: int + The axis to quantize. Default is -1 (the last axis). 
+ Returns: + quantized data tensor in fp4 + """ - import modelopt.core.torch.quantization.qtensor.nvfp4_tensor as nvfp4_tensor - - if weights_tensor.dtype == torch.float16: - original_dtype = trt.DataType.HALF - elif weights_tensor.dtype == torch.float32: - original_dtype = trt.DataType.FLOAT - elif weights_tensor.dtype == torch.bfloat16: - original_dtype = trt.DataType.BF16 - else: - raise ValueError( - f"Currently supported weights tensor type is float16 | float32 | bfloat16, got Unsupported dtype: {weights_tensor.dtype}" - ) - block_scale_fp8 = nvfp4_tensor.NVFP4QTensor.get_weights_scaling_factor( - weights_tensor, - 16, - global_scale, - )[0] - weights_tensor_fp4 = nvfp4_tensor.NVFP4QTensor.quantize( - weights_tensor, - 16, - block_scale_fp8, - global_scale, - )[0]._quantized_data + import modelopt.core.torch.quantization.qtensor.nvfp4_tensor as nvfp4_tensor - block_scale_fp8 = get_trt_tensor( - ctx, - block_scale_fp8, - name + "_block_scale_fp8", - target_quantized_type=trt.DataType.FP8, - ) - global_scale = get_trt_tensor(ctx, global_scale, name + "_global_scale") - weights_tensor_fp4 = get_trt_tensor( - ctx, - weights_tensor_fp4, - name + "_weights_fp4", - target_quantized_type=trt.DataType.FP4, - ) + if weights_tensor.dtype == torch.float16: + original_dtype = trt.DataType.HALF + elif weights_tensor.dtype == torch.float32: + original_dtype = trt.DataType.FLOAT + elif weights_tensor.dtype == torch.bfloat16: + original_dtype = trt.DataType.BF16 + else: + raise ValueError( + f"Currently supported weights tensor type is float16 | float32 | bfloat16, got Unsupported dtype: {weights_tensor.dtype}" + ) + block_scale_fp8 = nvfp4_tensor.NVFP4QTensor.get_weights_scaling_factor( + weights_tensor, + 16, + global_scale, + )[0] + weights_tensor_fp4 = nvfp4_tensor.NVFP4QTensor.quantize( + weights_tensor, + 16, + block_scale_fp8, + global_scale, + )[0]._quantized_data - dequantized_data = _double_dequantize( - ctx, - target, - source_ir, - name, - weights_tensor_fp4, - block_scale_fp8, - global_scale, - axis, - original_dtype, - ) - return dequantized_data + block_scale_fp8 = get_trt_tensor( + ctx, + block_scale_fp8, + name + "_block_scale_fp8", + target_quantized_type=trt.DataType.FP8, + ) + global_scale = get_trt_tensor(ctx, global_scale, name + "_global_scale") + weights_tensor_fp4 = get_trt_tensor( + ctx, + weights_tensor_fp4, + name + "_weights_fp4", + target_quantized_type=trt.DataType.FP4, + ) + dequantized_data = _double_dequantize( + ctx, + target, + source_ir, + name, + weights_tensor_fp4, + block_scale_fp8, + global_scale, + axis, + original_dtype, + ) + return dequantized_data -def _calculate_global_scale( - ctx: ConversionContext, - name: str, - amax: torch.Tensor, -) -> torch.Tensor: - # calculate global scale (the global per-tensor scaling factor, should only contain 1 element) - assert len(amax.shape) == 0, "amax should be a scalar" - global_scale = amax / 6 / 448 - global_scale.masked_fill_(global_scale == 0, 1.0) - return global_scale + def _calculate_global_scale( + ctx: ConversionContext, + name: str, + amax: torch.Tensor, + ) -> torch.Tensor: + # calculate global scale (the global per-tensor scaling factor, should only contain 1 element) + assert len(amax.shape) == 0, "amax should be a scalar" + global_scale = amax / 6 / 448 + global_scale.masked_fill_(global_scale == 0, 1.0) + return global_scale diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py index fff4473b47..516c371e48 100644 --- 
a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py +++ b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py @@ -8,7 +8,6 @@ from .accumulate_fp32_matmul import accumulate_fp32_matmul from .complex_graph_rewrite import complex_graph_detection from .constant_folding import constant_fold -from .fuse_distributed_ops import fuse_distributed_ops from .fuse_prims_broadcast import fuse_prims_broadcast from .pass_manager import DynamoPassManager from .remove_assert_nodes import remove_assert_nodes @@ -35,6 +34,8 @@ ] if not is_tegra_platform(): + from .fuse_distributed_ops import fuse_distributed_ops + post_lowering_pass_list.append(fuse_distributed_ops) ATEN_POST_LOWERING_PASSES = DynamoPassManager.build_from_passlist( diff --git a/pyproject.toml b/pyproject.toml index d390e8b4a9..17f547e976 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "pyyaml>=6.0", "cffi>=1.15.1", "torch>=2.9.0.dev,<2.10.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", - "torch>=2.7.0,<2.8.0; platform_machine == 'aarch64' and 'tegra' in platform_release", + "torch>=2.8.0,<2.9.0; platform_machine == 'aarch64' and 'tegra' in platform_release", "pybind11==2.6.2", ] build-backend = "setuptools.build_meta" @@ -52,18 +52,13 @@ keywords = [ ] dependencies = [ "torch>=2.9.0.dev,<2.10.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", - "torch>=2.7.0,<2.8.0; platform_machine == 'aarch64' and 'tegra' in platform_release", + "torch>=2.8.0,<2.9.0; platform_machine == 'aarch64' and 'tegra' in platform_release", "tensorrt>=10.12.0,<10.13.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", - "tensorrt-cu12>=10.12.0,<10.13.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", "tensorrt-cu12-bindings>=10.12.0,<10.13.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", "tensorrt-cu12-libs>=10.12.0,<10.13.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", "tensorrt>=10.3.0,<10.4.0;platform_machine == 'aarch64' and 'tegra' in platform_release", - "tensorrt-cu12>=10.3.0,<10.4.0; platform_machine == 'aarch64' and 'tegra' in platform_release", - "tensorrt-cu12-bindings>=10.3.0,<10.4.0; platform_machine == 'aarch64' and 'tegra' in platform_release", - "tensorrt-cu12-libs>=10.3.0,<10.4.0; platform_machine == 'aarch64' and 'tegra' in platform_release", - "packaging>=23", "numpy; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", @@ -104,8 +99,8 @@ test = [ [project.optional-dependencies] torchvision = [ - "torchvision>=0.23.0.dev,<0.24.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", - "torchvision>=0.22.0,<0.23.0; platform_machine == 'aarch64' and 'tegra' in platform_release", + "torchvision>=0.24.0.dev,<0.25.0; platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)", + "torchvision>=0.23.0,<0.24.0; platform_machine == 'aarch64' and 'tegra' in platform_release", ] quantization = ["nvidia-modelopt[all]>=0.27.1"] @@ -128,11 +123,11 @@ index-strategy = "unsafe-best-match" [tool.uv.sources] torch = [ { index = "pytorch-nightly-cu129", marker = "platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in 
platform_release)" }, - # { index = "jetson-containers", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release" }, + { index = "jetson-containers", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release" }, ] torchvision = [ { index = "pytorch-nightly-cu129", marker = "platform_machine != 'aarch64' or (platform_machine == 'aarch64' and 'tegra' not in platform_release)" }, - # { index = "jetson-containers", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release" }, + { index = "jetson-containers", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release" }, ] [[tool.uv.index]] @@ -140,10 +135,10 @@ name = "pytorch-nightly-cu129" url = "https://download.pytorch.org/whl/nightly/cu129" explicit = false -# [[tool.uv.index]] -# name = "jetson-containers" -# url = "https://pypi.jetson-ai-lab.dev/jp6/cu126/+simple" -# explicit = false +[[tool.uv.index]] +name = "jetson-containers" +url = "https://pypi.jetson-ai-lab.io/jp6/cu126" +explicit = false [[tool.uv.index]] name = "nvidia" diff --git a/setup.py b/setup.py index f829602f1a..aa4fe4bc6d 100644 --- a/setup.py +++ b/setup.py @@ -79,7 +79,7 @@ def load_dep_info(): dir_path = os.path.join(str(get_root_dir()), "py") IS_AARCH64 = platform.uname().processor == "aarch64" -IS_JETPACK = True if "tegra" in platform.uname().release else False +IS_JETPACK = False PY_ONLY = False NO_TS = False @@ -154,22 +154,6 @@ def load_dep_info(): IS_SBSA = True if IS_AARCH64 and not IS_JETPACK else False -if IS_JETPACK and "bdist_wheel" in sys.argv: - needs_append_plat_name = True - for i, arg in enumerate(sys.argv): - if ( - arg == "--plat-name" - and i + 1 < len(sys.argv) - and sys.argv[i + 1] == "linux_tegra_aarch64" - ): - needs_append_plat_name = False - break - if arg == "--plat-name=linux_tegra_aarch64": - needs_append_plat_name = False - break - if needs_append_plat_name: - sys.argv.append("--plat-name=linux_tegra_aarch64") - BAZEL_EXE = None if not PY_ONLY: BAZEL_EXE = which("bazelisk")
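
Illustrative usage sketch (not part of the patch): the new torch_tensorrt._utils.is_tensorrt_version_supported helper is what the diff above uses to gate FP4 handling in _enums.py, aten_ops_converters.py, and converter_utils.py on TensorRT >= 10.8.0. Below is a minimal, hypothetical example of the same gating pattern, assuming tensorrt and the patched torch_tensorrt are installed; the function name fp4_trt_dtype_or_fallback is invented for illustration and does not exist in the codebase.

import tensorrt as trt

from torch_tensorrt._utils import is_tensorrt_version_supported


def fp4_trt_dtype_or_fallback() -> "trt.DataType":
    # trt.DataType.FP4 is only available in TensorRT 10.8.0 and above, so the
    # attribute is touched only behind the version check; otherwise fall back
    # to FLOAT, mirroring the pattern used in _enums.py above.
    if is_tensorrt_version_supported("10.8.0"):
        return trt.DataType.FP4
    return trt.DataType.FLOAT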