IntelPython · ndgrigorian · Jun 19, 2025 · Jun 4, 2025 · Jun 5, 2025 · Jun 5, 2025
@@ -25,9 +25,13 @@ option(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS
     "Build dpctl pybind11 offloading extensions with coverage instrumentation"
     OFF
 )
-option(DPCTL_TARGET_CUDA
-    "Build DPCTL to target CUDA devices"
-    OFF
+
+set(DPCTL_TARGET_CUDA
+    ""
+    CACHE STRING
+    "Build DPCTL to target CUDA device. \
+Set to a truthy value (e.g., ON, TRUE) to use default architecture (sm_50), \
+or to a specific architecture like sm_80."
 )
 set(DPCTL_TARGET_HIP
     ""
@@ -51,15 +55,24 @@ set(_dpctl_sycl_target_compile_options)
 set(_dpctl_sycl_target_link_options)
 
 set(_dpctl_sycl_targets)
+set(_dpctl_cuda_arch)
 set(_dpctl_amd_targets)
+
 if ("x${DPCTL_SYCL_TARGETS}" STREQUAL "x")
    if (DPCTL_TARGET_CUDA)
-      set(_dpctl_sycl_targets "nvptx64-nvidia-cuda,spir64-unknown-unknown")
-   else()
-      if (DEFINED ENV{DPCTL_TARGET_CUDA})
-          set(_dpctl_sycl_targets "nvptx64-nvidia-cuda,spir64-unknown-unknown")
+      string(TOUPPER "${DPCTL_TARGET_CUDA}" _dpctl_cuda_upper)  # if() accepts on/On/ON alike
+      if(DPCTL_TARGET_CUDA MATCHES "^sm_")  # explicit architecture, e.g. sm_80: use verbatim
+         set(_dpctl_cuda_arch "${DPCTL_TARGET_CUDA}")
+      elseif(_dpctl_cuda_upper MATCHES "^(ON|TRUE|YES|Y|1)$")  # plain enable: default arch
+         set(_dpctl_cuda_arch "sm_50")
+      else()
+         message(FATAL_ERROR
+            "Invalid value for DPCTL_TARGET_CUDA: \"${DPCTL_TARGET_CUDA}\". "
+            "Expected 'ON', 'TRUE', 'YES', 'Y', '1', or a CUDA architecture like 'sm_80'.")
       endif()
+      set(_dpctl_sycl_targets "nvidia_gpu_${_dpctl_cuda_arch},spir64-unknown-unknown")
    endif()
+
    if (NOT "x${DPCTL_TARGET_HIP}" STREQUAL "x")
       set(_dpctl_amd_targets ${DPCTL_TARGET_HIP})
       if(_dpctl_sycl_targets)

@@ -159,13 +159,41 @@ The following plugins from CodePlay are supported:
 .. _codeplay_nv_plugin: https://developer.codeplay.com/products/oneapi/nvidia/
 .. _codeplay_amd_plugin: https://developer.codeplay.com/products/oneapi/amd/
 
-``dpctl`` can be built for CUDA devices as follows:
+Builds for CUDA and AMD devices internally use SYCL alias targets that are passed to the compiler.
+A full list of SYCL alias targets can be found in the
+`DPC++ Compiler User Manual <https://intel.github.io/llvm/UsersManual.html>`_.
+
+CUDA build
+~~~~~~~~~~
+
+``dpctl`` can be built for CUDA devices using the ``DPCTL_TARGET_CUDA`` CMake option,
+which accepts a specific compute architecture string:
+
+.. code-block:: bash
+
+    python scripts/build_locally.py --verbose --cmake-opts="-DDPCTL_TARGET_CUDA=sm_80"
+
+To use the default architecture (``sm_50``),
+set ``DPCTL_TARGET_CUDA`` to a value such as ``ON``, ``TRUE``, ``YES``, ``Y``, or ``1``:
 
 .. code-block:: bash
 
     python scripts/build_locally.py --verbose --cmake-opts="-DDPCTL_TARGET_CUDA=ON"
 
-And for AMD devices
+Note that in this case kernels are built for the default architecture (``sm_50``), which lets them
+run on a wider range of GPUs but prevents the use of more recent CUDA features.
+
+For reference, compute architecture strings like ``sm_80`` correspond to specific
+CUDA Compute Capabilities (e.g., Compute Capability 8.0 corresponds to ``sm_80``).
+A complete mapping between NVIDIA GPU models and their respective
+Compute Capabilities can be found in the official
+`CUDA GPU Compute Capability <https://developer.nvidia.com/cuda-gpus>`_ documentation.
+
+AMD build
+~~~~~~~~~
+
+``dpctl`` can be built for AMD devices using the ``DPCTL_TARGET_HIP`` CMake option,
+which requires specifying a compute architecture string:
 
 .. code-block:: bash
 
@@ -174,8 +202,13 @@ And for AMD devices
 Note that the  `oneAPI for AMD GPUs` plugin requires the architecture be specified and only
 one architecture can be specified at a time.
 
-It is, however, possible to build for Intel devices, CUDA devices, and an AMD device
-architecture all at once:
+Multi-target build
+~~~~~~~~~~~~~~~~~~
+
+The default ``dpctl`` build from source supports Intel devices only.
+Adding a custom SYCL target extends that support to CUDA or AMD
+devices. Support for both CUDA and AMD devices can also be enabled
+at the same time:
 
 .. code-block:: bash