diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..431bb7c50
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.4
+    hooks:
+      - id: ruff
+        args: [--fix, --show-fixes]
+      - id: ruff-format
+
+default_language_version:
+      python: python3
diff --git a/cuda_bindings/benchmarks/kernels.py b/cuda_bindings/benchmarks/kernels.py
index d31cc58a3..259c6e3ca 100644
--- a/cuda_bindings/benchmarks/kernels.py
+++ b/cuda_bindings/benchmarks/kernels.py
@@ -5,9 +5,9 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
-kernel_string = '''\
+kernel_string = """\
 #define ITEM_PARAM(x, T) T x
-#define REP1(x, T)   , ITEM_PARAM(x, T)	
+#define REP1(x, T)   , ITEM_PARAM(x, T)
 #define REP2(x, T)   REP1(x##0, T)   REP1(x##1, T)
 #define REP4(x, T)   REP2(x##0, T)   REP2(x##1, T)
 #define REP8(x, T)   REP4(x##0, T)   REP4(x##1, T)
@@ -160,4 +160,4 @@
     // Do not touch param to prevent compiler from copying
     // the whole structure from const bank to lmem.
 }
-'''
+"""
diff --git a/cuda_bindings/benchmarks/perf_test_utils.py b/cuda_bindings/benchmarks/perf_test_utils.py
index 11f5cc30b..7cfcf9f9a 100644
--- a/cuda_bindings/benchmarks/perf_test_utils.py
+++ b/cuda_bindings/benchmarks/perf_test_utils.py
@@ -5,27 +5,30 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
+import numpy as np
 import pytest
+
 from cuda import cuda, cudart, nvrtc
-import numpy as np
+
 
 def ASSERT_DRV(err):
     if isinstance(err, cuda.CUresult):
         if err != cuda.CUresult.CUDA_SUCCESS:
-            raise RuntimeError('Cuda Error: {}'.format(err))
+            raise RuntimeError(f"Cuda Error: {err}")
     elif isinstance(err, cudart.cudaError_t):
         if err != cudart.cudaError_t.cudaSuccess:
-            raise RuntimeError('Cudart Error: {}'.format(err))
+            raise RuntimeError(f"Cudart Error: {err}")
     elif isinstance(err, nvrtc.nvrtcResult):
         if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
-            raise RuntimeError('Nvrtc Error: {}'.format(err))
+            raise RuntimeError(f"Nvrtc Error: {err}")
     else:
-        raise RuntimeError('Unknown error type: {}'.format(err))
+        raise RuntimeError(f"Unknown error type: {err}")
+
 
 @pytest.fixture
 def init_cuda():
     # Initialize
-    err, = cuda.cuInit(0)
+    (err,) = cuda.cuInit(0)
     ASSERT_DRV(err)
     err, device = cuda.cuDeviceGet(0)
     ASSERT_DRV(err)
@@ -38,31 +41,37 @@ def init_cuda():
 
     yield device, ctx, stream
 
-    err, = cuda.cuStreamDestroy(stream)
+    (err,) = cuda.cuStreamDestroy(stream)
     ASSERT_DRV(err)
-    err, = cuda.cuCtxDestroy(ctx)
+    (err,) = cuda.cuCtxDestroy(ctx)
     ASSERT_DRV(err)
 
+
 @pytest.fixture
 def load_module():
     module = None
+
     def _load_module(kernel_string, device):
         nonlocal module
         # Get module
-        err, major = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)
+        err, major = cuda.cuDeviceGetAttribute(
+            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device
+        )
         ASSERT_DRV(err)
-        err, minor = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)
+        err, minor = cuda.cuDeviceGetAttribute(
+            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device
+        )
         ASSERT_DRV(err)
 
-        err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b'kernelString.cu', 0, [], [])
+        err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"kernelString.cu", 0, [], [])
         ASSERT_DRV(err)
-        opts = [b'--fmad=false', bytes('--gpu-architecture=sm_' + str(major) + str(minor), 'ascii')]
-        err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)
+        opts = [b"--fmad=false", bytes("--gpu-architecture=sm_" + str(major) + str(minor), "ascii")]
+        (err,) = nvrtc.nvrtcCompileProgram(prog, 2, opts)
 
         err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog)
         ASSERT_DRV(err_log)
-        log = b' ' * logSize
-        err_log, = nvrtc.nvrtcGetProgramLog(prog, log)
+        log = b" " * logSize
+        (err_log,) = nvrtc.nvrtcGetProgramLog(prog, log)
         ASSERT_DRV(err_log)
         result = log.decode()
         if len(result) > 1:
@@ -71,8 +80,8 @@ def _load_module(kernel_string, device):
         ASSERT_DRV(err)
         err, cubinSize = nvrtc.nvrtcGetCUBINSize(prog)
         ASSERT_DRV(err)
-        cubin = b' ' * cubinSize
-        err, = nvrtc.nvrtcGetCUBIN(prog, cubin)
+        cubin = b" " * cubinSize
+        (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin)
         ASSERT_DRV(err)
         cubin = np.char.array(cubin)
         err, module = cuda.cuModuleLoadData(cubin)
@@ -82,5 +91,5 @@ def _load_module(kernel_string, device):
 
     yield _load_module
 
-    err, = cuda.cuModuleUnload(module)
+    (err,) = cuda.cuModuleUnload(module)
     ASSERT_DRV(err)
diff --git a/cuda_bindings/benchmarks/test_cupy.py b/cuda_bindings/benchmarks/test_cupy.py
index 43aaa5b12..6e847853e 100644
--- a/cuda_bindings/benchmarks/test_cupy.py
+++ b/cuda_bindings/benchmarks/test_cupy.py
@@ -5,29 +5,33 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
-import pytest
 import ctypes
 
+import pytest
+
 # Always skip since cupy is not CTK 12.x yet
 skip_tests = True
 if not skip_tests:
     try:
         import cupy
+
         skip_tests = False
     except ImportError:
         skip_tests = True
 
 from .kernels import kernel_string
 
+
 def launch(kernel, args=()):
     kernel((1,), (1,), args)
 
+
 # Measure launch latency with no parmaeters
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_empty_kernel(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('empty_kernel')
+    kernel = module.get_function("empty_kernel")
 
     stream = cupy.cuda.stream.Stream(non_blocking=True)
 
@@ -35,12 +39,13 @@ def test_launch_latency_empty_kernel(benchmark):
         benchmark(launch, kernel)
         stream.synchronize()
 
+
 # Measure launch latency with a single parameter
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel')
+    kernel = module.get_function("small_kernel")
     cupy.cuda.set_allocator()
     arg = cupy.cuda.alloc(ctypes.sizeof(ctypes.c_float))
 
@@ -50,12 +55,13 @@ def test_launch_latency_small_kernel(benchmark):
         benchmark(launch, kernel, (arg,))
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_args(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_args')
+    kernel = module.get_function("small_kernel_512_args")
     cupy.cuda.set_allocator()
 
     args = []
@@ -69,12 +75,13 @@ def test_launch_latency_small_kernel_512_args(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_bools(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_bools')
+    kernel = module.get_function("small_kernel_512_bools")
     cupy.cuda.set_allocator()
 
     args = [True] * 512
@@ -86,12 +93,13 @@ def test_launch_latency_small_kernel_512_bools(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_doubles(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_doubles')
+    kernel = module.get_function("small_kernel_512_doubles")
     cupy.cuda.set_allocator()
 
     args = [1.2345] * 512
@@ -103,12 +111,13 @@ def test_launch_latency_small_kernel_512_doubles(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_ints(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_ints')
+    kernel = module.get_function("small_kernel_512_ints")
     cupy.cuda.set_allocator()
 
     args = [123] * 512
@@ -120,12 +129,13 @@ def test_launch_latency_small_kernel_512_ints(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_bytes(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_chars')
+    kernel = module.get_function("small_kernel_512_chars")
     cupy.cuda.set_allocator()
 
     args = [127] * 512
@@ -137,12 +147,13 @@ def test_launch_latency_small_kernel_512_bytes(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_longlongs(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_longlongs')
+    kernel = module.get_function("small_kernel_512_longlongs")
     cupy.cuda.set_allocator()
 
     args = [9223372036854775806] * 512
@@ -154,12 +165,13 @@ def test_launch_latency_small_kernel_512_longlongs(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_256_args(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_256_args')
+    kernel = module.get_function("small_kernel_256_args")
     cupy.cuda.set_allocator()
 
     args = []
@@ -173,12 +185,13 @@ def test_launch_latency_small_kernel_256_args(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_16_args(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_16_args')
+    kernel = module.get_function("small_kernel_16_args")
     cupy.cuda.set_allocator()
 
     args = []
diff --git a/cuda_bindings/benchmarks/test_launch_latency.py b/cuda_bindings/benchmarks/test_launch_latency.py
index 79668a637..8d70bfe24 100755
--- a/cuda_bindings/benchmarks/test_launch_latency.py
+++ b/cuda_bindings/benchmarks/test_launch_latency.py
@@ -5,26 +5,47 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
+import ctypes
+
 import pytest
+
 from cuda import cuda
-import ctypes
 
-from .perf_test_utils import ASSERT_DRV, init_cuda, load_module
 from .kernels import kernel_string
+from .perf_test_utils import ASSERT_DRV
+
 
 def launch(kernel, stream, args=(), arg_types=()):
-    cuda.cuLaunchKernel(kernel,
-                        1, 1, 1,   # grid dim
-                        1, 1, 1,   # block dim
-                        0, stream, # shared mem and stream
-                        (args, arg_types), 0) # arguments
+    cuda.cuLaunchKernel(
+        kernel,
+        1,
+        1,
+        1,  # grid dim
+        1,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        (args, arg_types),  # arguments
+        0,
+    )
+
 
 def launch_packed(kernel, stream, params):
-    cuda.cuLaunchKernel(kernel,
-                        1, 1, 1,   # grid dim
-                        1, 1, 1,   # block dim
-                        0, stream, # shared mem and stream
-                        params, 0) # arguments
+    cuda.cuLaunchKernel(
+        kernel,
+        1,
+        1,
+        1,  # grid dim
+        1,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        params,  # arguments
+        0,
+    )
+
 
 # Measure launch latency with no parmaeters
 @pytest.mark.benchmark(group="launch-latency")
@@ -32,20 +53,21 @@ def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'empty_kernel')
+    err, func = cuda.cuModuleGetFunction(module, b"empty_kernel")
     ASSERT_DRV(err)
 
     benchmark(launch, func, stream)
 
     cuda.cuCtxSynchronize()
 
+
 # Measure launch latency with a single parameter
 @pytest.mark.benchmark(group="launch-latency")
 def test_launch_latency_small_kernel(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel")
     ASSERT_DRV(err)
 
     err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float))
@@ -55,16 +77,17 @@ def test_launch_latency_small_kernel(benchmark, init_cuda, load_module):
 
     cuda.cuCtxSynchronize()
 
-    err, = cuda.cuMemFree(f)
+    (err,) = cuda.cuMemFree(f)
     ASSERT_DRV(err)
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.benchmark(group="launch-latency")
 def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_args')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args")
     ASSERT_DRV(err)
 
     args = []
@@ -82,15 +105,16 @@ def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module)
     cuda.cuCtxSynchronize()
 
     for p in args:
-        err, = cuda.cuMemFree(p)
+        (err,) = cuda.cuMemFree(p)
         ASSERT_DRV(err)
 
+
 @pytest.mark.benchmark(group="launch-latency")
 def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_bools')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools")
     ASSERT_DRV(err)
 
     args = [True] * 512
@@ -103,12 +127,13 @@ def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module
 
     cuda.cuCtxSynchronize()
 
+
 @pytest.mark.benchmark(group="launch-latency")
 def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_doubles')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles")
     ASSERT_DRV(err)
 
     args = [1.2345] * 512
@@ -121,12 +146,13 @@ def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_modu
 
     cuda.cuCtxSynchronize()
 
+
 @pytest.mark.benchmark(group="launch-latency")
 def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_ints')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints")
     ASSERT_DRV(err)
 
     args = [123] * 512
@@ -139,12 +165,13 @@ def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module)
 
     cuda.cuCtxSynchronize()
 
+
 @pytest.mark.benchmark(group="launch-latency")
 def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_chars')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars")
     ASSERT_DRV(err)
 
     args = [127] * 512
@@ -157,12 +184,13 @@ def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module
 
     cuda.cuCtxSynchronize()
 
+
 @pytest.mark.benchmark(group="launch-latency")
 def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_longlongs')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs")
     ASSERT_DRV(err)
 
     args = [9223372036854775806] * 512
@@ -175,13 +203,14 @@ def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_mo
 
     cuda.cuCtxSynchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.benchmark(group="launch-latency")
 def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel_256_args')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_256_args")
     ASSERT_DRV(err)
 
     args = []
@@ -199,16 +228,17 @@ def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module)
     cuda.cuCtxSynchronize()
 
     for p in args:
-        err, = cuda.cuMemFree(p)
+        (err,) = cuda.cuMemFree(p)
         ASSERT_DRV(err)
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.benchmark(group="launch-latency")
 def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel_16_args')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args")
     ASSERT_DRV(err)
 
     args = []
@@ -226,16 +256,17 @@ def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module):
     cuda.cuCtxSynchronize()
 
     for p in args:
-        err, = cuda.cuMemFree(p)
+        (err,) = cuda.cuMemFree(p)
         ASSERT_DRV(err)
 
+
 # Measure launch latency with many parameters, excluding parameter packing
 @pytest.mark.benchmark(group="launch-latency")
 def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_args')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args")
     ASSERT_DRV(err)
 
     vals = []
@@ -255,9 +286,10 @@ def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_
     cuda.cuCtxSynchronize()
 
     for p in vals:
-        err, = cuda.cuMemFree(p)
+        (err,) = cuda.cuMemFree(p)
         ASSERT_DRV(err)
 
+
 def pack_and_launch(kernel, stream, params):
     packed_params = (ctypes.c_void_p * len(params))()
     ptrs = [0] * len(params)
@@ -265,11 +297,8 @@ def pack_and_launch(kernel, stream, params):
         ptrs[i] = ctypes.c_void_p(int(params[i]))
         packed_params[i] = ctypes.addressof(ptrs[i])
 
-    cuda.cuLaunchKernel(kernel,
-                        1, 1, 1,
-                        1, 1, 1,
-                        0, stream,
-                        packed_params, 0)
+    cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0)
+
 
 # Measure launch latency plus parameter packing using ctypes
 @pytest.mark.benchmark(group="launch-latency")
@@ -277,7 +306,7 @@ def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, ini
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_args')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args")
     ASSERT_DRV(err)
 
     vals = []
@@ -291,20 +320,21 @@ def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, ini
     cuda.cuCtxSynchronize()
 
     for p in vals:
-        err, = cuda.cuMemFree(p)
+        (err,) = cuda.cuMemFree(p)
         ASSERT_DRV(err)
 
+
 # Measure launch latency with a single large struct parameter
 @pytest.mark.benchmark(group="launch-latency")
 def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module):
     device, ctx, stream = init_cuda
     module = load_module(kernel_string, device)
 
-    err, func = cuda.cuModuleGetFunction(module, b'small_kernel_2048B')
+    err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B")
     ASSERT_DRV(err)
 
     class struct_2048B(ctypes.Structure):
-        _fields_ = [('values',ctypes.c_uint8 * 2048)]
+        _fields_ = [("values", ctypes.c_uint8 * 2048)]
 
     benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,))
 
diff --git a/cuda_bindings/benchmarks/test_numba.py b/cuda_bindings/benchmarks/test_numba.py
index f0dd12316..f7a4db7e9 100644
--- a/cuda_bindings/benchmarks/test_numba.py
+++ b/cuda_bindings/benchmarks/test_numba.py
@@ -5,19 +5,24 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
-import pytest
 import numpy as np
+import pytest
+
 try:
     from numba import cuda
+
     skip_tests = False
 except ImportError:
     skip_tests = True
 
+
 def launch_empty(kernel, stream):
-    kernel[1,1, stream]()
+    kernel[1, 1, stream]()
+
 
 def launch(kernel, stream, arg):
-    kernel[1,1, stream](arg)
+    kernel[1, 1, stream](arg)
+
 
 # Measure launch latency with no parmaeters
 @pytest.mark.skipif(skip_tests, reason="Numba is not installed")
@@ -33,6 +38,7 @@ def empty_kernel():
 
     cuda.synchronize()
 
+
 # Measure launch latency with a single parameter
 @pytest.mark.skipif(skip_tests, reason="Numba is not installed")
 @pytest.mark.benchmark(group="numba", min_rounds=1000)
diff --git a/cuda_bindings/benchmarks/test_pointer_attributes.py b/cuda_bindings/benchmarks/test_pointer_attributes.py
index 72de39641..f03be1e1e 100644
--- a/cuda_bindings/benchmarks/test_pointer_attributes.py
+++ b/cuda_bindings/benchmarks/test_pointer_attributes.py
@@ -5,30 +5,35 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
+import random
+
 import pytest
+
 from cuda import cuda
-import ctypes
-import random
 
-from .perf_test_utils import ASSERT_DRV, init_cuda
+from .perf_test_utils import ASSERT_DRV
 
 random.seed(0)
 
 idx = 0
+
+
 def query_attribute(attribute, ptrs):
     global idx
     ptr = ptrs[idx]
-    idx = (idx + 1 ) % len(ptrs)
+    idx = (idx + 1) % len(ptrs)
 
     cuda.cuPointerGetAttribute(attribute, ptr)
 
+
 def query_attributes(attributes, ptrs):
     global idx
     ptr = ptrs[idx]
-    idx = (idx + 1 ) % len(ptrs)
+    idx = (idx + 1) % len(ptrs)
 
     cuda.cuPointerGetAttributes(len(attributes), attributes, ptr)
 
+
 @pytest.mark.benchmark(group="pointer-attributes")
 # Measure cuPointerGetAttribute in the same way as C benchmarks
 def test_pointer_get_attribute(benchmark, init_cuda):
@@ -45,9 +50,10 @@ def test_pointer_get_attribute(benchmark, init_cuda):
     benchmark(query_attribute, cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptrs)
 
     for p in ptrs:
-        err, = cuda.cuMemFree(p)
+        (err,) = cuda.cuMemFree(p)
         ASSERT_DRV(err)
 
+
 @pytest.mark.benchmark(group="pointer-attributes")
 # Measure cuPointerGetAttributes with all attributes
 def test_pointer_get_attributes_all(benchmark, init_cuda):
@@ -61,29 +67,32 @@ def test_pointer_get_attributes_all(benchmark, init_cuda):
 
     random.shuffle(ptrs)
 
-    attributes = [cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS,
-                  cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE]
+    attributes = [
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE,
+    ]
 
     benchmark(query_attributes, attributes, ptrs)
 
     for p in ptrs:
-        err, = cuda.cuMemFree(p)
+        (err,) = cuda.cuMemFree(p)
         ASSERT_DRV(err)
 
+
 @pytest.mark.benchmark(group="pointer-attributes")
 # Measure cuPointerGetAttributes with a single attribute
 def test_pointer_get_attributes_single(benchmark, init_cuda):
@@ -97,10 +106,12 @@ def test_pointer_get_attributes_single(benchmark, init_cuda):
 
     random.shuffle(ptrs)
 
-    attributes = [cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,]
+    attributes = [
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+    ]
 
     benchmark(query_attributes, attributes, ptrs)
 
     for p in ptrs:
-        err, = cuda.cuMemFree(p)
+        (err,) = cuda.cuMemFree(p)
         ASSERT_DRV(err)
diff --git a/cuda_bindings/cuda/__init__.py b/cuda_bindings/cuda/__init__.py
index 8b302752d..577d5f8b6 100644
--- a/cuda_bindings/cuda/__init__.py
+++ b/cuda_bindings/cuda/__init__.py
@@ -1,10 +1,14 @@
 def __getattr__(name):
     if name == "__version__":
         import warnings
-        warnings.warn("accessing cuda.__version__ is deprecated, "
-                      "please switch to use cuda.bindings.__version__ instead",
-                      DeprecationWarning, stacklevel=2)
+
+        warnings.warn(
+            "accessing cuda.__version__ is deprecated, please switch to use cuda.bindings.__version__ instead",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         from . import bindings
+
         return bindings.__version__
 
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/cuda_bindings/cuda/bindings/__init__.py b/cuda_bindings/cuda/bindings/__init__.py
index ecd3379a8..4d52a61e7 100644
--- a/cuda_bindings/cuda/bindings/__init__.py
+++ b/cuda_bindings/cuda/bindings/__init__.py
@@ -1,3 +1,3 @@
-
 from . import _version
-__version__ = _version.get_versions()['version']
+
+__version__ = _version.get_versions()["version"]
diff --git a/cuda_bindings/docs/source/conf.py b/cuda_bindings/docs/source/conf.py
index be6f55172..01f0fb492 100644
--- a/cuda_bindings/docs/source/conf.py
+++ b/cuda_bindings/docs/source/conf.py
@@ -10,15 +10,16 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 import os
+
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))
 
 
 # -- Project information -----------------------------------------------------
 
-project = 'cuda.bindings'
-copyright = '2021-2024, NVIDIA'
-author = 'NVIDIA'
+project = "cuda.bindings"
+copyright = "2021-2024, NVIDIA"
+author = "NVIDIA"
 
 # The full version, including alpha/beta/rc tags
 release = os.environ["SPHINX_CUDA_BINDINGS_VER"]
@@ -29,18 +30,13 @@
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = [
-	'sphinx.ext.autodoc',
-	'sphinx.ext.napoleon',
-	'myst_nb',
-	'enum_tools.autoenum'
-]
+extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon", "myst_nb", "enum_tools.autoenum"]
 
 jupyter_execute_notebooks = "force"
-numfig=True
+numfig = True
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -51,36 +47,36 @@
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_baseurl = 'docs'
-html_theme = 'furo'
-#html_theme = 'pydata_sphinx_theme'
+html_baseurl = "docs"
+html_theme = "furo"
+# html_theme = 'pydata_sphinx_theme'
 html_theme_options = {
     "light_logo": "logo-light-mode.png",
     "dark_logo": "logo-dark-mode.png",
     # For pydata_sphinx_theme:
-    #"logo": {
-	#    "image_light": "_static/logo-light-mode.png",
+    # "logo": {
+    #    "image_light": "_static/logo-light-mode.png",
     #    "image_dark": "_static/logo-dark-mode.png",
-    #},
-    #"switcher": {
+    # },
+    # "switcher": {
     #    "json_url": "https://nvidia.github.io/cuda-python/cuda-bindings/versions.json",
     #    "version_match": release,
-    #},
+    # },
     ## Add light/dark mode and documentation version switcher
-    #"navbar_end": [
+    # "navbar_end": [
     #    "search-button",
     #    "theme-switcher",
     #    "version-switcher",
     #    "navbar-icon-links",
-    #],
+    # ],
 }
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 suppress_warnings = [
     # for warnings about multiple possible targets, see NVIDIA/cuda-python#152
-    'ref.python',
+    "ref.python",
 ]
diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
index 49a7a1e0a..2717bd193 100644
--- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
+++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
@@ -6,11 +6,12 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 import numpy as np
-from cuda import cuda
 from common import common
 from common.helper_cuda import checkCudaErrors, findCudaDevice
 
-clock_nvrtc = '''\
+from cuda import cuda
+
+clock_nvrtc = """\
 extern "C" __global__  void timedReduction(const float *hinput, float *output, clock_t *timer)
 {
     // __shared__ float shared[2 * blockDim.x];
@@ -49,39 +50,47 @@
 
     if (tid == 0) timer[bid+gridDim.x] = clock();
 }
-'''
+"""
+
+NUM_BLOCKS = 64
+NUM_THREADS = 256
 
-NUM_BLOCKS = 64 
-NUM_THREADS  = 256
 
 def main():
     print("CUDA Clock sample")
 
-    timer = np.empty(NUM_BLOCKS * 2, dtype='int64')
-    hinput = np.empty(NUM_THREADS * 2, dtype='float32')
+    timer = np.empty(NUM_BLOCKS * 2, dtype="int64")
+    hinput = np.empty(NUM_THREADS * 2, dtype="float32")
 
     for i in range(0, NUM_THREADS * 2):
         hinput[i] = i
 
     devID = findCudaDevice()
     kernelHelper = common.KernelHelper(clock_nvrtc, devID)
-    kernel_addr = kernelHelper.getFunction(b'timedReduction')
+    kernel_addr = kernelHelper.getFunction(b"timedReduction")
 
     dinput = checkCudaErrors(cuda.cuMemAlloc(np.dtype(np.float32).itemsize * NUM_THREADS * 2))
     doutput = checkCudaErrors(cuda.cuMemAlloc(np.dtype(np.float32).itemsize * NUM_BLOCKS))
     dtimer = checkCudaErrors(cuda.cuMemAlloc(np.dtype(np.int64).itemsize * NUM_BLOCKS * 2))
     checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, np.dtype(np.float32).itemsize * NUM_THREADS * 2))
 
-
-
-    arr = ((dinput, doutput, dtimer),
-           (None, None, None))
-
-    checkCudaErrors(cuda.cuLaunchKernel(kernel_addr,
-                                        NUM_BLOCKS, 1, 1,  # grid dim
-                                        NUM_THREADS, 1, 1, # block dim
-                                        np.dtype(np.float32).itemsize * 2 *NUM_THREADS, 0, # shared mem, stream
-                                        arr, 0)) # arguments
+    arr = ((dinput, doutput, dtimer), (None, None, None))
+
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            kernel_addr,
+            NUM_BLOCKS,
+            1,
+            1,  # grid dim
+            NUM_THREADS,
+            1,
+            1,  # block dim
+            np.dtype(np.float32).itemsize * 2 * NUM_THREADS,
+            0,  # shared mem, stream
+            arr,
+            0,
+        )
+    )  # arguments
 
     checkCudaErrors(cuda.cuCtxSynchronize())
     checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, np.dtype(np.int64).itemsize * NUM_BLOCKS * 2))
@@ -91,11 +100,12 @@ def main():
 
     avgElapsedClocks = 0.0
 
-    for i in range(0,NUM_BLOCKS):
+    for i in range(0, NUM_BLOCKS):
         avgElapsedClocks += timer[i + NUM_BLOCKS] - timer[i]
 
-    avgElapsedClocks = avgElapsedClocks/NUM_BLOCKS;
-    print("Average clocks/block = {}".format(avgElapsedClocks))
+    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS
+    print(f"Average clocks/block = {avgElapsedClocks}")
+
 
-if __name__=="__main__":
+if __name__ == "__main__":
     main()
diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
index adb5a5606..6e1d16e72 100644
--- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
@@ -7,14 +7,16 @@
 # is strictly prohibited.
 import ctypes
 import math
-import numpy as np
 import sys
 import time
-from cuda import cuda, cudart
+
+import numpy as np
 from common import common
 from common.helper_cuda import checkCudaErrors, findCudaDevice
 
-simpleCubemapTexture = '''\
+from cuda import cuda, cudart
+
+simpleCubemapTexture = """\
 extern "C"
 __global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
 {
@@ -80,20 +82,20 @@
         g_odata[face*width*width + y*width + x] = -texCubemap<float>(tex, cx, cy, cz);
     }
 }
-'''
+"""
+
 
 def main():
     # Use command-line specified CUDA device, otherwise use device with highest Gflops/s
     devID = findCudaDevice()
 
     # Get number of SMs on this GPU
-    deviceProps = checkCudaErrors(cudart.cudaGetDeviceProperties(devID));
-    print("CUDA device [{}] has {} Multi-Processors SM {}.{}".format(deviceProps.name,
-                                                                     deviceProps.multiProcessorCount,
-                                                                     deviceProps.major,
-                                                                     deviceProps.minor))
-    if (deviceProps.major < 2):
-        print("{} requires SM 2.0 or higher for support of Texture Arrays.  Test will exit...".format(sSDKname))
+    deviceProps = checkCudaErrors(cudart.cudaGetDeviceProperties(devID))
+    print(
+        f"CUDA device [{deviceProps.name}] has {deviceProps.multiProcessorCount} Multi-Processors SM {deviceProps.major}.{deviceProps.minor}"
+    )
+    if deviceProps.major < 2:
+        print("Test requires SM 2.0 or higher for support of Texture Arrays.  Test will exit...")
         sys.exit()
 
     # Generate input data for layered texture
@@ -102,27 +104,35 @@ def main():
     num_layers = 1
     cubemap_size = width * width * num_faces
     size = cubemap_size * num_layers * np.dtype(np.float32).itemsize
-    h_data = np.zeros(cubemap_size * num_layers, dtype='float32')
+    h_data = np.zeros(cubemap_size * num_layers, dtype="float32")
 
     for i in range(cubemap_size * num_layers):
         h_data[i] = i
 
     # This is the expected transformation of the input data (the expected output)
-    h_data_ref = np.zeros(cubemap_size * num_layers, dtype='float32')
+    h_data_ref = np.zeros(cubemap_size * num_layers, dtype="float32")
 
     for layer in range(num_layers):
         for i in range(cubemap_size):
-            h_data_ref[layer*cubemap_size + i] = -h_data[layer*cubemap_size + i] + layer
+            h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer
 
     # Allocate device memory for result
     d_data = checkCudaErrors(cudart.cudaMalloc(size))
 
     # Allocate array and copy image data
-    channelDesc = checkCudaErrors(cudart.cudaCreateChannelDesc(32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat))
-    cu_3darray = checkCudaErrors(cudart.cudaMalloc3DArray(channelDesc, cudart.make_cudaExtent(width, width, num_faces), cudart.cudaArrayCubemap))
+    channelDesc = checkCudaErrors(
+        cudart.cudaCreateChannelDesc(32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat)
+    )
+    cu_3darray = checkCudaErrors(
+        cudart.cudaMalloc3DArray(
+            channelDesc,
+            cudart.make_cudaExtent(width, width, num_faces),
+            cudart.cudaArrayCubemap,
+        )
+    )
     myparms = cudart.cudaMemcpy3DParms()
-    myparms.srcPos = cudart.make_cudaPos(0,0,0)
-    myparms.dstPos = cudart.make_cudaPos(0,0,0)
+    myparms.srcPos = cudart.make_cudaPos(0, 0, 0)
+    myparms.dstPos = cudart.make_cudaPos(0, 0, 0)
     myparms.srcPtr = cudart.make_cudaPitchedPtr(h_data, width * np.dtype(np.float32).itemsize, width, width)
     myparms.dstArray = cu_3darray
     myparms.extent = cudart.make_cudaExtent(width, width, num_faces)
@@ -130,12 +140,12 @@ def main():
     checkCudaErrors(cudart.cudaMemcpy3D(myparms))
 
     texRes = cudart.cudaResourceDesc()
-    texRes.resType            = cudart.cudaResourceType.cudaResourceTypeArray
-    texRes.res.array.array    = cu_3darray
+    texRes.resType = cudart.cudaResourceType.cudaResourceTypeArray
+    texRes.res.array.array = cu_3darray
 
     texDescr = cudart.cudaTextureDesc()
     texDescr.normalizedCoords = True
-    texDescr.filterMode       = cudart.cudaTextureFilterMode.cudaFilterModeLinear
+    texDescr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear
     texDescr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
     texDescr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
     texDescr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
@@ -151,36 +161,57 @@ def main():
     dimGrid.y = width / dimBlock.y
     dimGrid.z = 1
 
-    print("Covering Cubemap data array of {}~3 x {}: Grid size is {} x {}, each block has 8 x 8 threads".format(
-           width, num_layers, dimGrid.x, dimGrid.y))
+    print(
+        f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dimGrid.x} x {dimGrid.y}, each block has 8 x 8 threads"
+    )
 
     kernelHelper = common.KernelHelper(simpleCubemapTexture, devID)
-    _transformKernel = kernelHelper.getFunction(b'transformKernel')
-    kernelArgs = ((d_data, width, tex),(ctypes.c_void_p, ctypes.c_int, None))
-    checkCudaErrors(cuda.cuLaunchKernel(_transformKernel,
-                                        dimGrid.x, dimGrid.y, dimGrid.z,         # grid dim
-                                        dimBlock.x, dimBlock.y, dimBlock.z,      # block dim
-                                        0, 0,                                    # shared mem and stream
-                                        kernelArgs, 0))                          # arguments
+    _transformKernel = kernelHelper.getFunction(b"transformKernel")
+    kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            _transformKernel,
+            dimGrid.x,
+            dimGrid.y,
+            dimGrid.z,  # grid dim
+            dimBlock.x,
+            dimBlock.y,
+            dimBlock.z,  # block dim
+            0,
+            0,  # shared mem and stream
+            kernelArgs,
+            0,
+        )
+    )  # arguments
 
     checkCudaErrors(cudart.cudaDeviceSynchronize())
 
     start = time.time()
 
     # Execute the kernel
-    checkCudaErrors(cuda.cuLaunchKernel(_transformKernel,
-                                        dimGrid.x, dimGrid.y, dimGrid.z,         # grid dim
-                                        dimBlock.x, dimBlock.y, dimBlock.z,      # block dim
-                                        0, 0,                                    # shared mem and stream
-                                        kernelArgs, 0))                          # arguments
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            _transformKernel,
+            dimGrid.x,
+            dimGrid.y,
+            dimGrid.z,  # grid dim
+            dimBlock.x,
+            dimBlock.y,
+            dimBlock.z,  # block dim
+            0,
+            0,  # shared mem and stream
+            kernelArgs,
+            0,
+        )
+    )  # arguments
 
     checkCudaErrors(cudart.cudaDeviceSynchronize())
     stop = time.time()
-    print("Processing time: {:.3f} msec".format(stop - start))
-    print("{:.2f} Mtexlookups/sec".format(cubemap_size / ((stop - start + 1) / 1000.0) / 1e6))
+    print(f"Processing time: {stop - start:.3f} msec")
+    print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
 
     # Allocate mem for the result on host side
-    h_odata = np.zeros(cubemap_size * num_layers, dtype='float32')
+    h_odata = np.zeros(cubemap_size * num_layers, dtype="float32")
     # Copy result from device to host
     checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
 
@@ -197,5 +228,6 @@ def main():
     checkCudaErrors(cudart.cudaFree(d_data))
     checkCudaErrors(cudart.cudaFreeArray(cu_3darray))
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     main()
diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
index d4d17de1f..7ff1ae270 100644
--- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
@@ -6,13 +6,15 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 import ctypes
-import numpy as np
 import sys
-from cuda import cuda, cudart
+
+import numpy as np
 from common import common
 from common.helper_cuda import checkCudaErrors
 
-simplep2p = '''\
+from cuda import cuda, cudart
+
+simplep2p = """\
 extern "C"
 __global__ void SimpleKernel(float *src, float *dst)
 {
@@ -21,7 +23,8 @@
     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
     dst[idx] = src[idx] * 2.0f;
 }
-'''
+"""
+
 
 def main():
     print("Starting...")
@@ -29,7 +32,7 @@ def main():
     # Number of GPUs
     print("Checking for multiple GPUs...")
     gpu_n = checkCudaErrors(cudart.cudaGetDeviceCount())
-    print("CUDA-capable device count: {}".format(gpu_n))
+    print(f"CUDA-capable device count: {gpu_n}")
 
     if gpu_n < 2:
         print("Two or more GPUs with Peer-to-Peer access capability are required")
@@ -47,10 +50,16 @@ def main():
                 continue
             i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j))
             j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i))
-            print("> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format(
-                    prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No"))
-            print("> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format(
-                    prop[j].name, j, prop[i].name, i, "Yes" if i_access_j else "No"))
+            print(
+                "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format(
+                    prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No"
+                )
+            )
+            print(
+                "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format(
+                    prop[j].name, j, prop[i].name, i, "Yes" if j_access_i else "No"
+                )
+            )
             if i_access_j and j_access_i:
                 p2pCapableGPUs[1] = j
                 break
@@ -66,7 +75,7 @@ def main():
     gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]]
 
     # Enable peer access
-    print("Enabling peer access between GPU{} and GPU{}...".format(gpuid[0], gpuid[1]))
+    print(f"Enabling peer access between GPU{gpuid[0]} and GPU{gpuid[1]}...")
     checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
     checkCudaErrors(cudart.cudaDeviceEnablePeerAccess(gpuid[1], 0))
     checkCudaErrors(cudart.cudaSetDevice(gpuid[1]))
@@ -74,12 +83,12 @@ def main():
 
     # Allocate buffers
     buf_size = 1024 * 1024 * 16 * np.dtype(np.float32).itemsize
-    print("Allocating buffers ({}MB on GPU{}, GPU{} and CPU Host)...".format(int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]))
+    print(f"Allocating buffers ({int(buf_size / 1024 / 1024)}MB on GPU{gpuid[0]}, GPU{gpuid[1]} and CPU Host)...")
     checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
     g0 = checkCudaErrors(cudart.cudaMalloc(buf_size))
     checkCudaErrors(cudart.cudaSetDevice(gpuid[1]))
     g1 = checkCudaErrors(cudart.cudaMalloc(buf_size))
-    h0 = checkCudaErrors(cudart.cudaMallocHost(buf_size)) # Automatically portable with UVA
+    h0 = checkCudaErrors(cudart.cudaMallocHost(buf_size))  # Automatically portable with UVA
 
     # Create CUDA event handles
     print("Creating event handles...")
@@ -102,11 +111,12 @@ def main():
     checkCudaErrors(cudart.cudaEventRecord(stop_event, cudart.cudaStream_t(0)))
     checkCudaErrors(cudart.cudaEventSynchronize(stop_event))
     time_memcpy = checkCudaErrors(cudart.cudaEventElapsedTime(start_event, stop_event))
-    print("cudaMemcpyPeer / cudaMemcpy between GPU{} and GPU{}: {:.2f}GB/s".format(gpuid[0], gpuid[1],
-            (1.0 / (time_memcpy / 1000.0)) * ((100.0 * buf_size)) / 1024.0 / 1024.0 / 1024.0))
+    print(
+        f"cudaMemcpyPeer / cudaMemcpy between GPU{gpuid[0]} and GPU{gpuid[1]}: {(1.0 / (time_memcpy / 1000.0)) * (100.0 * buf_size) / 1024.0 / 1024.0 / 1024.0:.2f}GB/s"
+    )
 
     # Prepare host buffer and copy to GPU 0
-    print("Preparing host buffer and memcpy to GPU{}...".format(gpuid[0]))
+    print(f"Preparing host buffer and memcpy to GPU{gpuid[0]}...")
 
     h0_local = (ctypes.c_float * int(buf_size / np.dtype(np.float32).itemsize)).from_address(h0)
     for i in range(int(buf_size / np.dtype(np.float32).itemsize)):
@@ -127,43 +137,61 @@ def main():
 
     # Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
     # output to the GPU 1 buffer
-    print("Run kernel on GPU{}, taking source data from GPU{} and writing to GPU{}...".format(
-          gpuid[1], gpuid[0], gpuid[1]))
+    print(f"Run kernel on GPU{gpuid[1]}, taking source data from GPU{gpuid[0]} and writing to GPU{gpuid[1]}...")
     checkCudaErrors(cudart.cudaSetDevice(gpuid[1]))
 
-    kernelHelper = [None]*2
-    _simpleKernel = [None]*2
-    kernelArgs = [None]*2
+    kernelHelper = [None] * 2
+    _simpleKernel = [None] * 2
+    kernelArgs = [None] * 2
 
     kernelHelper[1] = common.KernelHelper(simplep2p, gpuid[1])
-    _simpleKernel[1] = kernelHelper[1].getFunction(b'SimpleKernel')
+    _simpleKernel[1] = kernelHelper[1].getFunction(b"SimpleKernel")
     kernelArgs[1] = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p))
-    checkCudaErrors(cuda.cuLaunchKernel(_simpleKernel[1],
-                                        blocks.x, blocks.y, blocks.z,
-                                        threads.x, threads.y, threads.z,
-                                        0, 0,
-                                        kernelArgs[1], 0))
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            _simpleKernel[1],
+            blocks.x,
+            blocks.y,
+            blocks.z,
+            threads.x,
+            threads.y,
+            threads.z,
+            0,
+            0,
+            kernelArgs[1],
+            0,
+        )
+    )
 
     checkCudaErrors(cudart.cudaDeviceSynchronize())
 
     # Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
     # output to the GPU 0 buffer
-    print("Run kernel on GPU{}, taking source data from GPU{} and writing to GPU{}...".format(
-          gpuid[0], gpuid[1], gpuid[0]))
+    print(f"Run kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...")
     checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
     kernelHelper[0] = common.KernelHelper(simplep2p, gpuid[0])
-    _simpleKernel[0] = kernelHelper[0].getFunction(b'SimpleKernel')
+    _simpleKernel[0] = kernelHelper[0].getFunction(b"SimpleKernel")
     kernelArgs[0] = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p))
-    checkCudaErrors(cuda.cuLaunchKernel(_simpleKernel[0],
-                                        blocks.x, blocks.y, blocks.z,
-                                        threads.x, threads.y, threads.z,
-                                        0, 0,
-                                        kernelArgs[0], 0))
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            _simpleKernel[0],
+            blocks.x,
+            blocks.y,
+            blocks.z,
+            threads.x,
+            threads.y,
+            threads.z,
+            0,
+            0,
+            kernelArgs[0],
+            0,
+        )
+    )
 
     checkCudaErrors(cudart.cudaDeviceSynchronize())
 
     # Copy data back to host and verify
-    print("Copy data back to host from GPU{} and verify results...".format(gpuid[0]))
+    print(f"Copy data back to host from GPU{gpuid[0]} and verify results...")
     checkCudaErrors(cudart.cudaMemcpy(h0, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault))
 
     error_count = 0
@@ -172,7 +200,7 @@ def main():
         # Re-generate input data and apply 2x '* 2.0f' computation of both
         # kernel runs
         if h0_local[i] != float(i % 4096) * 2.0 * 2.0:
-            print("Verification error @ element {}: val = {}, ref = {}\n".format(i, h0_local[i], (float(i%4096)*2.0*2.0)))
+            print(f"Verification error @ element {i}: val = {h0_local[i]}, ref = {float(i % 4096) * 2.0 * 2.0}\n")
             error_count += 1
             if error_count > 10:
                 break
@@ -202,5 +230,6 @@ def main():
         sys.exit(-1)
     print("Test passed!")
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     main()
diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
index 834eb32af..267cb3924 100644
--- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
@@ -7,15 +7,17 @@
 # is strictly prohibited.
 import ctypes
 import math
-import numpy as np
 import random as rnd
 import sys
-from cuda import cuda, cudart
+
+import numpy as np
 from common import common
 from common.helper_cuda import checkCudaErrors
-from common.helper_string import checkCmdLineFlag
+from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt
 
-simpleZeroCopy = '''\
+from cuda import cuda, cudart
+
+simpleZeroCopy = """\
 extern "C"
 __global__ void vectorAddGPU(float *a, float *b, float *c, int N)
 {
@@ -26,7 +28,8 @@
         c[idx] = a[idx] + b[idx];
     }
 }
-'''
+"""
+
 
 def main():
     idev = 0
@@ -45,16 +48,16 @@ def main():
         idev = int(getCmdLineArgumentInt("device="))
 
         if idev >= deviceCount or idev < 0:
-            print("Device number {} is invalid, will use default CUDA device 0.".format(idev))
+            print(f"Device number {idev} is invalid, will use default CUDA device 0.")
             idev = 0
 
     if checkCmdLineFlag("use_generic_memory"):
         bPinGenericMemory = True
 
     if bPinGenericMemory:
-        print("> Using Generic System Paged Memory (malloc)");
+        print("> Using Generic System Paged Memory (malloc)")
     else:
-        print("> Using CUDA Host Allocated (cudaHostAlloc)");
+        print("> Using CUDA Host Allocated (cudaHostAlloc)")
 
     checkCudaErrors(cudart.cudaSetDevice(idev))
 
@@ -62,7 +65,7 @@ def main():
     deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(idev))
 
     if not deviceProp.canMapHostMemory:
-        print("Device {} does not support mapping CPU host memory!".format(idev))
+        print(f"Device {idev} does not support mapping CPU host memory!")
         return
 
     checkCudaErrors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost))
@@ -70,7 +73,7 @@ def main():
     # Allocate mapped CPU memory
 
     nelem = 1048576
-    num_bytes = nelem*np.dtype(np.float32).itemsize
+    num_bytes = nelem * np.dtype(np.float32).itemsize
 
     if bPinGenericMemory:
         a = np.empty(nelem, dtype=np.float32)
@@ -107,21 +110,33 @@ def main():
     block.y = 1
     block.z = 1
     grid = cudart.dim3()
-    grid.x = math.ceil(nelem/float(block.x))
+    grid.x = math.ceil(nelem / float(block.x))
     grid.y = 1
     grid.z = 1
     kernelHelper = common.KernelHelper(simpleZeroCopy, idev)
-    _vectorAddGPU = kernelHelper.getFunction(b'vectorAddGPU')
-    kernelArgs = ((d_a, d_b, d_c, nelem),(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int))
-    checkCudaErrors(cuda.cuLaunchKernel(_vectorAddGPU,
-                                        grid.x, grid.y, grid.z,
-                                        block.x, block.y, block.z,
-                                        0, cuda.CU_STREAM_LEGACY,
-                                        kernelArgs, 0))
+    _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU")
+    kernelArgs = (
+        (d_a, d_b, d_c, nelem),
+        (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
+    )
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            _vectorAddGPU,
+            grid.x,
+            grid.y,
+            grid.z,
+            block.x,
+            block.y,
+            block.z,
+            0,
+            cuda.CU_STREAM_LEGACY,
+            kernelArgs,
+            0,
+        )
+    )
     checkCudaErrors(cudart.cudaDeviceSynchronize())
 
-    print("> Checking the results from vectorAddGPU() ...");
-
+    print("> Checking the results from vectorAddGPU() ...")
     # Compare the results
     errorNorm = 0.0
     refNorm = 0.0
@@ -129,8 +144,8 @@ def main():
     for n in range(nelem):
         ref = a[n] + b[n]
         diff = c[n] - ref
-        errorNorm += diff*diff
-        refNorm += ref*ref
+        errorNorm += diff * diff
+        refNorm += ref * ref
 
     errorNorm = math.sqrt(errorNorm)
     refNorm = math.sqrt(refNorm)
@@ -148,10 +163,11 @@ def main():
         checkCudaErrors(cudart.cudaFreeHost(b))
         checkCudaErrors(cudart.cudaFreeHost(c))
 
-    if errorNorm/refNorm >= 1.0e-7:
+    if errorNorm / refNorm >= 1.0e-7:
         print("FAILED")
         sys.exit(-1)
     print("PASSED")
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     main()
diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
index f34f31958..64ae4d390 100644
--- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
+++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
@@ -6,14 +6,16 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 import ctypes
-import numpy as np
-import sys
 import os
-from cuda import cuda, cudart
+import sys
+
+import numpy as np
 from common import common
 from common.helper_cuda import checkCudaErrors, findCudaDevice
 
-systemWideAtomics =  '''\
+from cuda import cuda, cudart
+
+systemWideAtomics = """\
 #define LOOP_NUM 50
 
 extern "C"
@@ -54,9 +56,10 @@
         atomicXor_system(&atom_arr[9], tid);
   }
 }
-'''
+"""
+
+LOOP_NUM = 50
 
-LOOP_NUM = 50 
 
 #! Compute reference data set
 #! Each element is multiplied with the number of threads / array length
@@ -70,8 +73,8 @@ def verify(testData, length):
         val += 10
 
     if val != testData[0]:
-       print(f"atomicAdd failed val = {val} testData = {testData[0]}")
-       return False
+        print(f"atomicAdd failed val = {val} testData = {testData[0]}")
+        return False
 
     val = 0
     found = False
@@ -82,7 +85,7 @@ def verify(testData, length):
             break
 
     if not found:
-        print("atomicExch failed") 
+        print("atomicExch failed")
         return False
 
     val = -(1 << 8)
@@ -136,11 +139,11 @@ def verify(testData, length):
         print("atomicCAS failed")
         return False
 
-    val = 0xff
+    val = 0xFF
 
     for i in range(length):
         # 8th element should be 1
-        val &= (2 * i + 7)
+        val &= 2 * i + 7
 
     if val != testData[7]:
         print("atomicAnd failed")
@@ -152,11 +155,11 @@ def verify(testData, length):
         print("atomicOr failed")
         return False
 
-    val = 0xff
+    val = 0xFF
 
     for i in range(length):
         # 11th element should be 0xff
-        val ^= i;
+        val ^= i
 
     if val != testData[9]:
         print("atomicXor failed")
@@ -164,8 +167,9 @@ def verify(testData, length):
 
     return True
 
+
 def main():
-    if os.name == 'nt':
+    if os.name == "nt":
         print("Atomics not supported on Windows")
         return
 
@@ -182,7 +186,7 @@ def main():
         # This sample requires being run with a default or process exclusive mode
         print("This sample requires a device in either default or process exclusive mode")
         return
-    
+
     if device_prop.major < 6:
         print("Requires a minimum CUDA compute 6.0 capability, waiving testing.")
         return
@@ -197,24 +201,35 @@ def main():
         atom_arr = ctypes.addressof(atom_arr_h)
     else:
         print("CANNOT access pageable memory")
-        atom_arr = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * numData, cudart.cudaMemAttachGlobal))
+        atom_arr = checkCudaErrors(
+            cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * numData, cudart.cudaMemAttachGlobal)
+        )
         atom_arr_h = (ctypes.c_int * numData).from_address(atom_arr)
 
     for i in range(numData):
         atom_arr_h[i] = 0
 
     # To make the AND and XOR tests generate something other than 0...
-    atom_arr_h[7] = atom_arr_h[9] = 0xff
+    atom_arr_h[7] = atom_arr_h[9] = 0xFF
 
     kernelHelper = common.KernelHelper(systemWideAtomics, dev_id)
-    _atomicKernel = kernelHelper.getFunction(b'atomicKernel')
-    kernelArgs = ((atom_arr,),
-                  (ctypes.c_void_p,))
-    checkCudaErrors(cuda.cuLaunchKernel(_atomicKernel,
-                                        numBlocks, 1, 1,                         # grid dim
-                                        numThreads, 1, 1,                        # block dim
-                                        0, cuda.CU_STREAM_LEGACY,                # shared mem and stream
-                                        kernelArgs, 0))                          # arguments
+    _atomicKernel = kernelHelper.getFunction(b"atomicKernel")
+    kernelArgs = ((atom_arr,), (ctypes.c_void_p,))
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            _atomicKernel,
+            numBlocks,
+            1,
+            1,  # grid dim
+            numThreads,
+            1,
+            1,  # block dim
+            0,
+            cuda.CU_STREAM_LEGACY,  # shared mem and stream
+            kernelArgs,
+            0,
+        )
+    )  # arguments
     # NOTE: Python doesn't have an equivalent system atomic operations
     # atomicKernel_CPU(atom_arr_h, numBlocks * numThreads)
 
@@ -232,5 +247,6 @@ def main():
     if not testResult:
         sys.exit(-1)
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     main()
diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
index 329340406..8aae6b9da 100644
--- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
+++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
@@ -7,12 +7,15 @@
 # is strictly prohibited.
 import ctypes
 import math
+import sys
+
 import numpy as np
-from cuda import cuda
 from common import common
 from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV
 
-vectorAddDrv = '''\
+from cuda import cuda
+
+vectorAddDrv = """\
 /* Vector addition: C = A + B.
  *
  * This sample is a very basic sample that implements element by element
@@ -29,28 +32,29 @@
     if (i < N)
         C[i] = A[i] + B[i];
 }
-'''
+"""
+
 
 def main():
     print("Vector Addition (Driver API)")
     N = 50000
-    devID = 0
     size = N * np.dtype(np.float32).itemsize
 
     # Initialize
-    checkCudaErrors(cuda.cuInit(0));
-
+    checkCudaErrors(cuda.cuInit(0))
     cuDevice = findCudaDeviceDRV()
     # Create context
     cuContext = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice))
 
-    uvaSupported = checkCudaErrors(cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice))
+    uvaSupported = checkCudaErrors(
+        cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice)
+    )
     if not uvaSupported:
         print("Accessing pageable memory directly requires UVA")
         return
 
     kernelHelper = common.KernelHelper(vectorAddDrv, int(cuDevice))
-    _VecAdd_kernel = kernelHelper.getFunction(b'VecAdd_kernel')
+    _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel")
 
     # Allocate input vectors h_A and h_B in host memory
     h_A = np.random.rand(size).astype(dtype=np.float32)
@@ -69,17 +73,26 @@ def main():
     if True:
         # Grid/Block configuration
         threadsPerBlock = 256
-        blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock
+        blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock
 
-        kernelArgs = ((d_A, d_B, d_C, N),
-                      (None, None, None, ctypes.c_int))
+        kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int))
 
         # Launch the CUDA kernel
-        checkCudaErrors(cuda.cuLaunchKernel(_VecAdd_kernel,
-                                            blocksPerGrid, 1, 1,
-                                            threadsPerBlock, 1, 1,
-                                            0, 0,
-                                            kernelArgs, 0))
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _VecAdd_kernel,
+                blocksPerGrid,
+                1,
+                1,
+                threadsPerBlock,
+                1,
+                1,
+                0,
+                0,
+                kernelArgs,
+                0,
+            )
+        )
     else:
         pass
 
@@ -98,9 +111,10 @@ def main():
     checkCudaErrors(cuda.cuMemFree(d_C))
 
     checkCudaErrors(cuda.cuCtxDestroy(cuContext))
-    print("{}".format("Result = PASS" if i+1 == N else "Result = FAIL"))
-    if i+1 != N:
+    print("{}".format("Result = PASS" if i + 1 == N else "Result = FAIL"))
+    if i + 1 != N:
         sys.exit(-1)
 
+
 if __name__ == "__main__":
     main()
diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
index 8af4a8339..b0044dbf9 100644
--- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
+++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
@@ -7,13 +7,15 @@
 # is strictly prohibited.
 import ctypes
 import math
-import numpy as np
 import sys
-from cuda import cuda
+
+import numpy as np
 from common import common
 from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV
 
-vectorAddMMAP = '''\
+from cuda import cuda
+
+vectorAddMMAP = """\
 /* Vector addition: C = A + B.
  *
  * This sample is a very basic sample that implements element by element
@@ -30,10 +32,12 @@
     if (i < N)
         C[i] = A[i] + B[i];
 }
-'''
+"""
+
 
 def round_up(x, y):
-    return int((x - 1)/y + 1) * y
+    return int((x - 1) / y + 1) * y
+
 
 def getBackingDevices(cuDevice):
     num_devices = checkCudaErrors(cuda.cuDeviceGetCount())
@@ -50,16 +54,20 @@ def getBackingDevices(cuDevice):
             continue
 
         # The device needs to support virtual address management for the required apis to work
-        attributeVal = checkCudaErrors(cuda.cuDeviceGetAttribute(
-                                            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
-                                            cuDevice))
+        attributeVal = checkCudaErrors(
+            cuda.cuDeviceGetAttribute(
+                cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
+                cuDevice,
+            )
+        )
         if attributeVal == 0:
             continue
 
         backingDevices.append(cuda.CUdevice(dev))
     return backingDevices
 
-def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align = 0):
+
+def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0):
     min_granularity = 0
 
     # Setup the properties common for all the chunks
@@ -74,7 +82,9 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align = 0
     # (the max of the minimum granularity of each participating device)
     for device in residentDevices:
         prop.location.id = device
-        status, granularity = cuda.cuMemGetAllocationGranularity(prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM)
+        status, granularity = cuda.cuMemGetAllocationGranularity(
+            prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM
+        )
         if status != cuda.CUresult.CUDA_SUCCESS:
             return status, None, None
         if min_granularity < granularity:
@@ -84,7 +94,9 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align = 0
     # (the max of the minimum granularity of each participating device)
     for device in mappingDevices:
         prop.location.id = device
-        status, granularity = cuda.cuMemGetAllocationGranularity(prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM)
+        status, granularity = cuda.cuMemGetAllocationGranularity(
+            prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM
+        )
         if status != cuda.CUresult.CUDA_SUCCESS:
             return status, None, None
         if min_granularity < granularity:
@@ -123,10 +135,10 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align = 0
         # Since we do not need to make any other mappings of this memory or export it,
         # we no longer need and can release the allocationHandle.
         # The allocation will be kept live until it is unmapped.
-        status, = cuda.cuMemMap(int(dptr) + (stripeSize * idx), stripeSize, 0, allocationHandle, 0)
-        
+        (status,) = cuda.cuMemMap(int(dptr) + (stripeSize * idx), stripeSize, 0, allocationHandle, 0)
+
         # the handle needs to be released even if the mapping failed.
-        status2, = cuda.cuMemRelease(allocationHandle)
+        (status2,) = cuda.cuMemRelease(allocationHandle)
         if status != cuda.CUresult.CUDA_SUCCESS:
             # cuMemRelease should not have failed here
             # as the handle was just allocated successfully
@@ -151,13 +163,14 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align = 0
         accessDescriptors[idx].flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
 
     # Apply the access descriptors to the whole VA range.
-    status, = cuda.cuMemSetAccess(dptr, size, accessDescriptors, len(accessDescriptors))
+    (status,) = cuda.cuMemSetAccess(dptr, size, accessDescriptors, len(accessDescriptors))
     if status != cuda.CUresult.CUDA_SUCCESS:
         simpleFreeMultiDeviceMmap(dptr, size)
         return status, None, None
 
     return (status, dptr, allocationSize)
 
+
 def simpleFreeMultiDeviceMmap(dptr, size):
     # Unmap the mapped virtual memory region
     # Since the handles to the mapped backing stores have already been released
@@ -165,7 +178,7 @@ def simpleFreeMultiDeviceMmap(dptr, size):
     # The backing stores will be freed.
     # Since the memory has been unmapped after this call, accessing the specified
     # va range will result in a fault (unitll it is remapped).
-    status = cuda.cuMemUnmap(dptr, size);
+    status = cuda.cuMemUnmap(dptr, size)
     if status[0] != cuda.CUresult.CUDA_SUCCESS:
         return status
 
@@ -178,6 +191,7 @@ def simpleFreeMultiDeviceMmap(dptr, size):
         return status
     return status
 
+
 def main():
     print("Vector Addition (Driver API)")
     N = 50000
@@ -189,12 +203,15 @@ def main():
     cuDevice = findCudaDeviceDRV()
 
     # Check that the selected device supports virtual address management
-    attributeVal = checkCudaErrors(cuda.cuDeviceGetAttribute(
-                        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
-                        cuDevice))
-    print("Device {} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {}.".format(cuDevice, attributeVal))
+    attributeVal = checkCudaErrors(
+        cuda.cuDeviceGetAttribute(
+            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
+            cuDevice,
+        )
+    )
+    print(f"Device {cuDevice} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attributeVal}.")
     if not attributeVal:
-        print("Device {} doesn't support VIRTUAL ADDRESS MANAGEMENT.".format(cuDevice))
+        print(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.")
         return
 
     # The vector addition happens on cuDevice, so the allocations need to be mapped there.
@@ -207,7 +224,7 @@ def main():
     cuContext = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice))
 
     kernelHelper = common.KernelHelper(vectorAddMMAP, int(cuDevice))
-    _VecAdd_kernel = kernelHelper.getFunction(b'VecAdd_kernel')
+    _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel")
 
     # Allocate input vectors h_A and h_B in host memory
     h_A = np.random.rand(size).astype(dtype=np.float32)
@@ -231,17 +248,26 @@ def main():
 
     # Grid/Block configuration
     threadsPerBlock = 256
-    blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock
+    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock
 
-    kernelArgs = ((d_A, d_B, d_C, N),
-                  (None, None, None, ctypes.c_int))
+    kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int))
 
     # Launch the CUDA kernel
-    checkCudaErrors(cuda.cuLaunchKernel(_VecAdd_kernel,
-                                        blocksPerGrid, 1, 1,
-                                        threadsPerBlock, 1, 1,
-                                        0, 0,
-                                        kernelArgs, 0))
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            _VecAdd_kernel,
+            blocksPerGrid,
+            1,
+            1,
+            threadsPerBlock,
+            1,
+            1,
+            0,
+            0,
+            kernelArgs,
+            0,
+        )
+    )
 
     # Copy result from device memory to host memory
     # h_C contains the result in host memory
@@ -259,9 +285,10 @@ def main():
 
     checkCudaErrors(cuda.cuCtxDestroy(cuContext))
 
-    print("{}".format("Result = PASS" if i+1 == N else "Result = FAIL"))
-    if i+1 != N:
+    print("{}".format("Result = PASS" if i + 1 == N else "Result = FAIL"))
+    if i + 1 != N:
         sys.exit(-1)
 
+
 if __name__ == "__main__":
     main()
diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
index 84af8717f..71dc797f7 100644
--- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
+++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
@@ -7,15 +7,17 @@
 # is strictly prohibited.
 import ctypes
 import math
-import numpy as np
 import random as rnd
 import sys
-from cuda import cuda, cudart
+
+import numpy as np
 from common import common
 from common.helper_cuda import checkCudaErrors, findCudaDevice
 from common.helper_string import checkCmdLineFlag
 
-streamOrderedAllocation = '''\
+from cuda import cuda, cudart
+
+streamOrderedAllocation = """\
 /* Add two vectors on the GPU */
 extern "C"
 __global__ void vectorAddGPU(const float *a, const float *b, float *c, int N)
@@ -26,12 +28,13 @@
         c[idx] =  a[idx] + b[idx];
     }
 }
-'''
+"""
 
 MAX_ITER = 20
 
+
 def basicStreamOrderedAllocation(dev, nelem, a, b, c):
-    num_bytes = nelem*np.dtype(np.float32).itemsize
+    num_bytes = nelem * np.dtype(np.float32).itemsize
 
     print("Starting basicStreamOrderedAllocation()")
     checkCudaErrors(cudart.cudaSetDevice(dev))
@@ -48,17 +51,29 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c):
     block.y = 1
     block.z = 1
     grid = cudart.dim3()
-    grid.x = math.ceil(nelem/float(block.x))
+    grid.x = math.ceil(nelem / float(block.x))
     grid.y = 1
     grid.z = 1
 
-    kernelArgs = ((d_a, d_b, d_c, nelem),
-                  (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int))
-    checkCudaErrors(cuda.cuLaunchKernel(_vectorAddGPU,
-                                        grid.x, grid.y, grid.z,    # grid dim
-                                        block.x, block.y, block.z, # block dim
-                                        0, stream,                 # shared mem and stream
-                                        kernelArgs, 0))            # arguments
+    kernelArgs = (
+        (d_a, d_b, d_c, nelem),
+        (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
+    )
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            _vectorAddGPU,
+            grid.x,  # grid dim (x, y, z)
+            grid.y,
+            grid.z,
+            block.x,  # block dim (x, y, z)
+            block.y,
+            block.z,
+            0,  # shared mem
+            stream,  # stream
+            kernelArgs,  # arguments
+            0,
+        )
+    )
 
     checkCudaErrors(cudart.cudaFreeAsync(d_a, stream))
     checkCudaErrors(cudart.cudaFreeAsync(d_b, stream))
@@ -67,31 +82,32 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c):
     checkCudaErrors(cudart.cudaStreamSynchronize(stream))
 
     # Compare the results
-    print("> Checking the results from vectorAddGPU() ...");
+    print("> Checking the results from vectorAddGPU() ...")
     errorNorm = 0.0
     refNorm = 0.0
 
     for n in range(nelem):
         ref = a[n] + b[n]
         diff = c[n] - ref
-        errorNorm += diff*diff
-        refNorm += ref*ref
+        errorNorm += diff * diff
+        refNorm += ref * ref
 
     errorNorm = math.sqrt(errorNorm)
     refNorm = math.sqrt(refNorm)
 
-    if errorNorm/refNorm < 1.e-6:
+    if errorNorm / refNorm < 1.0e-6:
         print("basicStreamOrderedAllocation PASSED")
-    
+
     checkCudaErrors(cudart.cudaStreamDestroy(stream))
 
-    return errorNorm/refNorm < 1.e-6
+    return errorNorm / refNorm < 1.0e-6
+
 
 # streamOrderedAllocationPostSync(): demonstrates If the application wants the memory to persist in the pool beyond
 # synchronization, then it sets the release threshold on the pool. This way, when the application reaches the "steady state",
 # it is no longer allocating/freeing memory from the OS.
-def streamOrderedAllocationPostSync(dev, nelem, a, b, c) :
-    num_bytes = nelem*np.dtype(np.float32).itemsize
+def streamOrderedAllocationPostSync(dev, nelem, a, b, c):
+    num_bytes = nelem * np.dtype(np.float32).itemsize
 
     print("Starting streamOrderedAllocationPostSync()")
     checkCudaErrors(cudart.cudaSetDevice(dev))
@@ -102,13 +118,18 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c) :
     memPool = checkCudaErrors(cudart.cudaDeviceGetDefaultMemPool(dev))
     thresholdVal = cuda.cuuint64_t(ctypes.c_uint64(-1).value)
     # Set high release threshold on the default pool so that cudaFreeAsync will not actually release memory to the system.
-    # By default, the release threshold for a memory pool is set to zero. This implies that the CUDA driver is 
+    # By default, the release threshold for a memory pool is set to zero. This implies that the CUDA driver is
     # allowed to release a memory chunk back to the system as long as it does not contain any active suballocations.
-    checkCudaErrors(cudart.cudaMemPoolSetAttribute(memPool, cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold, thresholdVal));
-
+    checkCudaErrors(
+        cudart.cudaMemPoolSetAttribute(
+            memPool,
+            cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold,
+            thresholdVal,
+        )
+    )
     # Record teh start event
     checkCudaErrors(cudart.cudaEventRecord(start, stream))
-    for i in range(MAX_ITER):
+    for _i in range(MAX_ITER):
         d_a = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream))
         d_b = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream))
         d_c = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream))
@@ -120,17 +141,29 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c) :
         block.y = 1
         block.z = 1
         grid = cudart.dim3()
-        grid.x = math.ceil(nelem/float(block.x))
+        grid.x = math.ceil(nelem / float(block.x))
         grid.y = 1
         grid.z = 1
 
-        kernelArgs = ((d_a, d_b, d_c, nelem),
-                      (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int))
-        checkCudaErrors(cuda.cuLaunchKernel(_vectorAddGPU,
-                                            grid.x, grid.y, grid.z,    # grid dim
-                                            block.x, block.y, block.z, # block dim
-                                            0, stream,                 # shared mem and stream
-                                            kernelArgs, 0))            # arguments
+        kernelArgs = (
+            (d_a, d_b, d_c, nelem),
+            (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
+        )
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _vectorAddGPU,
+                grid.x,  # grid dim (x, y, z)
+                grid.y,
+                grid.z,
+                block.x,  # block dim (x, y, z)
+                block.y,
+                block.z,
+                0,  # shared mem
+                stream,  # stream
+                kernelArgs,  # arguments
+                0,
+            )
+        )
 
         checkCudaErrors(cudart.cudaFreeAsync(d_a, stream))
         checkCudaErrors(cudart.cudaFreeAsync(d_b, stream))
@@ -142,7 +175,7 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c) :
     checkCudaErrors(cudart.cudaEventSynchronize(end))
 
     msecTotal = checkCudaErrors(cudart.cudaEventElapsedTime(start, end))
-    print("Total elapsed time = {} ms over {} iterations".format(msecTotal, MAX_ITER))
+    print(f"Total elapsed time = {msecTotal} ms over {MAX_ITER} iterations")
 
     # Compare the results
     print("> Checking the results from vectorAddGPU() ...")
@@ -152,25 +185,26 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c) :
     for n in range(nelem):
         ref = a[n] + b[n]
         diff = c[n] - ref
-        errorNorm += diff*diff
-        refNorm += ref*ref
+        errorNorm += diff * diff
+        refNorm += ref * ref
 
     errorNorm = math.sqrt(errorNorm)
     refNorm = math.sqrt(refNorm)
 
-    if errorNorm/refNorm < 1.e-6:
+    if errorNorm / refNorm < 1.0e-6:
         print("streamOrderedAllocationPostSync PASSED")
 
     checkCudaErrors(cudart.cudaStreamDestroy(stream))
 
-    return errorNorm/refNorm < 1.e-6
+    return errorNorm / refNorm < 1.0e-6
+
 
 def main():
     cuda.cuInit(0)
     if checkCmdLineFlag("help"):
-        print("Usage:  streamOrderedAllocation [OPTION]\n");
-        print("Options:");
-        print("  device=[device #]  Specify the device to be used");
+        print("Usage:  streamOrderedAllocation [OPTION]\n")
+        print("Options:")
+        print("  device=[device #]  Specify the device to be used")
         return
 
     dev = findCudaDevice()
@@ -179,22 +213,24 @@ def main():
     if version < 11030:
         isMemPoolSupported = False
     else:
-        isMemPoolSupported = checkCudaErrors(cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev))
+        isMemPoolSupported = checkCudaErrors(
+            cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev)
+        )
     if not isMemPoolSupported:
         print("Waiving execution as device does not support Memory Pools")
         return
 
     global _vectorAddGPU
     kernelHelper = common.KernelHelper(streamOrderedAllocation, dev)
-    _vectorAddGPU = kernelHelper.getFunction(b'vectorAddGPU')
+    _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU")
 
     # Allocate CPU memory
     nelem = 1048576
-    num_bytes = nelem*np.dtype(np.float32).itemsize
+    nelem * np.dtype(np.float32).itemsize
 
-    a = np.zeros(nelem, dtype='float32')
-    b = np.zeros(nelem, dtype='float32')
-    c = np.zeros(nelem, dtype='float32')
+    a = np.zeros(nelem, dtype="float32")
+    b = np.zeros(nelem, dtype="float32")
+    c = np.zeros(nelem, dtype="float32")
     # Initialize the vectors
     for i in range(nelem):
         a[i] = rnd.random()
@@ -206,5 +242,6 @@ def main():
     if not ret1 or not ret2:
         sys.exit(-1)
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     main()
diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
index bb6b5cb04..aacd99cf3 100644
--- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
+++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
@@ -7,31 +7,43 @@
 # is strictly prohibited.
 import ctypes
 import math
-import numpy as np
 import sys
-import pytest
-from cuda import cuda, cudart
 from enum import Enum
+
+import numpy as np
+import pytest
 from common import common
 from common.helper_cuda import checkCudaErrors, findCudaDevice
 from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt
 
+from cuda import cuda, cudart
+
 blockSize = 16
+
+
 class kernels(Enum):
-    AsyncCopyMultiStageLargeChunk  = 0
-    AsyncCopyLargeChunk            = 1
-    AsyncCopyLargeChunkAWBarrier   = 2
+    AsyncCopyMultiStageLargeChunk = 0
+    AsyncCopyLargeChunk = 1
+    AsyncCopyLargeChunkAWBarrier = 2
     AsyncCopyMultiStageSharedState = 3
-    AsyncCopyMultiStage            = 4
-    AsyncCopySingleStage           = 5
-    Naive                          = 6
-    NaiveLargeChunk                = 7
-
-kernelNames = ["AsyncCopyMultiStageLargeChunk", "AsyncCopyLargeChunk",
-               "AsyncCopyLargeChunkAWBarrier", "AsyncCopyMultiStageSharedState",
-               "AsyncCopyMultiStage", "AsyncCopySingleStage", "Naive", "NaiveLargeChunk"]
-     
-globalToShmemAsyncCopy = '''\
+    AsyncCopyMultiStage = 4
+    AsyncCopySingleStage = 5
+    Naive = 6
+    NaiveLargeChunk = 7
+
+
+kernelNames = [
+    "AsyncCopyMultiStageLargeChunk",
+    "AsyncCopyLargeChunk",
+    "AsyncCopyLargeChunkAWBarrier",
+    "AsyncCopyMultiStageSharedState",
+    "AsyncCopyMultiStage",
+    "AsyncCopySingleStage",
+    "Naive",
+    "NaiveLargeChunk",
+]
+
+globalToShmemAsyncCopy = """\
 #line __LINE__
 #if __CUDA_ARCH__ >= 700
 #include <cuda/barrier>
@@ -41,16 +53,16 @@ class kernels(Enum):
 #include <cuda/pipeline>
 namespace cg = cooperative_groups;
 
-#define BLOCK_SIZE 16 
+#define BLOCK_SIZE 16
 #define BLOCK_SIZE_X 16
 
 // Multi Stage memcpy_async pipeline with large chunk copy
 extern "C"
-__global__ void MatrixMulAsyncCopyMultiStageLargeChunk(float* __restrict__ C, 
+__global__ void MatrixMulAsyncCopyMultiStageLargeChunk(float* __restrict__ C,
                                                        const float* __restrict__ A,
                                                        const float* __restrict__ B, int wA,
                                                        int wB) {
-    // Requires BLOCK_SIZE % 4 == 0 
+    // Requires BLOCK_SIZE % 4 == 0
 
     // Multi-stage pipeline version
     constexpr size_t maxPipelineStages = 4;
@@ -125,17 +137,17 @@ class kernels(Enum):
 
     // Write the block sub-matrix to device memory;
     // each thread writes four element
-    int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;    
+    int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
     C[c + wB * threadIdx.y + threadIdx.x] = Csub;
 }
 
 // Single Stage memcpy_async pipeline with Large copy chunk (float4)
 extern "C"
-__global__ void MatrixMulAsyncCopyLargeChunk(float* __restrict__ C, 
+__global__ void MatrixMulAsyncCopyLargeChunk(float* __restrict__ C,
                                                         const float* __restrict__ A,
                                                         const float* __restrict__ B, int wA,
                                                         int wB) {
-    // Requires BLOCK_SIZE % 4 == 0 
+    // Requires BLOCK_SIZE % 4 == 0
 
     // Declaration of the shared memory array As used to
     // store the sub-matrix of A
@@ -170,7 +182,7 @@ class kernels(Enum):
     // Loop over all the sub-matrices of A and B
     // required to compute the block sub-matrix
     for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
-        // Load the matrices from device memory to shared memory; 
+        // Load the matrices from device memory to shared memory;
         // a subset of threads loads a contiguous chunk of elements.
 
         // Previously, per-thread:
@@ -216,13 +228,13 @@ class kernels(Enum):
 
 // Single Stage memcpy_async pipeline with Large copy chunk (float4) using arrive-wait barrier
 extern "C"
-__global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(float* __restrict__ C, 
+__global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(float* __restrict__ C,
                                                       const float* __restrict__ A,
                                                       const float* __restrict__ B, int wA,
                                                       int wB) {
 #if __CUDA_ARCH__ >= 700
 #pragma diag_suppress static_var_with_dynamic_init
-    // Requires BLOCK_SIZE % 4 == 0 
+    // Requires BLOCK_SIZE % 4 == 0
 
     __shared__ cuda::barrier<cuda::thread_scope_block> bar;
 
@@ -261,7 +273,7 @@ class kernels(Enum):
     // Loop over all the sub-matrices of A and B
     // required to compute the block sub-matrix
     for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
-        // Load the matrices from device memory to shared memory; 
+        // Load the matrices from device memory to shared memory;
         // a subset of threads loads a contiguous chunk of elements.
 
         // Now, one fourth of the threads load four elements of each matrix
@@ -375,7 +387,7 @@ class kernels(Enum):
 
 // Multi Stage memcpy_async thread_scope_thread pipeline with single-element async-copy
 extern "C"
-__global__ void MatrixMulAsyncCopyMultiStage(float* __restrict__ C, 
+__global__ void MatrixMulAsyncCopyMultiStage(float* __restrict__ C,
                                                         const float* __restrict__ A,
                                                         const float* __restrict__ B, int wA,
                                                         int wB) {
@@ -461,7 +473,7 @@ class kernels(Enum):
 // group which issues memcpy_async operations and rest all warps are part of
 // consumer group which perform gemm computation on the loaded matrices by producer.
 extern "C"
-__global__ void MatrixMulAsyncCopyMultiStageSharedState(float* __restrict__ C, 
+__global__ void MatrixMulAsyncCopyMultiStageSharedState(float* __restrict__ C,
                                                         const float* __restrict__ A,
                                                         const float* __restrict__ B, int wA,
                                                         int wB) {
@@ -510,7 +522,7 @@ class kernels(Enum):
                                                 a <= aEnd; a += aStep, b += bStep, ++i) {
         if (threadIdx.y >= consumer_row_count) {
             // this is a whole producer warp because threadIdx.y >= 16 where 16 == consumer_row_count,
-            // which loads the matrices from device memory to shared memory; 
+            // which loads the matrices from device memory to shared memory;
             for (; aStage <= a + aStep * maxPipelineStages; aStage += aStep, bStage += bStep, ++iStage) {
                 if (aStage <= aEnd) {
                     // Rotating buffer
@@ -518,7 +530,7 @@ class kernels(Enum):
                     const int strideRows = (blockDim.y - consumer_row_count);
                     pipe.producer_acquire();
                     for (int rowId = threadIdx.y - consumer_row_count; rowId < BLOCK_SIZE_X; rowId += strideRows) {
-                        cuda::memcpy_async(&As[j][rowId][threadIdx.x], 
+                        cuda::memcpy_async(&As[j][rowId][threadIdx.x],
                                             &A[aStage + wA * rowId + threadIdx.x], shape1, pipe);
                         cuda::memcpy_async(&Bs[j][rowId][threadIdx.x],
                                             &B[bStage + wB * rowId + threadIdx.x], shape1, pipe);
@@ -529,7 +541,7 @@ class kernels(Enum):
         }
         else {
             // this is a whole set of consumer group because threadIdx.y < consumer_row_count where consumer_row_count == 16,
-            // which computes gemm operation on matrices loaded in shared memory by producer warp. 
+            // which computes gemm operation on matrices loaded in shared memory by producer warp.
             const int j = i % maxPipelineStages;
             // Synchronize consumer group to make sure the matrices are loaded by producer group.
             pipe.consumer_wait();
@@ -663,7 +675,7 @@ class kernels(Enum):
             a += aStep, b += bStep) {
 
         // Load the matrices from device memory
-        // to shared memory; 
+        // to shared memory;
 
         // One fourth of the threads load four elements of each matrix
         if ( t4x < BLOCK_SIZE ) {
@@ -697,13 +709,15 @@ class kernels(Enum):
     int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x;
     C[c + wB * threadIdx.y + threadIdx.x] = Csub;
 }
-'''
+"""
+
 
 def ConstantInit(data, size, val):
     p_data = (ctypes.c_float * size).from_address(data)
     for i in range(size):
         p_data[i] = val
 
+
 #
 # Run matrix multiplication using CUDA
 #
@@ -768,74 +782,147 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
     gridSharedStateKernel.x = dimsB.x / threadsSharedStateKernel.x
     gridSharedStateKernel.y = dimsA.y / threadsSharedStateKernel.x
 
-    print("Running kernel = {} - {}".format(kernel_number, kernelNames[kernel_number.value]))
+    print(f"Running kernel = {kernel_number} - {kernelNames[kernel_number.value]}")
     # Create and start timer
     print("Computing result using CUDA Kernel...")
 
     # Performs warmup operation using matrixMul CUDA kernel
-    kernelArguments = ((d_C, d_A, d_B, dimsA.x, dimsB.x),
-                       (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int))
+    kernelArguments = (
+        (d_C, d_A, d_B, dimsA.x, dimsB.x),
+        (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int),
+    )
     if kernel_number == kernels.AsyncCopyMultiStageLargeChunk:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStageLargeChunk, 
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopyMultiStageLargeChunk,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.AsyncCopyLargeChunk:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyLargeChunk,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopyLargeChunk,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyLargeChunkAWBarrier,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopyLargeChunkAWBarrier,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.AsyncCopyMultiStageSharedState:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStageSharedState,
-                                            gridSharedStateKernel.x, gridSharedStateKernel.y, gridSharedStateKernel.z,          # grid dim
-                                            threadsSharedStateKernel.x, threadsSharedStateKernel.y, threadsSharedStateKernel.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopyMultiStageSharedState,
+                gridSharedStateKernel.x,
+                gridSharedStateKernel.y,
+                gridSharedStateKernel.z,  # grid dim
+                threadsSharedStateKernel.x,
+                threadsSharedStateKernel.y,
+                threadsSharedStateKernel.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.AsyncCopyMultiStage:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStage,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopyMultiStage,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.AsyncCopySingleStage:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopySingleStage,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopySingleStage,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.Naive:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulNaive,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulNaive,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.NaiveLargeChunk:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulNaiveLargeChunk,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
-
-    print('done')
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulNaiveLargeChunk,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
+
+    print("done")
     checkCudaErrors(cudart.cudaStreamSynchronize(stream))
 
-
     # Execute the kernel
     nIter = 100
 
@@ -843,61 +930,133 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
     checkCudaErrors(cudart.cudaEventRecord(start, stream))
 
     if kernel_number == kernels.AsyncCopyMultiStageLargeChunk:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStageLargeChunk, 
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopyMultiStageLargeChunk,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.AsyncCopyLargeChunk:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyLargeChunk,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopyLargeChunk,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyLargeChunkAWBarrier,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopyLargeChunkAWBarrier,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.AsyncCopyMultiStageSharedState:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStageSharedState,
-                                            gridSharedStateKernel.x, gridSharedStateKernel.y, gridSharedStateKernel.z,          # grid dim
-                                            threadsSharedStateKernel.x, threadsSharedStateKernel.y, threadsSharedStateKernel.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopyMultiStageSharedState,
+                gridSharedStateKernel.x,
+                gridSharedStateKernel.y,
+                gridSharedStateKernel.z,  # grid dim
+                threadsSharedStateKernel.x,
+                threadsSharedStateKernel.y,
+                threadsSharedStateKernel.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.AsyncCopyMultiStage:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStage,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopyMultiStage,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.AsyncCopySingleStage:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopySingleStage,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulAsyncCopySingleStage,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.Naive:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulNaive,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulNaive,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
     elif kernel_number == kernels.NaiveLargeChunk:
-        checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulNaiveLargeChunk,
-                                            grid.x, grid.y, grid.z,          # grid dim
-                                            threads.x, threads.y, threads.z, # block dim
-                                            0,                               # shared mem
-                                            stream,                          # stream
-                                            kernelArguments, 0))             # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                _MatrixMulNaiveLargeChunk,
+                grid.x,
+                grid.y,
+                grid.z,  # grid dim
+                threads.x,
+                threads.y,
+                threads.z,  # block dim
+                0,  # shared mem
+                stream,  # stream
+                kernelArguments,
+                0,
+            )
+        )  # arguments
 
     # Record the stop event
     checkCudaErrors(cudart.cudaEventRecord(stop, stream))
@@ -912,11 +1071,9 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
     flopsPerMatrixMul = 2.0 * dimsA.x * dimsA.y * dimsB.x
     gigaFlops = (flopsPerMatrixMul * 1.0e-9) / (msecPerMatrixMul / 1000.0)
 
-    print("Performance= {:.2f} GFlop/s, Time= {:.2f} msec, Size= {:.0f} Ops, WorkgroupSize= {} threads/block".format(
-            gigaFlops,
-            msecPerMatrixMul,
-            flopsPerMatrixMul,
-            threads.x * threads.y))
+    print(
+        f"Performance= {gigaFlops:.2f} GFlop/s, Time= {msecPerMatrixMul:.2f} msec, Size= {flopsPerMatrixMul:.0f} Ops, WorkgroupSize= {threads.x * threads.y} threads/block"
+    )
 
     # Copy result from device to host
     checkCudaErrors(cudart.cudaMemcpyAsync(h_C, d_C, mem_size_C, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream))
@@ -927,7 +1084,7 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
 
     # test relative error by the formula
     # |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
-    eps = 1.e-6
+    eps = 1.0e-6
 
     h_C_local = (ctypes.c_float * (dimsC.x * dimsC.y)).from_address(h_C)
     for i in range(dimsC.x * dimsC.y):
@@ -937,10 +1094,10 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
         rel_err = abs_err / abs_val / dot_length
 
         if rel_err > eps:
-            print("Error! Matrix[{:.5f}]={:.8f} ref={:.8f} err term is > {}".format(i, h_C_local[i], dimsA.x * valB, rel_err))
+            print(f"Error! Matrix[{i:.5f}]={h_C_local[i]:.8f} ref={dimsA.x * valB:.8f} err term is > {rel_err}")
             correct = False
 
-    print("Result = PASS" if correct else "Result = FAIL")   
+    print("Result = PASS" if correct else "Result = FAIL")
 
     # Clean up memory
     checkCudaErrors(cudart.cudaFreeHost(h_A))
@@ -951,15 +1108,17 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
     checkCudaErrors(cudart.cudaFree(d_C))
     checkCudaErrors(cudart.cudaEventDestroy(start))
     checkCudaErrors(cudart.cudaEventDestroy(stop))
-    print("\nNOTE: The CUDA Samples are not meant for performance "\
-          "measurements. Results may vary when GPU Boost is enabled.");
-
+    print(
+        "\nNOTE: The CUDA Samples are not meant for performance "
+        "measurements. Results may vary when GPU Boost is enabled."
+    )
     if correct:
         return 0
     return -1
 
+
 def checkKernelCompiles():
-    kernel_headers = '''\
+    kernel_headers = """\
     #line __LINE__
     #if __CUDA_ARCH__ >= 700
     #include <cuda/barrier>
@@ -967,7 +1126,7 @@ def checkKernelCompiles():
     #include <cooperative_groups.h>
     #include <cooperative_groups/reduce.h>
     #include <cuda/pipeline>
-    '''
+    """
     try:
         common.KernelHelper(kernel_headers, findCudaDevice())
     except:
@@ -977,6 +1136,7 @@ def checkKernelCompiles():
         return False
     return True
 
+
 @pytest.mark.skipif(not checkKernelCompiles(), reason="Automation filter against incompatible kernel")
 def main():
     print("[globalToShmemAsyncCopy] - Starting...")
@@ -986,13 +1146,15 @@ def main():
         print("CUDA Toolkit 11.1 or greater is required")
         return
 
-    if (checkCmdLineFlag("help") or checkCmdLineFlag("?")):
+    if checkCmdLineFlag("help") or checkCmdLineFlag("?"):
         print("Usage device=n (n >= 0 for deviceID)")
         print("      wA=WidthA hA=HeightA (Width x Height of Matrix A)")
         print("      wB=WidthB hB=HeightB (Width x Height of Matrix B)")
         print("      kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - AsyncCopyLargeChunk)")
         print("                            (2 - AsyncCopyLargeChunkAWBarrier; 3 - AsyncCopyMultiStageSharedState)")
-        print("                            (4 - AsyncCopyMultiStage; 5 - AsyncCopySingleStage; 6 - Naive without memcpy_async)")
+        print(
+            "                            (4 - AsyncCopyMultiStage; 5 - AsyncCopySingleStage; 6 - Naive without memcpy_async)"
+        )
         print("                            (7 - NaiveLargeChunk without memcpy_async)")
         print("  Note: Outer matrix dimensions of A & B matrices must be equal.")
         return
@@ -1026,7 +1188,7 @@ def main():
         dimsB.y = int(getCmdLineArgumentInt("hB="))
 
     if dimsA.x != dimsB.y:
-        print("Error: outer matrix dimensions must be equal. ({} != {})".format(dimsA.x, dimsB.y))
+        print(f"Error: outer matrix dimensions must be equal. ({dimsA.x} != {dimsB.y})")
         sys.exit(-1)
 
     selected_kernel = kernels.AsyncCopyMultiStageLargeChunk
@@ -1037,16 +1199,17 @@ def main():
         if kernel_number < 8:
             selected_kernel = kernels(kernel_number)
         else:
-            print("Error: kernel number should be between 0 to 7, you have entered %d".format(kernel_number))
+            print(f"Error: kernel number should be between 0 to 7, you have entered {kernel_number}")
             sys.exit(-1)
 
-    major = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID))
+    major = checkCudaErrors(
+        cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID)
+    )
     if major < 7:
         print("globalToShmemAsyncCopy requires SM 7.0 or higher.  Exiting...")
         return
 
-    print("MatrixA({},{}), MatrixB({},{})".format(dimsA.x, dimsA.y,
-                                                  dimsB.x, dimsB.y))
+    print(f"MatrixA({dimsA.x},{dimsA.y}), MatrixB({dimsB.x},{dimsB.y})")
 
     global _MatrixMulAsyncCopyMultiStageLargeChunk
     global _MatrixMulAsyncCopyLargeChunk
@@ -1057,19 +1220,20 @@ def main():
     global _MatrixMulNaive
     global _MatrixMulNaiveLargeChunk
     kernelHelper = common.KernelHelper(globalToShmemAsyncCopy, devID)
-    _MatrixMulAsyncCopyMultiStageLargeChunk = kernelHelper.getFunction(b'MatrixMulAsyncCopyMultiStageLargeChunk')
-    _MatrixMulAsyncCopyLargeChunk = kernelHelper.getFunction(b'MatrixMulAsyncCopyLargeChunk')
-    _MatrixMulAsyncCopyLargeChunkAWBarrier = kernelHelper.getFunction(b'MatrixMulAsyncCopyLargeChunkAWBarrier')
-    _MatrixMulAsyncCopyMultiStageSharedState = kernelHelper.getFunction(b'MatrixMulAsyncCopyMultiStageSharedState')
-    _MatrixMulAsyncCopyMultiStage = kernelHelper.getFunction(b'MatrixMulAsyncCopyMultiStage')
-    _MatrixMulAsyncCopySingleStage = kernelHelper.getFunction(b'MatrixMulAsyncCopySingleStage')
-    _MatrixMulNaive = kernelHelper.getFunction(b'MatrixMulNaive')
-    _MatrixMulNaiveLargeChunk = kernelHelper.getFunction(b'MatrixMulNaiveLargeChunk')
+    _MatrixMulAsyncCopyMultiStageLargeChunk = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageLargeChunk")
+    _MatrixMulAsyncCopyLargeChunk = kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunk")
+    _MatrixMulAsyncCopyLargeChunkAWBarrier = kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunkAWBarrier")
+    _MatrixMulAsyncCopyMultiStageSharedState = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageSharedState")
+    _MatrixMulAsyncCopyMultiStage = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStage")
+    _MatrixMulAsyncCopySingleStage = kernelHelper.getFunction(b"MatrixMulAsyncCopySingleStage")
+    _MatrixMulNaive = kernelHelper.getFunction(b"MatrixMulNaive")
+    _MatrixMulNaiveLargeChunk = kernelHelper.getFunction(b"MatrixMulNaiveLargeChunk")
 
     matrix_result = MatrixMultiply(dimsA, dimsB, selected_kernel)
 
     if matrix_result != 0:
         sys.exit(-1)
 
+
 if __name__ == "__main__":
     main()
diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py
index 7a895acb8..ee8343632 100644
--- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py
+++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py
@@ -6,17 +6,19 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 import ctypes
+import random as rnd
+
 import numpy as np
 import pytest
-import random as rnd
-from cuda import cuda, cudart
 from common import common
 from common.helper_cuda import checkCudaErrors, findCudaDevice
 
+from cuda import cuda, cudart
+
 THREADS_PER_BLOCK = 512
 GRAPH_LAUNCH_ITERATIONS = 3
 
-simpleCudaGraphs = '''\
+simpleCudaGraphs = """\
 #include <cooperative_groups.h>
 #include <cuda_runtime.h>
 
@@ -113,7 +115,8 @@
     // write result for this block to global mem
     if (cta.thread_rank() == 0) result[0] = temp_sum;
 }
-'''
+"""
+
 
 def init_input(a, size):
     ctypes.c_float.from_address(a)
@@ -121,6 +124,7 @@ def init_input(a, size):
     for i in range(0, size):
         a_list[i] = rnd.random()
 
+
 def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks):
     result_h = ctypes.c_double(0.0)
     nodeDependencies = []
@@ -133,17 +137,21 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n
 
     memcpyParams.srcArray = None
     memcpyParams.srcPos = cudart.make_cudaPos(0, 0, 0)
-    memcpyParams.srcPtr = cudart.make_cudaPitchedPtr(inputVec_h, np.dtype(np.float32).itemsize * inputSize, inputSize, 1)
+    memcpyParams.srcPtr = cudart.make_cudaPitchedPtr(
+        inputVec_h, np.dtype(np.float32).itemsize * inputSize, inputSize, 1
+    )
     memcpyParams.dstArray = None
     memcpyParams.dstPos = cudart.make_cudaPos(0, 0, 0)
-    memcpyParams.dstPtr = cudart.make_cudaPitchedPtr(inputVec_d, np.dtype(np.float32).itemsize * inputSize, inputSize, 1)
+    memcpyParams.dstPtr = cudart.make_cudaPitchedPtr(
+        inputVec_d, np.dtype(np.float32).itemsize * inputSize, inputSize, 1
+    )
     memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float32).itemsize * inputSize, 1, 1)
     memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
 
     memsetParams.dst = outputVec_d
     memsetParams.value = 0
     memsetParams.pitch = 0
-    memsetParams.elementSize = np.dtype(np.float32).itemsize # elementSize can be max 4 bytes
+    memsetParams.elementSize = np.dtype(np.float32).itemsize  # elementSize can be max 4 bytes
     memsetParams.width = numOfBlocks * 2
     memsetParams.height = 1
 
@@ -155,8 +163,10 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n
     nodeDependencies.append(memsetNode)
     nodeDependencies.append(memcpyNode)
 
-    kernelArgs = ((inputVec_d, outputVec_d, inputSize, numOfBlocks),
-                  (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint))
+    kernelArgs = (
+        (inputVec_d, outputVec_d, inputSize, numOfBlocks),
+        (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint),
+    )
 
     kernelNodeParams.func = _reduce
     kernelNodeParams.gridDimX = numOfBlocks
@@ -167,7 +177,9 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n
     kernelNodeParams.kernelParams = kernelArgs
     # kernelNodeParams.extra = None
 
-    kernelNode = checkCudaErrors(cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams))
+    kernelNode = checkCudaErrors(
+        cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams)
+    )
 
     nodeDependencies.clear()
     nodeDependencies.append(kernelNode)
@@ -188,12 +200,16 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n
     kernelNodeParams.blockDimX = THREADS_PER_BLOCK
     kernelNodeParams.blockDimY = kernelNodeParams.blockDimZ = 1
     kernelNodeParams.sharedMemBytes = 0
-    kernelArgs2 = ((outputVec_d, result_d, numOfBlocks),
-                   (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint))
+    kernelArgs2 = (
+        (outputVec_d, result_d, numOfBlocks),
+        (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint),
+    )
     kernelNodeParams.kernelParams = kernelArgs2
     # kernelNodeParams.extra = None
 
-    kernelNode = checkCudaErrors(cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams))
+    kernelNode = checkCudaErrors(
+        cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams)
+    )
 
     nodeDependencies.clear()
     nodeDependencies.append(kernelNode)
@@ -208,7 +224,9 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n
     memcpyParams.dstPtr = cudart.make_cudaPitchedPtr(result_h, np.dtype(np.float64).itemsize, 1, 1)
     memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float64).itemsize, 1, 1)
     memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-    memcpyNode = checkCudaErrors(cudart.cudaGraphAddMemcpyNode(graph, nodeDependencies, len(nodeDependencies), memcpyParams))
+    memcpyNode = checkCudaErrors(
+        cudart.cudaGraphAddMemcpyNode(graph, nodeDependencies, len(nodeDependencies), memcpyParams)
+    )
 
     nodeDependencies.clear()
     nodeDependencies.append(memcpyNode)
@@ -216,20 +234,20 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n
     # WIP: Host nodes
 
     nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph))
-    print("\nNum of nodes in the graph created manually = {}".format(numNodes))
+    print(f"\nNum of nodes in the graph created manually = {numNodes}")
 
     graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0))
 
     clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph))
     clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0))
 
-    for i in range(GRAPH_LAUNCH_ITERATIONS):
+    for _i in range(GRAPH_LAUNCH_ITERATIONS):
         checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph))
 
     checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph))
 
     print("Cloned Graph Output..")
-    for i in range(GRAPH_LAUNCH_ITERATIONS):
+    for _i in range(GRAPH_LAUNCH_ITERATIONS):
         checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph))
 
     checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph))
@@ -240,6 +258,7 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n
     checkCudaErrors(cudart.cudaGraphDestroy(clonedGraph))
     checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph))
 
+
 def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks):
     result_h = ctypes.c_double(0.0)
 
@@ -258,9 +277,15 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d,
     checkCudaErrors(cudart.cudaStreamWaitEvent(stream2, forkStreamEvent, 0))
     checkCudaErrors(cudart.cudaStreamWaitEvent(stream3, forkStreamEvent, 0))
 
-    checkCudaErrors(cudart.cudaMemcpyAsync(inputVec_d, inputVec_h,
-                                           np.dtype(np.float32).itemsize * inputSize, cudart.cudaMemcpyKind.cudaMemcpyDefault,
-                                           stream1))
+    checkCudaErrors(
+        cudart.cudaMemcpyAsync(
+            inputVec_d,
+            inputVec_h,
+            np.dtype(np.float32).itemsize * inputSize,
+            cudart.cudaMemcpyKind.cudaMemcpyDefault,
+            stream1,
+        )
+    )
 
     checkCudaErrors(cudart.cudaMemsetAsync(outputVec_d, 0, np.dtype(np.float64).itemsize * numOfBlocks, stream2))
 
@@ -271,46 +296,63 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d,
 
     checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent1, 0))
 
-    kernelArgs = ((inputVec_d, outputVec_d, inputSize, numOfBlocks),
-                  (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint))
-    checkCudaErrors(cuda.cuLaunchKernel(_reduce,
-                                        numOfBlocks, 1, 1,
-                                        THREADS_PER_BLOCK, 1, 1,
-                                        0, stream1,
-                                        kernelArgs, 0))
+    kernelArgs = (
+        (inputVec_d, outputVec_d, inputSize, numOfBlocks),
+        (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint),
+    )
+    checkCudaErrors(
+        cuda.cuLaunchKernel(
+            _reduce,
+            numOfBlocks,
+            1,
+            1,
+            THREADS_PER_BLOCK,
+            1,
+            1,
+            0,
+            stream1,
+            kernelArgs,
+            0,
+        )
+    )
 
     checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent2, 0))
 
-    kernelArgs2 = ((outputVec_d, result_d, numOfBlocks),
-                   (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint))
-    checkCudaErrors(cuda.cuLaunchKernel(_reduceFinal,
-                                        1, 1, 1,
-                                        THREADS_PER_BLOCK, 1, 1,
-                                        0, stream1,
-                                        kernelArgs2, 0))
-
-    checkCudaErrors(cudart.cudaMemcpyAsync(result_h, result_d, np.dtype(np.float64).itemsize,
-                                           cudart.cudaMemcpyKind.cudaMemcpyDefault, stream1))
+    kernelArgs2 = (
+        (outputVec_d, result_d, numOfBlocks),
+        (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint),
+    )
+    checkCudaErrors(cuda.cuLaunchKernel(_reduceFinal, 1, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, stream1, kernelArgs2, 0))
+
+    checkCudaErrors(
+        cudart.cudaMemcpyAsync(
+            result_h,
+            result_d,
+            np.dtype(np.float64).itemsize,
+            cudart.cudaMemcpyKind.cudaMemcpyDefault,
+            stream1,
+        )
+    )
 
     # WIP: Host nodes
 
     graph = checkCudaErrors(cudart.cudaStreamEndCapture(stream1))
 
     nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph))
-    print("\nNum of nodes in the graph created using stream capture API = {}".format(numNodes))
+    print(f"\nNum of nodes in the graph created using stream capture API = {numNodes}")
 
     graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0))
 
     clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph))
     clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0))
 
-    for i in range(GRAPH_LAUNCH_ITERATIONS):
+    for _i in range(GRAPH_LAUNCH_ITERATIONS):
         checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph))
 
     checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph))
 
     print("Cloned Graph Output..")
-    for i in range(GRAPH_LAUNCH_ITERATIONS):
+    for _i in range(GRAPH_LAUNCH_ITERATIONS):
         checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph))
 
     checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph))
@@ -323,10 +365,11 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d,
     checkCudaErrors(cudart.cudaStreamDestroy(stream2))
     checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph))
 
+
 def checkKernelCompiles():
-    kernel_headers = '''\
+    kernel_headers = """\
     #include <cooperative_groups.h>
-    '''
+    """
     try:
         common.KernelHelper(kernel_headers, findCudaDevice())
     except:
@@ -338,9 +381,10 @@ def checkKernelCompiles():
         return False
     return True
 
+
 @pytest.mark.skipif(not checkKernelCompiles(), reason="Automation filter against incompatible kernel")
 def main():
-    size = 1 << 24 # number of elements to reduce
+    size = 1 << 24  # number of elements to reduce
     maxBlocks = 512
 
     # This will pick the best possible CUDA capable device
@@ -349,12 +393,12 @@ def main():
     global _reduce
     global _reduceFinal
     kernelHelper = common.KernelHelper(simpleCudaGraphs, devID)
-    _reduce = kernelHelper.getFunction(b'reduce')
-    _reduceFinal = kernelHelper.getFunction(b'reduceFinal')
+    _reduce = kernelHelper.getFunction(b"reduce")
+    _reduceFinal = kernelHelper.getFunction(b"reduceFinal")
 
-    print("{} elements".format(size))
-    print("threads per block  = {}".format(THREADS_PER_BLOCK))
-    print("Graph Launch iterations = {}".format(GRAPH_LAUNCH_ITERATIONS))
+    print(f"{size} elements")
+    print(f"threads per block  = {THREADS_PER_BLOCK}")
+    print(f"Graph Launch iterations = {GRAPH_LAUNCH_ITERATIONS}")
 
     inputVec_h = checkCudaErrors(cudart.cudaMallocHost(size * np.dtype(np.float32).itemsize))
     inputVec_d = checkCudaErrors(cudart.cudaMalloc(size * np.dtype(np.float32).itemsize))
@@ -371,5 +415,6 @@ def main():
     checkCudaErrors(cudart.cudaFree(result_d))
     checkCudaErrors(cudart.cudaFreeHost(inputVec_h))
 
+
 if __name__ == "__main__":
     main()
diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
index 6f64066f8..38be5798f 100644
--- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
+++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
@@ -7,14 +7,16 @@
 # is strictly prohibited.
 import ctypes
 import math
-import numpy as np
 import sys
-from cuda import cuda, cudart
+from random import random
+
+import numpy as np
 from common import common
 from common.helper_cuda import checkCudaErrors, findCudaDevice
-from random import random
 
-conjugateGradientMultiBlockCG = '''\
+from cuda import cuda, cudart
+
+conjugateGradientMultiBlockCG = """\
 #line __LINE__
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
@@ -160,12 +162,13 @@
     k++;
   }
 }
-'''
+"""
+
 
 def genTridiag(I, J, val, N, nz):
-    I[0] = 0 
+    I[0] = 0
     J[0] = 0
-    J[1]= 0 
+    J[1] = 0
 
     val[0] = float(random()) + 10.0
     val[1] = float(random())
@@ -190,13 +193,15 @@ def genTridiag(I, J, val, N, nz):
             val[start + 2] = float(random())
     I[N] = nz
 
+
 THREADS_PER_BLOCK = 512
-sSDKname = "conjugateGradientMultiBlockCG";
+sSDKname = "conjugateGradientMultiBlockCG"
+
+
 def main():
     tol = 1e-5
 
-    print("Starting [%s]...\n" % sSDKname);
-
+    print(f"Starting [{sSDKname}]...\n")
     # WAIVE: Due to bug in NVRTC
     return
 
@@ -212,23 +217,24 @@ def main():
     # This sample requires being run on a device that supports Cooperative Kernel
     # Launch
     if not deviceProp.cooperativeLaunch:
-        print("\nSelected GPU (%d) does not support Cooperative Kernel Launch, Waiving the run" %
-                (devID))
+        print("\nSelected GPU (%d) does not support Cooperative Kernel Launch, Waiving the run" % (devID))
         return
 
     # Statistics about the GPU device
-    print("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n" % 
-            (deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor))
+    print(
+        "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n"
+        % (deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor)
+    )
 
     # Get kernel
     kernelHelper = common.KernelHelper(conjugateGradientMultiBlockCG, devID)
-    _gpuConjugateGradient = kernelHelper.getFunction(b'gpuConjugateGradient')
+    _gpuConjugateGradient = kernelHelper.getFunction(b"gpuConjugateGradient")
 
     # Generate a random tridiagonal symmetric matrix in CSR format
     N = 1048576
     nz = (N - 2) * 3 + 4
 
-    I = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (N+1), cudart.cudaMemAttachGlobal))
+    I = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (N + 1), cudart.cudaMemAttachGlobal))
     J = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * nz, cudart.cudaMemAttachGlobal))
     val = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * nz, cudart.cudaMemAttachGlobal))
     I_local = (ctypes.c_int * (N + 1)).from_address(I)
@@ -250,8 +256,6 @@ def main():
     p = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal))
     Ax = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal))
     r_local = (ctypes.c_float * N).from_address(r)
-    p_local = (ctypes.c_float * N).from_address(p)
-    Ax_local = (ctypes.c_float * N).from_address(Ax)
 
     checkCudaErrors(cudart.cudaDeviceSynchronize())
 
@@ -262,18 +266,27 @@ def main():
         r_local[i] = rhs_local[i] = 1.0
         x_local[i] = 0.0
 
-    kernelArgs_value = (I, J, val, x,
-                        Ax, p, r, dot_result,
-                        nz, N, tol)
-    kernelArgs_types = (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p,
-                        ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p,
-                        ctypes.c_int, ctypes.c_int, ctypes.c_float)
+    kernelArgs_value = (I, J, val, x, Ax, p, r, dot_result, nz, N, tol)
+    kernelArgs_types = (
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_int,
+        ctypes.c_int,
+        ctypes.c_float,
+    )
     kernelArgs = (kernelArgs_value, kernelArgs_types)
 
-    sMemSize = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK/32) + 1)
+    sMemSize = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK / 32) + 1)
     numThreads = THREADS_PER_BLOCK
-    numBlocksPerSm = checkCudaErrors(cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(
-                                        _gpuConjugateGradient, numThreads, sMemSize))
+    numBlocksPerSm = checkCudaErrors(
+        cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpuConjugateGradient, numThreads, sMemSize)
+    )
     numSms = deviceProp.multiProcessorCount
     dimGrid = cudart.dim3()
     dimGrid.x = numSms * numBlocksPerSm
@@ -285,24 +298,31 @@ def main():
     dimBlock.z = 1
 
     checkCudaErrors(cudart.cudaEventRecord(start, 0))
-    checkCudaErrors(cuda.cuLaunchCooperativeKernel(_gpuConjugateGradient,
-                                                   dimGrid.x, dimGrid.y, dimGrid.z,
-                                                   dimBlock.x, dimBlock.y, dimBlock.z,
-                                                   0, 0,
-                                                   kernelArgs))
+    checkCudaErrors(
+        cuda.cuLaunchCooperativeKernel(
+            _gpuConjugateGradient,
+            dimGrid.x,
+            dimGrid.y,
+            dimGrid.z,
+            dimBlock.x,
+            dimBlock.y,
+            dimBlock.z,
+            0,
+            0,
+            kernelArgs,
+        )
+    )
     checkCudaErrors(cudart.cudaEventRecord(stop, 0))
     checkCudaErrors(cudart.cudaDeviceSynchronize())
 
-    time = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop));
-
-    print("GPU Final, residual = %e, kernel execution time = %f ms" % 
-           (math.sqrt(dot_result_local), time))
+    time = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop))
+    print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms")
 
     err = 0.0
     for i in range(N):
         rsum = 0.0
 
-        for j in range(I_local[i], I_local[i+1]):
+        for j in range(I_local[i], I_local[i + 1]):
             rsum += val_local[j] * x_local[J_local[j]]
 
         diff = math.fabs(rsum - rhs_local[i])
@@ -322,9 +342,8 @@ def main():
     checkCudaErrors(cudart.cudaEventDestroy(start))
     checkCudaErrors(cudart.cudaEventDestroy(stop))
 
-    print("Test Summary:  Error amount = %f" % err)
-    print("&&&& conjugateGradientMultiBlockCG %s\n" %
-          ("PASSED" if math.sqrt(dot_result_local) < tol else "FAILED"))
+    print(f"Test Summary:  Error amount = {err:f}")
+    print("&&&& conjugateGradientMultiBlockCG %s\n" % ("PASSED" if math.sqrt(dot_result_local) < tol else "FAILED"))
 
     if math.sqrt(dot_result_local) >= tol:
         sys.exit(-1)
diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py
index c24322f94..ec55c1ac5 100644
--- a/cuda_bindings/examples/common/common.py
+++ b/cuda_bindings/examples/common/common.py
@@ -5,39 +5,50 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
-import ctypes
-import numpy as np
 import os
-from cuda import cuda, cudart, nvrtc
+
+import numpy as np
 from common.helper_cuda import checkCudaErrors
 
+from cuda import cuda, cudart, nvrtc
+
+
 class KernelHelper:
     def __init__(self, code, devID):
-        prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b'sourceCode.cu', 0, None, None))
-        CUDA_HOME = os.getenv('CUDA_HOME')
-        if CUDA_HOME == None:
-            CUDA_HOME = os.getenv('CUDA_PATH')
-        if CUDA_HOME == None:
-            raise RuntimeError('Environment variable CUDA_HOME or CUDA_PATH is not set')
-        include_dirs = os.path.join(CUDA_HOME, 'include')
+        prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None))
+        CUDA_HOME = os.getenv("CUDA_HOME")
+        if CUDA_HOME is None:
+            CUDA_HOME = os.getenv("CUDA_PATH")
+        if CUDA_HOME is None:
+            raise RuntimeError("Environment variable CUDA_HOME or CUDA_PATH is not set")
+        include_dirs = os.path.join(CUDA_HOME, "include")
 
         # Initialize CUDA
         checkCudaErrors(cudart.cudaFree(0))
 
-        major = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID))
-        minor = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID))
+        major = checkCudaErrors(
+            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID)
+        )
+        minor = checkCudaErrors(
+            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID)
+        )
         _, nvrtc_minor = checkCudaErrors(nvrtc.nvrtcVersion())
-        use_cubin = (nvrtc_minor >= 1)
-        prefix = 'sm' if use_cubin else 'compute'
-        arch_arg = bytes(f'--gpu-architecture={prefix}_{major}{minor}', 'ascii')
+        use_cubin = nvrtc_minor >= 1
+        prefix = "sm" if use_cubin else "compute"
+        arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii")
 
         try:
-            opts = [b'--fmad=true', arch_arg, '--include-path={}'.format(include_dirs).encode('UTF-8'),
-                    b'--std=c++11', b'-default-device']
+            opts = [
+                b"--fmad=true",
+                arch_arg,
+                f"--include-path={include_dirs}".encode(),
+                b"--std=c++11",
+                b"-default-device",
+            ]
             checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts))
         except RuntimeError as err:
             logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog))
-            log = b' ' * logSize
+            log = b" " * logSize
             checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log))
             print(log.decode())
             print(err)
@@ -45,11 +56,11 @@ def __init__(self, code, devID):
 
         if use_cubin:
             dataSize = checkCudaErrors(nvrtc.nvrtcGetCUBINSize(prog))
-            data = b' ' * dataSize
+            data = b" " * dataSize
             checkCudaErrors(nvrtc.nvrtcGetCUBIN(prog, data))
         else:
             dataSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog))
-            data = b' ' * dataSize
+            data = b" " * dataSize
             checkCudaErrors(nvrtc.nvrtcGetPTX(prog, data))
 
         self.module = checkCudaErrors(cuda.cuModuleLoadData(np.char.array(data)))
diff --git a/cuda_bindings/examples/common/helper_cuda.py b/cuda_bindings/examples/common/helper_cuda.py
index cbd0d2da4..6cc4026dd 100644
--- a/cuda_bindings/examples/common/helper_cuda.py
+++ b/cuda_bindings/examples/common/helper_cuda.py
@@ -5,8 +5,10 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
+from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt
+
 from cuda import cuda, cudart, nvrtc
-from common.helper_string import getCmdLineArgumentInt, checkCmdLineFlag
+
 
 def _cudaGetErrorEnum(error):
     if isinstance(error, cuda.CUresult):
@@ -17,11 +19,12 @@ def _cudaGetErrorEnum(error):
     elif isinstance(error, nvrtc.nvrtcResult):
         return nvrtc.nvrtcGetErrorString(error)[1]
     else:
-        raise RuntimeError('Unknown error type: {}'.format(error))
+        raise RuntimeError(f"Unknown error type: {error}")
+
 
 def checkCudaErrors(result):
     if result[0].value:
-        raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0])))
+        raise RuntimeError(f"CUDA error code={result[0].value}({_cudaGetErrorEnum(result[0])})")
     if len(result) == 1:
         return None
     elif len(result) == 2:
@@ -29,6 +32,7 @@ def checkCudaErrors(result):
     else:
         return result[1:]
 
+
 def findCudaDevice():
     devID = 0
     if checkCmdLineFlag("device="):
@@ -36,6 +40,7 @@ def findCudaDevice():
     checkCudaErrors(cudart.cudaSetDevice(devID))
     return devID
 
+
 def findCudaDeviceDRV():
     devID = 0
     if checkCmdLineFlag("device="):
diff --git a/cuda_bindings/examples/common/helper_string.py b/cuda_bindings/examples/common/helper_string.py
index 1e0d65f1f..7677047a3 100644
--- a/cuda_bindings/examples/common/helper_string.py
+++ b/cuda_bindings/examples/common/helper_string.py
@@ -7,18 +7,26 @@
 # is strictly prohibited.
 import sys
 
+
 def checkCmdLineFlag(stringRef):
-    k = 0
-    for i in sys.argv:
-        if stringRef == i and k < len(sys.argv) - 1:
-           return True
-        k += 1
-    return False
+    """Report whether ``stringRef`` appears in ``sys.argv`` before the last position.
+
+    The match is an exact string comparison against each argument, and an
+    occurrence in the final position does not count.
+    """
+    # enumerate() yields (index, value); unpacking them the other way round
+    # compares stringRef against integer indices, so the flag is never found.
+    return any(stringRef == arg and idx < len(sys.argv) - 1 for idx, arg in enumerate(sys.argv))
+
 
 def getCmdLineArgumentInt(stringRef):
-    k = 0
-    for i in sys.argv:
+    """Return the argument that follows ``stringRef`` in ``sys.argv``.
+
+    The value is returned as the raw string taken from ``sys.argv``; the
+    integer 0 is returned when ``stringRef`` is absent or is the last argument.
+    """
+    # enumerate() yields (index, value): k is the position, i the argument text.
+    for k, i in enumerate(sys.argv):
         if stringRef == i and k < len(sys.argv) - 1:
-           return sys.argv[k+1]
-        k += 1
+            return sys.argv[k + 1]
     return 0
diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py
index dd4781824..01e5f5714 100644
--- a/cuda_bindings/examples/extra/isoFDModelling_test.py
+++ b/cuda_bindings/examples/extra/isoFDModelling_test.py
@@ -5,13 +5,15 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
-import numpy as np
 import time
-from cuda import cuda, cudart
+
+import numpy as np
 from common import common
 from common.helper_cuda import checkCudaErrors
 
-isoPropagator = '''\
+from cuda import cuda, cudart
+
+isoPropagator = """\
 extern "C"
 __global__ void injectSource(float *__restrict__ in, float *__restrict__ src, int it)
 {
@@ -153,52 +155,65 @@
         idx_in += stride;
     }
 }
-'''
+"""
 
 display_graph = False
 verbose_prints = False
 
+
 def align_nx(nx, blk, nops):
-    n_align = (int)((nx - 1)/blk) + 1
+    n_align = (int)((nx - 1) / blk) + 1
     n_align *= blk
-    n_align += 2*nops
+    n_align += 2 * nops
     n_align = (int)((n_align - 1) / 64) + 1
     n_align *= 64
     return (int)(n_align)
 
+
 def align_ny(ny, blk, nops):
-    n_align = (int)((ny - 1)/blk) + 1
+    n_align = (int)((ny - 1) / blk) + 1
     n_align *= blk
-    n_align += 2*nops
+    n_align += 2 * nops
     return (int)(n_align)
 
+
 #
 # this class contains the input params
 #
-class params ():
+class params:
     def __init__(self):
-        self.BDIMX = 32 # tiles x y for fd operators
+        self.BDIMX = 32  # tiles x y for fd operators
         self.BDIMY = 16
         self.FD_ORDER = 4
-        self.lead = 64 -  self.FD_ORDER
-        self.nx = align_nx(700, 2*self.BDIMX, self.FD_ORDER)
+        self.lead = 64 - self.FD_ORDER
+        self.nx = align_nx(700, 2 * self.BDIMX, self.FD_ORDER)
         self.ny = align_ny(600, self.BDIMY, self.FD_ORDER)
-        self.blkx = (int) ((self.nx - 2*self.FD_ORDER) / (2*self.BDIMX))
-        self.blky = (int) ((self.ny - 2*self.FD_ORDER) / self.BDIMY)
+        self.blkx = (int)((self.nx - 2 * self.FD_ORDER) / (2 * self.BDIMX))
+        self.blky = (int)((self.ny - 2 * self.FD_ORDER) / self.BDIMY)
 
-        self.nz = (int)(200)
+        self.nz = 200
         self.delta = 25.0
-        self.dt = 0.3 * 1000.0 * self.delta  / 4500.0
+        self.dt = 0.3 * 1000.0 * self.delta / 4500.0
         self.tmax_propag = 1000.0
         self.nt = int(self.tmax_propag / self.dt)
-        self.freqMax = 3.5* 1000.0  / (4.0 * self.delta)
-        print("dt= ",self.dt, " delta= ", self.delta, " nt= ", self.nt, " freq max= " , self.freqMax)
+        self.freqMax = 3.5 * 1000.0 / (4.0 * self.delta)
+        print(
+            "dt= ",
+            self.dt,
+            " delta= ",
+            self.delta,
+            " nt= ",
+            self.nt,
+            " freq max= ",
+            self.freqMax,
+        )
+
 
 #
 # this class contains all the kernels to be used bu propagator
 #
-class cudaKernels():
-    def __init__ (self, cntx):
+class cudaKernels:
+    def __init__(self, cntx):
         checkCudaErrors(cuda.cuInit(0))
         checkCudaErrors(cuda.cuCtxSetCurrent(cntx))
         dev = checkCudaErrors(cuda.cuCtxGetDevice())
@@ -206,15 +221,16 @@ def __init__ (self, cntx):
         self.kernelHelper = common.KernelHelper(isoPropagator, int(dev))
 
         # kernel to create a source fnction with some max frequency
-        self.creatSource = self.kernelHelper.getFunction(b'createSource')
+        self.creatSource = self.kernelHelper.getFunction(b"createSource")
         # create a velocity to try things: just a sphere on the middle 4500 m/s and 2500 m/s all around
-        self.createVelocity = self.kernelHelper.getFunction(b'createVelocity')
+        self.createVelocity = self.kernelHelper.getFunction(b"createVelocity")
 
         # kernel to propagate the wavefield by 1 step in time
-        self.fdPropag = self.kernelHelper.getFunction(b'fwd_3D_orderX2k')
+        self.fdPropag = self.kernelHelper.getFunction(b"fwd_3D_orderX2k")
 
         # kernel to propagate the wavefield by 1 step in time
-        self.injectSource = self.kernelHelper.getFunction(b'injectSource')
+        self.injectSource = self.kernelHelper.getFunction(b"injectSource")
+
 
 #
 # this class contains: propagator, source creation, velocity creation
@@ -249,7 +265,7 @@ def __del__(self):
     #
     def swap(self):
         if verbose_prints:
-            print("swap in out ", int(self.waveIn), " " , int(self.waveOut))
+            print("swap in out ", int(self.waveIn), " ", int(self.waveOut))
         i = int(self.waveIn)
         j = int(self.waveOut)
         a = i
@@ -262,8 +278,8 @@ def swap(self):
     # allocate the device memory
     #
     def allocate(self):
-        nel = self.params.nx * self.params.ny  *  self.params.nz
-        n = np.array( nel, dtype=np.uint32)
+        nel = self.params.nx * self.params.ny * self.params.nz
+        n = np.array(nel, dtype=np.uint32)
 
         bufferSize = n * np.dtype(np.float32).itemsize
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
@@ -272,7 +288,7 @@ def allocate(self):
         checkCudaErrors(cuda.cuMemsetD32(self.velocity, 0, n))
 
         nel += self.params.lead
-        n = np.array(nel, dtype=np.uint32) ## we need to align at the beginning of the tile
+        n = np.array(nel, dtype=np.uint32)  ## we need to align at the beginning of the tile
 
         bufferSize = n * np.dtype(np.float32).itemsize
         self.waveIn = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
@@ -294,17 +310,27 @@ def createSource(self, kernel):
 
         buf = np.array([int(self.source)], dtype=np.uint64)
         nt = np.array(self.params.nt, dtype=np.uint32)
-        dt = np.array(self.params.dt,  dtype=np.float32)
+        dt = np.array(self.params.dt, dtype=np.float32)
         freq = np.array(self.params.freqMax, dtype=np.float32)
 
         args = [buf, dt, freq, nt]
         args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
-        checkCudaErrors(cuda.cuLaunchKernel(kernel.creatSource,
-                                1, 1, 1,                        # grid dim
-                                1024, 1, 1,                     # block dim
-                                0, self.streamHalo,             # shared mem and stream
-                                args.ctypes.data, 0))     # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                kernel.creatSource,
+                1,
+                1,
+                1,  # grid dim
+                1024,
+                1,
+                1,  # block dim
+                0,
+                self.streamHalo,  # shared mem and stream
+                args.ctypes.data,
+                0,
+            )
+        )  # arguments
         checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo))
 
     #
@@ -314,23 +340,37 @@ def injectSource(self, kernel, iter):
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
 
         if self.dev != 0:
-           return
+            return
 
         wavein = np.array([int(self.waveIn)], dtype=np.uint64)
         src = np.array([int(self.source)], dtype=np.uint64)
-        offset_sourceInject = self.params.lead + (int)(self.params.nz/2) * self.params.nx * self.params.ny + \
-                              (int)(self.params.ny/2)  * self.params.nx +  (int) (self.params.nx/2)
+        offset_sourceInject = (
+            self.params.lead
+            + (int)(self.params.nz / 2) * self.params.nx * self.params.ny
+            + (int)(self.params.ny / 2) * self.params.nx
+            + (int)(self.params.nx / 2)
+        )
         offset_sourceInject *= np.dtype(np.float32).itemsize
 
         np_it = np.array(iter, dtype=np.uint32)
 
-        args = [wavein+offset_sourceInject, src, np_it]
+        args = [wavein + offset_sourceInject, src, np_it]
         args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
-        checkCudaErrors(cuda.cuLaunchKernel(kernel.injectSource,
-                                1, 1, 1,                        # grid dim
-                                1, 1, 1,                        # block dim
-                                0, self.streamHalo,             # shared mem and stream
-                                args.ctypes.data, 0))     # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                kernel.injectSource,
+                1,
+                1,
+                1,  # grid dim
+                1,
+                1,
+                1,  # block dim
+                0,
+                self.streamHalo,  # shared mem and stream
+                args.ctypes.data,
+                0,
+            )
+        )  # arguments
 
     #
     # create velocity
@@ -338,8 +378,11 @@ def injectSource(self, kernel, iter):
     def createVelocity(self, kernel):
         print("running create velocity on device ", self.dev)
 
-        offset_velocity = self.params.FD_ORDER * self.params.nx * self.params.ny + \
-                          self.params.FD_ORDER * self.params.nx + self.params.FD_ORDER
+        offset_velocity = (
+            self.params.FD_ORDER * self.params.nx * self.params.ny
+            + self.params.FD_ORDER * self.params.nx
+            + self.params.FD_ORDER
+        )
         offset_velocity *= np.dtype(np.float32).itemsize
 
         vel = np.array([int(self.velocity)], dtype=np.uint64)
@@ -347,32 +390,45 @@ def createVelocity(self, kernel):
 
         stride = self.params.nx * self.params.ny
         np_dx_dt2 = np.array(dx_dt2, dtype=np.float32)
-        np_nz = np.array((self.params.nz-2*self.params.FD_ORDER), dtype=np.uint32)
+        np_nz = np.array((self.params.nz - 2 * self.params.FD_ORDER), dtype=np.uint32)
         np_nx = np.array(self.params.nx, dtype=np.uint32)
         np_stride = np.array(stride, dtype=np.uint32)
 
-        args = [vel+  offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride]
+        args = [vel + offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride]
         args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
 
         # do halo up
-        checkCudaErrors(cuda.cuLaunchKernel(kernel.createVelocity,
-                                            self.params.blkx, self.params.blky, 1,     # grid dim
-                                            2*self.params.BDIMX, self.params.BDIMY, 1, # block dim
-                                            0, self.streamHalo,                        # shared mem and stream
-                                            args.ctypes.data, 0))                # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                kernel.createVelocity,
+                self.params.blkx,
+                self.params.blky,
+                1,  # grid dim
+                2 * self.params.BDIMX,
+                self.params.BDIMY,
+                1,  # block dim
+                0,
+                self.streamHalo,  # shared mem and stream
+                args.ctypes.data,
+                0,
+            )
+        )  # arguments
         checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo))
 
     #
     # execute the center part of propagation
     #
-    def executeCenter(self,  kernel):
+    def executeCenter(self, kernel):
         if verbose_prints:
             print("running center on device ", self.dev)
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
-        offset_velocity = 2* self.params.FD_ORDER * self.params.nx * self.params.ny + \
-                             self.params.FD_ORDER * self.params.nx + self.params.FD_ORDER
+        offset_velocity = (
+            2 * self.params.FD_ORDER * self.params.nx * self.params.ny
+            + self.params.FD_ORDER * self.params.nx
+            + self.params.FD_ORDER
+        )
 
         offset_wave = self.params.lead + offset_velocity
 
@@ -384,19 +440,36 @@ def executeCenter(self,  kernel):
 
         vel = np.array([int(self.velocity)], dtype=np.uint64)
         stride = self.params.nx * self.params.ny
-        np_nz = np.array(self.params.nz - 4*self.params.FD_ORDER, dtype=np.uint32)
+        np_nz = np.array(self.params.nz - 4 * self.params.FD_ORDER, dtype=np.uint32)
         np_nx = np.array(self.params.nx, dtype=np.uint32)
         np_stride = np.array(stride, dtype=np.uint32)
 
-        args = [wavein+offset_wave, waveout+offset_wave, vel+offset_velocity, np_nz, np_nx, np_stride]
+        args = [
+            wavein + offset_wave,
+            waveout + offset_wave,
+            vel + offset_velocity,
+            np_nz,
+            np_nx,
+            np_stride,
+        ]
         args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         # do center propagation from 2 * fd_order to nz - 2 * fd_order
-        checkCudaErrors(cuda.cuLaunchKernel(kernel.fdPropag,
-                                self.params.blkx, self.params.blky, 1,   # grid dim
-                                self.params.BDIMX, self.params.BDIMY, 1, # block dim
-                                0, self.streamCenter,                    # shared mem and stream
-                                args.ctypes.data, 0))              # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                kernel.fdPropag,
+                self.params.blkx,
+                self.params.blky,
+                1,  # grid dim
+                self.params.BDIMX,
+                self.params.BDIMY,
+                1,  # block dim
+                0,
+                self.streamCenter,  # shared mem and stream
+                args.ctypes.data,
+                0,
+            )
+        )  # arguments
 
     #
     # execute the halo part of propagation
@@ -406,8 +479,11 @@ def executeHalo(self, kernel):
             print("running halos on device ", self.dev)
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
 
-        offset_velocity = self.params.FD_ORDER * self.params.nx * self.params.ny + \
-                          self.params.FD_ORDER * self.params.nx + self.params.FD_ORDER
+        offset_velocity = (
+            self.params.FD_ORDER * self.params.nx * self.params.ny
+            + self.params.FD_ORDER * self.params.nx
+            + self.params.FD_ORDER
+        )
 
         offset_wave = self.params.lead + offset_velocity
 
@@ -423,31 +499,68 @@ def executeHalo(self, kernel):
         np_nx = np.array(self.params.nx, dtype=np.uint32)
         np_stride = np.array(stride, dtype=np.uint32)
 
-        args = [wavein+offset_wave, waveout+offset_wave, vel+offset_velocity, np_nz, np_nx, np_stride]
+        args = [
+            wavein + offset_wave,
+            waveout + offset_wave,
+            vel + offset_velocity,
+            np_nz,
+            np_nx,
+            np_stride,
+        ]
         args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         # do halo up
-        checkCudaErrors(cuda.cuLaunchKernel(kernel.fdPropag,
-                                self.params.blkx, self.params.blky, 1,   # grid dim
-                                self.params.BDIMX, self.params.BDIMY, 1, # block dim
-                                0, self.streamHalo,                      # shared mem and stream
-                                args.ctypes.data, 0))              # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                kernel.fdPropag,
+                self.params.blkx,
+                self.params.blky,
+                1,  # grid dim
+                self.params.BDIMX,
+                self.params.BDIMY,
+                1,  # block dim
+                0,
+                self.streamHalo,  # shared mem and stream
+                args.ctypes.data,
+                0,
+            )
+        )  # arguments
 
         # do halo down
-        offset_velocity = (self.params.nz - 2*self.params.FD_ORDER) * self.params.nx * self.params.ny + \
-                           self.params.FD_ORDER * self.params.nx + self.params.FD_ORDER
+        offset_velocity = (
+            (self.params.nz - 2 * self.params.FD_ORDER) * self.params.nx * self.params.ny
+            + self.params.FD_ORDER * self.params.nx
+            + self.params.FD_ORDER
+        )
         offset_wave = self.params.lead + offset_velocity
 
         offset_wave *= np.dtype(np.float32).itemsize
         offset_velocity *= np.dtype(np.float32).itemsize
 
-        args = [wavein+offset_wave, waveout+offset_wave, vel+offset_velocity, np_nz, np_nx, np_stride]
+        args = [
+            wavein + offset_wave,
+            waveout + offset_wave,
+            vel + offset_velocity,
+            np_nz,
+            np_nx,
+            np_stride,
+        ]
         args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
-        checkCudaErrors(cuda.cuLaunchKernel(kernel.fdPropag,
-                                self.params.blkx, self.params.blky, 1,   # grid dim
-                                self.params.BDIMX, self.params.BDIMY, 1, # block dim
-                                0, self.streamHalo,                      # shared mem and stream
-                                args.ctypes.data, 0))              # arguments
+        checkCudaErrors(
+            cuda.cuLaunchKernel(
+                kernel.fdPropag,
+                self.params.blkx,
+                self.params.blky,
+                1,  # grid dim
+                self.params.BDIMX,
+                self.params.BDIMY,
+                1,  # block dim
+                0,
+                self.streamHalo,  # shared mem and stream
+                args.ctypes.data,
+                0,
+            )
+        )  # arguments
 
     #
     # exchange the halos
@@ -460,7 +573,7 @@ def exchangeHalo(self, propag):
         #
         # the following variables don't change
         #
-        nstride  = self.params.nx * self.params.ny
+        nstride = self.params.nx * self.params.ny
 
         devS = self.context
         devD = propag.context
@@ -470,7 +583,7 @@ def exchangeHalo(self, propag):
 
         if self.dev < propag.dev:
             # exchange up
-            offsetS = self.params.lead + (self.params.nz - 2*self.params.FD_ORDER) * nstride
+            offsetS = self.params.lead + (self.params.nz - 2 * self.params.FD_ORDER) * nstride
             offsetD = propag.params.lead
 
             offsetS *= np.dtype(np.float32).itemsize
@@ -482,8 +595,8 @@ def exchangeHalo(self, propag):
             checkCudaErrors(cuda.cuMemcpyPeerAsync(waveD, devD, waveS, devS, n_exch, self.streamHalo))
         else:
             # exchange down
-            offsetS = self.params.lead  + self.params.FD_ORDER * nstride
-            offsetD = propag.params.lead  + (propag.params.nz - propag.params.FD_ORDER) * nstride
+            offsetS = self.params.lead + self.params.FD_ORDER * nstride
+            offsetD = propag.params.lead + (propag.params.nz - propag.params.FD_ORDER) * nstride
 
             offsetS *= np.dtype(np.float32).itemsize
             offsetD *= np.dtype(np.float32).itemsize
@@ -500,13 +613,14 @@ def syncStream(self, stream):
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
         checkCudaErrors(cuda.cuStreamSynchronize(stream))
 
+
 def main():
     checkCudaErrors(cuda.cuInit(0))
 
     # Number of GPUs
     print("Checking for multiple GPUs...")
     gpu_n = checkCudaErrors(cuda.cuDeviceGetCount())
-    print("CUDA-capable device count: {}".format(gpu_n))
+    print(f"CUDA-capable device count: {gpu_n}")
 
     if gpu_n < 2:
         print("Two or more GPUs with Peer-to-Peer access capability are required")
@@ -524,10 +638,16 @@ def main():
                 continue
             i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j))
             j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i))
-            print("> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format(
-                    prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No"))
-            print("> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format(
-                    prop[j].name, j, prop[i].name, i, "Yes" if i_access_j else "No"))
+            print(
+                "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format(
+                    prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No"
+                )
+            )
+            print(
+                "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format(
+                    prop[j].name, j, prop[i].name, i, "Yes" if i_access_j else "No"
+                )
+            )
             if i_access_j and j_access_i:
                 p2pCapableGPUs[1] = j
                 break
@@ -542,7 +662,6 @@ def main():
     # Use first pair of p2p capable GPUs detected
     gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]]
 
-
     #
     # init device
     #
@@ -552,7 +671,7 @@ def main():
     # create propagators
     #
     propags = []
-    kerns   = []
+    kerns = []
 
     #
     # create kernels and propagators that are going to be used on device
@@ -574,7 +693,6 @@ def main():
     #
     start = time.time()
     for it in range(pars.nt):
-
         for propag in propags:
             propag.syncStream(propag.streamHalo)
 
@@ -605,26 +723,30 @@ def main():
 
     nops = 1.0e-9 * pars.nt * npoints / (end - start)
 
-    print("this code generates " , nops , " GPoints/sec / device ")
+    print("this code generates ", nops, " GPoints/sec / device ")
 
     #
     # get the result out of gpu
     #
     nz = 2 * (int)(pars.nz - 2 * pars.FD_ORDER)
     print(" nz= ", nz, " nx= ", pars.nx)
-    hOut = np.zeros((nz, pars.nx), dtype='float32')
+    hOut = np.zeros((nz, pars.nx), dtype="float32")
 
     istart = 0
     for propag in propags:
         checkCudaErrors(cuda.cuCtxSetCurrent(propag.context))
-        offset = pars.lead + pars.FD_ORDER * pars.nx * pars.ny + \
-                 (int)(pars.ny/2) * pars.nx
-
-        for j in range(pars.nz- 2*pars.FD_ORDER):
-            ptr = cuda.CUdeviceptr(int(propag.waveOut) + offset*4)
-
-            checkCudaErrors(cuda.cuMemcpyDtoH(hOut[istart].ctypes.data, ptr,
-                                              pars.nx * np.dtype(np.float32).itemsize))
+        offset = pars.lead + pars.FD_ORDER * pars.nx * pars.ny + (int)(pars.ny / 2) * pars.nx
+
+        for j in range(pars.nz - 2 * pars.FD_ORDER):
+            ptr = cuda.CUdeviceptr(int(propag.waveOut) + offset * 4)
+
+            checkCudaErrors(
+                cuda.cuMemcpyDtoH(
+                    hOut[istart].ctypes.data,
+                    ptr,
+                    pars.nx * np.dtype(np.float32).itemsize,
+                )
+            )
             offset += pars.nx * pars.ny
             istart += 1
 
@@ -638,19 +760,26 @@ def main():
         nrows = nz
         ncols = pars.nx
         dbz = hOut
-        dbz = np.reshape(dbz,(nrows, ncols))
+        dbz = np.reshape(dbz, (nrows, ncols))
 
         ##
         ## those are to plot results
         ##
         import matplotlib.pyplot as plt
-        import matplotlib.cm as cm
+
         fig, ax = plt.subplots()
         title = "test fd kernels up to " + str(pars.tmax_propag) + " ms "
         plt.title(title, fontsize=20)
-        im = ax.imshow(dbz, interpolation='bilinear', cmap=plt.get_cmap('Greys'), aspect='auto',
-                       origin='upper',extent=[1, pars.nx, nz, 1],
-                       vmax=abs(dbz).max(), vmin=-abs(dbz).max())
+        im = ax.imshow(
+            dbz,
+            interpolation="bilinear",
+            cmap=plt.get_cmap("Greys"),
+            aspect="auto",
+            origin="upper",
+            extent=[1, pars.nx, nz, 1],
+            vmax=abs(dbz).max(),
+            vmin=-abs(dbz).max(),
+        )
 
         fig.colorbar(im, ax=ax)
 
@@ -658,6 +787,7 @@ def main():
 
     print("Done")
 
+
 if __name__ == "__main__":
     display_graph = True
     verbose_prints = True
diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py
index e55b48ec2..18835ec9d 100644
--- a/cuda_bindings/examples/extra/jit_program_test.py
+++ b/cuda_bindings/examples/extra/jit_program_test.py
@@ -6,20 +6,24 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 import ctypes
+
 import numpy as np
+
 from cuda import cuda, nvrtc
 
+
 def ASSERT_DRV(err):
     if isinstance(err, cuda.CUresult):
         if err != cuda.CUresult.CUDA_SUCCESS:
-            raise RuntimeError('Cuda Error: {}'.format(err))
+            raise RuntimeError(f"Cuda Error: {err}")
     elif isinstance(err, nvrtc.nvrtcResult):
         if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
-            raise RuntimeError('Nvrtc Error: {}'.format(err))
+            raise RuntimeError(f"Nvrtc Error: {err}")
     else:
-        raise RuntimeError('Unknown error type: {}'.format(err))
+        raise RuntimeError(f"Unknown error type: {err}")
 
-saxpy = '''\
+
+saxpy = """\
 extern "C" __global__
 void saxpy(float a, float *x, float *y, float *out, size_t n)
 {
@@ -28,11 +32,12 @@ def ASSERT_DRV(err):
         out[tid] = a * x[tid] + y[tid];
     }
 }
-'''
+"""
+
 
 def main():
     # Init
-    err, = cuda.cuInit(0)
+    (err,) = cuda.cuInit(0)
     ASSERT_DRV(err)
 
     # Device
@@ -44,30 +49,34 @@ def main():
     ASSERT_DRV(err)
 
     # Create program
-    err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b'saxpy.cu', 0, None, None)
+    err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, None, None)
     ASSERT_DRV(err)
 
     # Get target architecture
-    err, major = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)
+    err, major = cuda.cuDeviceGetAttribute(
+        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice
+    )
     ASSERT_DRV(err)
-    err, minor = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)
+    err, minor = cuda.cuDeviceGetAttribute(
+        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice
+    )
     ASSERT_DRV(err)
     err, nvrtc_major, nvrtc_minor = nvrtc.nvrtcVersion()
     ASSERT_DRV(err)
-    use_cubin = (nvrtc_minor >= 1)
-    prefix = 'sm' if use_cubin else 'compute'
-    arch_arg = bytes(f'--gpu-architecture={prefix}_{major}{minor}', 'ascii')
+    use_cubin = nvrtc_minor >= 1
+    prefix = "sm" if use_cubin else "compute"
+    arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii")
 
     # Compile program
-    opts = [b'--fmad=false', arch_arg]
-    err, = nvrtc.nvrtcCompileProgram(prog, len(opts), opts)
+    opts = [b"--fmad=false", arch_arg]
+    (err,) = nvrtc.nvrtcCompileProgram(prog, len(opts), opts)
     ASSERT_DRV(err)
 
     # Get log from compilation
     err, logSize = nvrtc.nvrtcGetProgramLogSize(prog)
     ASSERT_DRV(err)
-    log = b' ' * logSize
-    err, = nvrtc.nvrtcGetProgramLog(prog, log)
+    log = b" " * logSize
+    (err,) = nvrtc.nvrtcGetProgramLog(prog, log)
     ASSERT_DRV(err)
     print(log.decode())
 
@@ -75,21 +84,21 @@ def main():
     if use_cubin:
         err, dataSize = nvrtc.nvrtcGetCUBINSize(prog)
         ASSERT_DRV(err)
-        data = b' ' * dataSize
-        err, = nvrtc.nvrtcGetCUBIN(prog, data)
+        data = b" " * dataSize
+        (err,) = nvrtc.nvrtcGetCUBIN(prog, data)
         ASSERT_DRV(err)
     else:
         err, dataSize = nvrtc.nvrtcGetPTXSize(prog)
         ASSERT_DRV(err)
-        data = b' ' * dataSize
-        err, = nvrtc.nvrtcGetPTX(prog, data)
+        data = b" " * dataSize
+        (err,) = nvrtc.nvrtcGetPTX(prog, data)
         ASSERT_DRV(err)
 
     # Load data as module data and retrieve function
     data = np.char.array(data)
     err, module = cuda.cuModuleLoadData(data)
     ASSERT_DRV(err)
-    err, kernel = cuda.cuModuleGetFunction(module, b'saxpy')
+    err, kernel = cuda.cuModuleGetFunction(module, b"saxpy")
     ASSERT_DRV(err)
 
     # Test the kernel
@@ -114,52 +123,61 @@ def main():
     err, stream = cuda.cuStreamCreate(0)
     ASSERT_DRV(err)
 
-    err, = cuda.cuMemcpyHtoDAsync(dX, hX, bufferSize, stream)
+    (err,) = cuda.cuMemcpyHtoDAsync(dX, hX, bufferSize, stream)
     ASSERT_DRV(err)
-    err, = cuda.cuMemcpyHtoDAsync(dY, hY, bufferSize, stream)
+    (err,) = cuda.cuMemcpyHtoDAsync(dY, hY, bufferSize, stream)
     ASSERT_DRV(err)
 
-    err, = cuda.cuStreamSynchronize(stream)
+    (err,) = cuda.cuStreamSynchronize(stream)
     ASSERT_DRV(err)
 
     # Assert values are different before running kernel
     hZ = a * hX + hY
     if np.allclose(hOut, hZ):
-        raise ValueError('Error inside tolerence for host-device vectors')
+        raise ValueError("Error inside tolerence for host-device vectors")
 
     arg_values = (a, dX, dY, dOut, n)
     arg_types = (ctypes.c_float, None, None, None, ctypes.c_size_t)
-    err, = cuda.cuLaunchKernel(kernel,
-                              NUM_BLOCKS, 1, 1,           # grid dim
-                              NUM_THREADS, 1, 1,          # block dim
-                              0, stream,                  # shared mem and stream
-                              (arg_values, arg_types), 0) # arguments
+    (err,) = cuda.cuLaunchKernel(
+        kernel,
+        NUM_BLOCKS,
+        1,
+        1,  # grid dim
+        NUM_THREADS,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        (arg_values, arg_types),
+        0,
+    )  # arguments
     ASSERT_DRV(err)
 
-    err, = cuda.cuMemcpyDtoHAsync(hOut, dOut, bufferSize, stream)
+    (err,) = cuda.cuMemcpyDtoHAsync(hOut, dOut, bufferSize, stream)
     ASSERT_DRV(err)
-    err, = cuda.cuStreamSynchronize(stream)
+    (err,) = cuda.cuStreamSynchronize(stream)
     ASSERT_DRV(err)
 
     # Assert values are same after running kernel
     hZ = a * hX + hY
     if not np.allclose(hOut, hZ):
-        raise ValueError('Error outside tolerence for host-device vectors')
+        raise ValueError("Error outside tolerence for host-device vectors")
 
-    err, = cuda.cuStreamDestroy(stream)
+    (err,) = cuda.cuStreamDestroy(stream)
     ASSERT_DRV(err)
 
-    err, = cuda.cuMemFree(dX)
+    (err,) = cuda.cuMemFree(dX)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(dY)
+    (err,) = cuda.cuMemFree(dY)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(dOut)
+    (err,) = cuda.cuMemFree(dOut)
     ASSERT_DRV(err)
 
-    err, = cuda.cuModuleUnload(module)
+    (err,) = cuda.cuModuleUnload(module)
     ASSERT_DRV(err)
-    err, = cuda.cuCtxDestroy(context)
+    (err,) = cuda.cuCtxDestroy(context)
     ASSERT_DRV(err)
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     main()
diff --git a/cuda_bindings/examples/extra/numba_emm_plugin.py b/cuda_bindings/examples/extra/numba_emm_plugin.py
index a80c3bbfe..45015ada4 100644
--- a/cuda_bindings/examples/extra/numba_emm_plugin.py
+++ b/cuda_bindings/examples/extra/numba_emm_plugin.py
@@ -52,18 +52,22 @@
 be restored after freeing it.
 """
 
+from ctypes import c_size_t
+
 from numba import cuda
-from numba.cuda import (HostOnlyCUDAMemoryManager, GetIpcHandleMixin,
-                        MemoryPointer, MemoryInfo)
+from numba.cuda import (
+    GetIpcHandleMixin,
+    HostOnlyCUDAMemoryManager,
+    MemoryInfo,
+    MemoryPointer,
+)
 
 from cuda import cuda as cuda_driver
 
-from ctypes import c_size_t
-
-
 # Python functions for allocation, deallocation, and memory info via the NVIDIA
 # CUDA Python Driver API
 
+
 def driver_alloc(size):
     """
     Allocate `size` bytes of device memory and return a device pointer to the
@@ -71,7 +75,7 @@ def driver_alloc(size):
     """
     err, ptr = cuda_driver.cuMemAlloc(size)
     if err != cuda_driver.CUresult.CUDA_SUCCESS:
-        raise RuntimeError(f'Unexpected error code {err} from cuMemAlloc')
+        raise RuntimeError(f"Unexpected error code {err} from cuMemAlloc")
     return ptr
 
 
@@ -79,9 +83,9 @@ def driver_free(ptr):
     """
     Free device memory pointed to by `ptr`.
     """
-    err, = cuda_driver.cuMemFree(ptr)
+    (err,) = cuda_driver.cuMemFree(ptr)
     if err != cuda_driver.CUresult.CUDA_SUCCESS:
-        raise RuntimeError(f'Unexpected error code {err} from cuMemFree')
+        raise RuntimeError(f"Unexpected error code {err} from cuMemFree")
 
 
 def driver_memory_info():
@@ -90,7 +94,7 @@ def driver_memory_info():
     """
     err, free, total = cuda_driver.cuMemGetInfo()
     if err != cuda_driver.CUresult.CUDA_SUCCESS:
-        raise RuntimeError(f'Unexpected error code {err} from cuMemGetInfo')
+        raise RuntimeError(f"Unexpected error code {err} from cuMemGetInfo")
     return free, total
 
 
@@ -99,6 +103,7 @@ def driver_memory_info():
 #
 #    https://numba.readthedocs.io/en/stable/cuda/external-memory.html#numba.cuda.BaseCUDAMemoryManager
 
+
 class DriverEMMPlugin(GetIpcHandleMixin, HostOnlyCUDAMemoryManager):
     def memalloc(self, size):
         ptr = driver_alloc(size)
@@ -152,10 +157,10 @@ def main():
     print(f"Free after freeing device array: {ctx.get_memory_info().free}")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     import argparse
+
     formatter = argparse.RawDescriptionHelpFormatter
-    parser = argparse.ArgumentParser(description=__doc__,
-                                     formatter_class=formatter)
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=formatter)
     parser.parse_args()
     main()
diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml
index 63c09db5c..374f43e59 100644
--- a/cuda_bindings/pyproject.toml
+++ b/cuda_bindings/pyproject.toml
@@ -46,3 +46,60 @@ versionfile_source = "cuda/bindings/_version.py"
 versionfile_build = "cuda/bindings/_version.py"
 tag_prefix = "v"
 parentdir_prefix = "cuda-python-"
+
+[tool.ruff]
+line-length = 120
+
+[tool.ruff.format]
+docstring-code-format = true
+
+exclude = ["cuda/bindings/_version.py"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle Error
+    "E",
+    # Pyflakes
+    "F",
+    # pycodestyle Warning
+    "W",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+]
+
+ignore = [
+    "UP006", 
+    "UP007", 
+    "E741", # ambiguous variable name such as I
+    "B007", # rename unused loop variable to _name
+    "UP035" # UP006, UP007, UP035 complain about deprecated typing.<type> use, but disregard backward compatibility with older Python versions
+]
+
+exclude = ["cuda/bindings/_version.py"]
+
+[tool.ruff.lint.per-file-ignores]
+"setup.py" = ["F401"]
+"__init__.py" = ["F401"]
+
+"examples/**/*" = [
+  "E722",
+  "E501" # line too long
+  ]
+
+"tests/**/*" = [
+  "E722", 
+  "UP022",
+  "E402", # module level import not at top of file
+  "F841"] # F841 complains about unused variables, but some assignments have side-effects that could be useful for tests (func calls for example)
+
+"benchmarks/**/*" = [
+  "E722", 
+  "UP022",
+  "E402", # module level import not at top of file
+  "F841"] # F841 complains about unused variables, but some assignments have side-effects that could be useful for tests (func calls for example)
diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index 2342e62ab..2cc9027bb 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -6,23 +6,23 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 
+import atexit
+import contextlib
 import glob
 import os
 import platform
+import shutil
 import sys
 import sysconfig
-import atexit
+import tempfile
 
+import versioneer
 from Cython import Tempita
 from Cython.Build import cythonize
 from pyclibrary import CParser
 from setuptools import find_packages, setup
-from setuptools.extension import Extension
 from setuptools.command.build_ext import build_ext
-import versioneer
-import tempfile
-import shutil
-
+from setuptools.extension import Extension
 
 # ----------------------------------------------------------------------
 # Fetch configuration options
@@ -31,48 +31,49 @@
 if not CUDA_HOME:
     CUDA_HOME = os.environ.get("CUDA_PATH")
 if not CUDA_HOME:
-    raise RuntimeError('Environment variable CUDA_HOME or CUDA_PATH is not set')
+    raise RuntimeError("Environment variable CUDA_HOME or CUDA_PATH is not set")
 
 CUDA_HOME = CUDA_HOME.split(os.pathsep)
 nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0")
 PARSER_CACHING = os.environ.get("CUDA_PYTHON_PARSER_CACHING", False)
-PARSER_CACHING = True if PARSER_CACHING else False
+PARSER_CACHING = bool(PARSER_CACHING)
 
 # ----------------------------------------------------------------------
 # Parse user-provided CUDA headers
 
 header_dict = {
-    'driver' : ['cuda.h',
-                'cudaProfiler.h',
-                'cudaEGL.h',
-                'cudaGL.h',
-                'cudaVDPAU.h'],
-    'runtime' : ['driver_types.h',
-                 'vector_types.h',
-                 'cuda_runtime.h',
-                 'surface_types.h',
-                 'texture_types.h',
-                 'library_types.h',
-                 'cuda_runtime_api.h',
-                 'device_types.h',
-                 'driver_functions.h',
-                 'cuda_profiler_api.h',
-                 'cuda_egl_interop.h',
-                 'cuda_gl_interop.h',
-                 'cuda_vdpau_interop.h'],
-    'nvrtc' : ['nvrtc.h']}
-
-replace = {' __device_builtin__ ':' ',
-           'CUDARTAPI ':' ',
-           'typedef __device_builtin__ enum cudaError cudaError_t;' : 'typedef cudaError cudaError_t;',
-           'typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t;' : 'typedef cudaOutputMode cudaOutputMode_t;',
-           'typedef enum cudaError cudaError_t;' : 'typedef cudaError cudaError_t;',
-           'typedef enum cudaOutputMode cudaOutputMode_t;' : 'typedef cudaOutputMode cudaOutputMode_t;',
-           'typedef enum cudaDataType_t cudaDataType_t;' : '',
-           'typedef enum libraryPropertyType_t libraryPropertyType_t;' : '',
-           '  enum ' : '   ',
-           ', enum ' : ', ',
-           '\\(enum ' : '(',}
+    "driver": ["cuda.h", "cudaProfiler.h", "cudaEGL.h", "cudaGL.h", "cudaVDPAU.h"],
+    "runtime": [
+        "driver_types.h",
+        "vector_types.h",
+        "cuda_runtime.h",
+        "surface_types.h",
+        "texture_types.h",
+        "library_types.h",
+        "cuda_runtime_api.h",
+        "device_types.h",
+        "driver_functions.h",
+        "cuda_profiler_api.h",
+        "cuda_egl_interop.h",
+        "cuda_gl_interop.h",
+        "cuda_vdpau_interop.h",
+    ],
+    "nvrtc": ["nvrtc.h"],
+}
+
+replace = {
+    " __device_builtin__ ": " ",
+    "CUDARTAPI ": " ",
+    "typedef __device_builtin__ enum cudaError cudaError_t;": "typedef cudaError cudaError_t;",
+    "typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;",
+    "typedef enum cudaError cudaError_t;": "typedef cudaError cudaError_t;",
+    "typedef enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;",
+    "typedef enum cudaDataType_t cudaDataType_t;": "",
+    "typedef enum libraryPropertyType_t libraryPropertyType_t;": "",
+    "  enum ": "   ",
+    ", enum ": ", ",
+    "\\(enum ": "(",
+}
 
 found_types = []
 found_structs = {}
@@ -80,7 +81,7 @@
 found_functions = []
 found_values = []
 
-include_path_list = [os.path.join(path, 'include') for path in CUDA_HOME]
+include_path_list = [os.path.join(path, "include") for path in CUDA_HOME]
 print(f'Parsing headers in "{include_path_list}" (Caching {PARSER_CACHING})')
 for library, header_list in header_dict.items():
     header_paths = []
@@ -91,35 +92,37 @@
                 header_paths += [path]
                 break
         if not os.path.exists(path):
-            print(f'Missing header {header}')
+            print(f"Missing header {header}")
 
-    print(f'Parsing {library} headers')
-    parser = CParser(header_paths,
-                     cache='./cache_{}'.format(library.split('.')[0]) if PARSER_CACHING else None,
-                     replace=replace)
+    print(f"Parsing {library} headers")
+    parser = CParser(
+        header_paths, cache="./cache_{}".format(library.split(".")[0]) if PARSER_CACHING else None, replace=replace
+    )
 
-    if library == 'driver':
-        CUDA_VERSION = parser.defs['macros']['CUDA_VERSION'] if 'CUDA_VERSION' in parser.defs['macros'] else 'Unknown'
-        print(f'Found CUDA_VERSION: {CUDA_VERSION}')
+    if library == "driver":
+        CUDA_VERSION = parser.defs["macros"].get("CUDA_VERSION", "Unknown")
+        print(f"Found CUDA_VERSION: {CUDA_VERSION}")
 
     # Combine types with others since they sometimes get tangled
-    found_types += {key for key in parser.defs['types']}
-    found_types += {key for key in parser.defs['structs']}
-    found_structs.update(parser.defs['structs'])
-    found_types += {key for key in parser.defs['unions']}
-    found_unions.update(parser.defs['unions'])
-    found_types += {key for key in parser.defs['enums']}
-    found_functions += {key for key in parser.defs['functions']}
-    found_values += {key for key in parser.defs['values']}
+    found_types += {key for key in parser.defs["types"]}
+    found_types += {key for key in parser.defs["structs"]}
+    found_structs.update(parser.defs["structs"])
+    found_types += {key for key in parser.defs["unions"]}
+    found_unions.update(parser.defs["unions"])
+    found_types += {key for key in parser.defs["enums"]}
+    found_functions += {key for key in parser.defs["functions"]}
+    found_values += {key for key in parser.defs["values"]}
 
 if len(found_functions) == 0:
     raise RuntimeError(f'Parser found no functions. Is CUDA_HOME setup correctly? (CUDA_HOME="{CUDA_HOME}")')
 
+
 # Unwrap struct and union members
 def unwrapMembers(found_dict):
     for key in found_dict:
-        members = [var for var, _, _ in found_dict[key]['members']]
-        found_dict[key]['members'] = members
+        members = [var for var, _, _ in found_dict[key]["members"]]
+        found_dict[key]["members"] = members
+
 
 unwrapMembers(found_structs)
 unwrapMembers(found_unions)
@@ -127,11 +130,13 @@ def unwrapMembers(found_dict):
 # ----------------------------------------------------------------------
 # Generate
 
+
 def fetch_input_files(path):
-    return [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.in')]
+    return [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".in")]
+
 
 def generate_output(infile, local):
-    assert infile.endswith('.in')
+    assert infile.endswith(".in")
     outfile = infile[:-3]
 
     with open(infile) as f:
@@ -140,19 +145,21 @@ def generate_output(infile, local):
     if os.path.exists(outfile):
         with open(outfile) as f:
             if f.read() == pxdcontent:
-                print(f'Skipping {infile} (No change)')
+                print(f"Skipping {infile} (No change)")
                 return
     with open(outfile, "w") as f:
-        print(f'Generating {infile}')
+        print(f"Generating {infile}")
         f.write(pxdcontent)
 
-path_list = [os.path.join('cuda'),
-             os.path.join('cuda', 'bindings'),
-             os.path.join('cuda', 'bindings', '_bindings'),
-             os.path.join('cuda', 'bindings', '_lib'),
-             os.path.join('cuda', 'bindings', '_lib', 'cyruntime'),
-             os.path.join('cuda', 'bindings', '_internal'),
-            ]
+
+path_list = [
+    os.path.join("cuda"),
+    os.path.join("cuda", "bindings"),
+    os.path.join("cuda", "bindings", "_bindings"),
+    os.path.join("cuda", "bindings", "_lib"),
+    os.path.join("cuda", "bindings", "_lib", "cyruntime"),
+    os.path.join("cuda", "bindings", "_internal"),
+]
 input_files = []
 for path in path_list:
     input_files += fetch_input_files(path)
@@ -171,19 +178,19 @@ def generate_output(infile, local):
 
 extra_compile_args = []
 extra_cythonize_kwargs = {}
-if sys.platform != 'win32':
+if sys.platform != "win32":
     extra_compile_args += [
-        '-std=c++14',
-        '-fpermissive',
-        '-Wno-deprecated-declarations',
-        '-D _GLIBCXX_ASSERTIONS',
-        '-fno-var-tracking-assignments'
+        "-std=c++14",
+        "-fpermissive",
+        "-Wno-deprecated-declarations",
+        "-D _GLIBCXX_ASSERTIONS",
+        "-fno-var-tracking-assignments",
     ]
-    if '--debug' in sys.argv:
-        extra_cythonize_kwargs['gdb_debug'] = True
-        extra_compile_args += ['-g', '-O0']
+    if "--debug" in sys.argv:
+        extra_cythonize_kwargs["gdb_debug"] = True
+        extra_compile_args += ["-g", "-O0"]
     else:
-        extra_compile_args += ['-O3']
+        extra_compile_args += ["-O3"]
 
 # For Setup
 extensions = []
@@ -193,6 +200,7 @@ def generate_output(infile, local):
 # ----------------------------------------------------------------------
 # Cythonize
 
+
 def prep_extensions(sources):
     pattern = sources[0]
     files = glob.glob(pattern)
@@ -216,21 +224,20 @@ def prep_extensions(sources):
 
 # new path for the bindings from cybind
 def rename_architecture_specific_files():
-    architechture_specific_files_dir = 'cuda/bindings/_internal/'
-    if sys.platform == 'linux':
-        src_files = glob.glob(os.path.join(path, '*_linux.pyx'))
-    elif sys.platform == 'win32':
-        src_files = glob.glob(os.path.join(path, '*_windows.pyx'))
+    if sys.platform == "linux":
+        src_files = glob.glob(os.path.join(path, "*_linux.pyx"))
+    elif sys.platform == "win32":
+        src_files = glob.glob(os.path.join(path, "*_windows.pyx"))
     else:
-        raise RuntimeError(f'platform is unrecognized: {sys.platform}')
+        raise RuntimeError(f"platform is unrecognized: {sys.platform}")
     dst_files = []
     for src in src_files:
         # Set up a temporary file; it must be under the cache directory so
         # that atomic moves within the same filesystem can be guaranteed
-        with tempfile.NamedTemporaryFile(delete=False, dir='.') as f:
+        with tempfile.NamedTemporaryFile(delete=False, dir=".") as f:
             shutil.copy2(src, f.name)
             f_name = f.name
-        dst = src.replace('_linux', '').replace('_windows', '')
+        dst = src.replace("_linux", "").replace("_windows", "")
         # atomic move with the destination guaranteed to be overwritten
         os.replace(f_name, f"./{dst}")
         dst_files.append(dst)
@@ -243,20 +250,17 @@ def rename_architecture_specific_files():
 @atexit.register
 def cleanup_dst_files():
     for dst in dst_files:
-        try:
+        with contextlib.suppress(FileNotFoundError):
             os.remove(dst)
-        except FileNotFoundError:
-            pass
 
 
 def do_cythonize(extensions):
     return cythonize(
         extensions,
         nthreads=nthreads,
-        compiler_directives=dict(
-            profile=True, language_level=3, embedsignature=True, binding=True
-        ),
-        **extra_cythonize_kwargs)
+        compiler_directives=dict(profile=True, language_level=3, embedsignature=True, binding=True),
+        **extra_cythonize_kwargs,
+    )
 
 
 sources_list = [
@@ -270,8 +274,8 @@ def do_cythonize(extensions):
     # public (deprecated, to be removed)
     ["cuda/*.pyx"],
-    # interal files used by generated bindings
+    # internal files used by generated bindings
-    ['cuda/bindings/_internal/nvjitlink.pyx'],
-    ['cuda/bindings/_internal/utils.pyx'],
+    ["cuda/bindings/_internal/nvjitlink.pyx"],
+    ["cuda/bindings/_internal/utils.pyx"],
 ]
 
 for sources in sources_list:
@@ -284,6 +288,7 @@ def do_cythonize(extensions):
 # 2) Compiled to .o files as part of build_ext
 # This class is solely for passing the value of nthreads to build_ext
 
+
 class ParallelBuildExtensions(build_ext):
     def initialize_options(self):
         build_ext.initialize_options(self)
@@ -293,6 +298,7 @@ def initialize_options(self):
     def finalize_options(self):
         build_ext.finalize_options(self)
 
+
 cmdclass = {"build_ext": ParallelBuildExtensions}
 cmdclass = versioneer.get_cmdclass(cmdclass)
 
diff --git a/cuda_bindings/tests/cython/test_cython.py b/cuda_bindings/tests/cython/test_cython.py
index 439d7aa07..ed5069219 100644
--- a/cuda_bindings/tests/cython/test_cython.py
+++ b/cuda_bindings/tests/cython/test_cython.py
@@ -22,9 +22,7 @@ def wrapped(*args, **kwargs):
     return wrapped
 
 
-cython_test_modules = ["test_ccuda",
-                       "test_ccudart",
-                       "test_interoperability_cython"]
+cython_test_modules = ["test_ccuda", "test_ccudart", "test_interoperability_cython"]
 
 
 for mod in cython_test_modules:
diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py
index d55a4209c..984f6aab0 100644
--- a/cuda_bindings/tests/test_cuda.py
+++ b/cuda_bindings/tests/test_cuda.py
@@ -6,211 +6,222 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 import platform
+import shutil
+import textwrap
+
+import numpy as np
 import pytest
+
 import cuda.cuda as cuda
 import cuda.cudart as cudart
-import numpy as np
-import textwrap
-import shutil
-from sysconfig import get_paths
+
 
 def driverVersionLessThan(target):
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, version = cuda.cuDriverGetVersion()
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     return version < target
 
+
 def supportsMemoryPool():
     err, isSupported = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
     return err == cudart.cudaError_t.cudaSuccess and isSupported
 
+
 def supportsManagedMemory():
     err, isSupported = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrManagedMemory, 0)
     return err == cudart.cudaError_t.cudaSuccess and isSupported
 
+
 def supportsCudaAPI(name):
     return name in dir(cuda)
 
+
 def callableBinary(name):
-    return shutil.which(name) != None
+    return shutil.which(name) is not None
+
 
 def test_cuda_memcpy():
     # Init CUDA
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Get device
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Construct context
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Allocate dev memory
     size = int(1024 * np.uint8().itemsize)
     err, dptr = cuda.cuMemAlloc(size)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Set h1 and h2 memory to be different
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # h1 to D
-    err, = cuda.cuMemcpyHtoD(dptr, h1, size)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuMemcpyHtoD(dptr, h1, size)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # D to h2
-    err, = cuda.cuMemcpyDtoH(h2, dptr, size)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuMemcpyDtoH(h2, dptr, size)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # Cleanup
-    err, = cuda.cuMemFree(dptr)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuMemFree(dptr)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
 
 def test_cuda_array():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # No context created
     desc = cuda.CUDA_ARRAY_DESCRIPTOR()
     err, arr = cuda.cuArrayCreate(desc)
-    assert(err == cuda.CUresult.CUDA_ERROR_INVALID_CONTEXT or err == cuda.CUresult.CUDA_ERROR_INVALID_VALUE)
+    assert err == cuda.CUresult.CUDA_ERROR_INVALID_CONTEXT or err == cuda.CUresult.CUDA_ERROR_INVALID_VALUE
 
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    # Desciption not filled
+    # Description not filled
     err, arr = cuda.cuArrayCreate(desc)
-    assert(err == cuda.CUresult.CUDA_ERROR_INVALID_VALUE)
+    assert err == cuda.CUresult.CUDA_ERROR_INVALID_VALUE
 
     # Pass
     desc.Format = cuda.CUarray_format.CU_AD_FORMAT_SIGNED_INT8
     desc.NumChannels = 1
     desc.Width = 1
     err, arr = cuda.cuArrayCreate(desc)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    (err,) = cuda.cuArrayDestroy(arr)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    err, = cuda.cuArrayDestroy(arr)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
 
 def test_cuda_repr_primitive():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(str(device) == '<CUdevice 0>')
-    assert(int(device) == 0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    assert str(device) == "<CUdevice 0>"
+    assert int(device) == 0
 
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(str(ctx).startswith('<CUcontext 0x'))
-    assert(int(ctx) > 0)
-    assert(hex(ctx) == hex(int(ctx)))
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    assert str(ctx).startswith("<CUcontext 0x")
+    assert int(ctx) > 0
+    assert hex(ctx) == hex(int(ctx))
 
     # CUdeviceptr
     err, dptr = cuda.cuMemAlloc(1024 * np.uint8().itemsize)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(str(dptr).startswith('<CUdeviceptr '))
-    assert(int(dptr) > 0)
-    err, = cuda.cuMemFree(dptr)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    assert str(dptr).startswith("<CUdeviceptr ")
+    assert int(dptr) > 0
+    (err,) = cuda.cuMemFree(dptr)
     size = 7
     dptr = cuda.CUdeviceptr(size)
-    assert(str(dptr) == '<CUdeviceptr {}>'.format(size))
-    assert(int(dptr) == size)
+    assert str(dptr) == f"<CUdeviceptr {size}>"
+    assert int(dptr) == size
     size = 4294967295
     dptr = cuda.CUdeviceptr(size)
-    assert(str(dptr) == '<CUdeviceptr {}>'.format(size))
-    assert(int(dptr) == size)
+    assert str(dptr) == f"<CUdeviceptr {size}>"
+    assert int(dptr) == size
     size = 18446744073709551615
     dptr = cuda.CUdeviceptr(size)
-    assert(str(dptr) == '<CUdeviceptr {}>'.format(size))
-    assert(int(dptr) == size)
+    assert str(dptr) == f"<CUdeviceptr {size}>"
+    assert int(dptr) == size
 
     # cuuint32_t
     size = 7
     int32 = cuda.cuuint32_t(size)
-    assert(str(int32) == '<cuuint32_t {}>'.format(size))
-    assert(int(int32) == size)
+    assert str(int32) == f"<cuuint32_t {size}>"
+    assert int(int32) == size
     size = 4294967295
     int32 = cuda.cuuint32_t(size)
-    assert(str(int32) == '<cuuint32_t {}>'.format(size))
-    assert(int(int32) == size)
+    assert str(int32) == f"<cuuint32_t {size}>"
+    assert int(int32) == size
     size = 18446744073709551615
     try:
         int32 = cuda.cuuint32_t(size)
-        raise RuntimeError('int32 = cuda.cuuint32_t(18446744073709551615) did not fail')
+        raise RuntimeError("int32 = cuda.cuuint32_t(18446744073709551615) did not fail")
     except OverflowError as err:
         pass
 
     # cuuint64_t
     size = 7
     int64 = cuda.cuuint64_t(size)
-    assert(str(int64) == '<cuuint64_t {}>'.format(size))
-    assert(int(int64) == size)
+    assert str(int64) == f"<cuuint64_t {size}>"
+    assert int(int64) == size
     size = 4294967295
     int64 = cuda.cuuint64_t(size)
-    assert(str(int64) == '<cuuint64_t {}>'.format(size))
-    assert(int(int64) == size)
+    assert str(int64) == f"<cuuint64_t {size}>"
+    assert int(int64) == size
     size = 18446744073709551615
     int64 = cuda.cuuint64_t(size)
-    assert(str(int64) == '<cuuint64_t {}>'.format(size))
-    assert(int(int64) == size)
+    assert str(int64) == f"<cuuint64_t {size}>"
+    assert int(int64) == size
+
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
 
 def test_cuda_repr_pointer():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Test 1: Classes representing pointers
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(str(ctx).startswith('<CUcontext 0x'))
-    assert(int(ctx) > 0)
-    assert(hex(ctx) == hex(int(ctx)))
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    assert str(ctx).startswith("<CUcontext 0x")
+    assert int(ctx) > 0
+    assert hex(ctx) == hex(int(ctx))
     randomCtxPointer = 12345
     randomCtx = cuda.CUcontext(randomCtxPointer)
-    assert(str(randomCtx) == '<CUcontext {}>'.format(hex(randomCtxPointer)))
-    assert(int(randomCtx) == randomCtxPointer)
-    assert(hex(randomCtx) == hex(randomCtxPointer))
+    assert str(randomCtx) == f"<CUcontext {hex(randomCtxPointer)}>"
+    assert int(randomCtx) == randomCtxPointer
+    assert hex(randomCtx) == hex(randomCtxPointer)
 
     # Test 2: Function pointers
     func = 12345
     b2d_cb = cuda.CUoccupancyB2DSize(func)
-    assert(str(b2d_cb) == '<CUoccupancyB2DSize {}>'.format(hex(func)))
-    assert(int(b2d_cb) == func)
-    assert(hex(b2d_cb) == hex(func))
+    assert str(b2d_cb) == f"<CUoccupancyB2DSize {hex(func)}>"
+    assert int(b2d_cb) == func
+    assert hex(b2d_cb) == hex(func)
+
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
 
 def test_cuda_uuid_list_access():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     err, uuid = cuda.cuDeviceGetUuid(device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(len(uuid.bytes) <= 16)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    assert len(uuid.bytes) <= 16
 
     jit_option = cuda.CUjit_option
     options = {
@@ -221,29 +232,29 @@ def test_cuda_uuid_list_access():
         jit_option.CU_JIT_LOG_VERBOSE: 5,
     }
 
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
 
 def test_cuda_cuModuleLoadDataEx():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, dev = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, dev)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     option_keys = [
         cuda.CUjit_option.CU_JIT_INFO_LOG_BUFFER,
         cuda.CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
         cuda.CUjit_option.CU_JIT_ERROR_LOG_BUFFER,
         cuda.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
-        cuda.CUjit_option.CU_JIT_LOG_VERBOSE
+        cuda.CUjit_option.CU_JIT_LOG_VERBOSE,
     ]
     err, mod = cuda.cuModuleLoadDataEx(0, 0, option_keys, [])
 
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
 
 def test_cuda_repr():
@@ -284,9 +295,11 @@ def test_cuda_repr():
 
 def test_cuda_struct_list_of_enums():
     desc = cuda.CUDA_TEXTURE_DESC_st()
-    desc.addressMode = [cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_WRAP,
-                        cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_CLAMP,
-                        cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_MIRROR]
+    desc.addressMode = [
+        cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_WRAP,
+        cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_CLAMP,
+        cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_MIRROR,
+    ]
 
     # # Too many args
     # desc.addressMode = [cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_WRAP,
@@ -298,6 +311,7 @@ def test_cuda_struct_list_of_enums():
     # desc.addressMode = [cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_WRAP,
     #                     cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_CLAMP]
 
+
 def test_cuda_CUstreamBatchMemOpParams():
     params = cuda.CUstreamBatchMemOpParams()
     params.operation = cuda.CUstreamBatchMemOpType.CU_STREAM_MEM_OP_WAIT_VALUE_32
@@ -305,16 +319,19 @@ def test_cuda_CUstreamBatchMemOpParams():
     params.writeValue.operation = cuda.CUstreamBatchMemOpType.CU_STREAM_MEM_OP_WAIT_VALUE_32
     params.flushRemoteWrites.operation = cuda.CUstreamBatchMemOpType.CU_STREAM_MEM_OP_WAIT_VALUE_32
     params.waitValue.value64 = 666
-    assert(int(params.waitValue.value64) == 666)
+    assert int(params.waitValue.value64) == 666
 
-@pytest.mark.skipif(driverVersionLessThan(11030) or not supportsMemoryPool(), reason='When new attributes were introduced')
+
+@pytest.mark.skipif(
+    driverVersionLessThan(11030) or not supportsMemoryPool(), reason="When new attributes were introduced"
+)
 def test_cuda_memPool_attr():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     poolProps = cuda.CUmemPoolProps()
     poolProps.allocType = cuda.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
@@ -323,136 +340,158 @@ def test_cuda_memPool_attr():
 
     attr_list = [None] * 8
     err, pool = cuda.cuMemPoolCreate(poolProps)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-    for idx, attr in enumerate([cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES,
-                                cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
-                                cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
-                                cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
-                                cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
-                                cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
-                                cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
-                                cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_HIGH]):
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    for idx, attr in enumerate(
+        [
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_HIGH,
+        ]
+    ):
         err, attr_tmp = cuda.cuMemPoolGetAttribute(pool, attr)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        assert err == cuda.CUresult.CUDA_SUCCESS
         attr_list[idx] = attr_tmp
 
-    for idxA, attr in enumerate([cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES,
-                                 cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
-                                 cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES]):
-        err, = cuda.cuMemPoolSetAttribute(pool, attr, 0)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
+    for idxA, attr in enumerate(
+        [
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
+        ]
+    ):
+        (err,) = cuda.cuMemPoolSetAttribute(pool, attr, 0)
+        assert err == cuda.CUresult.CUDA_SUCCESS
     for idx, attr in enumerate([cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD]):
-        err, = cuda.cuMemPoolSetAttribute(pool, attr, cuda.cuuint64_t(9))
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-    for idx, attr in enumerate([cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES,
-                                cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
-                                cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
-                                cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD]):
+        (err,) = cuda.cuMemPoolSetAttribute(pool, attr, cuda.cuuint64_t(9))
+        assert err == cuda.CUresult.CUDA_SUCCESS
+
+    for idx, attr in enumerate(
+        [
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
+            cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+        ]
+    ):
         err, attr_tmp = cuda.cuMemPoolGetAttribute(pool, attr)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        assert err == cuda.CUresult.CUDA_SUCCESS
         attr_list[idx] = attr_tmp
-    assert(attr_list[0] == 0)
-    assert(attr_list[1] == 0)
-    assert(attr_list[2] == 0)
-    assert(int(attr_list[3]) == 9)
+    assert attr_list[0] == 0
+    assert attr_list[1] == 0
+    assert attr_list[2] == 0
+    assert int(attr_list[3]) == 9
+
+    (err,) = cuda.cuMemPoolDestroy(pool)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    err, = cuda.cuMemPoolDestroy(pool)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
 
-@pytest.mark.skipif(driverVersionLessThan(11030) or not supportsManagedMemory(), reason='When new attributes were introduced')
+@pytest.mark.skipif(
+    driverVersionLessThan(11030) or not supportsManagedMemory(), reason="When new attributes were introduced"
+)
 def test_cuda_pointer_attr():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ptr = cuda.cuMemAllocManaged(0x1000, cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Individual version
-    attr_type_list = [cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER,
-                      # cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS, # TODO: Can I somehow test this?
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS,
-                      cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE]
+    attr_type_list = [
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER,
+        # cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS, # TODO: Can I somehow test this?
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS,
+        cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE,
+    ]
     attr_value_list = [None] * len(attr_type_list)
     for idx, attr in enumerate(attr_type_list):
         err, attr_tmp = cuda.cuPointerGetAttribute(attr, ptr)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        assert err == cuda.CUresult.CUDA_SUCCESS
         attr_value_list[idx] = attr_tmp
 
     # List version
     err, attr_value_list_v2 = cuda.cuPointerGetAttributes(len(attr_type_list), attr_type_list, ptr)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     for attr1, attr2 in zip(attr_value_list, attr_value_list_v2):
-        assert(str(attr1) == str(attr2))
+        assert str(attr1) == str(attr2)
 
     # Test setting values
     for val in (True, False):
-        err, = cuda.cuPointerSetAttribute(val, cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        (err,) = cuda.cuPointerSetAttribute(val, cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr)
+        assert err == cuda.CUresult.CUDA_SUCCESS
         err, attr_tmp = cuda.cuPointerGetAttribute(cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
-        assert(attr_tmp == val)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        assert attr_tmp == val
+
+    (err,) = cuda.cuMemFree(ptr)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    err, = cuda.cuMemFree(ptr)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
 
-@pytest.mark.skipif(not supportsManagedMemory(), reason='When new attributes were introduced')
+@pytest.mark.skipif(not supportsManagedMemory(), reason="When new attributes were introduced")
 def test_cuda_mem_range_attr():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    size = int(0x1000)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    size = 0x1000
     err, ptr = cuda.cuMemAllocManaged(size, cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION, cuda.CU_DEVICE_CPU)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, cuda.CU_DEVICE_CPU)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, concurrentSupported = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY, device)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION, cuda.CU_DEVICE_CPU)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, cuda.CU_DEVICE_CPU)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    err, concurrentSupported = cuda.cuDeviceGetAttribute(
+        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device
+    )
+    assert err == cuda.CUresult.CUDA_SUCCESS
     if concurrentSupported:
-        err, = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, device)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, device)
+        assert err == cuda.CUresult.CUDA_SUCCESS
         expected_values_list = ([1, -1, [0, -1, -2], -2],)
     else:
         expected_values_list = ([1, -1, [-1, -2, -2], -2], [0, -2, [-2, -2, -2], -2])
 
     # Individual version
-    attr_type_list = [cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
-                      cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
-                      cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY,
-                      cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION]
+    attr_type_list = [
+        cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
+        cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
+        cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY,
+        cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
+    ]
     attr_type_size_list = [4, 4, 12, 4]
     attr_value_list = [None] * len(attr_type_list)
     for idx in range(len(attr_type_list)):
         err, attr_tmp = cuda.cuMemRangeGetAttribute(attr_type_size_list[idx], attr_type_list[idx], ptr, size)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        assert err == cuda.CUresult.CUDA_SUCCESS
         attr_value_list[idx] = attr_tmp
 
     matched = False
@@ -461,33 +500,36 @@ def test_cuda_mem_range_attr():
             matched = True
             break
     if not matched:
-        raise RuntimeError(f'attr_value_list {attr_value_list} did not match any {expected_values_list}')
+        raise RuntimeError(f"attr_value_list {attr_value_list} did not match any {expected_values_list}")
 
     # List version
-    err, attr_value_list_v2 = cuda.cuMemRangeGetAttributes(attr_type_size_list, attr_type_list, len(attr_type_list), ptr, size)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    err, attr_value_list_v2 = cuda.cuMemRangeGetAttributes(
+        attr_type_size_list, attr_type_list, len(attr_type_list), ptr, size
+    )
+    assert err == cuda.CUresult.CUDA_SUCCESS
     for attr1, attr2 in zip(attr_value_list, attr_value_list_v2):
-        assert(str(attr1) == str(attr2))
+        assert str(attr1) == str(attr2)
+
+    (err,) = cuda.cuMemFree(ptr)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    err, = cuda.cuMemFree(ptr)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
 
-@pytest.mark.skipif(driverVersionLessThan(11040) or not supportsMemoryPool(), reason='Mempool for graphs not supported')
+@pytest.mark.skipif(driverVersionLessThan(11040) or not supportsMemoryPool(), reason="Mempool for graphs not supported")
 def test_cuda_graphMem_attr():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     err, stream = cuda.cuStreamCreate(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     err, graph = cuda.cuGraphCreate(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     allocSize = 1
 
@@ -498,83 +540,95 @@ def test_cuda_graphMem_attr():
     params.bytesize = allocSize
 
     err, allocNode = cuda.cuGraphAddMemAllocNode(graph, None, 0, params)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, freeNode = cuda.cuGraphAddMemFreeNode(graph, [allocNode], 1, params.dptr)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     err, graphExec = cuda.cuGraphInstantiate(graph, 0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    err, = cuda.cuGraphLaunch(graphExec, stream)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuGraphLaunch(graphExec, stream)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     err, used = cuda.cuDeviceGetGraphMemAttribute(device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, usedHigh = cuda.cuDeviceGetGraphMemAttribute(device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_USED_MEM_HIGH)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, reserved = cuda.cuDeviceGetGraphMemAttribute(device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, reservedHigh = cuda.cuDeviceGetGraphMemAttribute(device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    err, reserved = cuda.cuDeviceGetGraphMemAttribute(
+        device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT
+    )
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    err, reservedHigh = cuda.cuDeviceGetGraphMemAttribute(
+        device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH
+    )
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     assert int(used) >= allocSize
     assert int(usedHigh) == int(used)
     assert int(reserved) == int(usedHigh)
     assert int(reservedHigh) == int(reserved)
 
-    err, = cuda.cuGraphDestroy(graph)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuStreamDestroy(stream)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuGraphDestroy(graph)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuStreamDestroy(stream)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-@pytest.mark.skipif(driverVersionLessThan(12010)
-                    or not supportsCudaAPI('cuCoredumpSetAttributeGlobal')
-                    or not supportsCudaAPI('cuCoredumpGetAttributeGlobal'), reason='Coredump API not present')
+
+@pytest.mark.skipif(
+    driverVersionLessThan(12010)
+    or not supportsCudaAPI("cuCoredumpSetAttributeGlobal")
+    or not supportsCudaAPI("cuCoredumpGetAttributeGlobal"),
+    reason="Coredump API not present",
+)
 def test_cuda_coredump_attr():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     attr_list = [None] * 6
 
-    err, = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_TRIGGER_HOST, False)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_FILE, b'corefile')
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_PIPE, b'corepipe')
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_LIGHTWEIGHT, True)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-    for idx, attr in enumerate([cuda.CUcoredumpSettings.CU_COREDUMP_TRIGGER_HOST,
-                                cuda.CUcoredumpSettings.CU_COREDUMP_FILE,
-                                cuda.CUcoredumpSettings.CU_COREDUMP_PIPE,
-                                cuda.CUcoredumpSettings.CU_COREDUMP_LIGHTWEIGHT,
-                                ]):
+    (err,) = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_TRIGGER_HOST, False)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_FILE, b"corefile")
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_PIPE, b"corepipe")
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_LIGHTWEIGHT, True)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+    for idx, attr in enumerate(
+        [
+            cuda.CUcoredumpSettings.CU_COREDUMP_TRIGGER_HOST,
+            cuda.CUcoredumpSettings.CU_COREDUMP_FILE,
+            cuda.CUcoredumpSettings.CU_COREDUMP_PIPE,
+            cuda.CUcoredumpSettings.CU_COREDUMP_LIGHTWEIGHT,
+        ]
+    ):
         err, attr_tmp = cuda.cuCoredumpGetAttributeGlobal(attr)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        assert err == cuda.CUresult.CUDA_SUCCESS
         attr_list[idx] = attr_tmp
 
-    assert(attr_list[0] == False)
-    assert(attr_list[1] == b'corefile')
-    assert(attr_list[2] == b'corepipe')
-    assert(attr_list[3] == True)
+    assert attr_list[0] is False
+    assert attr_list[1] == b"corefile"
+    assert attr_list[2] == b"corepipe"
+    assert attr_list[3] is True
+
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
 
 def test_get_error_name_and_string():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     err, device = cuda.cuDeviceGet(0)
     _, s = cuda.cuGetErrorString(err)
@@ -587,134 +641,144 @@ def test_get_error_name_and_string():
     assert s == b"invalid device ordinal"
     _, s = cuda.cuGetErrorName(err)
     assert s == b"CUDA_ERROR_INVALID_DEVICE"
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
 
-@pytest.mark.skipif(not callableBinary('nvidia-smi'), reason='Binary existance needed')
+@pytest.mark.skipif(not callableBinary("nvidia-smi"), reason="Binary existance needed")
 def test_device_get_name():
     import subprocess
 
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     p = subprocess.run(
-        ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
-        stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
     )
 
-    delimiter = b'\r\n' if platform.system() == "Windows" else b'\n'
+    delimiter = b"\r\n" if platform.system() == "Windows" else b"\n"
     expect = p.stdout.split(delimiter)
     size = 64
     _, got = cuda.cuDeviceGetName(size, device)
-    got = got.split(b'\x00')[0]
-    if any(b'Unable to determine the device handle for' in result for result in expect):
+    got = got.split(b"\x00")[0]
+    if any(b"Unable to determine the device handle for" in result for result in expect):
         # Undeterministic devices get waived
         pass
     else:
         assert any(got in result for result in expect)
 
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
 
 # TODO: cuStreamGetCaptureInfo_v2
-@pytest.mark.skipif(driverVersionLessThan(11030), reason='Driver too old for cuStreamGetCaptureInfo_v2')
+@pytest.mark.skipif(driverVersionLessThan(11030), reason="Driver too old for cuStreamGetCaptureInfo_v2")
 def test_stream_capture():
     pass
 
+
 def test_profiler():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuProfilerStart()
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuProfilerStop()
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuProfilerStart()
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuProfilerStop()
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
 
 def test_eglFrame():
     val = cuda.CUeglFrame()
     # [<CUarray 0x0>, <CUarray 0x0>, <CUarray 0x0>]
-    assert(int(val.frame.pArray[0]) == 0)
-    assert(int(val.frame.pArray[1]) == 0)
-    assert(int(val.frame.pArray[2]) == 0)
-    val.frame.pArray = [1,2,3]
+    assert int(val.frame.pArray[0]) == 0
+    assert int(val.frame.pArray[1]) == 0
+    assert int(val.frame.pArray[2]) == 0
+    val.frame.pArray = [1, 2, 3]
     # [<CUarray 0x1>, <CUarray 0x2>, <CUarray 0x3>]
-    assert(int(val.frame.pArray[0]) == 1)
-    assert(int(val.frame.pArray[1]) == 2)
-    assert(int(val.frame.pArray[2]) == 3)
-    val.frame.pArray = [cuda.CUarray(4),2,3]
+    assert int(val.frame.pArray[0]) == 1
+    assert int(val.frame.pArray[1]) == 2
+    assert int(val.frame.pArray[2]) == 3
+    val.frame.pArray = [cuda.CUarray(4), 2, 3]
     # [<CUarray 0x4>, <CUarray 0x2>, <CUarray 0x3>]
-    assert(int(val.frame.pArray[0]) == 4)
-    assert(int(val.frame.pArray[1]) == 2)
-    assert(int(val.frame.pArray[2]) == 3)
+    assert int(val.frame.pArray[0]) == 4
+    assert int(val.frame.pArray[1]) == 2
+    assert int(val.frame.pArray[2]) == 3
     val.frame.pPitch = [4, 2, 3]
     # [4, 2, 3]
-    assert(int(val.frame.pPitch[0]) == 4)
-    assert(int(val.frame.pPitch[1]) == 2)
-    assert(int(val.frame.pPitch[2]) == 3)
-    val.frame.pPitch = [1,2,3]
-    assert(int(val.frame.pPitch[0]) == 1)
-    assert(int(val.frame.pPitch[1]) == 2)
-    assert(int(val.frame.pPitch[2]) == 3)
+    assert int(val.frame.pPitch[0]) == 4
+    assert int(val.frame.pPitch[1]) == 2
+    assert int(val.frame.pPitch[2]) == 3
+    val.frame.pPitch = [1, 2, 3]
+    assert int(val.frame.pPitch[0]) == 1
+    assert int(val.frame.pPitch[1]) == 2
+    assert int(val.frame.pPitch[2]) == 3
+
 
 def test_char_range():
     val = cuda.CUipcMemHandle_st()
     for x in range(-128, 0):
         val.reserved = [x] * 64
-        assert(val.reserved[0] == 256 + x)
+        assert val.reserved[0] == 256 + x
     for x in range(0, 256):
         val.reserved = [x] * 64
-        assert(val.reserved[0] == x)
+        assert val.reserved[0] == x
+
 
 def test_anon_assign():
     val1 = cuda.CUexecAffinityParam_st()
     val2 = cuda.CUexecAffinityParam_st()
 
-    assert(val1.param.smCount.val == 0)
+    assert val1.param.smCount.val == 0
     val1.param.smCount.val = 5
-    assert(val1.param.smCount.val == 5)
+    assert val1.param.smCount.val == 5
     val2.param.smCount.val = 11
-    assert(val2.param.smCount.val == 11)
+    assert val2.param.smCount.val == 11
 
     val1.param = val2.param
-    assert(val1.param.smCount.val == 11)
+    assert val1.param.smCount.val == 11
+
 
 def test_union_assign():
     val = cuda.CUlaunchAttributeValue()
-    val.clusterDim.x, val.clusterDim.y, val.clusterDim.z = 9,9,9
+    val.clusterDim.x, val.clusterDim.y, val.clusterDim.z = 9, 9, 9
     attr = cuda.CUlaunchAttribute()
     attr.value = val
 
-    assert(val.clusterDim.x == 9)
-    assert(val.clusterDim.y == 9)
-    assert(val.clusterDim.z == 9)
+    assert val.clusterDim.x == 9
+    assert val.clusterDim.y == 9
+    assert val.clusterDim.z == 9
+
 
 def test_invalid_repr_attribute():
     val = cuda.CUlaunchAttributeValue()
     string = str(val)
 
-@pytest.mark.skipif(driverVersionLessThan(12020)
-                    or not supportsCudaAPI('cuGraphAddNode')
-                    or not supportsCudaAPI('cuGraphNodeSetParams')
-                    or not supportsCudaAPI('cuGraphExecNodeSetParams'), reason='Polymorphic graph APIs required')
+
+@pytest.mark.skipif(
+    driverVersionLessThan(12020)
+    or not supportsCudaAPI("cuGraphAddNode")
+    or not supportsCudaAPI("cuGraphNodeSetParams")
+    or not supportsCudaAPI("cuGraphExecNodeSetParams"),
+    reason="Polymorphic graph APIs required",
+)
 def test_graph_poly():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, stream = cuda.cuStreamCreate(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # cuGraphAddNode
 
@@ -723,18 +787,18 @@ def test_graph_poly():
     buffers = []
     for _ in range(2):
         err, dptr = cuda.cuMemAlloc(size)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        assert err == cuda.CUresult.CUDA_SUCCESS
         buffers += [(np.full(size, 2).astype(np.uint8), dptr)]
 
     # Update dev buffers
     for host, device in buffers:
-        err, = cuda.cuMemcpyHtoD(device, host, size)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        (err,) = cuda.cuMemcpyHtoD(device, host, size)
+        assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Create graph
     nodes = []
     err, graph = cuda.cuGraphCreate(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Memset
     host, device = buffers[0]
@@ -746,7 +810,7 @@ def test_graph_poly():
     memsetParams.memset.dst = device
     memsetParams.memset.value = 1
     err, node = cuda.cuGraphAddNode(graph, None, 0, memsetParams)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     nodes += [node]
 
     # Memcpy
@@ -761,103 +825,109 @@ def test_graph_poly():
     memcpyParams.memcpy.copyParams.Height = 1
     memcpyParams.memcpy.copyParams.Depth = 1
     err, node = cuda.cuGraphAddNode(graph, None, 0, memcpyParams)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     nodes += [node]
 
     # Instantiate, execute, validate
     err, graphExec = cuda.cuGraphInstantiate(graph, 0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuGraphLaunch(graphExec, stream)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuStreamSynchronize(stream)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuGraphLaunch(graphExec, stream)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuStreamSynchronize(stream)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Validate
     for host, device in buffers:
-        err, = cuda.cuMemcpyDtoH(host, device, size)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(np.array_equal(buffers[0][0], np.full(size, 1).astype(np.uint8)))
-    assert(np.array_equal(buffers[1][0], np.full(size, 2).astype(np.uint8)))
+        (err,) = cuda.cuMemcpyDtoH(host, device, size)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+    assert np.array_equal(buffers[0][0], np.full(size, 1).astype(np.uint8))
+    assert np.array_equal(buffers[1][0], np.full(size, 2).astype(np.uint8))
 
     # cuGraphNodeSetParams
     host, device = buffers[1]
     err, memcpyParamsCopy = cuda.cuGraphMemcpyNodeGetParams(nodes[1])
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(int(memcpyParamsCopy.srcDevice) == int(device))
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    assert int(memcpyParamsCopy.srcDevice) == int(device)
     host, device = buffers[0]
     memcpyParams.memcpy.copyParams.srcDevice = device
-    err, = cuda.cuGraphNodeSetParams(nodes[1], memcpyParams)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuGraphNodeSetParams(nodes[1], memcpyParams)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, memcpyParamsCopy = cuda.cuGraphMemcpyNodeGetParams(nodes[1])
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(int(memcpyParamsCopy.srcDevice) == int(device))
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    assert int(memcpyParamsCopy.srcDevice) == int(device)
 
     # cuGraphExecNodeSetParams
     memsetParams.memset.value = 11
-    err, = cuda.cuGraphExecNodeSetParams(graphExec, nodes[0], memsetParams)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuGraphLaunch(graphExec, stream)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuStreamSynchronize(stream)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuMemcpyDtoH(buffers[0][0], buffers[0][1], size)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(np.array_equal(buffers[0][0], np.full(size, 11).astype(np.uint8)))
+    (err,) = cuda.cuGraphExecNodeSetParams(graphExec, nodes[0], memsetParams)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuGraphLaunch(graphExec, stream)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuStreamSynchronize(stream)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuMemcpyDtoH(buffers[0][0], buffers[0][1], size)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    assert np.array_equal(buffers[0][0], np.full(size, 11).astype(np.uint8))
 
     # Cleanup
-    err, = cuda.cuMemFree(buffers[0][1])
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuMemFree(buffers[1][1])
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuGraphExecDestroy(graphExec)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuGraphDestroy(graph)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuStreamDestroy(stream)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-@pytest.mark.skipif(driverVersionLessThan(12040)
-                    or not supportsCudaAPI('cuDeviceGetDevResource'), reason='Polymorphic graph APIs required')
+    (err,) = cuda.cuMemFree(buffers[0][1])
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuMemFree(buffers[1][1])
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuGraphExecDestroy(graphExec)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuGraphDestroy(graph)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuStreamDestroy(stream)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+
+
+@pytest.mark.skipif(
+    driverVersionLessThan(12040) or not supportsCudaAPI("cuDeviceGetDevResource"),
+    reason="Polymorphic graph APIs required",
+)
 def test_cuDeviceGetDevResource():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, resource_in = cuda.cuDeviceGetDevResource(device, cuda.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM)
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     err, res, count, rem = cuda.cuDevSmResourceSplitByCount(0, resource_in, 0, 2)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(count != 0)
-    assert(len(res) == 0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    assert count != 0
+    assert len(res) == 0
     err, res, count_same, rem = cuda.cuDevSmResourceSplitByCount(count, resource_in, 0, 2)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(count == count_same)
-    assert(len(res) == count)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    assert count == count_same
+    assert len(res) == count
     err, res, count, rem = cuda.cuDevSmResourceSplitByCount(3, resource_in, 0, 2)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
-    assert(len(res) == 3)
+    assert err == cuda.CUresult.CUDA_SUCCESS
+    assert len(res) == 3
+
+    (err,) = cuda.cuCtxDestroy(ctx)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    err, = cuda.cuCtxDestroy(ctx)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
 
-@pytest.mark.skipif(driverVersionLessThan(12030)
-                    or not supportsCudaAPI('cuGraphConditionalHandleCreate'), reason='Conditional graph APIs required')
+@pytest.mark.skipif(
+    driverVersionLessThan(12030) or not supportsCudaAPI("cuGraphConditionalHandleCreate"),
+    reason="Conditional graph APIs required",
+)
 def test_conditional():
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     err, graph = cuda.cuGraphCreate(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
     err, handle = cuda.cuGraphConditionalHandleCreate(graph, ctx, 0, 0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     params = cuda.CUgraphNodeParams()
     params.type = cuda.CUgraphNodeType.CU_GRAPH_NODE_TYPE_CONDITIONAL
@@ -866,10 +936,10 @@ def test_conditional():
     params.conditional.size = 1
     params.conditional.ctx = ctx
 
-    assert(len(params.conditional.phGraph_out) == 1)
-    assert(int(params.conditional.phGraph_out[0]) == 0)
+    assert len(params.conditional.phGraph_out) == 1
+    assert int(params.conditional.phGraph_out[0]) == 0
     err, node = cuda.cuGraphAddNode(graph, None, 0, params)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
-    assert(len(params.conditional.phGraph_out) == 1)
-    assert(int(params.conditional.phGraph_out[0]) != 0)
+    assert len(params.conditional.phGraph_out) == 1
+    assert int(params.conditional.phGraph_out[0]) != 0
diff --git a/cuda_bindings/tests/test_cudart.py b/cuda_bindings/tests/test_cudart.py
index 0e2c0af09..88f1b968a 100644
--- a/cuda_bindings/tests/test_cudart.py
+++ b/cuda_bindings/tests/test_cudart.py
@@ -5,36 +5,44 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
-from _pytest.mark.structures import store_mark
 import ctypes
-import cuda.cuda as cuda
-import cuda.cudart as cudart
 import math
+
 import numpy as np
 import pytest
 
+import cuda.cuda as cuda
+import cuda.cudart as cudart
+
+
 def isSuccess(err):
     return err == cudart.cudaError_t.cudaSuccess
 
+
 def assertSuccess(err):
-    assert(isSuccess(err))
+    assert isSuccess(err)
+
 
 def driverVersionLessThan(target):
     err, version = cudart.cudaDriverGetVersion()
     assertSuccess(err)
     return version < target
 
+
 def supportsMemoryPool():
     err, isSupported = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
     return isSuccess(err) and isSupported
 
+
 def supportsSparseTexturesDeviceFilter():
     err, isSupported = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrSparseCudaArraySupported, 0)
     return isSuccess(err) and isSupported
 
+
 def supportsCudaAPI(name):
     return name in dir(cuda) or dir(cudart)
 
+
 def test_cudart_memcpy():
     # Allocate dev memory
     size = 1024 * np.uint8().itemsize
@@ -44,44 +52,46 @@ def test_cudart_memcpy():
     # Set h1 and h2 memory to be different
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # h1 to D
-    err, = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
+    (err,) = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
     assertSuccess(err)
 
     # D to h2
-    err, = cudart.cudaMemcpy(h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
+    (err,) = cudart.cudaMemcpy(h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
     assertSuccess(err)
 
     # Validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # Cleanup
-    err, = cudart.cudaFree(dptr)
+    (err,) = cudart.cudaFree(dptr)
     assertSuccess(err)
 
+
 def test_cudart_hostRegister():
     # Use hostRegister API to check for correct enum return values
     page_size = 80
     addr_host = np.full(page_size * 3, 1).astype(np.uint8)
     addr = addr_host.ctypes.data
 
-    size_0 = ((16 * page_size) / 8)
-    addr_0 = addr + int(((0 * page_size) / 8))
-    size_1 = ((16 * page_size) / 8)
-    addr_1 = addr + int(((8 * page_size) / 8))
+    size_0 = (16 * page_size) / 8
+    addr_0 = addr + int((0 * page_size) / 8)
+    size_1 = (16 * page_size) / 8
+    addr_1 = addr + int((8 * page_size) / 8)
 
-    err, = cudart.cudaHostRegister(addr_0, size_0, 3)
+    (err,) = cudart.cudaHostRegister(addr_0, size_0, 3)
     assertSuccess(err)
-    err, = cudart.cudaHostRegister(addr_1, size_1, 3)
-    assert(err == cudart.cudaError_t.cudaErrorHostMemoryAlreadyRegistered)
+    (err,) = cudart.cudaHostRegister(addr_1, size_1, 3)
+    assert err == cudart.cudaError_t.cudaErrorHostMemoryAlreadyRegistered
 
-    err, = cudart.cudaHostUnregister(addr_1)
-    assert(err == cudart.cudaError_t.cudaErrorInvalidValue)
-    err, = cudart.cudaHostUnregister(addr_0)
+    (err,) = cudart.cudaHostUnregister(addr_1)
+    assert err == cudart.cudaError_t.cudaErrorInvalidValue
+    (err,) = cudart.cudaHostUnregister(addr_0)
     assertSuccess(err)
 
+
 def test_cudart_class_reference():
     offset = 1
     width = 4
@@ -95,9 +105,9 @@ def test_cudart_class_reference():
     externalMemoryMipmappedArrayDesc = cudart.cudaExternalMemoryMipmappedArrayDesc()
 
     # Get/set class attributes
-    extent.width  = width
+    extent.width = width
     extent.height = height
-    extent.depth  = depth
+    extent.depth = depth
 
     formatDesc.x = 8
     formatDesc.y = 0
@@ -105,50 +115,51 @@ def test_cudart_class_reference():
     formatDesc.w = 0
     formatDesc.f = cudart.cudaChannelFormatKind.cudaChannelFormatKindSigned
 
-    externalMemoryMipmappedArrayDesc.offset     = offset
+    externalMemoryMipmappedArrayDesc.offset = offset
     externalMemoryMipmappedArrayDesc.formatDesc = formatDesc
-    externalMemoryMipmappedArrayDesc.extent     = extent
-    externalMemoryMipmappedArrayDesc.flags      = flags
-    externalMemoryMipmappedArrayDesc.numLevels  = numMipLevels
+    externalMemoryMipmappedArrayDesc.extent = extent
+    externalMemoryMipmappedArrayDesc.flags = flags
+    externalMemoryMipmappedArrayDesc.numLevels = numMipLevels
 
     # Can manipulate child structure values directly
-    externalMemoryMipmappedArrayDesc.extent.width  = width+1
-    externalMemoryMipmappedArrayDesc.extent.height = height+1
-    externalMemoryMipmappedArrayDesc.extent.depth  = depth+1
-    assert(externalMemoryMipmappedArrayDesc.extent.width == width+1)
-    assert(externalMemoryMipmappedArrayDesc.extent.height == height+1)
-    assert(externalMemoryMipmappedArrayDesc.extent.depth == depth+1)
+    externalMemoryMipmappedArrayDesc.extent.width = width + 1
+    externalMemoryMipmappedArrayDesc.extent.height = height + 1
+    externalMemoryMipmappedArrayDesc.extent.depth = depth + 1
+    assert externalMemoryMipmappedArrayDesc.extent.width == width + 1
+    assert externalMemoryMipmappedArrayDesc.extent.height == height + 1
+    assert externalMemoryMipmappedArrayDesc.extent.depth == depth + 1
 
     externalMemoryMipmappedArrayDesc.formatDesc.x = 20
     externalMemoryMipmappedArrayDesc.formatDesc.y = 21
     externalMemoryMipmappedArrayDesc.formatDesc.z = 22
     externalMemoryMipmappedArrayDesc.formatDesc.w = 23
     externalMemoryMipmappedArrayDesc.formatDesc.f = cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat
-    assert(externalMemoryMipmappedArrayDesc.formatDesc.x == 20)
-    assert(externalMemoryMipmappedArrayDesc.formatDesc.y == 21)
-    assert(externalMemoryMipmappedArrayDesc.formatDesc.z == 22)
-    assert(externalMemoryMipmappedArrayDesc.formatDesc.w == 23)
-    assert(externalMemoryMipmappedArrayDesc.formatDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat)
+    assert externalMemoryMipmappedArrayDesc.formatDesc.x == 20
+    assert externalMemoryMipmappedArrayDesc.formatDesc.y == 21
+    assert externalMemoryMipmappedArrayDesc.formatDesc.z == 22
+    assert externalMemoryMipmappedArrayDesc.formatDesc.w == 23
+    assert externalMemoryMipmappedArrayDesc.formatDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat
 
     # Can copy classes over
     externalMemoryMipmappedArrayDesc.extent = extent
-    assert(externalMemoryMipmappedArrayDesc.extent.width == width)
-    assert(externalMemoryMipmappedArrayDesc.extent.height == height)
-    assert(externalMemoryMipmappedArrayDesc.extent.depth == depth)
+    assert externalMemoryMipmappedArrayDesc.extent.width == width
+    assert externalMemoryMipmappedArrayDesc.extent.height == height
+    assert externalMemoryMipmappedArrayDesc.extent.depth == depth
 
     externalMemoryMipmappedArrayDesc.formatDesc = formatDesc
-    assert(externalMemoryMipmappedArrayDesc.formatDesc.x == 8)
-    assert(externalMemoryMipmappedArrayDesc.formatDesc.y == 0)
-    assert(externalMemoryMipmappedArrayDesc.formatDesc.z == 0)
-    assert(externalMemoryMipmappedArrayDesc.formatDesc.w == 0)
-    assert(externalMemoryMipmappedArrayDesc.formatDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindSigned)
+    assert externalMemoryMipmappedArrayDesc.formatDesc.x == 8
+    assert externalMemoryMipmappedArrayDesc.formatDesc.y == 0
+    assert externalMemoryMipmappedArrayDesc.formatDesc.z == 0
+    assert externalMemoryMipmappedArrayDesc.formatDesc.w == 0
+    assert externalMemoryMipmappedArrayDesc.formatDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindSigned
+
 
-@pytest.mark.skipif(not supportsSparseTexturesDeviceFilter(), reason='Sparse Texture Device Filter')
+@pytest.mark.skipif(not supportsSparseTexturesDeviceFilter(), reason="Sparse Texture Device Filter")
 def test_cudart_class_inline():
     extent = cudart.cudaExtent()
-    extent.width  = 1000
+    extent.width = 1000
     extent.height = 500
-    extent.depth  = 0
+    extent.depth = 0
 
     desc = cudart.cudaChannelFormatDesc()
     desc.x = 32
@@ -158,12 +169,12 @@ def test_cudart_class_inline():
     desc.f = cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat
 
     numChannels = 4
-    numBytesPerChannel = desc.x/8
+    numBytesPerChannel = desc.x / 8
     numBytesPerTexel = numChannels * numBytesPerChannel
 
     flags = cudart.cudaArraySparse
     maxDim = max(extent.width, extent.height)
-    numLevels = int(float(1.0) + math.log(maxDim, 2))
+    numLevels = int(1.0 + math.log(maxDim, 2))
 
     err, mipmap = cudart.cudaMallocMipmappedArray(desc, extent, numLevels, flags)
     assertSuccess(err)
@@ -174,9 +185,9 @@ def test_cudart_class_inline():
     # tileExtent
     # TODO: Will these values always be this same? Maybe need a more stable test?
     # TODO: Are these values even correct? Need to research the function some more.. Maybe need an easier API test
-    assert(sparseProp.tileExtent.width == 64)
-    assert(sparseProp.tileExtent.height == 64)
-    assert(sparseProp.tileExtent.depth == 1)
+    assert sparseProp.tileExtent.width == 64
+    assert sparseProp.tileExtent.height == 64
+    assert sparseProp.tileExtent.depth == 1
 
     sparsePropNew = cudart.cudaArraySparseProperties()
     sparsePropNew.tileExtent.width = 15
@@ -185,20 +196,24 @@ def test_cudart_class_inline():
 
     # Check that we can copy inner structs
     sparseProp.tileExtent = sparsePropNew.tileExtent
-    assert(sparseProp.tileExtent.width == 15)
-    assert(sparseProp.tileExtent.height == 16)
-    assert(sparseProp.tileExtent.depth == 17)
+    assert sparseProp.tileExtent.width == 15
+    assert sparseProp.tileExtent.height == 16
+    assert sparseProp.tileExtent.depth == 17
 
-    assert(sparseProp.miptailFirstLevel == 3)
-    assert(sparseProp.miptailSize == 196608)
-    assert(sparseProp.flags == 0)
+    assert sparseProp.miptailFirstLevel == 3
+    assert sparseProp.miptailSize == 196608
+    assert sparseProp.flags == 0
 
-    err, = cudart.cudaFreeMipmappedArray(mipmap)
+    (err,) = cudart.cudaFreeMipmappedArray(mipmap)
     assertSuccess(err)
 
     # TODO
     example = cudart.cudaExternalSemaphoreSignalNodeParams()
-    example.extSemArray = [cudart.cudaExternalSemaphore_t(0), cudart.cudaExternalSemaphore_t(123), cudart.cudaExternalSemaphore_t(999)]
+    example.extSemArray = [
+        cudart.cudaExternalSemaphore_t(0),
+        cudart.cudaExternalSemaphore_t(123),
+        cudart.cudaExternalSemaphore_t(999),
+    ]
     a1 = cudart.cudaExternalSemaphoreSignalParams()
     a1.params.fence.value = 7
     a1.params.nvSciSync.fence = 999
@@ -222,6 +237,7 @@ def test_cudart_class_inline():
     a3.flags = 4
     example.numExtSems = 3
 
+
 def test_cudart_graphs():
     err, graph = cudart.cudaGraphCreate(0)
     assertSuccess(err)
@@ -241,9 +257,11 @@ def test_cudart_graphs():
     err, stream_with_flags = cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking)
     assertSuccess(err)
 
+
 def test_cudart_list_access():
     err, prop = cudart.cudaGetDeviceProperties(0)
-    prop.name = prop.name + b' '*(256-len(prop.name))
+    prop.name = prop.name + b" " * (256 - len(prop.name))
+
 
 def test_cudart_class_setters():
     dim = cudart.dim3()
@@ -256,30 +274,119 @@ def test_cudart_class_setters():
     assert dim.y == 2
     assert dim.z == 3
 
+
 def test_cudart_both_type():
     err, mode = cudart.cudaThreadExchangeStreamCaptureMode(cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
     assertSuccess(err)
     err, mode = cudart.cudaThreadExchangeStreamCaptureMode(cudart.cudaStreamCaptureMode.cudaStreamCaptureModeRelaxed)
     assertSuccess(err)
-    assert(mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
-    err, mode = cudart.cudaThreadExchangeStreamCaptureMode(cudart.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal)
+    assert mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal
+    err, mode = cudart.cudaThreadExchangeStreamCaptureMode(
+        cudart.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal
+    )
     assertSuccess(err)
-    assert(mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeRelaxed)
+    assert mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeRelaxed
     err, mode = cudart.cudaThreadExchangeStreamCaptureMode(cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
     assertSuccess(err)
-    assert(mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal)
+    assert mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal
+
 
 def test_cudart_cudaGetDeviceProperties():
     err, prop = cudart.cudaGetDeviceProperties(0)
     assertSuccess(err)
-    attrs = ['accessPolicyMaxWindowSize', 'asyncEngineCount', 'canMapHostMemory', 'canUseHostPointerForRegisteredMem', 'clockRate', 'computeMode', 'computePreemptionSupported', 'concurrentKernels', 'concurrentManagedAccess', 'cooperativeLaunch', 'cooperativeMultiDeviceLaunch', 'deviceOverlap', 'directManagedMemAccessFromHost', 'getPtr', 'globalL1CacheSupported', 'hostNativeAtomicSupported', 'integrated', 'isMultiGpuBoard', 'kernelExecTimeoutEnabled', 'l2CacheSize', 'localL1CacheSupported', 'luid', 'luidDeviceNodeMask', 'major', 'managedMemory', 'maxBlocksPerMultiProcessor', 'maxGridSize', 'maxSurface1D', 'maxSurface1DLayered', 'maxSurface2D', 'maxSurface2DLayered', 'maxSurface3D', 'maxSurfaceCubemap', 'maxSurfaceCubemapLayered', 'maxTexture1D', 'maxTexture1DLayered', 'maxTexture1DLinear', 'maxTexture1DMipmap', 'maxTexture2D', 'maxTexture2DGather', 'maxTexture2DLayered', 'maxTexture2DLinear', 'maxTexture2DMipmap', 'maxTexture3D', 'maxTexture3DAlt', 'maxTextureCubemap', 'maxTextureCubemapLayered', 'maxThreadsDim', 'maxThreadsPerBlock', 'maxThreadsPerMultiProcessor', 'memPitch', 'memoryBusWidth', 'memoryClockRate', 'minor', 'multiGpuBoardGroupID', 'multiProcessorCount', 'name', 'pageableMemoryAccess', 'pageableMemoryAccessUsesHostPageTables', 'pciBusID', 'pciDeviceID', 'pciDomainID', 'persistingL2CacheMaxSize', 'regsPerBlock', 'regsPerMultiprocessor', 'reservedSharedMemPerBlock', 'sharedMemPerBlock', 'sharedMemPerBlockOptin', 'sharedMemPerMultiprocessor', 'singleToDoublePrecisionPerfRatio', 'streamPrioritiesSupported', 'surfaceAlignment', 'tccDriver', 'textureAlignment', 'texturePitchAlignment', 'totalConstMem', 'totalGlobalMem', 'unifiedAddressing', 'uuid', 'warpSize']
+    attrs = [
+        "accessPolicyMaxWindowSize",
+        "asyncEngineCount",
+        "canMapHostMemory",
+        "canUseHostPointerForRegisteredMem",
+        "clockRate",
+        "computeMode",
+        "computePreemptionSupported",
+        "concurrentKernels",
+        "concurrentManagedAccess",
+        "cooperativeLaunch",
+        "cooperativeMultiDeviceLaunch",
+        "deviceOverlap",
+        "directManagedMemAccessFromHost",
+        "getPtr",
+        "globalL1CacheSupported",
+        "hostNativeAtomicSupported",
+        "integrated",
+        "isMultiGpuBoard",
+        "kernelExecTimeoutEnabled",
+        "l2CacheSize",
+        "localL1CacheSupported",
+        "luid",
+        "luidDeviceNodeMask",
+        "major",
+        "managedMemory",
+        "maxBlocksPerMultiProcessor",
+        "maxGridSize",
+        "maxSurface1D",
+        "maxSurface1DLayered",
+        "maxSurface2D",
+        "maxSurface2DLayered",
+        "maxSurface3D",
+        "maxSurfaceCubemap",
+        "maxSurfaceCubemapLayered",
+        "maxTexture1D",
+        "maxTexture1DLayered",
+        "maxTexture1DLinear",
+        "maxTexture1DMipmap",
+        "maxTexture2D",
+        "maxTexture2DGather",
+        "maxTexture2DLayered",
+        "maxTexture2DLinear",
+        "maxTexture2DMipmap",
+        "maxTexture3D",
+        "maxTexture3DAlt",
+        "maxTextureCubemap",
+        "maxTextureCubemapLayered",
+        "maxThreadsDim",
+        "maxThreadsPerBlock",
+        "maxThreadsPerMultiProcessor",
+        "memPitch",
+        "memoryBusWidth",
+        "memoryClockRate",
+        "minor",
+        "multiGpuBoardGroupID",
+        "multiProcessorCount",
+        "name",
+        "pageableMemoryAccess",
+        "pageableMemoryAccessUsesHostPageTables",
+        "pciBusID",
+        "pciDeviceID",
+        "pciDomainID",
+        "persistingL2CacheMaxSize",
+        "regsPerBlock",
+        "regsPerMultiprocessor",
+        "reservedSharedMemPerBlock",
+        "sharedMemPerBlock",
+        "sharedMemPerBlockOptin",
+        "sharedMemPerMultiprocessor",
+        "singleToDoublePrecisionPerfRatio",
+        "streamPrioritiesSupported",
+        "surfaceAlignment",
+        "tccDriver",
+        "textureAlignment",
+        "texturePitchAlignment",
+        "totalConstMem",
+        "totalGlobalMem",
+        "unifiedAddressing",
+        "uuid",
+        "warpSize",
+    ]
     for attr in attrs:
         assert hasattr(prop, attr)
     assert len(prop.name.decode("utf-8")) != 0
     assert len(prop.uuid.bytes.hex()) != 0
 
     example = cudart.cudaExternalSemaphoreSignalNodeParams()
-    example.extSemArray = [cudart.cudaExternalSemaphore_t(0), cudart.cudaExternalSemaphore_t(123), cudart.cudaExternalSemaphore_t(999)]
+    example.extSemArray = [
+        cudart.cudaExternalSemaphore_t(0),
+        cudart.cudaExternalSemaphore_t(123),
+        cudart.cudaExternalSemaphore_t(999),
+    ]
     a1 = cudart.cudaExternalSemaphoreSignalParams()
     a1.params.fence.value = 7
     a1.params.nvSciSync.fence = 999
@@ -303,7 +410,10 @@ def test_cudart_cudaGetDeviceProperties():
     a3.flags = 4
     example.numExtSems = 3
 
-@pytest.mark.skipif(driverVersionLessThan(11030) or not supportsMemoryPool(), reason='When new attributes were introduced')
+
+@pytest.mark.skipif(
+    driverVersionLessThan(11030) or not supportsMemoryPool(), reason="When new attributes were introduced"
+)
 def test_cudart_MemPool_attr():
     poolProps = cudart.cudaMemPoolProps()
     poolProps.allocType = cudart.cudaMemAllocationType.cudaMemAllocationTypePinned
@@ -314,69 +424,85 @@ def test_cudart_MemPool_attr():
     err, pool = cudart.cudaMemPoolCreate(poolProps)
     assertSuccess(err)
 
-    for idx, attr in enumerate([cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies,
-                                cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic,
-                                cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies,
-                                cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold,
-                                cudart.cudaMemPoolAttr.cudaMemPoolAttrReservedMemCurrent,
-                                cudart.cudaMemPoolAttr.cudaMemPoolAttrReservedMemHigh,
-                                cudart.cudaMemPoolAttr.cudaMemPoolAttrUsedMemCurrent,
-                                cudart.cudaMemPoolAttr.cudaMemPoolAttrUsedMemHigh]):
+    for idx, attr in enumerate(
+        [
+            cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies,
+            cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic,
+            cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies,
+            cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold,
+            cudart.cudaMemPoolAttr.cudaMemPoolAttrReservedMemCurrent,
+            cudart.cudaMemPoolAttr.cudaMemPoolAttrReservedMemHigh,
+            cudart.cudaMemPoolAttr.cudaMemPoolAttrUsedMemCurrent,
+            cudart.cudaMemPoolAttr.cudaMemPoolAttrUsedMemHigh,
+        ]
+    ):
         err, attr_tmp = cudart.cudaMemPoolGetAttribute(pool, attr)
         assertSuccess(err)
         attr_list[idx] = attr_tmp
 
-    for idxA, attr in enumerate([cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies,
-                                 cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic,
-                                 cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies]):
-        err, = cudart.cudaMemPoolSetAttribute(pool, attr, 0)
+    for idxA, attr in enumerate(
+        [
+            cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies,
+            cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic,
+            cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies,
+        ]
+    ):
+        (err,) = cudart.cudaMemPoolSetAttribute(pool, attr, 0)
         assertSuccess(err)
     for idx, attr in enumerate([cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold]):
-        err, = cudart.cudaMemPoolSetAttribute(pool, attr, cuda.cuuint64_t(9))
+        (err,) = cudart.cudaMemPoolSetAttribute(pool, attr, cuda.cuuint64_t(9))
         assertSuccess(err)
 
-    for idx, attr in enumerate([cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies,
-                                cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic,
-                                cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies,
-                                cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold]):
+    for idx, attr in enumerate(
+        [
+            cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies,
+            cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic,
+            cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies,
+            cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold,
+        ]
+    ):
         err, attr_tmp = cudart.cudaMemPoolGetAttribute(pool, attr)
         assertSuccess(err)
         attr_list[idx] = attr_tmp
-    assert(attr_list[0] == 0)
-    assert(attr_list[1] == 0)
-    assert(attr_list[2] == 0)
-    assert(int(attr_list[3]) == 9)
+    assert attr_list[0] == 0
+    assert attr_list[1] == 0
+    assert attr_list[2] == 0
+    assert int(attr_list[3]) == 9
 
-    err, = cudart.cudaMemPoolDestroy(pool)
+    (err,) = cudart.cudaMemPoolDestroy(pool)
     assertSuccess(err)
 
+
 def test_cudart_make_api():
-    err, channelDesc = cudart.cudaCreateChannelDesc(32,0,0,0,cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat)
+    err, channelDesc = cudart.cudaCreateChannelDesc(
+        32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat
+    )
     assertSuccess(err)
-    assert(channelDesc.x == 32)
-    assert(channelDesc.y == 0)
-    assert(channelDesc.z == 0)
-    assert(channelDesc.w == 0)
-    assert(channelDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat)
+    assert channelDesc.x == 32
+    assert channelDesc.y == 0
+    assert channelDesc.z == 0
+    assert channelDesc.w == 0
+    assert channelDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat
 
     # make_cudaPitchedPtr
-    cudaPitchedPtr = cudart.make_cudaPitchedPtr(1,2,3,4)
-    assert(cudaPitchedPtr.ptr == 1)
-    assert(cudaPitchedPtr.pitch == 2)
-    assert(cudaPitchedPtr.xsize == 3)
-    assert(cudaPitchedPtr.ysize == 4)
+    cudaPitchedPtr = cudart.make_cudaPitchedPtr(1, 2, 3, 4)
+    assert cudaPitchedPtr.ptr == 1
+    assert cudaPitchedPtr.pitch == 2
+    assert cudaPitchedPtr.xsize == 3
+    assert cudaPitchedPtr.ysize == 4
 
     # make_cudaPos
-    cudaPos = cudart.make_cudaPos(1,2,3)
-    assert(cudaPos.x == 1)
-    assert(cudaPos.y == 2)
-    assert(cudaPos.z == 3)
+    cudaPos = cudart.make_cudaPos(1, 2, 3)
+    assert cudaPos.x == 1
+    assert cudaPos.y == 2
+    assert cudaPos.z == 3
 
     # make_cudaExtent
-    cudaExtent = cudart.make_cudaExtent(1,2,3)
-    assert(cudaExtent.width == 1)
-    assert(cudaExtent.height == 2)
-    assert(cudaExtent.depth == 3)
+    cudaExtent = cudart.make_cudaExtent(1, 2, 3)
+    assert cudaExtent.width == 1
+    assert cudaExtent.height == 2
+    assert cudaExtent.depth == 3
+
 
 def test_cudart_cudaStreamGetCaptureInfo():
     # create stream
@@ -386,30 +512,27 @@ def test_cudart_cudaStreamGetCaptureInfo():
     # validate that stream is not capturing
     err, status, *info = cudart.cudaStreamGetCaptureInfo(stream)
     assertSuccess(err)
-    assert(status == cudart.cudaStreamCaptureStatus.cudaStreamCaptureStatusNone)
+    assert status == cudart.cudaStreamCaptureStatus.cudaStreamCaptureStatusNone
 
     # start capture
-    err, = cudart.cudaStreamBeginCapture(
-        stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal
-    )
+    (err,) = cudart.cudaStreamBeginCapture(stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
     assertSuccess(err)
 
     # validate that stream is capturing now
     err, status, *info = cudart.cudaStreamGetCaptureInfo(stream)
     assertSuccess(err)
-    assert(status == cudart.cudaStreamCaptureStatus.cudaStreamCaptureStatusActive)
+    assert status == cudart.cudaStreamCaptureStatus.cudaStreamCaptureStatusActive
 
     # clean up
     err, pgraph = cudart.cudaStreamEndCapture(stream)
     assertSuccess(err)
 
+
 def test_cudart_cudaArrayGetInfo():
     # create channel descriptor
     x, y, z, w = 8, 0, 0, 0
     f = cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned
-    err, desc = cudart.cudaCreateChannelDesc(
-        x, y, z, w, f
-    )
+    err, desc = cudart.cudaCreateChannelDesc(x, y, z, w, f)
     assertSuccess(err)
 
     # allocate device array
@@ -424,30 +547,29 @@ def test_cudart_cudaArrayGetInfo():
     assertSuccess(err)
 
     # validate descriptor, extent, flags
-    assert(desc.x == x)
-    assert(desc.y == y)
-    assert(desc.z == z)
-    assert(desc.w == w)
-    assert(desc.f == f)
-    assert(extent.width == width)
-    assert(extent.height == height)
-    assert(inFlags == outFlags)
+    assert desc.x == x
+    assert desc.y == y
+    assert desc.z == z
+    assert desc.w == w
+    assert desc.f == f
+    assert extent.width == width
+    assert extent.height == height
+    assert inFlags == outFlags
 
     # clean up
-    err, = cudart.cudaFreeArray(arr)
+    (err,) = cudart.cudaFreeArray(arr)
     assertSuccess(err)
-    
+
+
 def test_cudart_cudaMemcpy2DToArray():
     # create host arrays
     size = int(1024 * np.uint8().itemsize)
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # create channel descriptor
-    err, desc = cudart.cudaCreateChannelDesc(
-        8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned
-    )
+    err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned)
     assertSuccess(err)
 
     # allocate device array
@@ -455,26 +577,21 @@ def test_cudart_cudaMemcpy2DToArray():
     assertSuccess(err)
 
     # h1 to arr
-    err, = cudart.cudaMemcpy2DToArray(
-        arr, 0, 0, h1, size, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
-    )
+    (err,) = cudart.cudaMemcpy2DToArray(arr, 0, 0, h1, size, size, 1, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
     assertSuccess(err)
 
     # arr to h2
-    err, = cudart.cudaMemcpy2DFromArray(
-        h2, size, arr, 0, 0, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-    )
+    (err,) = cudart.cudaMemcpy2DFromArray(h2, size, arr, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
     assertSuccess(err)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFreeArray(arr)
+    (err,) = cudart.cudaFreeArray(arr)
     assertSuccess(err)
 
+
 def test_cudart_cudaMemcpy2DToArray_DtoD():
     # allocate device memory
     size = 1024 * np.uint8().itemsize
@@ -486,12 +603,10 @@ def test_cudart_cudaMemcpy2DToArray_DtoD():
     # create host arrays
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # create channel descriptor
-    err, desc = cudart.cudaCreateChannelDesc(
-        8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned
-    )
+    err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned)
     assertSuccess(err)
 
     # allocate device array
@@ -499,49 +614,42 @@ def test_cudart_cudaMemcpy2DToArray_DtoD():
     assertSuccess(err)
 
     # h1 to d1
-    err, = cudart.cudaMemcpy(d1, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
+    (err,) = cudart.cudaMemcpy(d1, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
     assertSuccess(err)
 
     # d1 to arr
-    err, = cudart.cudaMemcpy2DToArray(
-        arr, 0, 0, d1, size, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice
-    )
+    (err,) = cudart.cudaMemcpy2DToArray(arr, 0, 0, d1, size, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice)
     assertSuccess(err)
 
     # arr to d2
-    err, = cudart.cudaMemcpy2DFromArray(
-        d2, size, arr, 0, 0, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice
-    )
+    (err,) = cudart.cudaMemcpy2DFromArray(d2, size, arr, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice)
     assertSuccess(err)
 
     # d2 to h2
-    err, = cudart.cudaMemcpy(h2, d2, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
+    (err,) = cudart.cudaMemcpy(h2, d2, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
     assertSuccess(err)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFreeArray(arr)
+    (err,) = cudart.cudaFreeArray(arr)
     assertSuccess(err)
-    err, = cudart.cudaFree(d2)
+    (err,) = cudart.cudaFree(d2)
     assertSuccess(err)
-    err, = cudart.cudaFree(d1)
+    (err,) = cudart.cudaFree(d1)
     assertSuccess(err)
 
+
 def test_cudart_cudaMemcpy2DArrayToArray():
     # create host arrays
     size = 1024 * np.uint8().itemsize
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # create channel descriptor
-    err, desc = cudart.cudaCreateChannelDesc(
-        8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned
-    )
+    err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned)
     assertSuccess(err)
 
     # allocate device arrays
@@ -551,46 +659,38 @@ def test_cudart_cudaMemcpy2DArrayToArray():
     assertSuccess(err)
 
     # h1 to a1
-    err, = cudart.cudaMemcpy2DToArray(
-        a1, 0, 0, h1, size, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
-    )
+    (err,) = cudart.cudaMemcpy2DToArray(a1, 0, 0, h1, size, size, 1, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
     assertSuccess(err)
 
     # a1 to a2
-    err, = cudart.cudaMemcpy2DArrayToArray(
-        a2, 0, 0, a1, 0, 0, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice    
+    (err,) = cudart.cudaMemcpy2DArrayToArray(
+        a2, 0, 0, a1, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice
     )
     assertSuccess(err)
 
     # a2 to h2
-    err, = cudart.cudaMemcpy2DFromArray(
-        h2, size, a2, 0, 0, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-    )
+    (err,) = cudart.cudaMemcpy2DFromArray(h2, size, a2, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
     assertSuccess(err)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFreeArray(a2)
+    (err,) = cudart.cudaFreeArray(a2)
     assertSuccess(err)
-    err, = cudart.cudaFreeArray(a1)
+    (err,) = cudart.cudaFreeArray(a1)
     assertSuccess(err)
 
+
 def test_cudart_cudaMemcpyArrayToArray():
     # create host arrays
     size = 1024 * np.uint8().itemsize
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # create channel descriptor
-    err, desc = cudart.cudaCreateChannelDesc(
-        8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned
-    )
+    err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned)
     assertSuccess(err)
 
     # allocate device arrays
@@ -600,42 +700,32 @@ def test_cudart_cudaMemcpyArrayToArray():
     assertSuccess(err)
 
     # h1 to a1
-    err, = cudart.cudaMemcpy2DToArray(
-        a1, 0, 0, h1, size, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
-    )
+    (err,) = cudart.cudaMemcpy2DToArray(a1, 0, 0, h1, size, size, 1, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
     assertSuccess(err)
 
     # a1 to a2
-    err, = cudart.cudaMemcpyArrayToArray(
-        a2, 0, 0, a1, 0, 0, size,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice    
-    )
+    (err,) = cudart.cudaMemcpyArrayToArray(a2, 0, 0, a1, 0, 0, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice)
     assertSuccess(err)
 
     # a2 to h2
-    err, = cudart.cudaMemcpy2DFromArray(
-        h2, size, a2, 0, 0, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-    )
+    (err,) = cudart.cudaMemcpy2DFromArray(h2, size, a2, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
     assertSuccess(err)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFreeArray(a2)
+    (err,) = cudart.cudaFreeArray(a2)
     assertSuccess(err)
-    err, = cudart.cudaFreeArray(a1)
+    (err,) = cudart.cudaFreeArray(a1)
     assertSuccess(err)
 
+
 def test_cudart_cudaGetChannelDesc():
     # create channel descriptor
     x, y, z, w = 8, 0, 0, 0
     f = cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned
-    err, desc = cudart.cudaCreateChannelDesc(
-        x, y, z, w, f
-    )
+    err, desc = cudart.cudaCreateChannelDesc(x, y, z, w, f)
     assertSuccess(err)
 
     # allocate device array
@@ -650,16 +740,17 @@ def test_cudart_cudaGetChannelDesc():
     assertSuccess(err)
 
     # validate array channel descriptor
-    assert(desc.x == x)
-    assert(desc.y == y)
-    assert(desc.z == z)
-    assert(desc.w == w)
-    assert(desc.f == f)
+    assert desc.x == x
+    assert desc.y == y
+    assert desc.z == z
+    assert desc.w == w
+    assert desc.f == f
 
     # clean up
-    err, = cudart.cudaFreeArray(arr)
+    (err,) = cudart.cudaFreeArray(arr)
     assertSuccess(err)
 
+
 def test_cudart_cudaGetTextureObjectTextureDesc():
     # create channel descriptor
     err, channelDesc = cudart.cudaCreateChannelDesc(
@@ -689,18 +780,19 @@ def test_cudart_cudaGetTextureObjectTextureDesc():
         if attr in ["borderColor", "getPtr"]:
             continue
         if not attr.startswith("_"):
-            assert(getattr(outTexDesc, attr) == getattr(inTexDesc, attr))
-    
+            assert getattr(outTexDesc, attr) == getattr(inTexDesc, attr)
+
     # clean up
-    err, = cudart.cudaDestroyTextureObject(texObject)
+    (err,) = cudart.cudaDestroyTextureObject(texObject)
     assertSuccess(err)
 
+
 def test_cudart_cudaMemset3D():
     # create host arrays
     size = 1024 * np.uint8().itemsize
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # allocate device memory
     devExtent = cudart.make_cudaExtent(32, 32, 1)
@@ -709,27 +801,26 @@ def test_cudart_cudaMemset3D():
 
     # set memory
     memExtent = cudart.make_cudaExtent(devPitchedPtr.pitch, devPitchedPtr.ysize, 1)
-    err, = cudart.cudaMemset3D(devPitchedPtr, 1, memExtent)
+    (err,) = cudart.cudaMemset3D(devPitchedPtr, 1, memExtent)
     assertSuccess(err)
 
     # D to h2
-    err, = cudart.cudaMemcpy(
-        h2, devPitchedPtr.ptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-    )
+    (err,) = cudart.cudaMemcpy(h2, devPitchedPtr.ptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFree(devPitchedPtr.ptr)
+    (err,) = cudart.cudaFree(devPitchedPtr.ptr)
     assertSuccess(err)
 
+
 def test_cudart_cudaMemset3D_2D():
     # create host arrays
     size = 512 * np.uint8().itemsize
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # allocate device memory
     devExtent = cudart.make_cudaExtent(1024, 1, 1)
@@ -738,58 +829,51 @@ def test_cudart_cudaMemset3D_2D():
 
     # set memory
     memExtent = cudart.make_cudaExtent(size, devPitchedPtr.ysize, 1)
-    err, = cudart.cudaMemset3D(devPitchedPtr, 1, memExtent)
+    (err,) = cudart.cudaMemset3D(devPitchedPtr, 1, memExtent)
     assertSuccess(err)
 
     # D to h2
-    err, = cudart.cudaMemcpy(
-        h2, devPitchedPtr.ptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-    )
+    (err,) = cudart.cudaMemcpy(h2, devPitchedPtr.ptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFree(devPitchedPtr.ptr)
+    (err,) = cudart.cudaFree(devPitchedPtr.ptr)
     assertSuccess(err)
 
+
 def test_cudart_cudaMemcpyToArray():
     # create host arrays
     size = 1024 * np.uint8().itemsize
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # create channel descriptor
-    err, desc = cudart.cudaCreateChannelDesc(
-        8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned
-    )
+    err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned)
     assertSuccess(err)
-    
+
     # allocate device array
     err, arr = cudart.cudaMallocArray(desc, size, 0, 0)
     assertSuccess(err)
 
     # h1 to arr
-    err, = cudart.cudaMemcpyToArray(
-        arr, 0, 0, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
-    )
+    (err,) = cudart.cudaMemcpyToArray(arr, 0, 0, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
     assertSuccess(err)
 
     # arr to h2
-    err, = cudart.cudaMemcpyFromArray(
-        h2, arr, 0, 0, size,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-    )
+    (err,) = cudart.cudaMemcpyFromArray(h2, arr, 0, 0, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
     assertSuccess(err)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFreeArray(arr)
+    (err,) = cudart.cudaFreeArray(arr)
     assertSuccess(err)
 
+
 def test_cudart_cudaMemcpyToArray_DtoD():
     # allocate device memory
     size = int(1024 * np.uint8().itemsize)
@@ -801,61 +885,53 @@ def test_cudart_cudaMemcpyToArray_DtoD():
     # create host arrays
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # create channel descriptor
-    err, desc = cudart.cudaCreateChannelDesc(
-        8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned
-    )
+    err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned)
     assertSuccess(err)
-    
+
     # allocate device array
     err, arr = cudart.cudaMallocArray(desc, size, 0, 0)
     assertSuccess(err)
 
     # h1 to d1
-    err, = cudart.cudaMemcpy(d1, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
+    (err,) = cudart.cudaMemcpy(d1, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
     assertSuccess(err)
 
     # d1 to arr
-    err, = cudart.cudaMemcpyToArray(
-        arr, 0, 0, d1, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice
-    )
+    (err,) = cudart.cudaMemcpyToArray(arr, 0, 0, d1, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice)
     assertSuccess(err)
 
     # arr to d2
-    err, = cudart.cudaMemcpyFromArray(
-        d2, arr, 0, 0, size,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice
-    )
+    (err,) = cudart.cudaMemcpyFromArray(d2, arr, 0, 0, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice)
     assertSuccess(err)
 
     # d2 to h2
-    err, = cudart.cudaMemcpy(h2, d2, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
+    (err,) = cudart.cudaMemcpy(h2, d2, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
     assertSuccess(err)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFreeArray(arr)
+    (err,) = cudart.cudaFreeArray(arr)
     assertSuccess(err)
-    err, = cudart.cudaFree(d2)
+    (err,) = cudart.cudaFree(d2)
     assertSuccess(err)
-    err, = cudart.cudaFree(d1)
+    (err,) = cudart.cudaFree(d1)
     assertSuccess(err)
 
+
 def test_cudart_cudaMemcpy3DAsync():
     # create host arrays
     size = int(1024 * np.uint8().itemsize)
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # create channel descriptor
-    err, desc = cudart.cudaCreateChannelDesc(
-        8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned
-    )
+    err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned)
     assertSuccess(err)
 
     # allocate device array
@@ -874,27 +950,25 @@ def test_cudart_cudaMemcpy3DAsync():
     params.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
 
     # h1 to arr
-    err, = cudart.cudaMemcpy3DAsync(params, stream)
+    (err,) = cudart.cudaMemcpy3DAsync(params, stream)
     assertSuccess(err)
 
     # await results
-    err, = cudart.cudaStreamSynchronize(stream)
+    (err,) = cudart.cudaStreamSynchronize(stream)
     assertSuccess(err)
 
     # arr to h2
-    err, = cudart.cudaMemcpy2DFromArray(
-        h2, size, arr, 0, 0, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-    )
+    (err,) = cudart.cudaMemcpy2DFromArray(h2, size, arr, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
     assertSuccess(err)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFreeArray(arr)
+    (err,) = cudart.cudaFreeArray(arr)
     assertSuccess(err)
 
+
 def test_cudart_cudaGraphAddMemcpyNode1D():
     # allocate device memory
     size = 1024 * np.uint8().itemsize
@@ -904,7 +978,7 @@ def test_cudart_cudaGraphAddMemcpyNode1D():
     # create host arrays
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # build graph
     err, graph = cudart.cudaGraphCreate(0)
@@ -916,8 +990,7 @@ def test_cudart_cudaGraphAddMemcpyNode1D():
     )
     assertSuccess(err)
     err, dToHNode = cudart.cudaGraphAddMemcpyNode1D(
-        graph, [hToDNode], 1, h2, dptr, size,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
+        graph, [hToDNode], 1, h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
     )
     assertSuccess(err)
 
@@ -928,19 +1001,20 @@ def test_cudart_cudaGraphAddMemcpyNode1D():
     # execute graph
     err, execGraph = cudart.cudaGraphInstantiate(graph, 0)
     assertSuccess(err)
-    err, = cudart.cudaGraphLaunch(execGraph, stream)
+    (err,) = cudart.cudaGraphLaunch(execGraph, stream)
 
     # await results
-    err, = cudart.cudaStreamSynchronize(stream)
+    (err,) = cudart.cudaStreamSynchronize(stream)
     assertSuccess(err)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFree(dptr)
+    (err,) = cudart.cudaFree(dptr)
     assertSuccess(err)
 
+
 def test_cudart_cudaGraphAddMemsetNode():
     # allocate device memory
     size = 1024 * np.uint8().itemsize
@@ -950,7 +1024,7 @@ def test_cudart_cudaGraphAddMemsetNode():
     # create host arrays
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # build graph
     err, graph = cudart.cudaGraphCreate(0)
@@ -966,17 +1040,13 @@ def test_cudart_cudaGraphAddMemsetNode():
     params.height = 1
 
     # add nodes
-    err, setNode = cudart.cudaGraphAddMemsetNode(
-        graph, [], 0, params
-    )
+    err, setNode = cudart.cudaGraphAddMemsetNode(graph, [], 0, params)
     assertSuccess(err)
     err, cpyNode = cudart.cudaGraphAddMemcpyNode1D(
-        graph, [setNode], 1, h2, dptr, size,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
+        graph, [setNode], 1, h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
     )
     assertSuccess(err)
 
-
     # create stream
     err, stream = cudart.cudaStreamCreate()
     assertSuccess(err)
@@ -984,20 +1054,21 @@ def test_cudart_cudaGraphAddMemsetNode():
     # execute graph
     err, execGraph = cudart.cudaGraphInstantiate(graph, 0)
     assertSuccess(err)
-    err, = cudart.cudaGraphLaunch(execGraph, stream)
+    (err,) = cudart.cudaGraphLaunch(execGraph, stream)
     assertSuccess(err)
 
     # await results
-    err, = cudart.cudaStreamSynchronize(stream)
+    (err,) = cudart.cudaStreamSynchronize(stream)
     assertSuccess(err)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFree(dptr)
+    (err,) = cudart.cudaFree(dptr)
     assertSuccess(err)
 
+
 def test_cudart_cudaMemcpy3DPeer():
     # allocate device memory
     size = int(1024 * np.uint8().itemsize)
@@ -1007,12 +1078,10 @@ def test_cudart_cudaMemcpy3DPeer():
     # create host arrays
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # create channel descriptor
-    err, desc = cudart.cudaCreateChannelDesc(
-        8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned
-    )
+    err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned)
     assertSuccess(err)
 
     # allocate device array
@@ -1026,29 +1095,27 @@ def test_cudart_cudaMemcpy3DPeer():
     params.extent = cudart.make_cudaExtent(size, 1, 1)
 
     # h1 to D
-    err, = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
+    (err,) = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
     assertSuccess(err)
 
     # D to arr
-    err, = cudart.cudaMemcpy3DPeer(params)
+    (err,) = cudart.cudaMemcpy3DPeer(params)
     assertSuccess(err)
 
     # arr to h2
-    err, = cudart.cudaMemcpy2DFromArray(
-        h2, size, arr, 0, 0, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-    )
+    (err,) = cudart.cudaMemcpy2DFromArray(h2, size, arr, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
     assertSuccess(err)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFreeArray(arr)
+    (err,) = cudart.cudaFreeArray(arr)
     assertSuccess(err)
-    err, = cudart.cudaFree(dptr)
+    (err,) = cudart.cudaFree(dptr)
     assertSuccess(err)
 
+
 def test_cudart_cudaMemcpy3DPeerAsync():
     # allocate device memory
     size = 1024 * np.uint8().itemsize
@@ -1058,12 +1125,10 @@ def test_cudart_cudaMemcpy3DPeerAsync():
     # create host arrays
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # create channel descriptor
-    err, desc = cudart.cudaCreateChannelDesc(
-        8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned
-    )
+    err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned)
     assertSuccess(err)
 
     # allocate device array
@@ -1081,55 +1146,54 @@ def test_cudart_cudaMemcpy3DPeerAsync():
     params.extent = cudart.make_cudaExtent(size, 1, 1)
 
     # h1 to D
-    err, = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
+    (err,) = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
     assertSuccess(err)
 
     # D to arr
-    err, = cudart.cudaMemcpy3DPeerAsync(params, stream)
+    (err,) = cudart.cudaMemcpy3DPeerAsync(params, stream)
     assertSuccess(err)
 
     # await results
-    err, = cudart.cudaStreamSynchronize(stream)
+    (err,) = cudart.cudaStreamSynchronize(stream)
     assertSuccess(err)
 
     # arr to h2
-    err, = cudart.cudaMemcpy2DFromArray(
-        h2, size, arr, 0, 0, size, 1,
-        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
-    )
+    (err,) = cudart.cudaMemcpy2DFromArray(h2, size, arr, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
     assertSuccess(err)
 
     # validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # clean up
-    err, = cudart.cudaFreeArray(arr)
+    (err,) = cudart.cudaFreeArray(arr)
     assertSuccess(err)
-    err, = cudart.cudaFree(dptr)
+    (err,) = cudart.cudaFree(dptr)
     assertSuccess(err)
 
+
 def test_profiler():
-    err, = cudart.cudaProfilerStart()
+    (err,) = cudart.cudaProfilerStart()
     assertSuccess(err)
-    err, = cudart.cudaProfilerStop()
+    (err,) = cudart.cudaProfilerStop()
     assertSuccess(err)
 
+
 def test_cudart_eglFrame():
     frame = cudart.cudaEglFrame()
     # [<cudaArray_t 0x0>, <cudaArray_t 0x0>, <cudaArray_t 0x0>]
-    assert(int(frame.frame.pArray[0]) == 0)
-    assert(int(frame.frame.pArray[1]) == 0)
-    assert(int(frame.frame.pArray[2]) == 0)
-    frame.frame.pArray = [1,2,3]
+    assert int(frame.frame.pArray[0]) == 0
+    assert int(frame.frame.pArray[1]) == 0
+    assert int(frame.frame.pArray[2]) == 0
+    frame.frame.pArray = [1, 2, 3]
     # [<cudaArray_t 0x1>, <cudaArray_t 0x2>, <cudaArray_t 0x3>]
-    assert(int(frame.frame.pArray[0]) == 1)
-    assert(int(frame.frame.pArray[1]) == 2)
-    assert(int(frame.frame.pArray[2]) == 3)
-    frame.frame.pArray = [1,2,cudart.cudaArray_t(4)]
+    assert int(frame.frame.pArray[0]) == 1
+    assert int(frame.frame.pArray[1]) == 2
+    assert int(frame.frame.pArray[2]) == 3
+    frame.frame.pArray = [1, 2, cudart.cudaArray_t(4)]
     # [<cudaArray_t 0x1>, <cudaArray_t 0x2>, <cudaArray_t 0x4>]
-    assert(int(frame.frame.pArray[0]) == 1)
-    assert(int(frame.frame.pArray[1]) == 2)
-    assert(int(frame.frame.pArray[2]) == 4)
+    assert int(frame.frame.pArray[0]) == 1
+    assert int(frame.frame.pArray[1]) == 2
+    assert int(frame.frame.pArray[2]) == 4
     # frame.frame.pPitch
     # [ptr : 0x1
     # pitch : 2
@@ -1141,18 +1205,18 @@ def test_cudart_eglFrame():
     # pitch : 0
     # xsize : 0
     # ysize : 0]
-    assert(int(frame.frame.pPitch[0].ptr) == 1)
-    assert(int(frame.frame.pPitch[0].pitch) == 2)
-    assert(int(frame.frame.pPitch[0].xsize) == 4)
-    assert(int(frame.frame.pPitch[0].ysize) == 0)
-    assert(int(frame.frame.pPitch[1].ptr) == 0)
-    assert(int(frame.frame.pPitch[1].pitch) == 0)
-    assert(int(frame.frame.pPitch[1].xsize) == 0)
-    assert(int(frame.frame.pPitch[1].ysize) == 0)
-    assert(int(frame.frame.pPitch[2].ptr) == 0)
-    assert(int(frame.frame.pPitch[2].pitch) == 0)
-    assert(int(frame.frame.pPitch[2].xsize) == 0)
-    assert(int(frame.frame.pPitch[2].ysize) == 0)
+    assert int(frame.frame.pPitch[0].ptr) == 1
+    assert int(frame.frame.pPitch[0].pitch) == 2
+    assert int(frame.frame.pPitch[0].xsize) == 4
+    assert int(frame.frame.pPitch[0].ysize) == 0
+    assert int(frame.frame.pPitch[1].ptr) == 0
+    assert int(frame.frame.pPitch[1].pitch) == 0
+    assert int(frame.frame.pPitch[1].xsize) == 0
+    assert int(frame.frame.pPitch[1].ysize) == 0
+    assert int(frame.frame.pPitch[2].ptr) == 0
+    assert int(frame.frame.pPitch[2].pitch) == 0
+    assert int(frame.frame.pPitch[2].xsize) == 0
+    assert int(frame.frame.pPitch[2].ysize) == 0
     frame.frame.pPitch = [cudart.cudaPitchedPtr(), cudart.cudaPitchedPtr(), cudart.cudaPitchedPtr()]
     # [ptr : 0x0
     # pitch : 0
@@ -1164,21 +1228,21 @@ def test_cudart_eglFrame():
     # pitch : 0
     # xsize : 0
     # ysize : 0]
-    assert(int(frame.frame.pPitch[0].ptr) == 0)
-    assert(int(frame.frame.pPitch[0].pitch) == 0)
-    assert(int(frame.frame.pPitch[0].xsize) == 0)
-    assert(int(frame.frame.pPitch[0].ysize) == 0)
-    assert(int(frame.frame.pPitch[1].ptr) == 0)
-    assert(int(frame.frame.pPitch[1].pitch) == 0)
-    assert(int(frame.frame.pPitch[1].xsize) == 0)
-    assert(int(frame.frame.pPitch[1].ysize) == 0)
-    assert(int(frame.frame.pPitch[2].ptr) == 0)
-    assert(int(frame.frame.pPitch[2].pitch) == 0)
-    assert(int(frame.frame.pPitch[2].xsize) == 0)
-    assert(int(frame.frame.pPitch[2].ysize) == 0)
+    assert int(frame.frame.pPitch[0].ptr) == 0
+    assert int(frame.frame.pPitch[0].pitch) == 0
+    assert int(frame.frame.pPitch[0].xsize) == 0
+    assert int(frame.frame.pPitch[0].ysize) == 0
+    assert int(frame.frame.pPitch[1].ptr) == 0
+    assert int(frame.frame.pPitch[1].pitch) == 0
+    assert int(frame.frame.pPitch[1].xsize) == 0
+    assert int(frame.frame.pPitch[1].ysize) == 0
+    assert int(frame.frame.pPitch[2].ptr) == 0
+    assert int(frame.frame.pPitch[2].pitch) == 0
+    assert int(frame.frame.pPitch[2].xsize) == 0
+    assert int(frame.frame.pPitch[2].ysize) == 0
     x = frame.frame.pPitch[0]
     x.pitch = 123
-    frame.frame.pPitch = [x,x,x]
+    frame.frame.pPitch = [x, x, x]
     # [ptr : 0x0
     # pitch : 123
     # xsize : 0
@@ -1189,18 +1253,18 @@ def test_cudart_eglFrame():
     # pitch : 123
     # xsize : 0
     # ysize : 0]
-    assert(int(frame.frame.pPitch[0].ptr) == 0)
-    assert(int(frame.frame.pPitch[0].pitch) == 123)
-    assert(int(frame.frame.pPitch[0].xsize) == 0)
-    assert(int(frame.frame.pPitch[0].ysize) == 0)
-    assert(int(frame.frame.pPitch[1].ptr) == 0)
-    assert(int(frame.frame.pPitch[1].pitch) == 123)
-    assert(int(frame.frame.pPitch[1].xsize) == 0)
-    assert(int(frame.frame.pPitch[1].ysize) == 0)
-    assert(int(frame.frame.pPitch[2].ptr) == 0)
-    assert(int(frame.frame.pPitch[2].pitch) == 123)
-    assert(int(frame.frame.pPitch[2].xsize) == 0)
-    assert(int(frame.frame.pPitch[2].ysize) == 0)
+    assert int(frame.frame.pPitch[0].ptr) == 0
+    assert int(frame.frame.pPitch[0].pitch) == 123
+    assert int(frame.frame.pPitch[0].xsize) == 0
+    assert int(frame.frame.pPitch[0].ysize) == 0
+    assert int(frame.frame.pPitch[1].ptr) == 0
+    assert int(frame.frame.pPitch[1].pitch) == 123
+    assert int(frame.frame.pPitch[1].xsize) == 0
+    assert int(frame.frame.pPitch[1].ysize) == 0
+    assert int(frame.frame.pPitch[2].ptr) == 0
+    assert int(frame.frame.pPitch[2].pitch) == 123
+    assert int(frame.frame.pPitch[2].xsize) == 0
+    assert int(frame.frame.pPitch[2].ysize) == 0
     x.pitch = 1234
     # [ptr : 0x0
     # pitch : 123
@@ -1212,37 +1276,40 @@ def test_cudart_eglFrame():
     # pitch : 123
     # xsize : 0
     # ysize : 0]
-    assert(int(frame.frame.pPitch[0].ptr) == 0)
-    assert(int(frame.frame.pPitch[0].pitch) == 123)
-    assert(int(frame.frame.pPitch[0].xsize) == 0)
-    assert(int(frame.frame.pPitch[0].ysize) == 0)
-    assert(int(frame.frame.pPitch[1].ptr) == 0)
-    assert(int(frame.frame.pPitch[1].pitch) == 123)
-    assert(int(frame.frame.pPitch[1].xsize) == 0)
-    assert(int(frame.frame.pPitch[1].ysize) == 0)
-    assert(int(frame.frame.pPitch[2].ptr) == 0)
-    assert(int(frame.frame.pPitch[2].pitch) == 123)
-    assert(int(frame.frame.pPitch[2].xsize) == 0)
-    assert(int(frame.frame.pPitch[2].ysize) == 0)
+    assert int(frame.frame.pPitch[0].ptr) == 0
+    assert int(frame.frame.pPitch[0].pitch) == 123
+    assert int(frame.frame.pPitch[0].xsize) == 0
+    assert int(frame.frame.pPitch[0].ysize) == 0
+    assert int(frame.frame.pPitch[1].ptr) == 0
+    assert int(frame.frame.pPitch[1].pitch) == 123
+    assert int(frame.frame.pPitch[1].xsize) == 0
+    assert int(frame.frame.pPitch[1].ysize) == 0
+    assert int(frame.frame.pPitch[2].ptr) == 0
+    assert int(frame.frame.pPitch[2].pitch) == 123
+    assert int(frame.frame.pPitch[2].xsize) == 0
+    assert int(frame.frame.pPitch[2].ysize) == 0
+
 
 def cudart_func_stream_callback(use_host_api):
     class testStruct(ctypes.Structure):
-        _fields_ = [('a', ctypes.c_int),
-                    ('b', ctypes.c_int),
-                    ('c', ctypes.c_int),]
+        _fields_ = [
+            ("a", ctypes.c_int),
+            ("b", ctypes.c_int),
+            ("c", ctypes.c_int),
+        ]
 
     def task_callback_host(userData):
         data = testStruct.from_address(userData)
-        assert(data.a == 1)
-        assert(data.b == 2)
-        assert(data.c == 3)
+        assert data.a == 1
+        assert data.b == 2
+        assert data.c == 3
         return 0
 
     def task_callback_stream(stream, status, userData):
         data = testStruct.from_address(userData)
-        assert(data.a == 1)
-        assert(data.b == 2)
-        assert(data.c == 3)
+        assert data.a == 1
+        assert data.b == 2
+        assert data.c == 3
         return 0
 
     if use_host_api:
@@ -1266,12 +1333,12 @@ def task_callback_stream(stream, status, userData):
     err, stream = cudart.cudaStreamCreate()
     assertSuccess(err)
     if use_host_api:
-        err, = cudart.cudaLaunchHostFunc(stream, callback, ctypes.addressof(c_data))
+        (err,) = cudart.cudaLaunchHostFunc(stream, callback, ctypes.addressof(c_data))
         assertSuccess(err)
     else:
-        err, = cudart.cudaStreamAddCallback(stream, callback, ctypes.addressof(c_data), 0)
+        (err,) = cudart.cudaStreamAddCallback(stream, callback, ctypes.addressof(c_data), 0)
         assertSuccess(err)
-    err, = cudart.cudaDeviceSynchronize()
+    (err,) = cudart.cudaDeviceSynchronize()
     assertSuccess(err)
 
 
@@ -1279,8 +1346,11 @@ def test_cudart_func_callback():
     cudart_func_stream_callback(use_host_api=False)
     cudart_func_stream_callback(use_host_api=True)
 
-@pytest.mark.skipif(driverVersionLessThan(12030)
-                    or not supportsCudaAPI('cudaGraphConditionalHandleCreate'), reason='Conditional graph APIs required')
+
+@pytest.mark.skipif(
+    driverVersionLessThan(12030) or not supportsCudaAPI("cudaGraphConditionalHandleCreate"),
+    reason="Conditional graph APIs required",
+)
 def test_cudart_conditional():
     err, graph = cudart.cudaGraphCreate(0)
     assertSuccess(err)
@@ -1293,10 +1363,10 @@ def test_cudart_conditional():
     params.conditional.type = cudart.cudaGraphConditionalNodeType.cudaGraphCondTypeIf
     params.conditional.size = 1
 
-    assert(len(params.conditional.phGraph_out) == 1)
-    assert(int(params.conditional.phGraph_out[0]) == 0)
+    assert len(params.conditional.phGraph_out) == 1
+    assert int(params.conditional.phGraph_out[0]) == 0
     err, node = cudart.cudaGraphAddNode(graph, None, 0, params)
     assertSuccess(err)
 
-    assert(len(params.conditional.phGraph_out) == 1)
-    assert(int(params.conditional.phGraph_out[0]) != 0)
+    assert len(params.conditional.phGraph_out) == 1
+    assert int(params.conditional.phGraph_out[0]) != 0
diff --git a/cuda_bindings/tests/test_interoperability.py b/cuda_bindings/tests/test_interoperability.py
index aab3b9a93..998535992 100644
--- a/cuda_bindings/tests/test_interoperability.py
+++ b/cuda_bindings/tests/test_interoperability.py
@@ -5,229 +5,240 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
+import numpy as np
 import pytest
+
 import cuda.cuda as cuda
 import cuda.cudart as cudart
-import numpy as np
+
 
 def supportsMemoryPool():
     err, isSupported = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
     return err == cudart.cudaError_t.cudaSuccess and isSupported
 
+
 def test_interop_stream():
-    err_dr, = cuda.cuInit(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    (err_dr,) = cuda.cuInit(0)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, device = cuda.cuDeviceGet(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, ctx = cuda.cuCtxCreate(0, device)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
     # DRV to RT
     err_dr, stream = cuda.cuStreamCreate(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
-    err_rt, = cudart.cudaStreamDestroy(stream)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+    (err_rt,) = cudart.cudaStreamDestroy(stream)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
 
     # RT to DRV
     err_rt, stream = cudart.cudaStreamCreate()
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
-    err_dr, = cuda.cuStreamDestroy(stream)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
+    (err_dr,) = cuda.cuStreamDestroy(stream)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+
+    (err_dr,) = cuda.cuCtxDestroy(ctx)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
-    err_dr, = cuda.cuCtxDestroy(ctx)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
 
 def test_interop_event():
-    err_dr, = cuda.cuInit(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    (err_dr,) = cuda.cuInit(0)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, device = cuda.cuDeviceGet(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, ctx = cuda.cuCtxCreate(0, device)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
     # DRV to RT
     err_dr, event = cuda.cuEventCreate(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
-    err_rt, = cudart.cudaEventDestroy(event)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+    (err_rt,) = cudart.cudaEventDestroy(event)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
 
     # RT to DRV
     err_rt, event = cudart.cudaEventCreate()
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
-    err_dr, = cuda.cuEventDestroy(event)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
+    (err_dr,) = cuda.cuEventDestroy(event)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+
+    (err_dr,) = cuda.cuCtxDestroy(ctx)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
-    err_dr, = cuda.cuCtxDestroy(ctx)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
 
 def test_interop_graph():
-    err_dr, = cuda.cuInit(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    (err_dr,) = cuda.cuInit(0)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, device = cuda.cuDeviceGet(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, ctx = cuda.cuCtxCreate(0, device)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
     # DRV to RT
     err_dr, graph = cuda.cuGraphCreate(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
-    err_rt, = cudart.cudaGraphDestroy(graph)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+    (err_rt,) = cudart.cudaGraphDestroy(graph)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
 
     # RT to DRV
     err_rt, graph = cudart.cudaGraphCreate(0)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
-    err_dr, = cuda.cuGraphDestroy(graph)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
+    (err_dr,) = cuda.cuGraphDestroy(graph)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+
+    (err_dr,) = cuda.cuCtxDestroy(ctx)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
-    err_dr, = cuda.cuCtxDestroy(ctx)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
 
 def test_interop_graphNode():
-    err_dr, = cuda.cuInit(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    (err_dr,) = cuda.cuInit(0)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, device = cuda.cuDeviceGet(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, ctx = cuda.cuCtxCreate(0, device)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
     err_dr, graph = cuda.cuGraphCreate(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
     # DRV to RT
     err_dr, node = cuda.cuGraphAddEmptyNode(graph, [], 0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
-    err_rt, = cudart.cudaGraphDestroyNode(node)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+    (err_rt,) = cudart.cudaGraphDestroyNode(node)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
 
     # RT to DRV
     err_rt, node = cudart.cudaGraphAddEmptyNode(graph, [], 0)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
-    err_dr, = cuda.cuGraphDestroyNode(node)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
+    (err_dr,) = cuda.cuGraphDestroyNode(node)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+
+    (err_rt,) = cudart.cudaGraphDestroy(graph)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
+    (err_dr,) = cuda.cuCtxDestroy(ctx)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
-    err_rt, = cudart.cudaGraphDestroy(graph)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
-    err_dr, = cuda.cuCtxDestroy(ctx)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
 
 def test_interop_userObject():
-    err_dr, = cuda.cuInit(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    (err_dr,) = cuda.cuInit(0)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, device = cuda.cuDeviceGet(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, ctx = cuda.cuCtxCreate(0, device)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
     # cudaUserObject_t
     # TODO
 
-    err_dr, = cuda.cuCtxDestroy(ctx)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    (err_dr,) = cuda.cuCtxDestroy(ctx)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+
 
 def test_interop_function():
-    err_dr, = cuda.cuInit(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    (err_dr,) = cuda.cuInit(0)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, device = cuda.cuDeviceGet(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, ctx = cuda.cuCtxCreate(0, device)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
     # cudaFunction_t
     # TODO
 
-    err_dr, = cuda.cuCtxDestroy(ctx)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    (err_dr,) = cuda.cuCtxDestroy(ctx)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
-@pytest.mark.skipif(not supportsMemoryPool(), reason='Requires mempool operations')
+
+@pytest.mark.skipif(not supportsMemoryPool(), reason="Requires mempool operations")
 def test_interop_memPool():
-    err_dr, = cuda.cuInit(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    (err_dr,) = cuda.cuInit(0)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, device = cuda.cuDeviceGet(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, ctx = cuda.cuCtxCreate(0, device)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
     # DRV to RT
     err_dr, pool = cuda.cuDeviceGetDefaultMemPool(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
-    err_rt, = cudart.cudaDeviceSetMemPool(0, pool)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+    (err_rt,) = cudart.cudaDeviceSetMemPool(0, pool)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
 
     # RT to DRV
     err_rt, pool = cudart.cudaDeviceGetDefaultMemPool(0)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
-    err_dr, = cuda.cuDeviceSetMemPool(0, pool)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
+    (err_dr,) = cuda.cuDeviceSetMemPool(0, pool)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+
+    (err_dr,) = cuda.cuCtxDestroy(ctx)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
-    err_dr, = cuda.cuCtxDestroy(ctx)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
 
 def test_interop_graphExec():
-    err_dr, = cuda.cuInit(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    (err_dr,) = cuda.cuInit(0)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, device = cuda.cuDeviceGet(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, ctx = cuda.cuCtxCreate(0, device)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, graph = cuda.cuGraphCreate(0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
     err_dr, node = cuda.cuGraphAddEmptyNode(graph, [], 0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
     # DRV to RT
     err_dr, graphExec = cuda.cuGraphInstantiate(graph, 0)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
-    err_rt, = cudart.cudaGraphExecDestroy(graphExec)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+    (err_rt,) = cudart.cudaGraphExecDestroy(graphExec)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
 
     # RT to DRV
     err_rt, graphExec = cudart.cudaGraphInstantiate(graph, 0)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
-    err_dr, = cuda.cuGraphExecDestroy(graphExec)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
+    (err_dr,) = cuda.cuGraphExecDestroy(graphExec)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+
+    (err_rt,) = cudart.cudaGraphDestroy(graph)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
+    (err_dr,) = cuda.cuCtxDestroy(ctx)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
-    err_rt, = cudart.cudaGraphDestroy(graph)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
-    err_dr, = cuda.cuCtxDestroy(ctx)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
 
 def test_interop_deviceptr():
     # Init CUDA
-    err, = cuda.cuInit(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    (err,) = cuda.cuInit(0)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Get device
     err, device = cuda.cuDeviceGet(0)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Construct context
     err, ctx = cuda.cuCtxCreate(0, device)
-    assert(err == cuda.CUresult.CUDA_SUCCESS)
+    assert err == cuda.CUresult.CUDA_SUCCESS
 
     # Allocate dev memory
     size = 1024 * np.uint8().itemsize
     err_dr, dptr = cuda.cuMemAlloc(size)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
 
     # Allocate host memory
     h1 = np.full(size, 1).astype(np.uint8)
     h2 = np.full(size, 2).astype(np.uint8)
-    assert(np.array_equal(h1, h2) is False)
+    assert np.array_equal(h1, h2) is False
 
     # Initialize device memory
-    err_rt, = cudart.cudaMemset(dptr, 1, size)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
+    (err_rt,) = cudart.cudaMemset(dptr, 1, size)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
 
     # D to h2
-    err_rt, = cudart.cudaMemcpy(h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
-    assert(err_rt == cudart.cudaError_t.cudaSuccess)
+    (err_rt,) = cudart.cudaMemcpy(h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
+    assert err_rt == cudart.cudaError_t.cudaSuccess
 
     # Validate h1 == h2
-    assert(np.array_equal(h1, h2))
+    assert np.array_equal(h1, h2)
 
     # Cleanup
-    err_dr, = cuda.cuMemFree(dptr)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
-    err_dr, = cuda.cuCtxDestroy(ctx)
-    assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
+    (err_dr,) = cuda.cuMemFree(dptr)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
+    (err_dr,) = cuda.cuCtxDestroy(ctx)
+    assert err_dr == cuda.CUresult.CUDA_SUCCESS
diff --git a/cuda_bindings/tests/test_kernelParams.py b/cuda_bindings/tests/test_kernelParams.py
index 5bf745ef2..c45bf46ab 100644
--- a/cuda_bindings/tests/test_kernelParams.py
+++ b/cuda_bindings/tests/test_kernelParams.py
@@ -5,23 +5,26 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
-import pytest
-from cuda import cuda, cudart, nvrtc
-import numpy as np
 import ctypes
 
+import numpy as np
+
+from cuda import cuda, cudart, nvrtc
+
+
 def ASSERT_DRV(err):
     if isinstance(err, cuda.CUresult):
         if err != cuda.CUresult.CUDA_SUCCESS:
-            raise RuntimeError('Cuda Error: {}'.format(err))
+            raise RuntimeError(f"Cuda Error: {err}")
     elif isinstance(err, cudart.cudaError_t):
         if err != cudart.cudaError_t.cudaSuccess:
-            raise RuntimeError('Cudart Error: {}'.format(err))
+            raise RuntimeError(f"Cudart Error: {err}")
     elif isinstance(err, nvrtc.nvrtcResult):
         if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
-            raise RuntimeError('Nvrtc Error: {}'.format(err))
+            raise RuntimeError(f"Nvrtc Error: {err}")
     else:
-        raise RuntimeError('Unknown error type: {}'.format(err))
+        raise RuntimeError(f"Unknown error type: {err}")
+
 
 def common_nvrtc(allKernelStrings, dev):
     err, major = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev)
@@ -30,19 +33,19 @@ def common_nvrtc(allKernelStrings, dev):
     ASSERT_DRV(err)
     err, _, nvrtc_minor = nvrtc.nvrtcVersion()
     ASSERT_DRV(err)
-    use_cubin = (nvrtc_minor >= 1)
-    prefix = 'sm' if use_cubin else 'compute'
-    arch_arg = bytes(f'--gpu-architecture={prefix}_{major}{minor}', 'ascii')
+    use_cubin = nvrtc_minor >= 1
+    prefix = "sm" if use_cubin else "compute"
+    arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii")
 
-    err, prog = nvrtc.nvrtcCreateProgram(str.encode(allKernelStrings), b'allKernelStrings.cu', 0, None, None)
+    err, prog = nvrtc.nvrtcCreateProgram(str.encode(allKernelStrings), b"allKernelStrings.cu", 0, None, None)
     ASSERT_DRV(err)
-    opts = (b'--fmad=false', arch_arg)
-    err, = nvrtc.nvrtcCompileProgram(prog, len(opts), opts)
+    opts = (b"--fmad=false", arch_arg)
+    (err,) = nvrtc.nvrtcCompileProgram(prog, len(opts), opts)
 
     err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog)
     ASSERT_DRV(err_log)
-    log = b' ' * logSize
-    err_log, = nvrtc.nvrtcGetProgramLog(prog, log)
+    log = b" " * logSize
+    (err_log,) = nvrtc.nvrtcGetProgramLog(prog, log)
     ASSERT_DRV(err_log)
     result = log.decode()
     if len(result) > 1:
@@ -52,14 +55,14 @@ def common_nvrtc(allKernelStrings, dev):
     if use_cubin:
         err, dataSize = nvrtc.nvrtcGetCUBINSize(prog)
         ASSERT_DRV(err)
-        data = b' ' * dataSize
-        err, = nvrtc.nvrtcGetCUBIN(prog, data)
+        data = b" " * dataSize
+        (err,) = nvrtc.nvrtcGetCUBIN(prog, data)
         ASSERT_DRV(err)
     else:
         err, dataSize = nvrtc.nvrtcGetPTXSize(prog)
         ASSERT_DRV(err)
-        data = b' ' * dataSize
-        err, = nvrtc.nvrtcGetPTX(prog, data)
+        data = b" " * dataSize
+        (err,) = nvrtc.nvrtcGetPTX(prog, data)
         ASSERT_DRV(err)
 
     err, module = cuda.cuModuleLoadData(np.char.array(data))
@@ -67,15 +70,16 @@ def common_nvrtc(allKernelStrings, dev):
 
     return module
 
+
 def test_kernelParams_empty():
-    err, = cuda.cuInit(0)
+    (err,) = cuda.cuInit(0)
     ASSERT_DRV(err)
     err, cuDevice = cuda.cuDeviceGet(0)
     ASSERT_DRV(err)
     err, context = cuda.cuCtxCreate(0, cuDevice)
     ASSERT_DRV(err)
 
-    kernelString = '''\
+    kernelString = """\
     static __device__ bool isDone;
     extern "C" __global__
     void empty_kernel()
@@ -83,50 +87,67 @@ def test_kernelParams_empty():
         isDone = true;
         if (isDone) return;
     }
-    '''
+    """
 
     module = common_nvrtc(kernelString, cuDevice)
 
     # cudaStructs kernel
-    err, kernel = cuda.cuModuleGetFunction(module, b'empty_kernel')
+    err, kernel = cuda.cuModuleGetFunction(module, b"empty_kernel")
     ASSERT_DRV(err)
 
     err, stream = cuda.cuStreamCreate(0)
     ASSERT_DRV(err)
 
-    err, = cuda.cuLaunchKernel(kernel,
-                               1, 1, 1,   # grid dim
-                               1, 1, 1,   # block dim
-                               0, stream, # shared mem and stream
-                               ((), ()), 0) # arguments
-    ASSERT_DRV(err)
-    err, = cuda.cuLaunchKernel(kernel,
-                               1, 1, 1,   # grid dim
-                               1, 1, 1,   # block dim
-                               0, stream, # shared mem and stream
-                               None, 0) # arguments
+    (err,) = cuda.cuLaunchKernel(
+        kernel,
+        1,
+        1,
+        1,  # grid dim
+        1,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        ((), ()),  # arguments
+        0,
+    )
+    ASSERT_DRV(err)
+    (err,) = cuda.cuLaunchKernel(
+        kernel,
+        1,
+        1,
+        1,  # grid dim
+        1,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        None,  # arguments
+        0,
+    )
     ASSERT_DRV(err)
 
     # Retrieve global and validate
     isDone_host = ctypes.c_bool()
-    err, isDonePtr_device, isDonePtr_device_size = cuda.cuModuleGetGlobal(module, b'isDone')
+    err, isDonePtr_device, isDonePtr_device_size = cuda.cuModuleGetGlobal(module, b"isDone")
     ASSERT_DRV(err)
-    assert(isDonePtr_device_size == ctypes.sizeof(ctypes.c_bool))
-    err, = cuda.cuMemcpyDtoHAsync(isDone_host, isDonePtr_device, ctypes.sizeof(ctypes.c_bool), stream)
+    assert isDonePtr_device_size == ctypes.sizeof(ctypes.c_bool)
+    (err,) = cuda.cuMemcpyDtoHAsync(isDone_host, isDonePtr_device, ctypes.sizeof(ctypes.c_bool), stream)
     ASSERT_DRV(err)
-    err, = cuda.cuStreamSynchronize(stream)
+    (err,) = cuda.cuStreamSynchronize(stream)
     ASSERT_DRV(err)
-    assert(isDone_host.value == True)
+    assert isDone_host.value is True
 
-    err, = cuda.cuStreamDestroy(stream)
+    (err,) = cuda.cuStreamDestroy(stream)
     ASSERT_DRV(err)
-    err, = cuda.cuModuleUnload(module)
+    (err,) = cuda.cuModuleUnload(module)
     ASSERT_DRV(err)
-    err, = cuda.cuCtxDestroy(context)
+    (err,) = cuda.cuCtxDestroy(context)
     ASSERT_DRV(err)
 
+
 def kernelParams_basic(use_ctypes_as_values):
-    err, = cuda.cuInit(0)
+    (err,) = cuda.cuInit(0)
     ASSERT_DRV(err)
     err, cuDevice = cuda.cuDeviceGet(0)
     ASSERT_DRV(err)
@@ -134,39 +155,66 @@ def kernelParams_basic(use_ctypes_as_values):
     ASSERT_DRV(err)
 
     if use_ctypes_as_values:
-        assertValues_host = (ctypes.c_bool(True),
-                             ctypes.c_char(b'Z'), ctypes.c_wchar('Ā'),
-                             ctypes.c_byte(-127), ctypes.c_ubyte(255),
-                             ctypes.c_short(1), ctypes.c_ushort(1),
-                             ctypes.c_int(2), ctypes.c_uint(2),
-                             ctypes.c_long(3), ctypes.c_ulong(3),
-                             ctypes.c_longlong(4), ctypes.c_ulonglong(4),
-                             ctypes.c_size_t(5),
-                             ctypes.c_float(float(123.456)), ctypes.c_float(float(123.456)),
-                             ctypes.c_void_p(0xdeadbeef))
+        assertValues_host = (
+            ctypes.c_bool(True),
+            ctypes.c_char(b"Z"),
+            ctypes.c_wchar("Ā"),
+            ctypes.c_byte(-127),
+            ctypes.c_ubyte(255),
+            ctypes.c_short(1),
+            ctypes.c_ushort(1),
+            ctypes.c_int(2),
+            ctypes.c_uint(2),
+            ctypes.c_long(3),
+            ctypes.c_ulong(3),
+            ctypes.c_longlong(4),
+            ctypes.c_ulonglong(4),
+            ctypes.c_size_t(5),
+            ctypes.c_float(123.456),
+            ctypes.c_float(123.456),
+            ctypes.c_void_p(0xDEADBEEF),
+        )
     else:
-        assertValues_host = (True,
-                             b'Z', 'Ā',
-                             -127, 255,
-                             90, 72,
-                             85, 82,
-                             66, 65,
-                             86, 90,
-                             33,
-                             float(123.456), float(123.456),
-                             0xdeadbeef)
-    assertTypes_host = (ctypes.c_bool,
-                        ctypes.c_char, ctypes.c_wchar,
-                        ctypes.c_byte, ctypes.c_ubyte,
-                        ctypes.c_short, ctypes.c_ushort,
-                        ctypes.c_int, ctypes.c_uint,
-                        ctypes.c_long, ctypes.c_ulong,
-                        ctypes.c_longlong, ctypes.c_ulonglong,
-                        ctypes.c_size_t,
-                        ctypes.c_float, ctypes.c_double,
-                        ctypes.c_void_p)
-
-    basicKernelString = '''\
+        assertValues_host = (
+            True,
+            b"Z",
+            "Ā",
+            -127,
+            255,
+            90,
+            72,
+            85,
+            82,
+            66,
+            65,
+            86,
+            90,
+            33,
+            123.456,
+            123.456,
+            0xDEADBEEF,
+        )
+    assertTypes_host = (
+        ctypes.c_bool,
+        ctypes.c_char,
+        ctypes.c_wchar,
+        ctypes.c_byte,
+        ctypes.c_ubyte,
+        ctypes.c_short,
+        ctypes.c_ushort,
+        ctypes.c_int,
+        ctypes.c_uint,
+        ctypes.c_long,
+        ctypes.c_ulong,
+        ctypes.c_longlong,
+        ctypes.c_ulonglong,
+        ctypes.c_size_t,
+        ctypes.c_float,
+        ctypes.c_double,
+        ctypes.c_void_p,
+    )
+
+    basicKernelString = """\
     extern "C" __global__
     void basic(bool b,
                char c, wchar_t wc,
@@ -222,25 +270,25 @@ def kernelParams_basic(use_ctypes_as_values):
         *pf = f;
         *pd = d;
     }
-    '''
+    """
     idx = 0
-    while '{}' in basicKernelString:
+    while "{}" in basicKernelString:
         val = assertValues_host[idx].value if use_ctypes_as_values else assertValues_host[idx]
         if assertTypes_host[idx] == ctypes.c_float:
-            basicKernelString = basicKernelString.replace('{}', str(float(val)) + 'f', 1)
+            basicKernelString = basicKernelString.replace("{}", str(float(val)) + "f", 1)
         elif assertTypes_host[idx] == ctypes.c_double:
-            basicKernelString = basicKernelString.replace('{}', str(float(val)), 1)
+            basicKernelString = basicKernelString.replace("{}", str(float(val)), 1)
         elif assertTypes_host[idx] == ctypes.c_char:
-            basicKernelString = basicKernelString.replace('{}', str(val)[1:], 1)
+            basicKernelString = basicKernelString.replace("{}", str(val)[1:], 1)
         elif assertTypes_host[idx] == ctypes.c_wchar:
-            basicKernelString = basicKernelString.replace('{}', str(ord(val)), 1)
+            basicKernelString = basicKernelString.replace("{}", str(ord(val)), 1)
         else:
-            basicKernelString = basicKernelString.replace('{}', str(int(val)), 1)
+            basicKernelString = basicKernelString.replace("{}", str(int(val)), 1)
         idx += 1
 
     module = common_nvrtc(basicKernelString, cuDevice)
 
-    err, kernel = cuda.cuModuleGetFunction(module, b'basic')
+    err, kernel = cuda.cuModuleGetFunction(module, b"basic")
     ASSERT_DRV(err)
 
     err, stream = cuda.cuStreamCreate(0)
@@ -280,108 +328,124 @@ def kernelParams_basic(use_ctypes_as_values):
     err, pd = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_double))
     ASSERT_DRV(err)
 
-    assertValues_device = (pb,
-                           pc, pwc,
-                           pbyte, pubyte,
-                           ps, pus,
-                           pi, pui,
-                           pl, pul,
-                           pll, pull,
-                           psize,
-                           pf, pd)
-    assertTypes_device = (None,
-                          None, None,
-                          None, None,
-                          None, None,
-                          None, None,
-                          None, None,
-                          None, None,
-                          None,
-                          None, None)
+    assertValues_device = (pb, pc, pwc, pbyte, pubyte, ps, pus, pi, pui, pl, pul, pll, pull, psize, pf, pd)
+    assertTypes_device = (
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    )
 
     basicKernelValues = assertValues_host + assertValues_device
     basicKernelTypes = assertTypes_host + assertTypes_device
-    err, = cuda.cuLaunchKernel(kernel,
-                               1, 1, 1,   # grid dim
-                               1, 1, 1,   # block dim
-                               0, stream, # shared mem and stream
-                               (basicKernelValues, basicKernelTypes), 0) # arguments
+    (err,) = cuda.cuLaunchKernel(
+        kernel,
+        1,
+        1,
+        1,  # grid dim
+        1,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        (basicKernelValues, basicKernelTypes),  # arguments
+        0,
+    )
     ASSERT_DRV(err)
 
     # Retrieve each dptr
     host_params = tuple([valueType() for valueType in assertTypes_host[:-1]])
     for i in range(len(host_params)):
-        err, = cuda.cuMemcpyDtoHAsync(host_params[i], assertValues_device[i], ctypes.sizeof(assertTypes_host[i]), stream)
+        (err,) = cuda.cuMemcpyDtoHAsync(
+            host_params[i], assertValues_device[i], ctypes.sizeof(assertTypes_host[i]), stream
+        )
         ASSERT_DRV(err)
 
     # Validate retrieved values
-    err, = cuda.cuStreamSynchronize(stream)
+    (err,) = cuda.cuStreamSynchronize(stream)
     ASSERT_DRV(err)
     for i in range(len(host_params)):
         val = basicKernelValues[i].value if use_ctypes_as_values else basicKernelValues[i]
         if basicKernelTypes[i] == ctypes.c_float:
             if use_ctypes_as_values:
-                assert(val == host_params[i].value)
+                assert val == host_params[i].value
             else:
-                assert(val == (int(host_params[i].value * 1000) / 1000))
+                assert val == (int(host_params[i].value * 1000) / 1000)
         else:
-            assert(val == host_params[i].value)
+            assert val == host_params[i].value
 
-    err, = cuda.cuMemFree(pb)
+    (err,) = cuda.cuMemFree(pb)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pc)
+    (err,) = cuda.cuMemFree(pc)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pwc)
+    (err,) = cuda.cuMemFree(pwc)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pbyte)
+    (err,) = cuda.cuMemFree(pbyte)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pubyte)
+    (err,) = cuda.cuMemFree(pubyte)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(ps)
+    (err,) = cuda.cuMemFree(ps)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pus)
+    (err,) = cuda.cuMemFree(pus)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pi)
+    (err,) = cuda.cuMemFree(pi)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pui)
+    (err,) = cuda.cuMemFree(pui)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pl)
+    (err,) = cuda.cuMemFree(pl)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pul)
+    (err,) = cuda.cuMemFree(pul)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pll)
+    (err,) = cuda.cuMemFree(pll)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pull)
+    (err,) = cuda.cuMemFree(pull)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(psize)
+    (err,) = cuda.cuMemFree(psize)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pf)
+    (err,) = cuda.cuMemFree(pf)
     ASSERT_DRV(err)
-    err, = cuda.cuMemFree(pd)
+    (err,) = cuda.cuMemFree(pd)
     ASSERT_DRV(err)
-    err, = cuda.cuStreamDestroy(stream)
+    (err,) = cuda.cuStreamDestroy(stream)
     ASSERT_DRV(err)
-    err, = cuda.cuModuleUnload(module)
+    (err,) = cuda.cuModuleUnload(module)
     ASSERT_DRV(err)
-    err, = cuda.cuCtxDestroy(context)
+    (err,) = cuda.cuCtxDestroy(context)
     ASSERT_DRV(err)
 
+
 def test_kernelParams_basic():
     # Kernel is given basic Python primative values as value input
-    kernelParams_basic(use_ctypes_as_values = False)
+    kernelParams_basic(use_ctypes_as_values=False)
+
 
 def test_kernelParams_basic_ctypes():
     # Kernel is given basic c_type instances as primative value input
-    kernelParams_basic(use_ctypes_as_values = True)
+    kernelParams_basic(use_ctypes_as_values=True)
+
 
 def test_kernelParams_types_cuda():
-    err, = cuda.cuInit(0)
+    (err,) = cuda.cuInit(0)
     ASSERT_DRV(err)
     err, cuDevice = cuda.cuDeviceGet(0)
     ASSERT_DRV(err)
     err, context = cuda.cuCtxCreate(0, cuDevice)
     ASSERT_DRV(err)
-    err, uvaSupported = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice)
+    err, uvaSupported = cuda.cuDeviceGetAttribute(
+        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice
+    )
     ASSERT_DRV(err)
 
     err, perr = cudart.cudaMalloc(ctypes.sizeof(ctypes.c_int))
@@ -393,25 +457,33 @@ def test_kernelParams_types_cuda():
 
     # Get device pointer if UVM is not enabled
     if uvaSupported:
-        kernelValues = (cudart.cudaError_t.cudaErrorUnknown, perr,                                         # enums
-                        cudart.cudaSurfaceObject_t(248), cudart.cudaSurfaceObject_t(_ptr=pSurface_host),   # typedef of primative
-                        cudart.dim3(), cudart.dim3(_ptr=pDim3_host))                                       # struct
+        kernelValues = (
+            cudart.cudaError_t.cudaErrorUnknown,
+            perr,  # enums
+            cudart.cudaSurfaceObject_t(248),
+            cudart.cudaSurfaceObject_t(_ptr=pSurface_host),  # typedef of primitive
+            cudart.dim3(),
+            cudart.dim3(_ptr=pDim3_host),  # struct
+        )
     else:
         err, pSurface_device = cudart.cudaHostGetDevicePointer(pSurface_host, 0)
         ASSERT_DRV(err)
         err, pDim3_device = cudart.cudaHostGetDevicePointer(pDim3_host, 0)
         ASSERT_DRV(err)
-        kernelValues = (cudart.cudaError_t.cudaErrorUnknown, perr,                                         # enums
-                        cudart.cudaSurfaceObject_t(248), cudart.cudaSurfaceObject_t(_ptr=pSurface_device), # typedef of primative
-                        cudart.dim3(), cudart.dim3(_ptr=pDim3_device))                                     # struct
-    kernelTypes = (None, ctypes.c_void_p,
-                   None, ctypes.c_void_p,
-                   None, ctypes.c_void_p)
+        kernelValues = (
+            cudart.cudaError_t.cudaErrorUnknown,
+            perr,  # enums
+            cudart.cudaSurfaceObject_t(248),
+            cudart.cudaSurfaceObject_t(_ptr=pSurface_device),  # typedef of primitive
+            cudart.dim3(),
+            cudart.dim3(_ptr=pDim3_device),  # struct
+        )
+    kernelTypes = (None, ctypes.c_void_p, None, ctypes.c_void_p, None, ctypes.c_void_p)
     kernelValues[4].x = 1
     kernelValues[4].y = 2
     kernelValues[4].z = 3
 
-    kernelString = '''\
+    kernelString = """\
     extern "C" __global__
     void structsCuda(cudaError_t err, cudaError_t *perr,
                      cudaSurfaceObject_t surface, cudaSurfaceObject_t *pSurface,
@@ -423,73 +495,90 @@ def test_kernelParams_types_cuda():
         pdim->y = dim.y;
         pdim->z = dim.z;
     }
-    '''
+    """
 
     module = common_nvrtc(kernelString, cuDevice)
 
     # cudaStructs kernel
-    err, kernel = cuda.cuModuleGetFunction(module, b'structsCuda')
+    err, kernel = cuda.cuModuleGetFunction(module, b"structsCuda")
     ASSERT_DRV(err)
 
     err, stream = cuda.cuStreamCreate(0)
     ASSERT_DRV(err)
 
-    err, = cuda.cuLaunchKernel(kernel,
-                               1, 1, 1,   # grid dim
-                               1, 1, 1,   # block dim
-                               0, stream, # shared mem and stream
-                               (kernelValues, kernelTypes), 0) # arguments
+    (err,) = cuda.cuLaunchKernel(
+        kernel,
+        1,
+        1,
+        1,  # grid dim
+        1,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        (kernelValues, kernelTypes),  # arguments
+        0,
+    )
     ASSERT_DRV(err)
 
     # Retrieve each dptr
     host_err = ctypes.c_int()
-    err, = cudart.cudaMemcpyAsync(ctypes.addressof(host_err), perr, ctypes.sizeof(ctypes.c_int()), cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
+    (err,) = cudart.cudaMemcpyAsync(
+        ctypes.addressof(host_err),
+        perr,
+        ctypes.sizeof(ctypes.c_int()),
+        cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
+        stream,
+    )
     ASSERT_DRV(err)
 
     # Validate kernel values
-    err, = cuda.cuStreamSynchronize(stream)
+    (err,) = cuda.cuStreamSynchronize(stream)
     ASSERT_DRV(err)
     cuda_err = cudart.cudaError_t(host_err.value)
 
     if uvaSupported:
-        assert(kernelValues[0] == cuda_err)
-        assert(int(kernelValues[2]) == int(kernelValues[3]))
-        assert(kernelValues[4].x == kernelValues[5].x)
-        assert(kernelValues[4].y == kernelValues[5].y)
-        assert(kernelValues[4].z == kernelValues[5].z)
+        assert kernelValues[0] == cuda_err
+        assert int(kernelValues[2]) == int(kernelValues[3])
+        assert kernelValues[4].x == kernelValues[5].x
+        assert kernelValues[4].y == kernelValues[5].y
+        assert kernelValues[4].z == kernelValues[5].z
     else:
         surface_host = cudart.cudaSurfaceObject_t(_ptr=pSurface_host)
         dim3_host = cudart.dim3(_ptr=pDim3_host)
-        assert(kernelValues[0] == cuda_err)
-        assert(int(kernelValues[2]) == int(surface_host))
-        assert(kernelValues[4].x == dim3_host.x)
-        assert(kernelValues[4].y == dim3_host.y)
-        assert(kernelValues[4].z == dim3_host.z)
+        assert kernelValues[0] == cuda_err
+        assert int(kernelValues[2]) == int(surface_host)
+        assert kernelValues[4].x == dim3_host.x
+        assert kernelValues[4].y == dim3_host.y
+        assert kernelValues[4].z == dim3_host.z
 
-    err, = cudart.cudaFree(perr)
+    (err,) = cudart.cudaFree(perr)
     ASSERT_DRV(err)
-    err, = cudart.cudaFreeHost(pSurface_host)
+    (err,) = cudart.cudaFreeHost(pSurface_host)
     ASSERT_DRV(err)
-    err, = cudart.cudaFreeHost(pDim3_host)
+    (err,) = cudart.cudaFreeHost(pDim3_host)
     ASSERT_DRV(err)
-    err, = cuda.cuStreamDestroy(stream)
+    (err,) = cuda.cuStreamDestroy(stream)
     ASSERT_DRV(err)
-    err, = cuda.cuModuleUnload(module)
+    (err,) = cuda.cuModuleUnload(module)
     ASSERT_DRV(err)
-    err, = cuda.cuCtxDestroy(context)
+    (err,) = cuda.cuCtxDestroy(context)
     ASSERT_DRV(err)
 
+
 def test_kernelParams_struct_custom():
-    err, = cuda.cuInit(0)
+    (err,) = cuda.cuInit(0)
     ASSERT_DRV(err)
     err, cuDevice = cuda.cuDeviceGet(0)
     ASSERT_DRV(err)
     err, context = cuda.cuCtxCreate(0, cuDevice)
     ASSERT_DRV(err)
-    err, uvaSupported = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice)
+    err, uvaSupported = cuda.cuDeviceGetAttribute(
+        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice
+    )
     ASSERT_DRV(err)
 
-    kernelString = '''\
+    kernelString = """\
     struct testStruct {
         int value;
     };
@@ -499,11 +588,11 @@ def test_kernelParams_struct_custom():
     {
         dst->value = src.value;
     }
-    '''
+    """
 
     module = common_nvrtc(kernelString, cuDevice)
 
-    err, kernel = cuda.cuModuleGetFunction(module, b'structCustom')
+    err, kernel = cuda.cuModuleGetFunction(module, b"structCustom")
     ASSERT_DRV(err)
 
     err, stream = cuda.cuStreamCreate(0)
@@ -511,7 +600,7 @@ def test_kernelParams_struct_custom():
 
     # structCustom kernel
     class testStruct(ctypes.Structure):
-        _fields_ = [('value',ctypes.c_int)]
+        _fields_ = [("value", ctypes.c_int)]
 
     err, pStruct_host = cudart.cudaHostAlloc(ctypes.sizeof(testStruct), cudart.cudaHostAllocMapped)
     ASSERT_DRV(err)
@@ -525,39 +614,50 @@ class testStruct(ctypes.Structure):
         kernelValues = (testStruct(5), pStruct_device)
     kernelTypes = (None, ctypes.c_void_p)
 
-    err, = cuda.cuLaunchKernel(kernel,
-                               1, 1, 1,   # grid dim
-                               1, 1, 1,   # block dim
-                               0, stream, # shared mem and stream
-                               (kernelValues, kernelTypes), 0) # arguments
+    (err,) = cuda.cuLaunchKernel(
+        kernel,
+        1,
+        1,
+        1,  # grid dim
+        1,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        (kernelValues, kernelTypes),  # arguments
+        0,
+    )
     ASSERT_DRV(err)
 
     # Validate kernel values
-    err, = cuda.cuStreamSynchronize(stream)
+    (err,) = cuda.cuStreamSynchronize(stream)
     ASSERT_DRV(err)
     struct_shared = testStruct.from_address(pStruct_host)
-    assert(kernelValues[0].value == struct_shared.value)
+    assert kernelValues[0].value == struct_shared.value
 
-    err, = cudart.cudaFreeHost(pStruct_host)
+    (err,) = cudart.cudaFreeHost(pStruct_host)
     ASSERT_DRV(err)
-    err, = cuda.cuStreamDestroy(stream)
+    (err,) = cuda.cuStreamDestroy(stream)
     ASSERT_DRV(err)
-    err, = cuda.cuModuleUnload(module)
+    (err,) = cuda.cuModuleUnload(module)
     ASSERT_DRV(err)
-    err, = cuda.cuCtxDestroy(context)
+    (err,) = cuda.cuCtxDestroy(context)
     ASSERT_DRV(err)
 
+
 def kernelParams_buffer_protocol_ctypes_common(pass_by_address):
-    err, = cuda.cuInit(0)
+    (err,) = cuda.cuInit(0)
     ASSERT_DRV(err)
     err, cuDevice = cuda.cuDeviceGet(0)
     ASSERT_DRV(err)
     err, context = cuda.cuCtxCreate(0, cuDevice)
     ASSERT_DRV(err)
-    err, uvaSupported = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice)
+    err, uvaSupported = cuda.cuDeviceGetAttribute(
+        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice
+    )
     ASSERT_DRV(err)
 
-    kernelString = '''\
+    kernelString = """\
     struct testStruct {
         int value;
     };
@@ -570,11 +670,11 @@ def kernelParams_buffer_protocol_ctypes_common(pass_by_address):
         *pf = f;
         ps->value = s.value;
     }
-    '''
+    """
 
     module = common_nvrtc(kernelString, cuDevice)
 
-    err, kernel = cuda.cuModuleGetFunction(module, b'testkernel')
+    err, kernel = cuda.cuModuleGetFunction(module, b"testkernel")
     ASSERT_DRV(err)
 
     err, stream = cuda.cuStreamCreate(0)
@@ -582,7 +682,7 @@ def kernelParams_buffer_protocol_ctypes_common(pass_by_address):
 
     # testkernel kernel
     class testStruct(ctypes.Structure):
-        _fields_ = [('value',ctypes.c_int)]
+        _fields_ = [("value", ctypes.c_int)]
 
     err, pInt_host = cudart.cudaHostAlloc(ctypes.sizeof(ctypes.c_int), cudart.cudaHostAllocMapped)
     ASSERT_DRV(err)
@@ -593,9 +693,14 @@ class testStruct(ctypes.Structure):
 
     # Get device pointer if UVM is not enabled
     if uvaSupported:
-        kernelValues = (ctypes.c_int(1), ctypes.c_void_p(pInt_host),
-                        ctypes.c_float(float(123.456)), ctypes.c_void_p(pFloat_host),
-                        testStruct(5), ctypes.c_void_p(pStruct_host))
+        kernelValues = (
+            ctypes.c_int(1),
+            ctypes.c_void_p(pInt_host),
+            ctypes.c_float(123.456),
+            ctypes.c_void_p(pFloat_host),
+            testStruct(5),
+            ctypes.c_void_p(pStruct_host),
+        )
     else:
         err, pInt_device = cudart.cudaHostGetDevicePointer(pInt_host, 0)
         ASSERT_DRV(err)
@@ -603,51 +708,68 @@ class testStruct(ctypes.Structure):
         ASSERT_DRV(err)
         err, pStruct_device = cudart.cudaHostGetDevicePointer(pStruct_host, 0)
         ASSERT_DRV(err)
-        kernelValues = (ctypes.c_int(1), ctypes.c_void_p(pInt_device),
-                        ctypes.c_float(float(123.456)), ctypes.c_void_p(pFloat_device),
-                        testStruct(5), ctypes.c_void_p(pStruct_device))
-
-    packagedParams = (ctypes.c_void_p*len(kernelValues))()
+        kernelValues = (
+            ctypes.c_int(1),
+            ctypes.c_void_p(pInt_device),
+            ctypes.c_float(123.456),
+            ctypes.c_void_p(pFloat_device),
+            testStruct(5),
+            ctypes.c_void_p(pStruct_device),
+        )
+
+    packagedParams = (ctypes.c_void_p * len(kernelValues))()
     for idx in range(len(packagedParams)):
         packagedParams[idx] = ctypes.addressof(kernelValues[idx])
-    err, = cuda.cuLaunchKernel(kernel,
-                               1, 1, 1,   # grid dim
-                               1, 1, 1,   # block dim
-                               0, stream, # shared mem and stream
-                               ctypes.addressof(packagedParams) if pass_by_address else packagedParams, 0) # arguments
+    (err,) = cuda.cuLaunchKernel(
+        kernel,
+        1,
+        1,
+        1,  # grid dim
+        1,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        ctypes.addressof(packagedParams) if pass_by_address else packagedParams,
+        0,
+    )  # arguments
     ASSERT_DRV(err)
 
     # Validate kernel values
-    err, = cuda.cuStreamSynchronize(stream)
+    (err,) = cuda.cuStreamSynchronize(stream)
     ASSERT_DRV(err)
-    assert(kernelValues[0].value == ctypes.c_int.from_address(pInt_host).value)
-    assert(kernelValues[2].value == ctypes.c_float.from_address(pFloat_host).value)
-    assert(kernelValues[4].value == testStruct.from_address(pStruct_host).value)
+    assert kernelValues[0].value == ctypes.c_int.from_address(pInt_host).value
+    assert kernelValues[2].value == ctypes.c_float.from_address(pFloat_host).value
+    assert kernelValues[4].value == testStruct.from_address(pStruct_host).value
 
-    err, = cudart.cudaFreeHost(pStruct_host)
+    (err,) = cudart.cudaFreeHost(pStruct_host)
     ASSERT_DRV(err)
-    err, = cuda.cuStreamDestroy(stream)
+    (err,) = cuda.cuStreamDestroy(stream)
     ASSERT_DRV(err)
-    err, = cuda.cuModuleUnload(module)
+    (err,) = cuda.cuModuleUnload(module)
     ASSERT_DRV(err)
-    err, = cuda.cuCtxDestroy(context)
+    (err,) = cuda.cuCtxDestroy(context)
     ASSERT_DRV(err)
 
+
 def test_kernelParams_buffer_protocol_ctypes():
     kernelParams_buffer_protocol_ctypes_common(pass_by_address=True)
     kernelParams_buffer_protocol_ctypes_common(pass_by_address=False)
 
+
 def test_kernelParams_buffer_protocol_numpy():
-    err, = cuda.cuInit(0)
+    (err,) = cuda.cuInit(0)
     ASSERT_DRV(err)
     err, cuDevice = cuda.cuDeviceGet(0)
     ASSERT_DRV(err)
     err, context = cuda.cuCtxCreate(0, cuDevice)
     ASSERT_DRV(err)
-    err, uvaSupported = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice)
+    err, uvaSupported = cuda.cuDeviceGetAttribute(
+        cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice
+    )
     ASSERT_DRV(err)
 
-    kernelString = '''\
+    kernelString = """\
     struct testStruct {
         int value;
     };
@@ -660,18 +782,18 @@ def test_kernelParams_buffer_protocol_numpy():
         *pf = f;
         ps->value = s.value;
     }
-    '''
+    """
 
     module = common_nvrtc(kernelString, cuDevice)
 
-    err, kernel = cuda.cuModuleGetFunction(module, b'testkernel')
+    err, kernel = cuda.cuModuleGetFunction(module, b"testkernel")
     ASSERT_DRV(err)
 
     err, stream = cuda.cuStreamCreate(0)
     ASSERT_DRV(err)
 
     # testkernel kernel
-    testStruct = np.dtype([('value', np.int32)])
+    testStruct = np.dtype([("value", np.int32)])
 
     err, pInt_host = cudart.cudaHostAlloc(np.dtype(np.int32).itemsize, cudart.cudaHostAllocMapped)
     ASSERT_DRV(err)
@@ -682,9 +804,14 @@ def test_kernelParams_buffer_protocol_numpy():
 
     # Get device pointer if UVM is not enabled
     if uvaSupported:
-        kernelValues = (np.array(1, dtype=np.uint32), np.array([pInt_host], dtype=np.uint64),
-                        np.array(float(123.456), dtype=np.float32), np.array([pFloat_host], dtype=np.uint64),
-                        np.array([5], testStruct), np.array([pStruct_host], dtype=np.uint64))
+        kernelValues = (
+            np.array(1, dtype=np.uint32),
+            np.array([pInt_host], dtype=np.uint64),
+            np.array(123.456, dtype=np.float32),
+            np.array([pFloat_host], dtype=np.uint64),
+            np.array([5], testStruct),
+            np.array([pStruct_host], dtype=np.uint64),
+        )
     else:
         err, pInt_device = cudart.cudaHostGetDevicePointer(pInt_host, 0)
         ASSERT_DRV(err)
@@ -692,37 +819,48 @@ def test_kernelParams_buffer_protocol_numpy():
         ASSERT_DRV(err)
         err, pStruct_device = cudart.cudaHostGetDevicePointer(pStruct_host, 0)
         ASSERT_DRV(err)
-        kernelValues = (np.array(1, dtype=np.int32), np.array([pInt_device], dtype=np.uint64),
-                        np.array(float(123.456), dtype=np.float32), np.array([pFloat_device], dtype=np.uint64),
-                        np.array([5], testStruct), np.array([pStruct_device], dtype=np.uint64))
+        kernelValues = (
+            np.array(1, dtype=np.int32),
+            np.array([pInt_device], dtype=np.uint64),
+            np.array(123.456, dtype=np.float32),
+            np.array([pFloat_device], dtype=np.uint64),
+            np.array([5], testStruct),
+            np.array([pStruct_device], dtype=np.uint64),
+        )
 
     packagedParams = np.array([arg.ctypes.data for arg in kernelValues], dtype=np.uint64)
-    err, = cuda.cuLaunchKernel(kernel,
-                               1, 1, 1,   # grid dim
-                               1, 1, 1,   # block dim
-                               0, stream, # shared mem and stream
-                               packagedParams, 0) # arguments
+    (err,) = cuda.cuLaunchKernel(
+        kernel,
+        1,
+        1,
+        1,  # grid dim
+        1,
+        1,
+        1,  # block dim
+        0,
+        stream,  # shared mem and stream
+        packagedParams,
+        0,
+    )  # arguments
     ASSERT_DRV(err)
 
     # Validate kernel values
-    err, = cuda.cuStreamSynchronize(stream)
+    (err,) = cuda.cuStreamSynchronize(stream)
     ASSERT_DRV(err)
 
-    class numpy_address_wrapper():
+    class numpy_address_wrapper:
         def __init__(self, address, typestr):
-            self.__array_interface__ = {'data': (address, False),
-                                        'typestr': typestr,
-                                        'shape': (1,)}
+            self.__array_interface__ = {"data": (address, False), "typestr": typestr, "shape": (1,)}
 
-    assert(kernelValues[0] == np.array(numpy_address_wrapper(pInt_host, '<i4')))
-    assert(kernelValues[2] == np.array(numpy_address_wrapper(pFloat_host, '<f4')))
-    assert(kernelValues[4]['value'] == np.array(numpy_address_wrapper(pStruct_host, '<i4'), dtype=testStruct)['value'])
+    assert kernelValues[0] == np.array(numpy_address_wrapper(pInt_host, "<i4"))
+    assert kernelValues[2] == np.array(numpy_address_wrapper(pFloat_host, "<f4"))
+    assert kernelValues[4]["value"] == np.array(numpy_address_wrapper(pStruct_host, "<i4"), dtype=testStruct)["value"]
 
-    err, = cudart.cudaFreeHost(pStruct_host)
+    (err,) = cudart.cudaFreeHost(pStruct_host)
     ASSERT_DRV(err)
-    err, = cuda.cuStreamDestroy(stream)
+    (err,) = cuda.cuStreamDestroy(stream)
     ASSERT_DRV(err)
-    err, = cuda.cuModuleUnload(module)
+    (err,) = cuda.cuModuleUnload(module)
     ASSERT_DRV(err)
-    err, = cuda.cuCtxDestroy(context)
+    (err,) = cuda.cuCtxDestroy(context)
     ASSERT_DRV(err)
diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py
index 182d2bc40..aaf32df33 100644
--- a/cuda_bindings/tests/test_nvjitlink.py
+++ b/cuda_bindings/tests/test_nvjitlink.py
@@ -6,7 +6,6 @@
 
 from cuda.bindings import nvjitlink
 
-
 ptx_kernel = """
 .version 8.5
 .target sm_90
@@ -39,8 +38,9 @@
 }
 """
 
-ptx_kernel_bytes = ptx_kernel.encode('utf-8')
-minimal_ptx_kernel_bytes = minimal_ptx_kernel.encode('utf-8')
+ptx_kernel_bytes = ptx_kernel.encode("utf-8")
+minimal_ptx_kernel_bytes = minimal_ptx_kernel.encode("utf-8")
+
 
 def test_unrecognized_option_error():
     with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"):
@@ -67,7 +67,9 @@ def test_complete_empty():
 def test_add_data():
     handle = nvjitlink.create(1, ["-arch=sm_90"])
     nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_kernel_bytes, len(ptx_kernel_bytes), "test_data")
-    nvjitlink.add_data(handle, nvjitlink.InputType.ANY, minimal_ptx_kernel_bytes, len(minimal_ptx_kernel_bytes), "minimal_test_data")
+    nvjitlink.add_data(
+        handle, nvjitlink.InputType.ANY, minimal_ptx_kernel_bytes, len(minimal_ptx_kernel_bytes), "minimal_test_data"
+    )
     nvjitlink.complete(handle)
     nvjitlink.destroy(handle)
 
diff --git a/cuda_bindings/tests/test_nvrtc.py b/cuda_bindings/tests/test_nvrtc.py
index a4d911508..077484010 100644
--- a/cuda_bindings/tests/test_nvrtc.py
+++ b/cuda_bindings/tests/test_nvrtc.py
@@ -6,21 +6,25 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 import pytest
+
 from cuda import nvrtc
 
+
 def ASSERT_DRV(err):
     if isinstance(err, nvrtc.nvrtcResult):
         if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
-            raise RuntimeError('Nvrtc Error: {}'.format(err))
+            raise RuntimeError(f"Nvrtc Error: {err}")
     else:
-        raise RuntimeError('Unknown error type: {}'.format(err))
+        raise RuntimeError(f"Unknown error type: {err}")
+
 
 def nvrtcVersionLessThan(major, minor):
     err, major_version, minor_version = nvrtc.nvrtcVersion()
     ASSERT_DRV(err)
     return major_version < major or (major == major_version and minor_version < minor)
 
-@pytest.mark.skipif(nvrtcVersionLessThan(11, 3), reason='When nvrtcGetSupportedArchs was introduced')
+
+@pytest.mark.skipif(nvrtcVersionLessThan(11, 3), reason="When nvrtcGetSupportedArchs was introduced")
 def test_nvrtcGetSupportedArchs():
     err, supportedArchs = nvrtc.nvrtcGetSupportedArchs()
     ASSERT_DRV(err)
diff --git a/cuda_core/cuda/core/experimental/_context.py b/cuda_core/cuda/core/experimental/_context.py
index 216953aba..349f448c7 100644
--- a/cuda_core/cuda/core/experimental/_context.py
+++ b/cuda_core/cuda/core/experimental/_context.py
@@ -4,8 +4,7 @@
 
 from dataclasses import dataclass
 
-from cuda import cuda, cudart
-from cuda.core.experimental._utils import handle_return
+from cuda import cuda
 
 
 @dataclass
@@ -14,7 +13,6 @@ class ContextOptions:
 
 
 class Context:
-
     __slots__ = ("_handle", "_id")
 
     def __init__(self):
diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 2899282c2..0c03c789f 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -3,16 +3,13 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 import threading
-from typing import Optional, Union
-import warnings
+from typing import Union
 
 from cuda import cuda, cudart
-from cuda.core.experimental._utils import handle_return, ComputeCapability, CUDAError, \
-                             precondition
 from cuda.core.experimental._context import Context, ContextOptions
-from cuda.core.experimental._memory import _DefaultAsyncMempool, Buffer, MemoryResource
-from cuda.core.experimental._stream import default_stream, Stream, StreamOptions
-
+from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool
+from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
+from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition
 
 _tls = threading.local()
 _tls_lock = threading.Lock()
@@ -44,6 +41,7 @@ class Device:
         Default value of `None` return the currently used device.
 
     """
+
     __slots__ = ("_id", "_mr", "_has_inited")
 
     def __new__(cls, device_id=None):
@@ -54,8 +52,7 @@ def __new__(cls, device_id=None):
         else:
             total = handle_return(cudart.cudaGetDeviceCount())
             if not isinstance(device_id, int) or not (0 <= device_id < total):
-                raise ValueError(
-                    f"device_id must be within [0, {total}), got {device_id}")
+                raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
         with _tls_lock:
@@ -73,8 +70,7 @@ def __new__(cls, device_id=None):
 
     def _check_context_initialized(self, *args, **kwargs):
         if not self._has_inited:
-            raise CUDAError("the device is not yet initialized, "
-                            "perhaps you forgot to call .set_current() first?")
+            raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?")
 
     @property
     def device_id(self) -> int:
@@ -115,7 +111,7 @@ def name(self) -> str:
         """Return the device name."""
         # Use 256 characters to be consistent with CUDA Runtime
         name = handle_return(cuda.cuDeviceGetName(256, self._id))
-        name = name.split(b'\0')[0]
+        name = name.split(b"\0")[0]
         return name.decode()
 
     @property
@@ -127,10 +123,12 @@ def properties(self) -> dict:
     @property
     def compute_capability(self) -> ComputeCapability:
         """Return a named tuple with 2 fields: major and minor."""
-        major = handle_return(cudart.cudaDeviceGetAttribute(
-            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id))
-        minor = handle_return(cudart.cudaDeviceGetAttribute(
-            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id))
+        major = handle_return(
+            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)
+        )
+        minor = handle_return(
+            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)
+        )
         return ComputeCapability(major, minor)
 
     @property
@@ -178,7 +176,7 @@ def __int__(self):
     def __repr__(self):
         return f"<Device {self._id} ({self.name})>"
 
-    def set_current(self, ctx: Context=None) -> Union[Context, None]:
+    def set_current(self, ctx: Context = None) -> Union[Context, None]:
         """Set device to be used for GPU executions.
 
         Initializes CUDA and sets the calling thread to a valid CUDA
@@ -212,8 +210,10 @@ def set_current(self, ctx: Context=None) -> Union[Context, None]:
             if not isinstance(ctx, Context):
                 raise TypeError("a Context object is required")
             if ctx._id != self._id:
-                raise RuntimeError("the provided context was created on a different "
-                                  f"device {ctx._id} other than the target {self._id}")
+                raise RuntimeError(
+                    "the provided context was created on a different "
+                    f"device {ctx._id} other than the target {self._id}"
+                )
             prev_ctx = handle_return(cuda.cuCtxPopCurrent())
             handle_return(cuda.cuCtxPushCurrent(ctx._handle))
             self._has_inited = True
@@ -257,7 +257,7 @@ def create_context(self, options: ContextOptions = None) -> Context:
         raise NotImplementedError("TODO")
 
     @precondition(_check_context_initialized)
-    def create_stream(self, obj=None, options: StreamOptions=None) -> Stream:
+    def create_stream(self, obj=None, options: StreamOptions = None) -> Stream:
         """Create a Stream object.
 
         New stream objects can be created in two different ways:
diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
index a6d5da281..af4768953 100644
--- a/cuda_core/cuda/core/experimental/_event.py
+++ b/cuda_core/cuda/core/experimental/_event.py
@@ -6,9 +6,7 @@
 from typing import Optional
 
 from cuda import cuda
-from cuda.core.experimental._utils import check_or_create_options
-from cuda.core.experimental._utils import CUDAError
-from cuda.core.experimental._utils import handle_return
+from cuda.core.experimental._utils import CUDAError, check_or_create_options, handle_return
 
 
 @dataclass
@@ -30,6 +28,7 @@ class EventOptions:
         Note that enable_timing must be False. (Default to False)
 
     """
+
     enable_timing: Optional[bool] = False
     busy_waited_sync: Optional[bool] = False
     support_ipc: Optional[bool] = False
@@ -50,16 +49,17 @@ class Event:
     and they should instead be created through a :obj:`Stream` object.
 
     """
+
     __slots__ = ("_handle", "_timing_disabled", "_busy_waited")
 
     def __init__(self):
         self._handle = None
         raise NotImplementedError(
-            "directly creating an Event object can be ambiguous. Please call "
-            "call Stream.record().")
+            "directly creating an Event object can be ambiguous. Please call Stream.record()."
+        )
 
     @staticmethod
-    def _init(options: Optional[EventOptions]=None):
+    def _init(options: Optional[EventOptions] = None):
         self = Event.__new__(Event)
         # minimal requirements for the destructor
         self._handle = None
@@ -119,7 +119,7 @@ def sync(self):
     @property
     def is_done(self) -> bool:
         """Return True if all captured works have been completed, otherwise False."""
-        result, = cuda.cuEventQuery(self._handle)
+        (result,) = cuda.cuEventQuery(self._handle)
         if result == cuda.CUresult.CUDA_SUCCESS:
             return True
         elif result == cuda.CUresult.CUDA_ERROR_NOT_READY:
diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py
index 9991638f3..77af6b7e1 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.py
@@ -2,20 +2,16 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-from dataclasses import dataclass
 import importlib.metadata
+from dataclasses import dataclass
 from typing import Optional, Union
 
-import numpy as np
-
-from cuda import cuda, cudart
+from cuda import cuda
 from cuda.core.experimental._kernel_arg_handler import ParamHolder
-from cuda.core.experimental._memory import Buffer
 from cuda.core.experimental._module import Kernel
 from cuda.core.experimental._stream import Stream
 from cuda.core.experimental._utils import CUDAError, check_or_create_options, handle_return
 
-
 # TODO: revisit this treatment for py313t builds
 _inited = False
 _use_ex = None
@@ -28,8 +24,7 @@ def _lazy_init():
 
     global _use_ex
     # binding availability depends on cuda-python version
-    _py_major_minor = tuple(int(v) for v in (
-        importlib.metadata.version("cuda-python").split(".")[:2]))
+    _py_major_minor = tuple(int(v) for v in (importlib.metadata.version("cuda-python").split(".")[:2]))
     _driver_ver = handle_return(cuda.cuDriverGetVersion())
     _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
     _inited = True
@@ -55,6 +50,7 @@ class LaunchConfig:
         (Default to size 0)
 
     """
+
     # TODO: expand LaunchConfig to include other attributes
     grid: Union[tuple, int] = None
     block: Union[tuple, int] = None
@@ -65,14 +61,11 @@ def __post_init__(self):
         self.grid = self._cast_to_3_tuple(self.grid)
         self.block = self._cast_to_3_tuple(self.block)
         # we handle "stream=None" in the launch API
-        if self.stream is not None:
-            if not isinstance(self.stream, Stream):
-                try:
-                    self.stream = Stream._init(self.stream)
-                except Exception as e:
-                    raise ValueError(
-                        "stream must either be a Stream object "
-                        "or support __cuda_stream__") from e
+        if self.stream is not None and not isinstance(self.stream, Stream):
+            try:
+                self.stream = Stream._init(self.stream)
+            except Exception as e:
+                raise ValueError("stream must either be a Stream object or support __cuda_stream__") from e
         if self.shmem_size is None:
             self.shmem_size = 0
 
@@ -141,14 +134,11 @@ def launch(kernel, config, *kernel_args):
         drv_cfg.hStream = config.stream._handle
         drv_cfg.sharedMemBytes = config.shmem_size
         drv_cfg.numAttrs = 0  # TODO
-        handle_return(cuda.cuLaunchKernelEx(
-            drv_cfg, int(kernel._handle), args_ptr, 0))
+        handle_return(cuda.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
     else:
         # TODO: check if config has any unsupported attrs
-        handle_return(cuda.cuLaunchKernel(
-            int(kernel._handle),
-            *config.grid,
-            *config.block,
-            config.shmem_size,
-            config.stream._handle,
-            args_ptr, 0))
+        handle_return(
+            cuda.cuLaunchKernel(
+                int(kernel._handle), *config.grid, *config.block, config.shmem_size, config.stream._handle, args_ptr, 0
+            )
+        )
diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py
index 678f26ee8..415b5151f 100644
--- a/cuda_core/cuda/core/experimental/_memory.py
+++ b/cuda_core/cuda/core/experimental/_memory.py
@@ -6,14 +6,12 @@
 
 import abc
 from typing import Optional, Tuple, TypeVar
-import warnings
 
 from cuda import cuda
 from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
 from cuda.core.experimental._stream import default_stream
 from cuda.core.experimental._utils import handle_return
 
-
 PyCapsule = TypeVar("PyCapsule")
 
 
@@ -44,9 +42,9 @@ class Buffer:
     """
 
     # TODO: handle ownership? (_mr could be None)
-    __slots__ = ("_ptr", "_size", "_mr",)
+    __slots__ = ("_ptr", "_size", "_mr")
 
-    def __init__(self, ptr, size, mr: MemoryResource=None):
+    def __init__(self, ptr, size, mr: MemoryResource = None):
         self._ptr = ptr
         self._size = size
         self._mr = mr
@@ -112,7 +110,7 @@ def device_id(self) -> int:
             return self._mr.device_id
         raise NotImplementedError
 
-    def copy_to(self, dst: Buffer=None, *, stream) -> Buffer:
+    def copy_to(self, dst: Buffer = None, *, stream) -> Buffer:
         """Copy from this buffer to the dst buffer asynchronously on the given stream.
 
         Copies the data from this buffer to the provided dst buffer.
@@ -136,8 +134,7 @@ def copy_to(self, dst: Buffer=None, *, stream) -> Buffer:
             dst = self._mr.allocate(self._size, stream)
         if dst._size != self._size:
             raise ValueError("buffer sizes mismatch between src and dst")
-        handle_return(
-            cuda.cuMemcpyAsync(dst._ptr, self._ptr, self._size, stream._handle))
+        handle_return(cuda.cuMemcpyAsync(dst._ptr, self._ptr, self._size, stream._handle))
         return dst
 
     def copy_from(self, src: Buffer, *, stream):
@@ -156,14 +153,16 @@ def copy_from(self, src: Buffer, *, stream):
             raise ValueError("stream must be provided")
         if src._size != self._size:
             raise ValueError("buffer sizes mismatch between src and dst")
-        handle_return(
-            cuda.cuMemcpyAsync(self._ptr, src._ptr, self._size, stream._handle))
-
-    def __dlpack__(self, *,
-                   stream: Optional[int] = None,
-                   max_version: Optional[Tuple[int, int]] = None, 
-                   dl_device: Optional[Tuple[int, int]] = None, 
-                   copy: Optional[bool] = None) -> PyCapsule:
+        handle_return(cuda.cuMemcpyAsync(self._ptr, src._ptr, self._size, stream._handle))
+
+    def __dlpack__(
+        self,
+        *,
+        stream: Optional[int] = None,
+        max_version: Optional[Tuple[int, int]] = None,
+        dl_device: Optional[Tuple[int, int]] = None,
+        copy: Optional[bool] = None,
+    ) -> PyCapsule:
         # Note: we ignore the stream argument entirely (as if it is -1).
         # It is the user's responsibility to maintain stream order.
         if dl_device is not None or copy is True:
@@ -172,10 +171,7 @@ def __dlpack__(self, *,
             versioned = False
         else:
             assert len(max_version) == 2
-            if max_version >= (1, 0):
-                versioned = True
-            else:
-                versioned = False
+            versioned = max_version >= (1, 0)
         capsule = make_py_capsule(self, versioned)
         return capsule
 
@@ -191,10 +187,10 @@ def __dlpack_device__(self) -> Tuple[int, int]:
             raise BufferError("invalid buffer")
 
     def __buffer__(self, flags: int, /) -> memoryview:
-        # Support for Python-level buffer protocol as per PEP 688. 
-        # This raises a BufferError unless: 
+        # Support for Python-level buffer protocol as per PEP 688.
+        # This raises a BufferError unless:
         #   1. Python is 3.12+
-        #   2. This Buffer object is host accessible 
+        #   2. This Buffer object is host accessible
         raise NotImplementedError("TODO")
 
     def __release_buffer__(self, buffer: memoryview, /):
@@ -203,20 +199,16 @@ def __release_buffer__(self, buffer: memoryview, /):
 
 
 class MemoryResource(abc.ABC):
-
     __slots__ = ("_handle",)
 
     @abc.abstractmethod
-    def __init__(self, *args, **kwargs):
-        ...
+    def __init__(self, *args, **kwargs): ...
 
     @abc.abstractmethod
-    def allocate(self, size, stream=None) -> Buffer:
-        ...
+    def allocate(self, size, stream=None) -> Buffer: ...
 
     @abc.abstractmethod
-    def deallocate(self, ptr, size, stream=None):
-        ...
+    def deallocate(self, ptr, size, stream=None): ...
 
     @property
     @abc.abstractmethod
@@ -241,7 +233,6 @@ def device_id(self) -> int:
 
 
 class _DefaultAsyncMempool(MemoryResource):
-
     __slots__ = ("_dev_id",)
 
     def __init__(self, dev_id):
@@ -273,7 +264,6 @@ def device_id(self) -> int:
 
 
 class _DefaultPinnedMemorySource(MemoryResource):
-
     def __init__(self):
         # TODO: support flags from cuMemHostAlloc?
         self._handle = None
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 7621b9ee4..836d5064a 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -4,10 +4,9 @@
 
 import importlib.metadata
 
-from cuda import cuda, cudart
+from cuda import cuda
 from cuda.core.experimental._utils import handle_return
 
-
 _backend = {
     "old": {
         "file": cuda.cuModuleLoad,
@@ -56,7 +55,10 @@ class Kernel:
 
     """
 
-    __slots__ = ("_handle", "_module",)
+    __slots__ = (
+        "_handle",
+        "_module",
+    )
 
     def __init__(self):
         raise NotImplementedError("directly constructing a Kernel instance is not supported")
@@ -107,8 +109,7 @@ class ObjectCode:
     __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map")
     _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin")
 
-    def __init__(self, module, code_type, jit_options=None, *,
-                 symbol_mapping=None):
+    def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None):
         if code_type not in self._supported_code_type:
             raise ValueError
         _lazy_init()
@@ -129,9 +130,16 @@ def __init__(self, module, code_type, jit_options=None, *,
             if jit_options is None:
                 jit_options = {}
             if backend == "new":
-                args = (module, list(jit_options.keys()), list(jit_options.values()), len(jit_options),
-                        # TODO: support library options
-                        [], [], 0)
+                args = (
+                    module,
+                    list(jit_options.keys()),
+                    list(jit_options.values()),
+                    len(jit_options),
+                    # TODO: support library options
+                    [],
+                    [],
+                    0,
+                )
             else:  # "old" backend
                 args = (module, len(jit_options), list(jit_options.keys()), list(jit_options.values()))
             self._handle = handle_return(self._loader["data"](*args))
diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 75b7313f4..6cf13c83b 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -3,8 +3,8 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 from cuda import nvrtc
-from cuda.core.experimental._utils import handle_return
 from cuda.core.experimental._module import ObjectCode
+from cuda.core.experimental._utils import handle_return
 
 
 class Program:
@@ -24,8 +24,8 @@ class Program:
 
     """
 
-    __slots__ = ("_handle", "_backend", )
-    _supported_code_type = ("c++", )
+    __slots__ = ("_handle", "_backend")
+    _supported_code_type = ("c++",)
     _supported_target_type = ("ptx", "cubin", "ltoir")
 
     def __init__(self, code, code_type):
@@ -38,8 +38,7 @@ def __init__(self, code, code_type):
                 raise TypeError
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-            self._handle = handle_return(
-                nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
+            self._handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
             self._backend = "nvrtc"
         else:
             raise NotImplementedError
@@ -85,14 +84,10 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None):
         if self._backend == "nvrtc":
             if name_expressions:
                 for n in name_expressions:
-                    handle_return(
-                        nvrtc.nvrtcAddNameExpression(self._handle, n.encode()),
-                        handle=self._handle)
+                    handle_return(nvrtc.nvrtcAddNameExpression(self._handle, n.encode()), handle=self._handle)
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
             options = list(o.encode() for o in options)
-            handle_return(
-                nvrtc.nvrtcCompileProgram(self._handle, len(options), options),
-                handle=self._handle)
+            handle_return(nvrtc.nvrtcCompileProgram(self._handle, len(options), options), handle=self._handle)
 
             size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size")
             comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}")
@@ -103,16 +98,15 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None):
             symbol_mapping = {}
             if name_expressions:
                 for n in name_expressions:
-                    symbol_mapping[n] = handle_return(nvrtc.nvrtcGetLoweredName(
-                        self._handle, n.encode()), handle=self._handle)
+                    symbol_mapping[n] = handle_return(
+                        nvrtc.nvrtcGetLoweredName(self._handle, n.encode()), handle=self._handle
+                    )
 
             if logs is not None:
-                logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._handle),
-                                        handle=self._handle)
+                logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._handle), handle=self._handle)
                 if logsize > 1:
                     log = b" " * logsize
-                    handle_return(nvrtc.nvrtcGetProgramLog(self._handle, log),
-                                  handle=self._handle)
+                    handle_return(nvrtc.nvrtcGetProgramLog(self._handle, log), handle=self._handle)
                     logs.write(log.decode())
 
             # TODO: handle jit_options for ptx?
diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py
index 7f50dafdb..6a68d1753 100644
--- a/cuda_core/cuda/core/experimental/_stream.py
+++ b/cuda_core/cuda/core/experimental/_stream.py
@@ -4,18 +4,16 @@
 
 from __future__ import annotations
 
-from dataclasses import dataclass
 import os
-from typing import Optional, Tuple, TYPE_CHECKING, Union
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Tuple, Union
 
 if TYPE_CHECKING:
     from cuda.core.experimental._device import Device
 from cuda import cuda, cudart
 from cuda.core.experimental._context import Context
 from cuda.core.experimental._event import Event, EventOptions
-from cuda.core.experimental._utils import check_or_create_options
-from cuda.core.experimental._utils import get_device_from_ctx
-from cuda.core.experimental._utils import handle_return
+from cuda.core.experimental._utils import check_or_create_options, get_device_from_ctx, handle_return
 
 
 @dataclass
@@ -31,6 +29,7 @@ class StreamOptions:
         higher priority. (Default to lowest priority)
 
     """
+
     nonblocking: bool = True
     priority: Optional[int] = None
 
@@ -53,8 +52,7 @@ class Stream:
 
     """
 
-    __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin",
-                 "_device_id", "_ctx_handle")
+    __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", "_device_id", "_ctx_handle")
 
     def __init__(self):
         # minimal requirements for the destructor
@@ -64,10 +62,11 @@ def __init__(self):
         raise NotImplementedError(
             "directly creating a Stream object can be ambiguous. Please either "
             "call Device.create_stream() or, if a stream pointer is already "
-            "available from somewhere else, Stream.from_handle()")
+            "available from somewhere else, Stream.from_handle()"
+        )
 
     @staticmethod
-    def _init(obj=None, *, options: Optional[StreamOptions]=None):
+    def _init(obj=None, *, options: Optional[StreamOptions] = None):
         self = Stream.__new__(Stream)
 
         # minimal requirements for the destructor
@@ -95,10 +94,7 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None):
         nonblocking = options.nonblocking
         priority = options.priority
 
-        if nonblocking:
-            flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING
-        else:
-            flags = cuda.CUstream_flags.CU_STREAM_DEFAULT
+        flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cuda.CUstream_flags.CU_STREAM_DEFAULT
 
         high, low = handle_return(cudart.cudaDeviceGetStreamPriorityRange())
         if priority is not None:
@@ -107,8 +103,7 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None):
         else:
             priority = high
 
-        self._handle = handle_return(
-            cuda.cuStreamCreateWithPriority(flags, priority))
+        self._handle = handle_return(cuda.cuStreamCreateWithPriority(flags, priority))
         self._owner = None
         self._nonblocking = nonblocking
         self._priority = priority
@@ -169,7 +164,7 @@ def sync(self):
         """Synchronize the stream."""
         handle_return(cuda.cuStreamSynchronize(self._handle))
 
-    def record(self, event: Event=None, options: EventOptions=None) -> Event:
+    def record(self, event: Event = None, options: EventOptions = None) -> Event:
         """Record an event onto the stream.
 
         Creates an Event object (or reuses the given one) by
@@ -217,12 +212,11 @@ def wait(self, event_or_stream: Union[Event, Stream]):
                     stream = Stream._init(event_or_stream)
                 except Exception as e:
                     raise ValueError(
-                        "only an Event, Stream, or object supporting "
-                        "__cuda_stream__ can be waited") from e
+                        "only an Event, Stream, or object supporting __cuda_stream__ can be waited"
+                    ) from e
             else:
                 stream = event_or_stream
-            event = handle_return(
-                cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING))
+            event = handle_return(cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING))
             handle_return(cuda.cuEventRecord(event, stream.handle))
             discard_event = True
 
@@ -243,11 +237,11 @@ def device(self) -> Device:
 
         """
         from cuda.core.experimental._device import Device  # avoid circular import
+
         if self._device_id is None:
             # Get the stream context first
             if self._ctx_handle is None:
-                self._ctx_handle = handle_return(
-                    cuda.cuStreamGetCtx(self._handle))
+                self._ctx_handle = handle_return(cuda.cuStreamGetCtx(self._handle))
             self._device_id = get_device_from_ctx(self._ctx_handle)
         return Device(self._device_id)
 
@@ -255,8 +249,7 @@ def device(self) -> Device:
     def context(self) -> Context:
         """Return the :obj:`Context` associated with this stream."""
         if self._ctx_handle is None:
-            self._ctx_handle = handle_return(
-                cuda.cuStreamGetCtx(self._handle))
+            self._ctx_handle = handle_return(cuda.cuStreamGetCtx(self._handle))
         if self._device_id is None:
             self._device_id = get_device_from_ctx(self._ctx_handle)
         return Context._from_ctx(self._ctx_handle, self._device_id)
@@ -285,15 +278,16 @@ def from_handle(handle: int) -> Stream:
             Newly created stream object.
 
         """
+
         class _stream_holder:
             @property
             def __cuda_stream__(self):
                 return (0, handle)
+
         return Stream._init(obj=_stream_holder())
 
 
 class _LegacyDefaultStream(Stream):
-
     def __init__(self):
         self._handle = cuda.CUstream(cuda.CU_STREAM_LEGACY)
         self._owner = None
@@ -303,7 +297,6 @@ def __init__(self):
 
 
 class _PerThreadDefaultStream(Stream):
-
     def __init__(self):
         self._handle = cuda.CUstream(cuda.CU_STREAM_PER_THREAD)
         self._owner = None
@@ -327,7 +320,7 @@ def default_stream():
 
     """
     # TODO: flip the default
-    use_ptds = int(os.environ.get('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', 0))
+    use_ptds = int(os.environ.get("CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM", 0))
     if use_ptds:
         return PER_THREAD_DEFAULT_STREAM
     else:
diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py
index 894e21653..9cb47a33e 100644
--- a/cuda_core/cuda/core/experimental/_utils.py
+++ b/cuda_core/cuda/core/experimental/_utils.py
@@ -2,17 +2,19 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-from collections import namedtuple
 import functools
+from collections import namedtuple
 from typing import Callable, Dict
 
 from cuda import cuda, cudart, nvrtc
 
 
-class CUDAError(Exception): pass
+class CUDAError(Exception):
+    pass
 
 
-class NVRTCError(CUDAError): pass
+class NVRTCError(CUDAError):
+    pass
 
 
 ComputeCapability = namedtuple("ComputeCapability", ("major", "minor"))
@@ -50,7 +52,7 @@ def _check_error(error, handle=None):
             err += f", compilation log:\n\n{log.decode()}"
         raise NVRTCError(err)
     else:
-        raise RuntimeError('Unknown error type: {}'.format(error))
+        raise RuntimeError(f"Unknown error type: {error}")
 
 
 def handle_return(result, handle=None):
@@ -76,9 +78,11 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals
         options = cls(**options)
 
     if not isinstance(options, cls):
-        raise TypeError(f"The {options_description} must be provided as an object "
-                        f"of type {cls.__name__} or as a dict with valid {options_description}. "
-                        f"The provided object is '{options}'.")
+        raise TypeError(
+            f"The {options_description} must be provided as an object "
+            f"of type {cls.__name__} or as a dict with valid {options_description}. "
+            f"The provided object is '{options}'."
+        )
 
     return options
 
@@ -88,17 +92,19 @@ def precondition(checker: Callable[..., None], what: str = "") -> Callable:
     A decorator that adds checks to ensure any preconditions are met.
 
     Args:
-        checker: The function to call to check whether the preconditions are met. It has the same signature as the wrapped
-            function with the addition of the keyword argument `what`.
+        checker: The function to call to check whether the preconditions are met. It has
+            the same signature as the wrapped function with the addition of the keyword argument `what`.
         what: A string that is passed in to `checker` to provide context information.
 
     Returns:
         Callable: A decorator that creates the wrapping.
     """
+
     def outer(wrapped_function):
         """
         A decorator that actually wraps the function for checking preconditions.
         """
+
         @functools.wraps(wrapped_function)
         def inner(*args, **kwargs):
             """
@@ -116,17 +122,15 @@ def inner(*args, **kwargs):
 
 def get_device_from_ctx(ctx_handle) -> int:
     """Get device ID from the given ctx."""
-    from cuda.core.experimental._device import Device # avoid circular import
+    from cuda.core.experimental._device import Device  # avoid circular import
+
     prev_ctx = Device().context._handle
-    if int(ctx_handle) != int(prev_ctx):
-        switch_context = True
-    else:
-        switch_context = False
+    switch_context = int(ctx_handle) != int(prev_ctx)
     if switch_context:
         assert prev_ctx == handle_return(cuda.cuCtxPopCurrent())
         handle_return(cuda.cuCtxPushCurrent(ctx_handle))
     device_id = int(handle_return(cuda.cuCtxGetDevice()))
     if switch_context:
-        assert ctx_handle ==  handle_return(cuda.cuCtxPopCurrent())
+        assert ctx_handle == handle_return(cuda.cuCtxPopCurrent())
         handle_return(cuda.cuCtxPushCurrent(prev_ctx))
     return device_id
diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/experimental/utils.py
index 74f41e4d3..0717b41aa 100644
--- a/cuda_core/cuda/core/experimental/utils.py
+++ b/cuda_core/cuda/core/experimental/utils.py
@@ -2,4 +2,3 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-from cuda.core.experimental._memoryview import StridedMemoryView, viewable
diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py
index 5b28d331f..ce37b3aa3 100644
--- a/cuda_core/docs/source/conf.py
+++ b/cuda_core/docs/source/conf.py
@@ -10,15 +10,16 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 import os
+
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))
 
 
 # -- Project information -----------------------------------------------------
 
-project = 'cuda.core'
-copyright = '2024, NVIDIA'
-author = 'NVIDIA'
+project = "cuda.core"
+copyright = "2024, NVIDIA"
+author = "NVIDIA"
 
 # The full version, including alpha/beta/rc tags
 release = os.environ["SPHINX_CUDA_CORE_VER"]
@@ -30,16 +31,16 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-	'sphinx.ext.autodoc',
-    'sphinx.ext.autosummary',
-	'sphinx.ext.napoleon',
-	'myst_nb',
-	'enum_tools.autoenum',
-    'sphinx_copybutton',
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.napoleon",
+    "myst_nb",
+    "enum_tools.autoenum",
+    "sphinx_copybutton",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -50,34 +51,34 @@
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_baseurl = 'docs'
-html_theme = 'furo'
-#html_theme = 'pydata_sphinx_theme'
+html_baseurl = "docs"
+html_theme = "furo"
+# html_theme = 'pydata_sphinx_theme'
 html_theme_options = {
     "light_logo": "logo-light-mode.png",
     "dark_logo": "logo-dark-mode.png",
     # For pydata_sphinx_theme:
-    #"logo": {
-	#    "image_light": "_static/logo-light-mode.png",
+    # "logo": {
+    #    "image_light": "_static/logo-light-mode.png",
     #    "image_dark": "_static/logo-dark-mode.png",
-    #},
-    #"switcher": {
+    # },
+    # "switcher": {
     #    "json_url": "https://nvidia.github.io/cuda-python/cuda-core/versions.json",
     #    "version_match": release,
-    #},
+    # },
     ## Add light/dark mode and documentation version switcher
-    #"navbar_end": [
+    # "navbar_end": [
     #    "search-button",
     #    "theme-switcher",
     #    "version-switcher",
     #    "navbar-icon-links",
-    #],
+    # ],
 }
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # skip cmdline prompts
-copybutton_exclude = '.linenos, .gp'
+copybutton_exclude = ".linenos, .gp"
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
index 37ad49330..8caa4d4a5 100644
--- a/cuda_core/examples/saxpy.py
+++ b/cuda_core/examples/saxpy.py
@@ -4,12 +4,9 @@
 
 import sys
 
-from cuda.core.experimental import Device
-from cuda.core.experimental import LaunchConfig, launch
-from cuda.core.experimental import Program
-
 import cupy as cp
 
+from cuda.core.experimental import Device, LaunchConfig, Program, launch
 
 # compute out = a * x + y
 code = """
@@ -35,9 +32,13 @@
 prog = Program(code, code_type="c++")
 mod = prog.compile(
     "cubin",
-    options=("-std=c++11", "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),),
+    options=(
+        "-std=c++11",
+        "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),
+    ),
     logs=sys.stdout,
-    name_expressions=("saxpy<float>", "saxpy<double>"))
+    name_expressions=("saxpy<float>", "saxpy<double>"),
+)
 
 # run in single precision
 ker = mod.get_kernel("saxpy<float>")
@@ -62,7 +63,7 @@
 s.sync()
 
 # check result
-assert cp.allclose(out, a*x+y)
+assert cp.allclose(out, a * x + y)
 
 # let's repeat again, this time allocates our own out buffer instead of cupy's
 # run in double precision
@@ -77,8 +78,10 @@
 dev.sync()
 
 # prepare output
-buf = dev.allocate(size * 8,  # = dtype.itemsize
-                   stream=s)
+buf = dev.allocate(
+    size * 8,  # = dtype.itemsize
+    stream=s,
+)
 
 # prepare launch
 block = 64
@@ -92,9 +95,10 @@
 
 # check result
 # we wrap output buffer as a cupy array for simplicity
-out = cp.ndarray(size, dtype=dtype,
-                 memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0))
-assert cp.allclose(out, a*x+y)
+out = cp.ndarray(
+    size, dtype=dtype, memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0)
+)
+assert cp.allclose(out, a * x + y)
 
 # clean up resources that we allocate
 # cupy cleans up automatically the rest
diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py
index baee409aa..550eaf2a2 100644
--- a/cuda_core/examples/vector_add.py
+++ b/cuda_core/examples/vector_add.py
@@ -2,12 +2,9 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-from cuda.core.experimental import Device
-from cuda.core.experimental import LaunchConfig, launch
-from cuda.core.experimental import Program
-
 import cupy as cp
 
+from cuda.core.experimental import Device, LaunchConfig, Program, launch
 
 # compute c = a + b
 code = """
@@ -32,8 +29,12 @@
 prog = Program(code, code_type="c++")
 mod = prog.compile(
     "cubin",
-    options=("-std=c++17", "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),),
-    name_expressions=("vector_add<float>",))
+    options=(
+        "-std=c++17",
+        "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),
+    ),
+    name_expressions=("vector_add<float>",),
+)
 
 # run in single precision
 ker = mod.get_kernel("vector_add<float>")
@@ -58,5 +59,5 @@
 s.sync()
 
 # check result
-assert cp.allclose(c, a+b)
+assert cp.allclose(c, a + b)
 print("done!")
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index 35d1c42a1..6993825f3 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -55,3 +55,43 @@ include = ["cuda.core*"]
 [tool.setuptools.dynamic]
 version = { attr = "cuda.core._version.__version__" }
 readme = { file = ["README.md"], content-type = "text/markdown" }
+
+[tool.ruff]
+line-length = 120
+
+[tool.ruff.format]
+docstring-code-format = true
+
+exclude = ["cuda/core/_version.py"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle Error
+    "E",
+    # Pyflakes
+    "F",
+    # pycodestyle Warning
+    "W",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+]
+
+ignore = [
+    "UP006",
+    "UP007",
+    "E741", # ambiguous variable name such as I
+    "B007", # rename unused loop variable to _name
+    "UP035" # UP006, UP007, UP035 flag deprecated typing.<type> usage, but disregard backward compatibility with older Python versions
+]
+
+exclude = ["cuda/core/_version.py"]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["F401"]
+"setup.py" = ["F401"]
diff --git a/cuda_core/setup.py b/cuda_core/setup.py
index 8d20f2c94..029a19317 100644
--- a/cuda_core/setup.py
+++ b/cuda_core/setup.py
@@ -5,10 +5,9 @@
 import os
 
 from Cython.Build import cythonize
-from setuptools import setup, Extension
+from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext as _build_ext
 
-
 ext_modules = (
     Extension(
         "cuda.core.experimental._dlpack",
@@ -29,16 +28,15 @@
 
 
 class build_ext(_build_ext):
-
     def build_extensions(self):
         self.parallel = os.cpu_count() // 2
         super().build_extensions()
 
 
 setup(
-    ext_modules=cythonize(ext_modules,
-        verbose=True, language_level=3,
-        compiler_directives={'embedsignature': True}),
-    cmdclass = {'build_ext': build_ext,},
+    ext_modules=cythonize(ext_modules, verbose=True, language_level=3, compiler_directives={"embedsignature": True}),
+    cmdclass={
+        "build_ext": build_ext,
+    },
     zip_safe=False,
 )
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 3c7eccd0c..bb99fb33c 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -10,11 +10,12 @@
 except ImportError:
     from cuda import cuda as driver
 
-from cuda.core.experimental import Device
-from cuda.core.experimental import _device
-from cuda.core.experimental._utils import handle_return
 import pytest
 
+from cuda.core.experimental import Device, _device
+from cuda.core.experimental._utils import handle_return
+
+
 @pytest.fixture(scope="function")
 def init_cuda():
     device = Device()
@@ -22,12 +23,14 @@ def init_cuda():
     yield
     _device_unset_current()
 
+
 def _device_unset_current():
     handle_return(driver.cuCtxPopCurrent())
     with _device._tls_lock:
         del _device._tls.devices
 
+
 @pytest.fixture(scope="function")
 def deinit_cuda():
     yield
-    _device_unset_current()
\ No newline at end of file
+    _device_unset_current()
diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py
index c02ea43fc..9b94ecd3e 100644
--- a/cuda_core/tests/example_tests/test_basic_examples.py
+++ b/cuda_core/tests/example_tests/test_basic_examples.py
@@ -8,18 +8,18 @@
 
 # If we have subcategories of examples in the future, this file can be split along those lines
 
-from .utils import run_example
-import os
 import glob
+import os
+
 import pytest
 
-samples_path = os.path.join(
-    os.path.dirname(__file__), '..', '..', 'examples')
-sample_files = glob.glob(samples_path+'**/*.py', recursive=True)
-@pytest.mark.parametrize(
-    'example', sample_files
-)
+from .utils import run_example
+
+samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "examples")
+sample_files = glob.glob(samples_path + "**/*.py", recursive=True)
+
+
+@pytest.mark.parametrize("example", sample_files)
 class TestExamples:
     def test_example(self, example, deinit_cuda):
-        filename = os.path.basename(example)
         run_example(samples_path, example)
diff --git a/cuda_core/tests/example_tests/utils.py b/cuda_core/tests/example_tests/utils.py
index 23a3018ce..f6ac3e15d 100644
--- a/cuda_core/tests/example_tests/utils.py
+++ b/cuda_core/tests/example_tests/utils.py
@@ -9,16 +9,19 @@
 import gc
 import os
 import sys
-import pytest
+
 import cupy as cp
+import pytest
+
 
 class SampleTestError(Exception):
     pass
 
+
 def parse_python_script(filepath):
-    if not filepath.endswith('.py'):
+    if not filepath.endswith(".py"):
         raise ValueError(f"{filepath} not supported")
-    with open(filepath, "r", encoding='utf-8') as f:
+    with open(filepath, encoding="utf-8") as f:
         script = f.read()
     return script
 
@@ -34,17 +37,17 @@ def run_example(samples_path, filename, env=None):
         exec(script, env if env else {})
     except ImportError as e:
         # for samples requiring any of optional dependencies
-        for m in ('cupy',):
+        for m in ("cupy",):
             if f"No module named '{m}'" in str(e):
-                pytest.skip(f'{m} not installed, skipping related tests')
+                pytest.skip(f"{m} not installed, skipping related tests")
                 break
         else:
             raise
     except Exception as e:
-            msg = "\n"
-            msg += f'Got error ({filename}):\n'
-            msg += str(e)
-            raise SampleTestError(msg) from e
+        msg = "\n"
+        msg += f"Got error ({filename}):\n"
+        msg += str(e)
+        raise SampleTestError(msg) from e
     finally:
         sys.path = old_sys_path
         sys.argv = old_argv
diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py
index c809bfb3f..afc3ed5b6 100644
--- a/cuda_core/tests/test_device.py
+++ b/cuda_core/tests/test_device.py
@@ -13,16 +13,19 @@
     from cuda import cudart as runtime
 
 from cuda.core.experimental import Device
-from cuda.core.experimental._utils import handle_return, ComputeCapability
+from cuda.core.experimental._utils import ComputeCapability, handle_return
+
 
 def test_device_set_current(deinit_cuda):
     device = Device()
     device.set_current()
     assert handle_return(driver.cuCtxGetCurrent()) is not None
-    
+
+
 def test_device_repr():
     device = Device(0)
-    assert str(device).startswith('<Device 0')
+    assert str(device).startswith("<Device 0")
+
 
 def test_device_alloc(init_cuda):
     device = Device()
@@ -32,17 +35,20 @@ def test_device_alloc(init_cuda):
     assert buffer.size == 1024
     assert buffer.device_id == 0
 
+
 def test_device_create_stream(init_cuda):
     device = Device()
     stream = device.create_stream()
     assert stream is not None
     assert stream.handle
 
+
 def test_pci_bus_id():
     device = Device()
     bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, device.device_id))
     assert device.pci_bus_id == bus_id[:12].decode()
 
+
 def test_uuid():
     device = Device()
     driver_ver = handle_return(driver.cuDriverGetVersion())
@@ -54,18 +60,21 @@ def test_uuid():
     expected_uuid = f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}"
     assert device.uuid == expected_uuid
 
+
 def test_name():
     device = Device()
     name = handle_return(driver.cuDeviceGetName(128, device.device_id))
-    name = name.split(b'\0')[0]
+    name = name.split(b"\0")[0]
     assert device.name == name.decode()
 
+
 def test_compute_capability():
     device = Device()
-    major = handle_return(runtime.cudaDeviceGetAttribute(
-        runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device.device_id))
-    minor = handle_return(runtime.cudaDeviceGetAttribute(
-        runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device.device_id))
+    major = handle_return(
+        runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device.device_id)
+    )
+    minor = handle_return(
+        runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device.device_id)
+    )
     expected_cc = ComputeCapability(major, minor)
     assert device.compute_capability == expected_cc
- 
\ No newline at end of file
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
index 42d1ef95a..21548078f 100644
--- a/cuda_core/tests/test_event.py
+++ b/cuda_core/tests/test_event.py
@@ -6,38 +6,41 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 
-from cuda.core.experimental import Device, EventOptions
 import pytest
 
+from cuda.core.experimental import Device, EventOptions
+
+
 @pytest.mark.parametrize("enable_timing", [True, False, None])
 def test_timing(init_cuda, enable_timing):
     options = EventOptions(enable_timing=enable_timing)
     stream = Device().create_stream()
     event = stream.record(options=options)
     assert event.is_timing_disabled == (not enable_timing if enable_timing is not None else True)
-    
+
 
 def test_is_sync_busy_waited(init_cuda):
     options = EventOptions(enable_timing=False, busy_waited_sync=True)
     stream = Device().create_stream()
     event = stream.record(options=options)
-    assert event.is_sync_busy_waited == True
+    assert event.is_sync_busy_waited is True
 
     options = EventOptions(enable_timing=False)
     stream = Device().create_stream()
     event = stream.record(options=options)
-    assert event.is_sync_busy_waited == False
+    assert event.is_sync_busy_waited is False
+
 
 def test_sync(init_cuda):
     options = EventOptions(enable_timing=False)
     stream = Device().create_stream()
     event = stream.record(options=options)
     event.sync()
-    assert event.is_done == True
+    assert event.is_done is True
+
 
 def test_is_done(init_cuda):
     options = EventOptions(enable_timing=False)
     stream = Device().create_stream()
     event = stream.record(options=options)
-    assert event.is_done == True
-
+    assert event.is_done is True
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index 796050a82..874d7f07b 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -6,9 +6,11 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 
-from cuda.core.experimental import Device, Stream, LaunchConfig
 import pytest
 
+from cuda.core.experimental import Device, LaunchConfig, Stream
+
+
 def test_launch_config_init(init_cuda):
     config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=0)
     assert config.grid == (1, 1, 1)
@@ -22,6 +24,7 @@ def test_launch_config_init(init_cuda):
     assert isinstance(config.stream, Stream)
     assert config.shmem_size == 1024
 
+
 def test_launch_config_cast_to_3_tuple():
     config = LaunchConfig(grid=1, block=1)
     assert config._cast_to_3_tuple(1) == (1, 1, 1)
@@ -33,6 +36,7 @@ def test_launch_config_cast_to_3_tuple():
     assert config._cast_to_3_tuple((999, 888)) == (999, 888, 1)
     assert config._cast_to_3_tuple((999, 888, 777)) == (999, 888, 777)
 
+
 def test_launch_config_invalid_values():
     with pytest.raises(ValueError):
         LaunchConfig(grid=0, block=1)
@@ -46,6 +50,7 @@ def test_launch_config_invalid_values():
     with pytest.raises(ValueError):
         LaunchConfig(grid=(1, 1, 1), block=(0, 1))
 
+
 def test_launch_config_stream(init_cuda):
     stream = Device().create_stream()
     config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=stream, shmem_size=0)
@@ -54,6 +59,7 @@ def test_launch_config_stream(init_cuda):
     with pytest.raises(ValueError):
         LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream="invalid_stream", shmem_size=0)
 
+
 def test_launch_config_shmem_size():
     config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=2048)
     assert config.shmem_size == 2048
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 74c50b7ae..c78b56733 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -11,10 +11,12 @@
 except ImportError:
     from cuda import cuda as driver
 
+import ctypes
+
 from cuda.core.experimental import Device
 from cuda.core.experimental._memory import Buffer, MemoryResource
 from cuda.core.experimental._utils import handle_return
-import ctypes
+
 
 class DummyDeviceMemoryResource(MemoryResource):
     def __init__(self, device):
@@ -38,7 +40,8 @@ def is_host_accessible(self) -> bool:
     @property
     def device_id(self) -> int:
         return 0
- 
+
+
 class DummyHostMemoryResource(MemoryResource):
     def __init__(self):
         pass
@@ -49,7 +52,7 @@ def allocate(self, size, stream=None) -> Buffer:
         return Buffer(ptr=ptr, size=size, mr=self)
 
     def deallocate(self, ptr, size, stream=None):
-        #the memory is deallocated per the ctypes deallocation at garbage collection time
+        # Intentionally a no-op: ctypes frees the memory when the allocation is garbage collected
         pass
 
     @property
@@ -64,6 +67,7 @@ def is_host_accessible(self) -> bool:
     def device_id(self) -> int:
         raise RuntimeError("the pinned memory resource is not bound to any GPU")
 
+
 class DummyUnifiedMemoryResource(MemoryResource):
     def __init__(self, device):
         self.device = device
@@ -87,6 +91,7 @@ def is_host_accessible(self) -> bool:
     def device_id(self) -> int:
         return 0
 
+
 class DummyPinnedMemoryResource(MemoryResource):
     def __init__(self, device):
         self.device = device
@@ -110,7 +115,8 @@ def is_host_accessible(self) -> bool:
     def device_id(self) -> int:
         raise RuntimeError("the pinned memory resource is not bound to any GPU")
 
-def buffer_initialization(dummy_mr : MemoryResource):
+
+def buffer_initialization(dummy_mr: MemoryResource):
     buffer = dummy_mr.allocate(size=1024)
     assert buffer.handle != 0
     assert buffer.size == 1024
@@ -119,6 +125,7 @@ def buffer_initialization(dummy_mr : MemoryResource):
     assert buffer.is_host_accessible == dummy_mr.is_host_accessible
     buffer.close()
 
+
 def test_buffer_initialization():
     device = Device()
     device.set_current()
@@ -126,8 +133,9 @@ def test_buffer_initialization():
     buffer_initialization(DummyHostMemoryResource())
     buffer_initialization(DummyUnifiedMemoryResource(device))
     buffer_initialization(DummyPinnedMemoryResource(device))
-    
-def buffer_copy_to(dummy_mr : MemoryResource, device : Device, check = False):
+
+
+def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False):
     src_buffer = dummy_mr.allocate(size=1024)
     dst_buffer = dummy_mr.allocate(size=1024)
     stream = device.create_stream()
@@ -136,27 +144,29 @@ def buffer_copy_to(dummy_mr : MemoryResource, device : Device, check = False):
         src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
         for i in range(1024):
             src_ptr[i] = ctypes.c_byte(i)
-        
+
     src_buffer.copy_to(dst_buffer, stream=stream)
     device.sync()
 
     if check:
         dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte))
-        
+
         for i in range(10):
             assert dst_ptr[i] == src_ptr[i]
 
     dst_buffer.close()
     src_buffer.close()
 
+
 def test_buffer_copy_to():
     device = Device()
     device.set_current()
     buffer_copy_to(DummyDeviceMemoryResource(device), device)
     buffer_copy_to(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_to(DummyPinnedMemoryResource(device), device, check = True)
+    buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True)
+
 
-def buffer_copy_from(dummy_mr : MemoryResource, device, check = False):
+def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
     src_buffer = dummy_mr.allocate(size=1024)
     dst_buffer = dummy_mr.allocate(size=1024)
     stream = device.create_stream()
@@ -165,31 +175,34 @@ def buffer_copy_from(dummy_mr : MemoryResource, device, check = False):
         src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
         for i in range(1024):
             src_ptr[i] = ctypes.c_byte(i)
-        
+
     dst_buffer.copy_from(src_buffer, stream=stream)
     device.sync()
 
     if check:
         dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte))
-        
+
         for i in range(10):
             assert dst_ptr[i] == src_ptr[i]
 
     dst_buffer.close()
     src_buffer.close()
 
+
 def test_buffer_copy_from():
     device = Device()
     device.set_current()
     buffer_copy_from(DummyDeviceMemoryResource(device), device)
     buffer_copy_from(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_from(DummyPinnedMemoryResource(device), device, check = True)
+    buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True)
 
-def buffer_close(dummy_mr : MemoryResource):
+
+def buffer_close(dummy_mr: MemoryResource):
     buffer = dummy_mr.allocate(size=1024)
     buffer.close()
     assert buffer.handle == 0
-    assert buffer.memory_resource == None
+    assert buffer.memory_resource is None
+
 
 def test_buffer_close():
     device = Device()
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 832963777..5f0b6056d 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -6,11 +6,17 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 
-from cuda.core.experimental._module import ObjectCode
-import pytest
 import importlib
 
-@pytest.mark.skipif(int(importlib.metadata.version("cuda-python").split(".")[0]) < 12, reason='Module loading for older drivers validate require valid module code.')
+import pytest
+
+from cuda.core.experimental._module import ObjectCode
+
+
+@pytest.mark.skipif(
+    int(importlib.metadata.version("cuda-python").split(".")[0]) < 12,
+    reason="Module loading for older drivers validate require valid module code.",
+)
 def test_object_code_initialization():
     # Test with supported code types
     for code_type in ["cubin", "ptx", "fatbin"]:
@@ -24,15 +30,19 @@ def test_object_code_initialization():
     with pytest.raises(ValueError):
         ObjectCode(b"dummy_data", "unsupported_code_type")
 
-#TODO add ObjectCode tests which provide the appropriate data for cuLibraryLoadFromFile
+
+# TODO: add ObjectCode tests that provide the appropriate data for cuLibraryLoadFromFile
 def test_object_code_initialization_with_str():
     assert True
 
+
 def test_object_code_initialization_with_jit_options():
     assert True
 
+
 def test_object_code_get_kernel():
     assert True
 
+
 def test_kernel_from_obj():
     assert True
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index caa7369eb..af94a7ba0 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -6,52 +6,61 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 
-from cuda.core.experimental import Program
-from cuda.core.experimental._module import ObjectCode, Kernel
 import pytest
 
+from cuda.core.experimental import Program
+from cuda.core.experimental._module import Kernel, ObjectCode
+
+
 def test_program_init_valid_code_type():
-    code = "extern \"C\" __global__ void my_kernel() {}"
+    code = 'extern "C" __global__ void my_kernel() {}'
     program = Program(code, "c++")
     assert program.backend == "nvrtc"
     assert program.handle is not None
 
+
 def test_program_init_invalid_code_type():
-    code = "extern \"C\" __global__ void my_kernel() {}"
+    code = 'extern "C" __global__ void my_kernel() {}'
     with pytest.raises(NotImplementedError):
         Program(code, "python")
 
+
 def test_program_init_invalid_code_format():
     code = 12345
     with pytest.raises(TypeError):
         Program(code, "c++")
 
+
 def test_program_compile_valid_target_type():
-    code = "extern \"C\" __global__ void my_kernel() {}"
+    code = 'extern "C" __global__ void my_kernel() {}'
     program = Program(code, "c++")
     object_code = program.compile("ptx")
     kernel = object_code.get_kernel("my_kernel")
     assert isinstance(object_code, ObjectCode)
     assert isinstance(kernel, Kernel)
 
+
 def test_program_compile_invalid_target_type():
-    code = "extern \"C\" __global__ void my_kernel() {}"
+    code = 'extern "C" __global__ void my_kernel() {}'
     program = Program(code, "c++")
     with pytest.raises(NotImplementedError):
         program.compile("invalid_target")
 
+
 def test_program_backend_property():
-    code = "extern \"C\" __global__ void my_kernel() {}"
+    code = 'extern "C" __global__ void my_kernel() {}'
     program = Program(code, "c++")
     assert program.backend == "nvrtc"
 
+
 def test_program_handle_property():
-    code = "extern \"C\" __global__ void my_kernel() {}"
+    code = 'extern "C" __global__ void my_kernel() {}'
     program = Program(code, "c++")
     assert program.handle is not None
 
+
 def test_program_close():
-    code = "extern \"C\" __global__ void my_kernel() {}"
+    code = 'extern "C" __global__ void my_kernel() {}'
     program = Program(code, "c++")
     program.close()
     assert program.handle is None
diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py
index faf0ad7d8..03cdd8526 100644
--- a/cuda_core/tests/test_stream.py
+++ b/cuda_core/tests/test_stream.py
@@ -6,28 +6,34 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 
+import pytest
+
 from cuda.core.experimental import Device, Stream, StreamOptions
-from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream
 from cuda.core.experimental._event import Event
-import pytest
+from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream
+
 
 def test_stream_init():
     with pytest.raises(NotImplementedError):
         Stream()
 
+
 def test_stream_init_with_options(init_cuda):
     stream = Device().create_stream(options=StreamOptions(nonblocking=True, priority=0))
     assert stream.is_nonblocking is True
     assert stream.priority == 0
 
+
 def test_stream_handle(init_cuda):
     stream = Device().create_stream(options=StreamOptions())
     assert isinstance(stream.handle, int)
 
+
 def test_stream_is_nonblocking(init_cuda):
     stream = Device().create_stream(options=StreamOptions(nonblocking=True))
     assert stream.is_nonblocking is True
 
+
 def test_stream_priority(init_cuda):
     stream = Device().create_stream(options=StreamOptions(priority=0))
     assert stream.priority == 0
@@ -36,20 +42,24 @@ def test_stream_priority(init_cuda):
     with pytest.raises(ValueError):
         stream = Device().create_stream(options=StreamOptions(priority=1))
 
+
 def test_stream_sync(init_cuda):
     stream = Device().create_stream(options=StreamOptions())
     stream.sync()  # Should not raise any exceptions
 
+
 def test_stream_record(init_cuda):
     stream = Device().create_stream(options=StreamOptions())
     event = stream.record()
     assert isinstance(event, Event)
 
+
 def test_stream_record_invalid_event(init_cuda):
     stream = Device().create_stream(options=StreamOptions())
     with pytest.raises(TypeError):
         stream.record(event="invalid_event")
 
+
 def test_stream_wait_event(init_cuda):
     s1 = Device().create_stream()
     s2 = Device().create_stream()
@@ -57,21 +67,25 @@ def test_stream_wait_event(init_cuda):
     s2.wait(e1)  # Should not raise any exceptions
     s2.sync()
 
+
 def test_stream_wait_invalid_event(init_cuda):
     stream = Device().create_stream(options=StreamOptions())
     with pytest.raises(ValueError):
         stream.wait(event_or_stream="invalid_event")
 
+
 def test_stream_device(init_cuda):
     stream = Device().create_stream(options=StreamOptions())
     device = stream.device
     assert isinstance(device, Device)
 
+
 def test_stream_context(init_cuda):
     stream = Device().create_stream(options=StreamOptions())
     context = stream.context
     assert context is not None
 
+
 def test_stream_from_foreign_stream(init_cuda):
     device = Device()
     other_stream = device.create_stream(options=StreamOptions())
@@ -81,17 +95,21 @@ def test_stream_from_foreign_stream(init_cuda):
     assert isinstance(device, Device)
     context = stream.context
     assert context is not None
-    
+
+
 def test_stream_from_handle():
     stream = Stream.from_handle(0)
     assert isinstance(stream, Stream)
 
+
 def test_legacy_default_stream():
     assert isinstance(LEGACY_DEFAULT_STREAM, Stream)
 
+
 def test_per_thread_default_stream():
     assert isinstance(PER_THREAD_DEFAULT_STREAM, Stream)
 
+
 def test_default_stream():
     stream = default_stream()
     assert isinstance(stream, Stream)
diff --git a/cuda_python/docs/source/conf.py b/cuda_python/docs/source/conf.py
index 8a5ab87f9..ab00c2203 100644
--- a/cuda_python/docs/source/conf.py
+++ b/cuda_python/docs/source/conf.py
@@ -16,9 +16,9 @@
 
 # -- Project information -----------------------------------------------------
 
-project = 'CUDA Python'
-copyright = '2021-2024, NVIDIA'
-author = 'NVIDIA'
+project = "CUDA Python"
+copyright = "2021-2024, NVIDIA"
+author = "NVIDIA"
 
 # The full version, including alpha/beta/rc tags
 release = os.environ["SPHINX_CUDA_PYTHON_VER"]
@@ -30,14 +30,14 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-	'sphinx.ext.autodoc',
-	'sphinx.ext.napoleon',
-	'myst_nb',
-	'enum_tools.autoenum'
+    "sphinx.ext.autodoc",
+    "sphinx.ext.napoleon",
+    "myst_nb",
+    "enum_tools.autoenum",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -48,40 +48,41 @@
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_baseurl = 'docs'
-html_theme = 'furo'
-#html_theme = 'pydata_sphinx_theme'
+html_baseurl = "docs"
+html_theme = "furo"
+# html_theme = 'pydata_sphinx_theme'
 html_theme_options = {
     "light_logo": "logo-light-mode.png",
     "dark_logo": "logo-dark-mode.png",
     # For pydata_sphinx_theme:
-    #"logo": {
-	#    "image_light": "_static/logo-light-mode.png",
+    # "logo": {
+    #    "image_light": "_static/logo-light-mode.png",
     #    "image_dark": "_static/logo-dark-mode.png",
-    #},
-    #"switcher": {
+    # },
+    # "switcher": {
     #    "json_url": "https://nvidia.github.io/cuda-python/cuda-core/versions.json",
     #    "version_match": release,
-    #},
+    # },
     ## Add light/dark mode and documentation version switcher
-    #"navbar_end": [
+    # "navbar_end": [
     #    "search-button",
     #    "theme-switcher",
     #    "version-switcher",
     #    "navbar-icon-links",
-    #],
+    # ],
 }
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # Allow overwriting CUDA Python's domain name for local development. See:
 #   - https://stackoverflow.com/a/61694897/2344149
 #   - https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-rst_epilog
-CUDA_PYTHON_DOMAIN = os.environ.get('CUDA_PYTHON_DOMAIN',
-                                    'https://nvidia.github.io/cuda-python')
+CUDA_PYTHON_DOMAIN = os.environ.get(
+    "CUDA_PYTHON_DOMAIN", "https://nvidia.github.io/cuda-python"
+)
 rst_epilog = f"""
 .. _cuda.core: {CUDA_PYTHON_DOMAIN}/cuda-core/latest
 .. _cuda.bindings: {CUDA_PYTHON_DOMAIN}/cuda-bindings/latest