diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..431bb7c50 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.4 + hooks: + - id: ruff + args: [--fix, --show-fixes] + - id: ruff-format + +default_language_version: + python: python3 diff --git a/cuda_bindings/benchmarks/kernels.py b/cuda_bindings/benchmarks/kernels.py index d31cc58a3..259c6e3ca 100644 --- a/cuda_bindings/benchmarks/kernels.py +++ b/cuda_bindings/benchmarks/kernels.py @@ -5,9 +5,9 @@ # this software. Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. -kernel_string = '''\ +kernel_string = """\ #define ITEM_PARAM(x, T) T x -#define REP1(x, T) , ITEM_PARAM(x, T) +#define REP1(x, T) , ITEM_PARAM(x, T) #define REP2(x, T) REP1(x##0, T) REP1(x##1, T) #define REP4(x, T) REP2(x##0, T) REP2(x##1, T) #define REP8(x, T) REP4(x##0, T) REP4(x##1, T) @@ -160,4 +160,4 @@ // Do not touch param to prevent compiler from copying // the whole structure from const bank to lmem. } -''' +""" diff --git a/cuda_bindings/benchmarks/perf_test_utils.py b/cuda_bindings/benchmarks/perf_test_utils.py index 11f5cc30b..7cfcf9f9a 100644 --- a/cuda_bindings/benchmarks/perf_test_utils.py +++ b/cuda_bindings/benchmarks/perf_test_utils.py @@ -5,27 +5,30 @@ # this software. Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
+import numpy as np import pytest + from cuda import cuda, cudart, nvrtc -import numpy as np + def ASSERT_DRV(err): if isinstance(err, cuda.CUresult): if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError('Cuda Error: {}'.format(err)) + raise RuntimeError(f"Cuda Error: {err}") elif isinstance(err, cudart.cudaError_t): if err != cudart.cudaError_t.cudaSuccess: - raise RuntimeError('Cudart Error: {}'.format(err)) + raise RuntimeError(f"Cudart Error: {err}") elif isinstance(err, nvrtc.nvrtcResult): if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: - raise RuntimeError('Nvrtc Error: {}'.format(err)) + raise RuntimeError(f"Nvrtc Error: {err}") else: - raise RuntimeError('Unknown error type: {}'.format(err)) + raise RuntimeError(f"Unknown error type: {err}") + @pytest.fixture def init_cuda(): # Initialize - err, = cuda.cuInit(0) + (err,) = cuda.cuInit(0) ASSERT_DRV(err) err, device = cuda.cuDeviceGet(0) ASSERT_DRV(err) @@ -38,31 +41,37 @@ def init_cuda(): yield device, ctx, stream - err, = cuda.cuStreamDestroy(stream) + (err,) = cuda.cuStreamDestroy(stream) ASSERT_DRV(err) - err, = cuda.cuCtxDestroy(ctx) + (err,) = cuda.cuCtxDestroy(ctx) ASSERT_DRV(err) + @pytest.fixture def load_module(): module = None + def _load_module(kernel_string, device): nonlocal module # Get module - err, major = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device) + err, major = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device + ) ASSERT_DRV(err) - err, minor = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device) + err, minor = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device + ) ASSERT_DRV(err) - err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b'kernelString.cu', 0, [], []) + err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"kernelString.cu", 0, [], 
[]) ASSERT_DRV(err) - opts = [b'--fmad=false', bytes('--gpu-architecture=sm_' + str(major) + str(minor), 'ascii')] - err, = nvrtc.nvrtcCompileProgram(prog, 2, opts) + opts = [b"--fmad=false", bytes("--gpu-architecture=sm_" + str(major) + str(minor), "ascii")] + (err,) = nvrtc.nvrtcCompileProgram(prog, 2, opts) err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog) ASSERT_DRV(err_log) - log = b' ' * logSize - err_log, = nvrtc.nvrtcGetProgramLog(prog, log) + log = b" " * logSize + (err_log,) = nvrtc.nvrtcGetProgramLog(prog, log) ASSERT_DRV(err_log) result = log.decode() if len(result) > 1: @@ -71,8 +80,8 @@ def _load_module(kernel_string, device): ASSERT_DRV(err) err, cubinSize = nvrtc.nvrtcGetCUBINSize(prog) ASSERT_DRV(err) - cubin = b' ' * cubinSize - err, = nvrtc.nvrtcGetCUBIN(prog, cubin) + cubin = b" " * cubinSize + (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin) ASSERT_DRV(err) cubin = np.char.array(cubin) err, module = cuda.cuModuleLoadData(cubin) @@ -82,5 +91,5 @@ def _load_module(kernel_string, device): yield _load_module - err, = cuda.cuModuleUnload(module) + (err,) = cuda.cuModuleUnload(module) ASSERT_DRV(err) diff --git a/cuda_bindings/benchmarks/test_cupy.py b/cuda_bindings/benchmarks/test_cupy.py index 43aaa5b12..6e847853e 100644 --- a/cuda_bindings/benchmarks/test_cupy.py +++ b/cuda_bindings/benchmarks/test_cupy.py @@ -5,29 +5,33 @@ # this software. Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
-import pytest import ctypes +import pytest + # Always skip since cupy is not CTK 12.x yet skip_tests = True if not skip_tests: try: import cupy + skip_tests = False except ImportError: skip_tests = True from .kernels import kernel_string + def launch(kernel, args=()): kernel((1,), (1,), args) + # Measure launch latency with no parmaeters @pytest.mark.skipif(skip_tests, reason="cupy is not installed") @pytest.mark.benchmark(group="cupy") def test_launch_latency_empty_kernel(benchmark): module = cupy.RawModule(code=kernel_string) - kernel = module.get_function('empty_kernel') + kernel = module.get_function("empty_kernel") stream = cupy.cuda.stream.Stream(non_blocking=True) @@ -35,12 +39,13 @@ def test_launch_latency_empty_kernel(benchmark): benchmark(launch, kernel) stream.synchronize() + # Measure launch latency with a single parameter @pytest.mark.skipif(skip_tests, reason="cupy is not installed") @pytest.mark.benchmark(group="cupy") def test_launch_latency_small_kernel(benchmark): module = cupy.RawModule(code=kernel_string) - kernel = module.get_function('small_kernel') + kernel = module.get_function("small_kernel") cupy.cuda.set_allocator() arg = cupy.cuda.alloc(ctypes.sizeof(ctypes.c_float)) @@ -50,12 +55,13 @@ def test_launch_latency_small_kernel(benchmark): benchmark(launch, kernel, (arg,)) stream.synchronize() + # Measure launch latency with many parameters using builtin parameter packing @pytest.mark.skipif(skip_tests, reason="cupy is not installed") @pytest.mark.benchmark(group="cupy") def test_launch_latency_small_kernel_512_args(benchmark): module = cupy.RawModule(code=kernel_string) - kernel = module.get_function('small_kernel_512_args') + kernel = module.get_function("small_kernel_512_args") cupy.cuda.set_allocator() args = [] @@ -69,12 +75,13 @@ def test_launch_latency_small_kernel_512_args(benchmark): benchmark(launch, kernel, args) stream.synchronize() + # Measure launch latency with many parameters using builtin parameter packing 
@pytest.mark.skipif(skip_tests, reason="cupy is not installed") @pytest.mark.benchmark(group="cupy") def test_launch_latency_small_kernel_512_bools(benchmark): module = cupy.RawModule(code=kernel_string) - kernel = module.get_function('small_kernel_512_bools') + kernel = module.get_function("small_kernel_512_bools") cupy.cuda.set_allocator() args = [True] * 512 @@ -86,12 +93,13 @@ def test_launch_latency_small_kernel_512_bools(benchmark): benchmark(launch, kernel, args) stream.synchronize() + # Measure launch latency with many parameters using builtin parameter packing @pytest.mark.skipif(skip_tests, reason="cupy is not installed") @pytest.mark.benchmark(group="cupy") def test_launch_latency_small_kernel_512_doubles(benchmark): module = cupy.RawModule(code=kernel_string) - kernel = module.get_function('small_kernel_512_doubles') + kernel = module.get_function("small_kernel_512_doubles") cupy.cuda.set_allocator() args = [1.2345] * 512 @@ -103,12 +111,13 @@ def test_launch_latency_small_kernel_512_doubles(benchmark): benchmark(launch, kernel, args) stream.synchronize() + # Measure launch latency with many parameters using builtin parameter packing @pytest.mark.skipif(skip_tests, reason="cupy is not installed") @pytest.mark.benchmark(group="cupy") def test_launch_latency_small_kernel_512_ints(benchmark): module = cupy.RawModule(code=kernel_string) - kernel = module.get_function('small_kernel_512_ints') + kernel = module.get_function("small_kernel_512_ints") cupy.cuda.set_allocator() args = [123] * 512 @@ -120,12 +129,13 @@ def test_launch_latency_small_kernel_512_ints(benchmark): benchmark(launch, kernel, args) stream.synchronize() + # Measure launch latency with many parameters using builtin parameter packing @pytest.mark.skipif(skip_tests, reason="cupy is not installed") @pytest.mark.benchmark(group="cupy") def test_launch_latency_small_kernel_512_bytes(benchmark): module = cupy.RawModule(code=kernel_string) - kernel = module.get_function('small_kernel_512_chars') + 
kernel = module.get_function("small_kernel_512_chars") cupy.cuda.set_allocator() args = [127] * 512 @@ -137,12 +147,13 @@ def test_launch_latency_small_kernel_512_bytes(benchmark): benchmark(launch, kernel, args) stream.synchronize() + # Measure launch latency with many parameters using builtin parameter packing @pytest.mark.skipif(skip_tests, reason="cupy is not installed") @pytest.mark.benchmark(group="cupy") def test_launch_latency_small_kernel_512_longlongs(benchmark): module = cupy.RawModule(code=kernel_string) - kernel = module.get_function('small_kernel_512_longlongs') + kernel = module.get_function("small_kernel_512_longlongs") cupy.cuda.set_allocator() args = [9223372036854775806] * 512 @@ -154,12 +165,13 @@ def test_launch_latency_small_kernel_512_longlongs(benchmark): benchmark(launch, kernel, args) stream.synchronize() + # Measure launch latency with many parameters using builtin parameter packing @pytest.mark.skipif(skip_tests, reason="cupy is not installed") @pytest.mark.benchmark(group="cupy") def test_launch_latency_small_kernel_256_args(benchmark): module = cupy.RawModule(code=kernel_string) - kernel = module.get_function('small_kernel_256_args') + kernel = module.get_function("small_kernel_256_args") cupy.cuda.set_allocator() args = [] @@ -173,12 +185,13 @@ def test_launch_latency_small_kernel_256_args(benchmark): benchmark(launch, kernel, args) stream.synchronize() + # Measure launch latency with many parameters using builtin parameter packing @pytest.mark.skipif(skip_tests, reason="cupy is not installed") @pytest.mark.benchmark(group="cupy") def test_launch_latency_small_kernel_16_args(benchmark): module = cupy.RawModule(code=kernel_string) - kernel = module.get_function('small_kernel_16_args') + kernel = module.get_function("small_kernel_16_args") cupy.cuda.set_allocator() args = [] diff --git a/cuda_bindings/benchmarks/test_launch_latency.py b/cuda_bindings/benchmarks/test_launch_latency.py index 79668a637..8d70bfe24 100755 --- 
a/cuda_bindings/benchmarks/test_launch_latency.py +++ b/cuda_bindings/benchmarks/test_launch_latency.py @@ -5,26 +5,47 @@ # this software. Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. +import ctypes + import pytest + from cuda import cuda -import ctypes -from .perf_test_utils import ASSERT_DRV, init_cuda, load_module from .kernels import kernel_string +from .perf_test_utils import ASSERT_DRV + def launch(kernel, stream, args=(), arg_types=()): - cuda.cuLaunchKernel(kernel, - 1, 1, 1, # grid dim - 1, 1, 1, # block dim - 0, stream, # shared mem and stream - (args, arg_types), 0) # arguments + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + (args, arg_types), + 0, + ) # arguments + def launch_packed(kernel, stream, params): - cuda.cuLaunchKernel(kernel, - 1, 1, 1, # grid dim - 1, 1, 1, # block dim - 0, stream, # shared mem and stream - params, 0) # arguments + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + params, + 0, + ) # arguments + # Measure launch latency with no parmaeters @pytest.mark.benchmark(group="launch-latency") @@ -32,20 +53,21 @@ def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'empty_kernel') + err, func = cuda.cuModuleGetFunction(module, b"empty_kernel") ASSERT_DRV(err) benchmark(launch, func, stream) cuda.cuCtxSynchronize() + # Measure launch latency with a single parameter @pytest.mark.benchmark(group="launch-latency") def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel') + err, func = 
cuda.cuModuleGetFunction(module, b"small_kernel") ASSERT_DRV(err) err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float)) @@ -55,16 +77,17 @@ def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): cuda.cuCtxSynchronize() - err, = cuda.cuMemFree(f) + (err,) = cuda.cuMemFree(f) ASSERT_DRV(err) + # Measure launch latency with many parameters using builtin parameter packing @pytest.mark.benchmark(group="launch-latency") def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_args') + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") ASSERT_DRV(err) args = [] @@ -82,15 +105,16 @@ def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module) cuda.cuCtxSynchronize() for p in args: - err, = cuda.cuMemFree(p) + (err,) = cuda.cuMemFree(p) ASSERT_DRV(err) + @pytest.mark.benchmark(group="launch-latency") def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_bools') + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools") ASSERT_DRV(err) args = [True] * 512 @@ -103,12 +127,13 @@ def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module cuda.cuCtxSynchronize() + @pytest.mark.benchmark(group="launch-latency") def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_doubles') + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles") ASSERT_DRV(err) args = [1.2345] * 512 @@ -121,12 +146,13 @@ def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, 
load_modu cuda.cuCtxSynchronize() + @pytest.mark.benchmark(group="launch-latency") def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_ints') + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints") ASSERT_DRV(err) args = [123] * 512 @@ -139,12 +165,13 @@ def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module) cuda.cuCtxSynchronize() + @pytest.mark.benchmark(group="launch-latency") def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_chars') + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars") ASSERT_DRV(err) args = [127] * 512 @@ -157,12 +184,13 @@ def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module cuda.cuCtxSynchronize() + @pytest.mark.benchmark(group="launch-latency") def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_longlongs') + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs") ASSERT_DRV(err) args = [9223372036854775806] * 512 @@ -175,13 +203,14 @@ def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_mo cuda.cuCtxSynchronize() + # Measure launch latency with many parameters using builtin parameter packing @pytest.mark.benchmark(group="launch-latency") def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel_256_args') + err, func = 
cuda.cuModuleGetFunction(module, b"small_kernel_256_args") ASSERT_DRV(err) args = [] @@ -199,16 +228,17 @@ def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module) cuda.cuCtxSynchronize() for p in args: - err, = cuda.cuMemFree(p) + (err,) = cuda.cuMemFree(p) ASSERT_DRV(err) + # Measure launch latency with many parameters using builtin parameter packing @pytest.mark.benchmark(group="launch-latency") def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel_16_args') + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") ASSERT_DRV(err) args = [] @@ -226,16 +256,17 @@ def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): cuda.cuCtxSynchronize() for p in args: - err, = cuda.cuMemFree(p) + (err,) = cuda.cuMemFree(p) ASSERT_DRV(err) + # Measure launch latency with many parameters, excluding parameter packing @pytest.mark.benchmark(group="launch-latency") def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_args') + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") ASSERT_DRV(err) vals = [] @@ -255,9 +286,10 @@ def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_ cuda.cuCtxSynchronize() for p in vals: - err, = cuda.cuMemFree(p) + (err,) = cuda.cuMemFree(p) ASSERT_DRV(err) + def pack_and_launch(kernel, stream, params): packed_params = (ctypes.c_void_p * len(params))() ptrs = [0] * len(params) @@ -265,11 +297,8 @@ def pack_and_launch(kernel, stream, params): ptrs[i] = ctypes.c_void_p(int(params[i])) packed_params[i] = ctypes.addressof(ptrs[i]) - cuda.cuLaunchKernel(kernel, - 1, 1, 1, - 1, 1, 1, - 0, stream, - packed_params, 
0) + cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0) + # Measure launch latency plus parameter packing using ctypes @pytest.mark.benchmark(group="launch-latency") @@ -277,7 +306,7 @@ def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, ini device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel_512_args') + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") ASSERT_DRV(err) vals = [] @@ -291,20 +320,21 @@ def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, ini cuda.cuCtxSynchronize() for p in vals: - err, = cuda.cuMemFree(p) + (err,) = cuda.cuMemFree(p) ASSERT_DRV(err) + # Measure launch latency with a single large struct parameter @pytest.mark.benchmark(group="launch-latency") def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module): device, ctx, stream = init_cuda module = load_module(kernel_string, device) - err, func = cuda.cuModuleGetFunction(module, b'small_kernel_2048B') + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B") ASSERT_DRV(err) class struct_2048B(ctypes.Structure): - _fields_ = [('values',ctypes.c_uint8 * 2048)] + _fields_ = [("values", ctypes.c_uint8 * 2048)] benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,)) diff --git a/cuda_bindings/benchmarks/test_numba.py b/cuda_bindings/benchmarks/test_numba.py index f0dd12316..f7a4db7e9 100644 --- a/cuda_bindings/benchmarks/test_numba.py +++ b/cuda_bindings/benchmarks/test_numba.py @@ -5,19 +5,24 @@ # this software. Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
-import pytest import numpy as np +import pytest + try: from numba import cuda + skip_tests = False except ImportError: skip_tests = True + def launch_empty(kernel, stream): - kernel[1,1, stream]() + kernel[1, 1, stream]() + def launch(kernel, stream, arg): - kernel[1,1, stream](arg) + kernel[1, 1, stream](arg) + # Measure launch latency with no parmaeters @pytest.mark.skipif(skip_tests, reason="Numba is not installed") @@ -33,6 +38,7 @@ def empty_kernel(): cuda.synchronize() + # Measure launch latency with a single parameter @pytest.mark.skipif(skip_tests, reason="Numba is not installed") @pytest.mark.benchmark(group="numba", min_rounds=1000) diff --git a/cuda_bindings/benchmarks/test_pointer_attributes.py b/cuda_bindings/benchmarks/test_pointer_attributes.py index 72de39641..f03be1e1e 100644 --- a/cuda_bindings/benchmarks/test_pointer_attributes.py +++ b/cuda_bindings/benchmarks/test_pointer_attributes.py @@ -5,30 +5,35 @@ # this software. Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
+import random + import pytest + from cuda import cuda -import ctypes -import random -from .perf_test_utils import ASSERT_DRV, init_cuda +from .perf_test_utils import ASSERT_DRV random.seed(0) idx = 0 + + def query_attribute(attribute, ptrs): global idx ptr = ptrs[idx] - idx = (idx + 1 ) % len(ptrs) + idx = (idx + 1) % len(ptrs) cuda.cuPointerGetAttribute(attribute, ptr) + def query_attributes(attributes, ptrs): global idx ptr = ptrs[idx] - idx = (idx + 1 ) % len(ptrs) + idx = (idx + 1) % len(ptrs) cuda.cuPointerGetAttributes(len(attributes), attributes, ptr) + @pytest.mark.benchmark(group="pointer-attributes") # Measure cuPointerGetAttribute in the same way as C benchmarks def test_pointer_get_attribute(benchmark, init_cuda): @@ -45,9 +50,10 @@ def test_pointer_get_attribute(benchmark, init_cuda): benchmark(query_attribute, cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptrs) for p in ptrs: - err, = cuda.cuMemFree(p) + (err,) = cuda.cuMemFree(p) ASSERT_DRV(err) + @pytest.mark.benchmark(group="pointer-attributes") # Measure cuPointerGetAttributes with all attributes def test_pointer_get_attributes_all(benchmark, init_cuda): @@ -61,29 +67,32 @@ def test_pointer_get_attributes_all(benchmark, init_cuda): random.shuffle(ptrs) - attributes = [cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE, - 
cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE] + attributes = [ + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE, + ] benchmark(query_attributes, attributes, ptrs) for p in ptrs: - err, = cuda.cuMemFree(p) + (err,) = cuda.cuMemFree(p) ASSERT_DRV(err) + @pytest.mark.benchmark(group="pointer-attributes") # Measure cuPointerGetAttributes with a single attribute def test_pointer_get_attributes_single(benchmark, init_cuda): @@ -97,10 +106,12 @@ def test_pointer_get_attributes_single(benchmark, init_cuda): random.shuffle(ptrs) - attributes = [cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,] + attributes = [ + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + ] benchmark(query_attributes, attributes, ptrs) for p in ptrs: - err, = 
cuda.cuMemFree(p) + (err,) = cuda.cuMemFree(p) ASSERT_DRV(err) diff --git a/cuda_bindings/cuda/__init__.py b/cuda_bindings/cuda/__init__.py index 8b302752d..577d5f8b6 100644 --- a/cuda_bindings/cuda/__init__.py +++ b/cuda_bindings/cuda/__init__.py @@ -1,10 +1,14 @@ def __getattr__(name): if name == "__version__": import warnings - warnings.warn("accessing cuda.__version__ is deprecated, " - "please switch to use cuda.bindings.__version__ instead", - DeprecationWarning, stacklevel=2) + + warnings.warn( + "accessing cuda.__version__ is deprecated, " "please switch to use cuda.bindings.__version__ instead", + DeprecationWarning, + stacklevel=2, + ) from . import bindings + return bindings.__version__ raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/cuda_bindings/cuda/bindings/__init__.py b/cuda_bindings/cuda/bindings/__init__.py index ecd3379a8..4d52a61e7 100644 --- a/cuda_bindings/cuda/bindings/__init__.py +++ b/cuda_bindings/cuda/bindings/__init__.py @@ -1,3 +1,3 @@ - from . import _version -__version__ = _version.get_versions()['version'] + +__version__ = _version.get_versions()["version"] diff --git a/cuda_bindings/docs/source/conf.py b/cuda_bindings/docs/source/conf.py index be6f55172..01f0fb492 100644 --- a/cuda_bindings/docs/source/conf.py +++ b/cuda_bindings/docs/source/conf.py @@ -10,15 +10,16 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. import os + # import sys # sys.path.insert(0, os.path.abspath('.')) # -- Project information ----------------------------------------------------- -project = 'cuda.bindings' -copyright = '2021-2024, NVIDIA' -author = 'NVIDIA' +project = "cuda.bindings" +copyright = "2021-2024, NVIDIA" +author = "NVIDIA" # The full version, including alpha/beta/rc tags release = os.environ["SPHINX_CUDA_BINDINGS_VER"] @@ -29,18 +30,13 @@ # Add any Sphinx extension module names here, as strings. 
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.napoleon', - 'myst_nb', - 'enum_tools.autoenum' -] +extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon", "myst_nb", "enum_tools.autoenum"] jupyter_execute_notebooks = "force" -numfig=True +numfig = True # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -51,36 +47,36 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_baseurl = 'docs' -html_theme = 'furo' -#html_theme = 'pydata_sphinx_theme' +html_baseurl = "docs" +html_theme = "furo" +# html_theme = 'pydata_sphinx_theme' html_theme_options = { "light_logo": "logo-light-mode.png", "dark_logo": "logo-dark-mode.png", # For pydata_sphinx_theme: - #"logo": { - # "image_light": "_static/logo-light-mode.png", + # "logo": { + # "image_light": "_static/logo-light-mode.png", # "image_dark": "_static/logo-dark-mode.png", - #}, - #"switcher": { + # }, + # "switcher": { # "json_url": "https://nvidia.github.io/cuda-python/cuda-bindings/versions.json", # "version_match": release, - #}, + # }, ## Add light/dark mode and documentation version switcher - #"navbar_end": [ + # "navbar_end": [ # "search-button", # "theme-switcher", # "version-switcher", # "navbar-icon-links", - #], + # ], } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['_static'] +html_static_path = ["_static"] suppress_warnings = [ # for warnings about multiple possible targets, see NVIDIA/cuda-python#152 - 'ref.python', + "ref.python", ] diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py index 49a7a1e0a..2717bd193 100644 --- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py +++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py @@ -6,11 +6,12 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. import numpy as np -from cuda import cuda from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice -clock_nvrtc = '''\ +from cuda import cuda + +clock_nvrtc = """\ extern "C" __global__ void timedReduction(const float *hinput, float *output, clock_t *timer) { // __shared__ float shared[2 * blockDim.x]; @@ -49,39 +50,47 @@ if (tid == 0) timer[bid+gridDim.x] = clock(); } -''' +""" + +NUM_BLOCKS = 64 +NUM_THREADS = 256 -NUM_BLOCKS = 64 -NUM_THREADS = 256 def main(): print("CUDA Clock sample") - timer = np.empty(NUM_BLOCKS * 2, dtype='int64') - hinput = np.empty(NUM_THREADS * 2, dtype='float32') + timer = np.empty(NUM_BLOCKS * 2, dtype="int64") + hinput = np.empty(NUM_THREADS * 2, dtype="float32") for i in range(0, NUM_THREADS * 2): hinput[i] = i devID = findCudaDevice() kernelHelper = common.KernelHelper(clock_nvrtc, devID) - kernel_addr = kernelHelper.getFunction(b'timedReduction') + kernel_addr = kernelHelper.getFunction(b"timedReduction") dinput = checkCudaErrors(cuda.cuMemAlloc(np.dtype(np.float32).itemsize * NUM_THREADS * 2)) doutput = checkCudaErrors(cuda.cuMemAlloc(np.dtype(np.float32).itemsize * NUM_BLOCKS)) dtimer = checkCudaErrors(cuda.cuMemAlloc(np.dtype(np.int64).itemsize * NUM_BLOCKS * 2)) checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, np.dtype(np.float32).itemsize * NUM_THREADS * 2)) - - - arr = ((dinput, doutput, dtimer), - 
(None, None, None)) - - checkCudaErrors(cuda.cuLaunchKernel(kernel_addr, - NUM_BLOCKS, 1, 1, # grid dim - NUM_THREADS, 1, 1, # block dim - np.dtype(np.float32).itemsize * 2 *NUM_THREADS, 0, # shared mem, stream - arr, 0)) # arguments + arr = ((dinput, doutput, dtimer), (None, None, None)) + + checkCudaErrors( + cuda.cuLaunchKernel( + kernel_addr, + NUM_BLOCKS, + 1, + 1, # grid dim + NUM_THREADS, + 1, + 1, # block dim + np.dtype(np.float32).itemsize * 2 * NUM_THREADS, + 0, # shared mem, stream + arr, + 0, + ) + ) # arguments checkCudaErrors(cuda.cuCtxSynchronize()) checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, np.dtype(np.int64).itemsize * NUM_BLOCKS * 2)) @@ -91,11 +100,12 @@ def main(): avgElapsedClocks = 0.0 - for i in range(0,NUM_BLOCKS): + for i in range(0, NUM_BLOCKS): avgElapsedClocks += timer[i + NUM_BLOCKS] - timer[i] - avgElapsedClocks = avgElapsedClocks/NUM_BLOCKS; - print("Average clocks/block = {}".format(avgElapsedClocks)) + avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS + print(f"Average clocks/block = {avgElapsedClocks}") + -if __name__=="__main__": +if __name__ == "__main__": main() diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py index adb5a5606..6e1d16e72 100644 --- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py @@ -7,14 +7,16 @@ # is strictly prohibited. 
import ctypes import math -import numpy as np import sys import time -from cuda import cuda, cudart + +import numpy as np from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice -simpleCubemapTexture = '''\ +from cuda import cuda, cudart + +simpleCubemapTexture = """\ extern "C" __global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex) { @@ -80,20 +82,20 @@ g_odata[face*width*width + y*width + x] = -texCubemap(tex, cx, cy, cz); } } -''' +""" + def main(): # Use command-line specified CUDA device, otherwise use device with highest Gflops/s devID = findCudaDevice() # Get number of SMs on this GPU - deviceProps = checkCudaErrors(cudart.cudaGetDeviceProperties(devID)); - print("CUDA device [{}] has {} Multi-Processors SM {}.{}".format(deviceProps.name, - deviceProps.multiProcessorCount, - deviceProps.major, - deviceProps.minor)) - if (deviceProps.major < 2): - print("{} requires SM 2.0 or higher for support of Texture Arrays. Test will exit...".format(sSDKname)) + deviceProps = checkCudaErrors(cudart.cudaGetDeviceProperties(devID)) + print( + f"CUDA device [{deviceProps.name}] has {deviceProps.multiProcessorCount} Multi-Processors SM {deviceProps.major}.{deviceProps.minor}" + ) + if deviceProps.major < 2: + print("Test requires SM 2.0 or higher for support of Texture Arrays. 
Test will exit...") sys.exit() # Generate input data for layered texture @@ -102,27 +104,35 @@ def main(): num_layers = 1 cubemap_size = width * width * num_faces size = cubemap_size * num_layers * np.dtype(np.float32).itemsize - h_data = np.zeros(cubemap_size * num_layers, dtype='float32') + h_data = np.zeros(cubemap_size * num_layers, dtype="float32") for i in range(cubemap_size * num_layers): h_data[i] = i # This is the expected transformation of the input data (the expected output) - h_data_ref = np.zeros(cubemap_size * num_layers, dtype='float32') + h_data_ref = np.zeros(cubemap_size * num_layers, dtype="float32") for layer in range(num_layers): for i in range(cubemap_size): - h_data_ref[layer*cubemap_size + i] = -h_data[layer*cubemap_size + i] + layer + h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer # Allocate device memory for result d_data = checkCudaErrors(cudart.cudaMalloc(size)) # Allocate array and copy image data - channelDesc = checkCudaErrors(cudart.cudaCreateChannelDesc(32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat)) - cu_3darray = checkCudaErrors(cudart.cudaMalloc3DArray(channelDesc, cudart.make_cudaExtent(width, width, num_faces), cudart.cudaArrayCubemap)) + channelDesc = checkCudaErrors( + cudart.cudaCreateChannelDesc(32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat) + ) + cu_3darray = checkCudaErrors( + cudart.cudaMalloc3DArray( + channelDesc, + cudart.make_cudaExtent(width, width, num_faces), + cudart.cudaArrayCubemap, + ) + ) myparms = cudart.cudaMemcpy3DParms() - myparms.srcPos = cudart.make_cudaPos(0,0,0) - myparms.dstPos = cudart.make_cudaPos(0,0,0) + myparms.srcPos = cudart.make_cudaPos(0, 0, 0) + myparms.dstPos = cudart.make_cudaPos(0, 0, 0) myparms.srcPtr = cudart.make_cudaPitchedPtr(h_data, width * np.dtype(np.float32).itemsize, width, width) myparms.dstArray = cu_3darray myparms.extent = cudart.make_cudaExtent(width, width, num_faces) @@ -130,12 +140,12 @@ def 
main(): checkCudaErrors(cudart.cudaMemcpy3D(myparms)) texRes = cudart.cudaResourceDesc() - texRes.resType = cudart.cudaResourceType.cudaResourceTypeArray - texRes.res.array.array = cu_3darray + texRes.resType = cudart.cudaResourceType.cudaResourceTypeArray + texRes.res.array.array = cu_3darray texDescr = cudart.cudaTextureDesc() texDescr.normalizedCoords = True - texDescr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear + texDescr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear texDescr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap texDescr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap texDescr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap @@ -151,36 +161,57 @@ def main(): dimGrid.y = width / dimBlock.y dimGrid.z = 1 - print("Covering Cubemap data array of {}~3 x {}: Grid size is {} x {}, each block has 8 x 8 threads".format( - width, num_layers, dimGrid.x, dimGrid.y)) + print( + f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dimGrid.x} x {dimGrid.y}, each block has 8 x 8 threads" + ) kernelHelper = common.KernelHelper(simpleCubemapTexture, devID) - _transformKernel = kernelHelper.getFunction(b'transformKernel') - kernelArgs = ((d_data, width, tex),(ctypes.c_void_p, ctypes.c_int, None)) - checkCudaErrors(cuda.cuLaunchKernel(_transformKernel, - dimGrid.x, dimGrid.y, dimGrid.z, # grid dim - dimBlock.x, dimBlock.y, dimBlock.z, # block dim - 0, 0, # shared mem and stream - kernelArgs, 0)) # arguments + _transformKernel = kernelHelper.getFunction(b"transformKernel") + kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None)) + checkCudaErrors( + cuda.cuLaunchKernel( + _transformKernel, + dimGrid.x, + dimGrid.y, + dimGrid.z, # grid dim + dimBlock.x, + dimBlock.y, + dimBlock.z, # block dim + 0, + 0, # shared mem and stream + kernelArgs, + 0, + ) + ) # arguments checkCudaErrors(cudart.cudaDeviceSynchronize()) start = time.time() # Execute 
the kernel - checkCudaErrors(cuda.cuLaunchKernel(_transformKernel, - dimGrid.x, dimGrid.y, dimGrid.z, # grid dim - dimBlock.x, dimBlock.y, dimBlock.z, # block dim - 0, 0, # shared mem and stream - kernelArgs, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _transformKernel, + dimGrid.x, + dimGrid.y, + dimGrid.z, # grid dim + dimBlock.x, + dimBlock.y, + dimBlock.z, # block dim + 0, + 0, # shared mem and stream + kernelArgs, + 0, + ) + ) # arguments checkCudaErrors(cudart.cudaDeviceSynchronize()) stop = time.time() - print("Processing time: {:.3f} msec".format(stop - start)) - print("{:.2f} Mtexlookups/sec".format(cubemap_size / ((stop - start + 1) / 1000.0) / 1e6)) + print(f"Processing time: {stop - start:.3f} msec") + print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec") # Allocate mem for the result on host side - h_odata = np.zeros(cubemap_size * num_layers, dtype='float32') + h_odata = np.zeros(cubemap_size * num_layers, dtype="float32") # Copy result from device to host checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)) @@ -197,5 +228,6 @@ def main(): checkCudaErrors(cudart.cudaFree(d_data)) checkCudaErrors(cudart.cudaFreeArray(cu_3darray)) -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py index d4d17de1f..7ff1ae270 100644 --- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py @@ -6,13 +6,15 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
import ctypes -import numpy as np import sys -from cuda import cuda, cudart + +import numpy as np from common import common from common.helper_cuda import checkCudaErrors -simplep2p = '''\ +from cuda import cuda, cudart + +simplep2p = """\ extern "C" __global__ void SimpleKernel(float *src, float *dst) { @@ -21,7 +23,8 @@ const int idx = blockIdx.x * blockDim.x + threadIdx.x; dst[idx] = src[idx] * 2.0f; } -''' +""" + def main(): print("Starting...") @@ -29,7 +32,7 @@ def main(): # Number of GPUs print("Checking for multiple GPUs...") gpu_n = checkCudaErrors(cudart.cudaGetDeviceCount()) - print("CUDA-capable device count: {}".format(gpu_n)) + print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: print("Two or more GPUs with Peer-to-Peer access capability are required") @@ -47,10 +50,16 @@ def main(): continue i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j)) j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i)) - print("> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( - prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No")) - print("> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( - prop[j].name, j, prop[i].name, i, "Yes" if i_access_j else "No")) + print( + "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( + prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No" + ) + ) + print( + "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( + prop[j].name, j, prop[i].name, i, "Yes" if i_access_j else "No" + ) + ) if i_access_j and j_access_i: p2pCapableGPUs[1] = j break @@ -66,7 +75,7 @@ def main(): gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]] # Enable peer access - print("Enabling peer access between GPU{} and GPU{}...".format(gpuid[0], gpuid[1])) + print(f"Enabling peer access between GPU{gpuid[0]} and GPU{gpuid[1]}...") checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) checkCudaErrors(cudart.cudaDeviceEnablePeerAccess(gpuid[1], 0)) 
checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) @@ -74,12 +83,12 @@ def main(): # Allocate buffers buf_size = 1024 * 1024 * 16 * np.dtype(np.float32).itemsize - print("Allocating buffers ({}MB on GPU{}, GPU{} and CPU Host)...".format(int(buf_size / 1024 / 1024), gpuid[0], gpuid[1])) + print(f"Allocating buffers ({int(buf_size / 1024 / 1024)}MB on GPU{gpuid[0]}, GPU{gpuid[1]} and CPU Host)...") checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) g0 = checkCudaErrors(cudart.cudaMalloc(buf_size)) checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) g1 = checkCudaErrors(cudart.cudaMalloc(buf_size)) - h0 = checkCudaErrors(cudart.cudaMallocHost(buf_size)) # Automatically portable with UVA + h0 = checkCudaErrors(cudart.cudaMallocHost(buf_size)) # Automatically portable with UVA # Create CUDA event handles print("Creating event handles...") @@ -102,11 +111,12 @@ def main(): checkCudaErrors(cudart.cudaEventRecord(stop_event, cudart.cudaStream_t(0))) checkCudaErrors(cudart.cudaEventSynchronize(stop_event)) time_memcpy = checkCudaErrors(cudart.cudaEventElapsedTime(start_event, stop_event)) - print("cudaMemcpyPeer / cudaMemcpy between GPU{} and GPU{}: {:.2f}GB/s".format(gpuid[0], gpuid[1], - (1.0 / (time_memcpy / 1000.0)) * ((100.0 * buf_size)) / 1024.0 / 1024.0 / 1024.0)) + print( + f"cudaMemcpyPeer / cudaMemcpy between GPU{gpuid[0]} and GPU{gpuid[1]}: {(1.0 / (time_memcpy / 1000.0)) * (100.0 * buf_size) / 1024.0 / 1024.0 / 1024.0:.2f}GB/s" + ) # Prepare host buffer and copy to GPU 0 - print("Preparing host buffer and memcpy to GPU{}...".format(gpuid[0])) + print(f"Preparing host buffer and memcpy to GPU{gpuid[0]}...") h0_local = (ctypes.c_float * int(buf_size / np.dtype(np.float32).itemsize)).from_address(h0) for i in range(int(buf_size / np.dtype(np.float32).itemsize)): @@ -127,43 +137,61 @@ def main(): # Run kernel on GPU 1, reading input from the GPU 0 buffer, writing # output to the GPU 1 buffer - print("Run kernel on GPU{}, taking source data from GPU{} and writing to 
GPU{}...".format( - gpuid[1], gpuid[0], gpuid[1])) + print(f"Run kernel on GPU{gpuid[1]}, taking source data from GPU{gpuid[0]} and writing to GPU{gpuid[1]}...") checkCudaErrors(cudart.cudaSetDevice(gpuid[1])) - kernelHelper = [None]*2 - _simpleKernel = [None]*2 - kernelArgs = [None]*2 + kernelHelper = [None] * 2 + _simpleKernel = [None] * 2 + kernelArgs = [None] * 2 kernelHelper[1] = common.KernelHelper(simplep2p, gpuid[1]) - _simpleKernel[1] = kernelHelper[1].getFunction(b'SimpleKernel') + _simpleKernel[1] = kernelHelper[1].getFunction(b"SimpleKernel") kernelArgs[1] = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p)) - checkCudaErrors(cuda.cuLaunchKernel(_simpleKernel[1], - blocks.x, blocks.y, blocks.z, - threads.x, threads.y, threads.z, - 0, 0, - kernelArgs[1], 0)) + checkCudaErrors( + cuda.cuLaunchKernel( + _simpleKernel[1], + blocks.x, + blocks.y, + blocks.z, + threads.x, + threads.y, + threads.z, + 0, + 0, + kernelArgs[1], + 0, + ) + ) checkCudaErrors(cudart.cudaDeviceSynchronize()) # Run kernel on GPU 0, reading input from the GPU 1 buffer, writing # output to the GPU 0 buffer - print("Run kernel on GPU{}, taking source data from GPU{} and writing to GPU{}...".format( - gpuid[0], gpuid[1], gpuid[0])) + print(f"Run kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...") checkCudaErrors(cudart.cudaSetDevice(gpuid[0])) kernelHelper[0] = common.KernelHelper(simplep2p, gpuid[0]) - _simpleKernel[0] = kernelHelper[0].getFunction(b'SimpleKernel') + _simpleKernel[0] = kernelHelper[0].getFunction(b"SimpleKernel") kernelArgs[0] = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p)) - checkCudaErrors(cuda.cuLaunchKernel(_simpleKernel[0], - blocks.x, blocks.y, blocks.z, - threads.x, threads.y, threads.z, - 0, 0, - kernelArgs[0], 0)) + checkCudaErrors( + cuda.cuLaunchKernel( + _simpleKernel[0], + blocks.x, + blocks.y, + blocks.z, + threads.x, + threads.y, + threads.z, + 0, + 0, + kernelArgs[0], + 0, + ) + ) 
checkCudaErrors(cudart.cudaDeviceSynchronize()) # Copy data back to host and verify - print("Copy data back to host from GPU{} and verify results...".format(gpuid[0])) + print(f"Copy data back to host from GPU{gpuid[0]} and verify results...") checkCudaErrors(cudart.cudaMemcpy(h0, g0, buf_size, cudart.cudaMemcpyKind.cudaMemcpyDefault)) error_count = 0 @@ -172,7 +200,7 @@ def main(): # Re-generate input data and apply 2x '* 2.0f' computation of both # kernel runs if h0_local[i] != float(i % 4096) * 2.0 * 2.0: - print("Verification error @ element {}: val = {}, ref = {}\n".format(i, h0_local[i], (float(i%4096)*2.0*2.0))) + print(f"Verification error @ element {i}: val = {h0_local[i]}, ref = {float(i % 4096) * 2.0 * 2.0}\n") error_count += 1 if error_count > 10: break @@ -202,5 +230,6 @@ def main(): sys.exit(-1) print("Test passed!") -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py index 834eb32af..267cb3924 100644 --- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py @@ -7,15 +7,17 @@ # is strictly prohibited. 
import ctypes import math -import numpy as np import random as rnd import sys -from cuda import cuda, cudart + +import numpy as np from common import common from common.helper_cuda import checkCudaErrors -from common.helper_string import checkCmdLineFlag +from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt -simpleZeroCopy = '''\ +from cuda import cuda, cudart + +simpleZeroCopy = """\ extern "C" __global__ void vectorAddGPU(float *a, float *b, float *c, int N) { @@ -26,7 +28,8 @@ c[idx] = a[idx] + b[idx]; } } -''' +""" + def main(): idev = 0 @@ -45,16 +48,16 @@ def main(): idev = int(getCmdLineArgumentInt("device=")) if idev >= deviceCount or idev < 0: - print("Device number {} is invalid, will use default CUDA device 0.".format(idev)) + print(f"Device number {idev} is invalid, will use default CUDA device 0.") idev = 0 if checkCmdLineFlag("use_generic_memory"): bPinGenericMemory = True if bPinGenericMemory: - print("> Using Generic System Paged Memory (malloc)"); + print("> Using Generic System Paged Memory (malloc)") else: - print("> Using CUDA Host Allocated (cudaHostAlloc)"); + print("> Using CUDA Host Allocated (cudaHostAlloc)") checkCudaErrors(cudart.cudaSetDevice(idev)) @@ -62,7 +65,7 @@ def main(): deviceProp = checkCudaErrors(cudart.cudaGetDeviceProperties(idev)) if not deviceProp.canMapHostMemory: - print("Device {} does not support mapping CPU host memory!".format(idev)) + print(f"Device {idev} does not support mapping CPU host memory!") return checkCudaErrors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost)) @@ -70,7 +73,7 @@ def main(): # Allocate mapped CPU memory nelem = 1048576 - num_bytes = nelem*np.dtype(np.float32).itemsize + num_bytes = nelem * np.dtype(np.float32).itemsize if bPinGenericMemory: a = np.empty(nelem, dtype=np.float32) @@ -107,21 +110,33 @@ def main(): block.y = 1 block.z = 1 grid = cudart.dim3() - grid.x = math.ceil(nelem/float(block.x)) + grid.x = math.ceil(nelem / float(block.x)) grid.y = 1 grid.z = 1 
kernelHelper = common.KernelHelper(simpleZeroCopy, idev) - _vectorAddGPU = kernelHelper.getFunction(b'vectorAddGPU') - kernelArgs = ((d_a, d_b, d_c, nelem),(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int)) - checkCudaErrors(cuda.cuLaunchKernel(_vectorAddGPU, - grid.x, grid.y, grid.z, - block.x, block.y, block.z, - 0, cuda.CU_STREAM_LEGACY, - kernelArgs, 0)) + _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU") + kernelArgs = ( + (d_a, d_b, d_c, nelem), + (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), + ) + checkCudaErrors( + cuda.cuLaunchKernel( + _vectorAddGPU, + grid.x, + grid.y, + grid.z, + block.x, + block.y, + block.z, + 0, + cuda.CU_STREAM_LEGACY, + kernelArgs, + 0, + ) + ) checkCudaErrors(cudart.cudaDeviceSynchronize()) - print("> Checking the results from vectorAddGPU() ..."); - + print("> Checking the results from vectorAddGPU() ...") # Compare the results errorNorm = 0.0 refNorm = 0.0 @@ -129,8 +144,8 @@ def main(): for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff*diff - refNorm += ref*ref + errorNorm += diff * diff + refNorm += ref * ref errorNorm = math.sqrt(errorNorm) refNorm = math.sqrt(refNorm) @@ -148,10 +163,11 @@ def main(): checkCudaErrors(cudart.cudaFreeHost(b)) checkCudaErrors(cudart.cudaFreeHost(c)) - if errorNorm/refNorm >= 1.0e-7: + if errorNorm / refNorm >= 1.0e-7: print("FAILED") sys.exit(-1) print("PASSED") -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py index f34f31958..64ae4d390 100644 --- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py +++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py @@ -6,14 +6,16 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
import ctypes -import numpy as np -import sys import os -from cuda import cuda, cudart +import sys + +import numpy as np from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice -systemWideAtomics = '''\ +from cuda import cuda, cudart + +systemWideAtomics = """\ #define LOOP_NUM 50 extern "C" @@ -54,9 +56,10 @@ atomicXor_system(&atom_arr[9], tid); } } -''' +""" + +LOOP_NUM = 50 -LOOP_NUM = 50 #! Compute reference data set #! Each element is multiplied with the number of threads / array length @@ -70,8 +73,8 @@ def verify(testData, length): val += 10 if val != testData[0]: - print(f"atomicAdd failed val = {val} testData = {testData[0]}") - return False + print(f"atomicAdd failed val = {val} testData = {testData[0]}") + return False val = 0 found = False @@ -82,7 +85,7 @@ def verify(testData, length): break if not found: - print("atomicExch failed") + print("atomicExch failed") return False val = -(1 << 8) @@ -136,11 +139,11 @@ def verify(testData, length): print("atomicCAS failed") return False - val = 0xff + val = 0xFF for i in range(length): # 8th element should be 1 - val &= (2 * i + 7) + val &= 2 * i + 7 if val != testData[7]: print("atomicAnd failed") @@ -152,11 +155,11 @@ def verify(testData, length): print("atomicOr failed") return False - val = 0xff + val = 0xFF for i in range(length): # 11th element should be 0xff - val ^= i; + val ^= i if val != testData[9]: print("atomicXor failed") @@ -164,8 +167,9 @@ def verify(testData, length): return True + def main(): - if os.name == 'nt': + if os.name == "nt": print("Atomics not supported on Windows") return @@ -182,7 +186,7 @@ def main(): # This sample requires being run with a default or process exclusive mode print("This sample requires a device in either default or process exclusive mode") return - + if device_prop.major < 6: print("Requires a minimum CUDA compute 6.0 capability, waiving testing.") return @@ -197,24 +201,35 @@ def main(): atom_arr = ctypes.addressof(atom_arr_h) 
else: print("CANNOT access pageable memory") - atom_arr = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * numData, cudart.cudaMemAttachGlobal)) + atom_arr = checkCudaErrors( + cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * numData, cudart.cudaMemAttachGlobal) + ) atom_arr_h = (ctypes.c_int * numData).from_address(atom_arr) for i in range(numData): atom_arr_h[i] = 0 # To make the AND and XOR tests generate something other than 0... - atom_arr_h[7] = atom_arr_h[9] = 0xff + atom_arr_h[7] = atom_arr_h[9] = 0xFF kernelHelper = common.KernelHelper(systemWideAtomics, dev_id) - _atomicKernel = kernelHelper.getFunction(b'atomicKernel') - kernelArgs = ((atom_arr,), - (ctypes.c_void_p,)) - checkCudaErrors(cuda.cuLaunchKernel(_atomicKernel, - numBlocks, 1, 1, # grid dim - numThreads, 1, 1, # block dim - 0, cuda.CU_STREAM_LEGACY, # shared mem and stream - kernelArgs, 0)) # arguments + _atomicKernel = kernelHelper.getFunction(b"atomicKernel") + kernelArgs = ((atom_arr,), (ctypes.c_void_p,)) + checkCudaErrors( + cuda.cuLaunchKernel( + _atomicKernel, + numBlocks, + 1, + 1, # grid dim + numThreads, + 1, + 1, # block dim + 0, + cuda.CU_STREAM_LEGACY, # shared mem and stream + kernelArgs, + 0, + ) + ) # arguments # NOTE: Python doesn't have an equivalent system atomic operations # atomicKernel_CPU(atom_arr_h, numBlocks * numThreads) @@ -232,5 +247,6 @@ def main(): if not testResult: sys.exit(-1) -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py index 329340406..8aae6b9da 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py @@ -7,12 +7,15 @@ # is strictly prohibited. 
import ctypes import math +import sys + import numpy as np -from cuda import cuda from common import common from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV -vectorAddDrv = '''\ +from cuda import cuda + +vectorAddDrv = """\ /* Vector addition: C = A + B. * * This sample is a very basic sample that implements element by element @@ -29,28 +32,29 @@ if (i < N) C[i] = A[i] + B[i]; } -''' +""" + def main(): print("Vector Addition (Driver API)") N = 50000 - devID = 0 size = N * np.dtype(np.float32).itemsize # Initialize - checkCudaErrors(cuda.cuInit(0)); - + checkCudaErrors(cuda.cuInit(0)) cuDevice = findCudaDeviceDRV() # Create context cuContext = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice)) - uvaSupported = checkCudaErrors(cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice)) + uvaSupported = checkCudaErrors( + cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice) + ) if not uvaSupported: print("Accessing pageable memory directly requires UVA") return kernelHelper = common.KernelHelper(vectorAddDrv, int(cuDevice)) - _VecAdd_kernel = kernelHelper.getFunction(b'VecAdd_kernel') + _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel") # Allocate input vectors h_A and h_B in host memory h_A = np.random.rand(size).astype(dtype=np.float32) @@ -69,17 +73,26 @@ def main(): if True: # Grid/Block configuration threadsPerBlock = 256 - blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock + blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock - kernelArgs = ((d_A, d_B, d_C, N), - (None, None, None, ctypes.c_int)) + kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int)) # Launch the CUDA kernel - checkCudaErrors(cuda.cuLaunchKernel(_VecAdd_kernel, - blocksPerGrid, 1, 1, - threadsPerBlock, 1, 1, - 0, 0, - kernelArgs, 0)) + checkCudaErrors( + cuda.cuLaunchKernel( + _VecAdd_kernel, + blocksPerGrid, + 1, + 1, + threadsPerBlock, + 1, + 1, + 0, + 
0, + kernelArgs, + 0, + ) + ) else: pass @@ -98,9 +111,10 @@ def main(): checkCudaErrors(cuda.cuMemFree(d_C)) checkCudaErrors(cuda.cuCtxDestroy(cuContext)) - print("{}".format("Result = PASS" if i+1 == N else "Result = FAIL")) - if i+1 != N: + print("{}".format("Result = PASS" if i + 1 == N else "Result = FAIL")) + if i + 1 != N: sys.exit(-1) + if __name__ == "__main__": main() diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py index 8af4a8339..b0044dbf9 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py @@ -7,13 +7,15 @@ # is strictly prohibited. import ctypes import math -import numpy as np import sys -from cuda import cuda + +import numpy as np from common import common from common.helper_cuda import checkCudaErrors, findCudaDeviceDRV -vectorAddMMAP = '''\ +from cuda import cuda + +vectorAddMMAP = """\ /* Vector addition: C = A + B. 
* * This sample is a very basic sample that implements element by element @@ -30,10 +32,12 @@ if (i < N) C[i] = A[i] + B[i]; } -''' +""" + def round_up(x, y): - return int((x - 1)/y + 1) * y + return int((x - 1) / y + 1) * y + def getBackingDevices(cuDevice): num_devices = checkCudaErrors(cuda.cuDeviceGetCount()) @@ -50,16 +54,20 @@ def getBackingDevices(cuDevice): continue # The device needs to support virtual address management for the required apis to work - attributeVal = checkCudaErrors(cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - cuDevice)) + attributeVal = checkCudaErrors( + cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, + cuDevice, + ) + ) if attributeVal == 0: continue backingDevices.append(cuda.CUdevice(dev)) return backingDevices -def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align = 0): + +def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align=0): min_granularity = 0 # Setup the properties common for all the chunks @@ -74,7 +82,9 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align = 0 # (the max of the minimum granularity of each participating device) for device in residentDevices: prop.location.id = device - status, granularity = cuda.cuMemGetAllocationGranularity(prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM) + status, granularity = cuda.cuMemGetAllocationGranularity( + prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM + ) if status != cuda.CUresult.CUDA_SUCCESS: return status, None, None if min_granularity < granularity: @@ -84,7 +94,9 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align = 0 # (the max of the minimum granularity of each participating device) for device in mappingDevices: prop.location.id = device - status, granularity = 
cuda.cuMemGetAllocationGranularity(prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM) + status, granularity = cuda.cuMemGetAllocationGranularity( + prop, cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM + ) if status != cuda.CUresult.CUDA_SUCCESS: return status, None, None if min_granularity < granularity: @@ -123,10 +135,10 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align = 0 # Since we do not need to make any other mappings of this memory or export it, # we no longer need and can release the allocationHandle. # The allocation will be kept live until it is unmapped. - status, = cuda.cuMemMap(int(dptr) + (stripeSize * idx), stripeSize, 0, allocationHandle, 0) - + (status,) = cuda.cuMemMap(int(dptr) + (stripeSize * idx), stripeSize, 0, allocationHandle, 0) + # the handle needs to be released even if the mapping failed. - status2, = cuda.cuMemRelease(allocationHandle) + (status2,) = cuda.cuMemRelease(allocationHandle) if status != cuda.CUresult.CUDA_SUCCESS: # cuMemRelease should not have failed here # as the handle was just allocated successfully @@ -151,13 +163,14 @@ def simpleMallocMultiDeviceMmap(size, residentDevices, mappingDevices, align = 0 accessDescriptors[idx].flags = cuda.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE # Apply the access descriptors to the whole VA range. - status, = cuda.cuMemSetAccess(dptr, size, accessDescriptors, len(accessDescriptors)) + (status,) = cuda.cuMemSetAccess(dptr, size, accessDescriptors, len(accessDescriptors)) if status != cuda.CUresult.CUDA_SUCCESS: simpleFreeMultiDeviceMmap(dptr, size) return status, None, None return (status, dptr, allocationSize) + def simpleFreeMultiDeviceMmap(dptr, size): # Unmap the mapped virtual memory region # Since the handles to the mapped backing stores have already been released @@ -165,7 +178,7 @@ def simpleFreeMultiDeviceMmap(dptr, size): # The backing stores will be freed. 
# Since the memory has been unmapped after this call, accessing the specified # va range will result in a fault (unitll it is remapped). - status = cuda.cuMemUnmap(dptr, size); + status = cuda.cuMemUnmap(dptr, size) if status[0] != cuda.CUresult.CUDA_SUCCESS: return status @@ -178,6 +191,7 @@ def simpleFreeMultiDeviceMmap(dptr, size): return status return status + def main(): print("Vector Addition (Driver API)") N = 50000 @@ -189,12 +203,15 @@ def main(): cuDevice = findCudaDeviceDRV() # Check that the selected device supports virtual address management - attributeVal = checkCudaErrors(cuda.cuDeviceGetAttribute( - cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, - cuDevice)) - print("Device {} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {}.".format(cuDevice, attributeVal)) + attributeVal = checkCudaErrors( + cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, + cuDevice, + ) + ) + print(f"Device {cuDevice} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attributeVal}.") if not attributeVal: - print("Device {} doesn't support VIRTUAL ADDRESS MANAGEMENT.".format(cuDevice)) + print(f"Device {cuDevice} doesn't support VIRTUAL ADDRESS MANAGEMENT.") return # The vector addition happens on cuDevice, so the allocations need to be mapped there. 
@@ -207,7 +224,7 @@ def main(): cuContext = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice)) kernelHelper = common.KernelHelper(vectorAddMMAP, int(cuDevice)) - _VecAdd_kernel = kernelHelper.getFunction(b'VecAdd_kernel') + _VecAdd_kernel = kernelHelper.getFunction(b"VecAdd_kernel") # Allocate input vectors h_A and h_B in host memory h_A = np.random.rand(size).astype(dtype=np.float32) @@ -231,17 +248,26 @@ def main(): # Grid/Block configuration threadsPerBlock = 256 - blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock + blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock - kernelArgs = ((d_A, d_B, d_C, N), - (None, None, None, ctypes.c_int)) + kernelArgs = ((d_A, d_B, d_C, N), (None, None, None, ctypes.c_int)) # Launch the CUDA kernel - checkCudaErrors(cuda.cuLaunchKernel(_VecAdd_kernel, - blocksPerGrid, 1, 1, - threadsPerBlock, 1, 1, - 0, 0, - kernelArgs, 0)) + checkCudaErrors( + cuda.cuLaunchKernel( + _VecAdd_kernel, + blocksPerGrid, + 1, + 1, + threadsPerBlock, + 1, + 1, + 0, + 0, + kernelArgs, + 0, + ) + ) # Copy result from device memory to host memory # h_C contains the result in host memory @@ -259,9 +285,10 @@ def main(): checkCudaErrors(cuda.cuCtxDestroy(cuContext)) - print("{}".format("Result = PASS" if i+1 == N else "Result = FAIL")) - if i+1 != N: + print("{}".format("Result = PASS" if i + 1 == N else "Result = FAIL")) + if i + 1 != N: sys.exit(-1) + if __name__ == "__main__": main() diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py index 84af8717f..71dc797f7 100644 --- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py +++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py @@ -7,15 +7,17 @@ # is strictly prohibited. 
import ctypes import math -import numpy as np import random as rnd import sys -from cuda import cuda, cudart + +import numpy as np from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice from common.helper_string import checkCmdLineFlag -streamOrderedAllocation = '''\ +from cuda import cuda, cudart + +streamOrderedAllocation = """\ /* Add two vectors on the GPU */ extern "C" __global__ void vectorAddGPU(const float *a, const float *b, float *c, int N) @@ -26,12 +28,13 @@ c[idx] = a[idx] + b[idx]; } } -''' +""" MAX_ITER = 20 + def basicStreamOrderedAllocation(dev, nelem, a, b, c): - num_bytes = nelem*np.dtype(np.float32).itemsize + num_bytes = nelem * np.dtype(np.float32).itemsize print("Starting basicStreamOrderedAllocation()") checkCudaErrors(cudart.cudaSetDevice(dev)) @@ -48,17 +51,29 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c): block.y = 1 block.z = 1 grid = cudart.dim3() - grid.x = math.ceil(nelem/float(block.x)) + grid.x = math.ceil(nelem / float(block.x)) grid.y = 1 grid.z = 1 - kernelArgs = ((d_a, d_b, d_c, nelem), - (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int)) - checkCudaErrors(cuda.cuLaunchKernel(_vectorAddGPU, - grid.x, grid.y, grid.z, # grid dim - block.x, block.y, block.z, # block dim - 0, stream, # shared mem and stream - kernelArgs, 0)) # arguments + kernelArgs = ( + (d_a, d_b, d_c, nelem), + (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), + ) + checkCudaErrors( + cuda.cuLaunchKernel( + _vectorAddGPU, + grid.x, + grid.y, + grid.z, # grid dim + block.x, + block.y, + block.z, # block dim + 0, + stream, # shared mem and stream + kernelArgs, + 0, + ) + ) # arguments checkCudaErrors(cudart.cudaFreeAsync(d_a, stream)) checkCudaErrors(cudart.cudaFreeAsync(d_b, stream)) @@ -67,31 +82,32 @@ def basicStreamOrderedAllocation(dev, nelem, a, b, c): checkCudaErrors(cudart.cudaStreamSynchronize(stream)) # Compare the results - print("> Checking the results from vectorAddGPU() 
..."); + print("> Checking the results from vectorAddGPU() ...") errorNorm = 0.0 refNorm = 0.0 for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff*diff - refNorm += ref*ref + errorNorm += diff * diff + refNorm += ref * ref errorNorm = math.sqrt(errorNorm) refNorm = math.sqrt(refNorm) - if errorNorm/refNorm < 1.e-6: + if errorNorm / refNorm < 1.0e-6: print("basicStreamOrderedAllocation PASSED") - + checkCudaErrors(cudart.cudaStreamDestroy(stream)) - return errorNorm/refNorm < 1.e-6 + return errorNorm / refNorm < 1.0e-6 + # streamOrderedAllocationPostSync(): demonstrates If the application wants the memory to persist in the pool beyond # synchronization, then it sets the release threshold on the pool. This way, when the application reaches the "steady state", # it is no longer allocating/freeing memory from the OS. -def streamOrderedAllocationPostSync(dev, nelem, a, b, c) : - num_bytes = nelem*np.dtype(np.float32).itemsize +def streamOrderedAllocationPostSync(dev, nelem, a, b, c): + num_bytes = nelem * np.dtype(np.float32).itemsize print("Starting streamOrderedAllocationPostSync()") checkCudaErrors(cudart.cudaSetDevice(dev)) @@ -102,13 +118,18 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c) : memPool = checkCudaErrors(cudart.cudaDeviceGetDefaultMemPool(dev)) thresholdVal = cuda.cuuint64_t(ctypes.c_uint64(-1).value) # Set high release threshold on the default pool so that cudaFreeAsync will not actually release memory to the system. - # By default, the release threshold for a memory pool is set to zero. This implies that the CUDA driver is + # By default, the release threshold for a memory pool is set to zero. This implies that the CUDA driver is # allowed to release a memory chunk back to the system as long as it does not contain any active suballocations. 
- checkCudaErrors(cudart.cudaMemPoolSetAttribute(memPool, cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold, thresholdVal)); - + checkCudaErrors( + cudart.cudaMemPoolSetAttribute( + memPool, + cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold, + thresholdVal, + ) + ) # Record teh start event checkCudaErrors(cudart.cudaEventRecord(start, stream)) - for i in range(MAX_ITER): + for _i in range(MAX_ITER): d_a = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) d_b = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) d_c = checkCudaErrors(cudart.cudaMallocAsync(num_bytes, stream)) @@ -120,17 +141,29 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c) : block.y = 1 block.z = 1 grid = cudart.dim3() - grid.x = math.ceil(nelem/float(block.x)) + grid.x = math.ceil(nelem / float(block.x)) grid.y = 1 grid.z = 1 - kernelArgs = ((d_a, d_b, d_c, nelem), - (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int)) - checkCudaErrors(cuda.cuLaunchKernel(_vectorAddGPU, - grid.x, grid.y, grid.z, # grid dim - block.x, block.y, block.z, # block dim - 0, stream, # shared mem and stream - kernelArgs, 0)) # arguments + kernelArgs = ( + (d_a, d_b, d_c, nelem), + (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int), + ) + checkCudaErrors( + cuda.cuLaunchKernel( + _vectorAddGPU, + grid.x, + grid.y, + grid.z, # grid dim + block.x, + block.y, + block.z, # block dim + 0, + stream, # shared mem and stream + kernelArgs, + 0, + ) + ) # arguments checkCudaErrors(cudart.cudaFreeAsync(d_a, stream)) checkCudaErrors(cudart.cudaFreeAsync(d_b, stream)) @@ -142,7 +175,7 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c) : checkCudaErrors(cudart.cudaEventSynchronize(end)) msecTotal = checkCudaErrors(cudart.cudaEventElapsedTime(start, end)) - print("Total elapsed time = {} ms over {} iterations".format(msecTotal, MAX_ITER)) + print(f"Total elapsed time = {msecTotal} ms over {MAX_ITER} iterations") # Compare the results print("> Checking the 
results from vectorAddGPU() ...") @@ -152,25 +185,26 @@ def streamOrderedAllocationPostSync(dev, nelem, a, b, c) : for n in range(nelem): ref = a[n] + b[n] diff = c[n] - ref - errorNorm += diff*diff - refNorm += ref*ref + errorNorm += diff * diff + refNorm += ref * ref errorNorm = math.sqrt(errorNorm) refNorm = math.sqrt(refNorm) - if errorNorm/refNorm < 1.e-6: + if errorNorm / refNorm < 1.0e-6: print("streamOrderedAllocationPostSync PASSED") checkCudaErrors(cudart.cudaStreamDestroy(stream)) - return errorNorm/refNorm < 1.e-6 + return errorNorm / refNorm < 1.0e-6 + def main(): cuda.cuInit(0) if checkCmdLineFlag("help"): - print("Usage: streamOrderedAllocation [OPTION]\n"); - print("Options:"); - print(" device=[device #] Specify the device to be used"); + print("Usage: streamOrderedAllocation [OPTION]\n") + print("Options:") + print(" device=[device #] Specify the device to be used") return dev = findCudaDevice() @@ -179,22 +213,24 @@ def main(): if version < 11030: isMemPoolSupported = False else: - isMemPoolSupported = checkCudaErrors(cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev)) + isMemPoolSupported = checkCudaErrors( + cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev) + ) if not isMemPoolSupported: print("Waiving execution as device does not support Memory Pools") return global _vectorAddGPU kernelHelper = common.KernelHelper(streamOrderedAllocation, dev) - _vectorAddGPU = kernelHelper.getFunction(b'vectorAddGPU') + _vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU") # Allocate CPU memory nelem = 1048576 - num_bytes = nelem*np.dtype(np.float32).itemsize + nelem * np.dtype(np.float32).itemsize - a = np.zeros(nelem, dtype='float32') - b = np.zeros(nelem, dtype='float32') - c = np.zeros(nelem, dtype='float32') + a = np.zeros(nelem, dtype="float32") + b = np.zeros(nelem, dtype="float32") + c = np.zeros(nelem, dtype="float32") # Initialize the vectors 
for i in range(nelem): a[i] = rnd.random() @@ -206,5 +242,6 @@ def main(): if not ret1 or not ret2: sys.exit(-1) -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index bb6b5cb04..aacd99cf3 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -7,31 +7,43 @@ # is strictly prohibited. import ctypes import math -import numpy as np import sys -import pytest -from cuda import cuda, cudart from enum import Enum + +import numpy as np +import pytest from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt +from cuda import cuda, cudart + blockSize = 16 + + class kernels(Enum): - AsyncCopyMultiStageLargeChunk = 0 - AsyncCopyLargeChunk = 1 - AsyncCopyLargeChunkAWBarrier = 2 + AsyncCopyMultiStageLargeChunk = 0 + AsyncCopyLargeChunk = 1 + AsyncCopyLargeChunkAWBarrier = 2 AsyncCopyMultiStageSharedState = 3 - AsyncCopyMultiStage = 4 - AsyncCopySingleStage = 5 - Naive = 6 - NaiveLargeChunk = 7 - -kernelNames = ["AsyncCopyMultiStageLargeChunk", "AsyncCopyLargeChunk", - "AsyncCopyLargeChunkAWBarrier", "AsyncCopyMultiStageSharedState", - "AsyncCopyMultiStage", "AsyncCopySingleStage", "Naive", "NaiveLargeChunk"] - -globalToShmemAsyncCopy = '''\ + AsyncCopyMultiStage = 4 + AsyncCopySingleStage = 5 + Naive = 6 + NaiveLargeChunk = 7 + + +kernelNames = [ + "AsyncCopyMultiStageLargeChunk", + "AsyncCopyLargeChunk", + "AsyncCopyLargeChunkAWBarrier", + "AsyncCopyMultiStageSharedState", + "AsyncCopyMultiStage", + "AsyncCopySingleStage", + "Naive", + "NaiveLargeChunk", +] + +globalToShmemAsyncCopy = """\ #line __LINE__ #if __CUDA_ARCH__ >= 700 #include @@ -41,16 +53,16 @@ class kernels(Enum): #include namespace cg = 
cooperative_groups; -#define BLOCK_SIZE 16 +#define BLOCK_SIZE 16 #define BLOCK_SIZE_X 16 // Multi Stage memcpy_async pipeline with large chunk copy extern "C" -__global__ void MatrixMulAsyncCopyMultiStageLargeChunk(float* __restrict__ C, +__global__ void MatrixMulAsyncCopyMultiStageLargeChunk(float* __restrict__ C, const float* __restrict__ A, const float* __restrict__ B, int wA, int wB) { - // Requires BLOCK_SIZE % 4 == 0 + // Requires BLOCK_SIZE % 4 == 0 // Multi-stage pipeline version constexpr size_t maxPipelineStages = 4; @@ -125,17 +137,17 @@ class kernels(Enum): // Write the block sub-matrix to device memory; // each thread writes four element - int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; + int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; C[c + wB * threadIdx.y + threadIdx.x] = Csub; } // Single Stage memcpy_async pipeline with Large copy chunk (float4) extern "C" -__global__ void MatrixMulAsyncCopyLargeChunk(float* __restrict__ C, +__global__ void MatrixMulAsyncCopyLargeChunk(float* __restrict__ C, const float* __restrict__ A, const float* __restrict__ B, int wA, int wB) { - // Requires BLOCK_SIZE % 4 == 0 + // Requires BLOCK_SIZE % 4 == 0 // Declaration of the shared memory array As used to // store the sub-matrix of A @@ -170,7 +182,7 @@ class kernels(Enum): // Loop over all the sub-matrices of A and B // required to compute the block sub-matrix for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory to shared memory; + // Load the matrices from device memory to shared memory; // a subset of threads loads a contiguous chunk of elements. 
// Previously, per-thread: @@ -216,13 +228,13 @@ class kernels(Enum): // Single Stage memcpy_async pipeline with Large copy chunk (float4) using arrive-wait barrier extern "C" -__global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(float* __restrict__ C, +__global__ void MatrixMulAsyncCopyLargeChunkAWBarrier(float* __restrict__ C, const float* __restrict__ A, const float* __restrict__ B, int wA, int wB) { #if __CUDA_ARCH__ >= 700 #pragma diag_suppress static_var_with_dynamic_init - // Requires BLOCK_SIZE % 4 == 0 + // Requires BLOCK_SIZE % 4 == 0 __shared__ cuda::barrier bar; @@ -261,7 +273,7 @@ class kernels(Enum): // Loop over all the sub-matrices of A and B // required to compute the block sub-matrix for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { - // Load the matrices from device memory to shared memory; + // Load the matrices from device memory to shared memory; // a subset of threads loads a contiguous chunk of elements. // Now, one fourth of the threads load four elements of each matrix @@ -375,7 +387,7 @@ class kernels(Enum): // Multi Stage memcpy_async thread_scope_thread pipeline with single-element async-copy extern "C" -__global__ void MatrixMulAsyncCopyMultiStage(float* __restrict__ C, +__global__ void MatrixMulAsyncCopyMultiStage(float* __restrict__ C, const float* __restrict__ A, const float* __restrict__ B, int wA, int wB) { @@ -461,7 +473,7 @@ class kernels(Enum): // group which issues memcpy_async operations and rest all warps are part of // consumer group which perform gemm computation on the loaded matrices by producer. 
extern "C" -__global__ void MatrixMulAsyncCopyMultiStageSharedState(float* __restrict__ C, +__global__ void MatrixMulAsyncCopyMultiStageSharedState(float* __restrict__ C, const float* __restrict__ A, const float* __restrict__ B, int wA, int wB) { @@ -510,7 +522,7 @@ class kernels(Enum): a <= aEnd; a += aStep, b += bStep, ++i) { if (threadIdx.y >= consumer_row_count) { // this is a whole producer warp because threadIdx.y >= 16 where 16 == consumer_row_count, - // which loads the matrices from device memory to shared memory; + // which loads the matrices from device memory to shared memory; for (; aStage <= a + aStep * maxPipelineStages; aStage += aStep, bStage += bStep, ++iStage) { if (aStage <= aEnd) { // Rotating buffer @@ -518,7 +530,7 @@ class kernels(Enum): const int strideRows = (blockDim.y - consumer_row_count); pipe.producer_acquire(); for (int rowId = threadIdx.y - consumer_row_count; rowId < BLOCK_SIZE_X; rowId += strideRows) { - cuda::memcpy_async(&As[j][rowId][threadIdx.x], + cuda::memcpy_async(&As[j][rowId][threadIdx.x], &A[aStage + wA * rowId + threadIdx.x], shape1, pipe); cuda::memcpy_async(&Bs[j][rowId][threadIdx.x], &B[bStage + wB * rowId + threadIdx.x], shape1, pipe); @@ -529,7 +541,7 @@ class kernels(Enum): } else { // this is a whole set of consumer group because threadIdx.y < consumer_row_count where consumer_row_count == 16, - // which computes gemm operation on matrices loaded in shared memory by producer warp. + // which computes gemm operation on matrices loaded in shared memory by producer warp. const int j = i % maxPipelineStages; // Synchronize consumer group to make sure the matrices are loaded by producer group. 
pipe.consumer_wait(); @@ -663,7 +675,7 @@ class kernels(Enum): a += aStep, b += bStep) { // Load the matrices from device memory - // to shared memory; + // to shared memory; // One fourth of the threads load four elements of each matrix if ( t4x < BLOCK_SIZE ) { @@ -697,13 +709,15 @@ class kernels(Enum): int c = wB * BLOCK_SIZE * blockIdx.y + BLOCK_SIZE * blockIdx.x; C[c + wB * threadIdx.y + threadIdx.x] = Csub; } -''' +""" + def ConstantInit(data, size, val): p_data = (ctypes.c_float * size).from_address(data) for i in range(size): p_data[i] = val + # # Run matrix multiplication using CUDA # @@ -768,74 +782,147 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): gridSharedStateKernel.x = dimsB.x / threadsSharedStateKernel.x gridSharedStateKernel.y = dimsA.y / threadsSharedStateKernel.x - print("Running kernel = {} - {}".format(kernel_number, kernelNames[kernel_number.value])) + print(f"Running kernel = {kernel_number} - {kernelNames[kernel_number.value]}") # Create and start timer print("Computing result using CUDA Kernel...") # Performs warmup operation using matrixMul CUDA kernel - kernelArguments = ((d_C, d_A, d_B, dimsA.x, dimsB.x), - (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int)) + kernelArguments = ( + (d_C, d_A, d_B, dimsA.x, dimsB.x), + (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int), + ) if kernel_number == kernels.AsyncCopyMultiStageLargeChunk: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStageLargeChunk, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopyMultiStageLargeChunk, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.AsyncCopyLargeChunk: - 
checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyLargeChunk, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopyLargeChunk, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyLargeChunkAWBarrier, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopyLargeChunkAWBarrier, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.AsyncCopyMultiStageSharedState: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStageSharedState, - gridSharedStateKernel.x, gridSharedStateKernel.y, gridSharedStateKernel.z, # grid dim - threadsSharedStateKernel.x, threadsSharedStateKernel.y, threadsSharedStateKernel.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopyMultiStageSharedState, + gridSharedStateKernel.x, + gridSharedStateKernel.y, + gridSharedStateKernel.z, # grid dim + threadsSharedStateKernel.x, + threadsSharedStateKernel.y, + threadsSharedStateKernel.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.AsyncCopyMultiStage: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStage, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # 
shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopyMultiStage, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.AsyncCopySingleStage: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopySingleStage, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopySingleStage, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.Naive: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulNaive, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulNaive, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.NaiveLargeChunk: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulNaiveLargeChunk, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments - - print('done') + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulNaiveLargeChunk, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments + + print("done") checkCudaErrors(cudart.cudaStreamSynchronize(stream)) - # Execute the kernel nIter = 100 @@ -843,61 +930,133 @@ def 
MatrixMultiply(dimsA, dimsB, kernel_number): checkCudaErrors(cudart.cudaEventRecord(start, stream)) if kernel_number == kernels.AsyncCopyMultiStageLargeChunk: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStageLargeChunk, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopyMultiStageLargeChunk, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.AsyncCopyLargeChunk: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyLargeChunk, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopyLargeChunk, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.AsyncCopyLargeChunkAWBarrier: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyLargeChunkAWBarrier, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopyLargeChunkAWBarrier, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.AsyncCopyMultiStageSharedState: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStageSharedState, - gridSharedStateKernel.x, gridSharedStateKernel.y, gridSharedStateKernel.z, # grid dim - threadsSharedStateKernel.x, 
threadsSharedStateKernel.y, threadsSharedStateKernel.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopyMultiStageSharedState, + gridSharedStateKernel.x, + gridSharedStateKernel.y, + gridSharedStateKernel.z, # grid dim + threadsSharedStateKernel.x, + threadsSharedStateKernel.y, + threadsSharedStateKernel.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.AsyncCopyMultiStage: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopyMultiStage, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopyMultiStage, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.AsyncCopySingleStage: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulAsyncCopySingleStage, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulAsyncCopySingleStage, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.Naive: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulNaive, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulNaive, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + 
kernelArguments, + 0, + ) + ) # arguments elif kernel_number == kernels.NaiveLargeChunk: - checkCudaErrors(cuda.cuLaunchKernel(_MatrixMulNaiveLargeChunk, - grid.x, grid.y, grid.z, # grid dim - threads.x, threads.y, threads.z, # block dim - 0, # shared mem - stream, # stream - kernelArguments, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + _MatrixMulNaiveLargeChunk, + grid.x, + grid.y, + grid.z, # grid dim + threads.x, + threads.y, + threads.z, # block dim + 0, # shared mem + stream, # stream + kernelArguments, + 0, + ) + ) # arguments # Record the stop event checkCudaErrors(cudart.cudaEventRecord(stop, stream)) @@ -912,11 +1071,9 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): flopsPerMatrixMul = 2.0 * dimsA.x * dimsA.y * dimsB.x gigaFlops = (flopsPerMatrixMul * 1.0e-9) / (msecPerMatrixMul / 1000.0) - print("Performance= {:.2f} GFlop/s, Time= {:.2f} msec, Size= {:.0f} Ops, WorkgroupSize= {} threads/block".format( - gigaFlops, - msecPerMatrixMul, - flopsPerMatrixMul, - threads.x * threads.y)) + print( + f"Performance= {gigaFlops:.2f} GFlop/s, Time= {msecPerMatrixMul:.2f} msec, Size= {flopsPerMatrixMul:.0f} Ops, WorkgroupSize= {threads.x * threads.y} threads/block" + ) # Copy result from device to host checkCudaErrors(cudart.cudaMemcpyAsync(h_C, d_C, mem_size_C, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)) @@ -927,7 +1084,7 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): # test relative error by the formula # |_cpu - _gpu|/<|x|, |y|> < eps - eps = 1.e-6 + eps = 1.0e-6 h_C_local = (ctypes.c_float * (dimsC.x * dimsC.y)).from_address(h_C) for i in range(dimsC.x * dimsC.y): @@ -937,10 +1094,10 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): rel_err = abs_err / abs_val / dot_length if rel_err > eps: - print("Error! Matrix[{:.5f}]={:.8f} ref={:.8f} err term is > {}".format(i, h_C_local[i], dimsA.x * valB, rel_err)) + print(f"Error! 
Matrix[{i:.5f}]={h_C_local[i]:.8f} ref={dimsA.x * valB:.8f} err term is > {rel_err}") correct = False - print("Result = PASS" if correct else "Result = FAIL") + print("Result = PASS" if correct else "Result = FAIL") # Clean up memory checkCudaErrors(cudart.cudaFreeHost(h_A)) @@ -951,15 +1108,17 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): checkCudaErrors(cudart.cudaFree(d_C)) checkCudaErrors(cudart.cudaEventDestroy(start)) checkCudaErrors(cudart.cudaEventDestroy(stop)) - print("\nNOTE: The CUDA Samples are not meant for performance "\ - "measurements. Results may vary when GPU Boost is enabled."); - + print( + "\nNOTE: The CUDA Samples are not meant for performance " + "measurements. Results may vary when GPU Boost is enabled." + ) if correct: return 0 return -1 + def checkKernelCompiles(): - kernel_headers = '''\ + kernel_headers = """\ #line __LINE__ #if __CUDA_ARCH__ >= 700 #include @@ -967,7 +1126,7 @@ def checkKernelCompiles(): #include #include #include - ''' + """ try: common.KernelHelper(kernel_headers, findCudaDevice()) except: @@ -977,6 +1136,7 @@ def checkKernelCompiles(): return False return True + @pytest.mark.skipif(not checkKernelCompiles(), reason="Automation filter against incompatible kernel") def main(): print("[globalToShmemAsyncCopy] - Starting...") @@ -986,13 +1146,15 @@ def main(): print("CUDA Toolkit 11.1 or greater is required") return - if (checkCmdLineFlag("help") or checkCmdLineFlag("?")): + if checkCmdLineFlag("help") or checkCmdLineFlag("?"): print("Usage device=n (n >= 0 for deviceID)") print(" wA=WidthA hA=HeightA (Width x Height of Matrix A)") print(" wB=WidthB hB=HeightB (Width x Height of Matrix B)") print(" kernel=kernel_number (0 - AsyncCopyMultiStageLargeChunk; 1 - AsyncCopyLargeChunk)") print(" (2 - AsyncCopyLargeChunkAWBarrier; 3 - AsyncCopyMultiStageSharedState)") - print(" (4 - AsyncCopyMultiStage; 5 - AsyncCopySingleStage; 6 - Naive without memcpy_async)") + print( + " (4 - AsyncCopyMultiStage; 5 - 
AsyncCopySingleStage; 6 - Naive without memcpy_async)" + ) print(" (7 - NaiveLargeChunk without memcpy_async)") print(" Note: Outer matrix dimensions of A & B matrices must be equal.") return @@ -1026,7 +1188,7 @@ def main(): dimsB.y = int(getCmdLineArgumentInt("hB=")) if dimsA.x != dimsB.y: - print("Error: outer matrix dimensions must be equal. ({} != {})".format(dimsA.x, dimsB.y)) + print(f"Error: outer matrix dimensions must be equal. ({dimsA.x} != {dimsB.y})") sys.exit(-1) selected_kernel = kernels.AsyncCopyMultiStageLargeChunk @@ -1037,16 +1199,17 @@ def main(): if kernel_number < 8: selected_kernel = kernels(kernel_number) else: - print("Error: kernel number should be between 0 to 7, you have entered %d".format(kernel_number)) + print("Error: kernel number should be between 0 to 7, you have entered %d".format()) sys.exit(-1) - major = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID)) + major = checkCudaErrors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + ) if major < 7: print("globalToShmemAsyncCopy requires SM 7.0 or higher. 
Exiting...") return - print("MatrixA({},{}), MatrixB({},{})".format(dimsA.x, dimsA.y, - dimsB.x, dimsB.y)) + print(f"MatrixA({dimsA.x},{dimsA.y}), MatrixB({dimsB.x},{dimsB.y})") global _MatrixMulAsyncCopyMultiStageLargeChunk global _MatrixMulAsyncCopyLargeChunk @@ -1057,19 +1220,20 @@ def main(): global _MatrixMulNaive global _MatrixMulNaiveLargeChunk kernelHelper = common.KernelHelper(globalToShmemAsyncCopy, devID) - _MatrixMulAsyncCopyMultiStageLargeChunk = kernelHelper.getFunction(b'MatrixMulAsyncCopyMultiStageLargeChunk') - _MatrixMulAsyncCopyLargeChunk = kernelHelper.getFunction(b'MatrixMulAsyncCopyLargeChunk') - _MatrixMulAsyncCopyLargeChunkAWBarrier = kernelHelper.getFunction(b'MatrixMulAsyncCopyLargeChunkAWBarrier') - _MatrixMulAsyncCopyMultiStageSharedState = kernelHelper.getFunction(b'MatrixMulAsyncCopyMultiStageSharedState') - _MatrixMulAsyncCopyMultiStage = kernelHelper.getFunction(b'MatrixMulAsyncCopyMultiStage') - _MatrixMulAsyncCopySingleStage = kernelHelper.getFunction(b'MatrixMulAsyncCopySingleStage') - _MatrixMulNaive = kernelHelper.getFunction(b'MatrixMulNaive') - _MatrixMulNaiveLargeChunk = kernelHelper.getFunction(b'MatrixMulNaiveLargeChunk') + _MatrixMulAsyncCopyMultiStageLargeChunk = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageLargeChunk") + _MatrixMulAsyncCopyLargeChunk = kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunk") + _MatrixMulAsyncCopyLargeChunkAWBarrier = kernelHelper.getFunction(b"MatrixMulAsyncCopyLargeChunkAWBarrier") + _MatrixMulAsyncCopyMultiStageSharedState = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStageSharedState") + _MatrixMulAsyncCopyMultiStage = kernelHelper.getFunction(b"MatrixMulAsyncCopyMultiStage") + _MatrixMulAsyncCopySingleStage = kernelHelper.getFunction(b"MatrixMulAsyncCopySingleStage") + _MatrixMulNaive = kernelHelper.getFunction(b"MatrixMulNaive") + _MatrixMulNaiveLargeChunk = kernelHelper.getFunction(b"MatrixMulNaiveLargeChunk") matrix_result = MatrixMultiply(dimsA, dimsB, 
selected_kernel) if matrix_result != 0: sys.exit(-1) + if __name__ == "__main__": main() diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py index 7a895acb8..ee8343632 100644 --- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py @@ -6,17 +6,19 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. import ctypes +import random as rnd + import numpy as np import pytest -import random as rnd -from cuda import cuda, cudart from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice +from cuda import cuda, cudart + THREADS_PER_BLOCK = 512 GRAPH_LAUNCH_ITERATIONS = 3 -simpleCudaGraphs = '''\ +simpleCudaGraphs = """\ #include #include @@ -113,7 +115,8 @@ // write result for this block to global mem if (cta.thread_rank() == 0) result[0] = temp_sum; } -''' +""" + def init_input(a, size): ctypes.c_float.from_address(a) @@ -121,6 +124,7 @@ def init_input(a, size): for i in range(0, size): a_list[i] = rnd.random() + def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks): result_h = ctypes.c_double(0.0) nodeDependencies = [] @@ -133,17 +137,21 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n memcpyParams.srcArray = None memcpyParams.srcPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.srcPtr = cudart.make_cudaPitchedPtr(inputVec_h, np.dtype(np.float32).itemsize * inputSize, inputSize, 1) + memcpyParams.srcPtr = cudart.make_cudaPitchedPtr( + inputVec_h, np.dtype(np.float32).itemsize * inputSize, inputSize, 1 + ) memcpyParams.dstArray = None memcpyParams.dstPos = cudart.make_cudaPos(0, 0, 0) - memcpyParams.dstPtr = cudart.make_cudaPitchedPtr(inputVec_d, np.dtype(np.float32).itemsize * inputSize, inputSize, 1) + memcpyParams.dstPtr = cudart.make_cudaPitchedPtr( + 
inputVec_d, np.dtype(np.float32).itemsize * inputSize, inputSize, 1 + ) memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float32).itemsize * inputSize, 1, 1) memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice memsetParams.dst = outputVec_d memsetParams.value = 0 memsetParams.pitch = 0 - memsetParams.elementSize = np.dtype(np.float32).itemsize # elementSize can be max 4 bytes + memsetParams.elementSize = np.dtype(np.float32).itemsize # elementSize can be max 4 bytes memsetParams.width = numOfBlocks * 2 memsetParams.height = 1 @@ -155,8 +163,10 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n nodeDependencies.append(memsetNode) nodeDependencies.append(memcpyNode) - kernelArgs = ((inputVec_d, outputVec_d, inputSize, numOfBlocks), - (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint)) + kernelArgs = ( + (inputVec_d, outputVec_d, inputSize, numOfBlocks), + (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint), + ) kernelNodeParams.func = _reduce kernelNodeParams.gridDimX = numOfBlocks @@ -167,7 +177,9 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n kernelNodeParams.kernelParams = kernelArgs # kernelNodeParams.extra = None - kernelNode = checkCudaErrors(cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams)) + kernelNode = checkCudaErrors( + cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams) + ) nodeDependencies.clear() nodeDependencies.append(kernelNode) @@ -188,12 +200,16 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n kernelNodeParams.blockDimX = THREADS_PER_BLOCK kernelNodeParams.blockDimY = kernelNodeParams.blockDimZ = 1 kernelNodeParams.sharedMemBytes = 0 - kernelArgs2 = ((outputVec_d, result_d, numOfBlocks), - (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint)) + kernelArgs2 = ( + (outputVec_d, result_d, numOfBlocks), + 
(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint), + ) kernelNodeParams.kernelParams = kernelArgs2 # kernelNodeParams.extra = None - kernelNode = checkCudaErrors(cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams)) + kernelNode = checkCudaErrors( + cuda.cuGraphAddKernelNode(graph, nodeDependencies, len(nodeDependencies), kernelNodeParams) + ) nodeDependencies.clear() nodeDependencies.append(kernelNode) @@ -208,7 +224,9 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n memcpyParams.dstPtr = cudart.make_cudaPitchedPtr(result_h, np.dtype(np.float64).itemsize, 1, 1) memcpyParams.extent = cudart.make_cudaExtent(np.dtype(np.float64).itemsize, 1, 1) memcpyParams.kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - memcpyNode = checkCudaErrors(cudart.cudaGraphAddMemcpyNode(graph, nodeDependencies, len(nodeDependencies), memcpyParams)) + memcpyNode = checkCudaErrors( + cudart.cudaGraphAddMemcpyNode(graph, nodeDependencies, len(nodeDependencies), memcpyParams) + ) nodeDependencies.clear() nodeDependencies.append(memcpyNode) @@ -216,20 +234,20 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n # WIP: Host nodes nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph)) - print("\nNum of nodes in the graph created manually = {}".format(numNodes)) + print(f"\nNum of nodes in the graph created manually = {numNodes}") graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0)) clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph)) clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0)) - for i in range(GRAPH_LAUNCH_ITERATIONS): + for _i in range(GRAPH_LAUNCH_ITERATIONS): checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph)) checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) print("Cloned Graph Output..") - for i in range(GRAPH_LAUNCH_ITERATIONS): + for _i in range(GRAPH_LAUNCH_ITERATIONS): 
checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph)) checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) @@ -240,6 +258,7 @@ def cudaGraphsManual(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, n checkCudaErrors(cudart.cudaGraphDestroy(clonedGraph)) checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph)) + def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, inputSize, numOfBlocks): result_h = ctypes.c_double(0.0) @@ -258,9 +277,15 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, checkCudaErrors(cudart.cudaStreamWaitEvent(stream2, forkStreamEvent, 0)) checkCudaErrors(cudart.cudaStreamWaitEvent(stream3, forkStreamEvent, 0)) - checkCudaErrors(cudart.cudaMemcpyAsync(inputVec_d, inputVec_h, - np.dtype(np.float32).itemsize * inputSize, cudart.cudaMemcpyKind.cudaMemcpyDefault, - stream1)) + checkCudaErrors( + cudart.cudaMemcpyAsync( + inputVec_d, + inputVec_h, + np.dtype(np.float32).itemsize * inputSize, + cudart.cudaMemcpyKind.cudaMemcpyDefault, + stream1, + ) + ) checkCudaErrors(cudart.cudaMemsetAsync(outputVec_d, 0, np.dtype(np.float64).itemsize * numOfBlocks, stream2)) @@ -271,46 +296,63 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent1, 0)) - kernelArgs = ((inputVec_d, outputVec_d, inputSize, numOfBlocks), - (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint)) - checkCudaErrors(cuda.cuLaunchKernel(_reduce, - numOfBlocks, 1, 1, - THREADS_PER_BLOCK, 1, 1, - 0, stream1, - kernelArgs, 0)) + kernelArgs = ( + (inputVec_d, outputVec_d, inputSize, numOfBlocks), + (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint), + ) + checkCudaErrors( + cuda.cuLaunchKernel( + _reduce, + numOfBlocks, + 1, + 1, + THREADS_PER_BLOCK, + 1, + 1, + 0, + stream1, + kernelArgs, + 0, + ) + ) checkCudaErrors(cudart.cudaStreamWaitEvent(stream1, memsetEvent2, 0)) - 
kernelArgs2 = ((outputVec_d, result_d, numOfBlocks), - (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint)) - checkCudaErrors(cuda.cuLaunchKernel(_reduceFinal, - 1, 1, 1, - THREADS_PER_BLOCK, 1, 1, - 0, stream1, - kernelArgs2, 0)) - - checkCudaErrors(cudart.cudaMemcpyAsync(result_h, result_d, np.dtype(np.float64).itemsize, - cudart.cudaMemcpyKind.cudaMemcpyDefault, stream1)) + kernelArgs2 = ( + (outputVec_d, result_d, numOfBlocks), + (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint), + ) + checkCudaErrors(cuda.cuLaunchKernel(_reduceFinal, 1, 1, 1, THREADS_PER_BLOCK, 1, 1, 0, stream1, kernelArgs2, 0)) + + checkCudaErrors( + cudart.cudaMemcpyAsync( + result_h, + result_d, + np.dtype(np.float64).itemsize, + cudart.cudaMemcpyKind.cudaMemcpyDefault, + stream1, + ) + ) # WIP: Host nodes graph = checkCudaErrors(cudart.cudaStreamEndCapture(stream1)) nodes, numNodes = checkCudaErrors(cudart.cudaGraphGetNodes(graph)) - print("\nNum of nodes in the graph created using stream capture API = {}".format(numNodes)) + print(f"\nNum of nodes in the graph created using stream capture API = {numNodes}") graphExec = checkCudaErrors(cudart.cudaGraphInstantiate(graph, 0)) clonedGraph = checkCudaErrors(cudart.cudaGraphClone(graph)) clonedGraphExec = checkCudaErrors(cudart.cudaGraphInstantiate(clonedGraph, 0)) - for i in range(GRAPH_LAUNCH_ITERATIONS): + for _i in range(GRAPH_LAUNCH_ITERATIONS): checkCudaErrors(cudart.cudaGraphLaunch(graphExec, streamForGraph)) checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) print("Cloned Graph Output..") - for i in range(GRAPH_LAUNCH_ITERATIONS): + for _i in range(GRAPH_LAUNCH_ITERATIONS): checkCudaErrors(cudart.cudaGraphLaunch(clonedGraphExec, streamForGraph)) checkCudaErrors(cudart.cudaStreamSynchronize(streamForGraph)) @@ -323,10 +365,11 @@ def cudaGraphsUsingStreamCapture(inputVec_h, inputVec_d, outputVec_d, result_d, checkCudaErrors(cudart.cudaStreamDestroy(stream2)) checkCudaErrors(cudart.cudaStreamDestroy(streamForGraph)) + def 
checkKernelCompiles(): - kernel_headers = '''\ + kernel_headers = """\ #include - ''' + """ try: common.KernelHelper(kernel_headers, findCudaDevice()) except: @@ -338,9 +381,10 @@ def checkKernelCompiles(): return False return True + @pytest.mark.skipif(not checkKernelCompiles(), reason="Automation filter against incompatible kernel") def main(): - size = 1 << 24 # number of elements to reduce + size = 1 << 24 # number of elements to reduce maxBlocks = 512 # This will pick the best possible CUDA capable device @@ -349,12 +393,12 @@ def main(): global _reduce global _reduceFinal kernelHelper = common.KernelHelper(simpleCudaGraphs, devID) - _reduce = kernelHelper.getFunction(b'reduce') - _reduceFinal = kernelHelper.getFunction(b'reduceFinal') + _reduce = kernelHelper.getFunction(b"reduce") + _reduceFinal = kernelHelper.getFunction(b"reduceFinal") - print("{} elements".format(size)) - print("threads per block = {}".format(THREADS_PER_BLOCK)) - print("Graph Launch iterations = {}".format(GRAPH_LAUNCH_ITERATIONS)) + print(f"{size} elements") + print(f"threads per block = {THREADS_PER_BLOCK}") + print(f"Graph Launch iterations = {GRAPH_LAUNCH_ITERATIONS}") inputVec_h = checkCudaErrors(cudart.cudaMallocHost(size * np.dtype(np.float32).itemsize)) inputVec_d = checkCudaErrors(cudart.cudaMalloc(size * np.dtype(np.float32).itemsize)) @@ -371,5 +415,6 @@ def main(): checkCudaErrors(cudart.cudaFree(result_d)) checkCudaErrors(cudart.cudaFreeHost(inputVec_h)) + if __name__ == "__main__": main() diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py index 6f64066f8..38be5798f 100644 --- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py +++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py @@ -7,14 +7,16 @@ # is strictly prohibited. 
import ctypes import math -import numpy as np import sys -from cuda import cuda, cudart +from random import random + +import numpy as np from common import common from common.helper_cuda import checkCudaErrors, findCudaDevice -from random import random -conjugateGradientMultiBlockCG = '''\ +from cuda import cuda, cudart + +conjugateGradientMultiBlockCG = """\ #line __LINE__ #include #include @@ -160,12 +162,13 @@ k++; } } -''' +""" + def genTridiag(I, J, val, N, nz): - I[0] = 0 + I[0] = 0 J[0] = 0 - J[1]= 0 + J[1] = 0 val[0] = float(random()) + 10.0 val[1] = float(random()) @@ -190,13 +193,15 @@ def genTridiag(I, J, val, N, nz): val[start + 2] = float(random()) I[N] = nz + THREADS_PER_BLOCK = 512 -sSDKname = "conjugateGradientMultiBlockCG"; +sSDKname = "conjugateGradientMultiBlockCG" + + def main(): tol = 1e-5 - print("Starting [%s]...\n" % sSDKname); - + print(f"Starting [{sSDKname}]...\n") # WAIVE: Due to bug in NVRTC return @@ -212,23 +217,24 @@ def main(): # This sample requires being run on a device that supports Cooperative Kernel # Launch if not deviceProp.cooperativeLaunch: - print("\nSelected GPU (%d) does not support Cooperative Kernel Launch, Waiving the run" % - (devID)) + print("\nSelected GPU (%d) does not support Cooperative Kernel Launch, Waiving the run" % (devID)) return # Statistics about the GPU device - print("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n" % - (deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor)) + print( + "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n" + % (deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor) + ) # Get kernel kernelHelper = common.KernelHelper(conjugateGradientMultiBlockCG, devID) - _gpuConjugateGradient = kernelHelper.getFunction(b'gpuConjugateGradient') + _gpuConjugateGradient = kernelHelper.getFunction(b"gpuConjugateGradient") # Generate a random tridiagonal symmetric matrix in CSR format N = 1048576 nz = (N - 2) * 3 + 4 - I = 
checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (N+1), cudart.cudaMemAttachGlobal)) + I = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * (N + 1), cudart.cudaMemAttachGlobal)) J = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.int32).itemsize * nz, cudart.cudaMemAttachGlobal)) val = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * nz, cudart.cudaMemAttachGlobal)) I_local = (ctypes.c_int * (N + 1)).from_address(I) @@ -250,8 +256,6 @@ def main(): p = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) Ax = checkCudaErrors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * N, cudart.cudaMemAttachGlobal)) r_local = (ctypes.c_float * N).from_address(r) - p_local = (ctypes.c_float * N).from_address(p) - Ax_local = (ctypes.c_float * N).from_address(Ax) checkCudaErrors(cudart.cudaDeviceSynchronize()) @@ -262,18 +266,27 @@ def main(): r_local[i] = rhs_local[i] = 1.0 x_local[i] = 0.0 - kernelArgs_value = (I, J, val, x, - Ax, p, r, dot_result, - nz, N, tol) - kernelArgs_types = (ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, - ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, - ctypes.c_int, ctypes.c_int, ctypes.c_float) + kernelArgs_value = (I, J, val, x, Ax, p, r, dot_result, nz, N, tol) + kernelArgs_types = ( + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_int, + ctypes.c_float, + ) kernelArgs = (kernelArgs_value, kernelArgs_types) - sMemSize = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK/32) + 1) + sMemSize = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK / 32) + 1) numThreads = THREADS_PER_BLOCK - numBlocksPerSm = checkCudaErrors(cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor( - _gpuConjugateGradient, numThreads, sMemSize)) + numBlocksPerSm = 
checkCudaErrors( + cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpuConjugateGradient, numThreads, sMemSize) + ) numSms = deviceProp.multiProcessorCount dimGrid = cudart.dim3() dimGrid.x = numSms * numBlocksPerSm @@ -285,24 +298,31 @@ def main(): dimBlock.z = 1 checkCudaErrors(cudart.cudaEventRecord(start, 0)) - checkCudaErrors(cuda.cuLaunchCooperativeKernel(_gpuConjugateGradient, - dimGrid.x, dimGrid.y, dimGrid.z, - dimBlock.x, dimBlock.y, dimBlock.z, - 0, 0, - kernelArgs)) + checkCudaErrors( + cuda.cuLaunchCooperativeKernel( + _gpuConjugateGradient, + dimGrid.x, + dimGrid.y, + dimGrid.z, + dimBlock.x, + dimBlock.y, + dimBlock.z, + 0, + 0, + kernelArgs, + ) + ) checkCudaErrors(cudart.cudaEventRecord(stop, 0)) checkCudaErrors(cudart.cudaDeviceSynchronize()) - time = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop)); - - print("GPU Final, residual = %e, kernel execution time = %f ms" % - (math.sqrt(dot_result_local), time)) + time = checkCudaErrors(cudart.cudaEventElapsedTime(start, stop)) + print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms") err = 0.0 for i in range(N): rsum = 0.0 - for j in range(I_local[i], I_local[i+1]): + for j in range(I_local[i], I_local[i + 1]): rsum += val_local[j] * x_local[J_local[j]] diff = math.fabs(rsum - rhs_local[i]) @@ -322,9 +342,8 @@ def main(): checkCudaErrors(cudart.cudaEventDestroy(start)) checkCudaErrors(cudart.cudaEventDestroy(stop)) - print("Test Summary: Error amount = %f" % err) - print("&&&& conjugateGradientMultiBlockCG %s\n" % - ("PASSED" if math.sqrt(dot_result_local) < tol else "FAILED")) + print(f"Test Summary: Error amount = {err:f}") + print("&&&& conjugateGradientMultiBlockCG %s\n" % ("PASSED" if math.sqrt(dot_result_local) < tol else "FAILED")) if math.sqrt(dot_result_local) >= tol: sys.exit(-1) diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py index c24322f94..ec55c1ac5 100644 --- 
a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/examples/common/common.py @@ -5,39 +5,50 @@ # this software. Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. -import ctypes -import numpy as np import os -from cuda import cuda, cudart, nvrtc + +import numpy as np from common.helper_cuda import checkCudaErrors +from cuda import cuda, cudart, nvrtc + + class KernelHelper: def __init__(self, code, devID): - prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b'sourceCode.cu', 0, None, None)) - CUDA_HOME = os.getenv('CUDA_HOME') - if CUDA_HOME == None: - CUDA_HOME = os.getenv('CUDA_PATH') - if CUDA_HOME == None: - raise RuntimeError('Environment variable CUDA_HOME or CUDA_PATH is not set') - include_dirs = os.path.join(CUDA_HOME, 'include') + prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) + CUDA_HOME = os.getenv("CUDA_HOME") + if CUDA_HOME is None: + CUDA_HOME = os.getenv("CUDA_PATH") + if CUDA_HOME is None: + raise RuntimeError("Environment variable CUDA_HOME or CUDA_PATH is not set") + include_dirs = os.path.join(CUDA_HOME, "include") # Initialize CUDA checkCudaErrors(cudart.cudaFree(0)) - major = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID)) - minor = checkCudaErrors(cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID)) + major = checkCudaErrors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, devID) + ) + minor = checkCudaErrors( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, devID) + ) _, nvrtc_minor = checkCudaErrors(nvrtc.nvrtcVersion()) - use_cubin = (nvrtc_minor >= 1) - prefix = 'sm' if use_cubin else 'compute' - arch_arg = bytes(f'--gpu-architecture={prefix}_{major}{minor}', 'ascii') + 
use_cubin = nvrtc_minor >= 1 + prefix = "sm" if use_cubin else "compute" + arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii") try: - opts = [b'--fmad=true', arch_arg, '--include-path={}'.format(include_dirs).encode('UTF-8'), - b'--std=c++11', b'-default-device'] + opts = [ + b"--fmad=true", + arch_arg, + f"--include-path={include_dirs}".encode(), + b"--std=c++11", + b"-default-device", + ] checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, len(opts), opts)) except RuntimeError as err: logSize = checkCudaErrors(nvrtc.nvrtcGetProgramLogSize(prog)) - log = b' ' * logSize + log = b" " * logSize checkCudaErrors(nvrtc.nvrtcGetProgramLog(prog, log)) print(log.decode()) print(err) @@ -45,11 +56,11 @@ def __init__(self, code, devID): if use_cubin: dataSize = checkCudaErrors(nvrtc.nvrtcGetCUBINSize(prog)) - data = b' ' * dataSize + data = b" " * dataSize checkCudaErrors(nvrtc.nvrtcGetCUBIN(prog, data)) else: dataSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog)) - data = b' ' * dataSize + data = b" " * dataSize checkCudaErrors(nvrtc.nvrtcGetPTX(prog, data)) self.module = checkCudaErrors(cuda.cuModuleLoadData(np.char.array(data))) diff --git a/cuda_bindings/examples/common/helper_cuda.py b/cuda_bindings/examples/common/helper_cuda.py index cbd0d2da4..6cc4026dd 100644 --- a/cuda_bindings/examples/common/helper_cuda.py +++ b/cuda_bindings/examples/common/helper_cuda.py @@ -5,8 +5,10 @@ # this software. Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
+from common.helper_string import checkCmdLineFlag, getCmdLineArgumentInt + from cuda import cuda, cudart, nvrtc -from common.helper_string import getCmdLineArgumentInt, checkCmdLineFlag + def _cudaGetErrorEnum(error): if isinstance(error, cuda.CUresult): @@ -17,11 +19,12 @@ def _cudaGetErrorEnum(error): elif isinstance(error, nvrtc.nvrtcResult): return nvrtc.nvrtcGetErrorString(error)[1] else: - raise RuntimeError('Unknown error type: {}'.format(error)) + raise RuntimeError(f"Unknown error type: {error}") + def checkCudaErrors(result): if result[0].value: - raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) + raise RuntimeError(f"CUDA error code={result[0].value}({_cudaGetErrorEnum(result[0])})") if len(result) == 1: return None elif len(result) == 2: @@ -29,6 +32,7 @@ def checkCudaErrors(result): else: return result[1:] + def findCudaDevice(): devID = 0 if checkCmdLineFlag("device="): @@ -36,6 +40,7 @@ def findCudaDevice(): checkCudaErrors(cudart.cudaSetDevice(devID)) return devID + def findCudaDeviceDRV(): devID = 0 if checkCmdLineFlag("device="): diff --git a/cuda_bindings/examples/common/helper_string.py b/cuda_bindings/examples/common/helper_string.py index 1e0d65f1f..7677047a3 100644 --- a/cuda_bindings/examples/common/helper_string.py +++ b/cuda_bindings/examples/common/helper_string.py @@ -7,18 +7,13 @@ # is strictly prohibited. 
import sys + def checkCmdLineFlag(stringRef): - k = 0 - for i in sys.argv: - if stringRef == i and k < len(sys.argv) - 1: - return True - k += 1 - return False + return any(stringRef == i and k < len(sys.argv) - 1 for i, k in enumerate(sys.argv)) + def getCmdLineArgumentInt(stringRef): - k = 0 - for i in sys.argv: + for i, k in enumerate(sys.argv): if stringRef == i and k < len(sys.argv) - 1: - return sys.argv[k+1] - k += 1 + return sys.argv[k + 1] return 0 diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py index dd4781824..01e5f5714 100644 --- a/cuda_bindings/examples/extra/isoFDModelling_test.py +++ b/cuda_bindings/examples/extra/isoFDModelling_test.py @@ -5,13 +5,15 @@ # this software. Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. -import numpy as np import time -from cuda import cuda, cudart + +import numpy as np from common import common from common.helper_cuda import checkCudaErrors -isoPropagator = '''\ +from cuda import cuda, cudart + +isoPropagator = """\ extern "C" __global__ void injectSource(float *__restrict__ in, float *__restrict__ src, int it) { @@ -153,52 +155,65 @@ idx_in += stride; } } -''' +""" display_graph = False verbose_prints = False + def align_nx(nx, blk, nops): - n_align = (int)((nx - 1)/blk) + 1 + n_align = (int)((nx - 1) / blk) + 1 n_align *= blk - n_align += 2*nops + n_align += 2 * nops n_align = (int)((n_align - 1) / 64) + 1 n_align *= 64 return (int)(n_align) + def align_ny(ny, blk, nops): - n_align = (int)((ny - 1)/blk) + 1 + n_align = (int)((ny - 1) / blk) + 1 n_align *= blk - n_align += 2*nops + n_align += 2 * nops return (int)(n_align) + # # this class contains the input params # -class params (): +class params: def __init__(self): - self.BDIMX = 32 # tiles x y for fd operators + self.BDIMX = 32 # tiles x y for fd operators self.BDIMY = 16 self.FD_ORDER = 4 
- self.lead = 64 - self.FD_ORDER - self.nx = align_nx(700, 2*self.BDIMX, self.FD_ORDER) + self.lead = 64 - self.FD_ORDER + self.nx = align_nx(700, 2 * self.BDIMX, self.FD_ORDER) self.ny = align_ny(600, self.BDIMY, self.FD_ORDER) - self.blkx = (int) ((self.nx - 2*self.FD_ORDER) / (2*self.BDIMX)) - self.blky = (int) ((self.ny - 2*self.FD_ORDER) / self.BDIMY) + self.blkx = (int)((self.nx - 2 * self.FD_ORDER) / (2 * self.BDIMX)) + self.blky = (int)((self.ny - 2 * self.FD_ORDER) / self.BDIMY) - self.nz = (int)(200) + self.nz = 200 self.delta = 25.0 - self.dt = 0.3 * 1000.0 * self.delta / 4500.0 + self.dt = 0.3 * 1000.0 * self.delta / 4500.0 self.tmax_propag = 1000.0 self.nt = int(self.tmax_propag / self.dt) - self.freqMax = 3.5* 1000.0 / (4.0 * self.delta) - print("dt= ",self.dt, " delta= ", self.delta, " nt= ", self.nt, " freq max= " , self.freqMax) + self.freqMax = 3.5 * 1000.0 / (4.0 * self.delta) + print( + "dt= ", + self.dt, + " delta= ", + self.delta, + " nt= ", + self.nt, + " freq max= ", + self.freqMax, + ) + # # this class contains all the kernels to be used bu propagator # -class cudaKernels(): - def __init__ (self, cntx): +class cudaKernels: + def __init__(self, cntx): checkCudaErrors(cuda.cuInit(0)) checkCudaErrors(cuda.cuCtxSetCurrent(cntx)) dev = checkCudaErrors(cuda.cuCtxGetDevice()) @@ -206,15 +221,16 @@ def __init__ (self, cntx): self.kernelHelper = common.KernelHelper(isoPropagator, int(dev)) # kernel to create a source fnction with some max frequency - self.creatSource = self.kernelHelper.getFunction(b'createSource') + self.creatSource = self.kernelHelper.getFunction(b"createSource") # create a velocity to try things: just a sphere on the middle 4500 m/s and 2500 m/s all around - self.createVelocity = self.kernelHelper.getFunction(b'createVelocity') + self.createVelocity = self.kernelHelper.getFunction(b"createVelocity") # kernel to propagate the wavefield by 1 step in time - self.fdPropag = self.kernelHelper.getFunction(b'fwd_3D_orderX2k') + 
self.fdPropag = self.kernelHelper.getFunction(b"fwd_3D_orderX2k") # kernel to propagate the wavefield by 1 step in time - self.injectSource = self.kernelHelper.getFunction(b'injectSource') + self.injectSource = self.kernelHelper.getFunction(b"injectSource") + # # this class contains: propagator, source creation, velocity creation @@ -249,7 +265,7 @@ def __del__(self): # def swap(self): if verbose_prints: - print("swap in out ", int(self.waveIn), " " , int(self.waveOut)) + print("swap in out ", int(self.waveIn), " ", int(self.waveOut)) i = int(self.waveIn) j = int(self.waveOut) a = i @@ -262,8 +278,8 @@ def swap(self): # allocate the device memory # def allocate(self): - nel = self.params.nx * self.params.ny * self.params.nz - n = np.array( nel, dtype=np.uint32) + nel = self.params.nx * self.params.ny * self.params.nz + n = np.array(nel, dtype=np.uint32) bufferSize = n * np.dtype(np.float32).itemsize checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) @@ -272,7 +288,7 @@ def allocate(self): checkCudaErrors(cuda.cuMemsetD32(self.velocity, 0, n)) nel += self.params.lead - n = np.array(nel, dtype=np.uint32) ## we need to align at the beginning of the tile + n = np.array(nel, dtype=np.uint32) ## we need to align at the beginning of the tile bufferSize = n * np.dtype(np.float32).itemsize self.waveIn = checkCudaErrors(cuda.cuMemAlloc(bufferSize)) @@ -294,17 +310,27 @@ def createSource(self, kernel): buf = np.array([int(self.source)], dtype=np.uint64) nt = np.array(self.params.nt, dtype=np.uint32) - dt = np.array(self.params.dt, dtype=np.float32) + dt = np.array(self.params.dt, dtype=np.float32) freq = np.array(self.params.freqMax, dtype=np.float32) args = [buf, dt, freq, nt] args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - checkCudaErrors(cuda.cuLaunchKernel(kernel.creatSource, - 1, 1, 1, # grid dim - 1024, 1, 1, # block dim - 0, self.streamHalo, # shared mem and stream - args.ctypes.data, 0)) # 
arguments + checkCudaErrors( + cuda.cuLaunchKernel( + kernel.creatSource, + 1, + 1, + 1, # grid dim + 1024, + 1, + 1, # block dim + 0, + self.streamHalo, # shared mem and stream + args.ctypes.data, + 0, + ) + ) # arguments checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo)) # @@ -314,23 +340,37 @@ def injectSource(self, kernel, iter): checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) if self.dev != 0: - return + return wavein = np.array([int(self.waveIn)], dtype=np.uint64) src = np.array([int(self.source)], dtype=np.uint64) - offset_sourceInject = self.params.lead + (int)(self.params.nz/2) * self.params.nx * self.params.ny + \ - (int)(self.params.ny/2) * self.params.nx + (int) (self.params.nx/2) + offset_sourceInject = ( + self.params.lead + + (int)(self.params.nz / 2) * self.params.nx * self.params.ny + + (int)(self.params.ny / 2) * self.params.nx + + (int)(self.params.nx / 2) + ) offset_sourceInject *= np.dtype(np.float32).itemsize np_it = np.array(iter, dtype=np.uint32) - args = [wavein+offset_sourceInject, src, np_it] + args = [wavein + offset_sourceInject, src, np_it] args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors(cuda.cuLaunchKernel(kernel.injectSource, - 1, 1, 1, # grid dim - 1, 1, 1, # block dim - 0, self.streamHalo, # shared mem and stream - args.ctypes.data, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + kernel.injectSource, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + self.streamHalo, # shared mem and stream + args.ctypes.data, + 0, + ) + ) # arguments # # create velocity @@ -338,8 +378,11 @@ def injectSource(self, kernel, iter): def createVelocity(self, kernel): print("running create velocity on device ", self.dev) - offset_velocity = self.params.FD_ORDER * self.params.nx * self.params.ny + \ - self.params.FD_ORDER * self.params.nx + self.params.FD_ORDER + offset_velocity = ( + self.params.FD_ORDER * self.params.nx * self.params.ny + + self.params.FD_ORDER * self.params.nx + + 
self.params.FD_ORDER + ) offset_velocity *= np.dtype(np.float32).itemsize vel = np.array([int(self.velocity)], dtype=np.uint64) @@ -347,32 +390,45 @@ def createVelocity(self, kernel): stride = self.params.nx * self.params.ny np_dx_dt2 = np.array(dx_dt2, dtype=np.float32) - np_nz = np.array((self.params.nz-2*self.params.FD_ORDER), dtype=np.uint32) + np_nz = np.array((self.params.nz - 2 * self.params.FD_ORDER), dtype=np.uint32) np_nx = np.array(self.params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) - args = [vel+ offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride] + args = [vel + offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride] args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) # do halo up - checkCudaErrors(cuda.cuLaunchKernel(kernel.createVelocity, - self.params.blkx, self.params.blky, 1, # grid dim - 2*self.params.BDIMX, self.params.BDIMY, 1, # block dim - 0, self.streamHalo, # shared mem and stream - args.ctypes.data, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + kernel.createVelocity, + self.params.blkx, + self.params.blky, + 1, # grid dim + 2 * self.params.BDIMX, + self.params.BDIMY, + 1, # block dim + 0, + self.streamHalo, # shared mem and stream + args.ctypes.data, + 0, + ) + ) # arguments checkCudaErrors(cuda.cuStreamSynchronize(self.streamHalo)) # # execute the center part of propagation # - def executeCenter(self, kernel): + def executeCenter(self, kernel): if verbose_prints: print("running center on device ", self.dev) checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - offset_velocity = 2* self.params.FD_ORDER * self.params.nx * self.params.ny + \ - self.params.FD_ORDER * self.params.nx + self.params.FD_ORDER + offset_velocity = ( + 2 * self.params.FD_ORDER * self.params.nx * self.params.ny + + self.params.FD_ORDER * self.params.nx + + self.params.FD_ORDER + ) offset_wave = self.params.lead + offset_velocity @@ -384,19 +440,36 @@ def 
executeCenter(self, kernel): vel = np.array([int(self.velocity)], dtype=np.uint64) stride = self.params.nx * self.params.ny - np_nz = np.array(self.params.nz - 4*self.params.FD_ORDER, dtype=np.uint32) + np_nz = np.array(self.params.nz - 4 * self.params.FD_ORDER, dtype=np.uint32) np_nx = np.array(self.params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) - args = [wavein+offset_wave, waveout+offset_wave, vel+offset_velocity, np_nz, np_nx, np_stride] + args = [ + wavein + offset_wave, + waveout + offset_wave, + vel + offset_velocity, + np_nz, + np_nx, + np_stride, + ] args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) # do center propagation from 2 * fd_order to nz - 2 * fd_order - checkCudaErrors(cuda.cuLaunchKernel(kernel.fdPropag, - self.params.blkx, self.params.blky, 1, # grid dim - self.params.BDIMX, self.params.BDIMY, 1, # block dim - 0, self.streamCenter, # shared mem and stream - args.ctypes.data, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + kernel.fdPropag, + self.params.blkx, + self.params.blky, + 1, # grid dim + self.params.BDIMX, + self.params.BDIMY, + 1, # block dim + 0, + self.streamCenter, # shared mem and stream + args.ctypes.data, + 0, + ) + ) # arguments # # execute the halo part of propagation @@ -406,8 +479,11 @@ def executeHalo(self, kernel): print("running halos on device ", self.dev) checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) - offset_velocity = self.params.FD_ORDER * self.params.nx * self.params.ny + \ - self.params.FD_ORDER * self.params.nx + self.params.FD_ORDER + offset_velocity = ( + self.params.FD_ORDER * self.params.nx * self.params.ny + + self.params.FD_ORDER * self.params.nx + + self.params.FD_ORDER + ) offset_wave = self.params.lead + offset_velocity @@ -423,31 +499,68 @@ def executeHalo(self, kernel): np_nx = np.array(self.params.nx, dtype=np.uint32) np_stride = np.array(stride, dtype=np.uint32) - args = [wavein+offset_wave, waveout+offset_wave, vel+offset_velocity, np_nz, 
np_nx, np_stride] + args = [ + wavein + offset_wave, + waveout + offset_wave, + vel + offset_velocity, + np_nz, + np_nx, + np_stride, + ] args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) # do halo up - checkCudaErrors(cuda.cuLaunchKernel(kernel.fdPropag, - self.params.blkx, self.params.blky, 1, # grid dim - self.params.BDIMX, self.params.BDIMY, 1, # block dim - 0, self.streamHalo, # shared mem and stream - args.ctypes.data, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + kernel.fdPropag, + self.params.blkx, + self.params.blky, + 1, # grid dim + self.params.BDIMX, + self.params.BDIMY, + 1, # block dim + 0, + self.streamHalo, # shared mem and stream + args.ctypes.data, + 0, + ) + ) # arguments # do halo down - offset_velocity = (self.params.nz - 2*self.params.FD_ORDER) * self.params.nx * self.params.ny + \ - self.params.FD_ORDER * self.params.nx + self.params.FD_ORDER + offset_velocity = ( + (self.params.nz - 2 * self.params.FD_ORDER) * self.params.nx * self.params.ny + + self.params.FD_ORDER * self.params.nx + + self.params.FD_ORDER + ) offset_wave = self.params.lead + offset_velocity offset_wave *= np.dtype(np.float32).itemsize offset_velocity *= np.dtype(np.float32).itemsize - args = [wavein+offset_wave, waveout+offset_wave, vel+offset_velocity, np_nz, np_nx, np_stride] + args = [ + wavein + offset_wave, + waveout + offset_wave, + vel + offset_velocity, + np_nz, + np_nx, + np_stride, + ] args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) - checkCudaErrors(cuda.cuLaunchKernel(kernel.fdPropag, - self.params.blkx, self.params.blky, 1, # grid dim - self.params.BDIMX, self.params.BDIMY, 1, # block dim - 0, self.streamHalo, # shared mem and stream - args.ctypes.data, 0)) # arguments + checkCudaErrors( + cuda.cuLaunchKernel( + kernel.fdPropag, + self.params.blkx, + self.params.blky, + 1, # grid dim + self.params.BDIMX, + self.params.BDIMY, + 1, # block dim + 0, + self.streamHalo, # shared mem and stream + args.ctypes.data, 
+ 0, + ) + ) # arguments # # exchange the halos @@ -460,7 +573,7 @@ def exchangeHalo(self, propag): # # the following variables don't change # - nstride = self.params.nx * self.params.ny + nstride = self.params.nx * self.params.ny devS = self.context devD = propag.context @@ -470,7 +583,7 @@ def exchangeHalo(self, propag): if self.dev < propag.dev: # exchange up - offsetS = self.params.lead + (self.params.nz - 2*self.params.FD_ORDER) * nstride + offsetS = self.params.lead + (self.params.nz - 2 * self.params.FD_ORDER) * nstride offsetD = propag.params.lead offsetS *= np.dtype(np.float32).itemsize @@ -482,8 +595,8 @@ def exchangeHalo(self, propag): checkCudaErrors(cuda.cuMemcpyPeerAsync(waveD, devD, waveS, devS, n_exch, self.streamHalo)) else: # exchange down - offsetS = self.params.lead + self.params.FD_ORDER * nstride - offsetD = propag.params.lead + (propag.params.nz - propag.params.FD_ORDER) * nstride + offsetS = self.params.lead + self.params.FD_ORDER * nstride + offsetD = propag.params.lead + (propag.params.nz - propag.params.FD_ORDER) * nstride offsetS *= np.dtype(np.float32).itemsize offsetD *= np.dtype(np.float32).itemsize @@ -500,13 +613,14 @@ def syncStream(self, stream): checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) checkCudaErrors(cuda.cuStreamSynchronize(stream)) + def main(): checkCudaErrors(cuda.cuInit(0)) # Number of GPUs print("Checking for multiple GPUs...") gpu_n = checkCudaErrors(cuda.cuDeviceGetCount()) - print("CUDA-capable device count: {}".format(gpu_n)) + print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: print("Two or more GPUs with Peer-to-Peer access capability are required") @@ -524,10 +638,16 @@ def main(): continue i_access_j = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(i, j)) j_access_i = checkCudaErrors(cudart.cudaDeviceCanAccessPeer(j, i)) - print("> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( - prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No")) - print("> Peer access from {} (GPU{}) -> 
{} (GPU{}) : {}\n".format( - prop[j].name, j, prop[i].name, i, "Yes" if i_access_j else "No")) + print( + "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( + prop[i].name, i, prop[j].name, j, "Yes" if i_access_j else "No" + ) + ) + print( + "> Peer access from {} (GPU{}) -> {} (GPU{}) : {}\n".format( + prop[j].name, j, prop[i].name, i, "Yes" if i_access_j else "No" + ) + ) if i_access_j and j_access_i: p2pCapableGPUs[1] = j break @@ -542,7 +662,6 @@ def main(): # Use first pair of p2p capable GPUs detected gpuid = [p2pCapableGPUs[0], p2pCapableGPUs[1]] - # # init device # @@ -552,7 +671,7 @@ def main(): # create propagators # propags = [] - kerns = [] + kerns = [] # # create kernels and propagators that are going to be used on device @@ -574,7 +693,6 @@ def main(): # start = time.time() for it in range(pars.nt): - for propag in propags: propag.syncStream(propag.streamHalo) @@ -605,26 +723,30 @@ def main(): nops = 1.0e-9 * pars.nt * npoints / (end - start) - print("this code generates " , nops , " GPoints/sec / device ") + print("this code generates ", nops, " GPoints/sec / device ") # # get the result out of gpu # nz = 2 * (int)(pars.nz - 2 * pars.FD_ORDER) print(" nz= ", nz, " nx= ", pars.nx) - hOut = np.zeros((nz, pars.nx), dtype='float32') + hOut = np.zeros((nz, pars.nx), dtype="float32") istart = 0 for propag in propags: checkCudaErrors(cuda.cuCtxSetCurrent(propag.context)) - offset = pars.lead + pars.FD_ORDER * pars.nx * pars.ny + \ - (int)(pars.ny/2) * pars.nx - - for j in range(pars.nz- 2*pars.FD_ORDER): - ptr = cuda.CUdeviceptr(int(propag.waveOut) + offset*4) - - checkCudaErrors(cuda.cuMemcpyDtoH(hOut[istart].ctypes.data, ptr, - pars.nx * np.dtype(np.float32).itemsize)) + offset = pars.lead + pars.FD_ORDER * pars.nx * pars.ny + (int)(pars.ny / 2) * pars.nx + + for j in range(pars.nz - 2 * pars.FD_ORDER): + ptr = cuda.CUdeviceptr(int(propag.waveOut) + offset * 4) + + checkCudaErrors( + cuda.cuMemcpyDtoH( + hOut[istart].ctypes.data, + ptr, + 
pars.nx * np.dtype(np.float32).itemsize, + ) + ) offset += pars.nx * pars.ny istart += 1 @@ -638,19 +760,26 @@ def main(): nrows = nz ncols = pars.nx dbz = hOut - dbz = np.reshape(dbz,(nrows, ncols)) + dbz = np.reshape(dbz, (nrows, ncols)) ## ## those are to plot results ## import matplotlib.pyplot as plt - import matplotlib.cm as cm + fig, ax = plt.subplots() title = "test fd kernels up to " + str(pars.tmax_propag) + " ms " plt.title(title, fontsize=20) - im = ax.imshow(dbz, interpolation='bilinear', cmap=plt.get_cmap('Greys'), aspect='auto', - origin='upper',extent=[1, pars.nx, nz, 1], - vmax=abs(dbz).max(), vmin=-abs(dbz).max()) + im = ax.imshow( + dbz, + interpolation="bilinear", + cmap=plt.get_cmap("Greys"), + aspect="auto", + origin="upper", + extent=[1, pars.nx, nz, 1], + vmax=abs(dbz).max(), + vmin=-abs(dbz).max(), + ) fig.colorbar(im, ax=ax) @@ -658,6 +787,7 @@ def main(): print("Done") + if __name__ == "__main__": display_graph = True verbose_prints = True diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py index e55b48ec2..18835ec9d 100644 --- a/cuda_bindings/examples/extra/jit_program_test.py +++ b/cuda_bindings/examples/extra/jit_program_test.py @@ -6,20 +6,24 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
import ctypes + import numpy as np + from cuda import cuda, nvrtc + def ASSERT_DRV(err): if isinstance(err, cuda.CUresult): if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError('Cuda Error: {}'.format(err)) + raise RuntimeError(f"Cuda Error: {err}") elif isinstance(err, nvrtc.nvrtcResult): if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: - raise RuntimeError('Nvrtc Error: {}'.format(err)) + raise RuntimeError(f"Nvrtc Error: {err}") else: - raise RuntimeError('Unknown error type: {}'.format(err)) + raise RuntimeError(f"Unknown error type: {err}") -saxpy = '''\ + +saxpy = """\ extern "C" __global__ void saxpy(float a, float *x, float *y, float *out, size_t n) { @@ -28,11 +32,12 @@ def ASSERT_DRV(err): out[tid] = a * x[tid] + y[tid]; } } -''' +""" + def main(): # Init - err, = cuda.cuInit(0) + (err,) = cuda.cuInit(0) ASSERT_DRV(err) # Device @@ -44,30 +49,34 @@ def main(): ASSERT_DRV(err) # Create program - err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b'saxpy.cu', 0, None, None) + err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, None, None) ASSERT_DRV(err) # Get target architecture - err, major = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice) + err, major = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice + ) ASSERT_DRV(err) - err, minor = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice) + err, minor = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice + ) ASSERT_DRV(err) err, nvrtc_major, nvrtc_minor = nvrtc.nvrtcVersion() ASSERT_DRV(err) - use_cubin = (nvrtc_minor >= 1) - prefix = 'sm' if use_cubin else 'compute' - arch_arg = bytes(f'--gpu-architecture={prefix}_{major}{minor}', 'ascii') + use_cubin = nvrtc_minor >= 1 + prefix = "sm" if use_cubin else "compute" + arch_arg = 
bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii") # Compile program - opts = [b'--fmad=false', arch_arg] - err, = nvrtc.nvrtcCompileProgram(prog, len(opts), opts) + opts = [b"--fmad=false", arch_arg] + (err,) = nvrtc.nvrtcCompileProgram(prog, len(opts), opts) ASSERT_DRV(err) # Get log from compilation err, logSize = nvrtc.nvrtcGetProgramLogSize(prog) ASSERT_DRV(err) - log = b' ' * logSize - err, = nvrtc.nvrtcGetProgramLog(prog, log) + log = b" " * logSize + (err,) = nvrtc.nvrtcGetProgramLog(prog, log) ASSERT_DRV(err) print(log.decode()) @@ -75,21 +84,21 @@ def main(): if use_cubin: err, dataSize = nvrtc.nvrtcGetCUBINSize(prog) ASSERT_DRV(err) - data = b' ' * dataSize - err, = nvrtc.nvrtcGetCUBIN(prog, data) + data = b" " * dataSize + (err,) = nvrtc.nvrtcGetCUBIN(prog, data) ASSERT_DRV(err) else: err, dataSize = nvrtc.nvrtcGetPTXSize(prog) ASSERT_DRV(err) - data = b' ' * dataSize - err, = nvrtc.nvrtcGetPTX(prog, data) + data = b" " * dataSize + (err,) = nvrtc.nvrtcGetPTX(prog, data) ASSERT_DRV(err) # Load data as module data and retrieve function data = np.char.array(data) err, module = cuda.cuModuleLoadData(data) ASSERT_DRV(err) - err, kernel = cuda.cuModuleGetFunction(module, b'saxpy') + err, kernel = cuda.cuModuleGetFunction(module, b"saxpy") ASSERT_DRV(err) # Test the kernel @@ -114,52 +123,61 @@ def main(): err, stream = cuda.cuStreamCreate(0) ASSERT_DRV(err) - err, = cuda.cuMemcpyHtoDAsync(dX, hX, bufferSize, stream) + (err,) = cuda.cuMemcpyHtoDAsync(dX, hX, bufferSize, stream) ASSERT_DRV(err) - err, = cuda.cuMemcpyHtoDAsync(dY, hY, bufferSize, stream) + (err,) = cuda.cuMemcpyHtoDAsync(dY, hY, bufferSize, stream) ASSERT_DRV(err) - err, = cuda.cuStreamSynchronize(stream) + (err,) = cuda.cuStreamSynchronize(stream) ASSERT_DRV(err) # Assert values are different before running kernel hZ = a * hX + hY if np.allclose(hOut, hZ): - raise ValueError('Error inside tolerence for host-device vectors') + raise ValueError("Error inside tolerence for host-device 
vectors") arg_values = (a, dX, dY, dOut, n) arg_types = (ctypes.c_float, None, None, None, ctypes.c_size_t) - err, = cuda.cuLaunchKernel(kernel, - NUM_BLOCKS, 1, 1, # grid dim - NUM_THREADS, 1, 1, # block dim - 0, stream, # shared mem and stream - (arg_values, arg_types), 0) # arguments + (err,) = cuda.cuLaunchKernel( + kernel, + NUM_BLOCKS, + 1, + 1, # grid dim + NUM_THREADS, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + (arg_values, arg_types), + 0, + ) # arguments ASSERT_DRV(err) - err, = cuda.cuMemcpyDtoHAsync(hOut, dOut, bufferSize, stream) + (err,) = cuda.cuMemcpyDtoHAsync(hOut, dOut, bufferSize, stream) ASSERT_DRV(err) - err, = cuda.cuStreamSynchronize(stream) + (err,) = cuda.cuStreamSynchronize(stream) ASSERT_DRV(err) # Assert values are same after running kernel hZ = a * hX + hY if not np.allclose(hOut, hZ): - raise ValueError('Error outside tolerence for host-device vectors') + raise ValueError("Error outside tolerence for host-device vectors") - err, = cuda.cuStreamDestroy(stream) + (err,) = cuda.cuStreamDestroy(stream) ASSERT_DRV(err) - err, = cuda.cuMemFree(dX) + (err,) = cuda.cuMemFree(dX) ASSERT_DRV(err) - err, = cuda.cuMemFree(dY) + (err,) = cuda.cuMemFree(dY) ASSERT_DRV(err) - err, = cuda.cuMemFree(dOut) + (err,) = cuda.cuMemFree(dOut) ASSERT_DRV(err) - err, = cuda.cuModuleUnload(module) + (err,) = cuda.cuModuleUnload(module) ASSERT_DRV(err) - err, = cuda.cuCtxDestroy(context) + (err,) = cuda.cuCtxDestroy(context) ASSERT_DRV(err) -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/cuda_bindings/examples/extra/numba_emm_plugin.py b/cuda_bindings/examples/extra/numba_emm_plugin.py index a80c3bbfe..45015ada4 100644 --- a/cuda_bindings/examples/extra/numba_emm_plugin.py +++ b/cuda_bindings/examples/extra/numba_emm_plugin.py @@ -52,18 +52,22 @@ be restored after freeing it. 
""" +from ctypes import c_size_t + from numba import cuda -from numba.cuda import (HostOnlyCUDAMemoryManager, GetIpcHandleMixin, - MemoryPointer, MemoryInfo) +from numba.cuda import ( + GetIpcHandleMixin, + HostOnlyCUDAMemoryManager, + MemoryInfo, + MemoryPointer, +) from cuda import cuda as cuda_driver -from ctypes import c_size_t - - # Python functions for allocation, deallocation, and memory info via the NVIDIA # CUDA Python Driver API + def driver_alloc(size): """ Allocate `size` bytes of device memory and return a device pointer to the @@ -71,7 +75,7 @@ def driver_alloc(size): """ err, ptr = cuda_driver.cuMemAlloc(size) if err != cuda_driver.CUresult.CUDA_SUCCESS: - raise RuntimeError(f'Unexpected error code {err} from cuMemAlloc') + raise RuntimeError(f"Unexpected error code {err} from cuMemAlloc") return ptr @@ -79,9 +83,9 @@ def driver_free(ptr): """ Free device memory pointed to by `ptr`. """ - err, = cuda_driver.cuMemFree(ptr) + (err,) = cuda_driver.cuMemFree(ptr) if err != cuda_driver.CUresult.CUDA_SUCCESS: - raise RuntimeError(f'Unexpected error code {err} from cuMemFree') + raise RuntimeError(f"Unexpected error code {err} from cuMemFree") def driver_memory_info(): @@ -90,7 +94,7 @@ def driver_memory_info(): """ err, free, total = cuda_driver.cuMemGetInfo() if err != cuda_driver.CUresult.CUDA_SUCCESS: - raise RuntimeError(f'Unexpected error code {err} from cuMemGetInfo') + raise RuntimeError(f"Unexpected error code {err} from cuMemGetInfo") return free, total @@ -99,6 +103,7 @@ def driver_memory_info(): # # https://numba.readthedocs.io/en/stable/cuda/external-memory.html#numba.cuda.BaseCUDAMemoryManager + class DriverEMMPlugin(GetIpcHandleMixin, HostOnlyCUDAMemoryManager): def memalloc(self, size): ptr = driver_alloc(size) @@ -152,10 +157,10 @@ def main(): print(f"Free after freeing device array: {ctx.get_memory_info().free}") -if __name__ == '__main__': +if __name__ == "__main__": import argparse + formatter = argparse.RawDescriptionHelpFormatter - 
parser = argparse.ArgumentParser(description=__doc__, - formatter_class=formatter) + parser = argparse.ArgumentParser(description=__doc__, formatter_class=formatter) parser.parse_args() main() diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index 63c09db5c..374f43e59 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -46,3 +46,60 @@ versionfile_source = "cuda/bindings/_version.py" versionfile_build = "cuda/bindings/_version.py" tag_prefix = "v" parentdir_prefix = "cuda-python-" + +[tool.ruff] +line-length = 120 + +[tool.ruff.format] +docstring-code-format = true + +exclude = ["cuda/bindings/_version.py"] + +[tool.ruff.lint] +select = [ + # pycodestyle Error + "E", + # Pyflakes + "F", + # pycodestyle Warning + "W", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", +] + +ignore = [ + "UP006", + "UP007", + "E741", # ambiguous variable name such as I + "B007", # rename unused loop variable to _name + "UP035" # UP006, UP007, UP035 complain about deprecated Typing. 
use, but disregard backward compatibility of python version +] + +exclude = ["cuda/bindings/_version.py"] + +[tool.ruff.lint.per-file-ignores] +"setup.py" = ["F401"] +"__init__.py" = ["F401"] + +"examples/**/*" = [ + "E722", + "E501" # line too long + ] + +"tests/**/*" = [ + "E722", + "UP022", + "E402", # module level import not at top of file + "F841"] # F841 complains about unused variables, but some assignments have side-effects that could be useful for tests (func calls for example) + +"benchmarks/**/*" = [ + "E722", + "UP022", + "E402", # module level import not at top of file + "F841"] # F841 complains about unused variables, but some assignments have side-effects that could be useful for tests (func calls for example) diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py index 2342e62ab..2cc9027bb 100644 --- a/cuda_bindings/setup.py +++ b/cuda_bindings/setup.py @@ -6,23 +6,23 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
+import atexit +import contextlib import glob import os import platform +import shutil import sys import sysconfig -import atexit +import tempfile +import versioneer from Cython import Tempita from Cython.Build import cythonize from pyclibrary import CParser from setuptools import find_packages, setup -from setuptools.extension import Extension from setuptools.command.build_ext import build_ext -import versioneer -import tempfile -import shutil - +from setuptools.extension import Extension # ---------------------------------------------------------------------- # Fetch configuration options @@ -31,48 +31,49 @@ if not CUDA_HOME: CUDA_HOME = os.environ.get("CUDA_PATH") if not CUDA_HOME: - raise RuntimeError('Environment variable CUDA_HOME or CUDA_PATH is not set') + raise RuntimeError("Environment variable CUDA_HOME or CUDA_PATH is not set") CUDA_HOME = CUDA_HOME.split(os.pathsep) nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0") PARSER_CACHING = os.environ.get("CUDA_PYTHON_PARSER_CACHING", False) -PARSER_CACHING = True if PARSER_CACHING else False +PARSER_CACHING = bool(PARSER_CACHING) # ---------------------------------------------------------------------- # Parse user-provided CUDA headers header_dict = { - 'driver' : ['cuda.h', - 'cudaProfiler.h', - 'cudaEGL.h', - 'cudaGL.h', - 'cudaVDPAU.h'], - 'runtime' : ['driver_types.h', - 'vector_types.h', - 'cuda_runtime.h', - 'surface_types.h', - 'texture_types.h', - 'library_types.h', - 'cuda_runtime_api.h', - 'device_types.h', - 'driver_functions.h', - 'cuda_profiler_api.h', - 'cuda_egl_interop.h', - 'cuda_gl_interop.h', - 'cuda_vdpau_interop.h'], - 'nvrtc' : ['nvrtc.h']} - -replace = {' __device_builtin__ ':' ', - 'CUDARTAPI ':' ', - 'typedef __device_builtin__ enum cudaError cudaError_t;' : 'typedef cudaError cudaError_t;', - 'typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t;' : 'typedef cudaOutputMode cudaOutputMode_t;', - 'typedef enum cudaError cudaError_t;' : 'typedef cudaError 
cudaError_t;', - 'typedef enum cudaOutputMode cudaOutputMode_t;' : 'typedef cudaOutputMode cudaOutputMode_t;', - 'typedef enum cudaDataType_t cudaDataType_t;' : '', - 'typedef enum libraryPropertyType_t libraryPropertyType_t;' : '', - ' enum ' : ' ', - ', enum ' : ', ', - '\\(enum ' : '(',} + "driver": ["cuda.h", "cudaProfiler.h", "cudaEGL.h", "cudaGL.h", "cudaVDPAU.h"], + "runtime": [ + "driver_types.h", + "vector_types.h", + "cuda_runtime.h", + "surface_types.h", + "texture_types.h", + "library_types.h", + "cuda_runtime_api.h", + "device_types.h", + "driver_functions.h", + "cuda_profiler_api.h", + "cuda_egl_interop.h", + "cuda_gl_interop.h", + "cuda_vdpau_interop.h", + ], + "nvrtc": ["nvrtc.h"], +} + +replace = { + " __device_builtin__ ": " ", + "CUDARTAPI ": " ", + "typedef __device_builtin__ enum cudaError cudaError_t;": "typedef cudaError cudaError_t;", + "typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;", + "typedef enum cudaError cudaError_t;": "typedef cudaError cudaError_t;", + "typedef enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;", + "typedef enum cudaDataType_t cudaDataType_t;": "", + "typedef enum libraryPropertyType_t libraryPropertyType_t;": "", + " enum ": " ", + ", enum ": ", ", + "\\(enum ": "(", +} found_types = [] found_structs = {} @@ -80,7 +81,7 @@ found_functions = [] found_values = [] -include_path_list = [os.path.join(path, 'include') for path in CUDA_HOME] +include_path_list = [os.path.join(path, "include") for path in CUDA_HOME] print(f'Parsing headers in "{include_path_list}" (Caching {PARSER_CACHING})') for library, header_list in header_dict.items(): header_paths = [] @@ -91,35 +92,37 @@ header_paths += [path] break if not os.path.exists(path): - print(f'Missing header {header}') + print(f"Missing header {header}") - print(f'Parsing {library} headers') - parser = CParser(header_paths, - cache='./cache_{}'.format(library.split('.')[0]) if 
PARSER_CACHING else None, - replace=replace) + print(f"Parsing {library} headers") + parser = CParser( + header_paths, cache="./cache_{}".format(library.split(".")[0]) if PARSER_CACHING else None, replace=replace + ) - if library == 'driver': - CUDA_VERSION = parser.defs['macros']['CUDA_VERSION'] if 'CUDA_VERSION' in parser.defs['macros'] else 'Unknown' - print(f'Found CUDA_VERSION: {CUDA_VERSION}') + if library == "driver": + CUDA_VERSION = parser.defs["macros"].get("CUDA_VERSION", "Unknown") + print(f"Found CUDA_VERSION: {CUDA_VERSION}") # Combine types with others since they sometimes get tangled - found_types += {key for key in parser.defs['types']} - found_types += {key for key in parser.defs['structs']} - found_structs.update(parser.defs['structs']) - found_types += {key for key in parser.defs['unions']} - found_unions.update(parser.defs['unions']) - found_types += {key for key in parser.defs['enums']} - found_functions += {key for key in parser.defs['functions']} - found_values += {key for key in parser.defs['values']} + found_types += {key for key in parser.defs["types"]} + found_types += {key for key in parser.defs["structs"]} + found_structs.update(parser.defs["structs"]) + found_types += {key for key in parser.defs["unions"]} + found_unions.update(parser.defs["unions"]) + found_types += {key for key in parser.defs["enums"]} + found_functions += {key for key in parser.defs["functions"]} + found_values += {key for key in parser.defs["values"]} if len(found_functions) == 0: raise RuntimeError(f'Parser found no functions. Is CUDA_HOME setup correctly? 
(CUDA_HOME="{CUDA_HOME}")') + # Unwrap struct and union members def unwrapMembers(found_dict): for key in found_dict: - members = [var for var, _, _ in found_dict[key]['members']] - found_dict[key]['members'] = members + members = [var for var, _, _ in found_dict[key]["members"]] + found_dict[key]["members"] = members + unwrapMembers(found_structs) unwrapMembers(found_unions) @@ -127,11 +130,13 @@ def unwrapMembers(found_dict): # ---------------------------------------------------------------------- # Generate + def fetch_input_files(path): - return [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.in')] + return [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".in")] + def generate_output(infile, local): - assert infile.endswith('.in') + assert infile.endswith(".in") outfile = infile[:-3] with open(infile) as f: @@ -140,19 +145,21 @@ def generate_output(infile, local): if os.path.exists(outfile): with open(outfile) as f: if f.read() == pxdcontent: - print(f'Skipping {infile} (No change)') + print(f"Skipping {infile} (No change)") return with open(outfile, "w") as f: - print(f'Generating {infile}') + print(f"Generating {infile}") f.write(pxdcontent) -path_list = [os.path.join('cuda'), - os.path.join('cuda', 'bindings'), - os.path.join('cuda', 'bindings', '_bindings'), - os.path.join('cuda', 'bindings', '_lib'), - os.path.join('cuda', 'bindings', '_lib', 'cyruntime'), - os.path.join('cuda', 'bindings', '_internal'), - ] + +path_list = [ + os.path.join("cuda"), + os.path.join("cuda", "bindings"), + os.path.join("cuda", "bindings", "_bindings"), + os.path.join("cuda", "bindings", "_lib"), + os.path.join("cuda", "bindings", "_lib", "cyruntime"), + os.path.join("cuda", "bindings", "_internal"), +] input_files = [] for path in path_list: input_files += fetch_input_files(path) @@ -171,19 +178,19 @@ def generate_output(infile, local): extra_compile_args = [] extra_cythonize_kwargs = {} -if sys.platform != 'win32': +if sys.platform != "win32": 
extra_compile_args += [ - '-std=c++14', - '-fpermissive', - '-Wno-deprecated-declarations', - '-D _GLIBCXX_ASSERTIONS', - '-fno-var-tracking-assignments' + "-std=c++14", + "-fpermissive", + "-Wno-deprecated-declarations", + "-D _GLIBCXX_ASSERTIONS", + "-fno-var-tracking-assignments", ] - if '--debug' in sys.argv: - extra_cythonize_kwargs['gdb_debug'] = True - extra_compile_args += ['-g', '-O0'] + if "--debug" in sys.argv: + extra_cythonize_kwargs["gdb_debug"] = True + extra_compile_args += ["-g", "-O0"] else: - extra_compile_args += ['-O3'] + extra_compile_args += ["-O3"] # For Setup extensions = [] @@ -193,6 +200,7 @@ def generate_output(infile, local): # ---------------------------------------------------------------------- # Cythonize + def prep_extensions(sources): pattern = sources[0] files = glob.glob(pattern) @@ -216,21 +224,20 @@ def prep_extensions(sources): # new path for the bindings from cybind def rename_architecture_specific_files(): - architechture_specific_files_dir = 'cuda/bindings/_internal/' - if sys.platform == 'linux': - src_files = glob.glob(os.path.join(path, '*_linux.pyx')) - elif sys.platform == 'win32': - src_files = glob.glob(os.path.join(path, '*_windows.pyx')) + if sys.platform == "linux": + src_files = glob.glob(os.path.join(path, "*_linux.pyx")) + elif sys.platform == "win32": + src_files = glob.glob(os.path.join(path, "*_windows.pyx")) else: - raise RuntimeError(f'platform is unrecognized: {sys.platform}') + raise RuntimeError(f"platform is unrecognized: {sys.platform}") dst_files = [] for src in src_files: # Set up a temporary file; it must be under the cache directory so # that atomic moves within the same filesystem can be guaranteed - with tempfile.NamedTemporaryFile(delete=False, dir='.') as f: + with tempfile.NamedTemporaryFile(delete=False, dir=".") as f: shutil.copy2(src, f.name) f_name = f.name - dst = src.replace('_linux', '').replace('_windows', '') + dst = src.replace("_linux", "").replace("_windows", "") # atomic move 
with the destination guaranteed to be overwritten os.replace(f_name, f"./{dst}") dst_files.append(dst) @@ -243,20 +250,17 @@ def rename_architecture_specific_files(): @atexit.register def cleanup_dst_files(): for dst in dst_files: - try: + with contextlib.suppress(FileNotFoundError): os.remove(dst) - except FileNotFoundError: - pass def do_cythonize(extensions): return cythonize( extensions, nthreads=nthreads, - compiler_directives=dict( - profile=True, language_level=3, embedsignature=True, binding=True - ), - **extra_cythonize_kwargs) + compiler_directives=dict(profile=True, language_level=3, embedsignature=True, binding=True), + **extra_cythonize_kwargs, + ) sources_list = [ @@ -270,8 +274,8 @@ def do_cythonize(extensions): # public (deprecated, to be removed) ["cuda/*.pyx"], # interal files used by generated bindings - ['cuda/bindings/_internal/nvjitlink.pyx'], - ['cuda/bindings/_internal/utils.pyx'], + ["cuda/bindings/_internal/nvjitlink.pyx"], + ["cuda/bindings/_internal/utils.pyx"], ] for sources in sources_list: @@ -284,6 +288,7 @@ def do_cythonize(extensions): # 2) Compiled to .o files as part of build_ext # This class is solely for passing the value of nthreads to build_ext + class ParallelBuildExtensions(build_ext): def initialize_options(self): build_ext.initialize_options(self) @@ -293,6 +298,7 @@ def initialize_options(self): def finalize_options(self): build_ext.finalize_options(self) + cmdclass = {"build_ext": ParallelBuildExtensions} cmdclass = versioneer.get_cmdclass(cmdclass) diff --git a/cuda_bindings/tests/cython/test_cython.py b/cuda_bindings/tests/cython/test_cython.py index 439d7aa07..ed5069219 100644 --- a/cuda_bindings/tests/cython/test_cython.py +++ b/cuda_bindings/tests/cython/test_cython.py @@ -22,9 +22,7 @@ def wrapped(*args, **kwargs): return wrapped -cython_test_modules = ["test_ccuda", - "test_ccudart", - "test_interoperability_cython"] +cython_test_modules = ["test_ccuda", "test_ccudart", "test_interoperability_cython"] for mod in 
cython_test_modules: diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py index d55a4209c..984f6aab0 100644 --- a/cuda_bindings/tests/test_cuda.py +++ b/cuda_bindings/tests/test_cuda.py @@ -6,211 +6,222 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. import platform +import shutil +import textwrap + +import numpy as np import pytest + import cuda.cuda as cuda import cuda.cudart as cudart -import numpy as np -import textwrap -import shutil -from sysconfig import get_paths + def driverVersionLessThan(target): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, version = cuda.cuDriverGetVersion() - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS return version < target + def supportsMemoryPool(): err, isSupported = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0) return err == cudart.cudaError_t.cudaSuccess and isSupported + def supportsManagedMemory(): err, isSupported = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrManagedMemory, 0) return err == cudart.cudaError_t.cudaSuccess and isSupported + def supportsCudaAPI(name): return name in dir(cuda) + def callableBinary(name): - return shutil.which(name) != None + return shutil.which(name) is not None + def test_cuda_memcpy(): # Init CUDA - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS # Get device err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS # Construct context err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS # Allocate dev memory size = int(1024 * np.uint8().itemsize) err, dptr = cuda.cuMemAlloc(size) - assert(err == 
cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS # Set h1 and h2 memory to be different h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # h1 to D - err, = cuda.cuMemcpyHtoD(dptr, h1, size) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuMemcpyHtoD(dptr, h1, size) + assert err == cuda.CUresult.CUDA_SUCCESS # D to h2 - err, = cuda.cuMemcpyDtoH(h2, dptr, size) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuMemcpyDtoH(h2, dptr, size) + assert err == cuda.CUresult.CUDA_SUCCESS # Validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # Cleanup - err, = cuda.cuMemFree(dptr) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuMemFree(dptr) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS + def test_cuda_array(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS # No context created desc = cuda.CUDA_ARRAY_DESCRIPTOR() err, arr = cuda.cuArrayCreate(desc) - assert(err == cuda.CUresult.CUDA_ERROR_INVALID_CONTEXT or err == cuda.CUresult.CUDA_ERROR_INVALID_VALUE) + assert err == cuda.CUresult.CUDA_ERROR_INVALID_CONTEXT or err == cuda.CUresult.CUDA_ERROR_INVALID_VALUE err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS # Desciption not filled err, arr = cuda.cuArrayCreate(desc) - assert(err == cuda.CUresult.CUDA_ERROR_INVALID_VALUE) + assert err == cuda.CUresult.CUDA_ERROR_INVALID_VALUE # Pass desc.Format = cuda.CUarray_format.CU_AD_FORMAT_SIGNED_INT8 
desc.NumChannels = 1 desc.Width = 1 err, arr = cuda.cuArrayCreate(desc) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS + + (err,) = cuda.cuArrayDestroy(arr) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS - err, = cuda.cuArrayDestroy(arr) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) def test_cuda_repr_primitive(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(str(device) == '') - assert(int(device) == 0) + assert err == cuda.CUresult.CUDA_SUCCESS + assert str(device) == "" + assert int(device) == 0 err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(str(ctx).startswith(' 0) - assert(hex(ctx) == hex(int(ctx))) + assert err == cuda.CUresult.CUDA_SUCCESS + assert str(ctx).startswith(" 0 + assert hex(ctx) == hex(int(ctx)) # CUdeviceptr err, dptr = cuda.cuMemAlloc(1024 * np.uint8().itemsize) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(str(dptr).startswith(' 0) - err, = cuda.cuMemFree(dptr) + assert err == cuda.CUresult.CUDA_SUCCESS + assert str(dptr).startswith(" 0 + (err,) = cuda.cuMemFree(dptr) size = 7 dptr = cuda.CUdeviceptr(size) - assert(str(dptr) == ''.format(size)) - assert(int(dptr) == size) + assert str(dptr) == f"" + assert int(dptr) == size size = 4294967295 dptr = cuda.CUdeviceptr(size) - assert(str(dptr) == ''.format(size)) - assert(int(dptr) == size) + assert str(dptr) == f"" + assert int(dptr) == size size = 18446744073709551615 dptr = cuda.CUdeviceptr(size) - assert(str(dptr) == ''.format(size)) - assert(int(dptr) == size) + assert str(dptr) == f"" + assert int(dptr) == size # cuuint32_t size = 7 int32 = cuda.cuuint32_t(size) - assert(str(int32) 
== ''.format(size)) - assert(int(int32) == size) + assert str(int32) == f"" + assert int(int32) == size size = 4294967295 int32 = cuda.cuuint32_t(size) - assert(str(int32) == ''.format(size)) - assert(int(int32) == size) + assert str(int32) == f"" + assert int(int32) == size size = 18446744073709551615 try: int32 = cuda.cuuint32_t(size) - raise RuntimeError('int32 = cuda.cuuint32_t(18446744073709551615) did not fail') + raise RuntimeError("int32 = cuda.cuuint32_t(18446744073709551615) did not fail") except OverflowError as err: pass # cuuint64_t size = 7 int64 = cuda.cuuint64_t(size) - assert(str(int64) == ''.format(size)) - assert(int(int64) == size) + assert str(int64) == f"" + assert int(int64) == size size = 4294967295 int64 = cuda.cuuint64_t(size) - assert(str(int64) == ''.format(size)) - assert(int(int64) == size) + assert str(int64) == f"" + assert int(int64) == size size = 18446744073709551615 int64 = cuda.cuuint64_t(size) - assert(str(int64) == ''.format(size)) - assert(int(int64) == size) + assert str(int64) == f"" + assert int(int64) == size + + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) def test_cuda_repr_pointer(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS # Test 1: Classes representing pointers err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(str(ctx).startswith(' 0) - assert(hex(ctx) == hex(int(ctx))) + assert err == cuda.CUresult.CUDA_SUCCESS + assert str(ctx).startswith(" 0 + assert hex(ctx) == hex(int(ctx)) randomCtxPointer = 12345 randomCtx = cuda.CUcontext(randomCtxPointer) - assert(str(randomCtx) == ''.format(hex(randomCtxPointer))) - assert(int(randomCtx) == randomCtxPointer) - 
assert(hex(randomCtx) == hex(randomCtxPointer)) + assert str(randomCtx) == f"" + assert int(randomCtx) == randomCtxPointer + assert hex(randomCtx) == hex(randomCtxPointer) # Test 2: Function pointers func = 12345 b2d_cb = cuda.CUoccupancyB2DSize(func) - assert(str(b2d_cb) == ''.format(hex(func))) - assert(int(b2d_cb) == func) - assert(hex(b2d_cb) == hex(func)) + assert str(b2d_cb) == f"" + assert int(b2d_cb) == func + assert hex(b2d_cb) == hex(func) + + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) def test_cuda_uuid_list_access(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, uuid = cuda.cuDeviceGetUuid(device) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(len(uuid.bytes) <= 16) + assert err == cuda.CUresult.CUDA_SUCCESS + assert len(uuid.bytes) <= 16 jit_option = cuda.CUjit_option options = { @@ -221,29 +232,29 @@ def test_cuda_uuid_list_access(): jit_option.CU_JIT_LOG_VERBOSE: 5, } - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS + def test_cuda_cuModuleLoadDataEx(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, dev = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, dev) - assert(err == cuda.CUresult.CUDA_SUCCESS) - + assert err == cuda.CUresult.CUDA_SUCCESS option_keys = [ cuda.CUjit_option.CU_JIT_INFO_LOG_BUFFER, 
cuda.CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, cuda.CUjit_option.CU_JIT_ERROR_LOG_BUFFER, cuda.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - cuda.CUjit_option.CU_JIT_LOG_VERBOSE + cuda.CUjit_option.CU_JIT_LOG_VERBOSE, ] err, mod = cuda.cuModuleLoadDataEx(0, 0, option_keys, []) - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS def test_cuda_repr(): @@ -284,9 +295,11 @@ def test_cuda_repr(): def test_cuda_struct_list_of_enums(): desc = cuda.CUDA_TEXTURE_DESC_st() - desc.addressMode = [cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_WRAP, - cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_CLAMP, - cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_MIRROR] + desc.addressMode = [ + cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_WRAP, + cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_CLAMP, + cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_MIRROR, + ] # # Too many args # desc.addressMode = [cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_WRAP, @@ -298,6 +311,7 @@ def test_cuda_struct_list_of_enums(): # desc.addressMode = [cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_WRAP, # cuda.CUaddress_mode.CU_TR_ADDRESS_MODE_CLAMP] + def test_cuda_CUstreamBatchMemOpParams(): params = cuda.CUstreamBatchMemOpParams() params.operation = cuda.CUstreamBatchMemOpType.CU_STREAM_MEM_OP_WAIT_VALUE_32 @@ -305,16 +319,19 @@ def test_cuda_CUstreamBatchMemOpParams(): params.writeValue.operation = cuda.CUstreamBatchMemOpType.CU_STREAM_MEM_OP_WAIT_VALUE_32 params.flushRemoteWrites.operation = cuda.CUstreamBatchMemOpType.CU_STREAM_MEM_OP_WAIT_VALUE_32 params.waitValue.value64 = 666 - assert(int(params.waitValue.value64) == 666) + assert int(params.waitValue.value64) == 666 -@pytest.mark.skipif(driverVersionLessThan(11030) or not supportsMemoryPool(), reason='When new attributes were introduced') + +@pytest.mark.skipif( + driverVersionLessThan(11030) or not supportsMemoryPool(), reason="When new attributes were introduced" +) def test_cuda_memPool_attr(): - 
err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS poolProps = cuda.CUmemPoolProps() poolProps.allocType = cuda.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED @@ -323,136 +340,158 @@ def test_cuda_memPool_attr(): attr_list = [None] * 8 err, pool = cuda.cuMemPoolCreate(poolProps) - assert(err == cuda.CUresult.CUDA_SUCCESS) - - for idx, attr in enumerate([cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_CURRENT, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_HIGH]): + assert err == cuda.CUresult.CUDA_SUCCESS + + for idx, attr in enumerate( + [ + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_CURRENT, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_HIGH, + ] + ): err, attr_tmp = cuda.cuMemPoolGetAttribute(pool, attr) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS attr_list[idx] 
= attr_tmp - for idxA, attr in enumerate([cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES]): - err, = cuda.cuMemPoolSetAttribute(pool, attr, 0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + for idxA, attr in enumerate( + [ + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, + ] + ): + (err,) = cuda.cuMemPoolSetAttribute(pool, attr, 0) + assert err == cuda.CUresult.CUDA_SUCCESS for idx, attr in enumerate([cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD]): - err, = cuda.cuMemPoolSetAttribute(pool, attr, cuda.cuuint64_t(9)) - assert(err == cuda.CUresult.CUDA_SUCCESS) - - for idx, attr in enumerate([cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, - cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD]): + (err,) = cuda.cuMemPoolSetAttribute(pool, attr, cuda.cuuint64_t(9)) + assert err == cuda.CUresult.CUDA_SUCCESS + + for idx, attr in enumerate( + [ + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, + cuda.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + ] + ): err, attr_tmp = cuda.cuMemPoolGetAttribute(pool, attr) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS attr_list[idx] = attr_tmp - assert(attr_list[0] == 0) - assert(attr_list[1] == 0) - assert(attr_list[2] == 0) - assert(int(attr_list[3]) == 9) + assert attr_list[0] == 
0 + assert attr_list[1] == 0 + assert attr_list[2] == 0 + assert int(attr_list[3]) == 9 + + (err,) = cuda.cuMemPoolDestroy(pool) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS - err, = cuda.cuMemPoolDestroy(pool) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) -@pytest.mark.skipif(driverVersionLessThan(11030) or not supportsManagedMemory(), reason='When new attributes were introduced') +@pytest.mark.skipif( + driverVersionLessThan(11030) or not supportsManagedMemory(), reason="When new attributes were introduced" +) def test_cuda_pointer_attr(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ptr = cuda.cuMemAllocManaged(0x1000, cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS # Individual version - attr_type_list = [cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER, - # cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS, # TODO: Can I somehow test this? 
- cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS, - cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE] + attr_type_list = [ + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_CONTEXT, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_POINTER, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_HOST_POINTER, + # cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_P2P_TOKENS, # TODO: Can I somehow test this? 
+ cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_BUFFER_ID, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_SIZE, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MAPPED, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_ACCESS_FLAGS, + cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE, + ] attr_value_list = [None] * len(attr_type_list) for idx, attr in enumerate(attr_type_list): err, attr_tmp = cuda.cuPointerGetAttribute(attr, ptr) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS attr_value_list[idx] = attr_tmp # List version err, attr_value_list_v2 = cuda.cuPointerGetAttributes(len(attr_type_list), attr_type_list, ptr) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS for attr1, attr2 in zip(attr_value_list, attr_value_list_v2): - assert(str(attr1) == str(attr2)) + assert str(attr1) == str(attr2) # Test setting values for val in (True, False): - err, = cuda.cuPointerSetAttribute(val, cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuPointerSetAttribute(val, cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr) + assert err == cuda.CUresult.CUDA_SUCCESS err, attr_tmp = cuda.cuPointerGetAttribute(cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(attr_tmp == val) + assert err == cuda.CUresult.CUDA_SUCCESS + assert attr_tmp == val + + (err,) = cuda.cuMemFree(ptr) + 
assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS - err, = cuda.cuMemFree(ptr) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) -@pytest.mark.skipif(not supportsManagedMemory(), reason='When new attributes were introduced') +@pytest.mark.skipif(not supportsManagedMemory(), reason="When new attributes were introduced") def test_cuda_mem_range_attr(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) - size = int(0x1000) + assert err == cuda.CUresult.CUDA_SUCCESS + size = 0x1000 err, ptr = cuda.cuMemAllocManaged(size, cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION, cuda.CU_DEVICE_CPU) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, cuda.CU_DEVICE_CPU) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, concurrentSupported = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_READ_MOSTLY, device) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_PREFERRED_LOCATION, cuda.CU_DEVICE_CPU) + assert err == 
cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, cuda.CU_DEVICE_CPU) + assert err == cuda.CUresult.CUDA_SUCCESS + err, concurrentSupported = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device + ) + assert err == cuda.CUresult.CUDA_SUCCESS if concurrentSupported: - err, = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuMemAdvise(ptr, size, cuda.CUmem_advise.CU_MEM_ADVISE_SET_ACCESSED_BY, device) + assert err == cuda.CUresult.CUDA_SUCCESS expected_values_list = ([1, -1, [0, -1, -2], -2],) else: expected_values_list = ([1, -1, [-1, -2, -2], -2], [0, -2, [-2, -2, -2], -2]) # Individual version - attr_type_list = [cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, - cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, - cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, - cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION] + attr_type_list = [ + cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, + cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, + cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, + cuda.CUmem_range_attribute.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, + ] attr_type_size_list = [4, 4, 12, 4] attr_value_list = [None] * len(attr_type_list) for idx in range(len(attr_type_list)): err, attr_tmp = cuda.cuMemRangeGetAttribute(attr_type_size_list[idx], attr_type_list[idx], ptr, size) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS attr_value_list[idx] = attr_tmp matched = False @@ -461,33 +500,36 @@ def test_cuda_mem_range_attr(): matched = True break if not matched: - raise RuntimeError(f'attr_value_list {attr_value_list} did not match any {expected_values_list}') + raise 
RuntimeError(f"attr_value_list {attr_value_list} did not match any {expected_values_list}") # List version - err, attr_value_list_v2 = cuda.cuMemRangeGetAttributes(attr_type_size_list, attr_type_list, len(attr_type_list), ptr, size) - assert(err == cuda.CUresult.CUDA_SUCCESS) + err, attr_value_list_v2 = cuda.cuMemRangeGetAttributes( + attr_type_size_list, attr_type_list, len(attr_type_list), ptr, size + ) + assert err == cuda.CUresult.CUDA_SUCCESS for attr1, attr2 in zip(attr_value_list, attr_value_list_v2): - assert(str(attr1) == str(attr2)) + assert str(attr1) == str(attr2) + + (err,) = cuda.cuMemFree(ptr) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS - err, = cuda.cuMemFree(ptr) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) -@pytest.mark.skipif(driverVersionLessThan(11040) or not supportsMemoryPool(), reason='Mempool for graphs not supported') +@pytest.mark.skipif(driverVersionLessThan(11040) or not supportsMemoryPool(), reason="Mempool for graphs not supported") def test_cuda_graphMem_attr(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, stream = cuda.cuStreamCreate(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, graph = cuda.cuGraphCreate(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS allocSize = 1 @@ -498,83 +540,95 @@ def test_cuda_graphMem_attr(): params.bytesize = allocSize err, allocNode = cuda.cuGraphAddMemAllocNode(graph, None, 0, params) - assert(err == cuda.CUresult.CUDA_SUCCESS) + 
assert err == cuda.CUresult.CUDA_SUCCESS err, freeNode = cuda.cuGraphAddMemFreeNode(graph, [allocNode], 1, params.dptr) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, graphExec = cuda.cuGraphInstantiate(graph, 0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS - err, = cuda.cuGraphLaunch(graphExec, stream) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuGraphLaunch(graphExec, stream) + assert err == cuda.CUresult.CUDA_SUCCESS err, used = cuda.cuDeviceGetGraphMemAttribute(device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, usedHigh = cuda.cuDeviceGetGraphMemAttribute(device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_USED_MEM_HIGH) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, reserved = cuda.cuDeviceGetGraphMemAttribute(device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, reservedHigh = cuda.cuDeviceGetGraphMemAttribute(device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS + err, reserved = cuda.cuDeviceGetGraphMemAttribute( + device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT + ) + assert err == cuda.CUresult.CUDA_SUCCESS + err, reservedHigh = cuda.cuDeviceGetGraphMemAttribute( + device, cuda.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH + ) + assert err == cuda.CUresult.CUDA_SUCCESS assert int(used) >= allocSize assert int(usedHigh) == int(used) assert int(reserved) == int(usedHigh) assert int(reservedHigh) == int(reserved) - err, = cuda.cuGraphDestroy(graph) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuStreamDestroy(stream) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuCtxDestroy(ctx) - assert(err == 
cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuGraphDestroy(graph) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuStreamDestroy(stream) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS -@pytest.mark.skipif(driverVersionLessThan(12010) - or not supportsCudaAPI('cuCoredumpSetAttributeGlobal') - or not supportsCudaAPI('cuCoredumpGetAttributeGlobal'), reason='Coredump API not present') + +@pytest.mark.skipif( + driverVersionLessThan(12010) + or not supportsCudaAPI("cuCoredumpSetAttributeGlobal") + or not supportsCudaAPI("cuCoredumpGetAttributeGlobal"), + reason="Coredump API not present", +) def test_cuda_coredump_attr(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS attr_list = [None] * 6 - err, = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_TRIGGER_HOST, False) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_FILE, b'corefile') - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_PIPE, b'corepipe') - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_LIGHTWEIGHT, True) - assert(err == cuda.CUresult.CUDA_SUCCESS) - - for idx, attr in enumerate([cuda.CUcoredumpSettings.CU_COREDUMP_TRIGGER_HOST, - cuda.CUcoredumpSettings.CU_COREDUMP_FILE, - cuda.CUcoredumpSettings.CU_COREDUMP_PIPE, - cuda.CUcoredumpSettings.CU_COREDUMP_LIGHTWEIGHT, - ]): + (err,) = 
cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_TRIGGER_HOST, False) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_FILE, b"corefile") + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_PIPE, b"corepipe") + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCoredumpSetAttributeGlobal(cuda.CUcoredumpSettings.CU_COREDUMP_LIGHTWEIGHT, True) + assert err == cuda.CUresult.CUDA_SUCCESS + + for idx, attr in enumerate( + [ + cuda.CUcoredumpSettings.CU_COREDUMP_TRIGGER_HOST, + cuda.CUcoredumpSettings.CU_COREDUMP_FILE, + cuda.CUcoredumpSettings.CU_COREDUMP_PIPE, + cuda.CUcoredumpSettings.CU_COREDUMP_LIGHTWEIGHT, + ] + ): err, attr_tmp = cuda.cuCoredumpGetAttributeGlobal(attr) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS attr_list[idx] = attr_tmp - assert(attr_list[0] == False) - assert(attr_list[1] == b'corefile') - assert(attr_list[2] == b'corepipe') - assert(attr_list[3] == True) + assert attr_list[0] is False + assert attr_list[1] == b"corefile" + assert attr_list[2] == b"corepipe" + assert attr_list[3] is True + + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) def test_get_error_name_and_string(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) _, s = cuda.cuGetErrorString(err) @@ -587,134 +641,144 @@ def test_get_error_name_and_string(): assert s == b"invalid device ordinal" _, s = 
cuda.cuGetErrorName(err) assert s == b"CUDA_ERROR_INVALID_DEVICE" - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS + -@pytest.mark.skipif(not callableBinary('nvidia-smi'), reason='Binary existance needed') +@pytest.mark.skipif(not callableBinary("nvidia-smi"), reason="Binary existance needed") def test_device_get_name(): import subprocess - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS p = subprocess.run( - ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], - stdout=subprocess.PIPE, stderr=subprocess.PIPE + ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) - delimiter = b'\r\n' if platform.system() == "Windows" else b'\n' + delimiter = b"\r\n" if platform.system() == "Windows" else b"\n" expect = p.stdout.split(delimiter) size = 64 _, got = cuda.cuDeviceGetName(size, device) - got = got.split(b'\x00')[0] - if any(b'Unable to determine the device handle for' in result for result in expect): + got = got.split(b"\x00")[0] + if any(b"Unable to determine the device handle for" in result for result in expect): # Undeterministic devices get waived pass else: assert any(got in result for result in expect) + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) # TODO: cuStreamGetCaptureInfo_v2 -@pytest.mark.skipif(driverVersionLessThan(11030), reason='Driver too old for cuStreamGetCaptureInfo_v2') +@pytest.mark.skipif(driverVersionLessThan(11030), reason="Driver too old for 
cuStreamGetCaptureInfo_v2") def test_stream_capture(): pass + def test_profiler(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuProfilerStart() - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuProfilerStop() - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuProfilerStart() + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuProfilerStop() + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS + def test_eglFrame(): val = cuda.CUeglFrame() # [, , ] - assert(int(val.frame.pArray[0]) == 0) - assert(int(val.frame.pArray[1]) == 0) - assert(int(val.frame.pArray[2]) == 0) - val.frame.pArray = [1,2,3] + assert int(val.frame.pArray[0]) == 0 + assert int(val.frame.pArray[1]) == 0 + assert int(val.frame.pArray[2]) == 0 + val.frame.pArray = [1, 2, 3] # [, , ] - assert(int(val.frame.pArray[0]) == 1) - assert(int(val.frame.pArray[1]) == 2) - assert(int(val.frame.pArray[2]) == 3) - val.frame.pArray = [cuda.CUarray(4),2,3] + assert int(val.frame.pArray[0]) == 1 + assert int(val.frame.pArray[1]) == 2 + assert int(val.frame.pArray[2]) == 3 + val.frame.pArray = [cuda.CUarray(4), 2, 3] # [, , ] - assert(int(val.frame.pArray[0]) == 4) - assert(int(val.frame.pArray[1]) == 2) - assert(int(val.frame.pArray[2]) == 3) + assert int(val.frame.pArray[0]) == 4 + assert int(val.frame.pArray[1]) == 2 + assert int(val.frame.pArray[2]) == 3 val.frame.pPitch = [4, 2, 3] # [4, 2, 3] - assert(int(val.frame.pPitch[0]) == 4) - assert(int(val.frame.pPitch[1]) == 2) - 
assert(int(val.frame.pPitch[2]) == 3) - val.frame.pPitch = [1,2,3] - assert(int(val.frame.pPitch[0]) == 1) - assert(int(val.frame.pPitch[1]) == 2) - assert(int(val.frame.pPitch[2]) == 3) + assert int(val.frame.pPitch[0]) == 4 + assert int(val.frame.pPitch[1]) == 2 + assert int(val.frame.pPitch[2]) == 3 + val.frame.pPitch = [1, 2, 3] + assert int(val.frame.pPitch[0]) == 1 + assert int(val.frame.pPitch[1]) == 2 + assert int(val.frame.pPitch[2]) == 3 + def test_char_range(): val = cuda.CUipcMemHandle_st() for x in range(-128, 0): val.reserved = [x] * 64 - assert(val.reserved[0] == 256 + x) + assert val.reserved[0] == 256 + x for x in range(0, 256): val.reserved = [x] * 64 - assert(val.reserved[0] == x) + assert val.reserved[0] == x + def test_anon_assign(): val1 = cuda.CUexecAffinityParam_st() val2 = cuda.CUexecAffinityParam_st() - assert(val1.param.smCount.val == 0) + assert val1.param.smCount.val == 0 val1.param.smCount.val = 5 - assert(val1.param.smCount.val == 5) + assert val1.param.smCount.val == 5 val2.param.smCount.val = 11 - assert(val2.param.smCount.val == 11) + assert val2.param.smCount.val == 11 val1.param = val2.param - assert(val1.param.smCount.val == 11) + assert val1.param.smCount.val == 11 + def test_union_assign(): val = cuda.CUlaunchAttributeValue() - val.clusterDim.x, val.clusterDim.y, val.clusterDim.z = 9,9,9 + val.clusterDim.x, val.clusterDim.y, val.clusterDim.z = 9, 9, 9 attr = cuda.CUlaunchAttribute() attr.value = val - assert(val.clusterDim.x == 9) - assert(val.clusterDim.y == 9) - assert(val.clusterDim.z == 9) + assert val.clusterDim.x == 9 + assert val.clusterDim.y == 9 + assert val.clusterDim.z == 9 + def test_invalid_repr_attribute(): val = cuda.CUlaunchAttributeValue() string = str(val) -@pytest.mark.skipif(driverVersionLessThan(12020) - or not supportsCudaAPI('cuGraphAddNode') - or not supportsCudaAPI('cuGraphNodeSetParams') - or not supportsCudaAPI('cuGraphExecNodeSetParams'), reason='Polymorphic graph APIs required') + 
+@pytest.mark.skipif( + driverVersionLessThan(12020) + or not supportsCudaAPI("cuGraphAddNode") + or not supportsCudaAPI("cuGraphNodeSetParams") + or not supportsCudaAPI("cuGraphExecNodeSetParams"), + reason="Polymorphic graph APIs required", +) def test_graph_poly(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, stream = cuda.cuStreamCreate(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS # cuGraphAddNode @@ -723,18 +787,18 @@ def test_graph_poly(): buffers = [] for _ in range(2): err, dptr = cuda.cuMemAlloc(size) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS buffers += [(np.full(size, 2).astype(np.uint8), dptr)] # Update dev buffers for host, device in buffers: - err, = cuda.cuMemcpyHtoD(device, host, size) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuMemcpyHtoD(device, host, size) + assert err == cuda.CUresult.CUDA_SUCCESS # Create graph nodes = [] err, graph = cuda.cuGraphCreate(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS # Memset host, device = buffers[0] @@ -746,7 +810,7 @@ def test_graph_poly(): memsetParams.memset.dst = device memsetParams.memset.value = 1 err, node = cuda.cuGraphAddNode(graph, None, 0, memsetParams) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS nodes += [node] # Memcpy @@ -761,103 +825,109 @@ def test_graph_poly(): memcpyParams.memcpy.copyParams.Height = 1 memcpyParams.memcpy.copyParams.Depth = 1 err, node = cuda.cuGraphAddNode(graph, None, 0, memcpyParams) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == 
cuda.CUresult.CUDA_SUCCESS nodes += [node] # Instantiate, execute, validate err, graphExec = cuda.cuGraphInstantiate(graph, 0) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuGraphLaunch(graphExec, stream) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuStreamSynchronize(stream) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuGraphLaunch(graphExec, stream) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuStreamSynchronize(stream) + assert err == cuda.CUresult.CUDA_SUCCESS # Validate for host, device in buffers: - err, = cuda.cuMemcpyDtoH(host, device, size) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(np.array_equal(buffers[0][0], np.full(size, 1).astype(np.uint8))) - assert(np.array_equal(buffers[1][0], np.full(size, 2).astype(np.uint8))) + (err,) = cuda.cuMemcpyDtoH(host, device, size) + assert err == cuda.CUresult.CUDA_SUCCESS + assert np.array_equal(buffers[0][0], np.full(size, 1).astype(np.uint8)) + assert np.array_equal(buffers[1][0], np.full(size, 2).astype(np.uint8)) # cuGraphNodeSetParams host, device = buffers[1] err, memcpyParamsCopy = cuda.cuGraphMemcpyNodeGetParams(nodes[1]) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(int(memcpyParamsCopy.srcDevice) == int(device)) + assert err == cuda.CUresult.CUDA_SUCCESS + assert int(memcpyParamsCopy.srcDevice) == int(device) host, device = buffers[0] memcpyParams.memcpy.copyParams.srcDevice = device - err, = cuda.cuGraphNodeSetParams(nodes[1], memcpyParams) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuGraphNodeSetParams(nodes[1], memcpyParams) + assert err == cuda.CUresult.CUDA_SUCCESS err, memcpyParamsCopy = cuda.cuGraphMemcpyNodeGetParams(nodes[1]) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(int(memcpyParamsCopy.srcDevice) == int(device)) + assert err == cuda.CUresult.CUDA_SUCCESS + assert int(memcpyParamsCopy.srcDevice) == int(device) # cuGraphExecNodeSetParams 
memsetParams.memset.value = 11 - err, = cuda.cuGraphExecNodeSetParams(graphExec, nodes[0], memsetParams) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuGraphLaunch(graphExec, stream) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuStreamSynchronize(stream) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuMemcpyDtoH(buffers[0][0], buffers[0][1], size) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(np.array_equal(buffers[0][0], np.full(size, 11).astype(np.uint8))) + (err,) = cuda.cuGraphExecNodeSetParams(graphExec, nodes[0], memsetParams) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuGraphLaunch(graphExec, stream) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuStreamSynchronize(stream) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuMemcpyDtoH(buffers[0][0], buffers[0][1], size) + assert err == cuda.CUresult.CUDA_SUCCESS + assert np.array_equal(buffers[0][0], np.full(size, 11).astype(np.uint8)) # Cleanup - err, = cuda.cuMemFree(buffers[0][1]) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuMemFree(buffers[1][1]) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuGraphExecDestroy(graphExec) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuGraphDestroy(graph) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuStreamDestroy(stream) - assert(err == cuda.CUresult.CUDA_SUCCESS) - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) - -@pytest.mark.skipif(driverVersionLessThan(12040) - or not supportsCudaAPI('cuDeviceGetDevResource'), reason='Polymorphic graph APIs required') + (err,) = cuda.cuMemFree(buffers[0][1]) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuMemFree(buffers[1][1]) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuGraphExecDestroy(graphExec) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuGraphDestroy(graph) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = 
cuda.cuStreamDestroy(stream) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS + + +@pytest.mark.skipif( + driverVersionLessThan(12040) or not supportsCudaAPI("cuDeviceGetDevResource"), + reason="Polymorphic graph APIs required", +) def test_cuDeviceGetDevResource(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, resource_in = cuda.cuDeviceGetDevResource(device, cuda.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM) err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, res, count, rem = cuda.cuDevSmResourceSplitByCount(0, resource_in, 0, 2) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(count != 0) - assert(len(res) == 0) + assert err == cuda.CUresult.CUDA_SUCCESS + assert count != 0 + assert len(res) == 0 err, res, count_same, rem = cuda.cuDevSmResourceSplitByCount(count, resource_in, 0, 2) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(count == count_same) - assert(len(res) == count) + assert err == cuda.CUresult.CUDA_SUCCESS + assert count == count_same + assert len(res) == count err, res, count, rem = cuda.cuDevSmResourceSplitByCount(3, resource_in, 0, 2) - assert(err == cuda.CUresult.CUDA_SUCCESS) - assert(len(res) == 3) + assert err == cuda.CUresult.CUDA_SUCCESS + assert len(res) == 3 + + (err,) = cuda.cuCtxDestroy(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS - err, = cuda.cuCtxDestroy(ctx) - assert(err == cuda.CUresult.CUDA_SUCCESS) -@pytest.mark.skipif(driverVersionLessThan(12030) - or not supportsCudaAPI('cuGraphConditionalHandleCreate'), reason='Conditional graph APIs required') +@pytest.mark.skipif( + driverVersionLessThan(12030) or not supportsCudaAPI("cuGraphConditionalHandleCreate"), + 
reason="Conditional graph APIs required", +) def test_conditional(): - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, graph = cuda.cuGraphCreate(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS err, handle = cuda.cuGraphConditionalHandleCreate(graph, ctx, 0, 0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS params = cuda.CUgraphNodeParams() params.type = cuda.CUgraphNodeType.CU_GRAPH_NODE_TYPE_CONDITIONAL @@ -866,10 +936,10 @@ def test_conditional(): params.conditional.size = 1 params.conditional.ctx = ctx - assert(len(params.conditional.phGraph_out) == 1) - assert(int(params.conditional.phGraph_out[0]) == 0) + assert len(params.conditional.phGraph_out) == 1 + assert int(params.conditional.phGraph_out[0]) == 0 err, node = cuda.cuGraphAddNode(graph, None, 0, params) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS - assert(len(params.conditional.phGraph_out) == 1) - assert(int(params.conditional.phGraph_out[0]) != 0) + assert len(params.conditional.phGraph_out) == 1 + assert int(params.conditional.phGraph_out[0]) != 0 diff --git a/cuda_bindings/tests/test_cudart.py b/cuda_bindings/tests/test_cudart.py index 0e2c0af09..88f1b968a 100644 --- a/cuda_bindings/tests/test_cudart.py +++ b/cuda_bindings/tests/test_cudart.py @@ -5,36 +5,44 @@ # this software. Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
-from _pytest.mark.structures import store_mark import ctypes -import cuda.cuda as cuda -import cuda.cudart as cudart import math + import numpy as np import pytest +import cuda.cuda as cuda +import cuda.cudart as cudart + + def isSuccess(err): return err == cudart.cudaError_t.cudaSuccess + def assertSuccess(err): - assert(isSuccess(err)) + assert isSuccess(err) + def driverVersionLessThan(target): err, version = cudart.cudaDriverGetVersion() assertSuccess(err) return version < target + def supportsMemoryPool(): err, isSupported = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0) return isSuccess(err) and isSupported + def supportsSparseTexturesDeviceFilter(): err, isSupported = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrSparseCudaArraySupported, 0) return isSuccess(err) and isSupported + def supportsCudaAPI(name): return name in dir(cuda) or dir(cudart) + def test_cudart_memcpy(): # Allocate dev memory size = 1024 * np.uint8().itemsize @@ -44,44 +52,46 @@ def test_cudart_memcpy(): # Set h1 and h2 memory to be different h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # h1 to D - err, = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) + (err,) = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) assertSuccess(err) # D to h2 - err, = cudart.cudaMemcpy(h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) + (err,) = cudart.cudaMemcpy(h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) assertSuccess(err) # Validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # Cleanup - err, = cudart.cudaFree(dptr) + (err,) = cudart.cudaFree(dptr) assertSuccess(err) + def test_cudart_hostRegister(): # Use hostRegister API to check for correct enum return values page_size = 80 addr_host = np.full(page_size * 3, 
1).astype(np.uint8) addr = addr_host.ctypes.data - size_0 = ((16 * page_size) / 8) - addr_0 = addr + int(((0 * page_size) / 8)) - size_1 = ((16 * page_size) / 8) - addr_1 = addr + int(((8 * page_size) / 8)) + size_0 = (16 * page_size) / 8 + addr_0 = addr + int((0 * page_size) / 8) + size_1 = (16 * page_size) / 8 + addr_1 = addr + int((8 * page_size) / 8) - err, = cudart.cudaHostRegister(addr_0, size_0, 3) + (err,) = cudart.cudaHostRegister(addr_0, size_0, 3) assertSuccess(err) - err, = cudart.cudaHostRegister(addr_1, size_1, 3) - assert(err == cudart.cudaError_t.cudaErrorHostMemoryAlreadyRegistered) + (err,) = cudart.cudaHostRegister(addr_1, size_1, 3) + assert err == cudart.cudaError_t.cudaErrorHostMemoryAlreadyRegistered - err, = cudart.cudaHostUnregister(addr_1) - assert(err == cudart.cudaError_t.cudaErrorInvalidValue) - err, = cudart.cudaHostUnregister(addr_0) + (err,) = cudart.cudaHostUnregister(addr_1) + assert err == cudart.cudaError_t.cudaErrorInvalidValue + (err,) = cudart.cudaHostUnregister(addr_0) assertSuccess(err) + def test_cudart_class_reference(): offset = 1 width = 4 @@ -95,9 +105,9 @@ def test_cudart_class_reference(): externalMemoryMipmappedArrayDesc = cudart.cudaExternalMemoryMipmappedArrayDesc() # Get/set class attributes - extent.width = width + extent.width = width extent.height = height - extent.depth = depth + extent.depth = depth formatDesc.x = 8 formatDesc.y = 0 @@ -105,50 +115,51 @@ def test_cudart_class_reference(): formatDesc.w = 0 formatDesc.f = cudart.cudaChannelFormatKind.cudaChannelFormatKindSigned - externalMemoryMipmappedArrayDesc.offset = offset + externalMemoryMipmappedArrayDesc.offset = offset externalMemoryMipmappedArrayDesc.formatDesc = formatDesc - externalMemoryMipmappedArrayDesc.extent = extent - externalMemoryMipmappedArrayDesc.flags = flags - externalMemoryMipmappedArrayDesc.numLevels = numMipLevels + externalMemoryMipmappedArrayDesc.extent = extent + externalMemoryMipmappedArrayDesc.flags = flags + 
externalMemoryMipmappedArrayDesc.numLevels = numMipLevels # Can manipulate child structure values directly - externalMemoryMipmappedArrayDesc.extent.width = width+1 - externalMemoryMipmappedArrayDesc.extent.height = height+1 - externalMemoryMipmappedArrayDesc.extent.depth = depth+1 - assert(externalMemoryMipmappedArrayDesc.extent.width == width+1) - assert(externalMemoryMipmappedArrayDesc.extent.height == height+1) - assert(externalMemoryMipmappedArrayDesc.extent.depth == depth+1) + externalMemoryMipmappedArrayDesc.extent.width = width + 1 + externalMemoryMipmappedArrayDesc.extent.height = height + 1 + externalMemoryMipmappedArrayDesc.extent.depth = depth + 1 + assert externalMemoryMipmappedArrayDesc.extent.width == width + 1 + assert externalMemoryMipmappedArrayDesc.extent.height == height + 1 + assert externalMemoryMipmappedArrayDesc.extent.depth == depth + 1 externalMemoryMipmappedArrayDesc.formatDesc.x = 20 externalMemoryMipmappedArrayDesc.formatDesc.y = 21 externalMemoryMipmappedArrayDesc.formatDesc.z = 22 externalMemoryMipmappedArrayDesc.formatDesc.w = 23 externalMemoryMipmappedArrayDesc.formatDesc.f = cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat - assert(externalMemoryMipmappedArrayDesc.formatDesc.x == 20) - assert(externalMemoryMipmappedArrayDesc.formatDesc.y == 21) - assert(externalMemoryMipmappedArrayDesc.formatDesc.z == 22) - assert(externalMemoryMipmappedArrayDesc.formatDesc.w == 23) - assert(externalMemoryMipmappedArrayDesc.formatDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat) + assert externalMemoryMipmappedArrayDesc.formatDesc.x == 20 + assert externalMemoryMipmappedArrayDesc.formatDesc.y == 21 + assert externalMemoryMipmappedArrayDesc.formatDesc.z == 22 + assert externalMemoryMipmappedArrayDesc.formatDesc.w == 23 + assert externalMemoryMipmappedArrayDesc.formatDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat # Can copy classes over externalMemoryMipmappedArrayDesc.extent = extent - 
assert(externalMemoryMipmappedArrayDesc.extent.width == width) - assert(externalMemoryMipmappedArrayDesc.extent.height == height) - assert(externalMemoryMipmappedArrayDesc.extent.depth == depth) + assert externalMemoryMipmappedArrayDesc.extent.width == width + assert externalMemoryMipmappedArrayDesc.extent.height == height + assert externalMemoryMipmappedArrayDesc.extent.depth == depth externalMemoryMipmappedArrayDesc.formatDesc = formatDesc - assert(externalMemoryMipmappedArrayDesc.formatDesc.x == 8) - assert(externalMemoryMipmappedArrayDesc.formatDesc.y == 0) - assert(externalMemoryMipmappedArrayDesc.formatDesc.z == 0) - assert(externalMemoryMipmappedArrayDesc.formatDesc.w == 0) - assert(externalMemoryMipmappedArrayDesc.formatDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindSigned) + assert externalMemoryMipmappedArrayDesc.formatDesc.x == 8 + assert externalMemoryMipmappedArrayDesc.formatDesc.y == 0 + assert externalMemoryMipmappedArrayDesc.formatDesc.z == 0 + assert externalMemoryMipmappedArrayDesc.formatDesc.w == 0 + assert externalMemoryMipmappedArrayDesc.formatDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindSigned + -@pytest.mark.skipif(not supportsSparseTexturesDeviceFilter(), reason='Sparse Texture Device Filter') +@pytest.mark.skipif(not supportsSparseTexturesDeviceFilter(), reason="Sparse Texture Device Filter") def test_cudart_class_inline(): extent = cudart.cudaExtent() - extent.width = 1000 + extent.width = 1000 extent.height = 500 - extent.depth = 0 + extent.depth = 0 desc = cudart.cudaChannelFormatDesc() desc.x = 32 @@ -158,12 +169,12 @@ def test_cudart_class_inline(): desc.f = cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat numChannels = 4 - numBytesPerChannel = desc.x/8 + numBytesPerChannel = desc.x / 8 numBytesPerTexel = numChannels * numBytesPerChannel flags = cudart.cudaArraySparse maxDim = max(extent.width, extent.height) - numLevels = int(float(1.0) + math.log(maxDim, 2)) + numLevels = int(1.0 + math.log(maxDim, 
2)) err, mipmap = cudart.cudaMallocMipmappedArray(desc, extent, numLevels, flags) assertSuccess(err) @@ -174,9 +185,9 @@ def test_cudart_class_inline(): # tileExtent # TODO: Will these values always be this same? Maybe need a more stable test? # TODO: Are these values even correct? Need to research the function some more.. Maybe need an easier API test - assert(sparseProp.tileExtent.width == 64) - assert(sparseProp.tileExtent.height == 64) - assert(sparseProp.tileExtent.depth == 1) + assert sparseProp.tileExtent.width == 64 + assert sparseProp.tileExtent.height == 64 + assert sparseProp.tileExtent.depth == 1 sparsePropNew = cudart.cudaArraySparseProperties() sparsePropNew.tileExtent.width = 15 @@ -185,20 +196,24 @@ def test_cudart_class_inline(): # Check that we can copy inner structs sparseProp.tileExtent = sparsePropNew.tileExtent - assert(sparseProp.tileExtent.width == 15) - assert(sparseProp.tileExtent.height == 16) - assert(sparseProp.tileExtent.depth == 17) + assert sparseProp.tileExtent.width == 15 + assert sparseProp.tileExtent.height == 16 + assert sparseProp.tileExtent.depth == 17 - assert(sparseProp.miptailFirstLevel == 3) - assert(sparseProp.miptailSize == 196608) - assert(sparseProp.flags == 0) + assert sparseProp.miptailFirstLevel == 3 + assert sparseProp.miptailSize == 196608 + assert sparseProp.flags == 0 - err, = cudart.cudaFreeMipmappedArray(mipmap) + (err,) = cudart.cudaFreeMipmappedArray(mipmap) assertSuccess(err) # TODO example = cudart.cudaExternalSemaphoreSignalNodeParams() - example.extSemArray = [cudart.cudaExternalSemaphore_t(0), cudart.cudaExternalSemaphore_t(123), cudart.cudaExternalSemaphore_t(999)] + example.extSemArray = [ + cudart.cudaExternalSemaphore_t(0), + cudart.cudaExternalSemaphore_t(123), + cudart.cudaExternalSemaphore_t(999), + ] a1 = cudart.cudaExternalSemaphoreSignalParams() a1.params.fence.value = 7 a1.params.nvSciSync.fence = 999 @@ -222,6 +237,7 @@ def test_cudart_class_inline(): a3.flags = 4 example.numExtSems = 3 + 
def test_cudart_graphs(): err, graph = cudart.cudaGraphCreate(0) assertSuccess(err) @@ -241,9 +257,11 @@ def test_cudart_graphs(): err, stream_with_flags = cudart.cudaStreamCreateWithFlags(cudart.cudaStreamNonBlocking) assertSuccess(err) + def test_cudart_list_access(): err, prop = cudart.cudaGetDeviceProperties(0) - prop.name = prop.name + b' '*(256-len(prop.name)) + prop.name = prop.name + b" " * (256 - len(prop.name)) + def test_cudart_class_setters(): dim = cudart.dim3() @@ -256,30 +274,119 @@ def test_cudart_class_setters(): assert dim.y == 2 assert dim.z == 3 + def test_cudart_both_type(): err, mode = cudart.cudaThreadExchangeStreamCaptureMode(cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal) assertSuccess(err) err, mode = cudart.cudaThreadExchangeStreamCaptureMode(cudart.cudaStreamCaptureMode.cudaStreamCaptureModeRelaxed) assertSuccess(err) - assert(mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal) - err, mode = cudart.cudaThreadExchangeStreamCaptureMode(cudart.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal) + assert mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal + err, mode = cudart.cudaThreadExchangeStreamCaptureMode( + cudart.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal + ) assertSuccess(err) - assert(mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeRelaxed) + assert mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeRelaxed err, mode = cudart.cudaThreadExchangeStreamCaptureMode(cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal) assertSuccess(err) - assert(mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal) + assert mode == cudart.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal + def test_cudart_cudaGetDeviceProperties(): err, prop = cudart.cudaGetDeviceProperties(0) assertSuccess(err) - attrs = ['accessPolicyMaxWindowSize', 'asyncEngineCount', 'canMapHostMemory', 'canUseHostPointerForRegisteredMem', 'clockRate', 'computeMode', 
'computePreemptionSupported', 'concurrentKernels', 'concurrentManagedAccess', 'cooperativeLaunch', 'cooperativeMultiDeviceLaunch', 'deviceOverlap', 'directManagedMemAccessFromHost', 'getPtr', 'globalL1CacheSupported', 'hostNativeAtomicSupported', 'integrated', 'isMultiGpuBoard', 'kernelExecTimeoutEnabled', 'l2CacheSize', 'localL1CacheSupported', 'luid', 'luidDeviceNodeMask', 'major', 'managedMemory', 'maxBlocksPerMultiProcessor', 'maxGridSize', 'maxSurface1D', 'maxSurface1DLayered', 'maxSurface2D', 'maxSurface2DLayered', 'maxSurface3D', 'maxSurfaceCubemap', 'maxSurfaceCubemapLayered', 'maxTexture1D', 'maxTexture1DLayered', 'maxTexture1DLinear', 'maxTexture1DMipmap', 'maxTexture2D', 'maxTexture2DGather', 'maxTexture2DLayered', 'maxTexture2DLinear', 'maxTexture2DMipmap', 'maxTexture3D', 'maxTexture3DAlt', 'maxTextureCubemap', 'maxTextureCubemapLayered', 'maxThreadsDim', 'maxThreadsPerBlock', 'maxThreadsPerMultiProcessor', 'memPitch', 'memoryBusWidth', 'memoryClockRate', 'minor', 'multiGpuBoardGroupID', 'multiProcessorCount', 'name', 'pageableMemoryAccess', 'pageableMemoryAccessUsesHostPageTables', 'pciBusID', 'pciDeviceID', 'pciDomainID', 'persistingL2CacheMaxSize', 'regsPerBlock', 'regsPerMultiprocessor', 'reservedSharedMemPerBlock', 'sharedMemPerBlock', 'sharedMemPerBlockOptin', 'sharedMemPerMultiprocessor', 'singleToDoublePrecisionPerfRatio', 'streamPrioritiesSupported', 'surfaceAlignment', 'tccDriver', 'textureAlignment', 'texturePitchAlignment', 'totalConstMem', 'totalGlobalMem', 'unifiedAddressing', 'uuid', 'warpSize'] + attrs = [ + "accessPolicyMaxWindowSize", + "asyncEngineCount", + "canMapHostMemory", + "canUseHostPointerForRegisteredMem", + "clockRate", + "computeMode", + "computePreemptionSupported", + "concurrentKernels", + "concurrentManagedAccess", + "cooperativeLaunch", + "cooperativeMultiDeviceLaunch", + "deviceOverlap", + "directManagedMemAccessFromHost", + "getPtr", + "globalL1CacheSupported", + "hostNativeAtomicSupported", + "integrated", + 
"isMultiGpuBoard", + "kernelExecTimeoutEnabled", + "l2CacheSize", + "localL1CacheSupported", + "luid", + "luidDeviceNodeMask", + "major", + "managedMemory", + "maxBlocksPerMultiProcessor", + "maxGridSize", + "maxSurface1D", + "maxSurface1DLayered", + "maxSurface2D", + "maxSurface2DLayered", + "maxSurface3D", + "maxSurfaceCubemap", + "maxSurfaceCubemapLayered", + "maxTexture1D", + "maxTexture1DLayered", + "maxTexture1DLinear", + "maxTexture1DMipmap", + "maxTexture2D", + "maxTexture2DGather", + "maxTexture2DLayered", + "maxTexture2DLinear", + "maxTexture2DMipmap", + "maxTexture3D", + "maxTexture3DAlt", + "maxTextureCubemap", + "maxTextureCubemapLayered", + "maxThreadsDim", + "maxThreadsPerBlock", + "maxThreadsPerMultiProcessor", + "memPitch", + "memoryBusWidth", + "memoryClockRate", + "minor", + "multiGpuBoardGroupID", + "multiProcessorCount", + "name", + "pageableMemoryAccess", + "pageableMemoryAccessUsesHostPageTables", + "pciBusID", + "pciDeviceID", + "pciDomainID", + "persistingL2CacheMaxSize", + "regsPerBlock", + "regsPerMultiprocessor", + "reservedSharedMemPerBlock", + "sharedMemPerBlock", + "sharedMemPerBlockOptin", + "sharedMemPerMultiprocessor", + "singleToDoublePrecisionPerfRatio", + "streamPrioritiesSupported", + "surfaceAlignment", + "tccDriver", + "textureAlignment", + "texturePitchAlignment", + "totalConstMem", + "totalGlobalMem", + "unifiedAddressing", + "uuid", + "warpSize", + ] for attr in attrs: assert hasattr(prop, attr) assert len(prop.name.decode("utf-8")) != 0 assert len(prop.uuid.bytes.hex()) != 0 example = cudart.cudaExternalSemaphoreSignalNodeParams() - example.extSemArray = [cudart.cudaExternalSemaphore_t(0), cudart.cudaExternalSemaphore_t(123), cudart.cudaExternalSemaphore_t(999)] + example.extSemArray = [ + cudart.cudaExternalSemaphore_t(0), + cudart.cudaExternalSemaphore_t(123), + cudart.cudaExternalSemaphore_t(999), + ] a1 = cudart.cudaExternalSemaphoreSignalParams() a1.params.fence.value = 7 a1.params.nvSciSync.fence = 999 @@ -303,7 
+410,10 @@ def test_cudart_cudaGetDeviceProperties(): a3.flags = 4 example.numExtSems = 3 -@pytest.mark.skipif(driverVersionLessThan(11030) or not supportsMemoryPool(), reason='When new attributes were introduced') + +@pytest.mark.skipif( + driverVersionLessThan(11030) or not supportsMemoryPool(), reason="When new attributes were introduced" +) def test_cudart_MemPool_attr(): poolProps = cudart.cudaMemPoolProps() poolProps.allocType = cudart.cudaMemAllocationType.cudaMemAllocationTypePinned @@ -314,69 +424,85 @@ def test_cudart_MemPool_attr(): err, pool = cudart.cudaMemPoolCreate(poolProps) assertSuccess(err) - for idx, attr in enumerate([cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies, - cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic, - cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies, - cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold, - cudart.cudaMemPoolAttr.cudaMemPoolAttrReservedMemCurrent, - cudart.cudaMemPoolAttr.cudaMemPoolAttrReservedMemHigh, - cudart.cudaMemPoolAttr.cudaMemPoolAttrUsedMemCurrent, - cudart.cudaMemPoolAttr.cudaMemPoolAttrUsedMemHigh]): + for idx, attr in enumerate( + [ + cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies, + cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic, + cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies, + cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold, + cudart.cudaMemPoolAttr.cudaMemPoolAttrReservedMemCurrent, + cudart.cudaMemPoolAttr.cudaMemPoolAttrReservedMemHigh, + cudart.cudaMemPoolAttr.cudaMemPoolAttrUsedMemCurrent, + cudart.cudaMemPoolAttr.cudaMemPoolAttrUsedMemHigh, + ] + ): err, attr_tmp = cudart.cudaMemPoolGetAttribute(pool, attr) assertSuccess(err) attr_list[idx] = attr_tmp - for idxA, attr in enumerate([cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies, - cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic, - cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies]): - err, = 
cudart.cudaMemPoolSetAttribute(pool, attr, 0) + for idxA, attr in enumerate( + [ + cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies, + cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic, + cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies, + ] + ): + (err,) = cudart.cudaMemPoolSetAttribute(pool, attr, 0) assertSuccess(err) for idx, attr in enumerate([cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold]): - err, = cudart.cudaMemPoolSetAttribute(pool, attr, cuda.cuuint64_t(9)) + (err,) = cudart.cudaMemPoolSetAttribute(pool, attr, cuda.cuuint64_t(9)) assertSuccess(err) - for idx, attr in enumerate([cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies, - cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic, - cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies, - cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold]): + for idx, attr in enumerate( + [ + cudart.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies, + cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic, + cudart.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies, + cudart.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold, + ] + ): err, attr_tmp = cudart.cudaMemPoolGetAttribute(pool, attr) assertSuccess(err) attr_list[idx] = attr_tmp - assert(attr_list[0] == 0) - assert(attr_list[1] == 0) - assert(attr_list[2] == 0) - assert(int(attr_list[3]) == 9) + assert attr_list[0] == 0 + assert attr_list[1] == 0 + assert attr_list[2] == 0 + assert int(attr_list[3]) == 9 - err, = cudart.cudaMemPoolDestroy(pool) + (err,) = cudart.cudaMemPoolDestroy(pool) assertSuccess(err) + def test_cudart_make_api(): - err, channelDesc = cudart.cudaCreateChannelDesc(32,0,0,0,cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat) + err, channelDesc = cudart.cudaCreateChannelDesc( + 32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat + ) assertSuccess(err) - assert(channelDesc.x == 32) - assert(channelDesc.y == 0) - 
assert(channelDesc.z == 0) - assert(channelDesc.w == 0) - assert(channelDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat) + assert channelDesc.x == 32 + assert channelDesc.y == 0 + assert channelDesc.z == 0 + assert channelDesc.w == 0 + assert channelDesc.f == cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat # make_cudaPitchedPtr - cudaPitchedPtr = cudart.make_cudaPitchedPtr(1,2,3,4) - assert(cudaPitchedPtr.ptr == 1) - assert(cudaPitchedPtr.pitch == 2) - assert(cudaPitchedPtr.xsize == 3) - assert(cudaPitchedPtr.ysize == 4) + cudaPitchedPtr = cudart.make_cudaPitchedPtr(1, 2, 3, 4) + assert cudaPitchedPtr.ptr == 1 + assert cudaPitchedPtr.pitch == 2 + assert cudaPitchedPtr.xsize == 3 + assert cudaPitchedPtr.ysize == 4 # make_cudaPos - cudaPos = cudart.make_cudaPos(1,2,3) - assert(cudaPos.x == 1) - assert(cudaPos.y == 2) - assert(cudaPos.z == 3) + cudaPos = cudart.make_cudaPos(1, 2, 3) + assert cudaPos.x == 1 + assert cudaPos.y == 2 + assert cudaPos.z == 3 # make_cudaExtent - cudaExtent = cudart.make_cudaExtent(1,2,3) - assert(cudaExtent.width == 1) - assert(cudaExtent.height == 2) - assert(cudaExtent.depth == 3) + cudaExtent = cudart.make_cudaExtent(1, 2, 3) + assert cudaExtent.width == 1 + assert cudaExtent.height == 2 + assert cudaExtent.depth == 3 + def test_cudart_cudaStreamGetCaptureInfo(): # create stream @@ -386,30 +512,27 @@ def test_cudart_cudaStreamGetCaptureInfo(): # validate that stream is not capturing err, status, *info = cudart.cudaStreamGetCaptureInfo(stream) assertSuccess(err) - assert(status == cudart.cudaStreamCaptureStatus.cudaStreamCaptureStatusNone) + assert status == cudart.cudaStreamCaptureStatus.cudaStreamCaptureStatusNone # start capture - err, = cudart.cudaStreamBeginCapture( - stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal - ) + (err,) = cudart.cudaStreamBeginCapture(stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal) assertSuccess(err) # validate that stream is capturing now err, 
status, *info = cudart.cudaStreamGetCaptureInfo(stream) assertSuccess(err) - assert(status == cudart.cudaStreamCaptureStatus.cudaStreamCaptureStatusActive) + assert status == cudart.cudaStreamCaptureStatus.cudaStreamCaptureStatusActive # clean up err, pgraph = cudart.cudaStreamEndCapture(stream) assertSuccess(err) + def test_cudart_cudaArrayGetInfo(): # create channel descriptor x, y, z, w = 8, 0, 0, 0 f = cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - err, desc = cudart.cudaCreateChannelDesc( - x, y, z, w, f - ) + err, desc = cudart.cudaCreateChannelDesc(x, y, z, w, f) assertSuccess(err) # allocate device array @@ -424,30 +547,29 @@ def test_cudart_cudaArrayGetInfo(): assertSuccess(err) # validate descriptor, extent, flags - assert(desc.x == x) - assert(desc.y == y) - assert(desc.z == z) - assert(desc.w == w) - assert(desc.f == f) - assert(extent.width == width) - assert(extent.height == height) - assert(inFlags == outFlags) + assert desc.x == x + assert desc.y == y + assert desc.z == z + assert desc.w == w + assert desc.f == f + assert extent.width == width + assert extent.height == height + assert inFlags == outFlags # clean up - err, = cudart.cudaFreeArray(arr) + (err,) = cudart.cudaFreeArray(arr) assertSuccess(err) - + + def test_cudart_cudaMemcpy2DToArray(): # create host arrays size = int(1024 * np.uint8().itemsize) h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # create channel descriptor - err, desc = cudart.cudaCreateChannelDesc( - 8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - ) + err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned) assertSuccess(err) # allocate device array @@ -455,26 +577,21 @@ def test_cudart_cudaMemcpy2DToArray(): assertSuccess(err) # h1 to arr - err, = cudart.cudaMemcpy2DToArray( - arr, 0, 0, h1, size, size, 1, - 
cudart.cudaMemcpyKind.cudaMemcpyHostToDevice - ) + (err,) = cudart.cudaMemcpy2DToArray(arr, 0, 0, h1, size, size, 1, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) assertSuccess(err) # arr to h2 - err, = cudart.cudaMemcpy2DFromArray( - h2, size, arr, 0, 0, size, 1, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - ) + (err,) = cudart.cudaMemcpy2DFromArray(h2, size, arr, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) assertSuccess(err) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFreeArray(arr) + (err,) = cudart.cudaFreeArray(arr) assertSuccess(err) + def test_cudart_cudaMemcpy2DToArray_DtoD(): # allocate device memory size = 1024 * np.uint8().itemsize @@ -486,12 +603,10 @@ def test_cudart_cudaMemcpy2DToArray_DtoD(): # create host arrays h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # create channel descriptor - err, desc = cudart.cudaCreateChannelDesc( - 8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - ) + err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned) assertSuccess(err) # allocate device array @@ -499,49 +614,42 @@ def test_cudart_cudaMemcpy2DToArray_DtoD(): assertSuccess(err) # h1 to d1 - err, = cudart.cudaMemcpy(d1, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) + (err,) = cudart.cudaMemcpy(d1, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) assertSuccess(err) # d1 to arr - err, = cudart.cudaMemcpy2DToArray( - arr, 0, 0, d1, size, size, 1, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice - ) + (err,) = cudart.cudaMemcpy2DToArray(arr, 0, 0, d1, size, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) assertSuccess(err) # arr to d2 - err, = cudart.cudaMemcpy2DFromArray( - d2, size, arr, 0, 0, size, 1, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice - ) + 
(err,) = cudart.cudaMemcpy2DFromArray(d2, size, arr, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) assertSuccess(err) # d2 to h2 - err, = cudart.cudaMemcpy(h2, d2, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) + (err,) = cudart.cudaMemcpy(h2, d2, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) assertSuccess(err) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFreeArray(arr) + (err,) = cudart.cudaFreeArray(arr) assertSuccess(err) - err, = cudart.cudaFree(d2) + (err,) = cudart.cudaFree(d2) assertSuccess(err) - err, = cudart.cudaFree(d1) + (err,) = cudart.cudaFree(d1) assertSuccess(err) + def test_cudart_cudaMemcpy2DArrayToArray(): # create host arrays size = 1024 * np.uint8().itemsize h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # create channel descriptor - err, desc = cudart.cudaCreateChannelDesc( - 8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - ) + err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned) assertSuccess(err) # allocate device arrays @@ -551,46 +659,38 @@ def test_cudart_cudaMemcpy2DArrayToArray(): assertSuccess(err) # h1 to a1 - err, = cudart.cudaMemcpy2DToArray( - a1, 0, 0, h1, size, size, 1, - cudart.cudaMemcpyKind.cudaMemcpyHostToDevice - ) + (err,) = cudart.cudaMemcpy2DToArray(a1, 0, 0, h1, size, size, 1, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) assertSuccess(err) # a1 to a2 - err, = cudart.cudaMemcpy2DArrayToArray( - a2, 0, 0, a1, 0, 0, size, 1, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice + (err,) = cudart.cudaMemcpy2DArrayToArray( + a2, 0, 0, a1, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice ) assertSuccess(err) # a2 to h2 - err, = cudart.cudaMemcpy2DFromArray( - h2, size, a2, 0, 0, size, 1, - 
cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - ) + (err,) = cudart.cudaMemcpy2DFromArray(h2, size, a2, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) assertSuccess(err) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFreeArray(a2) + (err,) = cudart.cudaFreeArray(a2) assertSuccess(err) - err, = cudart.cudaFreeArray(a1) + (err,) = cudart.cudaFreeArray(a1) assertSuccess(err) + def test_cudart_cudaMemcpyArrayToArray(): # create host arrays size = 1024 * np.uint8().itemsize h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # create channel descriptor - err, desc = cudart.cudaCreateChannelDesc( - 8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - ) + err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned) assertSuccess(err) # allocate device arrays @@ -600,42 +700,32 @@ def test_cudart_cudaMemcpyArrayToArray(): assertSuccess(err) # h1 to a1 - err, = cudart.cudaMemcpy2DToArray( - a1, 0, 0, h1, size, size, 1, - cudart.cudaMemcpyKind.cudaMemcpyHostToDevice - ) + (err,) = cudart.cudaMemcpy2DToArray(a1, 0, 0, h1, size, size, 1, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) assertSuccess(err) # a1 to a2 - err, = cudart.cudaMemcpyArrayToArray( - a2, 0, 0, a1, 0, 0, size, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice - ) + (err,) = cudart.cudaMemcpyArrayToArray(a2, 0, 0, a1, 0, 0, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) assertSuccess(err) # a2 to h2 - err, = cudart.cudaMemcpy2DFromArray( - h2, size, a2, 0, 0, size, 1, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - ) + (err,) = cudart.cudaMemcpy2DFromArray(h2, size, a2, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) assertSuccess(err) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - 
err, = cudart.cudaFreeArray(a2) + (err,) = cudart.cudaFreeArray(a2) assertSuccess(err) - err, = cudart.cudaFreeArray(a1) + (err,) = cudart.cudaFreeArray(a1) assertSuccess(err) + def test_cudart_cudaGetChannelDesc(): # create channel descriptor x, y, z, w = 8, 0, 0, 0 f = cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - err, desc = cudart.cudaCreateChannelDesc( - x, y, z, w, f - ) + err, desc = cudart.cudaCreateChannelDesc(x, y, z, w, f) assertSuccess(err) # allocate device array @@ -650,16 +740,17 @@ def test_cudart_cudaGetChannelDesc(): assertSuccess(err) # validate array channel descriptor - assert(desc.x == x) - assert(desc.y == y) - assert(desc.z == z) - assert(desc.w == w) - assert(desc.f == f) + assert desc.x == x + assert desc.y == y + assert desc.z == z + assert desc.w == w + assert desc.f == f # clean up - err, = cudart.cudaFreeArray(arr) + (err,) = cudart.cudaFreeArray(arr) assertSuccess(err) + def test_cudart_cudaGetTextureObjectTextureDesc(): # create channel descriptor err, channelDesc = cudart.cudaCreateChannelDesc( @@ -689,18 +780,19 @@ def test_cudart_cudaGetTextureObjectTextureDesc(): if attr in ["borderColor", "getPtr"]: continue if not attr.startswith("_"): - assert(getattr(outTexDesc, attr) == getattr(inTexDesc, attr)) - + assert getattr(outTexDesc, attr) == getattr(inTexDesc, attr) + # clean up - err, = cudart.cudaDestroyTextureObject(texObject) + (err,) = cudart.cudaDestroyTextureObject(texObject) assertSuccess(err) + def test_cudart_cudaMemset3D(): # create host arrays size = 1024 * np.uint8().itemsize h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # allocate device memory devExtent = cudart.make_cudaExtent(32, 32, 1) @@ -709,27 +801,26 @@ def test_cudart_cudaMemset3D(): # set memory memExtent = cudart.make_cudaExtent(devPitchedPtr.pitch, devPitchedPtr.ysize, 1) - err, = cudart.cudaMemset3D(devPitchedPtr, 1, memExtent) 
+ (err,) = cudart.cudaMemset3D(devPitchedPtr, 1, memExtent) assertSuccess(err) # D to h2 - err, = cudart.cudaMemcpy( - h2, devPitchedPtr.ptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - ) + (err,) = cudart.cudaMemcpy(h2, devPitchedPtr.ptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFree(devPitchedPtr.ptr) + (err,) = cudart.cudaFree(devPitchedPtr.ptr) assertSuccess(err) + def test_cudart_cudaMemset3D_2D(): # create host arrays size = 512 * np.uint8().itemsize h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # allocate device memory devExtent = cudart.make_cudaExtent(1024, 1, 1) @@ -738,58 +829,51 @@ def test_cudart_cudaMemset3D_2D(): # set memory memExtent = cudart.make_cudaExtent(size, devPitchedPtr.ysize, 1) - err, = cudart.cudaMemset3D(devPitchedPtr, 1, memExtent) + (err,) = cudart.cudaMemset3D(devPitchedPtr, 1, memExtent) assertSuccess(err) # D to h2 - err, = cudart.cudaMemcpy( - h2, devPitchedPtr.ptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - ) + (err,) = cudart.cudaMemcpy(h2, devPitchedPtr.ptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFree(devPitchedPtr.ptr) + (err,) = cudart.cudaFree(devPitchedPtr.ptr) assertSuccess(err) + def test_cudart_cudaMemcpyToArray(): # create host arrays size = 1024 * np.uint8().itemsize h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # create channel descriptor - err, desc = cudart.cudaCreateChannelDesc( - 8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - ) + err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, 
cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned) assertSuccess(err) - + # allocate device array err, arr = cudart.cudaMallocArray(desc, size, 0, 0) assertSuccess(err) # h1 to arr - err, = cudart.cudaMemcpyToArray( - arr, 0, 0, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice - ) + (err,) = cudart.cudaMemcpyToArray(arr, 0, 0, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) assertSuccess(err) # arr to h2 - err, = cudart.cudaMemcpyFromArray( - h2, arr, 0, 0, size, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - ) + (err,) = cudart.cudaMemcpyFromArray(h2, arr, 0, 0, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) assertSuccess(err) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFreeArray(arr) + (err,) = cudart.cudaFreeArray(arr) assertSuccess(err) + def test_cudart_cudaMemcpyToArray_DtoD(): # allocate device memory size = int(1024 * np.uint8().itemsize) @@ -801,61 +885,53 @@ def test_cudart_cudaMemcpyToArray_DtoD(): # create host arrays h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # create channel descriptor - err, desc = cudart.cudaCreateChannelDesc( - 8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - ) + err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned) assertSuccess(err) - + # allocate device array err, arr = cudart.cudaMallocArray(desc, size, 0, 0) assertSuccess(err) # h1 to d1 - err, = cudart.cudaMemcpy(d1, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) + (err,) = cudart.cudaMemcpy(d1, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) assertSuccess(err) # d1 to arr - err, = cudart.cudaMemcpyToArray( - arr, 0, 0, d1, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice - ) + (err,) = cudart.cudaMemcpyToArray(arr, 0, 0, d1, size, 
cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) assertSuccess(err) # arr to d2 - err, = cudart.cudaMemcpyFromArray( - d2, arr, 0, 0, size, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice - ) + (err,) = cudart.cudaMemcpyFromArray(d2, arr, 0, 0, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) assertSuccess(err) # d2 to h2 - err, = cudart.cudaMemcpy(h2, d2, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) + (err,) = cudart.cudaMemcpy(h2, d2, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) assertSuccess(err) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFreeArray(arr) + (err,) = cudart.cudaFreeArray(arr) assertSuccess(err) - err, = cudart.cudaFree(d2) + (err,) = cudart.cudaFree(d2) assertSuccess(err) - err, = cudart.cudaFree(d1) + (err,) = cudart.cudaFree(d1) assertSuccess(err) + def test_cudart_cudaMemcpy3DAsync(): # create host arrays size = int(1024 * np.uint8().itemsize) h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # create channel descriptor - err, desc = cudart.cudaCreateChannelDesc( - 8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - ) + err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned) assertSuccess(err) # allocate device array @@ -874,27 +950,25 @@ def test_cudart_cudaMemcpy3DAsync(): params.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice # h1 to arr - err, = cudart.cudaMemcpy3DAsync(params, stream) + (err,) = cudart.cudaMemcpy3DAsync(params, stream) assertSuccess(err) # await results - err, = cudart.cudaStreamSynchronize(stream) + (err,) = cudart.cudaStreamSynchronize(stream) assertSuccess(err) # arr to h2 - err, = cudart.cudaMemcpy2DFromArray( - h2, size, arr, 0, 0, size, 1, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - ) + (err,) = cudart.cudaMemcpy2DFromArray(h2, size, 
arr, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) assertSuccess(err) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFreeArray(arr) + (err,) = cudart.cudaFreeArray(arr) assertSuccess(err) + def test_cudart_cudaGraphAddMemcpyNode1D(): # allocate device memory size = 1024 * np.uint8().itemsize @@ -904,7 +978,7 @@ def test_cudart_cudaGraphAddMemcpyNode1D(): # create host arrays h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # build graph err, graph = cudart.cudaGraphCreate(0) @@ -916,8 +990,7 @@ def test_cudart_cudaGraphAddMemcpyNode1D(): ) assertSuccess(err) err, dToHNode = cudart.cudaGraphAddMemcpyNode1D( - graph, [hToDNode], 1, h2, dptr, size, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost + graph, [hToDNode], 1, h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost ) assertSuccess(err) @@ -928,19 +1001,20 @@ def test_cudart_cudaGraphAddMemcpyNode1D(): # execute graph err, execGraph = cudart.cudaGraphInstantiate(graph, 0) assertSuccess(err) - err, = cudart.cudaGraphLaunch(execGraph, stream) + (err,) = cudart.cudaGraphLaunch(execGraph, stream) # await results - err, = cudart.cudaStreamSynchronize(stream) + (err,) = cudart.cudaStreamSynchronize(stream) assertSuccess(err) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFree(dptr) + (err,) = cudart.cudaFree(dptr) assertSuccess(err) + def test_cudart_cudaGraphAddMemsetNode(): # allocate device memory size = 1024 * np.uint8().itemsize @@ -950,7 +1024,7 @@ def test_cudart_cudaGraphAddMemsetNode(): # create host arrays h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # build graph err, graph = cudart.cudaGraphCreate(0) @@ -966,17 +1040,13 @@ def 
test_cudart_cudaGraphAddMemsetNode(): params.height = 1 # add nodes - err, setNode = cudart.cudaGraphAddMemsetNode( - graph, [], 0, params - ) + err, setNode = cudart.cudaGraphAddMemsetNode(graph, [], 0, params) assertSuccess(err) err, cpyNode = cudart.cudaGraphAddMemcpyNode1D( - graph, [setNode], 1, h2, dptr, size, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost + graph, [setNode], 1, h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost ) assertSuccess(err) - # create stream err, stream = cudart.cudaStreamCreate() assertSuccess(err) @@ -984,20 +1054,21 @@ def test_cudart_cudaGraphAddMemsetNode(): # execute graph err, execGraph = cudart.cudaGraphInstantiate(graph, 0) assertSuccess(err) - err, = cudart.cudaGraphLaunch(execGraph, stream) + (err,) = cudart.cudaGraphLaunch(execGraph, stream) assertSuccess(err) # await results - err, = cudart.cudaStreamSynchronize(stream) + (err,) = cudart.cudaStreamSynchronize(stream) assertSuccess(err) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFree(dptr) + (err,) = cudart.cudaFree(dptr) assertSuccess(err) + def test_cudart_cudaMemcpy3DPeer(): # allocate device memory size = int(1024 * np.uint8().itemsize) @@ -1007,12 +1078,10 @@ def test_cudart_cudaMemcpy3DPeer(): # create host arrays h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # create channel descriptor - err, desc = cudart.cudaCreateChannelDesc( - 8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - ) + err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned) assertSuccess(err) # allocate device array @@ -1026,29 +1095,27 @@ def test_cudart_cudaMemcpy3DPeer(): params.extent = cudart.make_cudaExtent(size, 1, 1) # h1 to D - err, = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) + 
(err,) = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) assertSuccess(err) # D to arr - err, = cudart.cudaMemcpy3DPeer(params) + (err,) = cudart.cudaMemcpy3DPeer(params) assertSuccess(err) # arr to h2 - err, = cudart.cudaMemcpy2DFromArray( - h2, size, arr, 0, 0, size, 1, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - ) + (err,) = cudart.cudaMemcpy2DFromArray(h2, size, arr, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) assertSuccess(err) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFreeArray(arr) + (err,) = cudart.cudaFreeArray(arr) assertSuccess(err) - err, = cudart.cudaFree(dptr) + (err,) = cudart.cudaFree(dptr) assertSuccess(err) + def test_cudart_cudaMemcpy3DPeerAsync(): # allocate device memory size = 1024 * np.uint8().itemsize @@ -1058,12 +1125,10 @@ def test_cudart_cudaMemcpy3DPeerAsync(): # create host arrays h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # create channel descriptor - err, desc = cudart.cudaCreateChannelDesc( - 8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned - ) + err, desc = cudart.cudaCreateChannelDesc(8, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindUnsigned) assertSuccess(err) # allocate device array @@ -1081,55 +1146,54 @@ def test_cudart_cudaMemcpy3DPeerAsync(): params.extent = cudart.make_cudaExtent(size, 1, 1) # h1 to D - err, = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) + (err,) = cudart.cudaMemcpy(dptr, h1, size, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) assertSuccess(err) # D to arr - err, = cudart.cudaMemcpy3DPeerAsync(params, stream) + (err,) = cudart.cudaMemcpy3DPeerAsync(params, stream) assertSuccess(err) # await results - err, = cudart.cudaStreamSynchronize(stream) + (err,) = cudart.cudaStreamSynchronize(stream) 
assertSuccess(err) # arr to h2 - err, = cudart.cudaMemcpy2DFromArray( - h2, size, arr, 0, 0, size, 1, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost - ) + (err,) = cudart.cudaMemcpy2DFromArray(h2, size, arr, 0, 0, size, 1, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) assertSuccess(err) # validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # clean up - err, = cudart.cudaFreeArray(arr) + (err,) = cudart.cudaFreeArray(arr) assertSuccess(err) - err, = cudart.cudaFree(dptr) + (err,) = cudart.cudaFree(dptr) assertSuccess(err) + def test_profiler(): - err, = cudart.cudaProfilerStart() + (err,) = cudart.cudaProfilerStart() assertSuccess(err) - err, = cudart.cudaProfilerStop() + (err,) = cudart.cudaProfilerStop() assertSuccess(err) + def test_cudart_eglFrame(): frame = cudart.cudaEglFrame() # [, , ] - assert(int(frame.frame.pArray[0]) == 0) - assert(int(frame.frame.pArray[1]) == 0) - assert(int(frame.frame.pArray[2]) == 0) - frame.frame.pArray = [1,2,3] + assert int(frame.frame.pArray[0]) == 0 + assert int(frame.frame.pArray[1]) == 0 + assert int(frame.frame.pArray[2]) == 0 + frame.frame.pArray = [1, 2, 3] # [, , ] - assert(int(frame.frame.pArray[0]) == 1) - assert(int(frame.frame.pArray[1]) == 2) - assert(int(frame.frame.pArray[2]) == 3) - frame.frame.pArray = [1,2,cudart.cudaArray_t(4)] + assert int(frame.frame.pArray[0]) == 1 + assert int(frame.frame.pArray[1]) == 2 + assert int(frame.frame.pArray[2]) == 3 + frame.frame.pArray = [1, 2, cudart.cudaArray_t(4)] # [, , ] - assert(int(frame.frame.pArray[0]) == 1) - assert(int(frame.frame.pArray[1]) == 2) - assert(int(frame.frame.pArray[2]) == 4) + assert int(frame.frame.pArray[0]) == 1 + assert int(frame.frame.pArray[1]) == 2 + assert int(frame.frame.pArray[2]) == 4 # frame.frame.pPitch # [ptr : 0x1 # pitch : 2 @@ -1141,18 +1205,18 @@ def test_cudart_eglFrame(): # pitch : 0 # xsize : 0 # ysize : 0] - assert(int(frame.frame.pPitch[0].ptr) == 1) - assert(int(frame.frame.pPitch[0].pitch) == 2) - 
assert(int(frame.frame.pPitch[0].xsize) == 4) - assert(int(frame.frame.pPitch[0].ysize) == 0) - assert(int(frame.frame.pPitch[1].ptr) == 0) - assert(int(frame.frame.pPitch[1].pitch) == 0) - assert(int(frame.frame.pPitch[1].xsize) == 0) - assert(int(frame.frame.pPitch[1].ysize) == 0) - assert(int(frame.frame.pPitch[2].ptr) == 0) - assert(int(frame.frame.pPitch[2].pitch) == 0) - assert(int(frame.frame.pPitch[2].xsize) == 0) - assert(int(frame.frame.pPitch[2].ysize) == 0) + assert int(frame.frame.pPitch[0].ptr) == 1 + assert int(frame.frame.pPitch[0].pitch) == 2 + assert int(frame.frame.pPitch[0].xsize) == 4 + assert int(frame.frame.pPitch[0].ysize) == 0 + assert int(frame.frame.pPitch[1].ptr) == 0 + assert int(frame.frame.pPitch[1].pitch) == 0 + assert int(frame.frame.pPitch[1].xsize) == 0 + assert int(frame.frame.pPitch[1].ysize) == 0 + assert int(frame.frame.pPitch[2].ptr) == 0 + assert int(frame.frame.pPitch[2].pitch) == 0 + assert int(frame.frame.pPitch[2].xsize) == 0 + assert int(frame.frame.pPitch[2].ysize) == 0 frame.frame.pPitch = [cudart.cudaPitchedPtr(), cudart.cudaPitchedPtr(), cudart.cudaPitchedPtr()] # [ptr : 0x0 # pitch : 0 @@ -1164,21 +1228,21 @@ def test_cudart_eglFrame(): # pitch : 0 # xsize : 0 # ysize : 0] - assert(int(frame.frame.pPitch[0].ptr) == 0) - assert(int(frame.frame.pPitch[0].pitch) == 0) - assert(int(frame.frame.pPitch[0].xsize) == 0) - assert(int(frame.frame.pPitch[0].ysize) == 0) - assert(int(frame.frame.pPitch[1].ptr) == 0) - assert(int(frame.frame.pPitch[1].pitch) == 0) - assert(int(frame.frame.pPitch[1].xsize) == 0) - assert(int(frame.frame.pPitch[1].ysize) == 0) - assert(int(frame.frame.pPitch[2].ptr) == 0) - assert(int(frame.frame.pPitch[2].pitch) == 0) - assert(int(frame.frame.pPitch[2].xsize) == 0) - assert(int(frame.frame.pPitch[2].ysize) == 0) + assert int(frame.frame.pPitch[0].ptr) == 0 + assert int(frame.frame.pPitch[0].pitch) == 0 + assert int(frame.frame.pPitch[0].xsize) == 0 + assert int(frame.frame.pPitch[0].ysize) == 0 
+ assert int(frame.frame.pPitch[1].ptr) == 0 + assert int(frame.frame.pPitch[1].pitch) == 0 + assert int(frame.frame.pPitch[1].xsize) == 0 + assert int(frame.frame.pPitch[1].ysize) == 0 + assert int(frame.frame.pPitch[2].ptr) == 0 + assert int(frame.frame.pPitch[2].pitch) == 0 + assert int(frame.frame.pPitch[2].xsize) == 0 + assert int(frame.frame.pPitch[2].ysize) == 0 x = frame.frame.pPitch[0] x.pitch = 123 - frame.frame.pPitch = [x,x,x] + frame.frame.pPitch = [x, x, x] # [ptr : 0x0 # pitch : 123 # xsize : 0 @@ -1189,18 +1253,18 @@ def test_cudart_eglFrame(): # pitch : 123 # xsize : 0 # ysize : 0] - assert(int(frame.frame.pPitch[0].ptr) == 0) - assert(int(frame.frame.pPitch[0].pitch) == 123) - assert(int(frame.frame.pPitch[0].xsize) == 0) - assert(int(frame.frame.pPitch[0].ysize) == 0) - assert(int(frame.frame.pPitch[1].ptr) == 0) - assert(int(frame.frame.pPitch[1].pitch) == 123) - assert(int(frame.frame.pPitch[1].xsize) == 0) - assert(int(frame.frame.pPitch[1].ysize) == 0) - assert(int(frame.frame.pPitch[2].ptr) == 0) - assert(int(frame.frame.pPitch[2].pitch) == 123) - assert(int(frame.frame.pPitch[2].xsize) == 0) - assert(int(frame.frame.pPitch[2].ysize) == 0) + assert int(frame.frame.pPitch[0].ptr) == 0 + assert int(frame.frame.pPitch[0].pitch) == 123 + assert int(frame.frame.pPitch[0].xsize) == 0 + assert int(frame.frame.pPitch[0].ysize) == 0 + assert int(frame.frame.pPitch[1].ptr) == 0 + assert int(frame.frame.pPitch[1].pitch) == 123 + assert int(frame.frame.pPitch[1].xsize) == 0 + assert int(frame.frame.pPitch[1].ysize) == 0 + assert int(frame.frame.pPitch[2].ptr) == 0 + assert int(frame.frame.pPitch[2].pitch) == 123 + assert int(frame.frame.pPitch[2].xsize) == 0 + assert int(frame.frame.pPitch[2].ysize) == 0 x.pitch = 1234 # [ptr : 0x0 # pitch : 123 @@ -1212,37 +1276,40 @@ def test_cudart_eglFrame(): # pitch : 123 # xsize : 0 # ysize : 0] - assert(int(frame.frame.pPitch[0].ptr) == 0) - assert(int(frame.frame.pPitch[0].pitch) == 123) - 
assert(int(frame.frame.pPitch[0].xsize) == 0) - assert(int(frame.frame.pPitch[0].ysize) == 0) - assert(int(frame.frame.pPitch[1].ptr) == 0) - assert(int(frame.frame.pPitch[1].pitch) == 123) - assert(int(frame.frame.pPitch[1].xsize) == 0) - assert(int(frame.frame.pPitch[1].ysize) == 0) - assert(int(frame.frame.pPitch[2].ptr) == 0) - assert(int(frame.frame.pPitch[2].pitch) == 123) - assert(int(frame.frame.pPitch[2].xsize) == 0) - assert(int(frame.frame.pPitch[2].ysize) == 0) + assert int(frame.frame.pPitch[0].ptr) == 0 + assert int(frame.frame.pPitch[0].pitch) == 123 + assert int(frame.frame.pPitch[0].xsize) == 0 + assert int(frame.frame.pPitch[0].ysize) == 0 + assert int(frame.frame.pPitch[1].ptr) == 0 + assert int(frame.frame.pPitch[1].pitch) == 123 + assert int(frame.frame.pPitch[1].xsize) == 0 + assert int(frame.frame.pPitch[1].ysize) == 0 + assert int(frame.frame.pPitch[2].ptr) == 0 + assert int(frame.frame.pPitch[2].pitch) == 123 + assert int(frame.frame.pPitch[2].xsize) == 0 + assert int(frame.frame.pPitch[2].ysize) == 0 + def cudart_func_stream_callback(use_host_api): class testStruct(ctypes.Structure): - _fields_ = [('a', ctypes.c_int), - ('b', ctypes.c_int), - ('c', ctypes.c_int),] + _fields_ = [ + ("a", ctypes.c_int), + ("b", ctypes.c_int), + ("c", ctypes.c_int), + ] def task_callback_host(userData): data = testStruct.from_address(userData) - assert(data.a == 1) - assert(data.b == 2) - assert(data.c == 3) + assert data.a == 1 + assert data.b == 2 + assert data.c == 3 return 0 def task_callback_stream(stream, status, userData): data = testStruct.from_address(userData) - assert(data.a == 1) - assert(data.b == 2) - assert(data.c == 3) + assert data.a == 1 + assert data.b == 2 + assert data.c == 3 return 0 if use_host_api: @@ -1266,12 +1333,12 @@ def task_callback_stream(stream, status, userData): err, stream = cudart.cudaStreamCreate() assertSuccess(err) if use_host_api: - err, = cudart.cudaLaunchHostFunc(stream, callback, ctypes.addressof(c_data)) + (err,) = 
cudart.cudaLaunchHostFunc(stream, callback, ctypes.addressof(c_data)) assertSuccess(err) else: - err, = cudart.cudaStreamAddCallback(stream, callback, ctypes.addressof(c_data), 0) + (err,) = cudart.cudaStreamAddCallback(stream, callback, ctypes.addressof(c_data), 0) assertSuccess(err) - err, = cudart.cudaDeviceSynchronize() + (err,) = cudart.cudaDeviceSynchronize() assertSuccess(err) @@ -1279,8 +1346,11 @@ def test_cudart_func_callback(): cudart_func_stream_callback(use_host_api=False) cudart_func_stream_callback(use_host_api=True) -@pytest.mark.skipif(driverVersionLessThan(12030) - or not supportsCudaAPI('cudaGraphConditionalHandleCreate'), reason='Conditional graph APIs required') + +@pytest.mark.skipif( + driverVersionLessThan(12030) or not supportsCudaAPI("cudaGraphConditionalHandleCreate"), + reason="Conditional graph APIs required", +) def test_cudart_conditional(): err, graph = cudart.cudaGraphCreate(0) assertSuccess(err) @@ -1293,10 +1363,10 @@ def test_cudart_conditional(): params.conditional.type = cudart.cudaGraphConditionalNodeType.cudaGraphCondTypeIf params.conditional.size = 1 - assert(len(params.conditional.phGraph_out) == 1) - assert(int(params.conditional.phGraph_out[0]) == 0) + assert len(params.conditional.phGraph_out) == 1 + assert int(params.conditional.phGraph_out[0]) == 0 err, node = cudart.cudaGraphAddNode(graph, None, 0, params) assertSuccess(err) - assert(len(params.conditional.phGraph_out) == 1) - assert(int(params.conditional.phGraph_out[0]) != 0) + assert len(params.conditional.phGraph_out) == 1 + assert int(params.conditional.phGraph_out[0]) != 0 diff --git a/cuda_bindings/tests/test_interoperability.py b/cuda_bindings/tests/test_interoperability.py index aab3b9a93..998535992 100644 --- a/cuda_bindings/tests/test_interoperability.py +++ b/cuda_bindings/tests/test_interoperability.py @@ -5,229 +5,240 @@ # this software. 
Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. +import numpy as np import pytest + import cuda.cuda as cuda import cuda.cudart as cudart -import numpy as np + def supportsMemoryPool(): err, isSupported = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0) return err == cudart.cudaError_t.cudaSuccess and isSupported + def test_interop_stream(): - err_dr, = cuda.cuInit(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + (err_dr,) = cuda.cuInit(0) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, ctx = cuda.cuCtxCreate(0, device) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS # DRV to RT err_dr, stream = cuda.cuStreamCreate(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_rt, = cudart.cudaStreamDestroy(stream) - assert(err_rt == cudart.cudaError_t.cudaSuccess) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + (err_rt,) = cudart.cudaStreamDestroy(stream) + assert err_rt == cudart.cudaError_t.cudaSuccess # RT to DRV err_rt, stream = cudart.cudaStreamCreate() - assert(err_rt == cudart.cudaError_t.cudaSuccess) - err_dr, = cuda.cuStreamDestroy(stream) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_rt == cudart.cudaError_t.cudaSuccess + (err_dr,) = cuda.cuStreamDestroy(stream) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + + (err_dr,) = cuda.cuCtxDestroy(ctx) + assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, = cuda.cuCtxDestroy(ctx) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) def test_interop_event(): - err_dr, = cuda.cuInit(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + (err_dr,) = cuda.cuInit(0) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + 
assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, ctx = cuda.cuCtxCreate(0, device) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS # DRV to RT err_dr, event = cuda.cuEventCreate(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_rt, = cudart.cudaEventDestroy(event) - assert(err_rt == cudart.cudaError_t.cudaSuccess) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + (err_rt,) = cudart.cudaEventDestroy(event) + assert err_rt == cudart.cudaError_t.cudaSuccess # RT to DRV err_rt, event = cudart.cudaEventCreate() - assert(err_rt == cudart.cudaError_t.cudaSuccess) - err_dr, = cuda.cuEventDestroy(event) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_rt == cudart.cudaError_t.cudaSuccess + (err_dr,) = cuda.cuEventDestroy(event) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + + (err_dr,) = cuda.cuCtxDestroy(ctx) + assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, = cuda.cuCtxDestroy(ctx) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) def test_interop_graph(): - err_dr, = cuda.cuInit(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + (err_dr,) = cuda.cuInit(0) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, ctx = cuda.cuCtxCreate(0, device) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS # DRV to RT err_dr, graph = cuda.cuGraphCreate(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_rt, = cudart.cudaGraphDestroy(graph) - assert(err_rt == cudart.cudaError_t.cudaSuccess) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + (err_rt,) = cudart.cudaGraphDestroy(graph) + assert err_rt == cudart.cudaError_t.cudaSuccess # RT to DRV err_rt, graph = cudart.cudaGraphCreate(0) - assert(err_rt == cudart.cudaError_t.cudaSuccess) - err_dr, = cuda.cuGraphDestroy(graph) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_rt == 
cudart.cudaError_t.cudaSuccess + (err_dr,) = cuda.cuGraphDestroy(graph) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + + (err_dr,) = cuda.cuCtxDestroy(ctx) + assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, = cuda.cuCtxDestroy(ctx) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) def test_interop_graphNode(): - err_dr, = cuda.cuInit(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + (err_dr,) = cuda.cuInit(0) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, ctx = cuda.cuCtxCreate(0, device) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, graph = cuda.cuGraphCreate(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS # DRV to RT err_dr, node = cuda.cuGraphAddEmptyNode(graph, [], 0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_rt, = cudart.cudaGraphDestroyNode(node) - assert(err_rt == cudart.cudaError_t.cudaSuccess) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + (err_rt,) = cudart.cudaGraphDestroyNode(node) + assert err_rt == cudart.cudaError_t.cudaSuccess # RT to DRV err_rt, node = cudart.cudaGraphAddEmptyNode(graph, [], 0) - assert(err_rt == cudart.cudaError_t.cudaSuccess) - err_dr, = cuda.cuGraphDestroyNode(node) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_rt == cudart.cudaError_t.cudaSuccess + (err_dr,) = cuda.cuGraphDestroyNode(node) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + + (err_rt,) = cudart.cudaGraphDestroy(graph) + assert err_rt == cudart.cudaError_t.cudaSuccess + (err_dr,) = cuda.cuCtxDestroy(ctx) + assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_rt, = cudart.cudaGraphDestroy(graph) - assert(err_rt == cudart.cudaError_t.cudaSuccess) - err_dr, = cuda.cuCtxDestroy(ctx) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) def test_interop_userObject(): - err_dr, = cuda.cuInit(0) - assert(err_dr == 
cuda.CUresult.CUDA_SUCCESS) + (err_dr,) = cuda.cuInit(0) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, ctx = cuda.cuCtxCreate(0, device) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS # cudaUserObject_t # TODO - err_dr, = cuda.cuCtxDestroy(ctx) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + (err_dr,) = cuda.cuCtxDestroy(ctx) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + def test_interop_function(): - err_dr, = cuda.cuInit(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + (err_dr,) = cuda.cuInit(0) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, ctx = cuda.cuCtxCreate(0, device) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS # cudaFunction_t # TODO - err_dr, = cuda.cuCtxDestroy(ctx) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + (err_dr,) = cuda.cuCtxDestroy(ctx) + assert err_dr == cuda.CUresult.CUDA_SUCCESS -@pytest.mark.skipif(not supportsMemoryPool(), reason='Requires mempool operations') + +@pytest.mark.skipif(not supportsMemoryPool(), reason="Requires mempool operations") def test_interop_memPool(): - err_dr, = cuda.cuInit(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + (err_dr,) = cuda.cuInit(0) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, ctx = cuda.cuCtxCreate(0, device) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS # DRV to RT err_dr, pool = cuda.cuDeviceGetDefaultMemPool(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_rt, = cudart.cudaDeviceSetMemPool(0, pool) - assert(err_rt == 
cudart.cudaError_t.cudaSuccess) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + (err_rt,) = cudart.cudaDeviceSetMemPool(0, pool) + assert err_rt == cudart.cudaError_t.cudaSuccess # RT to DRV err_rt, pool = cudart.cudaDeviceGetDefaultMemPool(0) - assert(err_rt == cudart.cudaError_t.cudaSuccess) - err_dr, = cuda.cuDeviceSetMemPool(0, pool) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_rt == cudart.cudaError_t.cudaSuccess + (err_dr,) = cuda.cuDeviceSetMemPool(0, pool) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + + (err_dr,) = cuda.cuCtxDestroy(ctx) + assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_dr, = cuda.cuCtxDestroy(ctx) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) def test_interop_graphExec(): - err_dr, = cuda.cuInit(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + (err_dr,) = cuda.cuInit(0) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, device = cuda.cuDeviceGet(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, ctx = cuda.cuCtxCreate(0, device) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, graph = cuda.cuGraphCreate(0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS err_dr, node = cuda.cuGraphAddEmptyNode(graph, [], 0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS # DRV to RT err_dr, graphExec = cuda.cuGraphInstantiate(graph, 0) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) - err_rt, = cudart.cudaGraphExecDestroy(graphExec) - assert(err_rt == cudart.cudaError_t.cudaSuccess) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + (err_rt,) = cudart.cudaGraphExecDestroy(graphExec) + assert err_rt == cudart.cudaError_t.cudaSuccess # RT to DRV err_rt, graphExec = cudart.cudaGraphInstantiate(graph, 0) - assert(err_rt == cudart.cudaError_t.cudaSuccess) - err_dr, = cuda.cuGraphExecDestroy(graphExec) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + 
assert err_rt == cudart.cudaError_t.cudaSuccess + (err_dr,) = cuda.cuGraphExecDestroy(graphExec) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + + (err_rt,) = cudart.cudaGraphDestroy(graph) + assert err_rt == cudart.cudaError_t.cudaSuccess + (err_dr,) = cuda.cuCtxDestroy(ctx) + assert err_dr == cuda.CUresult.CUDA_SUCCESS - err_rt, = cudart.cudaGraphDestroy(graph) - assert(err_rt == cudart.cudaError_t.cudaSuccess) - err_dr, = cuda.cuCtxDestroy(ctx) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) def test_interop_deviceptr(): # Init CUDA - err, = cuda.cuInit(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS # Get device err, device = cuda.cuDeviceGet(0) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS # Construct context err, ctx = cuda.cuCtxCreate(0, device) - assert(err == cuda.CUresult.CUDA_SUCCESS) + assert err == cuda.CUresult.CUDA_SUCCESS # Allocate dev memory size = 1024 * np.uint8().itemsize err_dr, dptr = cuda.cuMemAlloc(size) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + assert err_dr == cuda.CUresult.CUDA_SUCCESS # Allocate host memory h1 = np.full(size, 1).astype(np.uint8) h2 = np.full(size, 2).astype(np.uint8) - assert(np.array_equal(h1, h2) is False) + assert np.array_equal(h1, h2) is False # Initialize device memory - err_rt, = cudart.cudaMemset(dptr, 1, size) - assert(err_rt == cudart.cudaError_t.cudaSuccess) + (err_rt,) = cudart.cudaMemset(dptr, 1, size) + assert err_rt == cudart.cudaError_t.cudaSuccess # D to h2 - err_rt, = cudart.cudaMemcpy(h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) - assert(err_rt == cudart.cudaError_t.cudaSuccess) + (err_rt,) = cudart.cudaMemcpy(h2, dptr, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) + assert err_rt == cudart.cudaError_t.cudaSuccess # Validate h1 == h2 - assert(np.array_equal(h1, h2)) + assert np.array_equal(h1, h2) # Cleanup - err_dr, = cuda.cuMemFree(dptr) - assert(err_dr == 
cuda.CUresult.CUDA_SUCCESS) - err_dr, = cuda.cuCtxDestroy(ctx) - assert(err_dr == cuda.CUresult.CUDA_SUCCESS) + (err_dr,) = cuda.cuMemFree(dptr) + assert err_dr == cuda.CUresult.CUDA_SUCCESS + (err_dr,) = cuda.cuCtxDestroy(ctx) + assert err_dr == cuda.CUresult.CUDA_SUCCESS diff --git a/cuda_bindings/tests/test_kernelParams.py b/cuda_bindings/tests/test_kernelParams.py index 5bf745ef2..c45bf46ab 100644 --- a/cuda_bindings/tests/test_kernelParams.py +++ b/cuda_bindings/tests/test_kernelParams.py @@ -5,23 +5,26 @@ # this software. Any use, reproduction, disclosure, or distribution of # this software and related documentation outside the terms of the EULA # is strictly prohibited. -import pytest -from cuda import cuda, cudart, nvrtc -import numpy as np import ctypes +import numpy as np + +from cuda import cuda, cudart, nvrtc + + def ASSERT_DRV(err): if isinstance(err, cuda.CUresult): if err != cuda.CUresult.CUDA_SUCCESS: - raise RuntimeError('Cuda Error: {}'.format(err)) + raise RuntimeError(f"Cuda Error: {err}") elif isinstance(err, cudart.cudaError_t): if err != cudart.cudaError_t.cudaSuccess: - raise RuntimeError('Cudart Error: {}'.format(err)) + raise RuntimeError(f"Cudart Error: {err}") elif isinstance(err, nvrtc.nvrtcResult): if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: - raise RuntimeError('Nvrtc Error: {}'.format(err)) + raise RuntimeError(f"Nvrtc Error: {err}") else: - raise RuntimeError('Unknown error type: {}'.format(err)) + raise RuntimeError(f"Unknown error type: {err}") + def common_nvrtc(allKernelStrings, dev): err, major = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev) @@ -30,19 +33,19 @@ def common_nvrtc(allKernelStrings, dev): ASSERT_DRV(err) err, _, nvrtc_minor = nvrtc.nvrtcVersion() ASSERT_DRV(err) - use_cubin = (nvrtc_minor >= 1) - prefix = 'sm' if use_cubin else 'compute' - arch_arg = bytes(f'--gpu-architecture={prefix}_{major}{minor}', 'ascii') + use_cubin = nvrtc_minor >= 1 + prefix = "sm" if 
use_cubin else "compute" + arch_arg = bytes(f"--gpu-architecture={prefix}_{major}{minor}", "ascii") - err, prog = nvrtc.nvrtcCreateProgram(str.encode(allKernelStrings), b'allKernelStrings.cu', 0, None, None) + err, prog = nvrtc.nvrtcCreateProgram(str.encode(allKernelStrings), b"allKernelStrings.cu", 0, None, None) ASSERT_DRV(err) - opts = (b'--fmad=false', arch_arg) - err, = nvrtc.nvrtcCompileProgram(prog, len(opts), opts) + opts = (b"--fmad=false", arch_arg) + (err,) = nvrtc.nvrtcCompileProgram(prog, len(opts), opts) err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog) ASSERT_DRV(err_log) - log = b' ' * logSize - err_log, = nvrtc.nvrtcGetProgramLog(prog, log) + log = b" " * logSize + (err_log,) = nvrtc.nvrtcGetProgramLog(prog, log) ASSERT_DRV(err_log) result = log.decode() if len(result) > 1: @@ -52,14 +55,14 @@ def common_nvrtc(allKernelStrings, dev): if use_cubin: err, dataSize = nvrtc.nvrtcGetCUBINSize(prog) ASSERT_DRV(err) - data = b' ' * dataSize - err, = nvrtc.nvrtcGetCUBIN(prog, data) + data = b" " * dataSize + (err,) = nvrtc.nvrtcGetCUBIN(prog, data) ASSERT_DRV(err) else: err, dataSize = nvrtc.nvrtcGetPTXSize(prog) ASSERT_DRV(err) - data = b' ' * dataSize - err, = nvrtc.nvrtcGetPTX(prog, data) + data = b" " * dataSize + (err,) = nvrtc.nvrtcGetPTX(prog, data) ASSERT_DRV(err) err, module = cuda.cuModuleLoadData(np.char.array(data)) @@ -67,15 +70,16 @@ def common_nvrtc(allKernelStrings, dev): return module + def test_kernelParams_empty(): - err, = cuda.cuInit(0) + (err,) = cuda.cuInit(0) ASSERT_DRV(err) err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) err, context = cuda.cuCtxCreate(0, cuDevice) ASSERT_DRV(err) - kernelString = '''\ + kernelString = """\ static __device__ bool isDone; extern "C" __global__ void empty_kernel() @@ -83,50 +87,67 @@ def test_kernelParams_empty(): isDone = true; if (isDone) return; } - ''' + """ module = common_nvrtc(kernelString, cuDevice) # cudaStructs kernel - err, kernel = cuda.cuModuleGetFunction(module, b'empty_kernel') 
+ err, kernel = cuda.cuModuleGetFunction(module, b"empty_kernel") ASSERT_DRV(err) err, stream = cuda.cuStreamCreate(0) ASSERT_DRV(err) - err, = cuda.cuLaunchKernel(kernel, - 1, 1, 1, # grid dim - 1, 1, 1, # block dim - 0, stream, # shared mem and stream - ((), ()), 0) # arguments - ASSERT_DRV(err) - err, = cuda.cuLaunchKernel(kernel, - 1, 1, 1, # grid dim - 1, 1, 1, # block dim - 0, stream, # shared mem and stream - None, 0) # arguments + (err,) = cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + ((), ()), + 0, + ) # arguments + ASSERT_DRV(err) + (err,) = cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + None, + 0, + ) # arguments ASSERT_DRV(err) # Retrieve global and validate isDone_host = ctypes.c_bool() - err, isDonePtr_device, isDonePtr_device_size = cuda.cuModuleGetGlobal(module, b'isDone') + err, isDonePtr_device, isDonePtr_device_size = cuda.cuModuleGetGlobal(module, b"isDone") ASSERT_DRV(err) - assert(isDonePtr_device_size == ctypes.sizeof(ctypes.c_bool)) - err, = cuda.cuMemcpyDtoHAsync(isDone_host, isDonePtr_device, ctypes.sizeof(ctypes.c_bool), stream) + assert isDonePtr_device_size == ctypes.sizeof(ctypes.c_bool) + (err,) = cuda.cuMemcpyDtoHAsync(isDone_host, isDonePtr_device, ctypes.sizeof(ctypes.c_bool), stream) ASSERT_DRV(err) - err, = cuda.cuStreamSynchronize(stream) + (err,) = cuda.cuStreamSynchronize(stream) ASSERT_DRV(err) - assert(isDone_host.value == True) + assert isDone_host.value is True - err, = cuda.cuStreamDestroy(stream) + (err,) = cuda.cuStreamDestroy(stream) ASSERT_DRV(err) - err, = cuda.cuModuleUnload(module) + (err,) = cuda.cuModuleUnload(module) ASSERT_DRV(err) - err, = cuda.cuCtxDestroy(context) + (err,) = cuda.cuCtxDestroy(context) ASSERT_DRV(err) + def kernelParams_basic(use_ctypes_as_values): - err, = cuda.cuInit(0) + (err,) = cuda.cuInit(0) ASSERT_DRV(err) err, cuDevice = 
cuda.cuDeviceGet(0) ASSERT_DRV(err) @@ -134,39 +155,66 @@ def kernelParams_basic(use_ctypes_as_values): ASSERT_DRV(err) if use_ctypes_as_values: - assertValues_host = (ctypes.c_bool(True), - ctypes.c_char(b'Z'), ctypes.c_wchar('Ä€'), - ctypes.c_byte(-127), ctypes.c_ubyte(255), - ctypes.c_short(1), ctypes.c_ushort(1), - ctypes.c_int(2), ctypes.c_uint(2), - ctypes.c_long(3), ctypes.c_ulong(3), - ctypes.c_longlong(4), ctypes.c_ulonglong(4), - ctypes.c_size_t(5), - ctypes.c_float(float(123.456)), ctypes.c_float(float(123.456)), - ctypes.c_void_p(0xdeadbeef)) + assertValues_host = ( + ctypes.c_bool(True), + ctypes.c_char(b"Z"), + ctypes.c_wchar("Ä€"), + ctypes.c_byte(-127), + ctypes.c_ubyte(255), + ctypes.c_short(1), + ctypes.c_ushort(1), + ctypes.c_int(2), + ctypes.c_uint(2), + ctypes.c_long(3), + ctypes.c_ulong(3), + ctypes.c_longlong(4), + ctypes.c_ulonglong(4), + ctypes.c_size_t(5), + ctypes.c_float(123.456), + ctypes.c_float(123.456), + ctypes.c_void_p(0xDEADBEEF), + ) else: - assertValues_host = (True, - b'Z', 'Ä€', - -127, 255, - 90, 72, - 85, 82, - 66, 65, - 86, 90, - 33, - float(123.456), float(123.456), - 0xdeadbeef) - assertTypes_host = (ctypes.c_bool, - ctypes.c_char, ctypes.c_wchar, - ctypes.c_byte, ctypes.c_ubyte, - ctypes.c_short, ctypes.c_ushort, - ctypes.c_int, ctypes.c_uint, - ctypes.c_long, ctypes.c_ulong, - ctypes.c_longlong, ctypes.c_ulonglong, - ctypes.c_size_t, - ctypes.c_float, ctypes.c_double, - ctypes.c_void_p) - - basicKernelString = '''\ + assertValues_host = ( + True, + b"Z", + "Ä€", + -127, + 255, + 90, + 72, + 85, + 82, + 66, + 65, + 86, + 90, + 33, + 123.456, + 123.456, + 0xDEADBEEF, + ) + assertTypes_host = ( + ctypes.c_bool, + ctypes.c_char, + ctypes.c_wchar, + ctypes.c_byte, + ctypes.c_ubyte, + ctypes.c_short, + ctypes.c_ushort, + ctypes.c_int, + ctypes.c_uint, + ctypes.c_long, + ctypes.c_ulong, + ctypes.c_longlong, + ctypes.c_ulonglong, + ctypes.c_size_t, + ctypes.c_float, + ctypes.c_double, + ctypes.c_void_p, + ) + + 
basicKernelString = """\ extern "C" __global__ void basic(bool b, char c, wchar_t wc, @@ -222,25 +270,25 @@ def kernelParams_basic(use_ctypes_as_values): *pf = f; *pd = d; } - ''' + """ idx = 0 - while '{}' in basicKernelString: + while "{}" in basicKernelString: val = assertValues_host[idx].value if use_ctypes_as_values else assertValues_host[idx] if assertTypes_host[idx] == ctypes.c_float: - basicKernelString = basicKernelString.replace('{}', str(float(val)) + 'f', 1) + basicKernelString = basicKernelString.replace("{}", str(float(val)) + "f", 1) elif assertTypes_host[idx] == ctypes.c_double: - basicKernelString = basicKernelString.replace('{}', str(float(val)), 1) + basicKernelString = basicKernelString.replace("{}", str(float(val)), 1) elif assertTypes_host[idx] == ctypes.c_char: - basicKernelString = basicKernelString.replace('{}', str(val)[1:], 1) + basicKernelString = basicKernelString.replace("{}", str(val)[1:], 1) elif assertTypes_host[idx] == ctypes.c_wchar: - basicKernelString = basicKernelString.replace('{}', str(ord(val)), 1) + basicKernelString = basicKernelString.replace("{}", str(ord(val)), 1) else: - basicKernelString = basicKernelString.replace('{}', str(int(val)), 1) + basicKernelString = basicKernelString.replace("{}", str(int(val)), 1) idx += 1 module = common_nvrtc(basicKernelString, cuDevice) - err, kernel = cuda.cuModuleGetFunction(module, b'basic') + err, kernel = cuda.cuModuleGetFunction(module, b"basic") ASSERT_DRV(err) err, stream = cuda.cuStreamCreate(0) @@ -280,108 +328,124 @@ def kernelParams_basic(use_ctypes_as_values): err, pd = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_double)) ASSERT_DRV(err) - assertValues_device = (pb, - pc, pwc, - pbyte, pubyte, - ps, pus, - pi, pui, - pl, pul, - pll, pull, - psize, - pf, pd) - assertTypes_device = (None, - None, None, - None, None, - None, None, - None, None, - None, None, - None, None, - None, - None, None) + assertValues_device = (pb, pc, pwc, pbyte, pubyte, ps, pus, pi, pui, pl, pul, pll, 
pull, psize, pf, pd) + assertTypes_device = ( + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) basicKernelValues = assertValues_host + assertValues_device basicKernelTypes = assertTypes_host + assertTypes_device - err, = cuda.cuLaunchKernel(kernel, - 1, 1, 1, # grid dim - 1, 1, 1, # block dim - 0, stream, # shared mem and stream - (basicKernelValues, basicKernelTypes), 0) # arguments + (err,) = cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + (basicKernelValues, basicKernelTypes), + 0, + ) # arguments ASSERT_DRV(err) # Retrieve each dptr host_params = tuple([valueType() for valueType in assertTypes_host[:-1]]) for i in range(len(host_params)): - err, = cuda.cuMemcpyDtoHAsync(host_params[i], assertValues_device[i], ctypes.sizeof(assertTypes_host[i]), stream) + (err,) = cuda.cuMemcpyDtoHAsync( + host_params[i], assertValues_device[i], ctypes.sizeof(assertTypes_host[i]), stream + ) ASSERT_DRV(err) # Validate retrieved values - err, = cuda.cuStreamSynchronize(stream) + (err,) = cuda.cuStreamSynchronize(stream) ASSERT_DRV(err) for i in range(len(host_params)): val = basicKernelValues[i].value if use_ctypes_as_values else basicKernelValues[i] if basicKernelTypes[i] == ctypes.c_float: if use_ctypes_as_values: - assert(val == host_params[i].value) + assert val == host_params[i].value else: - assert(val == (int(host_params[i].value * 1000) / 1000)) + assert val == (int(host_params[i].value * 1000) / 1000) else: - assert(val == host_params[i].value) + assert val == host_params[i].value - err, = cuda.cuMemFree(pb) + (err,) = cuda.cuMemFree(pb) ASSERT_DRV(err) - err, = cuda.cuMemFree(pc) + (err,) = cuda.cuMemFree(pc) ASSERT_DRV(err) - err, = cuda.cuMemFree(pwc) + (err,) = cuda.cuMemFree(pwc) ASSERT_DRV(err) - err, = cuda.cuMemFree(pbyte) + (err,) = cuda.cuMemFree(pbyte) ASSERT_DRV(err) - err, = cuda.cuMemFree(pubyte) + 
(err,) = cuda.cuMemFree(pubyte) ASSERT_DRV(err) - err, = cuda.cuMemFree(ps) + (err,) = cuda.cuMemFree(ps) ASSERT_DRV(err) - err, = cuda.cuMemFree(pus) + (err,) = cuda.cuMemFree(pus) ASSERT_DRV(err) - err, = cuda.cuMemFree(pi) + (err,) = cuda.cuMemFree(pi) ASSERT_DRV(err) - err, = cuda.cuMemFree(pui) + (err,) = cuda.cuMemFree(pui) ASSERT_DRV(err) - err, = cuda.cuMemFree(pl) + (err,) = cuda.cuMemFree(pl) ASSERT_DRV(err) - err, = cuda.cuMemFree(pul) + (err,) = cuda.cuMemFree(pul) ASSERT_DRV(err) - err, = cuda.cuMemFree(pll) + (err,) = cuda.cuMemFree(pll) ASSERT_DRV(err) - err, = cuda.cuMemFree(pull) + (err,) = cuda.cuMemFree(pull) ASSERT_DRV(err) - err, = cuda.cuMemFree(psize) + (err,) = cuda.cuMemFree(psize) ASSERT_DRV(err) - err, = cuda.cuMemFree(pf) + (err,) = cuda.cuMemFree(pf) ASSERT_DRV(err) - err, = cuda.cuMemFree(pd) + (err,) = cuda.cuMemFree(pd) ASSERT_DRV(err) - err, = cuda.cuStreamDestroy(stream) + (err,) = cuda.cuStreamDestroy(stream) ASSERT_DRV(err) - err, = cuda.cuModuleUnload(module) + (err,) = cuda.cuModuleUnload(module) ASSERT_DRV(err) - err, = cuda.cuCtxDestroy(context) + (err,) = cuda.cuCtxDestroy(context) ASSERT_DRV(err) + def test_kernelParams_basic(): # Kernel is given basic Python primative values as value input - kernelParams_basic(use_ctypes_as_values = False) + kernelParams_basic(use_ctypes_as_values=False) + def test_kernelParams_basic_ctypes(): # Kernel is given basic c_type instances as primative value input - kernelParams_basic(use_ctypes_as_values = True) + kernelParams_basic(use_ctypes_as_values=True) + def test_kernelParams_types_cuda(): - err, = cuda.cuInit(0) + (err,) = cuda.cuInit(0) ASSERT_DRV(err) err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) err, context = cuda.cuCtxCreate(0, cuDevice) ASSERT_DRV(err) - err, uvaSupported = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice) + err, uvaSupported = cuda.cuDeviceGetAttribute( + 
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice + ) ASSERT_DRV(err) err, perr = cudart.cudaMalloc(ctypes.sizeof(ctypes.c_int)) @@ -393,25 +457,33 @@ def test_kernelParams_types_cuda(): # Get device pointer if UVM is not enabled if uvaSupported: - kernelValues = (cudart.cudaError_t.cudaErrorUnknown, perr, # enums - cudart.cudaSurfaceObject_t(248), cudart.cudaSurfaceObject_t(_ptr=pSurface_host), # typedef of primative - cudart.dim3(), cudart.dim3(_ptr=pDim3_host)) # struct + kernelValues = ( + cudart.cudaError_t.cudaErrorUnknown, + perr, # enums + cudart.cudaSurfaceObject_t(248), + cudart.cudaSurfaceObject_t(_ptr=pSurface_host), # typedef of primative + cudart.dim3(), + cudart.dim3(_ptr=pDim3_host), + ) # struct else: err, pSurface_device = cudart.cudaHostGetDevicePointer(pSurface_host, 0) ASSERT_DRV(err) err, pDim3_device = cudart.cudaHostGetDevicePointer(pDim3_host, 0) ASSERT_DRV(err) - kernelValues = (cudart.cudaError_t.cudaErrorUnknown, perr, # enums - cudart.cudaSurfaceObject_t(248), cudart.cudaSurfaceObject_t(_ptr=pSurface_device), # typedef of primative - cudart.dim3(), cudart.dim3(_ptr=pDim3_device)) # struct - kernelTypes = (None, ctypes.c_void_p, - None, ctypes.c_void_p, - None, ctypes.c_void_p) + kernelValues = ( + cudart.cudaError_t.cudaErrorUnknown, + perr, # enums + cudart.cudaSurfaceObject_t(248), + cudart.cudaSurfaceObject_t(_ptr=pSurface_device), # typedef of primative + cudart.dim3(), + cudart.dim3(_ptr=pDim3_device), + ) # struct + kernelTypes = (None, ctypes.c_void_p, None, ctypes.c_void_p, None, ctypes.c_void_p) kernelValues[4].x = 1 kernelValues[4].y = 2 kernelValues[4].z = 3 - kernelString = '''\ + kernelString = """\ extern "C" __global__ void structsCuda(cudaError_t err, cudaError_t *perr, cudaSurfaceObject_t surface, cudaSurfaceObject_t *pSurface, @@ -423,73 +495,90 @@ def test_kernelParams_types_cuda(): pdim->y = dim.y; pdim->z = dim.z; } - ''' + """ module = common_nvrtc(kernelString, cuDevice) # cudaStructs kernel 
- err, kernel = cuda.cuModuleGetFunction(module, b'structsCuda') + err, kernel = cuda.cuModuleGetFunction(module, b"structsCuda") ASSERT_DRV(err) err, stream = cuda.cuStreamCreate(0) ASSERT_DRV(err) - err, = cuda.cuLaunchKernel(kernel, - 1, 1, 1, # grid dim - 1, 1, 1, # block dim - 0, stream, # shared mem and stream - (kernelValues, kernelTypes), 0) # arguments + (err,) = cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + (kernelValues, kernelTypes), + 0, + ) # arguments ASSERT_DRV(err) # Retrieve each dptr host_err = ctypes.c_int() - err, = cudart.cudaMemcpyAsync(ctypes.addressof(host_err), perr, ctypes.sizeof(ctypes.c_int()), cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream) + (err,) = cudart.cudaMemcpyAsync( + ctypes.addressof(host_err), + perr, + ctypes.sizeof(ctypes.c_int()), + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, + stream, + ) ASSERT_DRV(err) # Validate kernel values - err, = cuda.cuStreamSynchronize(stream) + (err,) = cuda.cuStreamSynchronize(stream) ASSERT_DRV(err) cuda_err = cudart.cudaError_t(host_err.value) if uvaSupported: - assert(kernelValues[0] == cuda_err) - assert(int(kernelValues[2]) == int(kernelValues[3])) - assert(kernelValues[4].x == kernelValues[5].x) - assert(kernelValues[4].y == kernelValues[5].y) - assert(kernelValues[4].z == kernelValues[5].z) + assert kernelValues[0] == cuda_err + assert int(kernelValues[2]) == int(kernelValues[3]) + assert kernelValues[4].x == kernelValues[5].x + assert kernelValues[4].y == kernelValues[5].y + assert kernelValues[4].z == kernelValues[5].z else: surface_host = cudart.cudaSurfaceObject_t(_ptr=pSurface_host) dim3_host = cudart.dim3(_ptr=pDim3_host) - assert(kernelValues[0] == cuda_err) - assert(int(kernelValues[2]) == int(surface_host)) - assert(kernelValues[4].x == dim3_host.x) - assert(kernelValues[4].y == dim3_host.y) - assert(kernelValues[4].z == dim3_host.z) + assert kernelValues[0] == cuda_err + assert 
int(kernelValues[2]) == int(surface_host) + assert kernelValues[4].x == dim3_host.x + assert kernelValues[4].y == dim3_host.y + assert kernelValues[4].z == dim3_host.z - err, = cudart.cudaFree(perr) + (err,) = cudart.cudaFree(perr) ASSERT_DRV(err) - err, = cudart.cudaFreeHost(pSurface_host) + (err,) = cudart.cudaFreeHost(pSurface_host) ASSERT_DRV(err) - err, = cudart.cudaFreeHost(pDim3_host) + (err,) = cudart.cudaFreeHost(pDim3_host) ASSERT_DRV(err) - err, = cuda.cuStreamDestroy(stream) + (err,) = cuda.cuStreamDestroy(stream) ASSERT_DRV(err) - err, = cuda.cuModuleUnload(module) + (err,) = cuda.cuModuleUnload(module) ASSERT_DRV(err) - err, = cuda.cuCtxDestroy(context) + (err,) = cuda.cuCtxDestroy(context) ASSERT_DRV(err) + def test_kernelParams_struct_custom(): - err, = cuda.cuInit(0) + (err,) = cuda.cuInit(0) ASSERT_DRV(err) err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) err, context = cuda.cuCtxCreate(0, cuDevice) ASSERT_DRV(err) - err, uvaSupported = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice) + err, uvaSupported = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice + ) ASSERT_DRV(err) - kernelString = '''\ + kernelString = """\ struct testStruct { int value; }; @@ -499,11 +588,11 @@ def test_kernelParams_struct_custom(): { dst->value = src.value; } - ''' + """ module = common_nvrtc(kernelString, cuDevice) - err, kernel = cuda.cuModuleGetFunction(module, b'structCustom') + err, kernel = cuda.cuModuleGetFunction(module, b"structCustom") ASSERT_DRV(err) err, stream = cuda.cuStreamCreate(0) @@ -511,7 +600,7 @@ def test_kernelParams_struct_custom(): # structCustom kernel class testStruct(ctypes.Structure): - _fields_ = [('value',ctypes.c_int)] + _fields_ = [("value", ctypes.c_int)] err, pStruct_host = cudart.cudaHostAlloc(ctypes.sizeof(testStruct), cudart.cudaHostAllocMapped) ASSERT_DRV(err) @@ -525,39 +614,50 @@ class testStruct(ctypes.Structure): 
kernelValues = (testStruct(5), pStruct_device) kernelTypes = (None, ctypes.c_void_p) - err, = cuda.cuLaunchKernel(kernel, - 1, 1, 1, # grid dim - 1, 1, 1, # block dim - 0, stream, # shared mem and stream - (kernelValues, kernelTypes), 0) # arguments + (err,) = cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + (kernelValues, kernelTypes), + 0, + ) # arguments ASSERT_DRV(err) # Validate kernel values - err, = cuda.cuStreamSynchronize(stream) + (err,) = cuda.cuStreamSynchronize(stream) ASSERT_DRV(err) struct_shared = testStruct.from_address(pStruct_host) - assert(kernelValues[0].value == struct_shared.value) + assert kernelValues[0].value == struct_shared.value - err, = cudart.cudaFreeHost(pStruct_host) + (err,) = cudart.cudaFreeHost(pStruct_host) ASSERT_DRV(err) - err, = cuda.cuStreamDestroy(stream) + (err,) = cuda.cuStreamDestroy(stream) ASSERT_DRV(err) - err, = cuda.cuModuleUnload(module) + (err,) = cuda.cuModuleUnload(module) ASSERT_DRV(err) - err, = cuda.cuCtxDestroy(context) + (err,) = cuda.cuCtxDestroy(context) ASSERT_DRV(err) + def kernelParams_buffer_protocol_ctypes_common(pass_by_address): - err, = cuda.cuInit(0) + (err,) = cuda.cuInit(0) ASSERT_DRV(err) err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) err, context = cuda.cuCtxCreate(0, cuDevice) ASSERT_DRV(err) - err, uvaSupported = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice) + err, uvaSupported = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice + ) ASSERT_DRV(err) - kernelString = '''\ + kernelString = """\ struct testStruct { int value; }; @@ -570,11 +670,11 @@ def kernelParams_buffer_protocol_ctypes_common(pass_by_address): *pf = f; ps->value = s.value; } - ''' + """ module = common_nvrtc(kernelString, cuDevice) - err, kernel = cuda.cuModuleGetFunction(module, b'testkernel') + err, kernel = 
cuda.cuModuleGetFunction(module, b"testkernel") ASSERT_DRV(err) err, stream = cuda.cuStreamCreate(0) @@ -582,7 +682,7 @@ def kernelParams_buffer_protocol_ctypes_common(pass_by_address): # testkernel kernel class testStruct(ctypes.Structure): - _fields_ = [('value',ctypes.c_int)] + _fields_ = [("value", ctypes.c_int)] err, pInt_host = cudart.cudaHostAlloc(ctypes.sizeof(ctypes.c_int), cudart.cudaHostAllocMapped) ASSERT_DRV(err) @@ -593,9 +693,14 @@ class testStruct(ctypes.Structure): # Get device pointer if UVM is not enabled if uvaSupported: - kernelValues = (ctypes.c_int(1), ctypes.c_void_p(pInt_host), - ctypes.c_float(float(123.456)), ctypes.c_void_p(pFloat_host), - testStruct(5), ctypes.c_void_p(pStruct_host)) + kernelValues = ( + ctypes.c_int(1), + ctypes.c_void_p(pInt_host), + ctypes.c_float(123.456), + ctypes.c_void_p(pFloat_host), + testStruct(5), + ctypes.c_void_p(pStruct_host), + ) else: err, pInt_device = cudart.cudaHostGetDevicePointer(pInt_host, 0) ASSERT_DRV(err) @@ -603,51 +708,68 @@ class testStruct(ctypes.Structure): ASSERT_DRV(err) err, pStruct_device = cudart.cudaHostGetDevicePointer(pStruct_host, 0) ASSERT_DRV(err) - kernelValues = (ctypes.c_int(1), ctypes.c_void_p(pInt_device), - ctypes.c_float(float(123.456)), ctypes.c_void_p(pFloat_device), - testStruct(5), ctypes.c_void_p(pStruct_device)) - - packagedParams = (ctypes.c_void_p*len(kernelValues))() + kernelValues = ( + ctypes.c_int(1), + ctypes.c_void_p(pInt_device), + ctypes.c_float(123.456), + ctypes.c_void_p(pFloat_device), + testStruct(5), + ctypes.c_void_p(pStruct_device), + ) + + packagedParams = (ctypes.c_void_p * len(kernelValues))() for idx in range(len(packagedParams)): packagedParams[idx] = ctypes.addressof(kernelValues[idx]) - err, = cuda.cuLaunchKernel(kernel, - 1, 1, 1, # grid dim - 1, 1, 1, # block dim - 0, stream, # shared mem and stream - ctypes.addressof(packagedParams) if pass_by_address else packagedParams, 0) # arguments + (err,) = cuda.cuLaunchKernel( + kernel, + 1, + 1, + 
1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + ctypes.addressof(packagedParams) if pass_by_address else packagedParams, + 0, + ) # arguments ASSERT_DRV(err) # Validate kernel values - err, = cuda.cuStreamSynchronize(stream) + (err,) = cuda.cuStreamSynchronize(stream) ASSERT_DRV(err) - assert(kernelValues[0].value == ctypes.c_int.from_address(pInt_host).value) - assert(kernelValues[2].value == ctypes.c_float.from_address(pFloat_host).value) - assert(kernelValues[4].value == testStruct.from_address(pStruct_host).value) + assert kernelValues[0].value == ctypes.c_int.from_address(pInt_host).value + assert kernelValues[2].value == ctypes.c_float.from_address(pFloat_host).value + assert kernelValues[4].value == testStruct.from_address(pStruct_host).value - err, = cudart.cudaFreeHost(pStruct_host) + (err,) = cudart.cudaFreeHost(pStruct_host) ASSERT_DRV(err) - err, = cuda.cuStreamDestroy(stream) + (err,) = cuda.cuStreamDestroy(stream) ASSERT_DRV(err) - err, = cuda.cuModuleUnload(module) + (err,) = cuda.cuModuleUnload(module) ASSERT_DRV(err) - err, = cuda.cuCtxDestroy(context) + (err,) = cuda.cuCtxDestroy(context) ASSERT_DRV(err) + def test_kernelParams_buffer_protocol_ctypes(): kernelParams_buffer_protocol_ctypes_common(pass_by_address=True) kernelParams_buffer_protocol_ctypes_common(pass_by_address=False) + def test_kernelParams_buffer_protocol_numpy(): - err, = cuda.cuInit(0) + (err,) = cuda.cuInit(0) ASSERT_DRV(err) err, cuDevice = cuda.cuDeviceGet(0) ASSERT_DRV(err) err, context = cuda.cuCtxCreate(0, cuDevice) ASSERT_DRV(err) - err, uvaSupported = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice) + err, uvaSupported = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cuDevice + ) ASSERT_DRV(err) - kernelString = '''\ + kernelString = """\ struct testStruct { int value; }; @@ -660,18 +782,18 @@ def test_kernelParams_buffer_protocol_numpy(): *pf 
= f; ps->value = s.value; } - ''' + """ module = common_nvrtc(kernelString, cuDevice) - err, kernel = cuda.cuModuleGetFunction(module, b'testkernel') + err, kernel = cuda.cuModuleGetFunction(module, b"testkernel") ASSERT_DRV(err) err, stream = cuda.cuStreamCreate(0) ASSERT_DRV(err) # testkernel kernel - testStruct = np.dtype([('value', np.int32)]) + testStruct = np.dtype([("value", np.int32)]) err, pInt_host = cudart.cudaHostAlloc(np.dtype(np.int32).itemsize, cudart.cudaHostAllocMapped) ASSERT_DRV(err) @@ -682,9 +804,14 @@ def test_kernelParams_buffer_protocol_numpy(): # Get device pointer if UVM is not enabled if uvaSupported: - kernelValues = (np.array(1, dtype=np.uint32), np.array([pInt_host], dtype=np.uint64), - np.array(float(123.456), dtype=np.float32), np.array([pFloat_host], dtype=np.uint64), - np.array([5], testStruct), np.array([pStruct_host], dtype=np.uint64)) + kernelValues = ( + np.array(1, dtype=np.uint32), + np.array([pInt_host], dtype=np.uint64), + np.array(123.456, dtype=np.float32), + np.array([pFloat_host], dtype=np.uint64), + np.array([5], testStruct), + np.array([pStruct_host], dtype=np.uint64), + ) else: err, pInt_device = cudart.cudaHostGetDevicePointer(pInt_host, 0) ASSERT_DRV(err) @@ -692,37 +819,48 @@ def test_kernelParams_buffer_protocol_numpy(): ASSERT_DRV(err) err, pStruct_device = cudart.cudaHostGetDevicePointer(pStruct_host, 0) ASSERT_DRV(err) - kernelValues = (np.array(1, dtype=np.int32), np.array([pInt_device], dtype=np.uint64), - np.array(float(123.456), dtype=np.float32), np.array([pFloat_device], dtype=np.uint64), - np.array([5], testStruct), np.array([pStruct_device], dtype=np.uint64)) + kernelValues = ( + np.array(1, dtype=np.int32), + np.array([pInt_device], dtype=np.uint64), + np.array(123.456, dtype=np.float32), + np.array([pFloat_device], dtype=np.uint64), + np.array([5], testStruct), + np.array([pStruct_device], dtype=np.uint64), + ) packagedParams = np.array([arg.ctypes.data for arg in kernelValues], dtype=np.uint64) - 
err, = cuda.cuLaunchKernel(kernel, - 1, 1, 1, # grid dim - 1, 1, 1, # block dim - 0, stream, # shared mem and stream - packagedParams, 0) # arguments + (err,) = cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + packagedParams, + 0, + ) # arguments ASSERT_DRV(err) # Validate kernel values - err, = cuda.cuStreamSynchronize(stream) + (err,) = cuda.cuStreamSynchronize(stream) ASSERT_DRV(err) - class numpy_address_wrapper(): + class numpy_address_wrapper: def __init__(self, address, typestr): - self.__array_interface__ = {'data': (address, False), - 'typestr': typestr, - 'shape': (1,)} + self.__array_interface__ = {"data": (address, False), "typestr": typestr, "shape": (1,)} - assert(kernelValues[0] == np.array(numpy_address_wrapper(pInt_host, ' int: @@ -115,7 +111,7 @@ def name(self) -> str: """Return the device name.""" # Use 256 characters to be consistent with CUDA Runtime name = handle_return(cuda.cuDeviceGetName(256, self._id)) - name = name.split(b'\0')[0] + name = name.split(b"\0")[0] return name.decode() @property @@ -127,10 +123,12 @@ def properties(self) -> dict: @property def compute_capability(self) -> ComputeCapability: """Return a named tuple with 2 fields: major and minor.""" - major = handle_return(cudart.cudaDeviceGetAttribute( - cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)) - minor = handle_return(cudart.cudaDeviceGetAttribute( - cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)) + major = handle_return( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id) + ) + minor = handle_return( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id) + ) return ComputeCapability(major, minor) @property @@ -178,7 +176,7 @@ def __int__(self): def __repr__(self): return f"" - def set_current(self, ctx: Context=None) -> Union[Context, None]: + def set_current(self, 
ctx: Context = None) -> Union[Context, None]: """Set device to be used for GPU executions. Initializes CUDA and sets the calling thread to a valid CUDA @@ -212,8 +210,10 @@ def set_current(self, ctx: Context=None) -> Union[Context, None]: if not isinstance(ctx, Context): raise TypeError("a Context object is required") if ctx._id != self._id: - raise RuntimeError("the provided context was created on a different " - f"device {ctx._id} other than the target {self._id}") + raise RuntimeError( + "the provided context was created on a different " + f"device {ctx._id} other than the target {self._id}" + ) prev_ctx = handle_return(cuda.cuCtxPopCurrent()) handle_return(cuda.cuCtxPushCurrent(ctx._handle)) self._has_inited = True @@ -257,7 +257,7 @@ def create_context(self, options: ContextOptions = None) -> Context: raise NotImplementedError("TODO") @precondition(_check_context_initialized) - def create_stream(self, obj=None, options: StreamOptions=None) -> Stream: + def create_stream(self, obj=None, options: StreamOptions = None) -> Stream: """Create a Stream object. New stream objects can be created in two different ways: diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py index a6d5da281..af4768953 100644 --- a/cuda_core/cuda/core/experimental/_event.py +++ b/cuda_core/cuda/core/experimental/_event.py @@ -6,9 +6,7 @@ from typing import Optional from cuda import cuda -from cuda.core.experimental._utils import check_or_create_options -from cuda.core.experimental._utils import CUDAError -from cuda.core.experimental._utils import handle_return +from cuda.core.experimental._utils import CUDAError, check_or_create_options, handle_return @dataclass @@ -30,6 +28,7 @@ class EventOptions: Note that enable_timing must be False. 
(Default to False) """ + enable_timing: Optional[bool] = False busy_waited_sync: Optional[bool] = False support_ipc: Optional[bool] = False @@ -50,16 +49,17 @@ class Event: and they should instead be created through a :obj:`Stream` object. """ + __slots__ = ("_handle", "_timing_disabled", "_busy_waited") def __init__(self): self._handle = None raise NotImplementedError( - "directly creating an Event object can be ambiguous. Please call " - "call Stream.record().") + "directly creating an Event object can be ambiguous. Please call call Stream.record()." + ) @staticmethod - def _init(options: Optional[EventOptions]=None): + def _init(options: Optional[EventOptions] = None): self = Event.__new__(Event) # minimal requirements for the destructor self._handle = None @@ -119,7 +119,7 @@ def sync(self): @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" - result, = cuda.cuEventQuery(self._handle) + (result,) = cuda.cuEventQuery(self._handle) if result == cuda.CUresult.CUDA_SUCCESS: return True elif result == cuda.CUresult.CUDA_ERROR_NOT_READY: diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py index 9991638f3..77af6b7e1 100644 --- a/cuda_core/cuda/core/experimental/_launcher.py +++ b/cuda_core/cuda/core/experimental/_launcher.py @@ -2,20 +2,16 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from dataclasses import dataclass import importlib.metadata +from dataclasses import dataclass from typing import Optional, Union -import numpy as np - -from cuda import cuda, cudart +from cuda import cuda from cuda.core.experimental._kernel_arg_handler import ParamHolder -from cuda.core.experimental._memory import Buffer from cuda.core.experimental._module import Kernel from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils import CUDAError, check_or_create_options, handle_return - # TODO: revisit this treatment for 
py313t builds _inited = False _use_ex = None @@ -28,8 +24,7 @@ def _lazy_init(): global _use_ex # binding availability depends on cuda-python version - _py_major_minor = tuple(int(v) for v in ( - importlib.metadata.version("cuda-python").split(".")[:2])) + _py_major_minor = tuple(int(v) for v in (importlib.metadata.version("cuda-python").split(".")[:2])) _driver_ver = handle_return(cuda.cuDriverGetVersion()) _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8)) _inited = True @@ -55,6 +50,7 @@ class LaunchConfig: (Default to size 0) """ + # TODO: expand LaunchConfig to include other attributes grid: Union[tuple, int] = None block: Union[tuple, int] = None @@ -65,14 +61,11 @@ def __post_init__(self): self.grid = self._cast_to_3_tuple(self.grid) self.block = self._cast_to_3_tuple(self.block) # we handle "stream=None" in the launch API - if self.stream is not None: - if not isinstance(self.stream, Stream): - try: - self.stream = Stream._init(self.stream) - except Exception as e: - raise ValueError( - "stream must either be a Stream object " - "or support __cuda_stream__") from e + if self.stream is not None and not isinstance(self.stream, Stream): + try: + self.stream = Stream._init(self.stream) + except Exception as e: + raise ValueError("stream must either be a Stream object or support __cuda_stream__") from e if self.shmem_size is None: self.shmem_size = 0 @@ -141,14 +134,11 @@ def launch(kernel, config, *kernel_args): drv_cfg.hStream = config.stream._handle drv_cfg.sharedMemBytes = config.shmem_size drv_cfg.numAttrs = 0 # TODO - handle_return(cuda.cuLaunchKernelEx( - drv_cfg, int(kernel._handle), args_ptr, 0)) + handle_return(cuda.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0)) else: # TODO: check if config has any unsupported attrs - handle_return(cuda.cuLaunchKernel( - int(kernel._handle), - *config.grid, - *config.block, - config.shmem_size, - config.stream._handle, - args_ptr, 0)) + handle_return( + cuda.cuLaunchKernel( + 
int(kernel._handle), *config.grid, *config.block, config.shmem_size, config.stream._handle, args_ptr, 0 + ) + ) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 678f26ee8..415b5151f 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -6,14 +6,12 @@ import abc from typing import Optional, Tuple, TypeVar -import warnings from cuda import cuda from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import default_stream from cuda.core.experimental._utils import handle_return - PyCapsule = TypeVar("PyCapsule") @@ -44,9 +42,9 @@ class Buffer: """ # TODO: handle ownership? (_mr could be None) - __slots__ = ("_ptr", "_size", "_mr",) + __slots__ = ("_ptr", "_size", "_mr") - def __init__(self, ptr, size, mr: MemoryResource=None): + def __init__(self, ptr, size, mr: MemoryResource = None): self._ptr = ptr self._size = size self._mr = mr @@ -112,7 +110,7 @@ def device_id(self) -> int: return self._mr.device_id raise NotImplementedError - def copy_to(self, dst: Buffer=None, *, stream) -> Buffer: + def copy_to(self, dst: Buffer = None, *, stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. Copies the data from this buffer to the provided dst buffer. 
@@ -136,8 +134,7 @@ def copy_to(self, dst: Buffer=None, *, stream) -> Buffer: dst = self._mr.allocate(self._size, stream) if dst._size != self._size: raise ValueError("buffer sizes mismatch between src and dst") - handle_return( - cuda.cuMemcpyAsync(dst._ptr, self._ptr, self._size, stream._handle)) + handle_return(cuda.cuMemcpyAsync(dst._ptr, self._ptr, self._size, stream._handle)) return dst def copy_from(self, src: Buffer, *, stream): @@ -156,14 +153,16 @@ def copy_from(self, src: Buffer, *, stream): raise ValueError("stream must be provided") if src._size != self._size: raise ValueError("buffer sizes mismatch between src and dst") - handle_return( - cuda.cuMemcpyAsync(self._ptr, src._ptr, self._size, stream._handle)) - - def __dlpack__(self, *, - stream: Optional[int] = None, - max_version: Optional[Tuple[int, int]] = None, - dl_device: Optional[Tuple[int, int]] = None, - copy: Optional[bool] = None) -> PyCapsule: + handle_return(cuda.cuMemcpyAsync(self._ptr, src._ptr, self._size, stream._handle)) + + def __dlpack__( + self, + *, + stream: Optional[int] = None, + max_version: Optional[Tuple[int, int]] = None, + dl_device: Optional[Tuple[int, int]] = None, + copy: Optional[bool] = None, + ) -> PyCapsule: # Note: we ignore the stream argument entirely (as if it is -1). # It is the user's responsibility to maintain stream order. if dl_device is not None or copy is True: @@ -172,10 +171,7 @@ def __dlpack__(self, *, versioned = False else: assert len(max_version) == 2 - if max_version >= (1, 0): - versioned = True - else: - versioned = False + versioned = max_version >= (1, 0) capsule = make_py_capsule(self, versioned) return capsule @@ -191,10 +187,10 @@ def __dlpack_device__(self) -> Tuple[int, int]: raise BufferError("invalid buffer") def __buffer__(self, flags: int, /) -> memoryview: - # Support for Python-level buffer protocol as per PEP 688. - # This raises a BufferError unless: + # Support for Python-level buffer protocol as per PEP 688. 
+ # This raises a BufferError unless: # 1. Python is 3.12+ - # 2. This Buffer object is host accessible + # 2. This Buffer object is host accessible raise NotImplementedError("TODO") def __release_buffer__(self, buffer: memoryview, /): @@ -203,20 +199,16 @@ def __release_buffer__(self, buffer: memoryview, /): class MemoryResource(abc.ABC): - __slots__ = ("_handle",) @abc.abstractmethod - def __init__(self, *args, **kwargs): - ... + def __init__(self, *args, **kwargs): ... @abc.abstractmethod - def allocate(self, size, stream=None) -> Buffer: - ... + def allocate(self, size, stream=None) -> Buffer: ... @abc.abstractmethod - def deallocate(self, ptr, size, stream=None): - ... + def deallocate(self, ptr, size, stream=None): ... @property @abc.abstractmethod @@ -241,7 +233,6 @@ def device_id(self) -> int: class _DefaultAsyncMempool(MemoryResource): - __slots__ = ("_dev_id",) def __init__(self, dev_id): @@ -273,7 +264,6 @@ def device_id(self) -> int: class _DefaultPinnedMemorySource(MemoryResource): - def __init__(self): # TODO: support flags from cuMemHostAlloc? 
self._handle = None diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 7621b9ee4..836d5064a 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -4,10 +4,9 @@ import importlib.metadata -from cuda import cuda, cudart +from cuda import cuda from cuda.core.experimental._utils import handle_return - _backend = { "old": { "file": cuda.cuModuleLoad, @@ -56,7 +55,10 @@ class Kernel: """ - __slots__ = ("_handle", "_module",) + __slots__ = ( + "_handle", + "_module", + ) def __init__(self): raise NotImplementedError("directly constructing a Kernel instance is not supported") @@ -107,8 +109,7 @@ class ObjectCode: __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map") _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin") - def __init__(self, module, code_type, jit_options=None, *, - symbol_mapping=None): + def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None): if code_type not in self._supported_code_type: raise ValueError _lazy_init() @@ -129,9 +130,16 @@ def __init__(self, module, code_type, jit_options=None, *, if jit_options is None: jit_options = {} if backend == "new": - args = (module, list(jit_options.keys()), list(jit_options.values()), len(jit_options), - # TODO: support library options - [], [], 0) + args = ( + module, + list(jit_options.keys()), + list(jit_options.values()), + len(jit_options), + # TODO: support library options + [], + [], + 0, + ) else: # "old" backend args = (module, len(jit_options), list(jit_options.keys()), list(jit_options.values())) self._handle = handle_return(self._loader["data"](*args)) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 75b7313f4..6cf13c83b 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: 
LicenseRef-NVIDIA-SOFTWARE-LICENSE from cuda import nvrtc -from cuda.core.experimental._utils import handle_return from cuda.core.experimental._module import ObjectCode +from cuda.core.experimental._utils import handle_return class Program: @@ -24,8 +24,8 @@ class Program: """ - __slots__ = ("_handle", "_backend", ) - _supported_code_type = ("c++", ) + __slots__ = ("_handle", "_backend") + _supported_code_type = ("c++",) _supported_target_type = ("ptx", "cubin", "ltoir") def __init__(self, code, code_type): @@ -38,8 +38,7 @@ def __init__(self, code, code_type): raise TypeError # TODO: support pre-loaded headers & include names # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - self._handle = handle_return( - nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) + self._handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) self._backend = "nvrtc" else: raise NotImplementedError @@ -85,14 +84,10 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): if self._backend == "nvrtc": if name_expressions: for n in name_expressions: - handle_return( - nvrtc.nvrtcAddNameExpression(self._handle, n.encode()), - handle=self._handle) + handle_return(nvrtc.nvrtcAddNameExpression(self._handle, n.encode()), handle=self._handle) # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved options = list(o.encode() for o in options) - handle_return( - nvrtc.nvrtcCompileProgram(self._handle, len(options), options), - handle=self._handle) + handle_return(nvrtc.nvrtcCompileProgram(self._handle, len(options), options), handle=self._handle) size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}") @@ -103,16 +98,15 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): symbol_mapping = {} if name_expressions: for n in name_expressions: - symbol_mapping[n] = handle_return(nvrtc.nvrtcGetLoweredName( - self._handle, n.encode()), 
handle=self._handle) + symbol_mapping[n] = handle_return( + nvrtc.nvrtcGetLoweredName(self._handle, n.encode()), handle=self._handle + ) if logs is not None: - logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._handle), - handle=self._handle) + logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._handle), handle=self._handle) if logsize > 1: log = b" " * logsize - handle_return(nvrtc.nvrtcGetProgramLog(self._handle, log), - handle=self._handle) + handle_return(nvrtc.nvrtcGetProgramLog(self._handle, log), handle=self._handle) logs.write(log.decode()) # TODO: handle jit_options for ptx? diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 7f50dafdb..6a68d1753 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -4,18 +4,16 @@ from __future__ import annotations -from dataclasses import dataclass import os -from typing import Optional, Tuple, TYPE_CHECKING, Union +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional, Tuple, Union if TYPE_CHECKING: from cuda.core.experimental._device import Device from cuda import cuda, cudart from cuda.core.experimental._context import Context from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._utils import check_or_create_options -from cuda.core.experimental._utils import get_device_from_ctx -from cuda.core.experimental._utils import handle_return +from cuda.core.experimental._utils import check_or_create_options, get_device_from_ctx, handle_return @dataclass @@ -31,6 +29,7 @@ class StreamOptions: higher priority. 
(Default to lowest priority) """ + nonblocking: bool = True priority: Optional[int] = None @@ -53,8 +52,7 @@ class Stream: """ - __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", - "_device_id", "_ctx_handle") + __slots__ = ("_handle", "_nonblocking", "_priority", "_owner", "_builtin", "_device_id", "_ctx_handle") def __init__(self): # minimal requirements for the destructor @@ -64,10 +62,11 @@ def __init__(self): raise NotImplementedError( "directly creating a Stream object can be ambiguous. Please either " "call Device.create_stream() or, if a stream pointer is already " - "available from somewhere else, Stream.from_handle()") + "available from somewhere else, Stream.from_handle()" + ) @staticmethod - def _init(obj=None, *, options: Optional[StreamOptions]=None): + def _init(obj=None, *, options: Optional[StreamOptions] = None): self = Stream.__new__(Stream) # minimal requirements for the destructor @@ -95,10 +94,7 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): nonblocking = options.nonblocking priority = options.priority - if nonblocking: - flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING - else: - flags = cuda.CUstream_flags.CU_STREAM_DEFAULT + flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cuda.CUstream_flags.CU_STREAM_DEFAULT high, low = handle_return(cudart.cudaDeviceGetStreamPriorityRange()) if priority is not None: @@ -107,8 +103,7 @@ def _init(obj=None, *, options: Optional[StreamOptions]=None): else: priority = high - self._handle = handle_return( - cuda.cuStreamCreateWithPriority(flags, priority)) + self._handle = handle_return(cuda.cuStreamCreateWithPriority(flags, priority)) self._owner = None self._nonblocking = nonblocking self._priority = priority @@ -169,7 +164,7 @@ def sync(self): """Synchronize the stream.""" handle_return(cuda.cuStreamSynchronize(self._handle)) - def record(self, event: Event=None, options: EventOptions=None) -> Event: + def record(self, event: Event = None, 
options: EventOptions = None) -> Event: """Record an event onto the stream. Creates an Event object (or reuses the given one) by @@ -217,12 +212,11 @@ def wait(self, event_or_stream: Union[Event, Stream]): stream = Stream._init(event_or_stream) except Exception as e: raise ValueError( - "only an Event, Stream, or object supporting " - "__cuda_stream__ can be waited") from e + "only an Event, Stream, or object supporting __cuda_stream__ can be waited" + ) from e else: stream = event_or_stream - event = handle_return( - cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING)) + event = handle_return(cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING)) handle_return(cuda.cuEventRecord(event, stream.handle)) discard_event = True @@ -243,11 +237,11 @@ def device(self) -> Device: """ from cuda.core.experimental._device import Device # avoid circular import + if self._device_id is None: # Get the stream context first if self._ctx_handle is None: - self._ctx_handle = handle_return( - cuda.cuStreamGetCtx(self._handle)) + self._ctx_handle = handle_return(cuda.cuStreamGetCtx(self._handle)) self._device_id = get_device_from_ctx(self._ctx_handle) return Device(self._device_id) @@ -255,8 +249,7 @@ def device(self) -> Device: def context(self) -> Context: """Return the :obj:`Context` associated with this stream.""" if self._ctx_handle is None: - self._ctx_handle = handle_return( - cuda.cuStreamGetCtx(self._handle)) + self._ctx_handle = handle_return(cuda.cuStreamGetCtx(self._handle)) if self._device_id is None: self._device_id = get_device_from_ctx(self._ctx_handle) return Context._from_ctx(self._ctx_handle, self._device_id) @@ -285,15 +278,16 @@ def from_handle(handle: int) -> Stream: Newly created stream object. 
""" + class _stream_holder: @property def __cuda_stream__(self): return (0, handle) + return Stream._init(obj=_stream_holder()) class _LegacyDefaultStream(Stream): - def __init__(self): self._handle = cuda.CUstream(cuda.CU_STREAM_LEGACY) self._owner = None @@ -303,7 +297,6 @@ def __init__(self): class _PerThreadDefaultStream(Stream): - def __init__(self): self._handle = cuda.CUstream(cuda.CU_STREAM_PER_THREAD) self._owner = None @@ -327,7 +320,7 @@ def default_stream(): """ # TODO: flip the default - use_ptds = int(os.environ.get('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', 0)) + use_ptds = int(os.environ.get("CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM", 0)) if use_ptds: return PER_THREAD_DEFAULT_STREAM else: diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index 894e21653..9cb47a33e 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -2,17 +2,19 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from collections import namedtuple import functools +from collections import namedtuple from typing import Callable, Dict from cuda import cuda, cudart, nvrtc -class CUDAError(Exception): pass +class CUDAError(Exception): + pass -class NVRTCError(CUDAError): pass +class NVRTCError(CUDAError): + pass ComputeCapability = namedtuple("ComputeCapability", ("major", "minor")) @@ -50,7 +52,7 @@ def _check_error(error, handle=None): err += f", compilation log:\n\n{log.decode()}" raise NVRTCError(err) else: - raise RuntimeError('Unknown error type: {}'.format(error)) + raise RuntimeError(f"Unknown error type: {error}") def handle_return(result, handle=None): @@ -76,9 +78,11 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals options = cls(**options) if not isinstance(options, cls): - raise TypeError(f"The {options_description} must be provided as an object " - f"of type {cls.__name__} or as a dict with valid 
{options_description}. " - f"The provided object is '{options}'.") + raise TypeError( + f"The {options_description} must be provided as an object " + f"of type {cls.__name__} or as a dict with valid {options_description}. " + f"The provided object is '{options}'." + ) return options @@ -88,17 +92,19 @@ def precondition(checker: Callable[..., None], what: str = "") -> Callable: A decorator that adds checks to ensure any preconditions are met. Args: - checker: The function to call to check whether the preconditions are met. It has the same signature as the wrapped - function with the addition of the keyword argument `what`. + checker: The function to call to check whether the preconditions are met. It has + the same signature as the wrapped function with the addition of the keyword argument `what`. what: A string that is passed in to `checker` to provide context information. Returns: Callable: A decorator that creates the wrapping. """ + def outer(wrapped_function): """ A decorator that actually wraps the function for checking preconditions. 
""" + @functools.wraps(wrapped_function) def inner(*args, **kwargs): """ @@ -116,17 +122,15 @@ def inner(*args, **kwargs): def get_device_from_ctx(ctx_handle) -> int: """Get device ID from the given ctx.""" - from cuda.core.experimental._device import Device # avoid circular import + from cuda.core.experimental._device import Device # avoid circular import + prev_ctx = Device().context._handle - if int(ctx_handle) != int(prev_ctx): - switch_context = True - else: - switch_context = False + switch_context = int(ctx_handle) != int(prev_ctx) if switch_context: assert prev_ctx == handle_return(cuda.cuCtxPopCurrent()) handle_return(cuda.cuCtxPushCurrent(ctx_handle)) device_id = int(handle_return(cuda.cuCtxGetDevice())) if switch_context: - assert ctx_handle == handle_return(cuda.cuCtxPopCurrent()) + assert ctx_handle == handle_return(cuda.cuCtxPopCurrent()) handle_return(cuda.cuCtxPushCurrent(prev_ctx)) return device_id diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/experimental/utils.py index 74f41e4d3..0717b41aa 100644 --- a/cuda_core/cuda/core/experimental/utils.py +++ b/cuda_core/cuda/core/experimental/utils.py @@ -2,4 +2,3 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from cuda.core.experimental._memoryview import StridedMemoryView, viewable diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 5b28d331f..ce37b3aa3 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -10,15 +10,16 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
import os + # import sys # sys.path.insert(0, os.path.abspath('.')) # -- Project information ----------------------------------------------------- -project = 'cuda.core' -copyright = '2024, NVIDIA' -author = 'NVIDIA' +project = "cuda.core" +copyright = "2024, NVIDIA" +author = "NVIDIA" # The full version, including alpha/beta/rc tags release = os.environ["SPHINX_CUDA_CORE_VER"] @@ -30,16 +31,16 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.napoleon', - 'myst_nb', - 'enum_tools.autoenum', - 'sphinx_copybutton', + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "myst_nb", + "enum_tools.autoenum", + "sphinx_copybutton", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -50,34 +51,34 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
-html_baseurl = 'docs' -html_theme = 'furo' -#html_theme = 'pydata_sphinx_theme' +html_baseurl = "docs" +html_theme = "furo" +# html_theme = 'pydata_sphinx_theme' html_theme_options = { "light_logo": "logo-light-mode.png", "dark_logo": "logo-dark-mode.png", # For pydata_sphinx_theme: - #"logo": { - # "image_light": "_static/logo-light-mode.png", + # "logo": { + # "image_light": "_static/logo-light-mode.png", # "image_dark": "_static/logo-dark-mode.png", - #}, - #"switcher": { + # }, + # "switcher": { # "json_url": "https://nvidia.github.io/cuda-python/cuda-core/versions.json", # "version_match": release, - #}, + # }, ## Add light/dark mode and documentation version switcher - #"navbar_end": [ + # "navbar_end": [ # "search-button", # "theme-switcher", # "version-switcher", # "navbar-icon-links", - #], + # ], } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['_static'] +html_static_path = ["_static"] # skip cmdline prompts -copybutton_exclude = '.linenos, .gp' +copybutton_exclude = ".linenos, .gp" diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index 37ad49330..8caa4d4a5 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -4,12 +4,9 @@ import sys -from cuda.core.experimental import Device -from cuda.core.experimental import LaunchConfig, launch -from cuda.core.experimental import Program - import cupy as cp +from cuda.core.experimental import Device, LaunchConfig, Program, launch # compute out = a * x + y code = """ @@ -35,9 +32,13 @@ prog = Program(code, code_type="c++") mod = prog.compile( "cubin", - options=("-std=c++11", "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),), + options=( + "-std=c++11", + "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability), + ), logs=sys.stdout, - name_expressions=("saxpy", "saxpy")) + name_expressions=("saxpy", "saxpy"), +) # run in single precision ker = mod.get_kernel("saxpy") @@ -62,7 +63,7 @@ s.sync() # check result -assert cp.allclose(out, a*x+y) +assert cp.allclose(out, a * x + y) # let's repeat again, this time allocates our own out buffer instead of cupy's # run in double precision @@ -77,8 +78,10 @@ dev.sync() # prepare output -buf = dev.allocate(size * 8, # = dtype.itemsize - stream=s) +buf = dev.allocate( + size * 8, # = dtype.itemsize + stream=s, +) # prepare launch block = 64 @@ -92,9 +95,10 @@ # check result # we wrap output buffer as a cupy array for simplicity -out = cp.ndarray(size, dtype=dtype, - memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0)) -assert cp.allclose(out, a*x+y) +out = cp.ndarray( + size, dtype=dtype, memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0) +) +assert cp.allclose(out, a * x + y) # clean up resources that we allocate # cupy cleans up automatically the rest diff --git 
a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index baee409aa..550eaf2a2 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -2,12 +2,9 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from cuda.core.experimental import Device -from cuda.core.experimental import LaunchConfig, launch -from cuda.core.experimental import Program - import cupy as cp +from cuda.core.experimental import Device, LaunchConfig, Program, launch # compute c = a + b code = """ @@ -32,8 +29,12 @@ prog = Program(code, code_type="c++") mod = prog.compile( "cubin", - options=("-std=c++17", "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),), - name_expressions=("vector_add",)) + options=( + "-std=c++17", + "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability), + ), + name_expressions=("vector_add",), +) # run in single precision ker = mod.get_kernel("vector_add") @@ -58,5 +59,5 @@ s.sync() # check result -assert cp.allclose(c, a+b) +assert cp.allclose(c, a + b) print("done!") diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 35d1c42a1..6993825f3 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -55,3 +55,43 @@ include = ["cuda.core*"] [tool.setuptools.dynamic] version = { attr = "cuda.core._version.__version__" } readme = { file = ["README.md"], content-type = "text/markdown" } + +[tool.ruff] +line-length = 120 + +[tool.ruff.format] +docstring-code-format = true + +exclude = ["cuda/core/_version.py"] + +[tool.ruff.lint] +select = [ + # pycodestyle Error + "E", + # Pyflakes + "F", + # pycodestyle Warning + "W", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", +] + +ignore = [ + "UP006", + "UP007", + "E741", # ambiguous variable name such as I + "B007", # rename unsued loop variable to _name + "UP035" # UP006, UP007, UP035 complain about deprecated Typing. 
use, but disregard backward compatibility of python version +] + +exclude = ["cuda/core/_version.py"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401"] +"setup.py" = ["F401"] diff --git a/cuda_core/setup.py b/cuda_core/setup.py index 8d20f2c94..029a19317 100644 --- a/cuda_core/setup.py +++ b/cuda_core/setup.py @@ -5,10 +5,9 @@ import os from Cython.Build import cythonize -from setuptools import setup, Extension +from setuptools import Extension, setup from setuptools.command.build_ext import build_ext as _build_ext - ext_modules = ( Extension( "cuda.core.experimental._dlpack", @@ -29,16 +28,15 @@ class build_ext(_build_ext): - def build_extensions(self): self.parallel = os.cpu_count() // 2 super().build_extensions() setup( - ext_modules=cythonize(ext_modules, - verbose=True, language_level=3, - compiler_directives={'embedsignature': True}), - cmdclass = {'build_ext': build_ext,}, + ext_modules=cythonize(ext_modules, verbose=True, language_level=3, compiler_directives={"embedsignature": True}), + cmdclass={ + "build_ext": build_ext, + }, zip_safe=False, ) diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 3c7eccd0c..bb99fb33c 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -10,11 +10,12 @@ except ImportError: from cuda import cuda as driver -from cuda.core.experimental import Device -from cuda.core.experimental import _device -from cuda.core.experimental._utils import handle_return import pytest +from cuda.core.experimental import Device, _device +from cuda.core.experimental._utils import handle_return + + @pytest.fixture(scope="function") def init_cuda(): device = Device() @@ -22,12 +23,14 @@ def init_cuda(): yield _device_unset_current() + def _device_unset_current(): handle_return(driver.cuCtxPopCurrent()) with _device._tls_lock: del _device._tls.devices + @pytest.fixture(scope="function") def deinit_cuda(): yield - _device_unset_current() \ No newline at end of file + _device_unset_current() 
diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index c02ea43fc..9b94ecd3e 100644 --- a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -8,18 +8,18 @@ # If we have subcategories of examples in the future, this file can be split along those lines -from .utils import run_example -import os import glob +import os + import pytest -samples_path = os.path.join( - os.path.dirname(__file__), '..', '..', 'examples') -sample_files = glob.glob(samples_path+'**/*.py', recursive=True) -@pytest.mark.parametrize( - 'example', sample_files -) +from .utils import run_example + +samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "examples") +sample_files = glob.glob(samples_path + "**/*.py", recursive=True) + + +@pytest.mark.parametrize("example", sample_files) class TestExamples: def test_example(self, example, deinit_cuda): - filename = os.path.basename(example) run_example(samples_path, example) diff --git a/cuda_core/tests/example_tests/utils.py b/cuda_core/tests/example_tests/utils.py index 23a3018ce..f6ac3e15d 100644 --- a/cuda_core/tests/example_tests/utils.py +++ b/cuda_core/tests/example_tests/utils.py @@ -9,16 +9,19 @@ import gc import os import sys -import pytest + import cupy as cp +import pytest + class SampleTestError(Exception): pass + def parse_python_script(filepath): - if not filepath.endswith('.py'): + if not filepath.endswith(".py"): raise ValueError(f"{filepath} not supported") - with open(filepath, "r", encoding='utf-8') as f: + with open(filepath, encoding="utf-8") as f: script = f.read() return script @@ -34,17 +37,17 @@ def run_example(samples_path, filename, env=None): exec(script, env if env else {}) except ImportError as e: # for samples requiring any of optional dependencies - for m in ('cupy',): + for m in ("cupy",): if f"No module named '{m}'" in str(e): - pytest.skip(f'{m} not installed, skipping 
related tests') + pytest.skip(f"{m} not installed, skipping related tests") break else: raise except Exception as e: - msg = "\n" - msg += f'Got error ({filename}):\n' - msg += str(e) - raise SampleTestError(msg) from e + msg = "\n" + msg += f"Got error ({filename}):\n" + msg += str(e) + raise SampleTestError(msg) from e finally: sys.path = old_sys_path sys.argv = old_argv diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py index c809bfb3f..afc3ed5b6 100644 --- a/cuda_core/tests/test_device.py +++ b/cuda_core/tests/test_device.py @@ -13,16 +13,19 @@ from cuda import cudart as runtime from cuda.core.experimental import Device -from cuda.core.experimental._utils import handle_return, ComputeCapability +from cuda.core.experimental._utils import ComputeCapability, handle_return + def test_device_set_current(deinit_cuda): device = Device() device.set_current() assert handle_return(driver.cuCtxGetCurrent()) is not None - + + def test_device_repr(): device = Device(0) - assert str(device).startswith(' bool: @property def device_id(self) -> int: return 0 - + + class DummyHostMemoryResource(MemoryResource): def __init__(self): pass @@ -49,7 +52,7 @@ def allocate(self, size, stream=None) -> Buffer: return Buffer(ptr=ptr, size=size, mr=self) def deallocate(self, ptr, size, stream=None): - #the memory is deallocated per the ctypes deallocation at garbage collection time + # the memory is deallocated per the ctypes deallocation at garbage collection time pass @property @@ -64,6 +67,7 @@ def is_host_accessible(self) -> bool: def device_id(self) -> int: raise RuntimeError("the pinned memory resource is not bound to any GPU") + class DummyUnifiedMemoryResource(MemoryResource): def __init__(self, device): self.device = device @@ -87,6 +91,7 @@ def is_host_accessible(self) -> bool: def device_id(self) -> int: return 0 + class DummyPinnedMemoryResource(MemoryResource): def __init__(self, device): self.device = device @@ -110,7 +115,8 @@ def 
is_host_accessible(self) -> bool: def device_id(self) -> int: raise RuntimeError("the pinned memory resource is not bound to any GPU") -def buffer_initialization(dummy_mr : MemoryResource): + +def buffer_initialization(dummy_mr: MemoryResource): buffer = dummy_mr.allocate(size=1024) assert buffer.handle != 0 assert buffer.size == 1024 @@ -119,6 +125,7 @@ def buffer_initialization(dummy_mr : MemoryResource): assert buffer.is_host_accessible == dummy_mr.is_host_accessible buffer.close() + def test_buffer_initialization(): device = Device() device.set_current() @@ -126,8 +133,9 @@ def test_buffer_initialization(): buffer_initialization(DummyHostMemoryResource()) buffer_initialization(DummyUnifiedMemoryResource(device)) buffer_initialization(DummyPinnedMemoryResource(device)) - -def buffer_copy_to(dummy_mr : MemoryResource, device : Device, check = False): + + +def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): src_buffer = dummy_mr.allocate(size=1024) dst_buffer = dummy_mr.allocate(size=1024) stream = device.create_stream() @@ -136,27 +144,29 @@ def buffer_copy_to(dummy_mr : MemoryResource, device : Device, check = False): src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) for i in range(1024): src_ptr[i] = ctypes.c_byte(i) - + src_buffer.copy_to(dst_buffer, stream=stream) device.sync() if check: dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - + for i in range(10): assert dst_ptr[i] == src_ptr[i] dst_buffer.close() src_buffer.close() + def test_buffer_copy_to(): device = Device() device.set_current() buffer_copy_to(DummyDeviceMemoryResource(device), device) buffer_copy_to(DummyUnifiedMemoryResource(device), device) - buffer_copy_to(DummyPinnedMemoryResource(device), device, check = True) + buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) + -def buffer_copy_from(dummy_mr : MemoryResource, device, check = False): +def buffer_copy_from(dummy_mr: MemoryResource, device, 
check=False): src_buffer = dummy_mr.allocate(size=1024) dst_buffer = dummy_mr.allocate(size=1024) stream = device.create_stream() @@ -165,31 +175,34 @@ def buffer_copy_from(dummy_mr : MemoryResource, device, check = False): src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) for i in range(1024): src_ptr[i] = ctypes.c_byte(i) - + dst_buffer.copy_from(src_buffer, stream=stream) device.sync() if check: dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - + for i in range(10): assert dst_ptr[i] == src_ptr[i] dst_buffer.close() src_buffer.close() + def test_buffer_copy_from(): device = Device() device.set_current() buffer_copy_from(DummyDeviceMemoryResource(device), device) buffer_copy_from(DummyUnifiedMemoryResource(device), device) - buffer_copy_from(DummyPinnedMemoryResource(device), device, check = True) + buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) -def buffer_close(dummy_mr : MemoryResource): + +def buffer_close(dummy_mr: MemoryResource): buffer = dummy_mr.allocate(size=1024) buffer.close() assert buffer.handle == 0 - assert buffer.memory_resource == None + assert buffer.memory_resource is None + def test_buffer_close(): device = Device() diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 832963777..5f0b6056d 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -6,11 +6,17 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
-from cuda.core.experimental._module import ObjectCode -import pytest import importlib -@pytest.mark.skipif(int(importlib.metadata.version("cuda-python").split(".")[0]) < 12, reason='Module loading for older drivers validate require valid module code.') +import pytest + +from cuda.core.experimental._module import ObjectCode + + +@pytest.mark.skipif( + int(importlib.metadata.version("cuda-python").split(".")[0]) < 12, + reason="Module loading for older drivers validate require valid module code.", +) def test_object_code_initialization(): # Test with supported code types for code_type in ["cubin", "ptx", "fatbin"]: @@ -24,15 +30,19 @@ def test_object_code_initialization(): with pytest.raises(ValueError): ObjectCode(b"dummy_data", "unsupported_code_type") -#TODO add ObjectCode tests which provide the appropriate data for cuLibraryLoadFromFile + +# TODO add ObjectCode tests which provide the appropriate data for cuLibraryLoadFromFile def test_object_code_initialization_with_str(): assert True + def test_object_code_initialization_with_jit_options(): assert True + def test_object_code_get_kernel(): assert True + def test_kernel_from_obj(): assert True diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index caa7369eb..af94a7ba0 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -6,52 +6,61 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. 
-from cuda.core.experimental import Program -from cuda.core.experimental._module import ObjectCode, Kernel import pytest +from cuda.core.experimental import Program +from cuda.core.experimental._module import Kernel, ObjectCode + + def test_program_init_valid_code_type(): - code = "extern \"C\" __global__ void my_kernel() {}" + code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") assert program.backend == "nvrtc" assert program.handle is not None + def test_program_init_invalid_code_type(): - code = "extern \"C\" __global__ void my_kernel() {}" + code = 'extern "C" __global__ void my_kernel() {}' with pytest.raises(NotImplementedError): Program(code, "python") + def test_program_init_invalid_code_format(): code = 12345 with pytest.raises(TypeError): Program(code, "c++") + def test_program_compile_valid_target_type(): - code = "extern \"C\" __global__ void my_kernel() {}" + code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") object_code = program.compile("ptx") kernel = object_code.get_kernel("my_kernel") assert isinstance(object_code, ObjectCode) assert isinstance(kernel, Kernel) + def test_program_compile_invalid_target_type(): - code = "extern \"C\" __global__ void my_kernel() {}" + code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") with pytest.raises(NotImplementedError): program.compile("invalid_target") + def test_program_backend_property(): - code = "extern \"C\" __global__ void my_kernel() {}" + code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") assert program.backend == "nvrtc" + def test_program_handle_property(): - code = "extern \"C\" __global__ void my_kernel() {}" + code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") assert program.handle is not None + def test_program_close(): - code = "extern \"C\" __global__ void my_kernel() {}" + code = 'extern "C" __global__ void my_kernel() {}' program = 
Program(code, "c++") program.close() assert program.handle is None diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index faf0ad7d8..03cdd8526 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -6,28 +6,34 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. +import pytest + from cuda.core.experimental import Device, Stream, StreamOptions -from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream from cuda.core.experimental._event import Event -import pytest +from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream + def test_stream_init(): with pytest.raises(NotImplementedError): Stream() + def test_stream_init_with_options(init_cuda): stream = Device().create_stream(options=StreamOptions(nonblocking=True, priority=0)) assert stream.is_nonblocking is True assert stream.priority == 0 + def test_stream_handle(init_cuda): stream = Device().create_stream(options=StreamOptions()) assert isinstance(stream.handle, int) + def test_stream_is_nonblocking(init_cuda): stream = Device().create_stream(options=StreamOptions(nonblocking=True)) assert stream.is_nonblocking is True + def test_stream_priority(init_cuda): stream = Device().create_stream(options=StreamOptions(priority=0)) assert stream.priority == 0 @@ -36,20 +42,24 @@ def test_stream_priority(init_cuda): with pytest.raises(ValueError): stream = Device().create_stream(options=StreamOptions(priority=1)) + def test_stream_sync(init_cuda): stream = Device().create_stream(options=StreamOptions()) stream.sync() # Should not raise any exceptions + def test_stream_record(init_cuda): stream = Device().create_stream(options=StreamOptions()) event = stream.record() assert isinstance(event, Event) + def test_stream_record_invalid_event(init_cuda): stream = Device().create_stream(options=StreamOptions()) with 
pytest.raises(TypeError): stream.record(event="invalid_event") + def test_stream_wait_event(init_cuda): s1 = Device().create_stream() s2 = Device().create_stream() @@ -57,21 +67,25 @@ def test_stream_wait_event(init_cuda): s2.wait(e1) # Should not raise any exceptions s2.sync() + def test_stream_wait_invalid_event(init_cuda): stream = Device().create_stream(options=StreamOptions()) with pytest.raises(ValueError): stream.wait(event_or_stream="invalid_event") + def test_stream_device(init_cuda): stream = Device().create_stream(options=StreamOptions()) device = stream.device assert isinstance(device, Device) + def test_stream_context(init_cuda): stream = Device().create_stream(options=StreamOptions()) context = stream.context assert context is not None + def test_stream_from_foreign_stream(init_cuda): device = Device() other_stream = device.create_stream(options=StreamOptions()) @@ -81,17 +95,21 @@ def test_stream_from_foreign_stream(init_cuda): assert isinstance(device, Device) context = stream.context assert context is not None - + + def test_stream_from_handle(): stream = Stream.from_handle(0) assert isinstance(stream, Stream) + def test_legacy_default_stream(): assert isinstance(LEGACY_DEFAULT_STREAM, Stream) + def test_per_thread_default_stream(): assert isinstance(PER_THREAD_DEFAULT_STREAM, Stream) + def test_default_stream(): stream = default_stream() assert isinstance(stream, Stream) diff --git a/cuda_python/docs/source/conf.py b/cuda_python/docs/source/conf.py index 8a5ab87f9..ab00c2203 100644 --- a/cuda_python/docs/source/conf.py +++ b/cuda_python/docs/source/conf.py @@ -16,9 +16,9 @@ # -- Project information ----------------------------------------------------- -project = 'CUDA Python' -copyright = '2021-2024, NVIDIA' -author = 'NVIDIA' +project = "CUDA Python" +copyright = "2021-2024, NVIDIA" +author = "NVIDIA" # The full version, including alpha/beta/rc tags release = os.environ["SPHINX_CUDA_PYTHON_VER"] @@ -30,14 +30,14 @@ # extensions coming with Sphinx 
(named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.napoleon', - 'myst_nb', - 'enum_tools.autoenum' + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "myst_nb", + "enum_tools.autoenum", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -48,40 +48,41 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_baseurl = 'docs' -html_theme = 'furo' -#html_theme = 'pydata_sphinx_theme' +html_baseurl = "docs" +html_theme = "furo" +# html_theme = 'pydata_sphinx_theme' html_theme_options = { "light_logo": "logo-light-mode.png", "dark_logo": "logo-dark-mode.png", # For pydata_sphinx_theme: - #"logo": { - # "image_light": "_static/logo-light-mode.png", + # "logo": { + # "image_light": "_static/logo-light-mode.png", # "image_dark": "_static/logo-dark-mode.png", - #}, - #"switcher": { + # }, + # "switcher": { # "json_url": "https://nvidia.github.io/cuda-python/cuda-core/versions.json", # "version_match": release, - #}, + # }, ## Add light/dark mode and documentation version switcher - #"navbar_end": [ + # "navbar_end": [ # "search-button", # "theme-switcher", # "version-switcher", # "navbar-icon-links", - #], + # ], } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Allow overwriting CUDA Python's domain name for local development. 
See: # - https://stackoverflow.com/a/61694897/2344149 # - https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-rst_epilog -CUDA_PYTHON_DOMAIN = os.environ.get('CUDA_PYTHON_DOMAIN', - 'https://nvidia.github.io/cuda-python') +CUDA_PYTHON_DOMAIN = os.environ.get( + "CUDA_PYTHON_DOMAIN", "https://nvidia.github.io/cuda-python" +) rst_epilog = f""" .. _cuda.core: {CUDA_PYTHON_DOMAIN}/cuda-core/latest .. _cuda.bindings: {CUDA_PYTHON_DOMAIN}/cuda-bindings/latest