diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py new file mode 100644 index 000000000..89dd6cbea --- /dev/null +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -0,0 +1,134 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import sys + +import cupy as cp + +from cuda.core.experimental import Device, LaunchConfig, Program, launch, system + +if system.num_devices < 2: + print("this example requires at least 2 GPUs", file=sys.stderr) + sys.exit(0) + +dtype = cp.float32 +size = 50000 + +# Set GPU 0 +dev0 = Device(0) +dev0.set_current() +stream0 = dev0.create_stream() + +# Compile a kernel targeting GPU 0 to compute c = a + b +code_add = """ +extern "C" +__global__ void vector_add(const float* A, + const float* B, + float* C, + size_t N) { + const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (size_t i=tid; i