Skip to content

Commit a817969

Browse files
authored
Merge pull request #304 from amwi04/multi-gpu
multi gpu example
2 parents bfd3ecb + b041f32 commit a817969

File tree

2 files changed

+137
-2
lines changed

2 files changed

+137
-2
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
4+
5+
import sys
6+
7+
import cupy as cp
8+
9+
from cuda.core.experimental import Device, LaunchConfig, Program, launch, system
10+
11+
# Bail out early on machines with fewer than two GPUs. Exit status 0 is
# deliberate so CI/test harnesses treat the skip as success, not failure.
if system.num_devices < 2:
    print("this example requires at least 2 GPUs", file=sys.stderr)
    sys.exit(0)

# Element type and problem size shared by both kernels below.
dtype = cp.float32
size = 50000
17+
18+
# Set GPU 0 as the current device and create a stream on it.
dev0 = Device(0)
dev0.set_current()
stream0 = dev0.create_stream()

# Compile a kernel targeting GPU 0 to compute c = a + b.
# The loop is a grid-stride loop: each thread handles every gridDim.x*blockDim.x-th
# element, so correctness must not depend on the grid covering all N elements.
# Fix: index with the loop variable i, not tid — the original wrote C[tid]
# repeatedly and never touched elements beyond the first grid's worth.
code_add = """
extern "C"
__global__ void vector_add(const float* A,
                           const float* B,
                           float* C,
                           size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
        C[i] = A[i] + B[i];
    }
}
"""

# Build the target architecture string from GPU 0's compute capability,
# e.g. (8, 0) -> "80" -> "-arch=sm_80".
arch0 = "".join(f"{i}" for i in dev0.compute_capability)
prog_add = Program(code_add, code_type="c++")
mod_add = prog_add.compile(
    "cubin",
    options=(
        "-std=c++17",
        "-arch=sm_" + arch0,
    ),
)
ker_add = mod_add.get_kernel("vector_add")
46+
47+
# Set GPU 1 as the current device and create a stream on it.
dev1 = Device(1)
dev1.set_current()
stream1 = dev1.create_stream()

# Compile a kernel targeting GPU 1 to compute c = a - b.
# Same grid-stride pattern as vector_add; fix: the body must index with the
# loop variable i (the original used tid, which silently skips elements
# whenever the grid does not cover all N in one pass).
code_sub = """
extern "C"
__global__ void vector_sub(const float* A,
                           const float* B,
                           float* C,
                           size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
        C[i] = A[i] - B[i];
    }
}
"""

# Target GPU 1's compute capability (it may differ from GPU 0's).
arch1 = "".join(f"{i}" for i in dev1.compute_capability)
prog_sub = Program(code_sub, code_type="c++")
mod_sub = prog_sub.compile(
    "cubin",
    options=(
        "-std=c++17",
        "-arch=sm_" + arch1,
    ),
)
ker_sub = mod_sub.get_kernel("vector_sub")
75+
76+
77+
class StreamAdaptor:
    """Adapt a foreign stream (e.g. a CuPy stream) that does not yet
    implement the ``__cuda_stream__`` protocol so cuda.core can accept it.
    """

    def __init__(self, obj):
        # Hold a reference to the wrapped stream object.
        self.obj = obj

    @property
    def __cuda_stream__(self):
        # Protocol value is (version, native stream handle).
        # CuPy streams expose the raw handle through their .ptr attribute.
        return (0, self.obj.ptr)
88+
89+
90+
# Build one launch configuration per kernel, each bound to the stream of the
# device it will execute on.
threads_per_block = 256
# Ceiling division so every element is covered by at least one thread.
num_blocks = (size + threads_per_block - 1) // threads_per_block
config0 = LaunchConfig(grid=num_blocks, block=threads_per_block, stream=stream0)
config1 = LaunchConfig(grid=num_blocks, block=threads_per_block, stream=stream1)
96+
97+
# Allocate the operands and the output on GPU 0.
# NOTE: CuPy performs these allocations on its own current stream for GPU 0.
dev0.set_current()
a = cp.random.random(size, dtype=dtype)
b = cp.random.random(size, dtype=dtype)
c = cp.empty_like(a)
cp_stream0 = StreamAdaptor(cp.cuda.get_current_stream())

# Order stream0 after CuPy's stream so the kernel only observes fully
# initialized input arrays.
stream0.wait(cp_stream0)

# Launch the add kernel on GPU 0 / stream 0.
launch(ker_add, config0, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
111+
112+
# Allocate the operands and the output on GPU 1.
# NOTE: CuPy performs these allocations on its own current stream for GPU 1.
dev1.set_current()
x = cp.random.random(size, dtype=dtype)
y = cp.random.random(size, dtype=dtype)
# Fix: derive the output from x (a GPU-1 array), not from a, which was
# allocated on GPU 0 — the output must be associated with this device's data.
z = cp.empty_like(x)
cp_stream1 = StreamAdaptor(cp.cuda.get_current_stream())

# Order stream1 after CuPy's stream so the inputs are initialized before use.
stream1.wait(cp_stream1)

# Launch the subtract kernel on GPU 1 / stream 1.
launch(ker_sub, config1, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
125+
126+
# Synchronize both GPUs are validate the results
127+
dev0.set_current()
128+
stream0.sync()
129+
assert cp.allclose(c, a + b)
130+
dev1.set_current()
131+
stream1.sync()
132+
assert cp.allclose(z, x - y)
133+
134+
print("done")

cuda_core/tests/conftest.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,9 @@ def _device_unset_current():
4141
# no active context, do nothing
4242
return
4343
handle_return(driver.cuCtxPopCurrent())
44-
with _device._tls_lock:
45-
del _device._tls.devices
44+
if hasattr(_device._tls, "devices"):
45+
with _device._tls_lock:
46+
del _device._tls.devices
4647

4748

4849
@pytest.fixture(scope="function")

0 commit comments

Comments
 (0)