|
2 | 2 | #
|
3 | 3 | # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
|
4 | 4 |
|
| 5 | +import sys |
| 6 | + |
5 | 7 | import cupy as cp
|
6 | 8 |
|
7 |
| -from cuda.core.experimental import Device, LaunchConfig, Program, launch |
| 9 | +from cuda.core.experimental import Device, LaunchConfig, Program, launch, system |
| 10 | + |
# This example needs two physical devices; exit cleanly (not with an
# error) on machines that cannot run it.
if system.num_devices < 2:
    print("this example requires at least 2 GPUs", file=sys.stderr)
    sys.exit(0)
dtype = cp.float32
size = 50000

# Set GPU 0
dev0 = Device(0)
dev0.set_current()
stream0 = dev0.create_stream()

# Compile a kernel targeting GPU 0 to compute c = a + b
code_add = """
extern "C"
__global__ void vector_add(const float* A,
                           const float* B,
                           float* C,
                           size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
        C[i] = A[i] + B[i];
    }
}
"""
# Build an "sm_XY" arch string from GPU 0's compute capability so the
# cubin matches the device it will run on.
arch0 = "".join(f"{i}" for i in dev0.compute_capability)
prog_add = Program(code_add, code_type="c++")
mod_add = prog_add.compile(
    "cubin",
    options=(
        "-std=c++17",
        "-arch=sm_" + arch0,
    ),
)
ker_add = mod_add.get_kernel("vector_add")
# Set GPU 1
dev1 = Device(1)
dev1.set_current()
stream1 = dev1.create_stream()

# Compile a kernel targeting GPU 1 to compute c = a - b
code_sub = """
extern "C"
__global__ void vector_sub(const float* A,
                           const float* B,
                           float* C,
                           size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
        C[i] = A[i] - B[i];
    }
}
"""
# GPU 1 may be a different architecture than GPU 0, so derive its own
# "sm_XY" target string.
arch1 = "".join(f"{i}" for i in dev1.compute_capability)
prog_sub = Program(code_sub, code_type="c++")
mod_sub = prog_sub.compile(
    "cubin",
    options=(
        "-std=c++17",
        "-arch=sm_" + arch1,
    ),
)
ker_sub = mod_sub.get_kernel("vector_sub")
|
# This adaptor ensures that any foreign stream (ex: from CuPy) that has
# not yet adopted the __cuda_stream__ protocol can still be recognized
# by cuda.core.
class StreamAdaptor:
    """Adapt a foreign stream-like object to the ``__cuda_stream__`` protocol.

    Wraps any object exposing its raw stream handle via a ``.ptr``
    attribute (e.g. ``cupy.cuda.Stream``) so cuda.core can consume it.
    """

    def __init__(self, obj):
        # Hold a reference to the foreign stream so its handle stays alive
        # for as long as this adaptor is used.
        self.obj = obj

    @property
    def __cuda_stream__(self):
        # Protocol version 0, followed by the raw handle.
        # Note: CuPy streams have a .ptr attribute.
        return (0, self.obj.ptr)
# Create launch configs for each kernel that will be executed on the
# respective CUDA streams.
block = 256
grid = (size + block - 1) // block
config0 = LaunchConfig(grid=grid, block=block, stream=stream0)
config1 = LaunchConfig(grid=grid, block=block, stream=stream1)

# Allocate memory on GPU 0
# Note: This runs on CuPy's current stream for GPU 0
dev0.set_current()
a = cp.random.random(size, dtype=dtype)
b = cp.random.random(size, dtype=dtype)
c = cp.empty_like(a)
cp_stream0 = StreamAdaptor(cp.cuda.get_current_stream())

# Establish a stream order to ensure that memory has been initialized before
# accessed by the kernel.
stream0.wait(cp_stream0)

# Launch the add kernel on GPU 0 / stream 0
launch(ker_add, config0, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))

# Allocate memory on GPU 1
# Note: This runs on CuPy's current stream for GPU 1.
dev1.set_current()
x = cp.random.random(size, dtype=dtype)
y = cp.random.random(size, dtype=dtype)
# Mirror x (resident on GPU 1), not a (resident on GPU 0), so the output
# allocation does not reference an array living on the other device.
z = cp.empty_like(x)
cp_stream1 = StreamAdaptor(cp.cuda.get_current_stream())

# Establish a stream order
stream1.wait(cp_stream1)

# Launch the subtract kernel on GPU 1 / stream 1
launch(ker_sub, config1, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))

# Synchronize both GPUs and validate the results
dev0.set_current()
stream0.sync()
assert cp.allclose(c, a + b)
dev1.set_current()
stream1.sync()
assert cp.allclose(z, x - y)
|
109 | 133 |
|
|
0 commit comments