Skip to content

Commit 797286b

Browse files
authored
Merge pull request #87 from leofang/cuda_py
Introducing `cuda.core`: pythonic access to CUDA core functionalities
2 parents c4cabc3 + 317dd13 commit 797286b

24 files changed

+2520
-0
lines changed

cuda_core/MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
recursive-include cuda/core *.pyx *.pxd

cuda_core/README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# `cuda.core`: (experimental) pythonic CUDA module
2+
3+
Currently under active development. To build from source, just do:
4+
```shell
5+
$ git clone https://github.com/NVIDIA/cuda-python
6+
$ cd cuda-python/cuda_core # move to the directory where this README is located
7+
$ pip install .
8+
```
9+
For now `cuda-python` is a required dependency.

cuda_core/cuda/core/__init__.pxd

Whitespace-only changes.

cuda_core/cuda/core/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
4+
5+
from cuda.core._device import Device
6+
from cuda.core._event import EventOptions
7+
from cuda.core._launcher import LaunchConfig, launch
8+
from cuda.core._program import Program
9+
from cuda.core._stream import Stream, StreamOptions
10+
from cuda.core._version import __version__

cuda_core/cuda/core/_context.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
4+
5+
from dataclasses import dataclass
6+
7+
from cuda import cuda, cudart
8+
from cuda.core._utils import handle_return
9+
10+
11+
@dataclass
class ContextOptions:
    """Options for customizing CUDA context creation (not implemented yet)."""
    pass  # TODO


class Context:
    """Thin wrapper around a driver-level CUDA context handle.

    Instances are not user-constructible yet; they are produced internally
    through the :meth:`_from_ctx` factory.
    """

    __slots__ = ("_handle", "_id")

    def __init__(self):
        # Public construction is deliberately unsupported for now.
        raise NotImplementedError("TODO")

    @staticmethod
    def _from_ctx(obj, dev_id):
        """Wrap an existing ``CUcontext`` *obj* belonging to device *dev_id*."""
        assert isinstance(obj, cuda.CUcontext)
        instance = Context.__new__(Context)
        instance._id = dev_id
        instance._handle = obj
        return instance

cuda_core/cuda/core/_device.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
4+
5+
import threading
6+
from typing import Optional, Union
7+
import warnings
8+
9+
from cuda import cuda, cudart
10+
from cuda.core._utils import handle_return, ComputeCapability, CUDAError, \
11+
precondition
12+
from cuda.core._context import Context, ContextOptions
13+
from cuda.core._memory import _DefaultAsyncMempool, Buffer, MemoryResource
14+
from cuda.core._stream import default_stream, Stream, StreamOptions
15+
16+
17+
# Per-thread storage for the lazily built list of Device singletons
# (one entry per visible GPU); populated in Device.__new__.
_tls = threading.local()
# Serializes the first-time population of a thread's device list.
_tls_lock = threading.Lock()
19+
20+
21+
class Device:
    """Represent a GPU and act as the entry point to `cuda.core` features.

    Instances are per-thread singletons keyed by device ordinal: calling
    ``Device(i)`` repeatedly in one thread returns the same object.
    Constructing a Device does not initialize the GPU; users must call
    :meth:`set_current` before most other methods (enforced by the
    ``precondition`` decorator via :meth:`_check_context_initialized`).
    """

    __slots__ = ("_id", "_mr", "_has_inited")

    def __new__(cls, device_id=None):
        """Return the singleton Device for *device_id* (default: current device).

        Raises ValueError if *device_id* is not an int in ``[0, device count)``.
        """
        # important: creating a Device instance does not initialize the GPU!
        if device_id is None:
            # NOTE(review): cudaGetDevice() may implicitly create a context on
            # device 0 via runtime lazy init — confirm against the claim above.
            device_id = handle_return(cudart.cudaGetDevice())
            assert isinstance(device_id, int), f"{device_id=}"
        else:
            total = handle_return(cudart.cudaGetDeviceCount())
            if not isinstance(device_id, int) or not (0 <= device_id < total):
                raise ValueError(
                    f"device_id must be within [0, {total}), got {device_id}")

        # ensure Device is singleton
        # Build the full per-thread device list once, on first use in a thread.
        with _tls_lock:
            if not hasattr(_tls, "devices"):
                total = handle_return(cudart.cudaGetDeviceCount())
                _tls.devices = []
                for dev_id in range(total):
                    dev = super().__new__(cls)
                    dev._id = dev_id
                    dev._mr = _DefaultAsyncMempool(dev_id)
                    dev._has_inited = False
                    _tls.devices.append(dev)

        return _tls.devices[device_id]

    def _check_context_initialized(self, *args, **kwargs):
        # Precondition hook: guards methods that need an active context.
        if not self._has_inited:
            raise CUDAError("the device is not yet initialized, "
                            "perhaps you forgot to call .set_current() first?")

    @property
    def device_id(self) -> int:
        """Return the device ordinal (int)."""
        return self._id

    @property
    def pci_bus_id(self) -> str:
        """Return the PCI bus id as a 12-character string (e.g. 'domain:bus:device')."""
        # 13 = 12 identifier characters + trailing NUL from the C API.
        bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id))
        return bus_id[:12].decode()

    @property
    def uuid(self) -> str:
        """Return the device UUID formatted as 8-4-4-4-12 hex groups."""
        driver_ver = handle_return(cuda.cuDriverGetVersion())
        # cuDeviceGetUuid_v2 (MIG-aware) exists only from CUDA 11.4 onwards.
        if driver_ver >= 11040:
            uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id))
        else:
            uuid = handle_return(cuda.cuDeviceGetUuid(self._id))
        uuid = uuid.bytes.hex()
        # 8-4-4-4-12
        return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}"

    @property
    def name(self) -> str:
        """Return the device name, decoded from the driver's C string."""
        # assuming a GPU name is less than 128 characters...
        name = handle_return(cuda.cuDeviceGetName(128, self._id))
        # Trim at the first NUL terminator before decoding.
        name = name.split(b'\0')[0]
        return name.decode()

    @property
    def properties(self) -> dict:
        """Return the raw cudaDeviceProp structure for this device."""
        # TODO: pythonize the key names
        return handle_return(cudart.cudaGetDeviceProperties(self._id))

    @property
    def compute_capability(self) -> ComputeCapability:
        """Returns a named tuple with 2 fields: major and minor. """
        major = handle_return(cudart.cudaDeviceGetAttribute(
            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id))
        minor = handle_return(cudart.cudaDeviceGetAttribute(
            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id))
        return ComputeCapability(major, minor)

    @property
    @precondition(_check_context_initialized)
    def context(self) -> Context:
        """Return a Context wrapping the thread's current CUDA context.

        Requires :meth:`set_current` to have been called first.
        """
        ctx = handle_return(cuda.cuCtxGetCurrent())
        assert int(ctx) != 0
        return Context._from_ctx(ctx, self._id)

    @property
    def memory_resource(self) -> MemoryResource:
        """Return the memory resource used by :meth:`allocate`."""
        return self._mr

    @memory_resource.setter
    def memory_resource(self, mr):
        # Only MemoryResource instances may replace the default mempool.
        if not isinstance(mr, MemoryResource):
            raise TypeError
        self._mr = mr

    @property
    def default_stream(self) -> Stream:
        """Return the default stream object (legacy or per-thread, per build)."""
        return default_stream()

    def __int__(self):
        # int(dev) yields the device ordinal.
        return self._id

    def __repr__(self):
        return f"<Device {self._id} ({self.name})>"

    def set_current(self, ctx: Context=None) -> Union[Context, None]:
        """
        Entry point of this object. Users always start a code by
        calling this method, e.g.

        >>> from cuda.core import Device
        >>> dev0 = Device(0)
        >>> dev0.set_current()
        >>> # ... do work on device 0 ...

        The optional ctx argument is for advanced users to bind a
        CUDA context with the device. In this case, the previously
        set context is popped and returned to the user.
        """
        if ctx is not None:
            # Bind the caller-supplied context; it must belong to this device.
            if not isinstance(ctx, Context):
                raise TypeError("a Context object is required")
            if ctx._id != self._id:
                raise RuntimeError("the provided context was created on a different "
                                   f"device {ctx._id} other than the target {self._id}")
            prev_ctx = handle_return(cuda.cuCtxPopCurrent())
            handle_return(cuda.cuCtxPushCurrent(ctx._handle))
            self._has_inited = True
            # A zero handle means there was no previous context to hand back.
            if int(prev_ctx) != 0:
                return Context._from_ctx(prev_ctx, self._id)
        else:
            ctx = handle_return(cuda.cuCtxGetCurrent())
            if int(ctx) == 0:
                # use primary ctx
                ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
                handle_return(cuda.cuCtxPushCurrent(ctx))
            else:
                ctx_id = handle_return(cuda.cuCtxGetDevice())
                if ctx_id != self._id:
                    # use primary ctx
                    ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
                    handle_return(cuda.cuCtxPushCurrent(ctx))
                else:
                    # no-op, a valid context already exists and is set current
                    pass
            self._has_inited = True

    def create_context(self, options: ContextOptions = None) -> Context:
        """Create a new Context for this device (not yet implemented)."""
        # Create a Context object (but do NOT set it current yet!).
        # ContextOptions is a dataclass for setting e.g. affinity or CIG
        # options.
        raise NotImplementedError("TODO")

    @precondition(_check_context_initialized)
    def create_stream(self, obj=None, options: StreamOptions=None) -> Stream:
        """Create a new Stream, or wrap *obj* exposing ``__cuda_stream__``."""
        # Create a Stream object by either holding a newly created
        # CUDA stream or wrapping an existing foreign object supporting
        # the __cuda_stream__ protocol. In the latter case, a reference
        # to obj is held internally so that its lifetime is managed.
        return Stream._init(obj=obj, options=options)

    @precondition(_check_context_initialized)
    def allocate(self, size, stream=None) -> Buffer:
        """Allocate *size* bytes from this device's memory resource.

        The allocation is ordered on *stream* (default stream if None).
        """
        if stream is None:
            stream = default_stream()
        return self._mr.allocate(size, stream)

    @precondition(_check_context_initialized)
    def sync(self):
        """Block the calling host thread until all device work completes."""
        handle_return(cudart.cudaDeviceSynchronize())

cuda_core/cuda/core/_dlpack.pxd

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
4+
5+
cimport cpython
6+
7+
from libc cimport stdlib
8+
from libc.stdint cimport uint8_t
9+
from libc.stdint cimport uint16_t
10+
from libc.stdint cimport uint32_t
11+
from libc.stdint cimport int32_t
12+
from libc.stdint cimport int64_t
13+
from libc.stdint cimport uint64_t
14+
from libc.stdint cimport intptr_t
15+
16+
17+
# Cython declarations mirroring the DLPack C header, for zero-copy tensor
# exchange. The verbatim C block defines the capsule-name macros that the
# named const char* declarations at the bottom expose to Cython.
cdef extern from "dlpack.h" nogil:
    """
    #define DLPACK_TENSOR_UNUSED_NAME "dltensor"
    #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned"
    #define DLPACK_TENSOR_USED_NAME "used_dltensor"
    #define DLPACK_VERSIONED_TENSOR_USED_NAME "used_dltensor_versioned"
    """
    # Device kinds used here (subset of DLPack's DLDeviceType enum); the
    # leading underscore avoids name clashes while keeping the C names.
    ctypedef enum _DLDeviceType "DLDeviceType":
        _kDLCPU "kDLCPU"
        _kDLCUDA "kDLCUDA"
        _kDLCUDAHost "kDLCUDAHost"
        _kDLCUDAManaged "kDLCUDAManaged"

    # Identifies where a tensor's memory lives.
    ctypedef struct DLDevice:
        _DLDeviceType device_type
        int32_t device_id

    cdef enum DLDataTypeCode:
        kDLInt
        kDLUInt
        kDLFloat
        kDLBfloat
        kDLComplex
        kDLBool

    # Element type descriptor: type code, bit width, and vector lanes.
    ctypedef struct DLDataType:
        uint8_t code
        uint8_t bits
        uint16_t lanes

    # The core (unversioned) tensor descriptor.
    ctypedef struct DLTensor:
        void* data
        DLDevice device
        int32_t ndim
        DLDataType dtype
        int64_t* shape
        int64_t* strides
        uint64_t byte_offset

    # Pre-1.0 DLPack exchange struct: tensor plus ownership-transfer callback.
    ctypedef struct DLManagedTensor:
        DLTensor dl_tensor
        void* manager_ctx
        void (*deleter)(DLManagedTensor*)

    ctypedef struct DLPackVersion:
        uint32_t major
        uint32_t minor

    # DLPack 1.0+ exchange struct: adds an explicit version and flags
    # (e.g. the read-only bitmask below).
    ctypedef struct DLManagedTensorVersioned:
        DLPackVersion version
        void* manager_ctx
        void (*deleter)(DLManagedTensorVersioned*)
        uint64_t flags
        DLTensor dl_tensor

    int DLPACK_MAJOR_VERSION
    int DLPACK_MINOR_VERSION
    int DLPACK_FLAG_BITMASK_READ_ONLY

    # Capsule names defined by the verbatim C block at the top of this extern.
    const char* DLPACK_TENSOR_UNUSED_NAME
    const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME
    const char* DLPACK_TENSOR_USED_NAME
    const char* DLPACK_VERSIONED_TENSOR_USED_NAME

0 commit comments

Comments
 (0)