# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import threading
from typing import Optional, Union
import warnings

from cuda import cuda, cudart
from cuda.core._utils import (handle_return, ComputeCapability, CUDAError,
                              precondition)
from cuda.core._context import Context, ContextOptions
from cuda.core._memory import _DefaultAsyncMempool, Buffer, MemoryResource
from cuda.core._stream import default_stream, Stream, StreamOptions


# Thread-local storage for the per-thread Device cache; the lock serializes
# its first-time construction.
_tls = threading.local()
_tls_lock = threading.Lock()


class Device:
    """A GPU device, the entry point for device-bound operations.

    Instances are cached per thread, so for a given thread Device(n)
    always returns the same object. Creating a Device does not
    initialize the GPU; call set_current() first.
    """

    __slots__ = ("_id", "_mr", "_has_inited")

    def __new__(cls, device_id=None):
        # important: creating a Device instance does not initialize the GPU!
        if device_id is None:
            device_id = handle_return(cudart.cudaGetDevice())
            assert isinstance(device_id, int), f"{device_id=}"
        else:
            total = handle_return(cudart.cudaGetDeviceCount())
            if not isinstance(device_id, int) or not (0 <= device_id < total):
                raise ValueError(
                    f"device_id must be within [0, {total}), got {device_id}")

        # ensure Device is singleton
        with _tls_lock:
            if not hasattr(_tls, "devices"):
                total = handle_return(cudart.cudaGetDeviceCount())
                _tls.devices = []
                for dev_id in range(total):
                    dev = super().__new__(cls)
                    dev._id = dev_id
                    dev._mr = _DefaultAsyncMempool(dev_id)
                    dev._has_inited = False
                    _tls.devices.append(dev)

        return _tls.devices[device_id]

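    # A usage sketch (not from the original source): because instances are
    # cached in thread-local storage, repeated construction within a thread
    # is cheap and returns the same object (assuming at least one visible GPU):
    #
    #   >>> Device(0) is Device(0)
    #   True
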
    def _check_context_initialized(self, *args, **kwargs):
        # precondition hook: guards methods that require an active context
        if not self._has_inited:
            raise CUDAError("the device is not yet initialized, "
                            "perhaps you forgot to call .set_current() first?")

    @property
    def device_id(self) -> int:
        return self._id

    @property
    def pci_bus_id(self) -> str:
        # 13 bytes: the "0000:00:00.0" format (12 characters) plus a NUL byte
        bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id))
        return bus_id[:12].decode()

    @property
    def uuid(self) -> str:
        driver_ver = handle_return(cuda.cuDriverGetVersion())
        if driver_ver >= 11040:
            # cuDeviceGetUuid_v2 requires driver 11.4+
            uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id))
        else:
            uuid = handle_return(cuda.cuDeviceGetUuid(self._id))
        uuid = uuid.bytes.hex()
        # 8-4-4-4-12
        return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}"

    @property
    def name(self) -> str:
        # assuming a GPU name is less than 128 characters...
        name = handle_return(cuda.cuDeviceGetName(128, self._id))
        name = name.split(b'\0')[0]
        return name.decode()

    @property
    def properties(self) -> dict:
        # TODO: pythonize the key names
        return handle_return(cudart.cudaGetDeviceProperties(self._id))

    @property
    def compute_capability(self) -> ComputeCapability:
        """Returns a named tuple with 2 fields: major and minor."""
        major = handle_return(cudart.cudaDeviceGetAttribute(
            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id))
        minor = handle_return(cudart.cudaDeviceGetAttribute(
            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id))
        return ComputeCapability(major, minor)

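    # For example (a hedged sketch; the value depends on the actual GPU):
    #
    #   >>> cc = Device(0).compute_capability
    #   >>> cc.major, cc.minor    # e.g. (8, 0) on an A100
    #   (8, 0)
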
    @property
    @precondition(_check_context_initialized)
    def context(self) -> Context:
        ctx = handle_return(cuda.cuCtxGetCurrent())
        assert int(ctx) != 0
        return Context._from_ctx(ctx, self._id)

    @property
    def memory_resource(self) -> MemoryResource:
        return self._mr

    @memory_resource.setter
    def memory_resource(self, mr):
        if not isinstance(mr, MemoryResource):
            raise TypeError(f"expected a MemoryResource, got {type(mr)}")
        self._mr = mr

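    # A hedged sketch of swapping in a custom allocator; MyMemoryResource is
    # hypothetical, but any MemoryResource subclass would fit here:
    #
    #   >>> dev = Device(0)
    #   >>> dev.memory_resource = MyMemoryResource(dev.device_id)
    #
    # Subsequent dev.allocate(...) calls then draw from the new resource.
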
    @property
    def default_stream(self) -> Stream:
        return default_stream()

    def __int__(self):
        return self._id

    def __repr__(self):
        return f"<Device {self._id} ({self.name})>"

    def set_current(self, ctx: Optional[Context] = None) -> Union[Context, None]:
        """
        Entry point of this object. Users always start by calling this
        method, e.g.

        >>> from cuda.core import Device
        >>> dev0 = Device(0)
        >>> dev0.set_current()
        >>> # ... do work on device 0 ...

        The optional ctx argument is for advanced users to bind a
        CUDA context to the device. In this case, the previously
        set context is popped and returned to the user.
        """
        if ctx is not None:
            if not isinstance(ctx, Context):
                raise TypeError("a Context object is required")
            if ctx._id != self._id:
                raise RuntimeError(
                    f"the provided context was created on device {ctx._id}, "
                    f"which is not the target device {self._id}")
            prev_ctx = handle_return(cuda.cuCtxPopCurrent())
            handle_return(cuda.cuCtxPushCurrent(ctx._handle))
            self._has_inited = True
            if int(prev_ctx) != 0:
                return Context._from_ctx(prev_ctx, self._id)
        else:
            ctx = handle_return(cuda.cuCtxGetCurrent())
            if int(ctx) == 0:
                # no current context; retain and push the primary context
                ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
                handle_return(cuda.cuCtxPushCurrent(ctx))
            else:
                ctx_id = handle_return(cuda.cuCtxGetDevice())
                if ctx_id != self._id:
                    # the current context belongs to another device; switch
                    # to this device's primary context
                    ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
                    handle_return(cuda.cuCtxPushCurrent(ctx))
                else:
                    # no-op: a valid context already exists and is current
                    pass
            self._has_inited = True

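    # A hedged sketch of the advanced path described above: binding an
    # explicit context, then restoring the previous one (some_ctx is a
    # hypothetical Context created on device 0):
    #
    #   >>> dev = Device(0)
    #   >>> prev = dev.set_current(some_ctx)
    #   >>> ...                        # work under some_ctx
    #   >>> if prev is not None:
    #   ...     dev.set_current(prev)  # restore the previous context
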
    def create_context(self, options: ContextOptions = None) -> Context:
        """Create a Context object (but do NOT set it current yet!).

        ContextOptions is a dataclass for setting e.g. affinity or CIG
        options.
        """
        raise NotImplementedError("TODO")

    @precondition(_check_context_initialized)
    def create_stream(self, obj=None,
                      options: Optional[StreamOptions] = None) -> Stream:
        """Create a Stream object by either holding a newly created
        CUDA stream or wrapping an existing foreign object supporting
        the __cuda_stream__ protocol. In the latter case, a reference
        to obj is held internally so that its lifetime is managed.
        """
        return Stream._init(obj=obj, options=options)

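    # A hedged sketch of the foreign-stream path: any object exposing the
    # __cuda_stream__ protocol can be wrapped. ForeignStream is hypothetical,
    # and the (version, handle) tuple shape is an assumption about the
    # protocol, not confirmed by this file:
    #
    #   class ForeignStream:
    #       def __init__(self, handle: int):
    #           self._handle = handle
    #
    #       @property
    #       def __cuda_stream__(self):
    #           return (0, self._handle)  # (protocol version, raw CUstream)
    #
    #   s = Device(0).create_stream(obj=ForeignStream(raw_handle))
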
    @precondition(_check_context_initialized)
    def allocate(self, size, stream=None) -> Buffer:
        if stream is None:
            stream = default_stream()
        return self._mr.allocate(size, stream)

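    # A hedged allocation sketch: allocations go through the device's memory
    # resource and are stream-ordered, so passing an explicit stream orders
    # the allocation against other work on that stream:
    #
    #   >>> s = dev.create_stream()
    #   >>> buf = dev.allocate(1 << 20, stream=s)  # 1 MiB, ordered on s
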
    @precondition(_check_context_initialized)
    def sync(self):
        handle_return(cudart.cudaDeviceSynchronize())
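

# A minimal end-to-end sketch (not part of the original module), assuming a
# CUDA-capable GPU is present; the __main__ guard keeps importing this module
# side-effect free.
if __name__ == "__main__":
    dev = Device()        # the calling thread's current device
    dev.set_current()     # initialize: bind (or create) a context
    print(f"{dev!r}: CC {dev.compute_capability}, PCI {dev.pci_bus_id}")
    s = dev.create_stream()                # a new, internally owned stream
    buf = dev.allocate(1 << 20, stream=s)  # 1 MiB via the memory resource
    dev.sync()            # block until all preceding work completes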