From 6d2b1b3504d8bb14927a4b6cf747481f72fa2aa8 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 13 Mar 2025 20:27:39 -0700
Subject: [PATCH 1/9] Capture PRs since 2025-03-06

---
 cuda_core/docs/source/release/0.2.0-notes.rst | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst
index 02f586d58..36d7fdd3c 100644
--- a/cuda_core/docs/source/release/0.2.0-notes.rst
+++ b/cuda_core/docs/source/release/0.2.0-notes.rst
@@ -27,9 +27,20 @@ New features
 - Expose :class:`ObjectCode` as a public API, which allows loading cubins from memory or disk. For loading other kinds of code types, please continue using :class:`Program`.
 - A C++ helper function ``get_cuda_native_handle()`` is provided in the new ``include/utility.cuh`` header to retrive the underlying CUDA C objects (ex: ``CUstream``) from a Python object returned by the ``.handle`` attribute (ex: :attr:`Stream.handle`).
 - For objects such as :class:`Program` and :class:`Linker` that could dispatch to different backends, a new ``.backend`` attribute is provided to query this information.
-- Support CUDA event timing.
+- Support CUDA event timing. (#481, #498, #508)
 - An :class:`~_event.Event` may now be created without recording it to a :class:`~_stream.Stream` using the :meth:`Device.create_event` method.
 
+Minor fixes and enhancements
+----------------------------
+- Fix a dangling pointer problem in _linker.py (#516)
+- Add ``@functools.lru_cache`` decorator for ``get_binding_version()`` (#512)
+- Change selected ``.decode()`` to ``.decode("utf-8", errors="backslashreplace")`` (#510)
+- Add :class:`Event` to public API (#501)
+
+Test fixes
+----------
+- Clean up device initialization in test (#507)
+
 Limitations
 -----------
 

From 0c6a283b85e9dc3cdbbcf493b3507cb60d4792f8 Mon Sep 17 00:00:00 2001
From: Vladislav Zhurba <vzhurba@nvidia.com>
Date: Thu, 13 Mar 2025 12:31:47 -0700
Subject: [PATCH 2/9] Check for required headers early

Also fix conditional EGL API inclusions. They should always be
included because all of their types are already redefined and available.
---
 .../bindings/_lib/cyruntime/cyruntime.pxd.in  |  6 +-
 .../bindings/_lib/cyruntime/cyruntime.pyx.in  | 11 ++++
 cuda_bindings/setup.py                        | 62 ++++++++++++-------
 3 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_lib/cyruntime/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/_lib/cyruntime/cyruntime.pxd.in
index 743dac01a..c760f0220 100644
--- a/cuda_bindings/cuda/bindings/_lib/cyruntime/cyruntime.pxd.in
+++ b/cuda_bindings/cuda/bindings/_lib/cyruntime/cyruntime.pxd.in
@@ -108,9 +108,9 @@ from libcpp cimport bool
 {{if 'cudaCreateSurfaceObject' in found_functions}}cdef cudaError_t _cudaCreateSurfaceObject(cudaSurfaceObject_t* pSurfObject, const cudaResourceDesc* pResDesc) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
 {{if 'cudaGetTextureObjectResourceDesc' in found_functions}}cdef cudaError_t _cudaGetTextureObjectResourceDesc(cudaResourceDesc* pResDesc, cudaTextureObject_t texObject) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
 {{if 'cudaGraphicsEGLRegisterImage' in found_functions}}cdef cudaError_t _cudaGraphicsEGLRegisterImage(cudaGraphicsResource_t* pCudaResource, EGLImageKHR image, unsigned int flags) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
-{{if 'cudaEGLStreamProducerPresentFrame' in found_functions}}cdef cudaError_t _cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection* conn, cudaEglFrame eglframe, cudaStream_t* pStream) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
-{{if 'cudaEGLStreamProducerReturnFrame' in found_functions}}cdef cudaError_t _cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection* conn, cudaEglFrame* eglframe, cudaStream_t* pStream) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
-{{if 'cudaGraphicsResourceGetMappedEglFrame' in found_functions}}cdef cudaError_t _cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame, cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
+{{if True}}cdef cudaError_t _cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection* conn, cudaEglFrame eglframe, cudaStream_t* pStream) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
+{{if True}}cdef cudaError_t _cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection* conn, cudaEglFrame* eglframe, cudaStream_t* pStream) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
+{{if True}}cdef cudaError_t _cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame, cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
 {{if True}}cdef cudaError_t _cudaVDPAUSetVDPAUDevice(int device, VdpDevice vdpDevice, VdpGetProcAddress* vdpGetProcAddress) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
 {{if 'cudaArrayGetMemoryRequirements' in found_functions}}cdef cudaError_t _cudaArrayGetMemoryRequirements(cudaArrayMemoryRequirements* memoryRequirements, cudaArray_t array, int device) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
 {{if 'cudaMipmappedArrayGetMemoryRequirements' in found_functions}}cdef cudaError_t _cudaMipmappedArrayGetMemoryRequirements(cudaArrayMemoryRequirements* memoryRequirements, cudaMipmappedArray_t mipmap, int device) except ?cudaErrorCallRequiresNewerDriver nogil{{endif}}
diff --git a/cuda_bindings/cuda/bindings/_lib/cyruntime/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_lib/cyruntime/cyruntime.pyx.in
index 3210f173c..b66f0c0c8 100644
--- a/cuda_bindings/cuda/bindings/_lib/cyruntime/cyruntime.pyx.in
+++ b/cuda_bindings/cuda/bindings/_lib/cyruntime/cyruntime.pyx.in
@@ -2206,6 +2206,7 @@ cdef cudaError_t _cudaGetTextureObjectResourceDesc(cudaResourceDesc* pResDesc, c
     return err
 
 {{endif}}
+{{if True}}
 
 cdef cudaError_t _cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection* conn, cudaEglFrame eglframe, cudaStream_t* pStream) except ?cudaErrorCallRequiresNewerDriver nogil:
     cdef cudaError_t err = cudaSuccess
@@ -2222,6 +2223,9 @@ cdef cudaError_t _cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection* con
         _setLastError(err)
     return err
 
+{{endif}}
+{{if True}}
+
 cdef cudaError_t _cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection* conn, cudaEglFrame* eglframe, cudaStream_t* pStream) except ?cudaErrorCallRequiresNewerDriver nogil:
     cdef cudaError_t err = cudaSuccess
     err = m_global.lazyInitContextState()
@@ -2242,6 +2246,9 @@ cdef cudaError_t _cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection* conn
         return err
     return err
 
+{{endif}}
+{{if True}}
+
 cdef cudaError_t _cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame, cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel) except ?cudaErrorCallRequiresNewerDriver nogil:
     cdef cudaError_t err = cudaSuccess
     err = m_global.lazyInitContextState()
@@ -2259,9 +2266,13 @@ cdef cudaError_t _cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame,
         return err
     return err
 
+{{endif}}
+{{if True}}
+
 cdef cudaError_t _cudaVDPAUSetVDPAUDevice(int device, VdpDevice vdpDevice, VdpGetProcAddress* vdpGetProcAddress) except ?cudaErrorCallRequiresNewerDriver nogil:
     return cudaErrorNotSupported
 
+{{endif}}
 {{if 'cudaArrayGetMemoryRequirements' in found_functions}}
 
 cdef cudaError_t _cudaArrayGetMemoryRequirements(cudaArrayMemoryRequirements* memoryRequirements, cudaArray_t array, int device) except ?cudaErrorCallRequiresNewerDriver nogil:
diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index 887f30ac2..b280781af 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -48,8 +48,11 @@
 # ----------------------------------------------------------------------
 # Parse user-provided CUDA headers
 
-header_dict = {
-    "driver": ["cuda.h", "cudaProfiler.h", "cudaEGL.h", "cudaGL.h", "cudaVDPAU.h"],
+required_headers = {
+    "driver": [
+        "cuda.h",
+        "cudaProfiler.h",
+    ],
     "runtime": [
         "driver_types.h",
         "vector_types.h",
@@ -61,13 +64,44 @@
         "device_types.h",
         "driver_functions.h",
         "cuda_profiler_api.h",
-        "cuda_egl_interop.h",
-        "cuda_gl_interop.h",
-        "cuda_vdpau_interop.h",
     ],
-    "nvrtc": ["nvrtc.h"],
+    "nvrtc": [
+        "nvrtc.h",
+    ],
+    # During compilation, Cython will reference C headers that are not
+    # explicitly parsed above. The following headers are known dependencies:
+    #
+    # - crt/host_defines.h
+    # - builtin_types.h
+    # - cuda_device_runtime_api.h
+    #
+    # These dependencies are specified through the headers above.
 }
 
+# Assert that all headers exist
+header_dict = {}
+missing_headers = []
+include_path_list = [os.path.join(path, "include") for path in CUDA_HOME]
+
+for library, header_list in required_headers.items():
+    header_paths = []
+    for header in header_list:
+        path_candidate = [os.path.join(path, header) for path in include_path_list]
+        for path in path_candidate:
+            if os.path.exists(path):
+                header_paths += [path]
+                break
+        if not os.path.exists(path):
+            missing_headers += [header]
+
+    # Update dictionary with validated paths to headers
+    header_dict[library] = header_paths
+
+if missing_headers:
+    error_message = "Couldn't find required headers: "
+    error_message += ", ".join([header for header in missing_headers])
+    raise RuntimeError(f"{error_message}\nIs CUDA_HOME setup correctly? (CUDA_HOME=\"{CUDA_HOME}\")")
+
 replace = {
     " __device_builtin__ ": " ",
     "CUDARTAPI ": " ",
@@ -117,19 +151,8 @@ def __repr__(self):
         return f"{self._name}: {self._member_names} with types {self._member_types}"
 
 
-include_path_list = [os.path.join(path, "include") for path in CUDA_HOME]
 print(f'Parsing headers in "{include_path_list}" (Caching = {PARSER_CACHING})')
-for library, header_list in header_dict.items():
-    header_paths = []
-    for header in header_list:
-        path_candidate = [os.path.join(path, header) for path in include_path_list]
-        for path in path_candidate:
-            if os.path.exists(path):
-                header_paths += [path]
-                break
-        if not os.path.exists(path):
-            print(f"Missing header {header}")
-
+for library, header_paths in header_dict.items():
     print(f"Parsing {library} headers")
     parser = CParser(
         header_paths, cache="./cache_{}".format(library.split(".")[0]) if PARSER_CACHING else None, replace=replace
@@ -161,9 +184,6 @@ def __repr__(self):
         if discovered:
             found_struct += discovered
 
-if len(found_functions) == 0:
-    raise RuntimeError(f'Parser found no functions. Is CUDA_HOME setup correctly? (CUDA_HOME="{CUDA_HOME}")')
-
 # ----------------------------------------------------------------------
 # Generate
 

From fe6dcedf3989f3c72ca6939f8322216bcaac7944 Mon Sep 17 00:00:00 2001
From: Vladislav Zhurba <vzhurba@nvidia.com>
Date: Thu, 13 Mar 2025 12:39:34 -0700
Subject: [PATCH 3/9] Run pre-commit

---
 cuda_bindings/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index b280781af..e8db7899d 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -100,7 +100,7 @@
 if missing_headers:
     error_message = "Couldn't find required headers: "
     error_message += ", ".join([header for header in missing_headers])
-    raise RuntimeError(f"{error_message}\nIs CUDA_HOME setup correctly? (CUDA_HOME=\"{CUDA_HOME}\")")
+    raise RuntimeError(f'{error_message}\nIs CUDA_HOME setup correctly? (CUDA_HOME="{CUDA_HOME}")')
 
 replace = {
     " __device_builtin__ ": " ",

From cd42fbbafcfda3a499798487f7c86b5e44d2b047 Mon Sep 17 00:00:00 2001
From: Vladislav Zhurba <vzhurba@nvidia.com>
Date: Thu, 13 Mar 2025 12:41:56 -0700
Subject: [PATCH 4/9] Wording

---
 cuda_bindings/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index e8db7899d..5945599a2 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -75,7 +75,7 @@
     # - builtin_types.h
     # - cuda_device_runtime_api.h
     #
-    # These dependencies are specified through the headers above.
+    # These are the dependencies of the headers we parse.
 }
 
 # Assert that all headers exist

From 525980f562b6719242aabf271bb29e2a65be5e29 Mon Sep 17 00:00:00 2001
From: Vladislav Zhurba <vzhurba@nvidia.com>
Date: Thu, 13 Mar 2025 12:43:34 -0700
Subject: [PATCH 5/9] Different wording

---
 cuda_bindings/setup.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index 5945599a2..a7ffafc5e 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -69,13 +69,11 @@
         "nvrtc.h",
     ],
     # During compilation, Cython will reference C headers that are not
-    # explicitly parsed above. The following headers are known dependencies:
+    # explicitly parsed above. These are the known dependencies:
     #
     # - crt/host_defines.h
     # - builtin_types.h
     # - cuda_device_runtime_api.h
-    #
-    # These are the dependencies of the headers we parse.
 }
 
 # Assert that all headers exist

From c7c7892a7af37ea63beb5e4329b14119cf294f21 Mon Sep 17 00:00:00 2001
From: Vladislav Zhurba <vzhurba@nvidia.com>
Date: Thu, 13 Mar 2025 15:25:26 -0700
Subject: [PATCH 6/9] Update release notes

---
 cuda_bindings/docs/source/release/12.X.Y-notes.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuda_bindings/docs/source/release/12.X.Y-notes.rst b/cuda_bindings/docs/source/release/12.X.Y-notes.rst
index 708bb77e3..6536d3aea 100644
--- a/cuda_bindings/docs/source/release/12.X.Y-notes.rst
+++ b/cuda_bindings/docs/source/release/12.X.Y-notes.rst
@@ -9,3 +9,4 @@ Highlights
 
 * The ``cuda.bindings.nvvm`` Python module was added, wrapping the
   `libNVVM C API <https://docs.nvidia.com/cuda/libnvvm-api/>`_.
+* Source builds now check for required headers early and raise an error if any are missing.

From 0a99e5a115bcc2075a2d555a9a8a2c8d72bf8804 Mon Sep 17 00:00:00 2001
From: Vladislav Zhurba <vzhurba@nvidia.com>
Date: Thu, 13 Mar 2025 16:12:27 -0700
Subject: [PATCH 7/9] Apply review

---
 cuda_bindings/setup.py | 153 ++++++++++++++++++++++-------------------
 1 file changed, 81 insertions(+), 72 deletions(-)

diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index a7ffafc5e..3409766bf 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -76,49 +76,30 @@
     # - cuda_device_runtime_api.h
 }
 
-# Assert that all headers exist
-header_dict = {}
-missing_headers = []
-include_path_list = [os.path.join(path, "include") for path in CUDA_HOME]
 
-for library, header_list in required_headers.items():
-    header_paths = []
-    for header in header_list:
-        path_candidate = [os.path.join(path, header) for path in include_path_list]
-        for path in path_candidate:
-            if os.path.exists(path):
-                header_paths += [path]
-                break
-        if not os.path.exists(path):
-            missing_headers += [header]
-
-    # Update dictionary with validated paths to headers
-    header_dict[library] = header_paths
-
-if missing_headers:
-    error_message = "Couldn't find required headers: "
-    error_message += ", ".join([header for header in missing_headers])
-    raise RuntimeError(f'{error_message}\nIs CUDA_HOME setup correctly? (CUDA_HOME="{CUDA_HOME}")')
-
-replace = {
-    " __device_builtin__ ": " ",
-    "CUDARTAPI ": " ",
-    "typedef __device_builtin__ enum cudaError cudaError_t;": "typedef cudaError cudaError_t;",
-    "typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;",
-    "typedef enum cudaError cudaError_t;": "typedef cudaError cudaError_t;",
-    "typedef enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;",
-    "typedef enum cudaDataType_t cudaDataType_t;": "",
-    "typedef enum libraryPropertyType_t libraryPropertyType_t;": "",
-    "  enum ": "   ",
-    ", enum ": ", ",
-    "\\(enum ": "(",
-}
+def fetch_header_paths(required_headers, include_path_list):
+    header_dict = {}
+    missing_headers = []
+    for library, header_list in required_headers.items():
+        header_paths = []
+        for header in header_list:
+            path_candidate = [os.path.join(path, header) for path in include_path_list]
+            for path in path_candidate:
+                if os.path.exists(path):
+                    header_paths += [path]
+                    break
+            else:
+                missing_headers += [header]
+
+        # Update dictionary with validated paths to headers
+        header_dict[library] = header_paths
 
-found_types = []
-found_functions = []
-found_values = []
-found_struct = []
-struct_list = {}
+    if missing_headers:
+        error_message = "Couldn't find required headers: "
+        error_message += ", ".join([header for header in missing_headers])
+        raise RuntimeError(f'{error_message}\nIs CUDA_HOME setup correctly? (CUDA_HOME="{CUDA_HOME}")')
+
+    return header_dict
 
 
 class Struct:
@@ -149,38 +130,66 @@ def __repr__(self):
         return f"{self._name}: {self._member_names} with types {self._member_types}"
 
 
-print(f'Parsing headers in "{include_path_list}" (Caching = {PARSER_CACHING})')
-for library, header_paths in header_dict.items():
-    print(f"Parsing {library} headers")
-    parser = CParser(
-        header_paths, cache="./cache_{}".format(library.split(".")[0]) if PARSER_CACHING else None, replace=replace
-    )
+def parse_headers(header_dict):
+    found_types = []
+    found_functions = []
+    found_values = []
+    found_struct = []
+    struct_list = {}
+
+    replace = {
+        " __device_builtin__ ": " ",
+        "CUDARTAPI ": " ",
+        "typedef __device_builtin__ enum cudaError cudaError_t;": "typedef cudaError cudaError_t;",
+        "typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;",
+        "typedef enum cudaError cudaError_t;": "typedef cudaError cudaError_t;",
+        "typedef enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;",
+        "typedef enum cudaDataType_t cudaDataType_t;": "",
+        "typedef enum libraryPropertyType_t libraryPropertyType_t;": "",
+        "  enum ": "   ",
+        ", enum ": ", ",
+        "\\(enum ": "(",
+    }
+
+    print(f'Parsing headers in "{include_path_list}" (Caching = {PARSER_CACHING})')
+    for library, header_paths in header_dict.items():
+        print(f"Parsing {library} headers")
+        parser = CParser(
+            header_paths, cache="./cache_{}".format(library.split(".")[0]) if PARSER_CACHING else None, replace=replace
+        )
+
+        if library == "driver":
+            CUDA_VERSION = parser.defs["macros"].get("CUDA_VERSION", "Unknown")
+            print(f"Found CUDA_VERSION: {CUDA_VERSION}")
 
-    if library == "driver":
-        CUDA_VERSION = parser.defs["macros"].get("CUDA_VERSION", "Unknown")
-        print(f"Found CUDA_VERSION: {CUDA_VERSION}")
-
-    # Combine types with others since they sometimes get tangled
-    found_types += {key for key in parser.defs["types"]}
-    found_types += {key for key in parser.defs["structs"]}
-    found_types += {key for key in parser.defs["unions"]}
-    found_types += {key for key in parser.defs["enums"]}
-    found_functions += {key for key in parser.defs["functions"]}
-    found_values += {key for key in parser.defs["values"]}
-
-    for key, value in parser.defs["structs"].items():
-        struct_list[key] = Struct(key, value["members"])
-    for key, value in parser.defs["unions"].items():
-        struct_list[key] = Struct(key, value["members"])
-
-    for key, value in struct_list.items():
-        if key.startswith("anon_union") or key.startswith("anon_struct"):
-            continue
-
-        found_struct += [key]
-        discovered = value.discoverMembers(struct_list, key)
-        if discovered:
-            found_struct += discovered
+        # Combine types with others since they sometimes get tangled
+        found_types += {key for key in parser.defs["types"]}
+        found_types += {key for key in parser.defs["structs"]}
+        found_types += {key for key in parser.defs["unions"]}
+        found_types += {key for key in parser.defs["enums"]}
+        found_functions += {key for key in parser.defs["functions"]}
+        found_values += {key for key in parser.defs["values"]}
+
+        for key, value in parser.defs["structs"].items():
+            struct_list[key] = Struct(key, value["members"])
+        for key, value in parser.defs["unions"].items():
+            struct_list[key] = Struct(key, value["members"])
+
+        for key, value in struct_list.items():
+            if key.startswith("anon_union") or key.startswith("anon_struct"):
+                continue
+
+            found_struct += [key]
+            discovered = value.discoverMembers(struct_list, key)
+            if discovered:
+                found_struct += discovered
+
+    return found_types, found_functions, found_values, found_struct, struct_list
+
+
+include_path_list = [os.path.join(path, "include") for path in CUDA_HOME]
+header_dict = fetch_header_paths(required_headers, include_path_list)
+found_types, found_functions, found_values, found_struct, struct_list = parse_headers(header_dict)
 
 # ----------------------------------------------------------------------
 # Generate

From dc5a4228d7002f2947d379d79b38c0af18aaadff Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 14 Mar 2025 16:03:16 -0700
Subject: [PATCH 8/9] Remove mention of :class:`Event`

---
 cuda_core/docs/source/release/0.2.0-notes.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst
index 36d7fdd3c..35b1f1cc9 100644
--- a/cuda_core/docs/source/release/0.2.0-notes.rst
+++ b/cuda_core/docs/source/release/0.2.0-notes.rst
@@ -35,7 +35,6 @@ Minor fixes and enhancements
 - Fix a dangling pointer problem in _linker.py (#516)
 - Add ``@functools.lru_cache`` decorator for ``get_binding_version()`` (#512)
 - Change selected ``.decode()`` to ``.decode("utf-8", errors="backslashreplace")`` (#510)
-- Add :class:`Event` to public API (#501)
 
 Test fixes
 ----------

From 5f31dd84a04f90ec437e525c341283594c90ea2b Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 14 Mar 2025 20:08:05 -0700
Subject: [PATCH 9/9] Add release notes for PRs identified by at-leofang:

https://github.com/NVIDIA/cuda-python/pull/519#discussion_r1996271665
---
 cuda_core/docs/source/release/0.2.0-notes.rst | 24 ++++++++++++++-----
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst
index 35b1f1cc9..87353f476 100644
--- a/cuda_core/docs/source/release/0.2.0-notes.rst
+++ b/cuda_core/docs/source/release/0.2.0-notes.rst
@@ -27,18 +27,30 @@ New features
 - Expose :class:`ObjectCode` as a public API, which allows loading cubins from memory or disk. For loading other kinds of code types, please continue using :class:`Program`.
 - A C++ helper function ``get_cuda_native_handle()`` is provided in the new ``include/utility.cuh`` header to retrive the underlying CUDA C objects (ex: ``CUstream``) from a Python object returned by the ``.handle`` attribute (ex: :attr:`Stream.handle`).
 - For objects such as :class:`Program` and :class:`Linker` that could dispatch to different backends, a new ``.backend`` attribute is provided to query this information.
-- Support CUDA event timing. (#481, #498, #508)
-- An :class:`~_event.Event` may now be created without recording it to a :class:`~_stream.Stream` using the :meth:`Device.create_event` method.
+- Support CUDA :class:`Event` timing. (#481, #498, #508)
+- An :class:`Event` may now be created without recording it to a :class:`~_stream.Stream` using the :meth:`Device.create_event` method.
+- :class:`Program` now supports the additional ``PTX`` code type. (#317)
+- :meth:`Linker.link` exceptions now include the original error log. (#423)
+- In a systematic sweep through the cuda.core implementations, many exception messages were made more consistent and informative. (#458)
+
+New examples
+------------
+- ``jit_lto_fractal.py`` — Demonstrates just-in-time link-time optimization for fractal generation. (:class:`Device`, :class:`LaunchConfig`, :class:`Linker`, :class:`LinkerOptions`, :class:`Program`, :class:`ProgramOptions`) (#475)
+- ``simple_multi_gpu_example.py`` — Example of using multiple GPUs. (:class:`Device`, :class:`Program`, :class:`LaunchConfig`) (#304)
+- ``show_device_properties.py`` — Displays detailed device properties. (:class:`Device`) (#474)
 
 Minor fixes and enhancements
 ----------------------------
-- Fix a dangling pointer problem in _linker.py (#516)
-- Add ``@functools.lru_cache`` decorator for ``get_binding_version()`` (#512)
-- Change selected ``.decode()`` to ``.decode("utf-8", errors="backslashreplace")`` (#510)
+- A dangling pointer problem in ``_linker.py`` was fixed. (#516)
+- Add ``@functools.lru_cache`` decorator for :func:`get_binding_version`. (#512)
+- Selected ``.decode()`` calls were changed to ``.decode("utf-8", errors="backslashreplace")`` to ensure that decoding error messages does not abort the process. (#510)
+- The performance of :meth:`Device.compute_capability` was improved. (#459)
+- The :class:`Program` constructor now issues a warning when falling back to :func:`cuLink`. (#315)
+- To avoid deprecation warnings, the cuda.bindings imports in the cuda.core implementations were cleaned up. (#404)
 
 Test fixes
 ----------
-- Clean up device initialization in test (#507)
+- Clean up device initialization in some tests. (#507)
 
 Limitations
 -----------