vstinner
diff --git a/‎Doc/c-api/unicode.rst
Lines changed: 65 additions & 0 deletions b/‎Doc/c-api/unicode.rst
Lines changed: 65 additions & 0 deletions
diff --git a/‎Doc/data/stable_abi.dat
Lines changed: 3 additions & 0 deletions b/‎Doc/data/stable_abi.dat
Lines changed: 3 additions & 0 deletions
diff --git a/‎Doc/whatsnew/3.14.rst
Lines changed: 4 additions & 0 deletions b/‎Doc/whatsnew/3.14.rst
Lines changed: 4 additions & 0 deletions
diff --git a/‎Include/unicodeobject.h
Lines changed: 18 additions & 0 deletions b/‎Include/unicodeobject.h
Lines changed: 18 additions & 0 deletions
diff --git a/‎Lib/test/test_capi/test_unicode.py
Lines changed: 177 additions & 3 deletions b/‎Lib/test/test_capi/test_unicode.py
Lines changed: 177 additions & 3 deletions
diff --git a/‎Lib/test/test_stable_abi_ctypes.py
Lines changed: 3 additions & 0 deletions b/‎Lib/test/test_stable_abi_ctypes.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst
Lines changed: 2 additions & 0 deletions b/‎Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst
Lines changed: 2 additions & 0 deletions
diff --git a/‎Misc/stable_abi.toml
Lines changed: 16 additions & 0 deletions b/‎Misc/stable_abi.toml
Lines changed: 16 additions & 0 deletions
@@ -341,6 +341,71 @@ APIs:
    .. versionadded:: 3.3
 
 
+.. c:function:: int PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view)
+
+   Export the contents of the *unicode* string in one of the formats
+   requested by *requested_formats*.
+
+   * On success, fill *view*, and return ``0``.
+   * On error, set an exception and return ``-1``.
+
+   The export must be released by :c:func:`PyBuffer_Release`.
+   The contents of the buffer are valid until they are released.
+
+   The buffer is read-only and must not be modified.
+
+   *unicode* and *view* must not be NULL.
+
+   Available formats:
+
+   .. c:namespace:: NULL
+
+   ===================================  ========  ===========================
+   Constant Identifier                    Value  Description
+   ===================================  ========  ===========================
+   .. c:macro:: PyUnicode_FORMAT_ASCII  ``0x01``  ASCII string (``Py_UCS1*``)
+   .. c:macro:: PyUnicode_FORMAT_UCS1   ``0x02``  UCS-1 string (``Py_UCS1*``)
+   .. c:macro:: PyUnicode_FORMAT_UCS2   ``0x04``  UCS-2 string (``Py_UCS2*``)
+   .. c:macro:: PyUnicode_FORMAT_UCS4   ``0x08``  UCS-4 string (``Py_UCS4*``)
+   .. c:macro:: PyUnicode_FORMAT_UTF8   ``0x10``  UTF-8 string (``char*``)
+   ===================================  ========  ===========================
+
+   *requested_formats* can be a single format or a bitwise combination of the
+   formats in the table above. On success, *view* uses exactly one of them;
+   call :c:func:`PyUnicode_GetBufferFormat` to find out which one.
+
+   Note that future versions of Python may introduce additional formats.
+
+   .. versionadded:: 3.14
+
+
+.. c:function:: int PyUnicode_GetBufferFormat(const Py_buffer *view, uint32_t *format)
+
+   Get the format of the buffer *view*.
+
+   * On success, set *\*format* to the corresponding ``PyUnicode_FORMAT_*``
+     value and return ``0``.
+   * On error, set an exception and return ``-1``.
+
+   *view* must be a buffer filled by :c:func:`PyUnicode_Export`.
+
+   .. versionadded:: 3.14
+
+
+.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format)
+
+   Create a string object from a buffer in an “export format”.
+
+   * Return a reference to a new string object on success.
+   * Set an exception and return ``NULL`` on error.
+
+   *data* must not be NULL. *nbytes* must be positive or zero.
+
+   See :c:func:`PyUnicode_Export` for the available formats.
+
+   .. versionadded:: 3.14
+
+
 .. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \
                                                     Py_ssize_t size)
 
 
@@ -529,6 +529,10 @@ New Features
 
   (Contributed by Victor Stinner in :gh:`107954`.)
 
+* Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
+  export and import strings.
+  (Contributed by Victor Stinner in :gh:`119609`.)
+
 
 Porting to Python 3.14
 ----------------------
 
@@ -248,6 +248,24 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
     const char *u              /* UTF-8 encoded string */
     );
 
+#define PyUnicode_FORMAT_ASCII 0x01  // Py_UCS1* (ASCII string)
+#define PyUnicode_FORMAT_UCS1 0x02   // Py_UCS1*
+#define PyUnicode_FORMAT_UCS2 0x04   // Py_UCS2*
+#define PyUnicode_FORMAT_UCS4 0x08   // Py_UCS4*
+#define PyUnicode_FORMAT_UTF8 0x10   // char*
+
+PyAPI_FUNC(int) PyUnicode_Export(
+    PyObject *unicode,
+    uint32_t requested_formats,
+    Py_buffer *view);
+PyAPI_FUNC(int) PyUnicode_GetBufferFormat(
+    const Py_buffer *view,
+    uint32_t *format);
+PyAPI_FUNC(PyObject*) PyUnicode_Import(
+    const void *data,
+    Py_ssize_t nbytes,
+    uint32_t format);
+
 /* --- wchar_t support for platforms which support it --------------------- */
 
 #ifdef HAVE_WCHAR_H
 
@@ -1,5 +1,6 @@
-import unittest
+import struct
 import sys
+import unittest
 from test import support
 from test.support import import_helper
 
@@ -28,6 +29,14 @@ class Str(str):
     pass
 
 
+PyUnicode_FORMAT_ASCII = 0x01
+PyUnicode_FORMAT_UCS1 = 0x02
+PyUnicode_FORMAT_UCS2 = 0x04
+PyUnicode_FORMAT_UCS4 = 0x08
+PyUnicode_FORMAT_UTF8 = 0x10
+# Invalid native format
+PyUnicode_FORMAT_INVALID = 0x20
+
 class CAPITest(unittest.TestCase):
 
     @support.cpython_only
@@ -1721,6 +1730,139 @@ def test_pep393_utf8_caching_bug(self):
                 # Check that the second call returns the same result
                 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
 
+    def test_unicode_export(self):
+        # Test PyUnicode_Export()
+        # NOTE(review): the helper is compared below against 4-tuples that
+        # look like (exported bytes, PyUnicode_FORMAT_* flag, item size,
+        # struct-style format code) -- confirm against the C helper.
+        unicode_export = _testlimitedcapi.unicode_export
+        # UCS2/UCS4 expectations are built with the UTF-16/UTF-32 codec
+        # matching the byte order of the machine running the test.
+        if sys.byteorder == 'little':
+            ucs2_enc = 'utf-16le'
+            ucs4_enc = 'utf-32le'
+        else:
+            ucs2_enc = 'utf-16be'
+            ucs4_enc = 'utf-32be'
+
+        # export to the native format: request every fixed-width format at
+        # once; each string below is expected back in the narrowest format
+        # able to hold it (ASCII, UCS1, UCS2, then UCS4).
+        formats = (PyUnicode_FORMAT_ASCII
+                   | PyUnicode_FORMAT_UCS1
+                   | PyUnicode_FORMAT_UCS2
+                   | PyUnicode_FORMAT_UCS4)
+        # Expected last tuple item per item width. For UCS4, use whichever
+        # of 'I' / 'L' is 4 bytes wide on this platform.
+        BUFFER_UCS1 = 'B'
+        BUFFER_UCS2 = 'H'
+        if struct.calcsize('I') == 4:
+            BUFFER_UCS4 = 'I'
+        elif struct.calcsize('L') == 4:
+            BUFFER_UCS4 = 'L'
+        else:
+            self.fail("unable to get BUFFER_UCS4 ")
+
+        self.assertEqual(unicode_export("abc", formats),
+                         (b'abc', PyUnicode_FORMAT_ASCII, 1, BUFFER_UCS1))
+        self.assertEqual(unicode_export("latin1:\xe9", formats),
+                         (b'latin1:\xe9', PyUnicode_FORMAT_UCS1, 1, BUFFER_UCS1))
+        self.assertEqual(unicode_export('ucs2:\u20ac', formats),
+                         ('ucs2:\u20ac'.encode(ucs2_enc),
+                          PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2))
+        self.assertEqual(unicode_export('ucs4:\U0010ffff', formats),
+                         ('ucs4:\U0010ffff'.encode(ucs4_enc),
+                          PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
+
+        # export ASCII as UCS1
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS1),
+                         (b'abc', PyUnicode_FORMAT_UCS1, 1, BUFFER_UCS1))
+
+        # export ASCII and UCS1 to UCS2
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS2),
+                         ('abc'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2))
+        self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS2),
+                         ('latin1:\xe9'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2))
+
+        # always export to UCS4: every string is expected to be exportable
+        # when only UCS4 is requested
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS4),
+                         ('abc'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
+        self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS4),
+                         ('latin1:\xe9'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
+        self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UCS4),
+                         ('ucs2:\u20ac'.encode(ucs4_enc),
+                          PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
+        self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4),
+                         ('ucs4:\U0010ffff'.encode(ucs4_enc),
+                          PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
+
+        # always export to UTF8: expected item size is 1 and format code 'B'
+        self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UTF8),
+                         ('abc'.encode('utf8'), PyUnicode_FORMAT_UTF8, 1, 'B'))
+        self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UTF8),
+                         ('latin1:\xe9'.encode('utf8'), PyUnicode_FORMAT_UTF8, 1, 'B'))
+        self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UTF8),
+                         ('ucs2:\u20ac'.encode('utf8'),
+                          PyUnicode_FORMAT_UTF8, 1, 'B'))
+        self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UTF8),
+                         ('ucs4:\U0010ffff'.encode('utf8'),
+                          PyUnicode_FORMAT_UTF8, 1, 'B'))
+
+        # No supported format or invalid format: both an empty mask and an
+        # unknown flag must raise ValueError with the same message
+        with self.assertRaisesRegex(ValueError,
+                                    "unable to find a matching export format"):
+            unicode_export('abc', 0)
+        with self.assertRaisesRegex(ValueError,
+                                    "unable to find a matching export format"):
+            unicode_export('abc', PyUnicode_FORMAT_INVALID)
+
+    def test_unicode_import(self):
+        # Test PyUnicode_Import()
+        unicode_import = _testlimitedcapi.unicode_import
+        # UCS2/UCS4 input buffers are built with the UTF-16/UTF-32 codec
+        # matching the byte order of the machine running the test.
+        if sys.byteorder == 'little':
+            ucs2_enc = 'utf-16le'
+            ucs4_enc = 'utf-32le'
+        else:
+            ucs2_enc = 'utf-16be'
+            ucs4_enc = 'utf-32be'
+
+        # One import per format; each must give back the original string
+        self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII),
+                         "abc")
+        self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1),
+                         "latin1:\xe9")
+
+        self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc),
+                                          PyUnicode_FORMAT_UCS2),
+                         'ucs2:\u20ac')
+
+        self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc),
+                                          PyUnicode_FORMAT_UCS4),
+                         'ucs4:\U0010ffff')
+
+        # UTF-8 input mixing 1-byte, 2-byte and 4-byte sequences
+        text = "abc\xe9\U0010ffff"
+        self.assertEqual(unicode_import(text.encode('utf8'),
+                                          PyUnicode_FORMAT_UTF8),
+                         text)
+
+        # Empty string
+        for native_format in (
+            PyUnicode_FORMAT_ASCII,
+            PyUnicode_FORMAT_UCS1,
+            PyUnicode_FORMAT_UCS2,
+            PyUnicode_FORMAT_UCS4,
+            PyUnicode_FORMAT_UTF8,
+        ):
+            with self.subTest(native_format=native_format):
+                self.assertEqual(unicode_import(b'', native_format),
+                                 '')
+
+        # Invalid format
+        with self.assertRaises(ValueError):
+            unicode_import(b'', PyUnicode_FORMAT_INVALID)
+
+        # Invalid size: truncated buffers whose length is not a multiple of
+        # the item size (2 bytes for UCS2, 4 bytes for UCS4)
+        ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2)
+        ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4)
+
 
 class PyUnicodeWriterTest(unittest.TestCase):
     def create_writer(self, size):
@@ -1903,6 +2045,38 @@ def test_recover_error(self):
 
         self.assertEqual(writer.finish(), 'Hello World.')
 
-
-if __name__ == "__main__":
+    def test_unicode_export_import_roundtrip(self):
+        # Test that a string exported with PyUnicode_Export() and fed back
+        # to PyUnicode_Import() compares equal to the original string.
+        unicode_export = _testlimitedcapi.unicode_export
+        unicode_import = _testlimitedcapi.unicode_import
+
+        # Short aliases to keep the table below readable
+        ASCII = PyUnicode_FORMAT_ASCII
+        UCS1 = PyUnicode_FORMAT_UCS1
+        UCS2 = PyUnicode_FORMAT_UCS2
+        UCS4 = PyUnicode_FORMAT_UCS4
+        UTF8 = PyUnicode_FORMAT_UTF8
+        ALL = (ASCII | UCS1 | UCS2 | UCS4 | UTF8)
+
+        # Each entry pairs a string with the set of single formats expected
+        # to succeed for it; any other single format must raise ValueError.
+        for string, allowed_formats in (
+            ('', {ASCII, UCS1, UCS2, UCS4, UTF8}),
+            ('ascii', {ASCII, UCS1, UCS2, UCS4, UTF8}),
+            ('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}),
+            ('ucs2:\u20ac', {UCS2, UCS4, UTF8}),
+            ('ucs4:\U0001f638', {UCS4, UTF8}),
+        ):
+            # Request one format at a time
+            for format in ASCII, UCS1, UCS2, UCS4, UTF8:
+                with self.subTest(string=string, format=format):
+                    if format not in allowed_formats:
+                        with self.assertRaises(ValueError):
+                            unicode_export(string, format)
+                    else:
+                        # Import using the format reported by the export
+                        buf, buf_fmt, item_size, view_fmt = unicode_export(string, format)
+                        restored = unicode_import(buf, buf_fmt)
+                        self.assertEqual(restored, string)
+
+            # Request every format at once; whichever format the export
+            # reports, the round trip must still give back the string.
+            buf, buf_fmt, item_size, view_fmt = unicode_export(string, ALL)
+            restored = unicode_import(buf, buf_fmt)
+            self.assertEqual(restored, string)
+
+
+if __name__ == '__main__':
     unittest.main()
@@ -0,0 +1,2 @@
+Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
+export and import strings. Patch by Victor Stinner.
@@ -2526,3 +2526,19 @@
     added = '3.14'
 [function.PyLong_AsUInt64]
     added = '3.14'
+[const.PyUnicode_FORMAT_ASCII]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS1]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS2]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS4]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UTF8]
+    added = '3.14'
+[function.PyUnicode_Export]
+    added = '3.14'
+[function.PyUnicode_GetBufferFormat]
+    added = '3.14'
+[function.PyUnicode_Import]
+    added = '3.14'
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	+Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
	`2`	`+export and import strings. Patch by Victor Stinner.`