python · vstinner · May 27, 2024 · Sep 5, 2024 · Sep 5, 2024 · Sep 5, 2024
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
@@ -341,6 +341,82 @@ APIs:
    .. versionadded:: 3.3
 
 
+.. c:function:: int32_t PyUnicode_Export(PyObject *unicode, int32_t requested_formats, uint32_t flags, Py_buffer *view)
+
+   Export the contents of the *unicode* string in one of the *requested_formats*.
+
+   * On success, fill *view*, and return a format (greater than ``0``).
+   * On error, set an exception, and return ``-1``.
+     *view* is left unchanged.
+
+   After a successful call to :c:func:`PyUnicode_Export`,
+   the *view* buffer must be released by :c:func:`PyBuffer_Release`.
+   The contents of the buffer remain valid until the buffer is released.
+
+   The buffer is read-only and must not be modified.
+
+   *unicode* and *view* must not be NULL.
+
+   Available formats:
+
+   .. c:namespace:: NULL
+
+   ===================================  ========  ===========================
+   Constant Identifier                  Value     Description
+   ===================================  ========  ===========================
+   .. c:macro:: PyUnicode_FORMAT_UCS1   ``0x01``  UCS-1 string (``Py_UCS1*``)
+   .. c:macro:: PyUnicode_FORMAT_UCS2   ``0x02``  UCS-2 string (``Py_UCS2*``)
+   .. c:macro:: PyUnicode_FORMAT_UCS4   ``0x04``  UCS-4 string (``Py_UCS4*``)
+   .. c:macro:: PyUnicode_FORMAT_UTF8   ``0x08``  UTF-8 string (``char*``)
+   .. c:macro:: PyUnicode_FORMAT_ASCII  ``0x10``  ASCII string (``Py_UCS1*``)
+   ===================================  ========  ===========================
+
+   UCS-2 and UCS-4 use the native byte order.
+
+   *requested_formats* can be a single format or a bitwise combination of the
+   formats in the table above.
+   On success, the returned format is exactly one of the requested
+   formats.
+
+   Note that future versions of Python may introduce additional formats.
+
+   By default, if the :c:macro:`PyUnicode_EXPORT_COPY` flag is not set in
+   *flags*, no memory is copied and no conversion is done.
+
+   If the :c:macro:`PyUnicode_EXPORT_COPY` flag is set in *flags*, the function
+   can copy memory to provide the requested format and convert from one format
+   to another.
+
+   The :c:macro:`PyUnicode_EXPORT_COPY` flag is required to export a string
+   containing surrogate characters to :c:macro:`PyUnicode_FORMAT_UTF8`.
+
+   Available flags:
+
+   .. c:namespace:: NULL
+
+   ==================================  ========  ===================================
+   Flag                                Value     Description
+   ==================================  ========  ===================================
+   .. c:macro:: PyUnicode_EXPORT_COPY  ``0x01``  Allow memory copies and conversions
+   ==================================  ========  ===================================
+
+   .. versionadded:: 3.14
+
+
+.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, int32_t format)
+
+   Create a Unicode string object from a buffer in a supported format.
+
+   * Return a reference to a new string object on success.
+   * Set an exception and return ``NULL`` on error.
+
+   *data* must not be NULL. *nbytes* must be positive or zero.
+
+   See :c:func:`PyUnicode_Export` for the available formats.
+
+   .. versionadded:: 3.14
+
+
 .. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \
                                                     Py_ssize_t size)
 

diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat
diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst
@@ -554,6 +554,10 @@ New Features
 
   (Contributed by Victor Stinner in :gh:`107954`.)
 
+* Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions
+  to export and import strings.
+  (Contributed by Victor Stinner in :gh:`119609`.)
+
 
 Porting to Python 3.14
 ----------------------

diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
@@ -248,6 +248,27 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
     const char *u              /* UTF-8 encoded string */
     );
 
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030e0000
+#define PyUnicode_FORMAT_UCS1  0x01   // Py_UCS1*
+#define PyUnicode_FORMAT_UCS2  0x02   // Py_UCS2* (native byte order)
+#define PyUnicode_FORMAT_UCS4  0x04   // Py_UCS4* (native byte order)
+#define PyUnicode_FORMAT_UTF8  0x08   // char*
+#define PyUnicode_FORMAT_ASCII 0x10   // Py_UCS1* (ASCII string)
+
+#define PyUnicode_EXPORT_COPY 0x01     // allow memory copies and conversions
+
+
+PyAPI_FUNC(int32_t) PyUnicode_Export(
+    PyObject *unicode,
+    int32_t requested_formats,  /* one or more PyUnicode_FORMAT_* values */
+    uint32_t flags,             /* 0 or PyUnicode_EXPORT_COPY */
+    Py_buffer *view);           /* release with PyBuffer_Release() */
+PyAPI_FUNC(PyObject*) PyUnicode_Import(
+    const void *data,
+    Py_ssize_t nbytes,          /* size of data in bytes, must be >= 0 */
+    int32_t format);            /* one of the PyUnicode_FORMAT_* values */
+#endif
+
 /* --- wchar_t support for platforms which support it --------------------- */
 
 #ifdef HAVE_WCHAR_H

diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
@@ -1,5 +1,6 @@
-import unittest
+import struct
 import sys
+import unittest
 from test import support
 from test.support import import_helper
 
@@ -28,6 +29,17 @@ class Str(str):
     pass
 
 
+PyUnicode_FORMAT_UCS1 = 0x01  # values mirror Include/unicodeobject.h
+PyUnicode_FORMAT_UCS2 = 0x02
+PyUnicode_FORMAT_UCS4 = 0x04
+PyUnicode_FORMAT_UTF8 = 0x08
+PyUnicode_FORMAT_ASCII = 0x10
+# Bit that matches none of the formats above: used to test error handling
+PyUnicode_FORMAT_INVALID = 0x20
+
+PyUnicode_EXPORT_COPY = 0x01
+
+
 class CAPITest(unittest.TestCase):
 
     @support.cpython_only
@@ -1721,6 +1733,183 @@ def test_pep393_utf8_caching_bug(self):
                 # Check that the second call returns the same result
                 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
 
+    def test_unicode_export(self):
+        # Test PyUnicode_Export() and PyUnicode_FreeExport()
+        unicode_export = _testlimitedcapi.unicode_export
+        if sys.byteorder == 'little':
+            ucs2_enc = 'utf-16le'
+            ucs4_enc = 'utf-32le'
+        else:
+            ucs2_enc = 'utf-16be'
+            ucs4_enc = 'utf-32be'
+
+        # export to the native format
+        formats = (PyUnicode_FORMAT_ASCII
+                   | PyUnicode_FORMAT_UCS1
+                   | PyUnicode_FORMAT_UCS2
+                   | PyUnicode_FORMAT_UCS4)
+        BUFFER_UCS1 = 'B'
+        BUFFER_UCS2 = '=H'
+        BUFFER_UCS4 = '=I'
+
+        def check_ucs1(text, formats, flags=0):
+            if formats == PyUnicode_FORMAT_UCS1:
+                export_format = PyUnicode_FORMAT_UCS1
+            elif text.isascii():
+                export_format = PyUnicode_FORMAT_ASCII
+            else:
+                export_format = PyUnicode_FORMAT_UCS1
+            self.assertEqual(unicode_export(text, formats, flags),
+                             (text.encode('latin1'), export_format, 1, BUFFER_UCS1))
+
+        def check_ucs2(text, formats, flags=0):
+            self.assertEqual(unicode_export(text, formats, flags),
+                             (text.encode(ucs2_enc, 'surrogatepass'),
+                              PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2))
+
+        def check_ucs4(text, formats, flags=0):
+            self.assertEqual(unicode_export(text, formats, flags),
+                             (text.encode(ucs4_enc, 'surrogatepass'),
+                              PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
+
+        def check_utf8(text, flags=0):
+            self.assertEqual(unicode_export(text, PyUnicode_FORMAT_UTF8, flags),
+                             (text.encode('utf8', 'surrogatepass'),
+                              PyUnicode_FORMAT_UTF8, 1, 'B'))
+
+        def check_no_matching_format(text, formats, flags=0):
+            err_msg = "unable to find a matching export format"
+            with self.assertRaisesRegex(ValueError, err_msg):
+                unicode_export('abc', formats, flags)
+
+        # export as native format
+        check_ucs1("abc", formats)
+        check_ucs1("latin1:\xe9", formats)
+        check_ucs2('ucs2:\u20ac', formats)
+        check_ucs4('ucs4:\U0010ffff', formats)
+
+        # convert ASCII to UCS1
+        check_ucs1("abc", PyUnicode_FORMAT_UCS1)
+
+        # convert to UCS2 (need PyUnicode_EXPORT_COPY)
+        check_no_matching_format("abc", PyUnicode_FORMAT_UCS2)
+        check_no_matching_format("latin1:\xe9", PyUnicode_FORMAT_UCS2)
+        check_ucs2("abc", PyUnicode_FORMAT_UCS2, PyUnicode_EXPORT_COPY)
+        check_ucs2("latin1:\xe9", PyUnicode_FORMAT_UCS2, PyUnicode_EXPORT_COPY)
+
+        # convert to UCS4 (need PyUnicode_EXPORT_COPY)
+        check_no_matching_format("abc", PyUnicode_FORMAT_UCS4)
+        check_no_matching_format("latin1:\xe9", PyUnicode_FORMAT_UCS4)
+        check_no_matching_format('ucs2:\u20ac', PyUnicode_FORMAT_UCS4)
+        check_ucs4("abc", PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY)
+        check_ucs4("latin1:\xe9", PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY)
+        check_ucs4('ucs2:\u20ac', PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY)
+
+        # always encode to UTF8
+        check_utf8("abc")
+        check_utf8("latin1:\xe9")
+        check_utf8('ucs2:\u20ac')
+        check_utf8('ucs4:\U0010ffff')
+
+        # surrogates
+        check_ucs2('\udc80', PyUnicode_FORMAT_UCS2)
+        check_ucs4('\udc80', PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY)
+        check_utf8('\udc80', PyUnicode_EXPORT_COPY)
+
+        # No supported format or invalid format
+        for formats in (0, PyUnicode_FORMAT_INVALID):
+            with self.subTest(formats=formats):
+                check_no_matching_format('abc', formats)
+
+    def test_unicode_import(self):
+        # Test PyUnicode_Import()
+        unicode_import = _testlimitedcapi.unicode_import
+        if sys.byteorder == 'little':
+            ucs2_enc = 'utf-16le'
+            ucs4_enc = 'utf-32le'
+        else:
+            ucs2_enc = 'utf-16be'
+            ucs4_enc = 'utf-32be'
+
+        self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII),
+                         "abc")
+        self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1),
+                         "latin1:\xe9")
+
+        self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc),
+                                        PyUnicode_FORMAT_UCS2),
+                         'ucs2:\u20ac')
+
+        self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc),
+                                        PyUnicode_FORMAT_UCS4),
+                         'ucs4:\U0010ffff')
+
+        text = "abc\xe9\U0010ffff"
+        self.assertEqual(unicode_import(text.encode('utf8'),
+                                        PyUnicode_FORMAT_UTF8),
+                         text)
+
+        # Empty string
+        for native_format in (
+            PyUnicode_FORMAT_ASCII,
+            PyUnicode_FORMAT_UCS1,
+            PyUnicode_FORMAT_UCS2,
+            PyUnicode_FORMAT_UCS4,
+            PyUnicode_FORMAT_UTF8,
+        ):
+            with self.subTest(native_format=native_format):
+                self.assertEqual(unicode_import(b'', native_format),
+                                 '')
+
+        # Invalid format
+        with self.assertRaises(ValueError):
+            unicode_import(b'', PyUnicode_FORMAT_INVALID)
+
+        # Invalid size
+        ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2)
+        ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4)
+        with self.assertRaises(ValueError):
+            unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4)
+
+    def test_unicode_export_import_roundtrip(self):
+        unicode_export = _testlimitedcapi.unicode_export
+        unicode_import = _testlimitedcapi.unicode_import
+
+        ASCII = PyUnicode_FORMAT_ASCII
+        UCS1 = PyUnicode_FORMAT_UCS1
+        UCS2 = PyUnicode_FORMAT_UCS2
+        UCS4 = PyUnicode_FORMAT_UCS4
+        UTF8 = PyUnicode_FORMAT_UTF8
+        ALL = (ASCII | UCS1 | UCS2 | UCS4 | UTF8)
+
+        def roundtrip(string, formats):
+            export = unicode_export(string, formats, PyUnicode_EXPORT_COPY)
+            buf, buf_fmt, item_size, view_fmt = export
+            self.assertEqual(unicode_import(buf, buf_fmt), string)
+
+        for string, allowed_formats in (
+            ('', {ASCII, UCS1, UCS2, UCS4, UTF8}),
+            ('ascii', {ASCII, UCS1, UCS2, UCS4, UTF8}),
+            ('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}),
+            ('ucs2:\u20ac', {UCS2, UCS4, UTF8}),
+            ('ucs4:\U0001f638', {UCS4, UTF8}),
+        ):
+            for formats in ASCII, UCS1, UCS2, UCS4, UTF8:
+                with self.subTest(string=string, formats=formats):
+                    if formats not in allowed_formats:
+                        with self.assertRaises(ValueError):
+                            unicode_export(string, formats, PyUnicode_EXPORT_COPY)
+                    else:
+                        roundtrip(string, formats)
+
+            roundtrip(string, ALL)
+
 
 class PyUnicodeWriterTest(unittest.TestCase):
     def create_writer(self, size):
@@ -1904,5 +2093,5 @@ def test_recover_error(self):
         self.assertEqual(writer.finish(), 'Hello World.')
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     unittest.main()
diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py
diff --git a/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst b/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst
@@ -0,0 +1,2 @@
+Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
+export and import strings. Patch by Victor Stinner.
@@ -2527,4 +2527,18 @@
 [function.PyLong_AsUInt64]
     added = '3.14'
 [const.Py_tp_vectorcall]
-    added = '3.14'
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS1]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS2]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UCS4]
+    added = '3.14'
+[const.PyUnicode_FORMAT_UTF8]
+    added = '3.14'
+[const.PyUnicode_FORMAT_ASCII]
+    added = '3.14'
+[function.PyUnicode_Export]
+    added = '3.14'
+[function.PyUnicode_Import]
+    added = '3.14'
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
		export and import strings. Patch by Victor Stinner.