Skip to content

Commit c2fec6f

Browse files
committed
gh-119609: Add PyUnicode_Export() function
Add PyUnicode_Export(), PyUnicode_GetBufferFormat() and PyUnicode_Import() functions to the limited C API.
1 parent 092abc4 commit c2fec6f

File tree

11 files changed

+583
-4
lines changed

11 files changed

+583
-4
lines changed

Doc/c-api/unicode.rst

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,71 @@ APIs:
341341
.. versionadded:: 3.3
342342
343343
344+
.. c:function:: int PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view)
345+
346+
Export the contents of the *unicode* string in one of the formats requested by
347+
*requested_formats*.
348+
349+
* On success, fill *view*, and return ``0``.
350+
* On error, set an exception and return ``-1``.
351+
352+
The export must be released by :c:func:`PyBuffer_Release`.
353+
The contents of the buffer are valid until they are released.
354+
355+
The buffer is read-only and must not be modified.
356+
357+
*unicode* and *view* must not be NULL.
358+
359+
Available formats:
360+
361+
.. c:namespace:: NULL
362+
363+
=================================== ======== ===========================
364+
Constant Identifier Value Description
365+
=================================== ======== ===========================
366+
.. c:macro:: PyUnicode_FORMAT_ASCII ``0x01`` ASCII string (``Py_UCS1*``)
367+
.. c:macro:: PyUnicode_FORMAT_UCS1 ``0x02`` UCS-1 string (``Py_UCS1*``)
368+
.. c:macro:: PyUnicode_FORMAT_UCS2 ``0x04`` UCS-2 string (``Py_UCS2*``)
369+
.. c:macro:: PyUnicode_FORMAT_UCS4 ``0x08`` UCS-4 string (``Py_UCS4*``)
370+
.. c:macro:: PyUnicode_FORMAT_UTF8 ``0x10`` UTF-8 string (``char*``)
371+
=================================== ======== ===========================
372+
373+
*requested_formats* can be a single format or a bitwise combination of the
374+
formats in the table above.
375+
On success, the buffer is exported in exactly one of the requested formats; call :c:func:`PyUnicode_GetBufferFormat` on *view* to find out which one.
376+
377+
Note that future versions of Python may introduce additional formats.
378+
379+
.. versionadded:: 3.14
380+
381+
382+
.. c:function:: int PyUnicode_GetBufferFormat(const Py_buffer *view, uint32_t *format)
383+
384+
Get the format of the buffer *view*.
385+
386+
* On success, set *\*format* to the corresponding ``PyUnicode_FORMAT_*`` value
387+
and return ``0``.
388+
* On error, set an exception and return ``-1``.
389+
390+
*view* must be a buffer filled by :c:func:`PyUnicode_Export`.
391+
392+
.. versionadded:: 3.14
393+
394+
395+
.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format)
396+
397+
Create a string object from a buffer in an “export format”.
398+
399+
* Return a reference to a new string object on success.
400+
* Set an exception and return ``NULL`` on error.
401+
402+
*data* must not be NULL. *nbytes* must be positive or zero.
403+
404+
See :c:func:`PyUnicode_Export` for the available formats.
405+
406+
.. versionadded:: 3.14
407+
408+
344409
.. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \
345410
Py_ssize_t size)
346411

Doc/data/stable_abi.dat

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Doc/whatsnew/3.14.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,10 @@ New Features
529529

530530
(Contributed by Victor Stinner in :gh:`107954`.)
531531

532+
* Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
533+
export and import strings.
534+
(Contributed by Victor Stinner in :gh:`119609`.)
535+
532536

533537
Porting to Python 3.14
534538
----------------------

Include/unicodeobject.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,24 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
248248
const char *u /* UTF-8 encoded string */
249249
);
250250

251+
#define PyUnicode_FORMAT_ASCII 0x01 // Py_UCS1* (ASCII string)
252+
#define PyUnicode_FORMAT_UCS1 0x02 // Py_UCS1*
253+
#define PyUnicode_FORMAT_UCS2 0x04 // Py_UCS2*
254+
#define PyUnicode_FORMAT_UCS4 0x08 // Py_UCS4*
255+
#define PyUnicode_FORMAT_UTF8 0x10 // char*
256+
257+
PyAPI_FUNC(int) PyUnicode_Export(
258+
PyObject *unicode,
259+
uint32_t requested_formats,
260+
Py_buffer *view);
261+
PyAPI_FUNC(int) PyUnicode_GetBufferFormat(
262+
const Py_buffer *view,
263+
uint32_t *format);
264+
PyAPI_FUNC(PyObject*) PyUnicode_Import(
265+
const void *data,
266+
Py_ssize_t nbytes,
267+
uint32_t format);
268+
251269
/* --- wchar_t support for platforms which support it --------------------- */
252270

253271
#ifdef HAVE_WCHAR_H

Lib/test/test_capi/test_unicode.py

Lines changed: 177 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
import unittest
1+
import struct
22
import sys
3+
import unittest
34
from test import support
45
from test.support import import_helper
56

@@ -28,6 +29,14 @@ class Str(str):
2829
pass
2930

3031

32+
PyUnicode_FORMAT_ASCII = 0x01
33+
PyUnicode_FORMAT_UCS1 = 0x02
34+
PyUnicode_FORMAT_UCS2 = 0x04
35+
PyUnicode_FORMAT_UCS4 = 0x08
36+
PyUnicode_FORMAT_UTF8 = 0x10
37+
# Invalid native format
38+
PyUnicode_FORMAT_INVALID = 0x20
39+
3140
class CAPITest(unittest.TestCase):
3241

3342
@support.cpython_only
@@ -1721,6 +1730,139 @@ def test_pep393_utf8_caching_bug(self):
17211730
# Check that the second call returns the same result
17221731
self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
17231732

1733+
def test_unicode_export(self):
1734+
# Test PyUnicode_Export()
1735+
unicode_export = _testlimitedcapi.unicode_export
1736+
if sys.byteorder == 'little':
1737+
ucs2_enc = 'utf-16le'
1738+
ucs4_enc = 'utf-32le'
1739+
else:
1740+
ucs2_enc = 'utf-16be'
1741+
ucs4_enc = 'utf-32be'
1742+
1743+
# export to the native format
1744+
formats = (PyUnicode_FORMAT_ASCII
1745+
| PyUnicode_FORMAT_UCS1
1746+
| PyUnicode_FORMAT_UCS2
1747+
| PyUnicode_FORMAT_UCS4)
1748+
BUFFER_UCS1 = 'B'
1749+
BUFFER_UCS2 = 'H'
1750+
if struct.calcsize('I') == 4:
1751+
BUFFER_UCS4 = 'I'
1752+
elif struct.calcsize('L') == 4:
1753+
BUFFER_UCS4 = 'L'
1754+
else:
1755+
self.fail("unable to get BUFFER_UCS4 ")
1756+
1757+
self.assertEqual(unicode_export("abc", formats),
1758+
(b'abc', PyUnicode_FORMAT_ASCII, 1, BUFFER_UCS1))
1759+
self.assertEqual(unicode_export("latin1:\xe9", formats),
1760+
(b'latin1:\xe9', PyUnicode_FORMAT_UCS1, 1, BUFFER_UCS1))
1761+
self.assertEqual(unicode_export('ucs2:\u20ac', formats),
1762+
('ucs2:\u20ac'.encode(ucs2_enc),
1763+
PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2))
1764+
self.assertEqual(unicode_export('ucs4:\U0010ffff', formats),
1765+
('ucs4:\U0010ffff'.encode(ucs4_enc),
1766+
PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
1767+
1768+
# export ASCII as UCS1
1769+
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS1),
1770+
(b'abc', PyUnicode_FORMAT_UCS1, 1, BUFFER_UCS1))
1771+
1772+
# export ASCII and UCS1 to UCS2
1773+
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS2),
1774+
('abc'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2))
1775+
self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS2),
1776+
('latin1:\xe9'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2))
1777+
1778+
# always export to UCS4
1779+
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS4),
1780+
('abc'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
1781+
self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS4),
1782+
('latin1:\xe9'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
1783+
self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UCS4),
1784+
('ucs2:\u20ac'.encode(ucs4_enc),
1785+
PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
1786+
self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4),
1787+
('ucs4:\U0010ffff'.encode(ucs4_enc),
1788+
PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4))
1789+
1790+
# always export to UTF8
1791+
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UTF8),
1792+
('abc'.encode('utf8'), PyUnicode_FORMAT_UTF8, 1, 'B'))
1793+
self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UTF8),
1794+
('latin1:\xe9'.encode('utf8'), PyUnicode_FORMAT_UTF8, 1, 'B'))
1795+
self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UTF8),
1796+
('ucs2:\u20ac'.encode('utf8'),
1797+
PyUnicode_FORMAT_UTF8, 1, 'B'))
1798+
self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UTF8),
1799+
('ucs4:\U0010ffff'.encode('utf8'),
1800+
PyUnicode_FORMAT_UTF8, 1, 'B'))
1801+
1802+
# No supported format or invalid format
1803+
with self.assertRaisesRegex(ValueError,
1804+
"unable to find a matching export format"):
1805+
unicode_export('abc', 0)
1806+
with self.assertRaisesRegex(ValueError,
1807+
"unable to find a matching export format"):
1808+
unicode_export('abc', PyUnicode_FORMAT_INVALID)
1809+
1810+
def test_unicode_import(self):
1811+
# Test PyUnicode_Import()
1812+
unicode_import = _testlimitedcapi.unicode_import
1813+
if sys.byteorder == 'little':
1814+
ucs2_enc = 'utf-16le'
1815+
ucs4_enc = 'utf-32le'
1816+
else:
1817+
ucs2_enc = 'utf-16be'
1818+
ucs4_enc = 'utf-32be'
1819+
1820+
self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII),
1821+
"abc")
1822+
self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1),
1823+
"latin1:\xe9")
1824+
1825+
self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc),
1826+
PyUnicode_FORMAT_UCS2),
1827+
'ucs2:\u20ac')
1828+
1829+
self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc),
1830+
PyUnicode_FORMAT_UCS4),
1831+
'ucs4:\U0010ffff')
1832+
1833+
text = "abc\xe9\U0010ffff"
1834+
self.assertEqual(unicode_import(text.encode('utf8'),
1835+
PyUnicode_FORMAT_UTF8),
1836+
text)
1837+
1838+
# Empty string
1839+
for native_format in (
1840+
PyUnicode_FORMAT_ASCII,
1841+
PyUnicode_FORMAT_UCS1,
1842+
PyUnicode_FORMAT_UCS2,
1843+
PyUnicode_FORMAT_UCS4,
1844+
PyUnicode_FORMAT_UTF8,
1845+
):
1846+
with self.subTest(native_format=native_format):
1847+
self.assertEqual(unicode_import(b'', native_format),
1848+
'')
1849+
1850+
# Invalid format
1851+
with self.assertRaises(ValueError):
1852+
unicode_import(b'', PyUnicode_FORMAT_INVALID)
1853+
1854+
# Invalid size
1855+
ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc)
1856+
with self.assertRaises(ValueError):
1857+
unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2)
1858+
ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc)
1859+
with self.assertRaises(ValueError):
1860+
unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4)
1861+
with self.assertRaises(ValueError):
1862+
unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4)
1863+
with self.assertRaises(ValueError):
1864+
unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4)
1865+
17241866

17251867
class PyUnicodeWriterTest(unittest.TestCase):
17261868
def create_writer(self, size):
@@ -1903,6 +2045,38 @@ def test_recover_error(self):
19032045

19042046
self.assertEqual(writer.finish(), 'Hello World.')
19052047

1906-
1907-
if __name__ == "__main__":
2048+
def test_unicode_export_import_roundtrip(self):
2049+
unicode_export = _testlimitedcapi.unicode_export
2050+
unicode_import = _testlimitedcapi.unicode_import
2051+
2052+
ASCII = PyUnicode_FORMAT_ASCII
2053+
UCS1 = PyUnicode_FORMAT_UCS1
2054+
UCS2 = PyUnicode_FORMAT_UCS2
2055+
UCS4 = PyUnicode_FORMAT_UCS4
2056+
UTF8 = PyUnicode_FORMAT_UTF8
2057+
ALL = (ASCII | UCS1 | UCS2 | UCS4 | UTF8)
2058+
2059+
for string, allowed_formats in (
2060+
('', {ASCII, UCS1, UCS2, UCS4, UTF8}),
2061+
('ascii', {ASCII, UCS1, UCS2, UCS4, UTF8}),
2062+
('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}),
2063+
('ucs2:\u20ac', {UCS2, UCS4, UTF8}),
2064+
('ucs4:\U0001f638', {UCS4, UTF8}),
2065+
):
2066+
for format in ASCII, UCS1, UCS2, UCS4, UTF8:
2067+
with self.subTest(string=string, format=format):
2068+
if format not in allowed_formats:
2069+
with self.assertRaises(ValueError):
2070+
unicode_export(string, format)
2071+
else:
2072+
buf, buf_fmt, item_size, view_fmt = unicode_export(string, format)
2073+
restored = unicode_import(buf, buf_fmt)
2074+
self.assertEqual(restored, string)
2075+
2076+
buf, buf_fmt, item_size, view_fmt = unicode_export(string, ALL)
2077+
restored = unicode_import(buf, buf_fmt)
2078+
self.assertEqual(restored, string)
2079+
2080+
2081+
if __name__ == '__main__':
19082082
unittest.main()

Lib/test/test_stable_abi_ctypes.py

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to
2+
export and import strings. Patch by Victor Stinner.

Misc/stable_abi.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2526,3 +2526,19 @@
25262526
added = '3.14'
25272527
[function.PyLong_AsUInt64]
25282528
added = '3.14'
2529+
[const.PyUnicode_FORMAT_ASCII]
2530+
added = '3.14'
2531+
[const.PyUnicode_FORMAT_UCS1]
2532+
added = '3.14'
2533+
[const.PyUnicode_FORMAT_UCS2]
2534+
added = '3.14'
2535+
[const.PyUnicode_FORMAT_UCS4]
2536+
added = '3.14'
2537+
[const.PyUnicode_FORMAT_UTF8]
2538+
added = '3.14'
2539+
[function.PyUnicode_Export]
2540+
added = '3.14'
2541+
[function.PyUnicode_GetBufferFormat]
2542+
added = '3.14'
2543+
[function.PyUnicode_Import]
2544+
added = '3.14'

0 commit comments

Comments
 (0)