Skip to content

Commit 4d77192

Browse files
committed
Change the API to PyUnicode_Export()
1 parent 28c30c0 commit 4d77192

File tree

4 files changed

+207
-91
lines changed

4 files changed

+207
-91
lines changed

Include/unicodeobject.h

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -248,27 +248,33 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
248248
const char *u /* UTF-8 encoded string */
249249
);
250250

251-
#define PyUnicode_NATIVE_ASCII 1
252-
#define PyUnicode_NATIVE_UCS1 2
253-
#define PyUnicode_NATIVE_UCS2 3
254-
#define PyUnicode_NATIVE_UCS4 4
255-
#define PyUnicode_NATIVE_UTF8 5
251+
#define PyUnicode_FORMAT_ASCII 0x01
252+
#define PyUnicode_FORMAT_UCS1 0x02
253+
#define PyUnicode_FORMAT_UCS2 0x04
254+
#define PyUnicode_FORMAT_UCS4 0x08
255+
#define PyUnicode_FORMAT_UTF8 0x10
256256

257257
// Export the content of a string in one of the formats in 'supported_formats'.
258258
// - Return the content, set '*size' and '*format' on success.
259259
// - Set an exception and return NULL on error.
260-
PyAPI_FUNC(const void*) PyUnicode_AsNativeFormat(
260+
PyAPI_FUNC(const void*) PyUnicode_Export(
261261
PyObject *unicode,
262+
unsigned int supported_formats,
262263
Py_ssize_t *size,
263-
int *native_format);
264+
unsigned int *format);
265+
266+
PyAPI_FUNC(void) PyUnicode_FreeExport(
267+
PyObject *unicode,
268+
const void* data,
269+
unsigned int format);
264270

265271
// Create a string object from a buffer encoded in the given format.
266272
// - Return a reference to a new string object on success.
267273
// - Set an exception and return NULL on error.
268-
PyAPI_FUNC(PyObject*) PyUnicode_FromNativeFormat(
274+
PyAPI_FUNC(PyObject*) PyUnicode_Import(
269275
const void *data,
270276
Py_ssize_t size,
271-
int native_format);
277+
unsigned int format);
272278

273279
/* --- wchar_t support for platforms which support it --------------------- */
274280

Lib/test/test_capi/test_unicode.py

Lines changed: 91 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,13 @@ class Str(str):
2424
pass
2525

2626

27-
PyUnicode_NATIVE_ASCII = 1
28-
PyUnicode_NATIVE_UCS1 = 2
29-
PyUnicode_NATIVE_UCS2 = 3
30-
PyUnicode_NATIVE_UCS4 = 4
31-
PyUnicode_NATIVE_UTF8 = 5
27+
PyUnicode_FORMAT_ASCII = 0x01
28+
PyUnicode_FORMAT_UCS1 = 0x02
29+
PyUnicode_FORMAT_UCS2 = 0x04
30+
PyUnicode_FORMAT_UCS4 = 0x08
31+
PyUnicode_FORMAT_UTF8 = 0x10
3232
# Invalid format: not one of the PyUnicode_FORMAT_* flags above
33-
PyUnicode_NATIVE_INVALID = 0
33+
PyUnicode_FORMAT_INVALID = 0x20
3434

3535
class CAPITest(unittest.TestCase):
3636

@@ -1683,74 +1683,119 @@ def test_pep393_utf8_caching_bug(self):
16831683
# Check that the second call returns the same result
16841684
self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
16851685

1686-
def test_unicode_asnativeformat(self):
1687-
# Test PyUnicode_AsNativeFormat()
1688-
asnativeformat = _testlimitedcapi.unicode_asnativeformat
1689-
self.assertEqual(asnativeformat("abc"),
1690-
(b'abc', PyUnicode_NATIVE_ASCII))
1691-
self.assertEqual(asnativeformat("latin1:\xe9"),
1692-
(b'latin1:\xe9', PyUnicode_NATIVE_UCS1))
1693-
1694-
ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be'
1695-
self.assertEqual(asnativeformat('ucs2:\u20ac'),
1686+
def test_unicode_export(self):
1687+
# Test PyUnicode_Export() and PyUnicode_FreeExport()
1688+
unicode_export = _testlimitedcapi.unicode_export
1689+
if sys.byteorder == 'little':
1690+
ucs2_enc = 'utf-16le'
1691+
ucs4_enc = 'utf-32le'
1692+
else:
1693+
ucs2_enc = 'utf-16be'
1694+
ucs4_enc = 'utf-32be'
1695+
1696+
# export to the native format
1697+
formats = (PyUnicode_FORMAT_ASCII
1698+
| PyUnicode_FORMAT_UCS1
1699+
| PyUnicode_FORMAT_UCS2
1700+
| PyUnicode_FORMAT_UCS4)
1701+
self.assertEqual(unicode_export("abc", formats),
1702+
(b'abc', PyUnicode_FORMAT_ASCII))
1703+
self.assertEqual(unicode_export("latin1:\xe9", formats),
1704+
(b'latin1:\xe9', PyUnicode_FORMAT_UCS1))
1705+
self.assertEqual(unicode_export('ucs2:\u20ac', formats),
16961706
('ucs2:\u20ac'.encode(ucs2_enc),
1697-
PyUnicode_NATIVE_UCS2))
1698-
1699-
ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'
1700-
self.assertEqual(asnativeformat('ucs4:\U0010ffff'),
1707+
PyUnicode_FORMAT_UCS2))
1708+
self.assertEqual(unicode_export('ucs4:\U0010ffff', formats),
17011709
('ucs4:\U0010ffff'.encode(ucs4_enc),
1702-
PyUnicode_NATIVE_UCS4))
1703-
1704-
def test_unicode_fromnativeformat(self):
1705-
# Test PyUnicode_FromNativeFormat()
1706-
fromnativeformat = _testlimitedcapi.unicode_fromnativeformat
1707-
self.assertEqual(fromnativeformat(b'abc', PyUnicode_NATIVE_ASCII),
1710+
PyUnicode_FORMAT_UCS4))
1711+
1712+
# always export to UCS4
1713+
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS4),
1714+
('abc'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4))
1715+
self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS4),
1716+
('latin1:\xe9'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4))
1717+
self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UCS4),
1718+
('ucs2:\u20ac'.encode(ucs4_enc),
1719+
PyUnicode_FORMAT_UCS4))
1720+
self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4),
1721+
('ucs4:\U0010ffff'.encode(ucs4_enc),
1722+
PyUnicode_FORMAT_UCS4))
1723+
1724+
# always export to UTF8
1725+
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UTF8),
1726+
('abc'.encode('utf8'), PyUnicode_FORMAT_UTF8))
1727+
self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UTF8),
1728+
('latin1:\xe9'.encode('utf8'), PyUnicode_FORMAT_UTF8))
1729+
self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UTF8),
1730+
('ucs2:\u20ac'.encode('utf8'),
1731+
PyUnicode_FORMAT_UTF8))
1732+
self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UTF8),
1733+
('ucs4:\U0010ffff'.encode('utf8'),
1734+
PyUnicode_FORMAT_UTF8))
1735+
1736+
# No supported format or invalid format
1737+
with self.assertRaisesRegex(ValueError,
1738+
"unable to find a matching export format"):
1739+
unicode_export('abc', 0)
1740+
with self.assertRaisesRegex(ValueError,
1741+
"unable to find a matching export format"):
1742+
unicode_export('abc', PyUnicode_FORMAT_INVALID)
1743+
1744+
def test_unicode_import(self):
1745+
# Test PyUnicode_Import()
1746+
unicode_import = _testlimitedcapi.unicode_import
1747+
if sys.byteorder == 'little':
1748+
ucs2_enc = 'utf-16le'
1749+
ucs4_enc = 'utf-32le'
1750+
else:
1751+
ucs2_enc = 'utf-16be'
1752+
ucs4_enc = 'utf-32be'
1753+
1754+
self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII),
17081755
"abc")
1709-
self.assertEqual(fromnativeformat(b'latin1:\xe9', PyUnicode_NATIVE_UCS1),
1756+
self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1),
17101757
"latin1:\xe9")
17111758

1712-
ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be'
1713-
self.assertEqual(fromnativeformat('ucs2:\u20ac'.encode(ucs2_enc),
1714-
PyUnicode_NATIVE_UCS2),
1759+
self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc),
1760+
PyUnicode_FORMAT_UCS2),
17151761
'ucs2:\u20ac')
17161762

1717-
ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'
1718-
self.assertEqual(fromnativeformat('ucs4:\U0010ffff'.encode(ucs4_enc),
1719-
PyUnicode_NATIVE_UCS4),
1763+
self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc),
1764+
PyUnicode_FORMAT_UCS4),
17201765
'ucs4:\U0010ffff')
17211766

17221767
text = "abc\xe9\U0010ffff"
1723-
self.assertEqual(fromnativeformat(text.encode('utf8'),
1724-
PyUnicode_NATIVE_UTF8),
1768+
self.assertEqual(unicode_import(text.encode('utf8'),
1769+
PyUnicode_FORMAT_UTF8),
17251770
text)
17261771

17271772
# Empty string
17281773
for native_format in (
1729-
PyUnicode_NATIVE_ASCII,
1730-
PyUnicode_NATIVE_UCS1,
1731-
PyUnicode_NATIVE_UCS2,
1732-
PyUnicode_NATIVE_UCS4,
1733-
PyUnicode_NATIVE_UTF8,
1774+
PyUnicode_FORMAT_ASCII,
1775+
PyUnicode_FORMAT_UCS1,
1776+
PyUnicode_FORMAT_UCS2,
1777+
PyUnicode_FORMAT_UCS4,
1778+
PyUnicode_FORMAT_UTF8,
17341779
):
17351780
with self.subTest(native_format=native_format):
1736-
self.assertEqual(fromnativeformat(b'', native_format),
1781+
self.assertEqual(unicode_import(b'', native_format),
17371782
'')
17381783

17391784
# Invalid format
17401785
with self.assertRaises(ValueError):
1741-
fromnativeformat(b'', PyUnicode_NATIVE_INVALID)
1786+
unicode_import(b'', PyUnicode_FORMAT_INVALID)
17421787

17431788
# Invalid size
17441789
ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc)
17451790
with self.assertRaises(ValueError):
1746-
fromnativeformat(ucs2[:-1], PyUnicode_NATIVE_UCS2)
1791+
unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2)
17471792
ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc)
17481793
with self.assertRaises(ValueError):
1749-
fromnativeformat(ucs4[:-1], PyUnicode_NATIVE_UCS4)
1794+
unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4)
17501795
with self.assertRaises(ValueError):
1751-
fromnativeformat(ucs4[:-2], PyUnicode_NATIVE_UCS4)
1796+
unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4)
17521797
with self.assertRaises(ValueError):
1753-
fromnativeformat(ucs4[:-3], PyUnicode_NATIVE_UCS4)
1798+
unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4)
17541799

17551800

17561801
if __name__ == '__main__':

Modules/_testlimitedcapi/unicode.c

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1840,29 +1840,38 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
18401840

18411841
// Test PyUnicode_Export() and PyUnicode_FreeExport()
18421842
static PyObject*
1843-
unicode_asnativeformat(PyObject *self, PyObject *obj)
1843+
unicode_export(PyObject *self, PyObject *args)
18441844
{
1845+
PyObject *obj;
1846+
unsigned int supported_formats;
1847+
if (!PyArg_ParseTuple(args, "OI", &obj, &supported_formats)) {
1848+
return NULL;
1849+
}
1850+
18451851
Py_ssize_t size;
1846-
int native_format;
1847-
const void *data = PyUnicode_AsNativeFormat(obj, &size, &native_format);
1852+
unsigned int format;
1853+
const void *data = PyUnicode_Export(obj, supported_formats, &size, &format);
18481854
if (data == NULL) {
18491855
return NULL;
18501856
}
1851-
return Py_BuildValue("y#i", data, size, native_format);
1857+
1858+
PyObject *res = Py_BuildValue("y#i", data, size, format);
1859+
PyUnicode_FreeExport(obj, data, format);
1860+
return res;
18521861
}
18531862

18541863

18551864
// Test PyUnicode_Import()
18561865
static PyObject*
1857-
unicode_fromnativeformat(PyObject *self, PyObject *args)
1866+
unicode_import(PyObject *self, PyObject *args)
18581867
{
18591868
const void *data;
18601869
Py_ssize_t size;
1861-
int native_format;
1862-
if (!PyArg_ParseTuple(args, "y#i", &data, &size, &native_format)) {
1870+
unsigned int format;
1871+
if (!PyArg_ParseTuple(args, "y#i", &data, &size, &format)) {
18631872
return NULL;
18641873
}
1865-
return PyUnicode_FromNativeFormat(data, size, native_format);
1874+
return PyUnicode_Import(data, size, format);
18661875
}
18671876

18681877

@@ -1953,8 +1962,8 @@ static PyMethodDef TestMethods[] = {
19531962
{"unicode_format", unicode_format, METH_VARARGS},
19541963
{"unicode_contains", unicode_contains, METH_VARARGS},
19551964
{"unicode_isidentifier", unicode_isidentifier, METH_O},
1956-
{"unicode_asnativeformat", unicode_asnativeformat, METH_O},
1957-
{"unicode_fromnativeformat", unicode_fromnativeformat, METH_VARARGS},
1965+
{"unicode_export", unicode_export, METH_VARARGS},
1966+
{"unicode_import", unicode_import, METH_VARARGS},
19581967
{NULL},
19591968
};
19601969

0 commit comments

Comments
 (0)