diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 958fafd47ac81b..4182d87472d546 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -341,6 +341,82 @@ APIs: .. versionadded:: 3.3 +.. c:function:: int32_t PyUnicode_Export(PyObject *unicode, int32_t requested_formats, uint32_t flags, Py_buffer *view) + + Export the contents of the *unicode* string in one of the *requested_formats*. + + * On success, fill *view*, and return a format (greater than ``0``). + * On error, set an exception, and return ``-1``. + *view* is left unchanged. + + After a successful call to :c:func:`PyUnicode_Export`, + the *view* buffer must be released by :c:func:`PyBuffer_Release`. + The contents of the buffer are valid until they are released. + + The buffer is read-only and must not be modified. + + *unicode* and *view* must not be NULL. + + Available formats: + + .. c:namespace:: NULL + + =================================== ======== =========================== + Constant Identifier Value Description + =================================== ======== =========================== + .. c:macro:: PyUnicode_FORMAT_UCS1 ``0x01`` UCS-1 string (``Py_UCS1*``) + .. c:macro:: PyUnicode_FORMAT_UCS2 ``0x02`` UCS-2 string (``Py_UCS2*``) + .. c:macro:: PyUnicode_FORMAT_UCS4 ``0x04`` UCS-4 string (``Py_UCS4*``) + .. c:macro:: PyUnicode_FORMAT_UTF8 ``0x08`` UTF-8 string (``char*``) + .. c:macro:: PyUnicode_FORMAT_ASCII ``0x10`` ASCII string (``Py_UCS1*``) + =================================== ======== =========================== + + UCS-2 and UCS-4 use the native byte order. + + *requested_formats* can be a single format or a bitwise combination of the + formats in the table above. + On success, the returned format will be set to a single one of the requested + formats. + + Note that future versions of Python may introduce additional formats. + + By default, if the :c:macro:`PyUnicode_EXPORT_COPY` flag is not set in + *flags*, no memory is copied and no conversion is done. 
+ + If the :c:macro:`PyUnicode_EXPORT_COPY` flag is set in *flags*, the function + can copy memory to provide the requested format and convert from one format + to another. + + The :c:macro:`PyUnicode_EXPORT_COPY` flag is needed to export a string + containing surrogate characters to :c:macro:`PyUnicode_FORMAT_UTF8`. + + Available flags: + + .. c:namespace:: NULL + + ================================== ======== =================================== + Flag Value Description + ================================== ======== =================================== + .. c:macro:: PyUnicode_EXPORT_COPY ``0x01`` Allow memory copies and conversions + ================================== ======== =================================== + + .. versionadded:: 3.14 + + +.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, int32_t format) + + Create a Unicode string object from a buffer in a supported format. + + * Return a reference to a new string object on success. + * Set an exception and return ``NULL`` on error. + + *data* must not be NULL. *nbytes* must be positive or zero. + + See :c:func:`PyUnicode_Export` for the available formats. + + .. versionadded:: 3.14 + + + .. 
c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \ Py_ssize_t size) diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 7eeee270bb7f32..e60d809e969c06 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -784,6 +784,7 @@ func,PyUnicode_EncodeFSDefault,3.2,, func,PyUnicode_EncodeLocale,3.7,, func,PyUnicode_EqualToUTF8,3.13,, func,PyUnicode_EqualToUTF8AndSize,3.13,, +func,PyUnicode_Export,3.14,, func,PyUnicode_FSConverter,3.2,, func,PyUnicode_FSDecoder,3.2,, func,PyUnicode_Find,3.2,, @@ -799,6 +800,7 @@ func,PyUnicode_FromStringAndSize,3.2,, func,PyUnicode_FromWideChar,3.2,, func,PyUnicode_GetDefaultEncoding,3.2,, func,PyUnicode_GetLength,3.7,, +func,PyUnicode_Import,3.14,, func,PyUnicode_InternFromString,3.2,, func,PyUnicode_InternInPlace,3.2,, func,PyUnicode_IsIdentifier,3.2,, diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 5e762336e547f6..d4c7d7b17f8da2 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -554,6 +554,10 @@ New Features (Contributed by Victor Stinner in :gh:`107954`.) +* Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions + to export and import strings. + (Contributed by Victor Stinner in :gh:`119609`.) 
+ Porting to Python 3.14 ---------------------- diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index dee00715b3c51d..5b1eb15f2703e4 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -248,6 +248,27 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( const char *u /* UTF-8 encoded string */ ); +#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030e0000 +#define PyUnicode_FORMAT_UCS1 0x01 // Py_UCS1* +#define PyUnicode_FORMAT_UCS2 0x02 // Py_UCS2* +#define PyUnicode_FORMAT_UCS4 0x04 // Py_UCS4* +#define PyUnicode_FORMAT_UTF8 0x08 // char* +#define PyUnicode_FORMAT_ASCII 0x10 // char* (ASCII string) + +#define PyUnicode_EXPORT_COPY 0x01 + + +PyAPI_FUNC(int32_t) PyUnicode_Export( + PyObject *unicode, + int32_t requested_formats, + uint32_t flags, + Py_buffer *view); +PyAPI_FUNC(PyObject*) PyUnicode_Import( + const void *data, + Py_ssize_t nbytes, + int32_t format); +#endif + /* --- wchar_t support for platforms which support it --------------------- */ #ifdef HAVE_WCHAR_H diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index e6f85427214958..b6ecc2a5a6b811 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1,5 +1,6 @@ -import unittest +import struct import sys +import unittest from test import support from test.support import import_helper @@ -28,6 +29,17 @@ class Str(str): pass +PyUnicode_FORMAT_UCS1 = 0x01 +PyUnicode_FORMAT_UCS2 = 0x02 +PyUnicode_FORMAT_UCS4 = 0x04 +PyUnicode_FORMAT_UTF8 = 0x08 +PyUnicode_FORMAT_ASCII = 0x10 +# Invalid native format +PyUnicode_FORMAT_INVALID = 0x20 + +PyUnicode_EXPORT_COPY = 0x01 + + class CAPITest(unittest.TestCase): @support.cpython_only @@ -1721,6 +1733,183 @@ def test_pep393_utf8_caching_bug(self): # Check that the second call returns the same result self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) + def test_unicode_export(self): + # Test PyUnicode_Export() + 
unicode_export = _testlimitedcapi.unicode_export + if sys.byteorder == 'little': + ucs2_enc = 'utf-16le' + ucs4_enc = 'utf-32le' + else: + ucs2_enc = 'utf-16be' + ucs4_enc = 'utf-32be' + + # export to the native format + formats = (PyUnicode_FORMAT_ASCII + | PyUnicode_FORMAT_UCS1 + | PyUnicode_FORMAT_UCS2 + | PyUnicode_FORMAT_UCS4) + BUFFER_UCS1 = 'B' + BUFFER_UCS2 = '=H' + BUFFER_UCS4 = '=I' + + def check_ucs1(text, formats, flags=0): + if formats == PyUnicode_FORMAT_UCS1: + export_format = PyUnicode_FORMAT_UCS1 + elif text.isascii(): + export_format = PyUnicode_FORMAT_ASCII + else: + export_format = PyUnicode_FORMAT_UCS1 + self.assertEqual(unicode_export(text, formats, flags), + (text.encode('latin1'), export_format, 1, BUFFER_UCS1)) + + def check_ucs2(text, formats, flags=0): + self.assertEqual(unicode_export(text, formats, flags), + (text.encode(ucs2_enc, 'surrogatepass'), + PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2)) + + def check_ucs4(text, formats, flags=0): + self.assertEqual(unicode_export(text, formats, flags), + (text.encode(ucs4_enc, 'surrogatepass'), + PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4)) + + def check_utf8(text, flags=0): + self.assertEqual(unicode_export(text, PyUnicode_FORMAT_UTF8, flags), + (text.encode('utf8', 'surrogatepass'), + PyUnicode_FORMAT_UTF8, 1, 'B')) + + def check_no_matching_format(text, formats, flags=0): + err_msg = "unable to find a matching export format" + with self.assertRaisesRegex(ValueError, err_msg): + unicode_export('abc', formats, flags) + + # export as native format + check_ucs1("abc", formats) + check_ucs1("latin1:\xe9", formats) + check_ucs2('ucs2:\u20ac', formats) + check_ucs4('ucs4:\U0010ffff', formats) + + # convert ASCII to UCS1 + check_ucs1("abc", PyUnicode_FORMAT_UCS1) + + # convert to UCS2 (need PyUnicode_EXPORT_COPY) + check_no_matching_format("abc", PyUnicode_FORMAT_UCS2) + check_no_matching_format("latin1:\xe9", PyUnicode_FORMAT_UCS2) + check_ucs2("abc", PyUnicode_FORMAT_UCS2, PyUnicode_EXPORT_COPY) + 
check_ucs2("latin1:\xe9", PyUnicode_FORMAT_UCS2, PyUnicode_EXPORT_COPY) + + # convert to UCS4 (need PyUnicode_EXPORT_COPY) + check_no_matching_format("abc", PyUnicode_FORMAT_UCS4) + check_no_matching_format("latin1:\xe9", PyUnicode_FORMAT_UCS4) + check_no_matching_format('ucs2:\u20ac', PyUnicode_FORMAT_UCS4) + check_ucs4("abc", PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY) + check_ucs4("latin1:\xe9", PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY) + check_ucs4('ucs2:\u20ac', PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY) + + # always encode to UTF8 + check_utf8("abc") + check_utf8("latin1:\xe9") + check_utf8('ucs2:\u20ac') + check_utf8('ucs4:\U0010ffff') + + # surrogates + check_ucs2('\udc80', PyUnicode_FORMAT_UCS2) + check_ucs4('\udc80', PyUnicode_FORMAT_UCS4, PyUnicode_EXPORT_COPY) + check_utf8('\udc80', PyUnicode_EXPORT_COPY) + + # No supported format or invalid format + for formats in (0, PyUnicode_FORMAT_INVALID): + with self.subTest(formats=formats): + check_no_matching_format('abc', formats) + + def test_unicode_import(self): + # Test PyUnicode_Import() + unicode_import = _testlimitedcapi.unicode_import + if sys.byteorder == 'little': + ucs2_enc = 'utf-16le' + ucs4_enc = 'utf-32le' + else: + ucs2_enc = 'utf-16be' + ucs4_enc = 'utf-32be' + + self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII), + "abc") + self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1), + "latin1:\xe9") + + self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc), + PyUnicode_FORMAT_UCS2), + 'ucs2:\u20ac') + + self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc), + PyUnicode_FORMAT_UCS4), + 'ucs4:\U0010ffff') + + text = "abc\xe9\U0010ffff" + self.assertEqual(unicode_import(text.encode('utf8'), + PyUnicode_FORMAT_UTF8), + text) + + # Empty string + for native_format in ( + PyUnicode_FORMAT_ASCII, + PyUnicode_FORMAT_UCS1, + PyUnicode_FORMAT_UCS2, + PyUnicode_FORMAT_UCS4, + PyUnicode_FORMAT_UTF8, + ): + with 
self.subTest(native_format=native_format): + self.assertEqual(unicode_import(b'', native_format), + '') + + # Invalid format + with self.assertRaises(ValueError): + unicode_import(b'', PyUnicode_FORMAT_INVALID) + + # Invalid size + ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc) + with self.assertRaises(ValueError): + unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2) + ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc) + with self.assertRaises(ValueError): + unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4) + with self.assertRaises(ValueError): + unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4) + with self.assertRaises(ValueError): + unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4) + + def test_unicode_export_import_roundtrip(self): + unicode_export = _testlimitedcapi.unicode_export + unicode_import = _testlimitedcapi.unicode_import + + ASCII = PyUnicode_FORMAT_ASCII + UCS1 = PyUnicode_FORMAT_UCS1 + UCS2 = PyUnicode_FORMAT_UCS2 + UCS4 = PyUnicode_FORMAT_UCS4 + UTF8 = PyUnicode_FORMAT_UTF8 + ALL = (ASCII | UCS1 | UCS2 | UCS4 | UTF8) + + def roundtrip(string, formats): + export = unicode_export(string, formats, PyUnicode_EXPORT_COPY) + buf, buf_fmt, item_size, view_fmt = export + self.assertEqual(unicode_import(buf, buf_fmt), string) + + for string, allowed_formats in ( + ('', {ASCII, UCS1, UCS2, UCS4, UTF8}), + ('ascii', {ASCII, UCS1, UCS2, UCS4, UTF8}), + ('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}), + ('ucs2:\u20ac', {UCS2, UCS4, UTF8}), + ('ucs4:\U0001f638', {UCS4, UTF8}), + ): + for formats in ASCII, UCS1, UCS2, UCS4, UTF8: + with self.subTest(string=string, formats=formats): + if formats not in allowed_formats: + with self.assertRaises(ValueError): + unicode_export(string, formats, PyUnicode_EXPORT_COPY) + else: + roundtrip(string, formats) + + roundtrip(string, ALL) + class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): @@ -1904,5 +2093,5 @@ def test_recover_error(self): self.assertEqual(writer.finish(), 'Hello World.') -if __name__ == "__main__": +if __name__ 
== '__main__': unittest.main() diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index 4bca33b7451f80..483f42c8d14ec3 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -806,6 +806,7 @@ def test_windows_feature_macros(self): "PyUnicode_EncodeLocale", "PyUnicode_EqualToUTF8", "PyUnicode_EqualToUTF8AndSize", + "PyUnicode_Export", "PyUnicode_FSConverter", "PyUnicode_FSDecoder", "PyUnicode_Find", @@ -822,6 +823,7 @@ def test_windows_feature_macros(self): "PyUnicode_GetDefaultEncoding", "PyUnicode_GetLength", "PyUnicode_GetSize", + "PyUnicode_Import", "PyUnicode_InternFromString", "PyUnicode_InternImmortal", "PyUnicode_InternInPlace", diff --git a/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst b/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst new file mode 100644 index 00000000000000..3eae4543f087d0 --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst @@ -0,0 +1,2 @@ +Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to +export and import strings. Patch by Victor Stinner. 
diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 6036fc96fdd995..70f69c01363bef 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2527,4 +2527,18 @@ [function.PyLong_AsUInt64] added = '3.14' [const.Py_tp_vectorcall] - added = '3.14' \ No newline at end of file + added = '3.14' +[const.PyUnicode_FORMAT_UCS1] + added = '3.14' +[const.PyUnicode_FORMAT_UCS2] + added = '3.14' +[const.PyUnicode_FORMAT_UCS4] + added = '3.14' +[const.PyUnicode_FORMAT_UTF8] + added = '3.14' +[const.PyUnicode_FORMAT_ASCII] + added = '3.14' +[function.PyUnicode_Export] + added = '3.14' +[function.PyUnicode_Import] + added = '3.14' diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index 2b70d09108a333..9b6c0ee9a9d38f 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1,7 +1,7 @@ #include "pyconfig.h" // Py_GIL_DISABLED #ifndef Py_GIL_DISABLED - // Need limited C API 3.13 to test PyUnicode_EqualToUTF8() -# define Py_LIMITED_API 0x030d0000 + // Need limited C API 3.14 to test PyUnicode_Export() +# define Py_LIMITED_API 0x030e0000 #endif #include "parts.h" @@ -1837,6 +1837,71 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) #undef CHECK_FORMAT_0 } + +// Test PyUnicode_Export() +static PyObject* +unicode_export(PyObject *self, PyObject *args) +{ + PyObject *obj; + unsigned int requested_formats, flags; + if (!PyArg_ParseTuple(args, "OII", &obj, &requested_formats, &flags)) { + return NULL; + } + + Py_buffer view; + int32_t format = PyUnicode_Export(obj, requested_formats, flags, &view); + if (format < 0) { + return NULL; + } + + // Make sure that the exported string ends with a NUL character + char *data = view.buf; + Py_ssize_t nbytes = view.len * view.itemsize; + switch (format) + { + case PyUnicode_FORMAT_ASCII: + case PyUnicode_FORMAT_UCS1: + assert(data[nbytes] == 0); + break; + case PyUnicode_FORMAT_UCS2: + assert(data[nbytes] == 0); + assert(data[nbytes + 1] == 
0); + break; + case PyUnicode_FORMAT_UCS4: + assert(data[nbytes] == 0); + assert(data[nbytes + 1] == 0); + assert(data[nbytes + 2] == 0); + assert(data[nbytes + 3] == 0); + break; + case PyUnicode_FORMAT_UTF8: + assert(data[nbytes] == 0); + break; + } + + assert(view.format != NULL); + PyObject *res = Py_BuildValue("y#iis", + view.buf, view.len * view.itemsize, + (int)format, + (int)view.itemsize, view.format); + PyBuffer_Release(&view); + return res; +} + + +// Test PyUnicode_Import() +static PyObject* +unicode_import(PyObject *self, PyObject *args) +{ + const void *data; + Py_ssize_t nbytes; + int format; + if (!PyArg_ParseTuple(args, "y#i", &data, &nbytes, &format)) { + return NULL; + } + return PyUnicode_Import(data, nbytes, (int32_t)format); +} + + static PyMethodDef TestMethods[] = { {"codec_incrementalencoder", codec_incrementalencoder, METH_VARARGS}, {"codec_incrementaldecoder", codec_incrementaldecoder, METH_VARARGS}, @@ -1924,6 +1989,8 @@ static PyMethodDef TestMethods[] = { {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, {"unicode_isidentifier", unicode_isidentifier, METH_O}, + {"unicode_export", unicode_export, METH_VARARGS}, + {"unicode_import", unicode_import, METH_VARARGS}, {NULL}, }; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2494c989544ca0..2f907e2558d534 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -208,6 +208,9 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer, static inline int unicode_is_finalizing(void); static int unicode_is_singleton(PyObject *unicode); #endif +static Py_UCS4* +as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, + int copy_null); // Return a reference to the immortal empty string singleton. 
@@ -2332,6 +2335,199 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, } +static int32_t +unicode_export(PyObject *obj, Py_buffer *view, + Py_ssize_t len, const void *buf, + int itemsize, const char *format, int32_t export_format) +{ + if (PyBuffer_FillInfo(view, obj, (void*)buf, len, + 1, PyBUF_SIMPLE) < 0) { + return -1; + } + view->itemsize = itemsize; + view->format = (char*)format; + return export_format; +} + + +static int32_t +unicode_export_bytes(PyObject *bytes, Py_buffer *view, Py_ssize_t len, + int itemsize, const char *format, int32_t export_format) +{ + const void *buf = PyBytes_AS_STRING(bytes); + + if (PyBuffer_FillInfo(view, bytes, (void*)buf, len, + 1, PyBUF_SIMPLE) < 0) + { + Py_DECREF(bytes); + return -1; + } + Py_DECREF(bytes); + + view->itemsize = itemsize; + view->format = (char*)format; + return export_format; +} + + +int32_t +PyUnicode_Export(PyObject *unicode, int32_t requested_formats, + uint32_t flags, Py_buffer *view) +{ + if (!PyUnicode_Check(unicode)) { + PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode); + return -1; + } + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + + // Native ASCII + if (PyUnicode_IS_ASCII(unicode) + && (requested_formats & PyUnicode_FORMAT_ASCII)) + { + return unicode_export(unicode, view, + len, PyUnicode_1BYTE_DATA(unicode), + 1, "B", PyUnicode_FORMAT_ASCII); + } + + // Native UCS1 + int kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND + && (requested_formats & PyUnicode_FORMAT_UCS1)) + { + return unicode_export(unicode, view, + len, PyUnicode_1BYTE_DATA(unicode), + 1, "B", PyUnicode_FORMAT_UCS1); + } + + // Native UCS2 + if (kind == PyUnicode_2BYTE_KIND + && (requested_formats & PyUnicode_FORMAT_UCS2)) + { + return unicode_export(unicode, view, + len, PyUnicode_2BYTE_DATA(unicode), + 2, "=H", PyUnicode_FORMAT_UCS2); + } + + // Convert ASCII or UCS1 to UCS2 (need PyUnicode_EXPORT_COPY) + if (flags & PyUnicode_EXPORT_COPY + && kind == PyUnicode_1BYTE_KIND + && 
requested_formats & PyUnicode_FORMAT_UCS2) + { + PyObject *bytes = PyBytes_FromStringAndSize(NULL, (len + 1) * 2); + if (!bytes) { + return -1; + } + Py_UCS2 *ucs2 = (Py_UCS2*)PyBytes_AS_STRING(bytes); + + _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2, + PyUnicode_1BYTE_DATA(unicode), + PyUnicode_1BYTE_DATA(unicode) + len, + ucs2); + ucs2[len] = 0; + + return unicode_export_bytes(bytes, view, len, + 2, "=H", PyUnicode_FORMAT_UCS2); + } + + // Native UCS4 + if (kind == PyUnicode_4BYTE_KIND + && (requested_formats & PyUnicode_FORMAT_UCS4)) + { + return unicode_export(unicode, view, + len, PyUnicode_4BYTE_DATA(unicode), + 4, "=I", PyUnicode_FORMAT_UCS4); + } + + // Convert ASCII, UCS1 or UCS2 to UCS4 (need PyUnicode_EXPORT_COPY) + if (flags & PyUnicode_EXPORT_COPY + && requested_formats & PyUnicode_FORMAT_UCS4) + { + PyObject *bytes = PyBytes_FromStringAndSize(NULL, (len + 1) * 4); + if (bytes == NULL) { + return -1; + } + Py_UCS4 *ucs4 = (Py_UCS4*)PyBytes_AS_STRING(bytes); + + (void)as_ucs4(unicode, ucs4, len + 1, 1); + + return unicode_export_bytes(bytes, view, len, + 4, "=I", PyUnicode_FORMAT_UCS4); + } + + // Encode UCS1, UCS2 or UCS4 to UTF-8 + if (requested_formats & PyUnicode_FORMAT_UTF8) { + Py_ssize_t nbytes; + const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &nbytes); + if (utf8 != NULL) { + return unicode_export(unicode, view, + nbytes, utf8, + 1, "B", PyUnicode_FORMAT_UTF8); + } + if (flags & PyUnicode_EXPORT_COPY + && PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) + { + PyErr_Clear(); + PyObject *bytes = _PyUnicode_AsUTF8String(unicode, "surrogatepass"); + if (bytes == NULL) { + return -1; + } + len = PyBytes_GET_SIZE(bytes); + + return unicode_export_bytes(bytes, view, PyBytes_GET_SIZE(bytes), + 1, "B", PyUnicode_FORMAT_UTF8); + } + return -1; + } + + PyErr_SetString(PyExc_ValueError, + "unable to find a matching export format"); + return -1; +} + + +PyObject* +PyUnicode_Import(const void *data, Py_ssize_t nbytes, + int32_t format) +{ + if (nbytes < 0) 
{ + PyErr_SetString(PyExc_ValueError, "Negative nbytes"); + return NULL; + } + + switch (format) + { + case PyUnicode_FORMAT_ASCII: + return PyUnicode_DecodeASCII((const char*)data, nbytes, NULL); + + case PyUnicode_FORMAT_UCS1: + return _PyUnicode_FromUCS1(data, nbytes); + + case PyUnicode_FORMAT_UCS2: + if (nbytes % 2) { + PyErr_Format(PyExc_ValueError, "nbytes must be a multiple of 2: %zd", + nbytes); + return NULL; + } + return _PyUnicode_FromUCS2(data, nbytes / 2); + + case PyUnicode_FORMAT_UCS4: + if (nbytes % 4) { + PyErr_Format(PyExc_ValueError, "nbytes must be a multiple of 4: %zd", + nbytes); + return NULL; + } + return _PyUnicode_FromUCS4(data, nbytes / 4); + + case PyUnicode_FORMAT_UTF8: + return PyUnicode_DecodeUTF8((const char*)data, nbytes, NULL); + + default: + PyErr_Format(PyExc_ValueError, "unknown format: %i", format); + return NULL; + } +} + + PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) { @@ -2508,15 +2704,14 @@ static Py_UCS4* as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, int copy_null) { - int kind; - const void *data; - Py_ssize_t len, targetlen; - kind = PyUnicode_KIND(string); - data = PyUnicode_DATA(string); - len = PyUnicode_GET_LENGTH(string); - targetlen = len; - if (copy_null) + int kind = PyUnicode_KIND(string); + const void *data = PyUnicode_DATA(string); + Py_ssize_t len = PyUnicode_GET_LENGTH(string); + Py_ssize_t targetlen = len; + if (copy_null) { targetlen++; + } + if (!target) { target = PyMem_New(Py_UCS4, targetlen); if (!target) { @@ -2528,11 +2723,13 @@ as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, if (targetsize < targetlen) { PyErr_Format(PyExc_SystemError, "string is longer than the buffer"); - if (copy_null && 0 < targetsize) + if (copy_null && 0 < targetsize) { target[0] = 0; + } return NULL; } } + if (kind == PyUnicode_1BYTE_KIND) { const Py_UCS1 *start = (const Py_UCS1 *) data; _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, 
target); @@ -2547,8 +2744,10 @@ as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, else { Py_UNREACHABLE(); } - if (copy_null) + if (copy_null) { target[len] = 0; + } + return target; } diff --git a/PC/python3dll.c b/PC/python3dll.c index 1845334b244d8c..02206b14abcf82 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -717,6 +717,7 @@ EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) EXPORT_FUNC(PyUnicode_EqualToUTF8) EXPORT_FUNC(PyUnicode_EqualToUTF8AndSize) +EXPORT_FUNC(PyUnicode_Export) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_Format) @@ -733,6 +734,7 @@ EXPORT_FUNC(PyUnicode_FSDecoder) EXPORT_FUNC(PyUnicode_GetDefaultEncoding) EXPORT_FUNC(PyUnicode_GetLength) EXPORT_FUNC(PyUnicode_GetSize) +EXPORT_FUNC(PyUnicode_Import) EXPORT_FUNC(PyUnicode_InternFromString) EXPORT_FUNC(PyUnicode_InternImmortal) EXPORT_FUNC(PyUnicode_InternInPlace)