Skip to content

[3.13] gh-128013: fix data race in PyUnicode_AsUTF8AndSize on free-threading (#128021) #128417

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest
import sys
from test import support
from test.support import import_helper
from test.support import threading_helper, import_helper

try:
import _testcapi
Expand Down Expand Up @@ -959,6 +959,24 @@ def test_asutf8(self):
self.assertRaises(TypeError, unicode_asutf8, [], 0)
# CRASHES unicode_asutf8(NULL, 0)

@unittest.skipIf(_testcapi is None, 'need _testcapi module')
@threading_helper.requires_working_threading()
def test_asutf8_race(self):
    """Hammer PyUnicode_AsUTF8() from many threads to expose data races."""
    from threading import Thread
    asutf8 = _testcapi.unicode_asutf8

    text = "😊"
    expected = b'\xf0\x9f\x98\x8a\0'

    def hammer():
        # Each thread repeatedly encodes the same string; every call must
        # observe a fully written UTF-8 cache.
        for _ in range(1000):
            self.assertEqual(asutf8(text, 5), expected)

    workers = [Thread(target=hammer) for _ in range(10)]
    with threading_helper.start_threads(workers):
        pass


@support.cpython_only
@unittest.skipIf(_testlimitedcapi is None, 'need _testlimitedcapi module')
def test_asutf8andsize(self):
Expand Down
166 changes: 107 additions & 59 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,47 +111,80 @@ NOTE: In the interpreter's initialization phase, some globals are currently
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

#define _PyUnicode_UTF8(op) \
(_PyCompactUnicodeObject_CAST(op)->utf8)
#define PyUnicode_UTF8(op) \
(assert(_PyUnicode_CHECK(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
((char*)(_PyASCIIObject_CAST(op) + 1)) : \
_PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op) \
(_PyCompactUnicodeObject_CAST(op)->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
(assert(_PyUnicode_CHECK(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
_PyASCIIObject_CAST(op)->length : \
_PyUnicode_UTF8_LENGTH(op))
/* Return the cached UTF-8 pointer of a non-compact-ASCII string, or NULL
   if the cache has not been filled yet.  Acquire load: on the
   free-threaded build, a reader that sees a non-NULL pointer also sees
   the bytes written before the paired release store in
   PyUnicode_SET_UTF8(). */
static inline char* _PyUnicode_UTF8(PyObject *op)
{
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8);
}

/* Return the UTF-8 representation of a string: compact ASCII objects
   store it inline immediately after the ASCII header, every other form
   keeps a (possibly NULL) cached copy. */
static inline char* PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    return PyUnicode_IS_COMPACT_ASCII(op)
        ? (char*)(_PyASCIIObject_CAST(op) + 1)
        : _PyUnicode_UTF8(op);
}

/* Publish a UTF-8 cache pointer.  Release store: pairs with the acquire
   load in _PyUnicode_UTF8() so that the cache bytes (and length) written
   before this call become visible together with the pointer. */
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8);
}

/* Length in bytes of the UTF-8 representation.  For compact ASCII the
   UTF-8 form is the data itself, so its length is the string length;
   otherwise use the separately recorded cache length. */
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    return PyUnicode_IS_COMPACT_ASCII(op)
        ? _PyASCIIObject_CAST(op)->length
        : _PyCompactUnicodeObject_CAST(op)->utf8_length;
}

/* Record the UTF-8 cache length.  Plain (non-atomic) store: callers set
   the length *before* publishing the pointer with PyUnicode_SET_UTF8(),
   whose release store makes both visible to readers at once. */
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    _PyCompactUnicodeObject_CAST(op)->utf8_length = length;
}

#define _PyUnicode_LENGTH(op) \
(_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op) \
(_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op) \
(_PyASCIIObject_CAST(op)->hash)
#define _PyUnicode_KIND(op) \
(assert(_PyUnicode_CHECK(op)), \
_PyASCIIObject_CAST(op)->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
(assert(_PyUnicode_CHECK(op)), \
_PyASCIIObject_CAST(op)->length)

/* Read the cached hash; -1 means "not computed yet" (see unicode_hash()).
   Relaxed atomic load — the hash word carries no ordering obligations
   toward other fields. */
static inline Py_hash_t PyUnicode_HASH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    return FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash);
}

/* Store the computed hash.  Relaxed atomic store: concurrent writers all
   compute the same value from the same immutable data, so a race only
   repeats work. */
static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    FT_ATOMIC_STORE_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash, hash);
}

#define _PyUnicode_DATA_ANY(op) \
(_PyUnicodeObject_CAST(op)->data.any)

#define _PyUnicode_SHARE_UTF8(op) \
(assert(_PyUnicode_CHECK(op)), \
assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
(_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the UTF-8 cache pointer aliases the canonical string data
   (one shared allocation).  Callers must not pass compact ASCII, whose
   UTF-8 is always stored inline (enforced by the assert). */
static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));
    return (_PyUnicode_UTF8(op) == PyUnicode_DATA(op));
}

/* true if the Unicode object has an allocated UTF-8 memory block
(not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
((!PyUnicode_IS_COMPACT_ASCII(op) \
&& _PyUnicode_UTF8(op) \
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
/* True if the object owns a separately allocated UTF-8 block: not compact
   ASCII, cache pointer set, and the cache does not alias the string data.
   NOTE(review): _PyUnicode_UTF8() is loaded twice here; this presumably
   relies on no concurrent writer changing the pointer between the two
   acquire loads — confirm against the callers' locking. */
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
{
    return (!PyUnicode_IS_COMPACT_ASCII(op)
            && _PyUnicode_UTF8(op) != NULL
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
}


/* Generic helper macro to convert characters of different types.
from_type and to_type have to be valid type names, begin and end
Expand Down Expand Up @@ -650,7 +683,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
|| kind == PyUnicode_2BYTE_KIND
|| kind == PyUnicode_4BYTE_KIND);
CHECK(ascii->state.ascii == 0);
CHECK(compact->utf8 != data);
CHECK(_PyUnicode_UTF8(op) != data);
}
else {
PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
Expand All @@ -662,16 +695,17 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
CHECK(ascii->state.compact == 0);
CHECK(data != NULL);
if (ascii->state.ascii) {
CHECK(compact->utf8 == data);
CHECK(_PyUnicode_UTF8(op) == data);
CHECK(compact->utf8_length == ascii->length);
}
else {
CHECK(compact->utf8 != data);
CHECK(_PyUnicode_UTF8(op) != data);
}
}

if (compact->utf8 == NULL)
#ifndef Py_GIL_DISABLED
if (_PyUnicode_UTF8(op) == NULL)
CHECK(compact->utf8_length == 0);
#endif
}

/* check that the best kind is used: O(n) operation */
Expand Down Expand Up @@ -1115,8 +1149,8 @@ resize_compact(PyObject *unicode, Py_ssize_t length)

if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
PyMem_Free(_PyUnicode_UTF8(unicode));
_PyUnicode_UTF8(unicode) = NULL;
_PyUnicode_UTF8_LENGTH(unicode) = 0;
PyUnicode_SET_UTF8_LENGTH(unicode, 0);
PyUnicode_SET_UTF8(unicode, NULL);
}
#ifdef Py_TRACE_REFS
_Py_ForgetReference(unicode);
Expand Down Expand Up @@ -1169,8 +1203,8 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
{
PyMem_Free(_PyUnicode_UTF8(unicode));
_PyUnicode_UTF8(unicode) = NULL;
_PyUnicode_UTF8_LENGTH(unicode) = 0;
PyUnicode_SET_UTF8_LENGTH(unicode, 0);
PyUnicode_SET_UTF8(unicode, NULL);
}

data = (PyObject *)PyObject_Realloc(data, new_size);
Expand All @@ -1180,8 +1214,8 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
}
_PyUnicode_DATA_ANY(unicode) = data;
if (share_utf8) {
_PyUnicode_UTF8(unicode) = data;
_PyUnicode_UTF8_LENGTH(unicode) = length;
PyUnicode_SET_UTF8_LENGTH(unicode, length);
PyUnicode_SET_UTF8(unicode, data);
}
_PyUnicode_LENGTH(unicode) = length;
PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Expand Down Expand Up @@ -1411,12 +1445,12 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,

assert(unicode != NULL);
assert(_PyUnicode_CHECK(unicode));
assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
ucs4_out = PyUnicode_4BYTE_DATA(unicode);

for (iter = begin; iter < end; ) {
assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
_PyUnicode_GET_LENGTH(unicode)));
PyUnicode_GET_LENGTH(unicode)));
if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
&& (iter+1) < end
&& Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Expand All @@ -1430,7 +1464,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
}
}
assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
_PyUnicode_GET_LENGTH(unicode)));
PyUnicode_GET_LENGTH(unicode)));

}
#endif
Expand Down Expand Up @@ -1801,7 +1835,7 @@ unicode_modifiable(PyObject *unicode)
assert(_PyUnicode_CHECK(unicode));
if (Py_REFCNT(unicode) != 1)
return 0;
if (FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(unicode)) != -1)
if (PyUnicode_HASH(unicode) != -1)
return 0;
if (PyUnicode_CHECK_INTERNED(unicode))
return 0;
Expand Down Expand Up @@ -4052,6 +4086,21 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)

static int unicode_fill_utf8(PyObject *unicode);


/* Ensure the string's UTF-8 cache is filled, creating it on first use.
   Double-checked pattern: the cheap unlocked read skips the critical
   section once the cache exists (the acquire load inside
   PyUnicode_UTF8() makes the published bytes visible), and the re-check
   under the per-object critical section serializes concurrent fillers so
   only one calls unicode_fill_utf8().
   Returns 0 on success, -1 on failure (from unicode_fill_utf8()). */
static int
unicode_ensure_utf8(PyObject *unicode)
{
    int err = 0;
    if (PyUnicode_UTF8(unicode) == NULL) {
        Py_BEGIN_CRITICAL_SECTION(unicode);
        /* Re-check under the lock: another thread may have filled the
           cache between the unlocked test and lock acquisition. */
        if (PyUnicode_UTF8(unicode) == NULL) {
            err = unicode_fill_utf8(unicode);
        }
        Py_END_CRITICAL_SECTION();
    }
    return err;
}

const char *
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
{
Expand All @@ -4063,13 +4112,11 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
return NULL;
}

if (PyUnicode_UTF8(unicode) == NULL) {
if (unicode_fill_utf8(unicode) == -1) {
if (psize) {
*psize = -1;
}
return NULL;
if (unicode_ensure_utf8(unicode) == -1) {
if (psize) {
*psize = -1;
}
return NULL;
}

if (psize) {
Expand Down Expand Up @@ -5401,6 +5448,7 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
static int
unicode_fill_utf8(PyObject *unicode)
{
_Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
assert(!PyUnicode_IS_ASCII(unicode));

Expand Down Expand Up @@ -5442,10 +5490,10 @@ unicode_fill_utf8(PyObject *unicode)
PyErr_NoMemory();
return -1;
}
_PyUnicode_UTF8(unicode) = cache;
_PyUnicode_UTF8_LENGTH(unicode) = len;
memcpy(cache, start, len);
cache[len] = '\0';
PyUnicode_SET_UTF8_LENGTH(unicode, len);
PyUnicode_SET_UTF8(unicode, cache);
_PyBytesWriter_Dealloc(&writer);
return 0;
}
Expand Down Expand Up @@ -10996,9 +11044,9 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
return 0;
}

Py_hash_t right_hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(right_uni));
Py_hash_t right_hash = PyUnicode_HASH(right_uni);
assert(right_hash != -1);
Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(left));
Py_hash_t hash = PyUnicode_HASH(left);
if (hash != -1 && hash != right_hash) {
return 0;
}
Expand Down Expand Up @@ -11484,14 +11532,14 @@ unicode_hash(PyObject *self)
#ifdef Py_DEBUG
assert(_Py_HashSecret_Initialized);
#endif
Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(self));
Py_hash_t hash = PyUnicode_HASH(self);
if (hash != -1) {
return hash;
}
x = _Py_HashBytes(PyUnicode_DATA(self),
PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));

FT_ATOMIC_STORE_SSIZE_RELAXED(_PyUnicode_HASH(self), x);
PyUnicode_SET_HASH(self, x);
return x;
}

Expand Down Expand Up @@ -14888,8 +14936,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
_PyUnicode_STATE(self).compact = 0;
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
_PyUnicode_STATE(self).statically_allocated = 0;
_PyUnicode_UTF8_LENGTH(self) = 0;
_PyUnicode_UTF8(self) = NULL;
PyUnicode_SET_UTF8_LENGTH(self, 0);
PyUnicode_SET_UTF8(self, NULL);
_PyUnicode_DATA_ANY(self) = NULL;

share_utf8 = 0;
Expand Down Expand Up @@ -14919,8 +14967,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)

_PyUnicode_DATA_ANY(self) = data;
if (share_utf8) {
_PyUnicode_UTF8_LENGTH(self) = length;
_PyUnicode_UTF8(self) = data;
PyUnicode_SET_UTF8_LENGTH(self, length);
PyUnicode_SET_UTF8(self, data);
}

memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
Expand Down
Loading