Skip to content

Commit fa6c48e

Browse files
[3.13] gh-128013: fix data race in PyUnicode_AsUTF8AndSize on free-threading (#128021) (#128417)
1 parent 24cddab commit fa6c48e

File tree

2 files changed

+126
-60
lines changed

2 files changed

+126
-60
lines changed

Lib/test/test_capi/test_unicode.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import unittest
22
import sys
33
from test import support
4-
from test.support import import_helper
4+
from test.support import threading_helper, import_helper
55

66
try:
77
import _testcapi
@@ -959,6 +959,24 @@ def test_asutf8(self):
959959
self.assertRaises(TypeError, unicode_asutf8, [], 0)
960960
# CRASHES unicode_asutf8(NULL, 0)
961961

962+
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
963+
@threading_helper.requires_working_threading()
964+
def test_asutf8_race(self):
965+
"""Test that there's no race condition in PyUnicode_AsUTF8()"""
966+
unicode_asutf8 = _testcapi.unicode_asutf8
967+
from threading import Thread
968+
969+
data = "😊"
970+
971+
def worker():
972+
for _ in range(1000):
973+
self.assertEqual(unicode_asutf8(data, 5), b'\xf0\x9f\x98\x8a\0')
974+
975+
threads = [Thread(target=worker) for _ in range(10)]
976+
with threading_helper.start_threads(threads):
977+
pass
978+
979+
962980
@support.cpython_only
963981
@unittest.skipIf(_testlimitedcapi is None, 'need _testlimitedcapi module')
964982
def test_asutf8andsize(self):

Objects/unicodeobject.c

+107-59
Original file line numberDiff line numberDiff line change
@@ -111,47 +111,80 @@ NOTE: In the interpreter's initialization phase, some globals are currently
111111
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
112112
#endif
113113

114-
#define _PyUnicode_UTF8(op) \
115-
(_PyCompactUnicodeObject_CAST(op)->utf8)
116-
#define PyUnicode_UTF8(op) \
117-
(assert(_PyUnicode_CHECK(op)), \
118-
PyUnicode_IS_COMPACT_ASCII(op) ? \
119-
((char*)(_PyASCIIObject_CAST(op) + 1)) : \
120-
_PyUnicode_UTF8(op))
121-
#define _PyUnicode_UTF8_LENGTH(op) \
122-
(_PyCompactUnicodeObject_CAST(op)->utf8_length)
123-
#define PyUnicode_UTF8_LENGTH(op) \
124-
(assert(_PyUnicode_CHECK(op)), \
125-
PyUnicode_IS_COMPACT_ASCII(op) ? \
126-
_PyASCIIObject_CAST(op)->length : \
127-
_PyUnicode_UTF8_LENGTH(op))
114+
static inline char* _PyUnicode_UTF8(PyObject *op)
115+
{
116+
return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8);
117+
}
118+
119+
static inline char* PyUnicode_UTF8(PyObject *op)
120+
{
121+
assert(_PyUnicode_CHECK(op));
122+
if (PyUnicode_IS_COMPACT_ASCII(op)) {
123+
return ((char*)(_PyASCIIObject_CAST(op) + 1));
124+
}
125+
else {
126+
return _PyUnicode_UTF8(op);
127+
}
128+
}
129+
130+
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
131+
{
132+
FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8);
133+
}
134+
135+
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
136+
{
137+
assert(_PyUnicode_CHECK(op));
138+
if (PyUnicode_IS_COMPACT_ASCII(op)) {
139+
return _PyASCIIObject_CAST(op)->length;
140+
}
141+
else {
142+
return _PyCompactUnicodeObject_CAST(op)->utf8_length;
143+
}
144+
}
145+
146+
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
147+
{
148+
_PyCompactUnicodeObject_CAST(op)->utf8_length = length;
149+
}
128150

129151
#define _PyUnicode_LENGTH(op) \
130152
(_PyASCIIObject_CAST(op)->length)
131153
#define _PyUnicode_STATE(op) \
132154
(_PyASCIIObject_CAST(op)->state)
133155
#define _PyUnicode_HASH(op) \
134156
(_PyASCIIObject_CAST(op)->hash)
135-
#define _PyUnicode_KIND(op) \
136-
(assert(_PyUnicode_CHECK(op)), \
137-
_PyASCIIObject_CAST(op)->state.kind)
138-
#define _PyUnicode_GET_LENGTH(op) \
139-
(assert(_PyUnicode_CHECK(op)), \
140-
_PyASCIIObject_CAST(op)->length)
157+
158+
static inline Py_hash_t PyUnicode_HASH(PyObject *op)
159+
{
160+
assert(_PyUnicode_CHECK(op));
161+
return FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash);
162+
}
163+
164+
static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
165+
{
166+
FT_ATOMIC_STORE_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash, hash);
167+
}
168+
141169
#define _PyUnicode_DATA_ANY(op) \
142170
(_PyUnicodeObject_CAST(op)->data.any)
143171

144-
#define _PyUnicode_SHARE_UTF8(op) \
145-
(assert(_PyUnicode_CHECK(op)), \
146-
assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
147-
(_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
172+
static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
173+
{
174+
assert(_PyUnicode_CHECK(op));
175+
assert(!PyUnicode_IS_COMPACT_ASCII(op));
176+
return (_PyUnicode_UTF8(op) == PyUnicode_DATA(op));
177+
}
148178

149179
/* true if the Unicode object has an allocated UTF-8 memory block
150180
(not shared with other data) */
151-
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
152-
((!PyUnicode_IS_COMPACT_ASCII(op) \
153-
&& _PyUnicode_UTF8(op) \
154-
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
181+
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
182+
{
183+
return (!PyUnicode_IS_COMPACT_ASCII(op)
184+
&& _PyUnicode_UTF8(op) != NULL
185+
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
186+
}
187+
155188

156189
/* Generic helper macro to convert characters of different types.
157190
from_type and to_type have to be valid type names, begin and end
@@ -650,7 +683,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
650683
|| kind == PyUnicode_2BYTE_KIND
651684
|| kind == PyUnicode_4BYTE_KIND);
652685
CHECK(ascii->state.ascii == 0);
653-
CHECK(compact->utf8 != data);
686+
CHECK(_PyUnicode_UTF8(op) != data);
654687
}
655688
else {
656689
PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
@@ -662,16 +695,17 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
662695
CHECK(ascii->state.compact == 0);
663696
CHECK(data != NULL);
664697
if (ascii->state.ascii) {
665-
CHECK(compact->utf8 == data);
698+
CHECK(_PyUnicode_UTF8(op) == data);
666699
CHECK(compact->utf8_length == ascii->length);
667700
}
668701
else {
669-
CHECK(compact->utf8 != data);
702+
CHECK(_PyUnicode_UTF8(op) != data);
670703
}
671704
}
672-
673-
if (compact->utf8 == NULL)
705+
#ifndef Py_GIL_DISABLED
706+
if (_PyUnicode_UTF8(op) == NULL)
674707
CHECK(compact->utf8_length == 0);
708+
#endif
675709
}
676710

677711
/* check that the best kind is used: O(n) operation */
@@ -1115,8 +1149,8 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
11151149

11161150
if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
11171151
PyMem_Free(_PyUnicode_UTF8(unicode));
1118-
_PyUnicode_UTF8(unicode) = NULL;
1119-
_PyUnicode_UTF8_LENGTH(unicode) = 0;
1152+
PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1153+
PyUnicode_SET_UTF8(unicode, NULL);
11201154
}
11211155
#ifdef Py_TRACE_REFS
11221156
_Py_ForgetReference(unicode);
@@ -1169,8 +1203,8 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
11691203
if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
11701204
{
11711205
PyMem_Free(_PyUnicode_UTF8(unicode));
1172-
_PyUnicode_UTF8(unicode) = NULL;
1173-
_PyUnicode_UTF8_LENGTH(unicode) = 0;
1206+
PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1207+
PyUnicode_SET_UTF8(unicode, NULL);
11741208
}
11751209

11761210
data = (PyObject *)PyObject_Realloc(data, new_size);
@@ -1180,8 +1214,8 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
11801214
}
11811215
_PyUnicode_DATA_ANY(unicode) = data;
11821216
if (share_utf8) {
1183-
_PyUnicode_UTF8(unicode) = data;
1184-
_PyUnicode_UTF8_LENGTH(unicode) = length;
1217+
PyUnicode_SET_UTF8_LENGTH(unicode, length);
1218+
PyUnicode_SET_UTF8(unicode, data);
11851219
}
11861220
_PyUnicode_LENGTH(unicode) = length;
11871221
PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
@@ -1411,12 +1445,12 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
14111445

14121446
assert(unicode != NULL);
14131447
assert(_PyUnicode_CHECK(unicode));
1414-
assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1448+
assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
14151449
ucs4_out = PyUnicode_4BYTE_DATA(unicode);
14161450

14171451
for (iter = begin; iter < end; ) {
14181452
assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1419-
_PyUnicode_GET_LENGTH(unicode)));
1453+
PyUnicode_GET_LENGTH(unicode)));
14201454
if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
14211455
&& (iter+1) < end
14221456
&& Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
@@ -1430,7 +1464,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
14301464
}
14311465
}
14321466
assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1433-
_PyUnicode_GET_LENGTH(unicode)));
1467+
PyUnicode_GET_LENGTH(unicode)));
14341468

14351469
}
14361470
#endif
@@ -1801,7 +1835,7 @@ unicode_modifiable(PyObject *unicode)
18011835
assert(_PyUnicode_CHECK(unicode));
18021836
if (Py_REFCNT(unicode) != 1)
18031837
return 0;
1804-
if (FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(unicode)) != -1)
1838+
if (PyUnicode_HASH(unicode) != -1)
18051839
return 0;
18061840
if (PyUnicode_CHECK_INTERNED(unicode))
18071841
return 0;
@@ -4052,6 +4086,21 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
40524086

40534087
static int unicode_fill_utf8(PyObject *unicode);
40544088

4089+
4090+
static int
4091+
unicode_ensure_utf8(PyObject *unicode)
4092+
{
4093+
int err = 0;
4094+
if (PyUnicode_UTF8(unicode) == NULL) {
4095+
Py_BEGIN_CRITICAL_SECTION(unicode);
4096+
if (PyUnicode_UTF8(unicode) == NULL) {
4097+
err = unicode_fill_utf8(unicode);
4098+
}
4099+
Py_END_CRITICAL_SECTION();
4100+
}
4101+
return err;
4102+
}
4103+
40554104
const char *
40564105
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
40574106
{
@@ -4063,13 +4112,11 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
40634112
return NULL;
40644113
}
40654114

4066-
if (PyUnicode_UTF8(unicode) == NULL) {
4067-
if (unicode_fill_utf8(unicode) == -1) {
4068-
if (psize) {
4069-
*psize = -1;
4070-
}
4071-
return NULL;
4115+
if (unicode_ensure_utf8(unicode) == -1) {
4116+
if (psize) {
4117+
*psize = -1;
40724118
}
4119+
return NULL;
40734120
}
40744121

40754122
if (psize) {
@@ -5401,6 +5448,7 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
54015448
static int
54025449
unicode_fill_utf8(PyObject *unicode)
54035450
{
5451+
_Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
54045452
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
54055453
assert(!PyUnicode_IS_ASCII(unicode));
54065454

@@ -5442,10 +5490,10 @@ unicode_fill_utf8(PyObject *unicode)
54425490
PyErr_NoMemory();
54435491
return -1;
54445492
}
5445-
_PyUnicode_UTF8(unicode) = cache;
5446-
_PyUnicode_UTF8_LENGTH(unicode) = len;
54475493
memcpy(cache, start, len);
54485494
cache[len] = '\0';
5495+
PyUnicode_SET_UTF8_LENGTH(unicode, len);
5496+
PyUnicode_SET_UTF8(unicode, cache);
54495497
_PyBytesWriter_Dealloc(&writer);
54505498
return 0;
54515499
}
@@ -10996,9 +11044,9 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
1099611044
return 0;
1099711045
}
1099811046

10999-
Py_hash_t right_hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(right_uni));
11047+
Py_hash_t right_hash = PyUnicode_HASH(right_uni);
1100011048
assert(right_hash != -1);
11001-
Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(left));
11049+
Py_hash_t hash = PyUnicode_HASH(left);
1100211050
if (hash != -1 && hash != right_hash) {
1100311051
return 0;
1100411052
}
@@ -11484,14 +11532,14 @@ unicode_hash(PyObject *self)
1148411532
#ifdef Py_DEBUG
1148511533
assert(_Py_HashSecret_Initialized);
1148611534
#endif
11487-
Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(self));
11535+
Py_hash_t hash = PyUnicode_HASH(self);
1148811536
if (hash != -1) {
1148911537
return hash;
1149011538
}
1149111539
x = _Py_HashBytes(PyUnicode_DATA(self),
1149211540
PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
1149311541

11494-
FT_ATOMIC_STORE_SSIZE_RELAXED(_PyUnicode_HASH(self), x);
11542+
PyUnicode_SET_HASH(self, x);
1149511543
return x;
1149611544
}
1149711545

@@ -14888,8 +14936,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
1488814936
_PyUnicode_STATE(self).compact = 0;
1488914937
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
1489014938
_PyUnicode_STATE(self).statically_allocated = 0;
14891-
_PyUnicode_UTF8_LENGTH(self) = 0;
14892-
_PyUnicode_UTF8(self) = NULL;
14939+
PyUnicode_SET_UTF8_LENGTH(self, 0);
14940+
PyUnicode_SET_UTF8(self, NULL);
1489314941
_PyUnicode_DATA_ANY(self) = NULL;
1489414942

1489514943
share_utf8 = 0;
@@ -14919,8 +14967,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
1491914967

1492014968
_PyUnicode_DATA_ANY(self) = data;
1492114969
if (share_utf8) {
14922-
_PyUnicode_UTF8_LENGTH(self) = length;
14923-
_PyUnicode_UTF8(self) = data;
14970+
PyUnicode_SET_UTF8_LENGTH(self, length);
14971+
PyUnicode_SET_UTF8(self, data);
1492414972
}
1492514973

1492614974
memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));

0 commit comments

Comments
 (0)