Skip to content

Commit 9833bb9

Browse files
authored
bpo-46845: Reduce dict size when all keys are Unicode (GH-31564)
1 parent 21099fc commit 9833bb9

File tree

9 files changed

+870
-477
lines changed

9 files changed

+870
-477
lines changed

Doc/whatsnew/3.11.rst

+5
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,11 @@ Optimizations
404404
larger *k*).
405405
(Contributed by Serhiy Storchaka in :issue:`37295`.)
406406

407+
* Dicts don't store hash values when all inserted keys are Unicode objects.
408+
This reduces dict size. For example, ``sys.getsizeof(dict.fromkeys("abcdefg"))``
409+
shrinks from 352 bytes to 272 bytes on 64-bit platforms.
410+
(Contributed by Inada Naoki in :issue:`46845`.)
411+
407412

408413
CPython bytecode changes
409414
========================

Include/internal/pycore_dict.h

+15-2
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ typedef struct {
4343
PyObject *me_value; /* This field is only meaningful for combined tables */
4444
} PyDictKeyEntry;
4545

46+
typedef struct {
47+
PyObject *me_key; /* The key must be Unicode and have hash. */
48+
PyObject *me_value; /* This field is only meaningful for combined tables */
49+
} PyDictUnicodeEntry;
50+
4651
extern PyDictKeysObject *_PyDict_NewKeysForClass(void);
4752
extern PyObject *_PyDict_FromKeys(PyObject *, PyObject *, PyObject *);
4853

@@ -70,6 +75,7 @@ extern PyObject *_PyDict_Pop_KnownHash(PyObject *, PyObject *, Py_hash_t, PyObje
7075
#define DKIX_EMPTY (-1)
7176
#define DKIX_DUMMY (-2) /* Used internally */
7277
#define DKIX_ERROR (-3)
78+
#define DKIX_KEY_CHANGED (-4) /* Used internally */
7379

7480
typedef enum {
7581
DICT_KEYS_GENERAL = 0,
@@ -114,7 +120,7 @@ struct _dictkeysobject {
114120
Dynamically sized, SIZEOF_VOID_P is minimum. */
115121
char dk_indices[]; /* char is required to avoid strict aliasing. */
116122

117-
/* "PyDictKeyEntry dk_entries[dk_usable];" array follows:
123+
/* "PyDictKeyEntry or PyDictUnicodeEntry dk_entries[USABLE_FRACTION(DK_SIZE(dk))];" array follows:
118124
see the DK_ENTRIES() macro */
119125
};
120126

@@ -148,13 +154,20 @@ struct _dictvalues {
148154
2 : sizeof(int32_t))
149155
#endif
150156
#define DK_ENTRIES(dk) \
151-
((PyDictKeyEntry*)(&((int8_t*)((dk)->dk_indices))[(size_t)1 << (dk)->dk_log2_index_bytes]))
157+
(assert((dk)->dk_kind == DICT_KEYS_GENERAL), (PyDictKeyEntry*)(&((int8_t*)((dk)->dk_indices))[(size_t)1 << (dk)->dk_log2_index_bytes])) /* asserts the keys object is the general kind, then yields the entry array that starts right after the dk_indices bytes */
158+
#define DK_UNICODE_ENTRIES(dk) \
159+
(assert((dk)->dk_kind != DICT_KEYS_GENERAL), (PyDictUnicodeEntry*)(&((int8_t*)((dk)->dk_indices))[(size_t)1 << (dk)->dk_log2_index_bytes])) /* asserts the keys object is not the general kind, then yields the hash-less entry array that starts right after the dk_indices bytes */
160+
#define DK_IS_UNICODE(dk) ((dk)->dk_kind != DICT_KEYS_GENERAL)
152161

153162
extern uint64_t _pydict_global_version;
154163

155164
#define DICT_NEXT_VERSION() (++_pydict_global_version)
156165

157166
extern PyObject *_PyObject_MakeDictFromInstanceAttributes(PyObject *obj, PyDictValues *values);
167+
extern PyObject *_PyDict_FromItems(
168+
PyObject *const *keys, Py_ssize_t keys_offset,
169+
PyObject *const *values, Py_ssize_t values_offset,
170+
Py_ssize_t length);
158171

159172
static inline void
160173
_PyDictValues_AddToInsertionOrder(PyDictValues *values, Py_ssize_t ix)

Lib/test/test_sys.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -1346,8 +1346,12 @@ def inner():
13461346
check({}.__iter__, size('2P'))
13471347
# empty dict
13481348
check({}, size('nQ2P'))
1349-
# dict
1350-
check({"a": 1}, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 8 + (8*2//3)*calcsize('n2P'))
1349+
# dict (string key)
1350+
check({"a": 1}, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 8 + (8*2//3)*calcsize('2P'))
1351+
longdict = {str(i): i for i in range(8)}
1352+
check(longdict, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 16 + (16*2//3)*calcsize('2P'))
1353+
# dict (non-string key)
1354+
check({1: 1}, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 8 + (8*2//3)*calcsize('n2P'))
13511355
longdict = {1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8}
13521356
check(longdict, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 16 + (16*2//3)*calcsize('n2P'))
13531357
# dictionary-keyview
@@ -1506,14 +1510,14 @@ def delx(self): del self.__x
15061510
)
15071511
class newstyleclass(object): pass
15081512
# Separate block for PyDictKeysObject with 8 keys and 5 entries
1509-
check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("n2P"))
1513+
check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("2P"))
15101514
# dict with shared keys
15111515
[newstyleclass() for _ in range(100)]
15121516
check(newstyleclass().__dict__, size('nQ2P') + self.P)
15131517
o = newstyleclass()
15141518
o.a = o.b = o.c = o.d = o.e = o.f = o.g = o.h = 1
15151519
# Separate block for PyDictKeysObject with 16 keys and 10 entries
1516-
check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("n2P"))
1520+
check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("2P"))
15171521
# dict with shared keys
15181522
check(newstyleclass().__dict__, size('nQ2P') + self.P)
15191523
# unicode
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Reduce dict size by omitting hash values from the hash table when all inserted
2+
keys are Unicode. For example, ``sys.getsizeof(dict.fromkeys("abcdefg"))``
3+
shrinks from 352 bytes to 272 bytes on 64-bit platforms.

Objects/call.c

+2-17
Original file line numberDiff line numberDiff line change
@@ -934,26 +934,11 @@ PyObject *
934934
_PyStack_AsDict(PyObject *const *values, PyObject *kwnames)
935935
{
936936
Py_ssize_t nkwargs;
937-
PyObject *kwdict;
938-
Py_ssize_t i;
939937

940938
assert(kwnames != NULL);
941939
nkwargs = PyTuple_GET_SIZE(kwnames);
942-
kwdict = _PyDict_NewPresized(nkwargs);
943-
if (kwdict == NULL) {
944-
return NULL;
945-
}
946-
947-
for (i = 0; i < nkwargs; i++) {
948-
PyObject *key = PyTuple_GET_ITEM(kwnames, i);
949-
PyObject *value = *values++;
950-
/* If key already exists, replace it with the new value */
951-
if (PyDict_SetItem(kwdict, key, value)) {
952-
Py_DECREF(kwdict);
953-
return NULL;
954-
}
955-
}
956-
return kwdict;
940+
return _PyDict_FromItems(&PyTuple_GET_ITEM(kwnames, 0), 1,
941+
values, 1, nkwargs);
957942
}
958943

959944

Objects/dictnotes.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,8 @@ A values array
7070
Tunable Dictionary Parameters
7171
-----------------------------
7272

73-
See comments for PyDict_MINSIZE_SPLIT, PyDict_MINSIZE_COMBINED,
74-
USABLE_FRACTION and GROWTH_RATE in dictobject.c
73+
See comments for PyDict_MINSIZE, USABLE_FRACTION and GROWTH_RATE in
74+
dictobject.c
7575

7676
Tune-ups should be measured across a broad range of applications and
7777
use cases. A change to any parameter will help in some situations and

0 commit comments

Comments
 (0)