Skip to content

Commit b7e73f2

Browse files
committed
Update to mimalloc 2.0.5
1 parent 7773e00 commit b7e73f2

19 files changed

+685
-368
lines changed

Include/internal/pycore_mimalloc.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
*/
5959
#if 1
6060
#define _mi_abandoned_await_readers _Py__mi_abandoned_await_readers
61+
#define _mi_abandoned_collect _Py__mi_abandoned_collect
6162
#define _mi_abandoned_reclaim_all _Py__mi_abandoned_reclaim_all
6263
#define mi_aligned_alloc _Py_mi_aligned_alloc
6364
#define mi_aligned_offset_recalloc _Py_mi_aligned_offset_recalloc
@@ -201,7 +202,6 @@
201202
#define _mi_os_reset _Py__mi_os_reset
202203
#define _mi_os_shrink _Py__mi_os_shrink
203204
#define _mi_os_unprotect _Py__mi_os_unprotect
204-
#define _mi_os_unreset _Py__mi_os_unreset
205205
#define _mi_page_abandon _Py__mi_page_abandon
206206
#define _mi_page_empty _Py__mi_page_empty
207207
#define _mi_page_free_collect _Py__mi_page_free_collect
@@ -224,6 +224,7 @@
224224
#define mi_realloc_aligned_at _Py_mi_realloc_aligned_at
225225
#define mi_realloc_aligned _Py_mi_realloc_aligned
226226
#define mi_reallocarray _Py_mi_reallocarray
227+
#define mi_reallocarr _Py_mi_reallocarr
227228
#define mi_reallocf _Py_mi_reallocf
228229
#define mi_reallocn _Py_mi_reallocn
229230
#define mi_realloc _Py_mi_realloc
@@ -241,6 +242,7 @@
241242
#define mi_rezalloc_aligned_at _Py_mi_rezalloc_aligned_at
242243
#define mi_rezalloc_aligned _Py_mi_rezalloc_aligned
243244
#define mi_rezalloc _Py_mi_rezalloc
245+
#define _mi_segment_cache_collect _Py__mi_segment_cache_collect
244246
#define _mi_segment_cache_pop _Py__mi_segment_cache_pop
245247
#define _mi_segment_cache_push _Py__mi_segment_cache_push
246248
#define _mi_segment_huge_page_free _Py__mi_segment_huge_page_free

Include/mimalloc/mimalloc-internal.h

Lines changed: 93 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/* ----------------------------------------------------------------------------
2-
Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
2+
Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
33
This is free software; you can redistribute it and/or modify it under the
44
terms of the MIT license. A copy of the license can be found in the file
55
"LICENSE" at the root of this distribution.
@@ -43,6 +43,11 @@ terms of the MIT license. A copy of the license can be found in the file
4343
#define mi_decl_externc
4444
#endif
4545

46+
#if !defined(_WIN32) && !defined(__wasi__)
47+
#define MI_USE_PTHREADS
48+
#include <pthread.h>
49+
#endif
50+
4651
// "options.c"
4752
void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message);
4853
void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...);
@@ -78,7 +83,7 @@ bool _mi_os_unprotect(void* addr, size_t size);
7883
bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stats);
7984
bool _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
8085
bool _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
81-
bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
86+
// bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
8287
size_t _mi_os_good_alloc_size(size_t size);
8388
bool _mi_os_has_overcommit(void);
8489

@@ -90,6 +95,7 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool is_committed,
9095
// "segment-cache.c"
9196
void* _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
9297
bool _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld);
98+
void _mi_segment_cache_collect(bool force, mi_os_tld_t* tld);
9399
void _mi_segment_map_allocated_at(const mi_segment_t* segment);
94100
void _mi_segment_map_freed_at(const mi_segment_t* segment);
95101

@@ -104,6 +110,7 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi
104110
uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page
105111
void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
106112
void _mi_abandoned_await_readers(void);
113+
void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld);
107114

108115

109116

@@ -157,8 +164,8 @@ bool _mi_page_is_valid(mi_page_t* page);
157164
// ------------------------------------------------------
158165

159166
#if defined(__GNUC__) || defined(__clang__)
160-
#define mi_unlikely(x) __builtin_expect((x),0)
161-
#define mi_likely(x) __builtin_expect((x),1)
167+
#define mi_unlikely(x) __builtin_expect(!!(x),false)
168+
#define mi_likely(x) __builtin_expect(!!(x),true)
162169
#else
163170
#define mi_unlikely(x) (x)
164171
#define mi_likely(x) (x)
@@ -263,11 +270,6 @@ static inline size_t _mi_wsize_from_size(size_t size) {
263270
return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
264271
}
265272

266-
// Does malloc satisfy the alignment constraints already?
267-
static inline bool mi_malloc_satisfies_alignment(size_t alignment, size_t size) {
268-
return (alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2)));
269-
}
270-
271273
// Overflow detecting multiply
272274
#if __has_builtin(__builtin_umul_overflow) || (defined(__GNUC__) && (__GNUC__ >= 5))
273275
#include <limits.h> // UINT_MAX, ULONG_MAX
@@ -318,7 +320,7 @@ We try to circumvent this in an efficient way:
318320
- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
319321
loader itself calls `malloc` even before the modules are initialized.
320322
- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
321-
- DragonFly: the uniqueid use is buggy but kept for reference.
323+
- DragonFly: defaults are working but seem slow compared to FreeBSD (see PR #323)
322324
------------------------------------------------------------------------------------------- */
323325

324326
extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap
@@ -335,16 +337,18 @@ mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing hea
335337
// use end bytes of a name; goes wrong if anyone uses names > 23 characters (pthread specifies 16)
336338
// see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
337339
#define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24)
338-
#elif defined(__DragonFly__)
339-
#warning "mimalloc is not working correctly on DragonFly yet."
340-
//#define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
340+
// #elif defined(__DragonFly__)
341+
// #warning "mimalloc is not working correctly on DragonFly yet."
342+
// #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
343+
#elif defined(__ANDROID__)
344+
// See issue #381
345+
#define MI_TLS_PTHREAD
341346
#endif
342347
#endif
343348

344349
#if defined(MI_TLS_SLOT)
345350
static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept; // forward declaration
346351
#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
347-
#include <pthread.h>
348352
static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
349353
pthread_t self = pthread_self();
350354
#if defined(__DragonFly__)
@@ -356,7 +360,6 @@ static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
356360
return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
357361
}
358362
#elif defined(MI_TLS_PTHREAD)
359-
#include <pthread.h>
360363
extern pthread_key_t _mi_heap_default_key;
361364
#endif
362365

@@ -366,11 +369,15 @@ extern pthread_key_t _mi_heap_default_key;
366369
// However, on the Apple M1 we do use the address of this variable as the unique thread-id (issue #356).
367370
extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
368371

369-
370372
static inline mi_heap_t* mi_get_default_heap(void) {
371373
#if defined(MI_TLS_SLOT)
372374
mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT);
373-
if (mi_unlikely(heap == NULL)) { heap = (mi_heap_t*)&_mi_heap_empty; } //_mi_heap_empty_get(); }
375+
if (mi_unlikely(heap == NULL)) {
376+
#ifdef __GNUC__
377+
__asm(""); // prevent conditional load of the address of _mi_heap_empty
378+
#endif
379+
heap = (mi_heap_t*)&_mi_heap_empty;
380+
}
374381
return heap;
375382
#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
376383
mi_heap_t* heap = *mi_tls_pthread_heap_slot();
@@ -785,90 +792,105 @@ static inline size_t _mi_os_numa_node_count(void) {
785792
// -------------------------------------------------------------------
786793
// Getting the thread id should be performant as it is called in the
787794
// fast path of `_mi_free` and we specialize for various platforms.
795+
// We only require _mi_thread_id() to return a unique id for each thread.
788796
// -------------------------------------------------------------------
789797
#if defined(_WIN32)
798+
790799
#define WIN32_LEAN_AND_MEAN
791800
#include <windows.h>
792801
static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
793802
// Windows: works on Intel and ARM in both 32- and 64-bit
794803
return (uintptr_t)NtCurrentTeb();
795804
}
796805

797-
#elif defined(__GNUC__) && \
798-
(defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))
806+
// We use assembly for a fast thread id on the main platforms. The TLS layout depends on
807+
// both the OS and libc implementation so we use specific tests for each main platform.
808+
// If you test on another platform and it works please send a PR :-)
809+
// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
810+
#elif defined(__GNUC__) && ( \
811+
(defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
812+
|| (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__))) \
813+
|| (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
814+
|| (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
815+
|| (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
816+
)
799817

800-
// TLS register on x86 is in the FS or GS register, see: https://akkadia.org/drepper/tls.pdf
801818
static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept {
802819
void* res;
803820
const size_t ofs = (slot*sizeof(void*));
804-
#if defined(__i386__)
805-
__asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // 32-bit always uses GS
806-
#elif defined(__APPLE__) && defined(__x86_64__)
807-
__asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS
808-
#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
809-
__asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI
810-
#elif defined(__x86_64__)
811-
__asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS
812-
#elif defined(__arm__)
813-
void** tcb; MI_UNUSED(ofs);
814-
__asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
815-
res = tcb[slot];
816-
#elif defined(__aarch64__)
817-
void** tcb; MI_UNUSED(ofs);
818-
#if defined(__APPLE__) // M1, issue #343
819-
__asm__ volatile ("mrs %0, tpidrro_el0" : "=r" (tcb));
820-
tcb = (void**)((uintptr_t)tcb & ~0x07UL); // clear lower 3 bits
821-
#else
822-
__asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
821+
#if defined(__i386__)
822+
__asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86 32-bit always uses GS
823+
#elif defined(__APPLE__) && defined(__x86_64__)
824+
__asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS
825+
#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
826+
__asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI
827+
#elif defined(__x86_64__)
828+
__asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS
829+
#elif defined(__arm__)
830+
void** tcb; MI_UNUSED(ofs);
831+
__asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
832+
res = tcb[slot];
833+
#elif defined(__aarch64__)
834+
void** tcb; MI_UNUSED(ofs);
835+
#if defined(__APPLE__) // M1, issue #343
836+
__asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
837+
#else
838+
__asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
839+
#endif
840+
res = tcb[slot];
823841
#endif
824-
res = tcb[slot];
825-
#endif
826842
return res;
827843
}
828844

829-
// setting is only used on macOSX for now
845+
// setting a tls slot is only used on macOS for now
830846
static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
831847
const size_t ofs = (slot*sizeof(void*));
832-
#if defined(__i386__)
833-
__asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS
834-
#elif defined(__APPLE__) && defined(__x86_64__)
835-
__asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOSX uses GS
836-
#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
837-
__asm__("movl %1,%%fs:%1" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI
838-
#elif defined(__x86_64__)
839-
__asm__("movq %1,%%fs:%1" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS
840-
#elif defined(__arm__)
841-
void** tcb; MI_UNUSED(ofs);
842-
__asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
843-
tcb[slot] = value;
844-
#elif defined(__aarch64__)
845-
void** tcb; MI_UNUSED(ofs);
846-
#if defined(__APPLE__) // M1, issue #343
847-
__asm__ volatile ("mrs %0, tpidrro_el0" : "=r" (tcb));
848-
tcb = (void**)((uintptr_t)tcb & ~0x07UL); // clear lower 3 bits
849-
#else
850-
__asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
848+
#if defined(__i386__)
849+
__asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS
850+
#elif defined(__APPLE__) && defined(__x86_64__)
851+
__asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOS uses GS
852+
#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
853+
__asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI
854+
#elif defined(__x86_64__)
855+
__asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS
856+
#elif defined(__arm__)
857+
void** tcb; MI_UNUSED(ofs);
858+
__asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
859+
tcb[slot] = value;
860+
#elif defined(__aarch64__)
861+
void** tcb; MI_UNUSED(ofs);
862+
#if defined(__APPLE__) // M1, issue #343
863+
__asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
864+
#else
865+
__asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
866+
#endif
867+
tcb[slot] = value;
851868
#endif
852-
tcb[slot] = value;
853-
#endif
854869
}
855870

856871
static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
857-
#if defined(__BIONIC__) && (defined(__arm__) || defined(__aarch64__))
858-
// on Android, slot 1 is the thread ID (pointer to pthread internal struct)
859-
return (uintptr_t)mi_tls_slot(1);
860-
#else
861-
// in all our other targets, slot 0 is the pointer to the thread control block
862-
return (uintptr_t)mi_tls_slot(0);
863-
#endif
872+
#if defined(__BIONIC__)
873+
// issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
874+
// see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
875+
return (uintptr_t)mi_tls_slot(1);
876+
#else
877+
// in all our other targets, slot 0 is the thread id
878+
// glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
879+
// apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
880+
return (uintptr_t)mi_tls_slot(0);
881+
#endif
864882
}
883+
865884
#else
866-
// otherwise use standard C
885+
886+
// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
867887
static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
868888
return (uintptr_t)&_mi_heap_default;
869889
}
890+
870891
#endif
871892

893+
872894
// -----------------------------------------------------------------------
873895
// Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero)
874896
// -----------------------------------------------------------------------

Include/mimalloc/mimalloc-types.h

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -162,17 +162,23 @@ typedef int32_t mi_ssize_t;
162162
#define MI_BIN_HUGE (73U)
163163

164164
#if (MI_MEDIUM_OBJ_WSIZE_MAX >= 655360)
165-
#error "define more bins"
165+
#error "mimalloc internal: define more bins"
166+
#endif
167+
#if (MI_ALIGNMENT_MAX > MI_SEGMENT_SIZE/2)
168+
#error "mimalloc internal: the max aligned boundary is too large for the segment size"
169+
#endif
170+
#if (MI_ALIGNMENT_MAX % MI_SEGMENT_SLICE_SIZE != 0)  // MI_MAX_SLICE_OFFSET divides MI_ALIGNMENT_MAX by the slice size, so it must be a whole number of slices
171+
#error "mimalloc internal: the max aligned boundary must be an integral multiple of the segment slice size"
166172
#endif
167173

168-
// Maximum slice offset (7)
169-
#define MI_MAX_SLICE_OFFSET ((MI_MEDIUM_PAGE_SIZE / MI_SEGMENT_SLICE_SIZE) - 1)
174+
// Maximum slice offset (15)
175+
#define MI_MAX_SLICE_OFFSET ((MI_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1)
170176

171177
// Used as a special value to encode block sizes in 32 bits.
172178
#define MI_HUGE_BLOCK_SIZE ((uint32_t)MI_HUGE_OBJ_SIZE_MAX)
173179

174180
// blocks up to this size are always allocated aligned
175-
#define MI_MAX_ALIGN_GUARANTEE (8*MI_MAX_ALIGN_SIZE)
181+
#define MI_MAX_ALIGN_GUARANTEE (8*MI_MAX_ALIGN_SIZE)
176182

177183

178184

@@ -314,10 +320,15 @@ typedef enum mi_segment_kind_e {
314320
// the corresponding MI_COMMIT_SIZE area is committed.
315321
// The MI_COMMIT_SIZE must be a multiple of the slice
316322
// size. If it is equal we have the most fine grained
317-
// decommit (but in practice 2x seems to perform better).
323+
// decommit (but setting it higher can be more efficient).
324+
// The MI_MINIMAL_COMMIT_SIZE is the minimal amount that will
325+
// be committed in one go which can be set higher than
326+
// MI_COMMIT_SIZE for efficiency (while the decommit mask
327+
// is still tracked in fine-grained MI_COMMIT_SIZE chunks)
318328
// ------------------------------------------------------
319329

320-
#define MI_COMMIT_SIZE (MI_SEGMENT_SLICE_SIZE)
330+
#define MI_MINIMAL_COMMIT_SIZE (2*MI_MiB)
331+
#define MI_COMMIT_SIZE (MI_SEGMENT_SLICE_SIZE) // 64KiB
321332
#define MI_COMMIT_MASK_BITS (MI_SEGMENT_SIZE / MI_COMMIT_SIZE)
322333
#define MI_COMMIT_MASK_FIELD_BITS MI_SIZE_BITS
323334
#define MI_COMMIT_MASK_FIELD_COUNT (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS)

0 commit comments

Comments
 (0)