From 7ef2e30c9d0d45472c3942ff9733f1d7dfc184ec Mon Sep 17 00:00:00 2001
From: Sam Gross
Date: Tue, 3 Jun 2025 20:47:13 +0000
Subject: [PATCH 1/8] gh-133136: Limit excess memory held by QSBR

The free threading build uses QSBR to delay the freeing of dictionary
keys and list arrays when the objects are accessed by multiple threads,
in order to allow concurrent reads to proceed without holding the
object lock. The requests are processed in batches to reduce execution
overhead, but for large memory blocks this can lead to excess memory
usage.

Take into account the size of the memory block when deciding when to
process QSBR requests.
---
 Include/internal/pycore_pymem.h               |  2 +-
 Include/internal/pycore_qsbr.h                |  4 ++
 ...-06-03-21-06-22.gh-issue-133136.Usnvri.rst |  2 +
 Objects/codeobject.c                          |  2 +-
 Objects/dictobject.c                          |  4 +-
 Objects/listobject.c                          |  3 +-
 Objects/obmalloc.c                            | 39 +++++++++++++++----
 7 files changed, 44 insertions(+), 12 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-06-03-21-06-22.gh-issue-133136.Usnvri.rst

diff --git a/Include/internal/pycore_pymem.h b/Include/internal/pycore_pymem.h
index 02537bdfef8598..3e12084b82ab26 100644
--- a/Include/internal/pycore_pymem.h
+++ b/Include/internal/pycore_pymem.h
@@ -88,7 +88,7 @@ extern wchar_t *_PyMem_DefaultRawWcsdup(const wchar_t *str);
 extern int _PyMem_DebugEnabled(void);
 
 // Enqueue a pointer to be freed possibly after some delay.
-extern void _PyMem_FreeDelayed(void *ptr);
+extern void _PyMem_FreeDelayed(void *ptr, size_t size);
 
 // Enqueue an object to be freed possibly after some delay
 #ifdef Py_GIL_DISABLED
diff --git a/Include/internal/pycore_qsbr.h b/Include/internal/pycore_qsbr.h
index b835c3abaf5d0b..e839427ddf45db 100644
--- a/Include/internal/pycore_qsbr.h
+++ b/Include/internal/pycore_qsbr.h
@@ -51,6 +51,10 @@ struct _qsbr_thread_state {
     // Used to defer advancing write sequence a fixed number of times
     int deferrals;
 
+    // Estimate for the amount of memory that is held by this thread since
+    // the last non-deferred advance.
+    size_t memory_deferred;
+
     // Is this thread state allocated?
     bool allocated;
     struct _qsbr_thread_state *freelist_next;
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-06-03-21-06-22.gh-issue-133136.Usnvri.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-03-21-06-22.gh-issue-133136.Usnvri.rst
new file mode 100644
index 00000000000000..a9501c13c95b3a
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-03-21-06-22.gh-issue-133136.Usnvri.rst
@@ -0,0 +1,2 @@
+Limit excess memory usage in the :term:`free threading` build when a
+large dictionary or list is resized and accessed by multiple threads.
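(Background for readers new to QSBR: each retired block is tagged with a
"goal" taken from a global write sequence, and each reader thread
periodically publishes the last sequence value it has observed. A block may
be freed once every reader has caught up to its goal. The following is a
minimal, self-contained sketch of that bookkeeping -- the names and the
fixed two-reader setup are invented for illustration; this is not CPython's
implementation:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define QSBR_INCR 2                  /* sequence advances in steps of 2 */

    typedef struct {
        uint64_t wr_seq;                 /* global write sequence */
        uint64_t rd_seq[2];              /* last value observed per reader */
    } qsbr_shared;

    static uint64_t advance(qsbr_shared *s) { return s->wr_seq += QSBR_INCR; }

    /* a reader reports a quiescent state by copying wr_seq into its slot */
    static void quiescent(qsbr_shared *s, int tid) { s->rd_seq[tid] = s->wr_seq; }

    /* memory tagged with `goal` may be freed once every reader caught up */
    static int poll_goal(qsbr_shared *s, uint64_t goal) {
        for (int i = 0; i < 2; i++) {
            if (s->rd_seq[i] < goal) {
                return 0;
            }
        }
        return 1;
    }

    int main(void) {
        qsbr_shared s = { .wr_seq = QSBR_INCR };
        void *old_keys = malloc(64);               /* stands in for dict keys */
        uint64_t goal = advance(&s);               /* tag the retired block */
        printf("safe? %d\n", poll_goal(&s, goal)); /* 0: readers lag behind */
        quiescent(&s, 0);
        quiescent(&s, 1);                          /* both readers check in */
        printf("safe? %d\n", poll_goal(&s, goal)); /* 1: OK to reclaim */
        free(old_keys);
        return 0;
    }

The batching that this series tunes exists because advancing wr_seq and
polling reader states has a cost; deferring them amortizes that cost at the
price of holding retired blocks longer, which is the excess memory the
patches below bound by size.)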
diff --git a/Objects/codeobject.c b/Objects/codeobject.c
index ee869d991d93cd..d0ce49d9d14ef6 100644
--- a/Objects/codeobject.c
+++ b/Objects/codeobject.c
@@ -3350,7 +3350,7 @@ create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx)
         }
         memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *));
         _Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc);
-        _PyMem_FreeDelayed(tlbc);
+        _PyMem_FreeDelayed(tlbc, tlbc->size * sizeof(void *));
         tlbc = new_tlbc;
     }
     char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co));
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index fd8ccf56324207..6ff97cd67111a4 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -813,7 +813,7 @@ free_keys_object(PyDictKeysObject *keys, bool use_qsbr)
 {
 #ifdef Py_GIL_DISABLED
     if (use_qsbr) {
-        _PyMem_FreeDelayed(keys);
+        _PyMem_FreeDelayed(keys, _PyDict_KeysSize(keys));
         return;
     }
 #endif
@@ -858,7 +858,7 @@ free_values(PyDictValues *values, bool use_qsbr)
     assert(values->embedded == 0);
 #ifdef Py_GIL_DISABLED
     if (use_qsbr) {
-        _PyMem_FreeDelayed(values);
+        _PyMem_FreeDelayed(values, values_size_from_count(values->capacity));
         return;
     }
 #endif
diff --git a/Objects/listobject.c b/Objects/listobject.c
index c5895645a2dd12..23d3472b6d4153 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -61,7 +61,8 @@ free_list_items(PyObject** items, bool use_qsbr)
 #ifdef Py_GIL_DISABLED
     _PyListArray *array = _Py_CONTAINER_OF(items, _PyListArray, ob_item);
     if (use_qsbr) {
-        _PyMem_FreeDelayed(array);
+        size_t size = sizeof(_PyListArray) + array->allocated * sizeof(PyObject *);
+        _PyMem_FreeDelayed(array, size);
     }
     else {
         PyMem_Free(array);
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index d3931aab623b70..5d6e65b1fffde3 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -1141,8 +1141,27 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state)
     }
 }
 
+static int
+should_advance_qsbr(_PyThreadStateImpl *tstate, size_t size)
+{
+    // If the deferred memory exceeds 1 MiB, we force an advance in the
+    // shared QSBR sequence number to limit excess memory usage.
+    static const size_t QSBR_DEFERRED_LIMIT = 1024 * 1024;
+    if (size > QSBR_DEFERRED_LIMIT) {
+        tstate->qsbr->memory_deferred = 0;
+        return 1;
+    }
+
+    tstate->qsbr->memory_deferred += size;
+    if (tstate->qsbr->memory_deferred > QSBR_DEFERRED_LIMIT) {
+        tstate->qsbr->memory_deferred = 0;
+        return 1;
+    }
+    return 0;
+}
+
 static void
-free_delayed(uintptr_t ptr)
+free_delayed(uintptr_t ptr, size_t size)
 {
 #ifndef Py_GIL_DISABLED
     free_work_item(ptr, NULL, NULL);
@@ -1200,23 +1219,29 @@ free_delayed(uintptr_t ptr)
     }
 
     assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
-    uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr);
+    uint64_t seq;
+    int force_advance = should_advance_qsbr(tstate, size);
+    if (force_advance) {
+        seq = _Py_qsbr_advance(tstate->qsbr->shared);
+    }
+    else {
+        seq = _Py_qsbr_deferred_advance(tstate->qsbr);
+    }
     buf->array[buf->wr_idx].ptr = ptr;
     buf->array[buf->wr_idx].qsbr_goal = seq;
     buf->wr_idx++;
-
-    if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
+    if (buf->wr_idx == WORK_ITEMS_PER_CHUNK || force_advance) {
         _PyMem_ProcessDelayed((PyThreadState *)tstate);
     }
 #endif
 }
 
 void
-_PyMem_FreeDelayed(void *ptr)
+_PyMem_FreeDelayed(void *ptr, size_t size)
 {
     assert(!((uintptr_t)ptr & 0x01));
     if (ptr != NULL) {
-        free_delayed((uintptr_t)ptr);
+        free_delayed((uintptr_t)ptr, size);
     }
 }
 
@@ -1226,7 +1251,7 @@ _PyObject_XDecRefDelayed(PyObject *ptr)
 {
     assert(!((uintptr_t)ptr & 0x01));
     if (ptr != NULL) {
-        free_delayed(((uintptr_t)ptr)|0x01);
+        free_delayed(((uintptr_t)ptr)|0x01, 64);
     }
 }
 #endif

From ce9232b8ab26869b1d33ab851f82708ec79ef555 Mon Sep 17 00:00:00 2001
From: Sam Gross
Date: Tue, 3 Jun 2025 21:36:02 +0000
Subject: [PATCH 2/8] Fix unused function warning

---
 Objects/obmalloc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 5d6e65b1fffde3..5af2ed5a21afa9 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -1141,6 +1141,7 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state)
     }
 }
 
+#ifdef Py_GIL_DISABLED
 static int
 should_advance_qsbr(_PyThreadStateImpl *tstate, size_t size)
 {
@@ -1159,6 +1160,7 @@ should_advance_qsbr(_PyThreadStateImpl *tstate, size_t size)
     }
     return 0;
 }
+#endif
 
 static void
 free_delayed(uintptr_t ptr, size_t size)

From 3978e35071dd8c72fd2ea2587b1f3241912ec6c6 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Wed, 11 Jun 2025 09:53:59 -0700
Subject: [PATCH 3/8] Re-work QSBR deferred advance and processing.

---
 Include/internal/pycore_qsbr.h | 28 +++++++++++++------
 Objects/obmalloc.c             | 45 ++++++++----------------------
 Python/qsbr.c                  | 51 ++++++++++++++++++++++++++++------
 3 files changed, 74 insertions(+), 50 deletions(-)

diff --git a/Include/internal/pycore_qsbr.h b/Include/internal/pycore_qsbr.h
index e839427ddf45db..732b0a0b52004c 100644
--- a/Include/internal/pycore_qsbr.h
+++ b/Include/internal/pycore_qsbr.h
@@ -48,12 +48,15 @@ struct _qsbr_thread_state {
     // Thread state (or NULL)
     PyThreadState *tstate;
 
-    // Used to defer advancing write sequence a fixed number of times
-    int deferrals;
+    // Number of held items added by this thread since last process
+    int deferred_count;
 
     // Estimate for the amount of memory that is held by this thread since
-    // the last non-deferred advance.
-    size_t memory_deferred;
+    // the last process
+    size_t deferred_memory;
+
+    // Sequence number at time of last "should process" check.
+    uint64_t seq_last_check;
 
     // Is this thread state allocated?
     bool allocated;
@@ -113,11 +116,20 @@ _Py_qbsr_goal_reached(struct _qsbr_thread_state *qsbr, uint64_t goal)
 extern uint64_t
 _Py_qsbr_advance(struct _qsbr_shared *shared);
 
-// Batches requests to advance the write sequence. This advances the write
-// sequence every N calls, which reduces overhead but increases time to
-// reclamation. Returns the new goal.
+// Advance the write sequence as required and return the sequence goal to use
+// for memory to be freed. If the sequence is advanced, this goal is the new
+// sequence value, otherwise it is the next sequence value. In either case,
+// the goal is higher than any write sequence value already observed by readers.
+//
+// The 'size' argument is the size in bytes of the memory scheduled to be
+// freed. If that size is not available, pass zero as the value.
 extern uint64_t
-_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr);
+_Py_qsbr_advance_with_size(struct _qsbr_thread_state *qsbr, size_t size);
+
+// Return true if memory held by QSBR should be processed to determine if it
+// can be safely freed.
+extern bool
+_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr);
 
 // Have the read sequences advanced to the given goal? If this returns true,
 // it safe to reclaim any memory tagged with the goal (or earlier goal).
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 5af2ed5a21afa9..160d2cee19ef2e 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -139,7 +139,10 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
 
         _PyMem_mi_page_clear_qsbr(page);
         page->retire_expire = 0;
-        page->qsbr_goal = _Py_qsbr_deferred_advance(tstate->qsbr);
+
+        size_t bsize = mi_page_block_size(page);
+        page->qsbr_goal = _Py_qsbr_advance_with_size(tstate->qsbr, page->capacity*bsize);
+
         llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node);
         return false;
     }
@@ -1141,27 +1144,6 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state)
     }
 }
 
-#ifdef Py_GIL_DISABLED
-static int
-should_advance_qsbr(_PyThreadStateImpl *tstate, size_t size)
-{
-    // If the deferred memory exceeds 1 MiB, we force an advance in the
-    // shared QSBR sequence number to limit excess memory usage.
-    static const size_t QSBR_DEFERRED_LIMIT = 1024 * 1024;
-    if (size > QSBR_DEFERRED_LIMIT) {
-        tstate->qsbr->memory_deferred = 0;
-        return 1;
-    }
-
-    tstate->qsbr->memory_deferred += size;
-    if (tstate->qsbr->memory_deferred > QSBR_DEFERRED_LIMIT) {
-        tstate->qsbr->memory_deferred = 0;
-        return 1;
-    }
-    return 0;
-}
-#endif
-
 static void
 free_delayed(uintptr_t ptr, size_t size)
 {
@@ -1221,20 +1203,12 @@ free_delayed(uintptr_t ptr, size_t size)
     }
 
     assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
-    uint64_t seq;
-    int force_advance = should_advance_qsbr(tstate, size);
-    if (force_advance) {
-        seq = _Py_qsbr_advance(tstate->qsbr->shared);
-    }
-    else {
-        seq = _Py_qsbr_deferred_advance(tstate->qsbr);
-    }
+    uint64_t seq = _Py_qsbr_advance_with_size(tstate->qsbr, size);
     buf->array[buf->wr_idx].ptr = ptr;
     buf->array[buf->wr_idx].qsbr_goal = seq;
     buf->wr_idx++;
-    if (buf->wr_idx == WORK_ITEMS_PER_CHUNK || force_advance) {
-        _PyMem_ProcessDelayed((PyThreadState *)tstate);
-    }
+
+    _PyMem_ProcessDelayed((PyThreadState *)tstate);
 #endif
 }
 
@@ -1326,8 +1300,11 @@ maybe_process_interp_queue(struct _Py_mem_interp_free_queue *queue,
 void
 _PyMem_ProcessDelayed(PyThreadState *tstate)
 {
-    PyInterpreterState *interp = tstate->interp;
     _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
+    if (!_Py_qsbr_should_process(tstate_impl->qsbr)) {
+        return;
+    }
+    PyInterpreterState *interp = tstate->interp;
 
     // Process thread-local work
     process_queue(&tstate_impl->mem_free_queue, tstate_impl, true, NULL, NULL);
diff --git a/Python/qsbr.c b/Python/qsbr.c
index bf34fb2523dfc8..7b7aae20ee9567 100644
--- a/Python/qsbr.c
+++ b/Python/qsbr.c
@@ -41,9 +41,14 @@
 // Starting size of the array of qsbr thread states
 #define MIN_ARRAY_SIZE 8
 
-// For _Py_qsbr_deferred_advance(): the number of deferrals before advancing
-// the write sequence.
-#define QSBR_DEFERRED_LIMIT 10
+// For should_advance_qsbr(): the number of deferred items before advancing
+// the write sequence. This is based on WORK_ITEMS_PER_CHUNK. We ideally
+// want to process a chunk before it overflows.
+#define QSBR_DEFERRED_LIMIT 127
+
+// If the deferred memory exceeds 1 MiB, we force an advance in the
+// shared QSBR sequence number to limit excess memory usage.
+#define QSBR_MEM_LIMIT 1024*1024
 
 // Allocate a QSBR thread state from the freelist
 static struct _qsbr_thread_state *
@@ -113,17 +118,47 @@ _Py_qsbr_advance(struct _qsbr_shared *shared)
     // NOTE: with 64-bit sequence numbers, we don't have to worry too much
     // about the wr_seq getting too far ahead of rd_seq, but if we ever use
     // 32-bit sequence numbers, we'll need to be more careful.
-    return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR) + QSBR_INCR;
+    return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR);
+}
+
+static int
+should_advance_qsbr(struct _qsbr_thread_state *qsbr, size_t size)
+{
+    qsbr->deferred_count++;
+    qsbr->deferred_memory += size;
+    if (qsbr->deferred_count >= QSBR_DEFERRED_LIMIT ||
+            qsbr->deferred_memory > QSBR_MEM_LIMIT) {
+        qsbr->deferred_count = 0;
+        qsbr->deferred_memory = 0;
+        return 1;
+    }
+    return 0;
 }
 
 uint64_t
-_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr)
+_Py_qsbr_advance_with_size(struct _qsbr_thread_state *qsbr, size_t size)
 {
-    if (++qsbr->deferrals < QSBR_DEFERRED_LIMIT) {
+    if (should_advance_qsbr(qsbr, size)) {
+        // Advance the write sequence and return the updated value as the goal.
+        return _Py_qsbr_advance(qsbr->shared);
+    }
+    else {
+        // Don't advance, return the next sequence value as the goal.
         return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
     }
-    qsbr->deferrals = 0;
-    return _Py_qsbr_advance(qsbr->shared);
+}
+
+bool
+_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr)
+{
+    if (qsbr->seq_last_check == qsbr->seq) {
+        // If 'seq' for this thread hasn't advanced, it is unlikely that any
+        // deferred memory is ready to be freed. Wait longer before trying
+        // to process.
+        return false;
+    }
+    qsbr->seq_last_check = qsbr->seq;
+    return true;
 }
 
 static uint64_t

From 0b276ab6f77f246b1b75aba4ee28b1b725aeb92c Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Fri, 13 Jun 2025 10:13:38 -0700
Subject: [PATCH 4/8] Update comments.

---
 Include/internal/pycore_qsbr.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Include/internal/pycore_qsbr.h b/Include/internal/pycore_qsbr.h
index 732b0a0b52004c..47ad4cd3efaaa7 100644
--- a/Include/internal/pycore_qsbr.h
+++ b/Include/internal/pycore_qsbr.h
@@ -48,11 +48,11 @@ struct _qsbr_thread_state {
     // Thread state (or NULL)
     PyThreadState *tstate;
 
-    // Number of held items added by this thread since last process
+    // Number of held items added by this thread since write sequence advance
    int deferred_count;
 
     // Estimate for the amount of memory that is held by this thread since
-    // the last process
+    // the last write sequence advance
     size_t deferred_memory;
 
     // Sequence number at time of last "should process" check.

From 7ea28fffe1ca379e3f29440b5e0e70d1a5a01435 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Mon, 16 Jun 2025 13:13:25 -0700
Subject: [PATCH 5/8] Revise based on review feedback.

* Keep separate count of mimalloc page memory that is deferred from
  collection. This memory doesn't get freed by _PyMem_ProcessDelayed().
  We want to advance the write sequence if there is too much of it, but
  calling _PyMem_ProcessDelayed() is not helpful.

* Use `process_seq` variable to schedule the next call to
  `_PyMem_ProcessDelayed()`.

* Rename advance functions to have "deferred" in name.

* Move `_Py_qsbr_should_process()` call up one level.
---
 Include/internal/pycore_qsbr.h | 31 +++++++++++------
 Objects/obmalloc.c             | 18 +++++-----
 Python/qsbr.c                  | 62 ++++++++++++++++++++--------------
 3 files changed, 68 insertions(+), 43 deletions(-)

diff --git a/Include/internal/pycore_qsbr.h b/Include/internal/pycore_qsbr.h
index 47ad4cd3efaaa7..570f7216c01119 100644
--- a/Include/internal/pycore_qsbr.h
+++ b/Include/internal/pycore_qsbr.h
@@ -55,8 +55,14 @@ struct _qsbr_thread_state {
     // the last write sequence advance
     size_t deferred_memory;
 
-    // Sequence number at time of last "should process" check.
-    uint64_t seq_last_check;
+    // Amount of memory in mimalloc pages deferred from collection. When
+    // deferred, they are prevented from being used for a different size class
+    // and in a different thread.
+    size_t deferred_page_memory;
+
+    // If non-zero, processing of deferred memory should be performed if the
+    // read sequence has reached this value.
+    uint64_t process_seq;
 
     // Is this thread state allocated?
     bool allocated;
@@ -66,7 +72,7 @@ struct _qsbr_thread_state {
 // Padding to avoid false sharing
 struct _qsbr_pad {
     struct _qsbr_thread_state qsbr;
-    char __padding[64 - sizeof(struct _qsbr_thread_state)];
+    char __padding[128 - sizeof(struct _qsbr_thread_state)];
 };
 
 // Per-interpreter state
@@ -117,14 +123,19 @@ extern uint64_t
 _Py_qsbr_advance(struct _qsbr_shared *shared);
 
 // Advance the write sequence as required and return the sequence goal to use
-// for memory to be freed. If the sequence is advanced, this goal is the new
-// sequence value, otherwise it is the next sequence value. In either case,
-// the goal is higher than any write sequence value already observed by readers.
-//
-// The 'size' argument is the size in bytes of the memory scheduled to be
-// freed. If that size is not available, pass zero as the value.
+// for memory to be freed. The 'free_size' argument is the size in bytes of
+// the memory scheduled to be freed. If that size is not available, pass zero
+// as the value.
+extern uint64_t
+_Py_qsbr_deferred_advance_for_free(struct _qsbr_thread_state *qsbr,
+                                   size_t free_size);
+
+// Advance the write sequence as required and return the sequence goal to use
+// for a mimalloc page to be collected. The 'page_size' argument is the size
+// of the mimalloc page being deferred from collection.
 extern uint64_t
-_Py_qsbr_advance_with_size(struct _qsbr_thread_state *qsbr, size_t size);
+_Py_qsbr_deferred_advance_for_page(struct _qsbr_thread_state *qsbr,
+                                   size_t page_size);
 
 // Return true if memory held by QSBR should be processed to determine if it
 // can be safely freed.
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 160d2cee19ef2e..783750c7c63e54 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -141,7 +141,7 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
         page->retire_expire = 0;
 
         size_t bsize = mi_page_block_size(page);
-        page->qsbr_goal = _Py_qsbr_advance_with_size(tstate->qsbr, page->capacity*bsize);
+        page->qsbr_goal = _Py_qsbr_deferred_advance_for_page(tstate->qsbr, page->capacity*bsize);
 
         llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node);
         return false;
@@ -1203,12 +1203,14 @@ free_delayed(uintptr_t ptr, size_t size)
     }
 
     assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
-    uint64_t seq = _Py_qsbr_advance_with_size(tstate->qsbr, size);
+    uint64_t seq = _Py_qsbr_deferred_advance_for_free(tstate->qsbr, size);
     buf->array[buf->wr_idx].ptr = ptr;
     buf->array[buf->wr_idx].qsbr_goal = seq;
     buf->wr_idx++;
 
-    _PyMem_ProcessDelayed((PyThreadState *)tstate);
+    if (_Py_qsbr_should_process(tstate->qsbr)) {
+        _PyMem_ProcessDelayed((PyThreadState *)tstate);
+    }
 #endif
 }
 
@@ -1227,7 +1229,10 @@ _PyObject_XDecRefDelayed(PyObject *ptr)
 {
     assert(!((uintptr_t)ptr & 0x01));
     if (ptr != NULL) {
-        free_delayed(((uintptr_t)ptr)|0x01, 64);
+        // We use 0 as the size since we don't have an easy way to know the
+        // actual size. If we are freeing many objects, the write sequence
+        // will be advanced due to QSBR_DEFERRED_LIMIT.
+        free_delayed(((uintptr_t)ptr)|0x01, 0);
     }
 }
 #endif
@@ -1300,11 +1305,8 @@ maybe_process_interp_queue(struct _Py_mem_interp_free_queue *queue,
 void
 _PyMem_ProcessDelayed(PyThreadState *tstate)
 {
-    _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
-    if (!_Py_qsbr_should_process(tstate_impl->qsbr)) {
-        return;
-    }
     PyInterpreterState *interp = tstate->interp;
+    _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
 
     // Process thread-local work
     process_queue(&tstate_impl->mem_free_queue, tstate_impl, true, NULL, NULL);
diff --git a/Python/qsbr.c b/Python/qsbr.c
index 7b7aae20ee9567..ed82b468c5c40e 100644
--- a/Python/qsbr.c
+++ b/Python/qsbr.c
@@ -41,14 +41,19 @@
 // Starting size of the array of qsbr thread states
 #define MIN_ARRAY_SIZE 8
 
-// For should_advance_qsbr(): the number of deferred items before advancing
+// For deferred advance on free: the number of deferred items before advancing
 // the write sequence. This is based on WORK_ITEMS_PER_CHUNK. We ideally
 // want to process a chunk before it overflows.
 #define QSBR_DEFERRED_LIMIT 127
 
 // If the deferred memory exceeds 1 MiB, we force an advance in the
 // shared QSBR sequence number to limit excess memory usage.
-#define QSBR_MEM_LIMIT 1024*1024
+#define QSBR_FREE_MEM_LIMIT 1024*1024
+
+// If we are deferring collection of more than this amount of memory for
+// mimalloc pages, advance the write sequence. Advancing allows these
+// pages to be re-used in a different thread or for a different size class.
+#define QSBR_PAGE_MEM_LIMIT 4096*10
 
 // Allocate a QSBR thread state from the freelist
 static struct _qsbr_thread_state *
@@ -121,43 +126,50 @@ _Py_qsbr_advance(struct _qsbr_shared *shared)
     return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR);
 }
 
-static int
-should_advance_qsbr(struct _qsbr_thread_state *qsbr, size_t size)
+uint64_t
+_Py_qsbr_deferred_advance_for_page(struct _qsbr_thread_state *qsbr, size_t page_size)
 {
-    qsbr->deferred_count++;
-    qsbr->deferred_memory += size;
-    if (qsbr->deferred_count >= QSBR_DEFERRED_LIMIT ||
-            qsbr->deferred_memory > QSBR_MEM_LIMIT) {
-        qsbr->deferred_count = 0;
-        qsbr->deferred_memory = 0;
-        return 1;
+    qsbr->deferred_page_memory += page_size;
+    if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
+        qsbr->deferred_page_memory = 0;
+        // Advance the write sequence and return the updated value as the goal.
+        return _Py_qsbr_advance(qsbr->shared);
     }
-    return 0;
+    // Don't advance, return the next sequence value as the goal.
+    return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
 }
 
 uint64_t
-_Py_qsbr_advance_with_size(struct _qsbr_thread_state *qsbr, size_t size)
+_Py_qsbr_deferred_advance_for_free(struct _qsbr_thread_state *qsbr, size_t free_size)
 {
-    if (should_advance_qsbr(qsbr, size)) {
-        // Advance the write sequence and return the updated value as the goal.
-        return _Py_qsbr_advance(qsbr->shared);
-    }
-    else {
-        // Don't advance, return the next sequence value as the goal.
-        return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
+    qsbr->deferred_count++;
+    qsbr->deferred_memory += free_size;
+    if (qsbr->deferred_count >= QSBR_DEFERRED_LIMIT ||
+            qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT) {
+        qsbr->deferred_count = 0;
+        qsbr->deferred_memory = 0;
+        // Advance the write sequence
+        uint64_t seq = _Py_qsbr_advance(qsbr->shared);
+        if (qsbr->process_seq == 0) {
+            // Process the queue of deferred frees when the read sequence
+            // reaches this value. We don't process immediately because
+            // we want to give readers a chance to advance their sequence.
+            qsbr->process_seq = seq;
+        }
+        // Return current (just advanced) sequence as the goal.
+        return seq;
     }
+    // Don't advance, return the next sequence value as the goal.
+    return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
 }
 
 bool
 _Py_qsbr_should_process(struct _qsbr_thread_state *qsbr)
 {
-    if (qsbr->seq_last_check == qsbr->seq) {
-        // If 'seq' for this thread hasn't advanced, it is unlikely that any
-        // deferred memory is ready to be freed. Wait longer before trying
-        // to process.
+    if (qsbr->process_seq == 0 || qsbr->seq < qsbr->process_seq) {
         return false;
     }
-    qsbr->seq_last_check = qsbr->seq;
+    qsbr->process_seq = 0;
     return true;
 }

From d2f5acc113599265ff3f89a2d88653119a6ae002 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Mon, 16 Jun 2025 18:29:30 -0700
Subject: [PATCH 6/8] Revert change to _Py_qsbr_advance().

Since _Py_atomic_add_uint64() returns the old value, we need to add
QSBR_INCR.
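(To see why the "+ QSBR_INCR" matters: fetch-and-add primitives in the
C/C++ atomics family return the value the variable held *before* the
addition, so the freshly advanced sequence is the return value plus the
increment. A standalone check using standard C11 atomics -- illustrative
only; QSBR_INCR's real value is defined in pycore_qsbr.h:

    #include <assert.h>
    #include <stdatomic.h>
    #include <stdint.h>

    #define QSBR_INCR 2   /* illustrative; see pycore_qsbr.h for the real value */

    int main(void) {
        _Atomic uint64_t wr_seq = 10;
        /* atomic_fetch_add returns the pre-add value, as does
           _Py_atomic_add_uint64 */
        uint64_t old = atomic_fetch_add(&wr_seq, QSBR_INCR);
        assert(old == 10);
        assert(atomic_load(&wr_seq) == 12);
        /* the goal must be the *advanced* sequence, hence old + QSBR_INCR */
        assert(old + QSBR_INCR == 12);
        return 0;
    }

Without the correction, goals handed out by _Py_qsbr_advance() would trail
the shared sequence by one increment, which could allow a block to be
reclaimed while a reader still holds a reference to it.)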
---
 Python/qsbr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Python/qsbr.c b/Python/qsbr.c
index ed82b468c5c40e..54b20e3488e642 100644
--- a/Python/qsbr.c
+++ b/Python/qsbr.c
@@ -123,7 +123,7 @@ _Py_qsbr_advance(struct _qsbr_shared *shared)
     // NOTE: with 64-bit sequence numbers, we don't have to worry too much
     // about the wr_seq getting too far ahead of rd_seq, but if we ever use
     // 32-bit sequence numbers, we'll need to be more careful.
-    return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR);
+    return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR) + QSBR_INCR;
 }
 
 uint64_t

From ee133922a9eabc2d6e93fe9963a440525a41d9bd Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Mon, 16 Jun 2025 23:41:08 -0700
Subject: [PATCH 7/8] Call _PyMem_ProcessDelayed() from eval breaker.

Refactor code to keep obmalloc logic out of the qsbr.c file. Call
_PyMem_ProcessDelayed() from the eval breaker.
---
 Include/internal/pycore_qsbr.h | 36 +++++++---------
 Objects/obmalloc.c             | 76 ++++++++++++++++++++++++++++++++--
 Python/ceval_gil.c             |  4 ++
 Python/qsbr.c                  | 59 +--------------------------
 4 files changed, 92 insertions(+), 83 deletions(-)

diff --git a/Include/internal/pycore_qsbr.h b/Include/internal/pycore_qsbr.h
index 570f7216c01119..1f9b3fcf777493 100644
--- a/Include/internal/pycore_qsbr.h
+++ b/Include/internal/pycore_qsbr.h
@@ -48,7 +48,8 @@ struct _qsbr_thread_state {
     // Thread state (or NULL)
     PyThreadState *tstate;
 
-    // Number of held items added by this thread since write sequence advance
+    // Number of held items added by this thread since the last write sequence
+    // advance
     int deferred_count;
 
     // Estimate for the amount of memory that is held by this thread since
@@ -60,9 +61,8 @@ struct _qsbr_thread_state {
     // and in a different thread.
     size_t deferred_page_memory;
 
-    // If non-zero, processing of deferred memory should be performed if the
-    // read sequence has reached this value.
-    uint64_t process_seq;
+    // True if the deferred memory frees should be processed.
+    bool should_process;
 
     // Is this thread state allocated?
     bool allocated;
@@ -72,7 +72,7 @@ struct _qsbr_thread_state {
 // Padding to avoid false sharing
 struct _qsbr_pad {
     struct _qsbr_thread_state qsbr;
-    char __padding[128 - sizeof(struct _qsbr_thread_state)];
+    char __padding[64 - sizeof(struct _qsbr_thread_state)];
 };
 
 // Per-interpreter state
@@ -122,25 +122,17 @@ _Py_qbsr_goal_reached(struct _qsbr_thread_state *qsbr, uint64_t goal)
 extern uint64_t
 _Py_qsbr_advance(struct _qsbr_shared *shared);
 
-// Advance the write sequence as required and return the sequence goal to use
-// for memory to be freed. The 'free_size' argument is the size in bytes of
-// the memory scheduled to be freed. If that size is not available, pass zero
-// as the value.
+// Return the next value for the write sequence (current plus the increment).
 extern uint64_t
-_Py_qsbr_deferred_advance_for_free(struct _qsbr_thread_state *qsbr,
-                                   size_t free_size);
+_Py_qsbr_shared_next(struct _qsbr_shared *shared);
 
-// Advance the write sequence as required and return the sequence goal to use
-// for a mimalloc page to be collected. The 'page_size' argument is the size
-// of the mimalloc page being deferred from collection.
-extern uint64_t
-_Py_qsbr_deferred_advance_for_page(struct _qsbr_thread_state *qsbr,
-                                   size_t page_size);
-
-// Return true if memory held by QSBR should be processed to determine if it
-// can be safely freed.
-extern bool
-_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr);
+// Return true if deferred memory frees held by QSBR should be processed to
+// determine if they can be safely freed.
+static inline bool
+_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr)
+{
+    return qsbr->should_process;
+}
 
 // Have the read sequences advanced to the given goal? If this returns true,
 // it safe to reclaim any memory tagged with the goal (or earlier goal).
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 783750c7c63e54..650a742a5a9f49 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -124,6 +124,29 @@ _PyMem_mi_page_is_safe_to_free(mi_page_t *page)
 }
 
+#ifdef Py_GIL_DISABLED
+
+// If we are deferring collection of more than this amount of memory for
+// mimalloc pages, advance the write sequence. Advancing allows these
+// pages to be re-used in a different thread or for a different size class.
+#define QSBR_PAGE_MEM_LIMIT 4096*20
+
+// Return true if the global write sequence should be advanced for a mimalloc
+// page that is deferred from collection.
+static bool
+should_advance_qsbr_for_page(struct _qsbr_thread_state *qsbr, mi_page_t *page)
+{
+    size_t bsize = mi_page_block_size(page);
+    size_t page_size = page->capacity*bsize;
+    qsbr->deferred_page_memory += page_size;
+    if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
+        qsbr->deferred_page_memory = 0;
+        return true;
+    }
+    return false;
+}
+#endif
+
 static bool
 _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
 {
@@ -140,8 +163,12 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
         _PyMem_mi_page_clear_qsbr(page);
         page->retire_expire = 0;
 
-        size_t bsize = mi_page_block_size(page);
-        page->qsbr_goal = _Py_qsbr_deferred_advance_for_page(tstate->qsbr, page->capacity*bsize);
+        if (should_advance_qsbr_for_page(tstate->qsbr, page)) {
+            page->qsbr_goal = _Py_qsbr_advance(tstate->qsbr->shared);
+        }
+        else {
+            page->qsbr_goal = _Py_qsbr_shared_next(tstate->qsbr->shared);
+        }
 
         llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node);
         return false;
@@ -1144,6 +1171,36 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state)
     }
 }
 
+
+#ifdef Py_GIL_DISABLED
+
+// For deferred advance on free: the number of deferred items before advancing
+// the write sequence. This is based on WORK_ITEMS_PER_CHUNK. We ideally
+// want to process a chunk before it overflows.
+#define QSBR_DEFERRED_LIMIT 127
+
+// If the deferred memory exceeds 1 MiB, advance the write sequence. This
+// helps limit memory usage due to QSBR delaying frees too long.
+#define QSBR_FREE_MEM_LIMIT 1024*1024
+
+// Return true if the global write sequence should be advanced for a deferred
+// memory free.
+static bool
+should_advance_qsbr_for_free(struct _qsbr_thread_state *qsbr, size_t size)
+{
+    qsbr->deferred_count++;
+    qsbr->deferred_memory += size;
+    if (qsbr->deferred_count > QSBR_DEFERRED_LIMIT ||
+            qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT) {
+        qsbr->deferred_count = 0;
+        qsbr->deferred_memory = 0;
+        qsbr->should_process = true;
+        return true;
+    }
+    return false;
+}
+#endif
+
 static void
 free_delayed(uintptr_t ptr, size_t size)
 {
@@ -1203,12 +1260,21 @@ free_delayed(uintptr_t ptr, size_t size)
     }
 
     assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
-    uint64_t seq = _Py_qsbr_deferred_advance_for_free(tstate->qsbr, size);
+    uint64_t seq;
+    if (should_advance_qsbr_for_free(tstate->qsbr, size)) {
+        seq = _Py_qsbr_advance(tstate->qsbr->shared);
+    }
+    else {
+        seq = _Py_qsbr_shared_next(tstate->qsbr->shared);
+    }
     buf->array[buf->wr_idx].ptr = ptr;
     buf->array[buf->wr_idx].qsbr_goal = seq;
     buf->wr_idx++;
 
-    if (_Py_qsbr_should_process(tstate->qsbr)) {
+    if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
+        // Normally the processing of delayed items is done from the eval
+        // breaker. Processing here is a safety measure to ensure too much
+        // work does not accumulate.
         _PyMem_ProcessDelayed((PyThreadState *)tstate);
     }
 #endif
@@ -1308,6 +1374,8 @@ _PyMem_ProcessDelayed(PyThreadState *tstate)
     PyInterpreterState *interp = tstate->interp;
     _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
 
+    tstate_impl->qsbr->should_process = false;
+
     // Process thread-local work
     process_queue(&tstate_impl->mem_free_queue, tstate_impl, true, NULL, NULL);
diff --git a/Python/ceval_gil.c b/Python/ceval_gil.c
index 6d2383ac7c1c65..020bd790b630e2 100644
--- a/Python/ceval_gil.c
+++ b/Python/ceval_gil.c
@@ -1387,6 +1387,10 @@ _Py_HandlePending(PyThreadState *tstate)
         _Py_unset_eval_breaker_bit(tstate, _PY_EVAL_EXPLICIT_MERGE_BIT);
         _Py_brc_merge_refcounts(tstate);
     }
+    /* Process deferred memory frees held by QSBR */
+    if (_Py_qsbr_should_process(((_PyThreadStateImpl *)tstate)->qsbr)) {
+        _PyMem_ProcessDelayed(tstate);
+    }
 #endif
 
     /* GC scheduled to run */
diff --git a/Python/qsbr.c b/Python/qsbr.c
index 54b20e3488e642..c9fad5c05ef108 100644
--- a/Python/qsbr.c
+++ b/Python/qsbr.c
@@ -41,20 +41,6 @@
 // Starting size of the array of qsbr thread states
 #define MIN_ARRAY_SIZE 8
 
-// For deferred advance on free: the number of deferred items before advancing
-// the write sequence. This is based on WORK_ITEMS_PER_CHUNK. We ideally
-// want to process a chunk before it overflows.
-#define QSBR_DEFERRED_LIMIT 127
-
-// If the deferred memory exceeds 1 MiB, we force an advance in the
-// shared QSBR sequence number to limit excess memory usage.
-#define QSBR_FREE_MEM_LIMIT 1024*1024
-
-// If we are deferring collection of more than this amount of memory for
-// mimalloc pages, advance the write sequence. Advancing allows these
-// pages to be re-used in a different thread or for a different size class.
-#define QSBR_PAGE_MEM_LIMIT 4096*10
-
 // Allocate a QSBR thread state from the freelist
 static struct _qsbr_thread_state *
 qsbr_allocate(struct _qsbr_shared *shared)
@@ -127,50 +113,9 @@ _Py_qsbr_advance(struct _qsbr_shared *shared)
 }
 
 uint64_t
-_Py_qsbr_deferred_advance_for_page(struct _qsbr_thread_state *qsbr, size_t page_size)
-{
-    qsbr->deferred_page_memory += page_size;
-    if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
-        qsbr->deferred_page_memory = 0;
-        // Advance the write sequence and return the updated value as the goal.
-        return _Py_qsbr_advance(qsbr->shared);
-    }
-    // Don't advance, return the next sequence value as the goal.
-    return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
-}
-
-uint64_t
-_Py_qsbr_deferred_advance_for_free(struct _qsbr_thread_state *qsbr, size_t free_size)
+_Py_qsbr_shared_next(struct _qsbr_shared *shared)
 {
-    qsbr->deferred_count++;
-    qsbr->deferred_memory += free_size;
-    if (qsbr->deferred_count >= QSBR_DEFERRED_LIMIT ||
-            qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT) {
-        qsbr->deferred_count = 0;
-        qsbr->deferred_memory = 0;
-        // Advance the write sequence
-        uint64_t seq = _Py_qsbr_advance(qsbr->shared);
-        if (qsbr->process_seq == 0) {
-            // Process the queue of deferred frees when the read sequence
-            // reaches this value. We don't process immediately because
-            // we want to give readers a chance to advance their sequence.
-            qsbr->process_seq = seq;
-        }
-        // Return current (just advanced) sequence as the goal.
-        return seq;
-    }
-    // Don't advance, return the next sequence value as the goal.
-    return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
-}
-
-bool
-_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr)
-{
-    if (qsbr->process_seq == 0 || qsbr->seq < qsbr->process_seq) {
-        return false;
-    }
-    qsbr->process_seq = 0;
-    return true;
+    return _Py_qsbr_shared_current(shared) + QSBR_INCR;
 }
 
 static uint64_t

From 03f01b0e81ed290a833c9fcd6469306f1c0a0521 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Tue, 24 Jun 2025 23:20:22 -0700
Subject: [PATCH 8/8] Avoid overflow in deferred memory accumulators.

---
 Objects/obmalloc.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index a6346c31b7042f..deb7fd957e57dd 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -138,6 +138,10 @@ should_advance_qsbr_for_page(struct _qsbr_thread_state *qsbr, mi_page_t *page)
 {
     size_t bsize = mi_page_block_size(page);
     size_t page_size = page->capacity*bsize;
+    if (page_size > QSBR_PAGE_MEM_LIMIT) {
+        qsbr->deferred_page_memory = 0;
+        return true;
+    }
     qsbr->deferred_page_memory += page_size;
     if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
         qsbr->deferred_page_memory = 0;
@@ -1188,6 +1192,12 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state)
 static bool
 should_advance_qsbr_for_free(struct _qsbr_thread_state *qsbr, size_t size)
 {
+    if (size > QSBR_FREE_MEM_LIMIT) {
+        qsbr->deferred_count = 0;
+        qsbr->deferred_memory = 0;
+        qsbr->should_process = true;
+        return true;
+    }
     qsbr->deferred_count++;
     qsbr->deferred_memory += size;
     if (qsbr->deferred_count > QSBR_DEFERRED_LIMIT ||