/// \copyright SPDX-License-Identifier: Apache-2.0 /// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 #include "internals.h" MDBX_NOTHROW_PURE_FUNCTION static bool is_lifo(const MDBX_txn *txn) { return (txn->env->flags & MDBX_LIFORECLAIM) != 0; } MDBX_MAYBE_UNUSED static inline const char *dbg_prefix(const gcu_t *ctx) { return is_lifo(ctx->cursor.txn) ? " lifo" : " fifo"; } static inline size_t backlog_size(MDBX_txn *txn) { return MDBX_PNL_GETSIZE(txn->tw.relist) + txn->tw.loose_count; } static int clean_stored_retired(MDBX_txn *txn, gcu_t *ctx) { int err = MDBX_SUCCESS; if (ctx->retired_stored) { MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); tASSERT(txn, txn == txn->env->basal_txn && gc->next == gc); gc->txn = txn; gc->dbi_state = txn->dbi_state; gc->top_and_flags = z_fresh_mark; gc->next = txn->cursors[FREE_DBI]; txn->cursors[FREE_DBI] = gc; do { MDBX_val key, val; #if MDBX_ENABLE_BIGFOOT key.iov_base = &ctx->bigfoot; #else key.iov_base = &txn->txnid; #endif /* MDBX_ENABLE_BIGFOOT */ key.iov_len = sizeof(txnid_t); const csr_t csr = cursor_seek(gc, &key, &val, MDBX_SET); if (csr.err == MDBX_SUCCESS && csr.exact) { ctx->retired_stored = 0; err = cursor_del(gc, 0); TRACE("== clear-4linear, backlog %zu, err %d", backlog_size(txn), err); } else err = (csr.err == MDBX_NOTFOUND) ? MDBX_SUCCESS : csr.err; } #if MDBX_ENABLE_BIGFOOT while (!err && --ctx->bigfoot >= txn->txnid); #else while (0); #endif /* MDBX_ENABLE_BIGFOOT */ txn->cursors[FREE_DBI] = gc->next; gc->next = gc; } return err; } static int touch_gc(gcu_t *ctx) { tASSERT(ctx->cursor.txn, is_pointed(&ctx->cursor) || ctx->cursor.txn->dbs[FREE_DBI].leaf_pages == 0); MDBX_val key, val; key.iov_base = val.iov_base = nullptr; key.iov_len = sizeof(txnid_t); val.iov_len = MDBX_PNL_SIZEOF(ctx->cursor.txn->tw.retired_pages); ctx->cursor.flags |= z_gcu_preparation; int err = cursor_touch(&ctx->cursor, &key, &val); ctx->cursor.flags -= z_gcu_preparation; return err; } /* Prepare a backlog of pages to modify GC itself, while reclaiming is * prohibited. It should be enough to prevent search in gc_alloc_ex() * during a deleting, when GC tree is unbalanced. */ static int prepare_backlog(MDBX_txn *txn, gcu_t *ctx) { const size_t for_cow = txn->dbs[FREE_DBI].height; const size_t for_rebalance = for_cow + 1 + (txn->dbs[FREE_DBI].height + 1ul >= txn->dbs[FREE_DBI].branch_pages); size_t for_split = ctx->retired_stored == 0; tASSERT(txn, is_pointed(&ctx->cursor) || txn->dbs[FREE_DBI].leaf_pages == 0); const intptr_t retired_left = MDBX_PNL_SIZEOF(txn->tw.retired_pages) - ctx->retired_stored; size_t for_relist = 0; if (MDBX_ENABLE_BIGFOOT && retired_left > 0) { for_relist = (retired_left + txn->env->maxgc_large1page - 1) / txn->env->maxgc_large1page; const size_t per_branch_page = txn->env->maxgc_per_branch; for (size_t entries = for_relist; entries > 1; for_split += entries) entries = (entries + per_branch_page - 1) / per_branch_page; } else if (!MDBX_ENABLE_BIGFOOT && retired_left != 0) { for_relist = largechunk_npages(txn->env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)); } const size_t for_tree_before_touch = for_cow + for_rebalance + for_split; const size_t for_tree_after_touch = for_rebalance + for_split; const size_t for_all_before_touch = for_relist + for_tree_before_touch; const size_t for_all_after_touch = for_relist + for_tree_after_touch; if (likely(for_relist < 2 && backlog_size(txn) > for_all_before_touch) && (ctx->cursor.top < 0 || is_modifable(txn, ctx->cursor.pg[ctx->cursor.top]))) return MDBX_SUCCESS; TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, " "4split %zu, " "4cow %zu, 4tree %zu)", ctx->retired_stored, retired_left, backlog_size(txn), for_all_before_touch, for_relist, for_split, for_cow, for_tree_before_touch); int err = touch_gc(ctx); TRACE("== after-touch, backlog %zu, err %d", backlog_size(txn), err); if (!MDBX_ENABLE_BIGFOOT && unlikely(for_relist > 1) && MDBX_PNL_GETSIZE(txn->tw.retired_pages) != ctx->retired_stored && err == MDBX_SUCCESS) { if (unlikely(ctx->retired_stored)) { err = clean_stored_retired(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; if (!ctx->retired_stored) return /* restart by tail-recursion */ prepare_backlog(txn, ctx); } err = gc_alloc_ex(&ctx->cursor, for_relist, ALLOC_RESERVE).err; TRACE("== after-4linear, backlog %zu, err %d", backlog_size(txn), err); cASSERT(&ctx->cursor, backlog_size(txn) >= for_relist || err != MDBX_SUCCESS); } while (backlog_size(txn) < for_all_after_touch && err == MDBX_SUCCESS) err = gc_alloc_ex(&ctx->cursor, 0, ALLOC_RESERVE | ALLOC_UNIMPORTANT).err; TRACE("<< backlog %zu, err %d, gc: height %u, branch %zu, leaf %zu, large " "%zu, entries %zu", backlog_size(txn), err, txn->dbs[FREE_DBI].height, (size_t)txn->dbs[FREE_DBI].branch_pages, (size_t)txn->dbs[FREE_DBI].leaf_pages, (size_t)txn->dbs[FREE_DBI].large_pages, (size_t)txn->dbs[FREE_DBI].items); tASSERT(txn, err != MDBX_NOTFOUND || (txn->flags & txn_gc_drained) != 0); return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; } static inline void zeroize_reserved(const MDBX_env *env, MDBX_val pnl) { #if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() * вызванное через макрос DVAL_DEBUG() на выходе * из cursor_seek(MDBX_SET_KEY), которая вызывается ниже внутри gc_update() в * цикле очистки и цикле заполнения зарезервированных элементов. */ memset(pnl.iov_base, 0xBB, pnl.iov_len); #endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ /* PNL is initially empty, zero out at least the length */ memset(pnl.iov_base, 0, sizeof(pgno_t)); if ((env->flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0) /* zero out to avoid leaking values from uninitialized malloc'ed memory * to the file in non-writemap mode if length of the saving page-list * was changed during space reservation. */ memset(pnl.iov_base, 0, pnl.iov_len); } static int gcu_loose(MDBX_txn *txn, gcu_t *ctx) { tASSERT(txn, txn->tw.loose_count > 0); /* Return loose page numbers to tw.relist, * though usually none are left at this point. * The pages themselves remain in dirtylist. */ if (unlikely(!txn->tw.gc.reclaimed && txn->tw.gc.last_reclaimed < 1)) { TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix(ctx), txn->tw.loose_count); int err = gc_alloc_ex(&ctx->cursor, 0, ALLOC_RESERVE).err; if (err == MDBX_SUCCESS) { TRACE("%s: retry since gc-slot for %zu loose-pages available", dbg_prefix(ctx), txn->tw.loose_count); return MDBX_RESULT_TRUE; } /* Put loose page numbers in tw.retired_pages, * since unable to return ones to tw.relist. */ err = pnl_need(&txn->tw.retired_pages, txn->tw.loose_count); if (unlikely(err != MDBX_SUCCESS)) return err; for (page_t *lp = txn->tw.loose_pages; lp; lp = page_next(lp)) { pnl_append_prereserved(txn->tw.retired_pages, lp->pgno); MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); } TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix(ctx), txn->tw.loose_count); } else { /* Room for loose pages + temp PNL with same */ int err = pnl_need(&txn->tw.relist, 2 * txn->tw.loose_count + 2); if (unlikely(err != MDBX_SUCCESS)) return err; pnl_t loose = txn->tw.relist + MDBX_PNL_ALLOCLEN(txn->tw.relist) - txn->tw.loose_count - 1; size_t count = 0; for (page_t *lp = txn->tw.loose_pages; lp; lp = page_next(lp)) { tASSERT(txn, lp->flags == P_LOOSE); loose[++count] = lp->pgno; MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); } tASSERT(txn, count == txn->tw.loose_count); MDBX_PNL_SETSIZE(loose, count); pnl_sort(loose, txn->geo.first_unallocated); pnl_merge(txn->tw.relist, loose); TRACE("%s: append %zu loose-pages to reclaimed-pages", dbg_prefix(ctx), txn->tw.loose_count); } /* filter-out list of dirty-pages from loose-pages */ dpl_t *const dl = txn->tw.dirtylist; if (dl) { tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); tASSERT(txn, dl->sorted <= dl->length); size_t w = 0, sorted_out = 0; for (size_t r = w; ++r <= dl->length;) { page_t *dp = dl->items[r].ptr; tASSERT(txn, dp->flags == P_LOOSE || is_modifable(txn, dp)); tASSERT(txn, dpl_endpgno(dl, r) <= txn->geo.first_unallocated); if ((dp->flags & P_LOOSE) == 0) { if (++w != r) dl->items[w] = dl->items[r]; } else { tASSERT(txn, dp->flags == P_LOOSE); sorted_out += dl->sorted >= r; if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP)) page_shadow_release(txn->env, dp, 1); } } TRACE("%s: filtered-out loose-pages from %zu -> %zu dirty-pages", dbg_prefix(ctx), dl->length, w); tASSERT(txn, txn->tw.loose_count == dl->length - w); dl->sorted -= sorted_out; tASSERT(txn, dl->sorted <= w); dpl_setlen(dl, w); dl->pages_including_loose -= txn->tw.loose_count; txn->tw.dirtyroom += txn->tw.loose_count; tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit)); } else { tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); } txn->tw.loose_pages = nullptr; txn->tw.loose_count = 0; #if MDBX_ENABLE_REFUND txn->tw.loose_refund_wl = 0; #endif /* MDBX_ENABLE_REFUND */ return MDBX_SUCCESS; } static int gcu_retired(MDBX_txn *txn, gcu_t *ctx) { int err; if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ err = outer_last(&ctx->cursor, nullptr, nullptr); if (likely(err == MDBX_SUCCESS)) err = touch_gc(ctx); if (unlikely(err != MDBX_SUCCESS) && err != MDBX_NOTFOUND) return err; } MDBX_val key, data; #if MDBX_ENABLE_BIGFOOT size_t retired_pages_before; do { if (ctx->bigfoot > txn->txnid) { err = clean_stored_retired(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; tASSERT(txn, ctx->bigfoot <= txn->txnid); } retired_pages_before = MDBX_PNL_GETSIZE(txn->tw.retired_pages); err = prepare_backlog(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; if (retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { TRACE("%s: retired-list changed (%zu -> %zu), retry", dbg_prefix(ctx), retired_pages_before, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); break; } pnl_sort(txn->tw.retired_pages, txn->geo.first_unallocated); ctx->retired_stored = 0; ctx->bigfoot = txn->txnid; do { if (ctx->retired_stored) { err = prepare_backlog(txn, ctx); if (unlikely(err != MDBX_SUCCESS)) return err; if (ctx->retired_stored >= MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { TRACE("%s: retired-list changed (%zu -> %zu), retry", dbg_prefix(ctx), retired_pages_before, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); break; } } key.iov_len = sizeof(txnid_t); key.iov_base = &ctx->bigfoot; const size_t left = MDBX_PNL_GETSIZE(txn->tw.retired_pages) - ctx->retired_stored; const size_t chunk = (left > txn->env->maxgc_large1page && ctx->bigfoot < MAX_TXNID) ? txn->env->maxgc_large1page : left; data.iov_len = (chunk + 1) * sizeof(pgno_t); err = cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE); if (unlikely(err != MDBX_SUCCESS)) return err; #if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() * вызванное через макрос DVAL_DEBUG() на выходе * из cursor_seek(MDBX_SET_KEY), которая вызывается как выше в цикле * очистки, так и ниже в цикле заполнения зарезервированных элементов. */ memset(data.iov_base, 0xBB, data.iov_len); #endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ if (retired_pages_before == MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { const size_t at = (is_lifo(txn) == MDBX_PNL_ASCENDING) ? left - chunk : ctx->retired_stored; pgno_t *const begin = txn->tw.retired_pages + at; /* MDBX_PNL_ASCENDING == false && LIFO == false: * - the larger pgno is at the beginning of retired list * and should be placed with the larger txnid. * MDBX_PNL_ASCENDING == true && LIFO == true: * - the larger pgno is at the ending of retired list * and should be placed with the smaller txnid. */ const pgno_t save = *begin; *begin = (pgno_t)chunk; memcpy(data.iov_base, begin, data.iov_len); *begin = save; TRACE("%s: put-retired/bigfoot @ %" PRIaTXN " (slice #%u) #%zu [%zu..%zu] of %zu", dbg_prefix(ctx), ctx->bigfoot, (unsigned)(ctx->bigfoot - txn->txnid), chunk, at, at + chunk, retired_pages_before); } ctx->retired_stored += chunk; } while (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages) && (++ctx->bigfoot, true)); } while (retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)); #else /* Write to last page of GC */ key.iov_len = sizeof(txnid_t); key.iov_base = &txn->txnid; do { prepare_backlog(txn, ctx); data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); err = cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE); if (unlikely(err != MDBX_SUCCESS)) return err; #if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() * вызванное через макрос DVAL_DEBUG() на выходе * из cursor_seek(MDBX_SET_KEY), которая вызывается как выше в цикле * очистки, так и ниже в цикле заполнения зарезервированных элементов. */ memset(data.iov_base, 0xBB, data.iov_len); #endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ /* Retry if tw.retired_pages[] grew during the Put() */ } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); ctx->retired_stored = MDBX_PNL_GETSIZE(txn->tw.retired_pages); pnl_sort(txn->tw.retired_pages, txn->geo.first_unallocated); tASSERT(txn, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); TRACE("%s: put-retired #%zu @ %" PRIaTXN, dbg_prefix(ctx), ctx->retired_stored, txn->txnid); #endif /* MDBX_ENABLE_BIGFOOT */ if (LOG_ENABLED(MDBX_LOG_EXTRA)) { size_t i = ctx->retired_stored; DEBUG_EXTRA("txn %" PRIaTXN " root %" PRIaPGNO " num %zu, retired-PNL", txn->txnid, txn->dbs[FREE_DBI].root, i); for (; i; i--) DEBUG_EXTRA_PRINT(" %" PRIaPGNO, txn->tw.retired_pages[i]); DEBUG_EXTRA_PRINT("%s\n", "."); } return MDBX_SUCCESS; } typedef struct gcu_rid_result { int err; txnid_t rid; } rid_t; static rid_t get_rid_for_reclaimed(MDBX_txn *txn, gcu_t *ctx, const size_t left) { rid_t r; if (is_lifo(txn)) { if (txn->tw.gc.reclaimed == nullptr) { txn->tw.gc.reclaimed = txl_alloc(); if (unlikely(!txn->tw.gc.reclaimed)) { r.err = MDBX_ENOMEM; goto return_error; } } if (MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) < txl_max && left > (MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot) * txn->env->maxgc_large1page && !ctx->dense) { /* Hужен свободный для для сохранения списка страниц. */ bool need_cleanup = false; txnid_t snap_oldest = 0; retry_rid: do { r.err = gc_alloc_ex(&ctx->cursor, 0, ALLOC_RESERVE).err; snap_oldest = txn->env->lck->cached_oldest.weak; if (likely(r.err == MDBX_SUCCESS)) { TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix(ctx), MDBX_PNL_LAST(txn->tw.gc.reclaimed)); need_cleanup = true; } } while (r.err == MDBX_SUCCESS && MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) < txl_max && left > (MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot) * txn->env->maxgc_large1page); if (likely(r.err == MDBX_SUCCESS)) { TRACE("%s: got enough from GC.", dbg_prefix(ctx)); goto return_continue; } else if (unlikely(r.err != MDBX_NOTFOUND)) /* LY: some troubles... */ goto return_error; if (MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)) { if (need_cleanup) { txl_sort(txn->tw.gc.reclaimed); ctx->cleaned_slot = 0; } ctx->rid = MDBX_PNL_LAST(txn->tw.gc.reclaimed); } else { tASSERT(txn, txn->tw.gc.last_reclaimed == 0); if (unlikely(txn_snapshot_oldest(txn) != snap_oldest)) /* should retry gc_alloc_ex() * if the oldest reader changes since the last attempt */ goto retry_rid; /* no reclaimable GC entries, * therefore no entries with ID < mdbx_find_oldest(txn) */ txn->tw.gc.last_reclaimed = ctx->rid = snap_oldest; TRACE("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix(ctx), ctx->rid); } /* В GC нет годных к переработке записей, * будем использовать свободные id в обратном порядке. */ while (MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) < txl_max && left > (MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot) * txn->env->maxgc_large1page) { if (unlikely(ctx->rid <= MIN_TXNID)) { ctx->dense = true; if (unlikely(MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) <= ctx->reused_slot)) { NOTICE("** restart: reserve depleted (reused_gc_slot %zu >= " "gc.reclaimed %zu)", ctx->reused_slot, MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)); goto return_restart; } break; } tASSERT(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID); ctx->rid -= 1; MDBX_val key = {&ctx->rid, sizeof(ctx->rid)}, data; r.err = cursor_seek(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; if (unlikely(r.err == MDBX_SUCCESS)) { DEBUG("%s: GC's id %" PRIaTXN " is present, going to first", dbg_prefix(ctx), ctx->rid); r.err = outer_first(&ctx->cursor, &key, nullptr); if (unlikely(r.err != MDBX_SUCCESS || key.iov_len != sizeof(txnid_t))) { ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-key size", (unsigned)key.iov_len); r.err = MDBX_CORRUPTED; goto return_error; } const txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); if (unlikely(gc_first <= INITIAL_TXNID)) { NOTICE("%s: no free GC's id(s) less than %" PRIaTXN " (going dense-mode)", dbg_prefix(ctx), ctx->rid); ctx->dense = true; goto return_restart; } ctx->rid = gc_first - 1; } tASSERT(txn, !ctx->dense); r.err = txl_append(&txn->tw.gc.reclaimed, ctx->rid); if (unlikely(r.err != MDBX_SUCCESS)) goto return_error; if (ctx->reused_slot) /* rare case, but it is better to clear and re-create GC entries * with less fragmentation. */ need_cleanup = true; else ctx->cleaned_slot += 1 /* mark cleanup is not needed for added slot. */; TRACE("%s: append @%" PRIaTXN " to lifo-reclaimed, cleaned-gc-slot = %zu", dbg_prefix(ctx), ctx->rid, ctx->cleaned_slot); } if (need_cleanup) { if (ctx->cleaned_slot) { TRACE("%s: restart to clear and re-create GC entries", dbg_prefix(ctx)); goto return_restart; } goto return_continue; } } const size_t i = MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot; tASSERT(txn, i > 0 && i <= MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)); r.rid = txn->tw.gc.reclaimed[i]; TRACE("%s: take @%" PRIaTXN " from lifo-reclaimed[%zu]", dbg_prefix(ctx), r.rid, i); } else { tASSERT(txn, txn->tw.gc.reclaimed == nullptr); if (unlikely(ctx->rid == 0)) { ctx->rid = txn_snapshot_oldest(txn); MDBX_val key; r.err = outer_first(&ctx->cursor, &key, nullptr); if (likely(r.err == MDBX_SUCCESS)) { if (unlikely(key.iov_len != sizeof(txnid_t))) { ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-key size", (unsigned)key.iov_len); r.err = MDBX_CORRUPTED; goto return_error; } const txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); if (ctx->rid >= gc_first && gc_first) ctx->rid = gc_first - 1; if (unlikely(ctx->rid <= MIN_TXNID)) { ERROR("%s", "** no GC tail-space to store (going dense-mode)"); ctx->dense = true; goto return_restart; } } else if (r.err != MDBX_NOTFOUND) { r.rid = 0; return r; } txn->tw.gc.last_reclaimed = ctx->rid; ctx->cleaned_id = ctx->rid + 1; } r.rid = ctx->rid--; TRACE("%s: take @%" PRIaTXN " from GC", dbg_prefix(ctx), r.rid); } ++ctx->reused_slot; r.err = MDBX_SUCCESS; return r; return_continue: r.err = MDBX_SUCCESS; r.rid = 0; return r; return_restart: r.err = MDBX_RESULT_TRUE; r.rid = 0; return r; return_error: tASSERT(txn, r.err != MDBX_SUCCESS); r.rid = 0; return r; } /* Cleanups reclaimed GC (aka freeDB) records, saves the retired-list (aka * freelist) of current transaction to GC, puts back into GC leftover of the * reclaimed pages with chunking. This recursive changes the reclaimed-list, * loose-list and retired-list. Keep trying until it stabilizes. * * NOTE: This code is a consequence of many iterations of adding crutches (aka * "checks and balances") to partially bypass the fundamental design problems * inherited from LMDB. So do not try to understand it completely in order to * avoid your madness. */ int gc_update(MDBX_txn *txn, gcu_t *ctx) { TRACE("\n>>> @%" PRIaTXN, txn->txnid); MDBX_env *const env = txn->env; ctx->cursor.next = txn->cursors[FREE_DBI]; txn->cursors[FREE_DBI] = &ctx->cursor; int rc; /* txn->tw.relist[] can grow and shrink during this call. * txn->tw.gc.last_reclaimed and txn->tw.retired_pages[] can only grow. * But page numbers cannot disappear from txn->tw.retired_pages[]. */ retry_clean_adj: ctx->reserve_adj = 0; retry: ctx->loop += !(ctx->prev_first_unallocated > txn->geo.first_unallocated); TRACE(">> restart, loop %u", ctx->loop); tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - MDBX_ENABLE_REFUND)); tASSERT(txn, dpl_check(txn)); if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { ERROR("txn #%" PRIaTXN " too more loops %u, bailout", txn->txnid, ctx->loop); rc = MDBX_PROBLEM; goto bailout; } if (unlikely(ctx->dense || ctx->prev_first_unallocated > txn->geo.first_unallocated)) { rc = clean_stored_retired(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } ctx->prev_first_unallocated = txn->geo.first_unallocated; rc = MDBX_SUCCESS; ctx->reserved = 0; ctx->cleaned_slot = 0; ctx->reused_slot = 0; ctx->amount = 0; ctx->fill_idx = ~0u; ctx->cleaned_id = 0; ctx->rid = txn->tw.gc.last_reclaimed; while (true) { /* Come back here after each Put() in case retired-list changed */ TRACE("%s", " >> continue"); tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - MDBX_ENABLE_REFUND)); MDBX_val key, data; if (is_lifo(txn)) { if (ctx->cleaned_slot < (txn->tw.gc.reclaimed ? MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) : 0)) { ctx->reserved = 0; ctx->cleaned_slot = 0; ctx->reused_slot = 0; ctx->fill_idx = ~0u; /* LY: cleanup reclaimed records. */ do { ctx->cleaned_id = txn->tw.gc.reclaimed[++ctx->cleaned_slot]; tASSERT(txn, ctx->cleaned_slot > 0 && ctx->cleaned_id <= env->lck->cached_oldest.weak); key.iov_base = &ctx->cleaned_id; key.iov_len = sizeof(ctx->cleaned_id); rc = cursor_seek(&ctx->cursor, &key, nullptr, MDBX_SET).err; if (rc == MDBX_NOTFOUND) continue; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; rc = prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; tASSERT(txn, ctx->cleaned_id <= env->lck->cached_oldest.weak); TRACE("%s: cleanup-reclaimed-id [%zu]%" PRIaTXN, dbg_prefix(ctx), ctx->cleaned_slot, ctx->cleaned_id); tASSERT(txn, *txn->cursors == &ctx->cursor); rc = cursor_del(&ctx->cursor, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } while (ctx->cleaned_slot < MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)); txl_sort(txn->tw.gc.reclaimed); } } else { /* Удаляем оставшиеся вынутые из GC записи. */ while (txn->tw.gc.last_reclaimed && ctx->cleaned_id <= txn->tw.gc.last_reclaimed) { rc = outer_first(&ctx->cursor, &key, nullptr); if (rc == MDBX_NOTFOUND) { ctx->cleaned_id = txn->tw.gc.last_reclaimed + 1; ctx->rid = txn->tw.gc.last_reclaimed; ctx->reserved = 0; ctx->reused_slot = 0; break; } if (unlikely(rc != MDBX_SUCCESS)) goto bailout; if (!MDBX_DISABLE_VALIDATION && unlikely(key.iov_len != sizeof(txnid_t))) { ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-key size", (unsigned)key.iov_len); rc = MDBX_CORRUPTED; goto bailout; } if (ctx->rid != ctx->cleaned_id) { ctx->rid = ctx->cleaned_id; ctx->reserved = 0; ctx->reused_slot = 0; } ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base); if (ctx->cleaned_id > txn->tw.gc.last_reclaimed) break; rc = prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; tASSERT(txn, ctx->cleaned_id <= txn->tw.gc.last_reclaimed); tASSERT(txn, ctx->cleaned_id <= env->lck->cached_oldest.weak); TRACE("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix(ctx), ctx->cleaned_id); tASSERT(txn, *txn->cursors == &ctx->cursor); rc = cursor_del(&ctx->cursor, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - MDBX_ENABLE_REFUND)); tASSERT(txn, dpl_check(txn)); if (AUDIT_ENABLED()) { rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } /* return suitable into unallocated space */ if (txn_refund(txn)) { tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - MDBX_ENABLE_REFUND)); if (AUDIT_ENABLED()) { rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } if (txn->tw.loose_pages) { /* put loose pages into the reclaimed- or retired-list */ rc = gcu_loose(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_RESULT_TRUE) continue; goto bailout; } tASSERT(txn, txn->tw.loose_pages == 0); } if (unlikely(ctx->reserved > MDBX_PNL_GETSIZE(txn->tw.relist)) && (ctx->loop < 5 || ctx->reserved - MDBX_PNL_GETSIZE(txn->tw.relist) > env->maxgc_large1page / 2)) { TRACE("%s: reclaimed-list changed %zu -> %zu, retry", dbg_prefix(ctx), ctx->amount, MDBX_PNL_GETSIZE(txn->tw.relist)); ctx->reserve_adj += ctx->reserved - MDBX_PNL_GETSIZE(txn->tw.relist); goto retry; } ctx->amount = MDBX_PNL_GETSIZE(txn->tw.relist); if (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { /* store retired-list into GC */ rc = gcu_retired(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; continue; } tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - MDBX_ENABLE_REFUND)); tASSERT(txn, txn->tw.loose_count == 0); TRACE("%s", " >> reserving"); if (AUDIT_ENABLED()) { rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } const size_t left = ctx->amount - ctx->reserved - ctx->reserve_adj; TRACE("%s: amount %zu, reserved %zd, reserve_adj %zu, left %zd, " "lifo-reclaimed-slots %zu, " "reused-gc-slots %zu", dbg_prefix(ctx), ctx->amount, ctx->reserved, ctx->reserve_adj, left, txn->tw.gc.reclaimed ? MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) : 0, ctx->reused_slot); if (0 >= (intptr_t)left) break; const rid_t rid_result = get_rid_for_reclaimed(txn, ctx, left); if (unlikely(!rid_result.rid)) { rc = rid_result.err; if (likely(rc == MDBX_SUCCESS)) continue; if (likely(rc == MDBX_RESULT_TRUE)) goto retry; goto bailout; } tASSERT(txn, rid_result.err == MDBX_SUCCESS); const txnid_t reservation_gc_id = rid_result.rid; size_t chunk = left; if (unlikely(left > env->maxgc_large1page)) { const size_t avail_gc_slots = txn->tw.gc.reclaimed ? MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot + 1 : (ctx->rid < INT16_MAX) ? (size_t)ctx->rid : INT16_MAX; if (likely(avail_gc_slots > 1)) { #if MDBX_ENABLE_BIGFOOT chunk = env->maxgc_large1page; if (avail_gc_slots < INT16_MAX && unlikely(left > env->maxgc_large1page * avail_gc_slots)) /* TODO: Можно смотреть последовательности какой длины есть в relist * и пробовать нарезать куски соответствующего размера. * Смысл в том, чтобы не дробить последовательности страниц, * а использовать целиком. */ chunk = env->maxgc_large1page + left / (env->maxgc_large1page * avail_gc_slots) * env->maxgc_large1page; #else if (chunk < env->maxgc_large1page * 2) chunk /= 2; else { const size_t prefer_max_scatter = 257; const size_t threshold = env->maxgc_large1page * ((avail_gc_slots < prefer_max_scatter) ? avail_gc_slots : prefer_max_scatter); if (left < threshold) chunk = env->maxgc_large1page; else { const size_t tail = left - threshold + env->maxgc_large1page + 1; size_t span = 1; size_t avail = ((pgno2bytes(env, span) - PAGEHDRSZ) / sizeof(pgno_t)) /* - 1 + span */; if (tail > avail) { for (size_t i = ctx->amount - span; i > 0; --i) { if (MDBX_PNL_ASCENDING ? (txn->tw.relist[i] + span) : (txn->tw.relist[i] - span) == txn->tw.relist[i + span]) { span += 1; avail = ((pgno2bytes(env, span) - PAGEHDRSZ) / sizeof(pgno_t)) - 1 + span; if (avail >= tail) break; } } } chunk = (avail >= tail) ? tail - span : (avail_gc_slots > 3 && ctx->reused_slot < prefer_max_scatter - 3) ? avail - span : tail; } } #endif /* MDBX_ENABLE_BIGFOOT */ } } tASSERT(txn, chunk > 0); TRACE("%s: gc_rid %" PRIaTXN ", reused_gc_slot %zu, reservation-id " "%" PRIaTXN, dbg_prefix(ctx), ctx->rid, ctx->reused_slot, reservation_gc_id); TRACE("%s: chunk %zu, gc-per-ovpage %u", dbg_prefix(ctx), chunk, env->maxgc_large1page); tASSERT(txn, reservation_gc_id <= env->lck->cached_oldest.weak); if (unlikely(reservation_gc_id < MIN_TXNID || reservation_gc_id > atomic_load64(&env->lck->cached_oldest, mo_Relaxed))) { ERROR("** internal error (reservation_gc_id %" PRIaTXN ")", reservation_gc_id); rc = MDBX_PROBLEM; goto bailout; } tASSERT(txn, reservation_gc_id >= MIN_TXNID && reservation_gc_id <= MAX_TXNID); key.iov_len = sizeof(reservation_gc_id); key.iov_base = (void *)&reservation_gc_id; data.iov_len = (chunk + 1) * sizeof(pgno_t); TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix(ctx), chunk, ctx->reserved + 1, ctx->reserved + chunk + 1, reservation_gc_id); prepare_backlog(txn, ctx); rc = cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - MDBX_ENABLE_REFUND)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; zeroize_reserved(env, data); ctx->reserved += chunk; TRACE("%s: reserved %zu (+%zu), continue", dbg_prefix(ctx), ctx->reserved, chunk); continue; } tASSERT(txn, ctx->cleaned_slot == (txn->tw.gc.reclaimed ? MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) : 0)); TRACE("%s", " >> filling"); /* Fill in the reserved records */ size_t excess_slots = 0; ctx->fill_idx = txn->tw.gc.reclaimed ? MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot : ctx->reused_slot; rc = MDBX_SUCCESS; tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - MDBX_ENABLE_REFUND)); tASSERT(txn, dpl_check(txn)); if (ctx->amount) { MDBX_val key, data; key.iov_len = data.iov_len = 0; key.iov_base = data.iov_base = nullptr; size_t left = ctx->amount, excess = 0; if (txn->tw.gc.reclaimed == nullptr) { tASSERT(txn, is_lifo(txn) == 0); rc = outer_first(&ctx->cursor, &key, &data); if (unlikely(rc != MDBX_SUCCESS)) { if (rc != MDBX_NOTFOUND) goto bailout; } } else { tASSERT(txn, is_lifo(txn) != 0); } while (true) { txnid_t fill_gc_id; TRACE("%s: left %zu of %zu", dbg_prefix(ctx), left, MDBX_PNL_GETSIZE(txn->tw.relist)); if (txn->tw.gc.reclaimed == nullptr) { tASSERT(txn, is_lifo(txn) == 0); fill_gc_id = key.iov_base ? unaligned_peek_u64(4, key.iov_base) : MIN_TXNID; if (ctx->fill_idx == 0 || fill_gc_id > txn->tw.gc.last_reclaimed) { if (!left) break; NOTICE("** restart: reserve depleted (fill_idx %zu, fill_id %" PRIaTXN " > last_reclaimed %" PRIaTXN ", left %zu", ctx->fill_idx, fill_gc_id, txn->tw.gc.last_reclaimed, left); ctx->reserve_adj = (ctx->reserve_adj > left) ? ctx->reserve_adj - left : 0; goto retry; } ctx->fill_idx -= 1; } else { tASSERT(txn, is_lifo(txn) != 0); if (ctx->fill_idx >= MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)) { if (!left) break; NOTICE("** restart: reserve depleted (fill_idx %zu >= " "gc.reclaimed %zu, left %zu", ctx->fill_idx, MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed), left); ctx->reserve_adj = (ctx->reserve_adj > left) ? ctx->reserve_adj - left : 0; goto retry; } ctx->fill_idx += 1; fill_gc_id = txn->tw.gc.reclaimed[ctx->fill_idx]; TRACE("%s: seek-reservation @%" PRIaTXN " at gc.reclaimed[%zu]", dbg_prefix(ctx), fill_gc_id, ctx->fill_idx); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); rc = cursor_seek(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } tASSERT(txn, ctx->cleaned_slot == (txn->tw.gc.reclaimed ? MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) : 0)); tASSERT(txn, fill_gc_id > 0 && fill_gc_id <= env->lck->cached_oldest.weak); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); size_t chunk = data.iov_len / sizeof(pgno_t) - 1; if (unlikely(chunk > left)) { const size_t delta = chunk - left; excess += delta; TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix(ctx), chunk, left, fill_gc_id); if (!left) { excess_slots += 1; goto next; } if ((ctx->loop < 5 && delta > (ctx->loop / 2)) || delta > env->maxgc_large1page) data.iov_len = (left + 1) * sizeof(pgno_t); chunk = left; } rc = cursor_put(&ctx->cursor, &key, &data, MDBX_CURRENT | MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; zeroize_reserved(env, data); if (unlikely(txn->tw.loose_count || ctx->amount != MDBX_PNL_GETSIZE(txn->tw.relist))) { NOTICE("** restart: reclaimed-list changed (%zu -> %zu, loose +%zu)", ctx->amount, MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count); if (ctx->loop < 5 || (ctx->loop > 10 && (ctx->loop & 1))) goto retry_clean_adj; goto retry; } if (unlikely(txn->tw.gc.reclaimed ? ctx->cleaned_slot < MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) : ctx->cleaned_id < txn->tw.gc.last_reclaimed)) { NOTICE("%s", "** restart: reclaimed-slots changed"); goto retry; } if (unlikely(ctx->retired_stored != MDBX_PNL_GETSIZE(txn->tw.retired_pages))) { tASSERT(txn, ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)); NOTICE("** restart: retired-list growth (%zu -> %zu)", ctx->retired_stored, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); goto retry; } pgno_t *dst = data.iov_base; *dst++ = (pgno_t)chunk; pgno_t *src = MDBX_PNL_BEGIN(txn->tw.relist) + left - chunk; memcpy(dst, src, chunk * sizeof(pgno_t)); pgno_t *from = src, *to = src + chunk; TRACE("%s: fill %zu [ %zu:%" PRIaPGNO "...%zu:%" PRIaPGNO "] @%" PRIaTXN, dbg_prefix(ctx), chunk, from - txn->tw.relist, from[0], to - txn->tw.relist, to[-1], fill_gc_id); left -= chunk; if (AUDIT_ENABLED()) { rc = audit_ex(txn, ctx->retired_stored + ctx->amount - left, true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } next: if (txn->tw.gc.reclaimed == nullptr) { tASSERT(txn, is_lifo(txn) == 0); rc = outer_next(&ctx->cursor, &key, &data, MDBX_NEXT); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND && !left) { rc = MDBX_SUCCESS; break; } goto bailout; } } else { tASSERT(txn, is_lifo(txn) != 0); } } if (excess) { size_t n = excess, adj = excess; while (n >= env->maxgc_large1page) adj -= n /= env->maxgc_large1page; ctx->reserve_adj += adj; TRACE("%s: extra %zu reserved space, adj +%zu (%zu)", dbg_prefix(ctx), excess, adj, ctx->reserve_adj); } } tASSERT(txn, rc == MDBX_SUCCESS); if (unlikely(txn->tw.loose_count != 0 || ctx->amount != MDBX_PNL_GETSIZE(txn->tw.relist))) { NOTICE("** restart: got %zu loose pages (reclaimed-list %zu -> %zu)", txn->tw.loose_count, ctx->amount, MDBX_PNL_GETSIZE(txn->tw.relist)); goto retry; } if (unlikely(excess_slots)) { const bool will_retry = ctx->loop < 5 || excess_slots > 1; NOTICE("** %s: reserve excess (excess-slots %zu, filled-slot %zu, adj %zu, " "loop %u)", will_retry ? "restart" : "ignore", excess_slots, ctx->fill_idx, ctx->reserve_adj, ctx->loop); if (will_retry) goto retry; } tASSERT(txn, txn->tw.gc.reclaimed == nullptr || ctx->cleaned_slot == MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)); bailout: txn->cursors[FREE_DBI] = ctx->cursor.next; MDBX_PNL_SETSIZE(txn->tw.relist, 0); #if MDBX_ENABLE_PROFGC env->lck->pgops.gc_prof.wloops += (uint32_t)ctx->loop; #endif /* MDBX_ENABLE_PROFGC */ TRACE("<<< %u loops, rc = %d", ctx->loop, rc); return rc; }