From 07674ada47e363df3f5691c09d6c99fd4bf95273 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 18 Nov 2022 15:41:15 +0300 Subject: [PATCH] =?UTF-8?q?mdbx:=20=D0=B4=D0=BE=D1=80=D0=B0=D0=B1=D0=BE?= =?UTF-8?q?=D1=82=D0=BA=D0=B0=20=D0=BF=D0=BE=D0=B4=D0=B3=D0=BE=D1=82=D0=BE?= =?UTF-8?q?=D0=B2=D0=BA=D0=B8=20=D1=80=D0=B5=D0=B7=D0=B5=D1=80=D0=B2=D0=B0?= =?UTF-8?q?=20=D0=BF=D0=B5=D1=80=D0=B5=D0=B4=20=D0=BE=D0=B1=D0=BD=D0=BE?= =?UTF-8?q?=D0=B2=D0=BB=D0=B5=D0=BD=D0=B8=D0=B5=D0=BC=20GC=20=D0=BF=D1=80?= =?UTF-8?q?=D0=B8=20=D0=B2=D0=BA=D0=BB=D1=8E=D1=87=D0=B5=D0=BD=D0=BD=D0=BE?= =?UTF-8?q?=D0=BC=20BigFoot.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core.c | 188 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 125 insertions(+), 63 deletions(-) diff --git a/src/core.c b/src/core.c index 335da529..407cd935 100644 --- a/src/core.c +++ b/src/core.c @@ -6761,6 +6761,7 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *mc, const size_t num, MDBX_cursor *const gc = (MDBX_cursor *)((char *)env->me_txn0 + sizeof(MDBX_txn)); + eASSERT(env, mc != gc && gc->mc_next == nullptr); gc->mc_txn = txn; gc->mc_flags = 0; @@ -7188,8 +7189,16 @@ done: } if (LOG_ENABLED(level)) debug_log(level, __func__, __LINE__, - "unable alloc %zu %s, flags 0x%x, errcode %d\n", num, what, - flags, ret.err); + "unable alloc %zu %s, alloc-flags 0x%x, err %d, txn-flags " + "0x%x, re-list-len %zu, loose-count %zu, gc: height %u, " + "branch %zu, leaf %zu, large %zu, entries %zu\n", + num, what, flags, ret.err, txn->mt_flags, + MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count, + txn->mt_dbs[FREE_DBI].md_depth, + (size_t)txn->mt_dbs[FREE_DBI].md_branch_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_leaf_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_overflow_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_entries); ret.page = NULL; } } else { @@ -9508,7 +9517,13 @@ static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { int err = MDBX_SUCCESS; - if (ctx->retired_stored) + if (ctx->retired_stored) { + MDBX_cursor *const gc = (MDBX_cursor *)((char *)txn + sizeof(MDBX_txn)); + tASSERT(txn, txn == txn->mt_env->me_txn0 && gc->mc_next == nullptr); + gc->mc_txn = txn; + gc->mc_flags = 0; + gc->mc_next = txn->mt_cursors[FREE_DBI]; + txn->mt_cursors[FREE_DBI] = gc; do { MDBX_val key, val; #if MDBX_ENABLE_BIGFOOT @@ -9517,11 +9532,10 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { key.iov_base = &txn->mt_txnid; #endif /* MDBX_ENABLE_BIGFOOT */ key.iov_len = sizeof(txnid_t); - const struct cursor_set_result csr = - cursor_set(&ctx->cursor, &key, &val, MDBX_SET); + const struct cursor_set_result csr = cursor_set(gc, &key, &val, MDBX_SET); if (csr.err == MDBX_SUCCESS && csr.exact) { ctx->retired_stored = 0; - err = mdbx_cursor_del(&ctx->cursor, 0); + err = mdbx_cursor_del(gc, 0); TRACE("== clear-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); } @@ -9531,6 +9545,9 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { #else while (0); #endif /* MDBX_ENABLE_BIGFOOT */ + txn->mt_cursors[FREE_DBI] = gc->mc_next; + gc->mc_next = nullptr; + } return err; } @@ -9544,28 +9561,45 @@ static int gcu_touch(gcu_context_t *ctx) { /* Prepare a backlog of pages to modify GC itself, while reclaiming is * prohibited. It should be enough to prevent search in page_alloc_slowpath() * during a deleting, when GC tree is unbalanced. */ -static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, - const bool reserve4retired) { - const size_t pages4retiredlist = - reserve4retired ? number_of_ovpages( - txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)) - : 0; - const size_t backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; - const size_t backlog4rebalance = backlog4cow + 1; +static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { + const size_t for_cow = txn->mt_dbs[FREE_DBI].md_depth; + const size_t for_rebalance = for_cow + 1; + size_t for_split = ctx->retired_stored == 0; - if (likely(pages4retiredlist < 2 && - gcu_backlog_size(txn) > (reserve4retired - ? backlog4rebalance - : (backlog4cow + backlog4rebalance)))) + const intptr_t retired_left = + MDBX_PNL_SIZEOF(txn->tw.retired_pages) - ctx->retired_stored; + size_t for_retiredlist = 0; + if (MDBX_ENABLE_BIGFOOT && retired_left > 0) { + for_retiredlist = (retired_left + txn->mt_env->me_maxgc_ov1page - 1) / + txn->mt_env->me_maxgc_ov1page; + const size_t per_branch_page = + (txn->mt_env->me_psize - PAGEHDRSZ) / + (sizeof(indx_t) + sizeof(MDBX_node) + sizeof(txnid_t)); + for (size_t entries = for_retiredlist; entries > 1; for_split += entries) + entries = (entries + per_branch_page - 1) / per_branch_page; + } else if (!MDBX_ENABLE_BIGFOOT && retired_left != 0) { + for_retiredlist = + number_of_ovpages(txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + } + + const size_t for_tree_before_touch = for_cow + for_rebalance + for_split; + const size_t for_tree_after_touch = for_rebalance + for_split; + const size_t for_data = for_retiredlist; + const size_t for_all_before_touch = for_data + for_tree_before_touch; + const size_t for_all_after_touch = for_data + for_tree_after_touch; + + if (likely(for_data < 2 && gcu_backlog_size(txn) > for_all_before_touch)) return MDBX_SUCCESS; - TRACE( - ">> reserve4retired %c, backlog %zu, 4list %zu, 4cow %zu, 4rebalance %zu", - reserve4retired ? 'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist, - backlog4cow, backlog4rebalance); + TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, " + "4split %zu, " + "4cow %zu, 4tree %zu)", + ctx->retired_stored, retired_left, gcu_backlog_size(txn), + for_all_before_touch, for_data, for_split, for_cow, + for_tree_before_touch); int err; - if (unlikely(pages4retiredlist > 2)) { + if (unlikely(for_data > 2)) { MDBX_val key, val; key.iov_base = val.iov_base = nullptr; key.iov_len = sizeof(txnid_t); @@ -9578,28 +9612,34 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx, err = gcu_touch(ctx); TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); - if (unlikely(pages4retiredlist > 1) && + if (!MDBX_ENABLE_BIGFOOT && unlikely(for_data > 1) && MDBX_PNL_GETSIZE(txn->tw.retired_pages) != ctx->retired_stored && err == MDBX_SUCCESS) { - tASSERT(txn, reserve4retired); - err = gcu_clean_stored_retired(txn, ctx); - if (unlikely(err != MDBX_SUCCESS)) - return err; - err = - page_alloc_slowpath(&ctx->cursor, pages4retiredlist, MDBX_ALLOC_RESERVE) - .err; + if (unlikely(ctx->retired_stored)) { + err = gcu_clean_stored_retired(txn, ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (!ctx->retired_stored) + return /* restart by tail-recursion */ gcu_prepare_backlog(txn, ctx); + } + err = page_alloc_slowpath(&ctx->cursor, for_data, MDBX_ALLOC_RESERVE).err; TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); cASSERT(&ctx->cursor, - gcu_backlog_size(txn) >= pages4retiredlist || err != MDBX_SUCCESS); + gcu_backlog_size(txn) >= for_data || err != MDBX_SUCCESS); } - while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist && - err == MDBX_SUCCESS) + while (gcu_backlog_size(txn) < for_all_after_touch && err == MDBX_SUCCESS) err = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE | MDBX_ALLOC_UNIMPORTANT) .err; - TRACE("<< backlog %zu, err %d", gcu_backlog_size(txn), err); + TRACE("<< backlog %zu, err %d, gc: height %u, branch %zu, leaf %zu, large " + "%zu, entries %zu", + gcu_backlog_size(txn), err, txn->mt_dbs[FREE_DBI].md_depth, + (size_t)txn->mt_dbs[FREE_DBI].md_branch_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_leaf_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_overflow_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_entries); return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; } @@ -9665,9 +9705,10 @@ retry: TRACE("%s", " >> continue"); if (ctx->retired_stored != MDBX_PNL_GETSIZE(txn->tw.retired_pages) && - (MDBX_PNL_GETSIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || + (ctx->loop == 1 || + MDBX_PNL_GETSIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page || ctx->retired_stored > env->me_maxgc_ov1page)) { - rc = gcu_prepare_backlog(txn, ctx, true); + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9696,7 +9737,7 @@ retry: if (unlikely(rc != MDBX_SUCCESS)) goto bailout; if (likely(!ctx->dense)) { - rc = gcu_prepare_backlog(txn, ctx, false); + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9730,7 +9771,7 @@ retry: if (ctx->cleaned_id > txn->tw.last_reclaimed) break; if (likely(!ctx->dense)) { - rc = gcu_prepare_backlog(txn, ctx, false); + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9767,30 +9808,29 @@ retry: /* handle loose pages - put ones into the reclaimed- or retired-list */ if (txn->tw.loose_pages) { + tASSERT(txn, txn->tw.loose_count > 0); /* Return loose page numbers to tw.relist, * though usually none are left at this point. * The pages themselves remain in dirtylist. */ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { - if (txn->tw.loose_count > 0) { - TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, - txn->tw.loose_count); - rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; - if (rc == MDBX_SUCCESS) { - TRACE("%s: retry since gc-slot for %zu loose-pages available", - dbg_prefix_mode, txn->tw.loose_count); - continue; - } - - /* Put loose page numbers in tw.retired_pages, - * since unable to return them to tw.relist. */ - if (unlikely((rc = pnl_need(&txn->tw.retired_pages, - txn->tw.loose_count)) != 0)) - goto bailout; - for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) - pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); - TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix_mode, - txn->tw.loose_count); + TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, + txn->tw.loose_count); + rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; + if (rc == MDBX_SUCCESS) { + TRACE("%s: retry since gc-slot for %zu loose-pages available", + dbg_prefix_mode, txn->tw.loose_count); + continue; } + + /* Put loose page numbers in tw.retired_pages, + * since unable to return them to tw.relist. */ + if (unlikely((rc = pnl_need(&txn->tw.retired_pages, + txn->tw.loose_count)) != 0)) + goto bailout; + for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) + pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix_mode, + txn->tw.loose_count); } else { /* Room for loose pages + temp PNL with same */ rc = pnl_need(&txn->tw.relist, 2 * txn->tw.loose_count + 2); @@ -9874,14 +9914,31 @@ retry: } retired_pages_before = MDBX_PNL_GETSIZE(txn->tw.retired_pages); - rc = gcu_prepare_backlog(txn, ctx, true); + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + if (retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + TRACE("%s: retired-list changed (%zu -> %zu), retry", dbg_prefix_mode, + retired_pages_before, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + break; + } pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); ctx->retired_stored = 0; ctx->bigfoot = txn->mt_txnid; do { + if (ctx->retired_stored) { + rc = gcu_prepare_backlog(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + if (ctx->retired_stored >= + MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + TRACE("%s: retired-list changed (%zu -> %zu), retry", + dbg_prefix_mode, retired_pages_before, + MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + break; + } + } key.iov_len = sizeof(txnid_t); key.iov_base = &ctx->bigfoot; const size_t left = @@ -9927,7 +9984,7 @@ retry: key.iov_len = sizeof(txnid_t); key.iov_base = &txn->mt_txnid; do { - gcu_prepare_backlog(txn, ctx, true); + gcu_prepare_backlog(txn, ctx); data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); rc = mdbx_cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) @@ -9940,7 +9997,7 @@ retry: eASSERT(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); - TRACE("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, + TRACE("%s: put-retired #%zu @ %" PRIaTXN, dbg_prefix_mode, ctx->retired_stored, txn->mt_txnid); #endif /* MDBX_ENABLE_BIGFOOT */ if (LOG_ENABLED(MDBX_LOG_EXTRA)) { @@ -9982,7 +10039,7 @@ retry: if (0 >= (intptr_t)left) break; - const size_t prefer_max_scatter = 257; + const size_t prefer_max_scatter = MDBX_ENABLE_BIGFOOT ? MDBX_TXL_MAX : 257; txnid_t reservation_gc_id; if (ctx->lifo) { if (txn->tw.lifo_reclaimed == nullptr) { @@ -10154,6 +10211,10 @@ retry: : (ctx->rid < INT16_MAX) ? (size_t)ctx->rid : INT16_MAX; if (avail_gc_slots > 1) { +#if MDBX_ENABLE_BIGFOOT + chunk = (chunk < env->me_maxgc_ov1page * 2) ? chunk / 2 + : env->me_maxgc_ov1page; +#else if (chunk < env->me_maxgc_ov1page * 2) chunk /= 2; else { @@ -10190,6 +10251,7 @@ retry: : tail; } } +#endif /* MDBX_ENABLE_BIGFOOT */ } } tASSERT(txn, chunk > 0); @@ -10217,7 +10279,7 @@ retry: data.iov_len = (chunk + 1) * sizeof(pgno_t); TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix_mode, chunk, ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); - gcu_prepare_backlog(txn, ctx, true); + gcu_prepare_backlog(txn, ctx); rc = mdbx_cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); tASSERT(txn, pnl_check_allocated(txn->tw.relist,