mdbx: add update-gc context (extracted from bigfoot).

This commit is contained in:
Леонид Юрьев (Leonid Yuriev) 2022-06-20 20:16:54 +03:00
parent f5280ebf6e
commit f1ccc717b4
2 changed files with 289 additions and 246 deletions

View File

@ -9090,77 +9090,110 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored,
return MDBX_PROBLEM; return MDBX_PROBLEM;
} }
static __always_inline unsigned backlog_size(MDBX_txn *txn) { typedef struct gc_update_context {
unsigned retired_stored, loop;
unsigned settled, cleaned_slot, reused_slot, filled_slot;
txnid_t cleaned_id, rid;
bool lifo, dense;
MDBX_cursor_couple cursor;
} gcu_context_t;
/* Prepare a GC-update context for the given write transaction.
 *
 * Zeroes every field of *ctx that is laid out before the embedded cursor
 * couple, records whether the environment has MDBX_LIFORECLAIM set, and then
 * initializes the outer cursor on FREE_DBI.
 *
 * Returns whatever mdbx_cursor_init() returns for the FREE_DBI cursor. */
static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) {
  /* Only the bookkeeping fields preceding `cursor` are cleared here;
   * the cursor itself is set up by mdbx_cursor_init() below. */
  const size_t bookkeeping_bytes = offsetof(gcu_context_t, cursor);
  memset(ctx, 0, bookkeeping_bytes);

  const unsigned env_flags = txn->mt_env->me_flags;
  ctx->lifo = (env_flags & MDBX_LIFORECLAIM) ? true : false;

  MDBX_cursor *const gc_cursor = &ctx->cursor.outer;
  return mdbx_cursor_init(gc_cursor, txn, FREE_DBI);
}
static __always_inline unsigned gcu_backlog_size(MDBX_txn *txn) {
return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count;
} }
/* Drop the retired-pages record that this transaction has already stored
 * in GC, if any.
 *
 * Does nothing when ctx->retired_stored is zero. Otherwise positions the
 * context's outer cursor on the key equal to txn->mt_txnid (MDBX_SET) and,
 * only when that lookup succeeds with an exact match, resets
 * ctx->retired_stored to 0 and deletes the record under the cursor.
 *
 * Returns the result of mdbx_cursor_del() when a record was deleted,
 * and MDBX_SUCCESS in every other case; note that a failed or inexact
 * lookup is not reported to the caller. */
static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) {
  if (!ctx->retired_stored)
    return MDBX_SUCCESS;

  MDBX_cursor *const gc_cursor = &ctx->cursor.outer;
  MDBX_val lookup_key, found_data;
  lookup_key.iov_base = &txn->mt_txnid;
  lookup_key.iov_len = sizeof(txnid_t);

  const struct cursor_set_result found =
      mdbx_cursor_set(gc_cursor, &lookup_key, &found_data, MDBX_SET);
  if (found.err != MDBX_SUCCESS || !found.exact)
    return MDBX_SUCCESS; /* nothing stored under this txnid to remove */

  /* The counter is reset before the deletion is attempted. */
  ctx->retired_stored = 0;
  int rc = mdbx_cursor_del(gc_cursor, 0);
  mdbx_trace("== clear-4linear, backlog %u, err %d", gcu_backlog_size(txn),
             rc);
  return rc;
}
/* LY: Prepare a backlog of pages to modify GC itself, /* LY: Prepare a backlog of pages to modify GC itself,
* while reclaiming is prohibited. It should be enough to prevent search * while reclaiming is prohibited. It should be enough to prevent search
* in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. */ * in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. */
static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *gc_cursor, static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx,
const size_t pnl_bytes, unsigned *retired_stored) { const bool reserve4retired) {
const unsigned linear4list = number_of_ovpages(txn->mt_env, pnl_bytes); const unsigned pages4retiredlist =
reserve4retired ? number_of_ovpages(
txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages))
: 0;
const unsigned backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; const unsigned backlog4cow = txn->mt_dbs[FREE_DBI].md_depth;
const unsigned backlog4rebalance = backlog4cow + 1; const unsigned backlog4rebalance = backlog4cow + 1;
if (likely(linear4list == 1 && if (likely(pages4retiredlist < 2 &&
backlog_size(txn) > (pnl_bytes gcu_backlog_size(txn) > (reserve4retired
? backlog4rebalance ? backlog4rebalance
: (backlog4cow + backlog4rebalance)))) : (backlog4cow + backlog4rebalance))))
return MDBX_SUCCESS; return MDBX_SUCCESS;
mdbx_trace(">> pnl_bytes %zu, backlog %u, 4list %u, 4cow %u, 4rebalance %u", mdbx_trace(
pnl_bytes, backlog_size(txn), linear4list, backlog4cow, ">> reserve4retired %c, backlog %u, 4list %u, 4cow %u, 4rebalance %u",
backlog4rebalance); reserve4retired ? 'Y' : 'N', gcu_backlog_size(txn), pages4retiredlist,
backlog4cow, backlog4rebalance);
MDBX_val gc_key, fake_val;
int err; int err;
if (unlikely(linear4list > 2)) { if (unlikely(pages4retiredlist > 2)) {
gc_key.iov_base = fake_val.iov_base = nullptr; MDBX_val key, val;
gc_key.iov_len = sizeof(txnid_t); key.iov_base = val.iov_base = nullptr;
fake_val.iov_len = pnl_bytes; key.iov_len = sizeof(txnid_t);
err = mdbx_cursor_spill(gc_cursor, &gc_key, &fake_val); val.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages);
err = mdbx_cursor_spill(&ctx->cursor.outer, &key, &val);
if (unlikely(err != MDBX_SUCCESS)) if (unlikely(err != MDBX_SUCCESS))
return err; return err;
} }
gc_cursor->mc_flags &= ~C_RECLAIMING; ctx->cursor.outer.mc_flags &= ~C_RECLAIMING;
err = mdbx_cursor_touch(gc_cursor); err = mdbx_cursor_touch(&ctx->cursor.outer);
mdbx_trace("== after-touch, backlog %u, err %d", backlog_size(txn), err); mdbx_trace("== after-touch, backlog %u, err %d", gcu_backlog_size(txn), err);
if (unlikely(linear4list > 1) && err == MDBX_SUCCESS) { if (unlikely(pages4retiredlist > 1) &&
if (retired_stored) { MDBX_PNL_SIZE(txn->tw.retired_pages) != ctx->retired_stored &&
gc_key.iov_base = &txn->mt_txnid; err == MDBX_SUCCESS) {
gc_key.iov_len = sizeof(txn->mt_txnid); mdbx_tassert(txn, reserve4retired);
const struct cursor_set_result csr = err = gcu_clean_stored_retired(txn, ctx);
mdbx_cursor_set(gc_cursor, &gc_key, &fake_val, MDBX_SET); if (unlikely(err != MDBX_SUCCESS))
if (csr.err == MDBX_SUCCESS && csr.exact) { return err;
*retired_stored = 0; err = mdbx_page_alloc(&ctx->cursor.outer, pages4retiredlist,
err = mdbx_cursor_del(gc_cursor, 0); MDBX_ALLOC_GC | MDBX_ALLOC_FAKE)
mdbx_trace("== clear-4linear, backlog %u, err %d", backlog_size(txn),
err);
}
}
err =
mdbx_page_alloc(gc_cursor, linear4list, MDBX_ALLOC_GC | MDBX_ALLOC_FAKE)
.err; .err;
mdbx_trace("== after-4linear, backlog %u, err %d", backlog_size(txn), err); mdbx_trace("== after-4linear, backlog %u, err %d", gcu_backlog_size(txn),
mdbx_cassert(gc_cursor, err);
backlog_size(txn) >= linear4list || err != MDBX_SUCCESS); mdbx_cassert(&ctx->cursor.outer,
gcu_backlog_size(txn) >= pages4retiredlist ||
err != MDBX_SUCCESS);
} }
while (backlog_size(txn) < backlog4cow + linear4list && err == MDBX_SUCCESS) while (gcu_backlog_size(txn) < backlog4cow + pages4retiredlist &&
err = mdbx_page_alloc(gc_cursor, 0, err == MDBX_SUCCESS)
err = mdbx_page_alloc(&ctx->cursor.outer, 0,
MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE | MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE |
MDBX_ALLOC_NOLOG) MDBX_ALLOC_NOLOG)
.err; .err;
gc_cursor->mc_flags |= C_RECLAIMING; ctx->cursor.outer.mc_flags |= C_RECLAIMING;
mdbx_trace("<< backlog %u, err %d", backlog_size(txn), err); mdbx_trace("<< backlog %u, err %d", gcu_backlog_size(txn), err);
return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS;
} }
static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) { static __inline void gcu_clean_reserved(MDBX_env *env, MDBX_val pnl) {
/* PNL is initially empty, zero out at least the length */ /* PNL is initially empty, zero out at least the length */
memset(pnl.iov_base, 0, sizeof(pgno_t)); memset(pnl.iov_base, 0, sizeof(pgno_t));
if ((env->me_flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0) if ((env->me_flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0)
@ -9179,61 +9212,54 @@ static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) {
* "checks and balances") to partially bypass the fundamental design problems * "checks and balances") to partially bypass the fundamental design problems
* inherited from LMDB. So do not try to understand it completely in order to * inherited from LMDB. So do not try to understand it completely in order to
* avoid your madness. */ * avoid your madness. */
static int mdbx_update_gc(MDBX_txn *txn) { static int mdbx_update_gc(MDBX_txn *txn, gcu_context_t *ctx) {
mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid);
MDBX_env *const env = txn->mt_env;
const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo";
(void)dbg_prefix_mode;
ctx->cursor.outer.mc_flags |= C_RECLAIMING;
ctx->cursor.outer.mc_next = txn->mt_cursors[FREE_DBI];
txn->mt_cursors[FREE_DBI] = &ctx->cursor.outer;
/* txn->tw.reclaimed_pglist[] can grow and shrink during this call. /* txn->tw.reclaimed_pglist[] can grow and shrink during this call.
* txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow.
* Page numbers cannot disappear from txn->tw.retired_pages[]. */ * Page numbers cannot disappear from txn->tw.retired_pages[]. */
MDBX_env *const env = txn->mt_env;
const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0;
const char *dbg_prefix_mode = lifo ? " lifo" : " fifo";
(void)dbg_prefix_mode;
mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid);
unsigned retired_stored = 0, loop = 0;
MDBX_cursor_couple couple;
int rc = mdbx_cursor_init(&couple.outer, txn, FREE_DBI);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout_notracking;
couple.outer.mc_flags |= C_RECLAIMING;
couple.outer.mc_next = txn->mt_cursors[FREE_DBI];
txn->mt_cursors[FREE_DBI] = &couple.outer;
bool dense_gc = false;
retry: retry:
++loop; ++ctx->loop;
mdbx_trace("%s", " >> restart"); mdbx_trace("%s", " >> restart");
int rc = MDBX_SUCCESS;
mdbx_tassert(txn, mdbx_tassert(txn,
mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno - MDBX_ENABLE_REFUND)); txn->mt_next_pgno - MDBX_ENABLE_REFUND));
mdbx_tassert(txn, mdbx_dirtylist_check(txn)); mdbx_tassert(txn, mdbx_dirtylist_check(txn));
if (unlikely(/* paranoia */ loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 12 : 42))) {
mdbx_error("too more loops %u, bailout", loop); mdbx_error("too more loops %u, bailout", ctx->loop);
rc = MDBX_PROBLEM; rc = MDBX_PROBLEM;
goto bailout; goto bailout;
} }
if (unlikely(dense_gc) && retired_stored) { if (unlikely(ctx->dense)) {
rc = mdbx_prep_backlog(txn, &couple.outer, rc = gcu_clean_stored_retired(txn, ctx);
MDBX_PNL_SIZEOF(txn->tw.retired_pages),
&retired_stored);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
unsigned settled = 0, cleaned_gc_slot = 0, reused_gc_slot = 0, ctx->settled = 0;
filled_gc_slot = ~0u; ctx->cleaned_slot = 0;
txnid_t cleaned_gc_id = 0, gc_rid = txn->tw.last_reclaimed; ctx->reused_slot = 0;
ctx->filled_slot = ~0u;
ctx->cleaned_id = 0;
ctx->rid = txn->tw.last_reclaimed;
while (true) { while (true) {
/* Come back here after each Put() in case retired-list changed */ /* Come back here after each Put() in case retired-list changed */
MDBX_val key, data; MDBX_val key, data;
mdbx_trace("%s", " >> continue"); mdbx_trace("%s", " >> continue");
if (retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) && if (ctx->retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) &&
MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page) { (MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page ||
rc = mdbx_prep_backlog(txn, &couple.outer, ctx->retired_stored > env->me_maxgc_ov1page)) {
MDBX_PNL_SIZEOF(txn->tw.retired_pages), rc = gcu_prepare_backlog(txn, ctx, true);
&retired_stored);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
@ -9241,48 +9267,48 @@ retry:
mdbx_tassert(txn, mdbx_tassert(txn,
mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno - MDBX_ENABLE_REFUND)); txn->mt_next_pgno - MDBX_ENABLE_REFUND));
if (lifo) { if (ctx->lifo) {
if (cleaned_gc_slot < (txn->tw.lifo_reclaimed if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed
? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
: 0)) { : 0)) {
settled = 0; ctx->settled = 0;
cleaned_gc_slot = 0; ctx->cleaned_slot = 0;
reused_gc_slot = 0; ctx->reused_slot = 0;
filled_gc_slot = ~0u; ctx->filled_slot = ~0u;
/* LY: cleanup reclaimed records. */ /* LY: cleanup reclaimed records. */
do { do {
cleaned_gc_id = txn->tw.lifo_reclaimed[++cleaned_gc_slot]; ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot];
mdbx_tassert(txn, mdbx_tassert(txn, ctx->cleaned_slot > 0 &&
cleaned_gc_slot > 0 && ctx->cleaned_id <
cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); env->me_lck->mti_oldest_reader.weak);
key.iov_base = &cleaned_gc_id; key.iov_base = &ctx->cleaned_id;
key.iov_len = sizeof(cleaned_gc_id); key.iov_len = sizeof(ctx->cleaned_id);
rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_SET); rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_SET);
if (rc == MDBX_NOTFOUND) if (rc == MDBX_NOTFOUND)
continue; continue;
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
if (likely(!dense_gc)) { if (likely(!ctx->dense)) {
rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr); rc = gcu_prepare_backlog(txn, ctx, false);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
mdbx_tassert(txn, mdbx_tassert(txn,
cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); ctx->cleaned_id < env->me_lck->mti_oldest_reader.weak);
mdbx_trace("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, mdbx_trace("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode,
cleaned_gc_slot, cleaned_gc_id); ctx->cleaned_slot, ctx->cleaned_id);
mdbx_tassert(txn, *txn->mt_cursors == &couple.outer); mdbx_tassert(txn, *txn->mt_cursors == &ctx->cursor.outer);
rc = mdbx_cursor_del(&couple.outer, 0); rc = mdbx_cursor_del(&ctx->cursor.outer, 0);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} while (cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); } while (ctx->cleaned_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
mdbx_txl_sort(txn->tw.lifo_reclaimed); mdbx_txl_sort(txn->tw.lifo_reclaimed);
} }
} else { } else {
/* If using records from GC which we have not yet deleted, /* If using records from GC which we have not yet deleted,
* now delete them and any we reserved for tw.reclaimed_pglist. */ * now delete them and any we reserved for tw.reclaimed_pglist. */
while (cleaned_gc_id <= txn->tw.last_reclaimed) { while (ctx->cleaned_id <= txn->tw.last_reclaimed) {
rc = mdbx_cursor_first(&couple.outer, &key, NULL); rc = mdbx_cursor_first(&ctx->cursor.outer, &key, NULL);
if (unlikely(rc != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
if (rc == MDBX_NOTFOUND) if (rc == MDBX_NOTFOUND)
break; break;
@ -9293,28 +9319,29 @@ retry:
rc = MDBX_CORRUPTED; rc = MDBX_CORRUPTED;
goto bailout; goto bailout;
} }
gc_rid = cleaned_gc_id; ctx->rid = ctx->cleaned_id;
settled = 0; ctx->settled = 0;
reused_gc_slot = 0; ctx->reused_slot = 0;
cleaned_gc_id = unaligned_peek_u64(4, key.iov_base); ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base);
if (!MDBX_DISABLE_PAGECHECKS && if (!MDBX_DISABLE_PAGECHECKS && unlikely(ctx->cleaned_id < MIN_TXNID ||
unlikely(cleaned_gc_id < MIN_TXNID || cleaned_gc_id > MAX_TXNID)) { ctx->cleaned_id > MAX_TXNID)) {
rc = MDBX_CORRUPTED; rc = MDBX_CORRUPTED;
goto bailout; goto bailout;
} }
if (cleaned_gc_id > txn->tw.last_reclaimed) if (ctx->cleaned_id > txn->tw.last_reclaimed)
break; break;
if (likely(!dense_gc)) { if (likely(!ctx->dense)) {
rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr); rc = gcu_prepare_backlog(txn, ctx, false);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
mdbx_tassert(txn, cleaned_gc_id <= txn->tw.last_reclaimed); mdbx_tassert(txn, ctx->cleaned_id <= txn->tw.last_reclaimed);
mdbx_tassert(txn, cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); mdbx_tassert(txn,
ctx->cleaned_id < env->me_lck->mti_oldest_reader.weak);
mdbx_trace("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, mdbx_trace("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode,
cleaned_gc_id); ctx->cleaned_id);
mdbx_tassert(txn, *txn->mt_cursors == &couple.outer); mdbx_tassert(txn, *txn->mt_cursors == &ctx->cursor.outer);
rc = mdbx_cursor_del(&couple.outer, 0); rc = mdbx_cursor_del(&ctx->cursor.outer, 0);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
@ -9325,7 +9352,7 @@ retry:
txn->mt_next_pgno - MDBX_ENABLE_REFUND)); txn->mt_next_pgno - MDBX_ENABLE_REFUND));
mdbx_tassert(txn, mdbx_dirtylist_check(txn)); mdbx_tassert(txn, mdbx_dirtylist_check(txn));
if (mdbx_audit_enabled()) { if (mdbx_audit_enabled()) {
rc = mdbx_audit_ex(txn, retired_stored, false); rc = mdbx_audit_ex(txn, ctx->retired_stored, false);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
@ -9336,7 +9363,7 @@ retry:
txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno - MDBX_ENABLE_REFUND)); txn->mt_next_pgno - MDBX_ENABLE_REFUND));
if (mdbx_audit_enabled()) { if (mdbx_audit_enabled()) {
rc = mdbx_audit_ex(txn, retired_stored, false); rc = mdbx_audit_ex(txn, ctx->retired_stored, false);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
@ -9349,6 +9376,18 @@ retry:
* The pages themselves remain in dirtylist. */ * The pages themselves remain in dirtylist. */
if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) {
if (txn->tw.loose_count > 0) { if (txn->tw.loose_count > 0) {
mdbx_trace("%s: try allocate gc-slot for %u loose-pages",
dbg_prefix_mode, txn->tw.loose_count);
rc =
mdbx_page_alloc(&ctx->cursor.outer, 0,
MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE)
.err;
if (rc == MDBX_SUCCESS) {
mdbx_trace("%s: retry since gc-slot for %u loose-pages available",
dbg_prefix_mode, txn->tw.loose_count);
continue;
}
/* Put loose page numbers in tw.retired_pages, /* Put loose page numbers in tw.retired_pages,
* since unable to return them to tw.reclaimed_pglist. */ * since unable to return them to tw.reclaimed_pglist. */
if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages,
@ -9416,47 +9455,47 @@ retry:
const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist);
/* handle retired-list - store ones into single gc-record */ /* handle retired-list - store ones into single gc-record */
if (retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) { if (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) {
if (unlikely(!retired_stored)) { if (unlikely(!ctx->retired_stored)) {
/* Make sure last page of GC is touched and on retired-list */ /* Make sure last page of GC is touched and on retired-list */
couple.outer.mc_flags &= ~C_RECLAIMING; ctx->cursor.outer.mc_flags &= ~C_RECLAIMING;
rc = mdbx_page_search(&couple.outer, NULL, rc = mdbx_page_search(&ctx->cursor.outer, NULL,
MDBX_PS_LAST | MDBX_PS_MODIFY); MDBX_PS_LAST | MDBX_PS_MODIFY);
couple.outer.mc_flags |= C_RECLAIMING; ctx->cursor.outer.mc_flags |= C_RECLAIMING;
if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND)
goto bailout; goto bailout;
} }
/* Write to last page of GC */ /* Write to last page of GC */
key.iov_len = sizeof(txn->mt_txnid); key.iov_len = sizeof(txnid_t);
key.iov_base = &txn->mt_txnid; key.iov_base = &txn->mt_txnid;
do { do {
gcu_prepare_backlog(txn, ctx, true);
data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages);
mdbx_prep_backlog(txn, &couple.outer, data.iov_len, &retired_stored); rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE);
rc = mdbx_cursor_put(&couple.outer, &key, &data, MDBX_RESERVE);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
/* Retry if tw.retired_pages[] grew during the Put() */ /* Retry if tw.retired_pages[] grew during the Put() */
} while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages));
retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages); ctx->retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages);
mdbx_pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); mdbx_pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno);
mdbx_assert(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); mdbx_assert(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages));
memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len);
mdbx_trace("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, mdbx_trace("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode,
retired_stored, txn->mt_txnid); ctx->retired_stored, txn->mt_txnid);
if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { if (mdbx_log_enabled(MDBX_LOG_EXTRA)) {
unsigned i = retired_stored; unsigned i = ctx->retired_stored;
mdbx_debug_extra("PNL write txn %" PRIaTXN " root %" PRIaPGNO mdbx_debug_extra("txn %" PRIaTXN " root %" PRIaPGNO
" num %u, PNL", " num %u, retired-PNL",
txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i);
for (; i; i--) for (; i; i--)
mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]); mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]);
mdbx_debug_extra_print("%s\n", "."); mdbx_debug_extra_print("%s\n", ".");
} }
if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) &&
settled)) { ctx->settled)) {
mdbx_trace("%s: reclaimed-list changed %u -> %u, retry", mdbx_trace("%s: reclaimed-list changed %u -> %u, retry",
dbg_prefix_mode, amount, dbg_prefix_mode, amount,
(unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
@ -9475,24 +9514,24 @@ retry:
mdbx_trace("%s", " >> reserving"); mdbx_trace("%s", " >> reserving");
if (mdbx_audit_enabled()) { if (mdbx_audit_enabled()) {
rc = mdbx_audit_ex(txn, retired_stored, false); rc = mdbx_audit_ex(txn, ctx->retired_stored, false);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
const unsigned left = amount - settled; const unsigned left = amount - ctx->settled;
mdbx_trace("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, " mdbx_trace("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, "
"reused-gc-slots %u", "reused-gc-slots %u",
dbg_prefix_mode, amount, settled, (int)left, dbg_prefix_mode, amount, ctx->settled, (int)left,
txn->tw.lifo_reclaimed txn->tw.lifo_reclaimed
? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
: 0, : 0,
reused_gc_slot); ctx->reused_slot);
if (0 >= (int)left) if (0 >= (int)left)
break; break;
const unsigned prefer_max_scatter = 257; const unsigned prefer_max_scatter = 257;
txnid_t reservation_gc_id; txnid_t reservation_gc_id;
if (lifo) { if (ctx->lifo) {
if (txn->tw.lifo_reclaimed == nullptr) { if (txn->tw.lifo_reclaimed == nullptr) {
txn->tw.lifo_reclaimed = mdbx_txl_alloc(); txn->tw.lifo_reclaimed = mdbx_txl_alloc();
if (unlikely(!txn->tw.lifo_reclaimed)) { if (unlikely(!txn->tw.lifo_reclaimed)) {
@ -9503,18 +9542,18 @@ retry:
if ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < if ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <
prefer_max_scatter && prefer_max_scatter &&
left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
reused_gc_slot) * ctx->reused_slot) *
env->me_maxgc_ov1page && env->me_maxgc_ov1page &&
!dense_gc) { !ctx->dense) {
/* LY: need just a txn-id for save page list. */ /* LY: need just a txn-id for save page list. */
bool need_cleanup = false; bool need_cleanup = false;
txnid_t snap_oldest; txnid_t snap_oldest;
retry_rid: retry_rid:
couple.outer.mc_flags &= ~C_RECLAIMING; ctx->cursor.outer.mc_flags &= ~C_RECLAIMING;
do { do {
snap_oldest = mdbx_find_oldest(txn); snap_oldest = mdbx_find_oldest(txn);
rc = rc =
mdbx_page_alloc(&couple.outer, 0, mdbx_page_alloc(&ctx->cursor.outer, 0,
MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE) MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE)
.err; .err;
if (likely(rc == MDBX_SUCCESS)) { if (likely(rc == MDBX_SUCCESS)) {
@ -9526,9 +9565,9 @@ retry:
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <
prefer_max_scatter && prefer_max_scatter &&
left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
reused_gc_slot) * ctx->reused_slot) *
env->me_maxgc_ov1page); env->me_maxgc_ov1page);
couple.outer.mc_flags |= C_RECLAIMING; ctx->cursor.outer.mc_flags |= C_RECLAIMING;
if (likely(rc == MDBX_SUCCESS)) { if (likely(rc == MDBX_SUCCESS)) {
mdbx_trace("%s: got enough from GC.", dbg_prefix_mode); mdbx_trace("%s: got enough from GC.", dbg_prefix_mode);
@ -9540,9 +9579,9 @@ retry:
if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) {
if (need_cleanup) { if (need_cleanup) {
mdbx_txl_sort(txn->tw.lifo_reclaimed); mdbx_txl_sort(txn->tw.lifo_reclaimed);
cleaned_gc_slot = 0; ctx->cleaned_slot = 0;
} }
gc_rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); ctx->rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed);
} else { } else {
mdbx_tassert(txn, txn->tw.last_reclaimed == 0); mdbx_tassert(txn, txn->tw.last_reclaimed == 0);
if (unlikely(mdbx_find_oldest(txn) != snap_oldest)) if (unlikely(mdbx_find_oldest(txn) != snap_oldest))
@ -9551,42 +9590,42 @@ retry:
goto retry_rid; goto retry_rid;
/* no reclaimable GC entries, /* no reclaimable GC entries,
* therefore no entries with ID < mdbx_find_oldest(txn) */ * therefore no entries with ID < mdbx_find_oldest(txn) */
txn->tw.last_reclaimed = gc_rid = snap_oldest - 1; txn->tw.last_reclaimed = ctx->rid = snap_oldest - 1;
mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN, mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN,
dbg_prefix_mode, gc_rid); dbg_prefix_mode, ctx->rid);
} }
/* LY: GC is empty, will look any free txn-id in high2low order. */ /* LY: GC is empty, will look any free txn-id in high2low order. */
while (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && while (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter &&
left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
reused_gc_slot) * ctx->reused_slot) *
env->me_maxgc_ov1page) { env->me_maxgc_ov1page) {
if (unlikely(gc_rid <= MIN_TXNID)) { if (unlikely(ctx->rid <= MIN_TXNID)) {
if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <= if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <=
reused_gc_slot)) { ctx->reused_slot)) {
mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= " mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= "
"lifo_reclaimed %u" PRIaTXN, "lifo_reclaimed %u" PRIaTXN,
reused_gc_slot, ctx->reused_slot,
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
goto retry; goto retry;
} }
break; break;
} }
mdbx_tassert(txn, gc_rid >= MIN_TXNID && gc_rid <= MAX_TXNID); mdbx_tassert(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID);
--gc_rid; --ctx->rid;
key.iov_base = &gc_rid; key.iov_base = &ctx->rid;
key.iov_len = sizeof(gc_rid); key.iov_len = sizeof(ctx->rid);
rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY); rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY);
if (unlikely(rc == MDBX_SUCCESS)) { if (unlikely(rc == MDBX_SUCCESS)) {
mdbx_debug("%s: GC's id %" PRIaTXN mdbx_debug("%s: GC's id %" PRIaTXN
" is used, continue bottom-up search", " is used, continue bottom-up search",
dbg_prefix_mode, gc_rid); dbg_prefix_mode, ctx->rid);
++gc_rid; ++ctx->rid;
rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_FIRST); rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_FIRST);
if (rc == MDBX_NOTFOUND) { if (rc == MDBX_NOTFOUND) {
mdbx_debug("%s: GC is empty (going dense-mode)", dbg_prefix_mode); mdbx_debug("%s: GC is empty (going dense-mode)", dbg_prefix_mode);
dense_gc = true; ctx->dense = true;
break; break;
} }
if (unlikely(rc != MDBX_SUCCESS || if (unlikely(rc != MDBX_SUCCESS ||
@ -9603,52 +9642,52 @@ retry:
if (gc_first <= MIN_TXNID) { if (gc_first <= MIN_TXNID) {
mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN
" (going dense-mode)", " (going dense-mode)",
dbg_prefix_mode, gc_rid); dbg_prefix_mode, ctx->rid);
dense_gc = true; ctx->dense = true;
break; break;
} }
gc_rid = gc_first - 1; ctx->rid = gc_first - 1;
} }
mdbx_assert(env, !dense_gc); mdbx_assert(env, !ctx->dense);
rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, gc_rid); rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, ctx->rid);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
if (reused_gc_slot) if (ctx->reused_slot)
/* rare case, but it is better to clear and re-create GC entries /* rare case, but it is better to clear and re-create GC entries
* with less fragmentation. */ * with less fragmentation. */
need_cleanup = true; need_cleanup = true;
else else
cleaned_gc_slot += ctx->cleaned_slot +=
1 /* mark cleanup is not needed for added slot. */; 1 /* mark cleanup is not needed for added slot. */;
mdbx_trace("%s: append @%" PRIaTXN mdbx_trace("%s: append @%" PRIaTXN
" to lifo-reclaimed, cleaned-gc-slot = %u", " to lifo-reclaimed, cleaned-gc-slot = %u",
dbg_prefix_mode, gc_rid, cleaned_gc_slot); dbg_prefix_mode, ctx->rid, ctx->cleaned_slot);
} }
if (need_cleanup || dense_gc) { if (need_cleanup || ctx->dense) {
if (cleaned_gc_slot) if (ctx->cleaned_slot)
mdbx_trace( mdbx_trace(
"%s: restart inner-loop to clear and re-create GC entries", "%s: restart inner-loop to clear and re-create GC entries",
dbg_prefix_mode); dbg_prefix_mode);
cleaned_gc_slot = 0; ctx->cleaned_slot = 0;
continue; continue;
} }
} }
const unsigned i = const unsigned i =
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot; (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot;
mdbx_tassert(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); mdbx_tassert(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
reservation_gc_id = txn->tw.lifo_reclaimed[i]; reservation_gc_id = txn->tw.lifo_reclaimed[i];
mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]",
dbg_prefix_mode, reservation_gc_id, i); dbg_prefix_mode, reservation_gc_id, i);
} else { } else {
mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL); mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL);
if (unlikely(gc_rid == 0)) { if (unlikely(ctx->rid == 0)) {
gc_rid = mdbx_find_oldest(txn) - 1; ctx->rid = mdbx_find_oldest(txn) - 1;
rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_FIRST); rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST);
if (rc == MDBX_SUCCESS) { if (rc == MDBX_SUCCESS) {
if (!MDBX_DISABLE_PAGECHECKS && if (!MDBX_DISABLE_PAGECHECKS &&
unlikely(key.iov_len != sizeof(txnid_t))) { unlikely(key.iov_len != sizeof(txnid_t))) {
@ -9661,31 +9700,31 @@ retry:
rc = MDBX_CORRUPTED; rc = MDBX_CORRUPTED;
goto bailout; goto bailout;
} }
if (gc_rid >= gc_first) if (ctx->rid >= gc_first)
gc_rid = gc_first - 1; ctx->rid = gc_first - 1;
if (unlikely(gc_rid == 0)) { if (unlikely(ctx->rid == 0)) {
mdbx_error("%s", "** no GC tail-space to store (going dense-mode)"); mdbx_error("%s", "** no GC tail-space to store (going dense-mode)");
dense_gc = true; ctx->dense = true;
goto retry; goto retry;
} }
} else if (rc != MDBX_NOTFOUND) } else if (rc != MDBX_NOTFOUND)
goto bailout; goto bailout;
txn->tw.last_reclaimed = gc_rid; txn->tw.last_reclaimed = ctx->rid;
cleaned_gc_id = gc_rid + 1; ctx->cleaned_id = ctx->rid + 1;
} }
reservation_gc_id = gc_rid--; reservation_gc_id = ctx->rid--;
mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode,
reservation_gc_id); reservation_gc_id);
} }
++reused_gc_slot; ++ctx->reused_slot;
unsigned chunk = left; unsigned chunk = left;
if (unlikely(chunk > env->me_maxgc_ov1page)) { if (unlikely(chunk > env->me_maxgc_ov1page)) {
const unsigned avail_gc_slots = const unsigned avail_gc_slots =
txn->tw.lifo_reclaimed txn->tw.lifo_reclaimed
? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
reused_gc_slot + 1 ctx->reused_slot + 1
: (gc_rid < INT16_MAX) ? (unsigned)gc_rid : (ctx->rid < INT16_MAX) ? (unsigned)ctx->rid
: INT16_MAX; : INT16_MAX;
if (avail_gc_slots > 1) { if (avail_gc_slots > 1) {
if (chunk < env->me_maxgc_ov1page * 2) if (chunk < env->me_maxgc_ov1page * 2)
@ -9720,7 +9759,7 @@ retry:
chunk = (avail >= tail) ? tail - span chunk = (avail >= tail) ? tail - span
: (avail_gc_slots > 3 && : (avail_gc_slots > 3 &&
reused_gc_slot < prefer_max_scatter - 3) ctx->reused_slot < prefer_max_scatter - 3)
? avail - span ? avail - span
: tail; : tail;
} }
@ -9731,7 +9770,7 @@ retry:
mdbx_trace("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " mdbx_trace("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id "
"%" PRIaTXN, "%" PRIaTXN,
dbg_prefix_mode, gc_rid, reused_gc_slot, reservation_gc_id); dbg_prefix_mode, ctx->rid, ctx->reused_slot, reservation_gc_id);
mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk,
env->me_maxgc_ov1page); env->me_maxgc_ov1page);
@ -9751,9 +9790,9 @@ retry:
key.iov_base = &reservation_gc_id; key.iov_base = &reservation_gc_id;
data.iov_len = (chunk + 1) * sizeof(pgno_t); data.iov_len = (chunk + 1) * sizeof(pgno_t);
mdbx_trace("%s: reserve %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk, mdbx_trace("%s: reserve %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk,
settled + 1, settled + chunk + 1, reservation_gc_id); ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id);
mdbx_prep_backlog(txn, &couple.outer, data.iov_len, nullptr); gcu_prepare_backlog(txn, ctx, true);
rc = mdbx_cursor_put(&couple.outer, &key, &data, rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data,
MDBX_RESERVE | MDBX_NOOVERWRITE); MDBX_RESERVE | MDBX_NOOVERWRITE);
mdbx_tassert(txn, mdbx_tassert(txn,
mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
@ -9761,14 +9800,14 @@ retry:
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
clean_reserved_gc_pnl(env, data); gcu_clean_reserved(env, data);
settled += chunk; ctx->settled += chunk;
mdbx_trace("%s: settled %u (+%u), continue", dbg_prefix_mode, settled, mdbx_trace("%s: settled %u (+%u), continue", dbg_prefix_mode, ctx->settled,
chunk); chunk);
if (txn->tw.lifo_reclaimed && if (txn->tw.lifo_reclaimed &&
unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) && unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) &&
(loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount > (ctx->loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount >
env->me_maxgc_ov1page)) { env->me_maxgc_ov1page)) {
mdbx_notice("** restart: reclaimed-list growth %u -> %u", amount, mdbx_notice("** restart: reclaimed-list growth %u -> %u", amount,
(unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
@ -9780,15 +9819,15 @@ retry:
mdbx_tassert( mdbx_tassert(
txn, txn,
cleaned_gc_slot == ctx->cleaned_slot ==
(txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0)); (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0));
mdbx_trace("%s", " >> filling"); mdbx_trace("%s", " >> filling");
/* Fill in the reserved records */ /* Fill in the reserved records */
filled_gc_slot = ctx->filled_slot =
txn->tw.lifo_reclaimed txn->tw.lifo_reclaimed
? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot
: reused_gc_slot; : ctx->reused_slot;
rc = MDBX_SUCCESS; rc = MDBX_SUCCESS;
mdbx_tassert(txn, mdbx_tassert(txn,
mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
@ -9802,12 +9841,12 @@ retry:
const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist);
unsigned left = amount; unsigned left = amount;
if (txn->tw.lifo_reclaimed == nullptr) { if (txn->tw.lifo_reclaimed == nullptr) {
mdbx_tassert(txn, lifo == 0); mdbx_tassert(txn, ctx->lifo == 0);
rc = mdbx_cursor_first(&couple.outer, &key, &data); rc = mdbx_cursor_first(&ctx->cursor.outer, &key, &data);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} else { } else {
mdbx_tassert(txn, lifo != 0); mdbx_tassert(txn, ctx->lifo != 0);
} }
while (true) { while (true) {
@ -9815,35 +9854,35 @@ retry:
mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left, mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left,
(unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
if (txn->tw.lifo_reclaimed == nullptr) { if (txn->tw.lifo_reclaimed == nullptr) {
mdbx_tassert(txn, lifo == 0); mdbx_tassert(txn, ctx->lifo == 0);
fill_gc_id = unaligned_peek_u64(4, key.iov_base); fill_gc_id = unaligned_peek_u64(4, key.iov_base);
if (filled_gc_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { if (ctx->filled_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) {
mdbx_notice( mdbx_notice(
"** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN
" > last_reclaimed %" PRIaTXN, " > last_reclaimed %" PRIaTXN,
filled_gc_slot, fill_gc_id, txn->tw.last_reclaimed); ctx->filled_slot, fill_gc_id, txn->tw.last_reclaimed);
goto retry; goto retry;
} }
} else { } else {
mdbx_tassert(txn, lifo != 0); mdbx_tassert(txn, ctx->lifo != 0);
if (++filled_gc_slot > if (++ctx->filled_slot >
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) {
mdbx_notice("** restart: reserve depleted (filled_gc_slot %u > " mdbx_notice("** restart: reserve depleted (filled_gc_slot %u > "
"lifo_reclaimed %u" PRIaTXN, "lifo_reclaimed %u" PRIaTXN,
filled_gc_slot, ctx->filled_slot,
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
goto retry; goto retry;
} }
fill_gc_id = txn->tw.lifo_reclaimed[filled_gc_slot]; fill_gc_id = txn->tw.lifo_reclaimed[ctx->filled_slot];
mdbx_trace("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", mdbx_trace("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]",
dbg_prefix_mode, fill_gc_id, filled_gc_slot); dbg_prefix_mode, fill_gc_id, ctx->filled_slot);
key.iov_base = &fill_gc_id; key.iov_base = &fill_gc_id;
key.iov_len = sizeof(fill_gc_id); key.iov_len = sizeof(fill_gc_id);
rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY); rc = mdbx_cursor_get(&ctx->cursor.outer, &key, &data, MDBX_SET_KEY);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
mdbx_tassert(txn, cleaned_gc_slot == mdbx_tassert(txn, ctx->cleaned_slot ==
(txn->tw.lifo_reclaimed (txn->tw.lifo_reclaimed
? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
: 0)); : 0));
@ -9853,25 +9892,25 @@ retry:
key.iov_len = sizeof(fill_gc_id); key.iov_len = sizeof(fill_gc_id);
mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2); mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2);
couple.outer.mc_flags |= C_GCFREEZE; ctx->cursor.outer.mc_flags |= C_GCFREEZE;
unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1; unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1;
if (unlikely(chunk > left)) { if (unlikely(chunk > left)) {
mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk,
left, fill_gc_id); left, fill_gc_id);
if ((loop < 5 && chunk - left > loop / 2) || if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) ||
chunk - left > env->me_maxgc_ov1page) { chunk - left > env->me_maxgc_ov1page) {
data.iov_len = (left + 1) * sizeof(pgno_t); data.iov_len = (left + 1) * sizeof(pgno_t);
if (loop < 7) if (ctx->loop < 7)
couple.outer.mc_flags &= ~C_GCFREEZE; ctx->cursor.outer.mc_flags &= ~C_GCFREEZE;
} }
chunk = left; chunk = left;
} }
rc = mdbx_cursor_put(&couple.outer, &key, &data, rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data,
MDBX_CURRENT | MDBX_RESERVE); MDBX_CURRENT | MDBX_RESERVE);
couple.outer.mc_flags &= ~C_GCFREEZE; ctx->cursor.outer.mc_flags &= ~C_GCFREEZE;
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
clean_reserved_gc_pnl(env, data); gcu_clean_reserved(env, data);
if (unlikely(txn->tw.loose_count || if (unlikely(txn->tw.loose_count ||
amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) {
@ -9881,16 +9920,18 @@ retry:
goto retry; goto retry;
} }
if (unlikely(txn->tw.lifo_reclaimed if (unlikely(txn->tw.lifo_reclaimed
? cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) ? ctx->cleaned_slot <
: cleaned_gc_id < txn->tw.last_reclaimed)) { MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
: ctx->cleaned_id < txn->tw.last_reclaimed)) {
mdbx_notice("%s", "** restart: reclaimed-slots changed"); mdbx_notice("%s", "** restart: reclaimed-slots changed");
goto retry; goto retry;
} }
if (unlikely(retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages))) { if (unlikely(ctx->retired_stored !=
mdbx_tassert(txn, MDBX_PNL_SIZE(txn->tw.retired_pages))) {
retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)); mdbx_tassert(txn, ctx->retired_stored <
MDBX_PNL_SIZE(txn->tw.retired_pages));
mdbx_notice("** restart: retired-list growth (%u -> %u)", mdbx_notice("** restart: retired-list growth (%u -> %u)",
retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); ctx->retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages));
goto retry; goto retry;
} }
@ -9907,7 +9948,7 @@ retry:
left -= chunk; left -= chunk;
if (mdbx_audit_enabled()) { if (mdbx_audit_enabled()) {
rc = mdbx_audit_ex(txn, retired_stored + amount - left, true); rc = mdbx_audit_ex(txn, ctx->retired_stored + amount - left, true);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
@ -9917,12 +9958,12 @@ retry:
} }
if (txn->tw.lifo_reclaimed == nullptr) { if (txn->tw.lifo_reclaimed == nullptr) {
mdbx_tassert(txn, lifo == 0); mdbx_tassert(txn, ctx->lifo == 0);
rc = mdbx_cursor_next(&couple.outer, &key, &data, MDBX_NEXT); rc = mdbx_cursor_next(&ctx->cursor.outer, &key, &data, MDBX_NEXT);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} else { } else {
mdbx_tassert(txn, lifo != 0); mdbx_tassert(txn, ctx->lifo != 0);
} }
} }
} }
@ -9932,28 +9973,27 @@ retry:
mdbx_notice("** restart: got %u loose pages", txn->tw.loose_count); mdbx_notice("** restart: got %u loose pages", txn->tw.loose_count);
goto retry; goto retry;
} }
if (unlikely(filled_gc_slot != if (unlikely(ctx->filled_slot !=
(txn->tw.lifo_reclaimed (txn->tw.lifo_reclaimed
? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
: 0))) { : 0))) {
const bool will_retry = loop < 9; const bool will_retry = ctx->loop < 9;
mdbx_notice("** %s: reserve excess (filled-slot %u, loop %u)", mdbx_notice("** %s: reserve excess (filled-slot %u, loop %u)",
will_retry ? "restart" : "ignore", filled_gc_slot, loop); will_retry ? "restart" : "ignore", ctx->filled_slot, ctx->loop);
if (will_retry) if (will_retry)
goto retry; goto retry;
} }
mdbx_tassert(txn, mdbx_tassert(txn,
txn->tw.lifo_reclaimed == NULL || txn->tw.lifo_reclaimed == NULL ||
cleaned_gc_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); ctx->cleaned_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed));
bailout: bailout:
txn->mt_cursors[FREE_DBI] = couple.outer.mc_next; txn->mt_cursors[FREE_DBI] = ctx->cursor.outer.mc_next;
bailout_notracking:
MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0; MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0;
mdbx_trace("<<< %u loops, rc = %d", loop, rc); mdbx_trace("<<< %u loops, rc = %d", ctx->loop, rc);
return rc; return rc;
} }
@ -10591,7 +10631,11 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
} }
ts_1 = latency ? mdbx_osal_monotime() : 0; ts_1 = latency ? mdbx_osal_monotime() : 0;
rc = mdbx_update_gc(txn); gcu_context_t gcu_ctx;
rc = gcu_context_init(txn, &gcu_ctx);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
rc = mdbx_update_gc(txn, &gcu_ctx);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto fail; goto fail;
@ -10613,11 +10657,11 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
goto fail; goto fail;
} }
struct mdbx_iov_ctx ctx; struct mdbx_iov_ctx write_ctx;
mdbx_iov_init(txn, &ctx); mdbx_iov_init(txn, &write_ctx);
rc = mdbx_txn_write(txn, &ctx); rc = mdbx_txn_write(txn, &write_ctx);
if (likely(rc == MDBX_SUCCESS)) if (likely(rc == MDBX_SUCCESS))
mdbx_iov_done(txn, &ctx); mdbx_iov_done(txn, &write_ctx);
    /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ /* TODO: use write_ctx.flush_begin & write_ctx.flush_end for range-sync */
ts_3 = latency ? mdbx_osal_monotime() : 0; ts_3 = latency ? mdbx_osal_monotime() : 0;
@ -10636,7 +10680,6 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
meta.mm_canary = txn->mt_canary; meta.mm_canary = txn->mt_canary;
meta_set_txnid(env, &meta, txn->mt_txnid); meta_set_txnid(env, &meta, txn->mt_txnid);
rc = mdbx_sync_locked( rc = mdbx_sync_locked(
env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta); env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta);
} }

View File

@ -923,9 +923,9 @@ struct MDBX_txn {
/* corresponding to the current size of datafile */ /* corresponding to the current size of datafile */
#define mt_end_pgno mt_geo.now #define mt_end_pgno mt_geo.now
/* The ID of this transaction. IDs are integers incrementing from 1. /* The ID of this transaction. IDs are integers incrementing from
* Only committed write transactions increment the ID. If a transaction * INITIAL_TXNID. Only committed write transactions increment the ID. If a
* aborts, the ID may be re-used by the next writer. */ * transaction aborts, the ID may be re-used by the next writer. */
txnid_t mt_txnid; txnid_t mt_txnid;
txnid_t mt_front; txnid_t mt_front;
@ -986,11 +986,11 @@ struct MDBX_txn {
MDBX_page *loose_pages; MDBX_page *loose_pages;
/* Number of loose pages (tw.loose_pages) */ /* Number of loose pages (tw.loose_pages) */
unsigned loose_count; unsigned loose_count;
unsigned spill_least_removed;
/* The sorted list of dirty pages we temporarily wrote to disk /* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are * because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */ * shifted left by 1, deleted slots have the LSB set. */
MDBX_PNL spill_pages; MDBX_PNL spill_pages;
unsigned spill_least_removed;
} tw; } tw;
}; };
}; };