mdbx: backport - fix/rewrite mdbx_update_gc().

Change-Id: I580a1ff0cbeeb529e2bcbd50d97bfba7bcf5a546
This commit is contained in:
Leonid Yuriev 2018-08-11 02:04:34 +03:00
parent b91e645919
commit 57655583e5
2 changed files with 223 additions and 173 deletions

View File

@ -751,8 +751,8 @@ struct MDBX_env {
MDBX_PNL me_free_pgs; MDBX_PNL me_free_pgs;
/* ID2L of pages written during a write txn. Length MDBX_PNL_UM_SIZE. */ /* ID2L of pages written during a write txn. Length MDBX_PNL_UM_SIZE. */
MDBX_ID2L me_dirtylist; MDBX_ID2L me_dirtylist;
/* Max number of freelist items that can fit in a single overflow page */ /* Number of freelist items that can fit in a single overflow page */
unsigned me_maxfree_1pg; unsigned me_maxgc_ov1page;
/* Max size of a node on a page */ /* Max size of a node on a page */
unsigned me_nodemax; unsigned me_nodemax;
unsigned me_maxkey_limit; /* max size of a key */ unsigned me_maxkey_limit; /* max size of a key */

View File

@ -2141,7 +2141,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
if (likely(flags & MDBX_ALLOC_GC)) { if (likely(flags & MDBX_ALLOC_GC)) {
flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM);
if (unlikely(mc->mc_flags & C_RECLAIMING)) { if (unlikely(mc->mc_flags & C_RECLAIMING)) {
/* If mc is updating the freeDB, then the freelist cannot play /* If mc is updating the freeDB, then the befree-list cannot play
* catch-up with itself by growing while trying to save it. */ * catch-up with itself by growing while trying to save it. */
flags &= flags &=
~(MDBX_ALLOC_GC | MDBX_ALLOC_KICK | MDBX_COALESCE | MDBX_LIFORECLAIM); ~(MDBX_ALLOC_GC | MDBX_ALLOC_KICK | MDBX_COALESCE | MDBX_LIFORECLAIM);
@ -2380,8 +2380,8 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
if (repg_len > MDBX_PNL_UM_SIZE / 2) if (repg_len > MDBX_PNL_UM_SIZE / 2)
break; break;
if (flags & MDBX_COALESCE) { if (flags & MDBX_COALESCE) {
if (repg_len /* current size */ >= env->me_maxfree_1pg / 2 || if (repg_len /* current size */ >= env->me_maxgc_ov1page ||
repg_pos /* prev size */ >= env->me_maxfree_1pg / 4) repg_pos /* prev size */ >= env->me_maxgc_ov1page / 2)
flags &= ~MDBX_COALESCE; flags &= ~MDBX_COALESCE;
} }
} }
@ -3485,80 +3485,95 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) {
return MDBX_SUCCESS; return MDBX_SUCCESS;
} }
/* Save the freelist as of this transaction to the freeDB. /* Cleanup reclaimed GC records, then save the befree-list as of this
* This changes the freelist. Keep trying until it stabilizes. */ * transaction to GC (aka freeDB). This recursively changes the reclaimed-list,
static int mdbx_freelist_save(MDBX_txn *txn) { * loose-list and befree-list. Keep trying until it stabilizes. */
static int mdbx_update_gc(MDBX_txn *txn) {
/* env->me_reclaimed_pglist[] can grow and shrink during this call. /* env->me_reclaimed_pglist[] can grow and shrink during this call.
* env->me_last_reclaimed and txn->mt_free_pages[] can only grow. * env->me_last_reclaimed and txn->mt_befree_pages[] can only grow.
* Page numbers cannot disappear from txn->mt_free_pages[]. */ * Page numbers cannot disappear from txn->mt_befree_pages[]. */
MDBX_cursor mc;
MDBX_env *env = txn->mt_env; MDBX_env *env = txn->mt_env;
int rc, more = 1;
txnid_t cleanup_reclaimed_id = 0, head_id = 0;
pgno_t befree_count = 0;
intptr_t head_room = 0, total_room = 0;
unsigned cleanup_reclaimed_pos = 0, refill_reclaimed_pos = 0;
const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0;
rc = mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); MDBX_cursor mc;
int rc = mdbx_cursor_init(&mc, txn, FREE_DBI, NULL);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
return rc; return rc;
/* MDBX_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ const char *dbg_prefix_mode = lifo ? " lifo" : " fifo";
const intptr_t clean_limit = mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid);
(env->me_flags & (MDBX_NOMEMINIT | MDBX_WRITEMAP)) ? SSIZE_MAX (void)dbg_prefix_mode;
: env->me_maxfree_1pg; unsigned befree_stored = 0, loop = 0;
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist));
retry:
mdbx_trace(" >> restart");
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist));
again_on_freelist_change: if (unlikely(/* paranoia */ ++loop > 42)) {
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); mdbx_error("too more loops %u, bailout", loop);
rc = MDBX_PROBLEM;
goto bailout;
}
unsigned placed = 0, cleaned_gc_slot = 0, reused_gc_slot = 0,
filled_gc_slot = ~0u;
txnid_t cleaned_gc_id = 0,
head_gc_id = lifo ? *env->me_oldest : env->me_last_reclaimed;
while (1) { while (1) {
/* Come back here after each Put() in case freelist changed */ /* Come back here after each Put() in case befree-list changed */
MDBX_val key, data; MDBX_val key, data;
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist));
if (!lifo) { if (!lifo) {
/* If using records from freeDB which we have not yet deleted, /* If using records from freeDB which we have not yet deleted,
* now delete them and any we reserved for me_reclaimed_pglist. */ * now delete them and any we reserved for me_reclaimed_pglist. */
while (cleanup_reclaimed_id < env->me_last_reclaimed) { while (cleaned_gc_id < env->me_last_reclaimed) {
rc = mdbx_cursor_first(&mc, &key, NULL); rc = mdbx_cursor_first(&mc, &key, NULL);
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
rc = mdbx_prep_backlog(txn, &mc); rc = mdbx_prep_backlog(txn, &mc);
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
cleanup_reclaimed_id = head_id = *(txnid_t *)key.iov_base; cleaned_gc_id = head_gc_id = *(txnid_t *)key.iov_base;
total_room = head_room = 0; mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest);
more = 1; placed = 0;
mdbx_tassert(txn, cleanup_reclaimed_id <= env->me_last_reclaimed); mdbx_tassert(txn, cleaned_gc_id <= env->me_last_reclaimed);
mc.mc_flags |= C_RECLAIMING; mc.mc_flags |= C_RECLAIMING;
mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode,
cleaned_gc_id);
rc = mdbx_cursor_del(&mc, 0); rc = mdbx_cursor_del(&mc, 0);
mc.mc_flags ^= C_RECLAIMING; mc.mc_flags ^= C_RECLAIMING;
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
} else if (txn->mt_lifo_reclaimed) { } else if (txn->mt_lifo_reclaimed) {
/* LY: cleanup reclaimed records. */ /* LY: cleanup reclaimed records. */
while (cleanup_reclaimed_pos < txn->mt_lifo_reclaimed[0]) { while (cleaned_gc_slot < txn->mt_lifo_reclaimed[0]) {
cleanup_reclaimed_id = txn->mt_lifo_reclaimed[++cleanup_reclaimed_pos]; cleaned_gc_id = txn->mt_lifo_reclaimed[++cleaned_gc_slot];
key.iov_base = &cleanup_reclaimed_id; head_gc_id = (head_gc_id > cleaned_gc_id) ? cleaned_gc_id : head_gc_id;
key.iov_len = sizeof(cleanup_reclaimed_id); key.iov_base = &cleaned_gc_id;
key.iov_len = sizeof(cleaned_gc_id);
rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_SET); rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_SET);
if (likely(rc != MDBX_NOTFOUND)) { if (likely(rc != MDBX_NOTFOUND)) {
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
rc = mdbx_prep_backlog(txn, &mc); rc = mdbx_prep_backlog(txn, &mc);
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest);
mc.mc_flags |= C_RECLAIMING; mc.mc_flags |= C_RECLAIMING;
mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode,
cleaned_gc_slot, cleaned_gc_id);
rc = mdbx_cursor_del(&mc, 0); rc = mdbx_cursor_del(&mc, 0);
mc.mc_flags ^= C_RECLAIMING; mc.mc_flags ^= C_RECLAIMING;
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
} }
} }
// handle loose pages - put ones into the reclaimed- or befree-list
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist));
if (txn->mt_loose_pages) { if (txn->mt_loose_pages) {
/* Return loose page numbers to me_reclaimed_pglist, /* Return loose page numbers to me_reclaimed_pglist,
@ -3566,7 +3581,7 @@ again_on_freelist_change:
* The pages themselves remain in dirtylist. */ * The pages themselves remain in dirtylist. */
if (unlikely(!env->me_reclaimed_pglist) && if (unlikely(!env->me_reclaimed_pglist) &&
!(lifo && env->me_last_reclaimed > 1)) { !(lifo && env->me_last_reclaimed > 1)) {
/* Put loose page numbers in mt_free_pages, /* Put loose page numbers in mt_befree_pages,
* since unable to return them to me_reclaimed_pglist. */ * since unable to return them to me_reclaimed_pglist. */
if (unlikely((rc = mdbx_pnl_need(&txn->mt_befree_pages, if (unlikely((rc = mdbx_pnl_need(&txn->mt_befree_pages,
txn->mt_loose_count)) != 0)) txn->mt_loose_count)) != 0))
@ -3575,8 +3590,9 @@ again_on_freelist_change:
mdbx_pnl_xappend(txn->mt_befree_pages, mp->mp_pgno); mdbx_pnl_xappend(txn->mt_befree_pages, mp->mp_pgno);
} else { } else {
/* Room for loose pages + temp PNL with same */ /* Room for loose pages + temp PNL with same */
if ((rc = mdbx_pnl_need(&env->me_reclaimed_pglist, rc = mdbx_pnl_need(&env->me_reclaimed_pglist,
2 * txn->mt_loose_count + 1)) != 0) 2 * txn->mt_loose_count + 1);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
MDBX_PNL loose = env->me_reclaimed_pglist + MDBX_PNL loose = env->me_reclaimed_pglist +
MDBX_PNL_ALLOCLEN(env->me_reclaimed_pglist) - MDBX_PNL_ALLOCLEN(env->me_reclaimed_pglist) -
@ -3612,9 +3628,9 @@ again_on_freelist_change:
txn->mt_loose_count = 0; txn->mt_loose_count = 0;
} }
// handle reclaimed pages - return suitable into unallocated space
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist));
if (env->me_reclaimed_pglist) { if (env->me_reclaimed_pglist) {
/* Refund suitable pages into "unallocated" space */
pgno_t tail = txn->mt_next_pgno; pgno_t tail = txn->mt_next_pgno;
pgno_t *const begin = env->me_reclaimed_pglist + 1; pgno_t *const begin = env->me_reclaimed_pglist + 1;
pgno_t *const end = begin + env->me_reclaimed_pglist[0]; pgno_t *const end = begin + env->me_reclaimed_pglist[0];
@ -3644,82 +3660,71 @@ again_on_freelist_change:
} }
} }
/* Save the PNL of pages freed by this txn, to a single record */ // handle befree-list - store ones into single gc-record
if (befree_count < txn->mt_befree_pages[0]) { if (befree_stored < txn->mt_befree_pages[0]) {
if (unlikely(!befree_count)) { if (unlikely(!befree_stored)) {
/* Make sure last page of freeDB is touched and on freelist */ /* Make sure last page of freeDB is touched and on befree-list */
rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY);
if (unlikely(rc && rc != MDBX_NOTFOUND)) if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND))
goto bailout; goto bailout;
} }
pgno_t *befree_pages = txn->mt_befree_pages;
/* Write to last page of freeDB */ /* Write to last page of freeDB */
key.iov_len = sizeof(txn->mt_txnid); key.iov_len = sizeof(txn->mt_txnid);
key.iov_base = &txn->mt_txnid; key.iov_base = &txn->mt_txnid;
do { do {
befree_count = befree_pages[0]; data.iov_len = MDBX_PNL_SIZEOF(txn->mt_befree_pages);
data.iov_len = MDBX_PNL_SIZEOF(befree_pages);
rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE);
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
/* Retry if mt_free_pages[] grew during the Put() */ /* Retry if mt_befree_pages[] grew during the Put() */
befree_pages = txn->mt_befree_pages; } while (data.iov_len < MDBX_PNL_SIZEOF(txn->mt_befree_pages));
} while (befree_count < befree_pages[0]);
mdbx_pnl_sort(befree_pages); befree_stored = (unsigned)txn->mt_befree_pages[0];
memcpy(data.iov_base, befree_pages, data.iov_len); mdbx_pnl_sort(txn->mt_befree_pages);
memcpy(data.iov_base, txn->mt_befree_pages, data.iov_len);
mdbx_trace("%s.put-befree #%u @ %" PRIaTXN, dbg_prefix_mode,
befree_stored, txn->mt_txnid);
if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) {
unsigned i = (unsigned)befree_pages[0]; unsigned i = befree_stored;
mdbx_debug_extra("PNL write txn %" PRIaTXN " root %" PRIaPGNO mdbx_debug_extra("PNL write txn %" PRIaTXN " root %" PRIaPGNO
" num %u, PNL", " num %u, PNL",
txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i);
for (; i; i--) for (; i; i--)
mdbx_debug_extra_print(" %" PRIaPGNO "", befree_pages[i]); mdbx_debug_extra_print(" %" PRIaPGNO "", txn->mt_befree_pages[i]);
mdbx_debug_extra_print("\n"); mdbx_debug_extra_print("\n");
} }
continue; continue;
} }
// handle reclaimed and loose pages - merge and store both into gc
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist));
const intptr_t rpl_len = mdbx_tassert(txn, txn->mt_loose_count == 0);
(env->me_reclaimed_pglist ? env->me_reclaimed_pglist[0] : 0) + const unsigned amount =
txn->mt_loose_count; env->me_reclaimed_pglist ? env->me_reclaimed_pglist[0] : 0;
if (rpl_len && refill_reclaimed_pos == 0) const unsigned left = amount - placed;
refill_reclaimed_pos = 1;
/* Reserve records for me_reclaimed_pglist[]. Split it if multi-page, mdbx_trace("%s: amount %u, placed %d, left %d", dbg_prefix_mode, amount,
* to avoid searching freeDB for a page range. Use keys in placed, (int)left);
* range [1,me_last_reclaimed]: Smaller than txnid of oldest reader. */ if (0 >= (int)left)
if (total_room >= rpl_len) {
if (total_room == rpl_len || --more < 0)
break; break;
} else if (head_room >= (intptr_t)env->me_maxfree_1pg && head_id > 1) {
/* Keep current record (overflow page), add a new one */
head_id--;
refill_reclaimed_pos++;
head_room = 0;
}
mdbx_trace(" >> reserving");
txnid_t reservation_gc_id;
const unsigned lifo_gc_slots =
txn->mt_lifo_reclaimed ? (unsigned)txn->mt_lifo_reclaimed[0] : 0;
if (lifo) { if (lifo) {
if (refill_reclaimed_pos > if (reused_gc_slot >= lifo_gc_slots) {
(txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)) {
/* LY: need just a txn-id for save page list. */ /* LY: need just a txn-id for save page list. */
rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK);
if (likely(rc == 0)) if (likely(rc == MDBX_SUCCESS))
/* LY: ok, reclaimed from freedb. */ /* LY: ok, reclaimed from freedb. */
continue; continue;
if (unlikely(rc != MDBX_NOTFOUND)) if (unlikely(rc != MDBX_NOTFOUND))
/* LY: other troubles... */ /* LY: other troubles... */
goto bailout; goto bailout;
/* LY: freedb is empty, will look any free txn-id in high2low order. */
if (unlikely(env->me_last_reclaimed < 1)) {
/* LY: not any txn in the past of freedb. */
rc = MDBX_MAP_FULL;
goto bailout;
}
if (unlikely(!txn->mt_lifo_reclaimed)) { if (unlikely(!txn->mt_lifo_reclaimed)) {
txn->mt_lifo_reclaimed = mdbx_txl_alloc(); txn->mt_lifo_reclaimed = mdbx_txl_alloc();
if (unlikely(!txn->mt_lifo_reclaimed)) { if (unlikely(!txn->mt_lifo_reclaimed)) {
@ -3727,57 +3732,80 @@ again_on_freelist_change:
goto bailout; goto bailout;
} }
} }
/* LY: append the list. */ /* LY: freedb is empty, will look any free txn-id in high2low order. */
rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, --head_gc_id);
env->me_last_reclaimed - 1); if (unlikely(rc != MDBX_SUCCESS))
if (unlikely(rc))
goto bailout; goto bailout;
--env->me_last_reclaimed; cleaned_gc_slot += 1 /* mark GC cleanup is not needed. */;
/* LY: note that freeDB cleanup is not needed. */
++cleanup_reclaimed_pos; mdbx_trace("%s: append @%" PRIaTXN
" to lifo-reclaimed, cleaned-gc-slot = %u",
dbg_prefix_mode, head_gc_id, cleaned_gc_slot);
} }
mdbx_tassert(txn, txn->mt_lifo_reclaimed != NULL); mdbx_tassert(txn, txn->mt_lifo_reclaimed != NULL);
head_id = txn->mt_lifo_reclaimed[refill_reclaimed_pos]; reservation_gc_id = txn->mt_lifo_reclaimed[++reused_gc_slot];
mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]",
dbg_prefix_mode, reservation_gc_id, reused_gc_slot);
head_gc_id =
(head_gc_id > reservation_gc_id) ? reservation_gc_id : head_gc_id;
} else { } else {
mdbx_tassert(txn, txn->mt_lifo_reclaimed == NULL); mdbx_tassert(txn, txn->mt_lifo_reclaimed == NULL);
reused_gc_slot++ /* just count reserved records */;
reservation_gc_id = head_gc_id--;
mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode,
reservation_gc_id);
} }
/* (Re)write {key = head_id, PNL length = head_room} */ mdbx_trace("%s: head_gc_id %" PRIaTXN
total_room -= head_room; ", reused_gc_slot %u, lifo_gc_slots %u, reservation-id "
head_room = rpl_len - total_room; "%" PRIaTXN,
if (head_room > (intptr_t)env->me_maxfree_1pg && head_id > 1) { dbg_prefix_mode, head_gc_id, reused_gc_slot, lifo_gc_slots,
/* Overflow multi-page for part of me_reclaimed_pglist */ reservation_gc_id);
head_room /= (head_id < INT16_MAX) ? (pgno_t)head_id
: INT16_MAX; /* amortize page sizes */ const bool no_slots_more =
head_room += env->me_maxfree_1pg - head_room % (env->me_maxfree_1pg + 1); head_gc_id < 2 && (!lifo || reused_gc_slot >= lifo_gc_slots);
} else if (head_room < 0) { const unsigned chunk =
/* Rare case, not bothering to delete this record */ (left < env->me_maxgc_ov1page || no_slots_more)
head_room = 0; ? left
continue; : (left < env->me_maxgc_ov1page * 2)
? /* the half to each of the last two chunks */ left / 2
: env->me_maxgc_ov1page;
mdbx_trace("%s: chunk %u, no_slots_more %s, gc-per-ovpage %u",
dbg_prefix_mode, chunk, no_slots_more ? "yes" : "no",
env->me_maxgc_ov1page);
mdbx_tassert(txn, reservation_gc_id < *env->me_oldest);
if (unlikely(reservation_gc_id < 1)) {
/* LY: not any txn in the past of freedb. */
rc = MDBX_PROBLEM;
goto bailout;
} }
key.iov_len = sizeof(head_id);
key.iov_base = &head_id; key.iov_len = sizeof(reservation_gc_id);
data.iov_len = (head_room + 1) * sizeof(pgno_t); key.iov_base = &reservation_gc_id;
rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); data.iov_len = (chunk + 1) * sizeof(pgno_t);
mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk,
placed + 1, placed + chunk + 1, reservation_gc_id);
rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE);
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist));
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
/* PNL is initially empty, zero out at least the length */ /* PNL is initially empty, zero out at least the length */
pgno_t *pgs = (pgno_t *)data.iov_base; memset(data.iov_base, 0, sizeof(pgno_t));
intptr_t i = head_room > clean_limit ? head_room : 0; placed += chunk;
do { mdbx_trace("%s.placed %u (+%u), continue", dbg_prefix_mode, placed, chunk);
pgs[i] = 0;
} while (--i >= 0);
total_room += head_room;
continue; continue;
} }
mdbx_tassert(txn, mdbx_tassert(txn,
cleanup_reclaimed_pos == cleaned_gc_slot ==
(txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0));
/* Fill in the reserved me_reclaimed_pglist records */ mdbx_trace(" >> filling");
/* Fill in the reserved records */
filled_gc_slot = reused_gc_slot;
rc = MDBX_SUCCESS; rc = MDBX_SUCCESS;
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist)); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist));
if (env->me_reclaimed_pglist && env->me_reclaimed_pglist[0]) { if (env->me_reclaimed_pglist && env->me_reclaimed_pglist[0]) {
@ -3785,89 +3813,109 @@ again_on_freelist_change:
key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ key.iov_len = data.iov_len = 0; /* avoid MSVC warning */
key.iov_base = data.iov_base = NULL; key.iov_base = data.iov_base = NULL;
size_t rpl_left = env->me_reclaimed_pglist[0]; unsigned left = env->me_reclaimed_pglist[0];
pgno_t *rpl_end = env->me_reclaimed_pglist + rpl_left; pgno_t *end = env->me_reclaimed_pglist + left;
if (txn->mt_lifo_reclaimed == 0) { if (txn->mt_lifo_reclaimed == nullptr) {
mdbx_tassert(txn, lifo == 0); mdbx_tassert(txn, lifo == 0);
rc = mdbx_cursor_first(&mc, &key, &data); rc = mdbx_cursor_first(&mc, &key, &data);
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} else { } else {
mdbx_tassert(txn, lifo != 0); mdbx_tassert(txn, lifo != 0);
} }
while (1) { while (1) {
txnid_t id; txnid_t fill_gc_id;
if (txn->mt_lifo_reclaimed == 0) { mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left,
(unsigned)env->me_reclaimed_pglist[0]);
if (txn->mt_lifo_reclaimed == nullptr) {
mdbx_tassert(txn, lifo == 0); mdbx_tassert(txn, lifo == 0);
id = *(txnid_t *)key.iov_base; fill_gc_id = *(txnid_t *)key.iov_base;
mdbx_tassert(txn, id <= env->me_last_reclaimed); if (filled_gc_slot-- /* just countdown reserved records */ == 0 ||
fill_gc_id > env->me_last_reclaimed) {
mdbx_notice(
"** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN
" > last_reclaimed %" PRIaTXN,
filled_gc_slot, fill_gc_id, env->me_last_reclaimed);
goto retry;
}
} else { } else {
mdbx_tassert(txn, lifo != 0); mdbx_tassert(txn, lifo != 0);
mdbx_tassert(txn, if (filled_gc_slot == 0) {
refill_reclaimed_pos > 0 && mdbx_notice("** restart: reserve depleted (filled_slot == 0)");
refill_reclaimed_pos <= txn->mt_lifo_reclaimed[0]); goto retry;
id = txn->mt_lifo_reclaimed[refill_reclaimed_pos--]; }
key.iov_base = &id; mdbx_tassert(txn, filled_gc_slot > 0 &&
key.iov_len = sizeof(id); filled_gc_slot <= txn->mt_lifo_reclaimed[0]);
fill_gc_id = txn->mt_lifo_reclaimed[filled_gc_slot--];
mdbx_trace("%s.seek-reservaton @%" PRIaTXN " at lifo_reclaimed[%u]",
dbg_prefix_mode, fill_gc_id, (unsigned)filled_gc_slot);
key.iov_base = &fill_gc_id;
key.iov_len = sizeof(fill_gc_id);
rc = mdbx_cursor_get(&mc, &key, &data, MDBX_SET); rc = mdbx_cursor_get(&mc, &key, &data, MDBX_SET);
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
mdbx_tassert( mdbx_tassert(
txn, cleanup_reclaimed_pos == txn, cleaned_gc_slot ==
(txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0));
mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2); mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2);
size_t chunk_len = (data.iov_len / sizeof(pgno_t)) - 1; const size_t space = (data.iov_len / sizeof(pgno_t)) - 1;
if (chunk_len > rpl_left) const unsigned chunk = (space > left) ? left : (unsigned)space;
chunk_len = rpl_left; data.iov_len = (chunk + 1) * sizeof(pgno_t);
data.iov_len = (chunk_len + 1) * sizeof(pgno_t); mdbx_tassert(txn, fill_gc_id > 0 && fill_gc_id < *env->me_oldest);
key.iov_base = &id; key.iov_base = &fill_gc_id;
key.iov_len = sizeof(id); key.iov_len = sizeof(fill_gc_id);
rpl_end -= chunk_len; end -= chunk;
data.iov_base = rpl_end; data.iov_base = end;
pgno_t save = rpl_end[0]; pgno_t save = end[0];
rpl_end[0] = (pgno_t)chunk_len; end[0] = (pgno_t)chunk;
mdbx_tassert(txn, mdbx_pnl_check(rpl_end)); mdbx_tassert(txn, mdbx_pnl_check(end));
mc.mc_flags |= C_RECLAIMING; mc.mc_flags |= C_RECLAIMING;
rc = mdbx_cursor_put(&mc, &key, &data, MDBX_CURRENT); rc = mdbx_cursor_put(&mc, &key, &data, MDBX_CURRENT);
mc.mc_flags ^= C_RECLAIMING; mc.mc_flags ^= C_RECLAIMING;
mdbx_tassert(txn, mdbx_pnl_check(rpl_end)); mdbx_tassert(txn, mdbx_pnl_check(end));
mdbx_tassert( mdbx_tassert(
txn, cleanup_reclaimed_pos == txn, cleaned_gc_slot ==
(txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0));
rpl_end[0] = save; pgno_t *from = end + 1, *to = end + end[0];
if (unlikely(rc)) mdbx_trace("%s.fill: %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO
"] @%" PRIaTXN,
dbg_prefix_mode, (unsigned)end[0],
(unsigned)(from - env->me_reclaimed_pglist), *from,
(unsigned)(to - env->me_reclaimed_pglist), *to, fill_gc_id);
end[0] = save;
if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
rpl_left -= chunk_len; left -= chunk;
if (rpl_left == 0) if (left == 0) {
rc = MDBX_SUCCESS;
break; break;
}
if (!lifo) { if (!lifo) {
rc = mdbx_cursor_next(&mc, &key, &data, MDBX_NEXT); rc = mdbx_cursor_next(&mc, &key, &data, MDBX_NEXT);
if (unlikely(rc)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
} }
} }
} }
mdbx_tassert(txn, rc == MDBX_SUCCESS);
if (txn->mt_lifo_reclaimed) {
mdbx_tassert(txn, cleaned_gc_slot == txn->mt_lifo_reclaimed[0]);
if (unlikely(filled_gc_slot != 0)) {
mdbx_notice("** restart: reserve excess (filled-slot %u > 0)",
filled_gc_slot);
goto retry;
}
}
bailout: bailout:
if (txn->mt_lifo_reclaimed) { if (txn->mt_lifo_reclaimed) {
mdbx_tassert(txn, rc || cleanup_reclaimed_pos == txn->mt_lifo_reclaimed[0]);
if (rc == MDBX_SUCCESS &&
cleanup_reclaimed_pos != txn->mt_lifo_reclaimed[0]) {
mdbx_tassert(txn, cleanup_reclaimed_pos < txn->mt_lifo_reclaimed[0]);
/* LY: zeroed cleanup_idx to force cleanup
* and refill created freeDB records. */
cleanup_reclaimed_pos = 0;
/* LY: restart filling */
total_room = head_room = refill_reclaimed_pos = 0;
more = 1;
goto again_on_freelist_change;
}
txn->mt_lifo_reclaimed[0] = 0; txn->mt_lifo_reclaimed[0] = 0;
if (txn != env->me_txn0) { if (txn != env->me_txn0) {
mdbx_txl_free(txn->mt_lifo_reclaimed); mdbx_txl_free(txn->mt_lifo_reclaimed);
@ -3875,6 +3923,7 @@ bailout:
} }
} }
mdbx_trace("<<< rc = %d", rc);
return rc; return rc;
} }
@ -3997,7 +4046,7 @@ static __cold bool mdbx_txn_import_dbi(MDBX_txn *txn, MDBX_dbi dbi) {
(env->me_dbflags[i] & MDBX_VALID)) { (env->me_dbflags[i] & MDBX_VALID)) {
txn->mt_dbs[i].md_flags = env->me_dbflags[i] & PERSISTENT_FLAGS; txn->mt_dbs[i].md_flags = env->me_dbflags[i] & PERSISTENT_FLAGS;
txn->mt_dbflags[i] = DB_VALID | DB_USRVALID | DB_STALE; txn->mt_dbflags[i] = DB_VALID | DB_USRVALID | DB_STALE;
assert(txn->mt_dbxs[i].md_cmp != NULL); mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL);
} }
} }
txn->mt_numdbs = snap_numdbs; txn->mt_numdbs = snap_numdbs;
@ -4241,7 +4290,7 @@ int mdbx_txn_commit(MDBX_txn *txn) {
} }
} }
rc = mdbx_freelist_save(txn); rc = mdbx_update_gc(txn);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto fail; goto fail;
@ -4845,7 +4894,8 @@ int __cold mdbx_env_get_maxkeysize(MDBX_env *env) {
#define mdbx_maxkey(nodemax) ((nodemax) - (NODESIZE + sizeof(MDBX_db))) #define mdbx_maxkey(nodemax) ((nodemax) - (NODESIZE + sizeof(MDBX_db)))
#define mdbx_maxfree1pg(pagesize) (((pagesize)-PAGEHDRSZ) / sizeof(pgno_t) - 1) #define mdbx_maxgc_ov1page(pagesize) \
(((pagesize)-PAGEHDRSZ) / sizeof(pgno_t) - 1)
int mdbx_get_maxkeysize(size_t pagesize) { int mdbx_get_maxkeysize(size_t pagesize) {
if (pagesize == 0) if (pagesize == 0)
@ -4867,11 +4917,11 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) {
mdbx_ensure(env, pagesize <= MAX_PAGESIZE); mdbx_ensure(env, pagesize <= MAX_PAGESIZE);
env->me_psize = (unsigned)pagesize; env->me_psize = (unsigned)pagesize;
STATIC_ASSERT(mdbx_maxfree1pg(MIN_PAGESIZE) > 42); STATIC_ASSERT(mdbx_maxgc_ov1page(MIN_PAGESIZE) > 42);
STATIC_ASSERT(mdbx_maxfree1pg(MAX_PAGESIZE) < MDBX_PNL_DB_MAX); STATIC_ASSERT(mdbx_maxgc_ov1page(MAX_PAGESIZE) < MDBX_PNL_DB_MAX);
const intptr_t maxfree_1pg = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
mdbx_ensure(env, maxfree_1pg > 42 && maxfree_1pg < MDBX_PNL_DB_MAX); mdbx_ensure(env, maxgc_ov1page > 42 && maxgc_ov1page < MDBX_PNL_DB_MAX);
env->me_maxfree_1pg = (unsigned)maxfree_1pg; env->me_maxgc_ov1page = (unsigned)maxgc_ov1page;
STATIC_ASSERT(mdbx_nodemax(MIN_PAGESIZE) > 42); STATIC_ASSERT(mdbx_nodemax(MIN_PAGESIZE) > 42);
STATIC_ASSERT(mdbx_nodemax(MAX_PAGESIZE) < UINT16_MAX); STATIC_ASSERT(mdbx_nodemax(MAX_PAGESIZE) < UINT16_MAX);