mdbx: one more fix/rewrite mdbx_update_gc().

Change-Id: I7188f1566488d239d018311286612c3117f58127
This commit is contained in:
Leonid Yuriev 2018-08-21 15:05:04 +03:00
parent 7aab221bf4
commit f371f10743

View File

@ -1,4 +1,4 @@
/* /*
* Copyright 2015-2018 Leonid Yuriev <leo@yuriev.ru> * Copyright 2015-2018 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file. * and other libmdbx authors: please see AUTHORS file.
* All rights reserved. * All rights reserved.
@ -3539,7 +3539,7 @@ static int mdbx_update_gc(MDBX_txn *txn) {
/* env->me_reclaimed_pglist[] can grow and shrink during this call. /* env->me_reclaimed_pglist[] can grow and shrink during this call.
* env->me_last_reclaimed and txn->mt_befree_pages[] can only grow. * env->me_last_reclaimed and txn->mt_befree_pages[] can only grow.
* Page numbers cannot disappear from txn->mt_befree_pages[]. */ * Page numbers cannot disappear from txn->mt_befree_pages[]. */
MDBX_env *env = txn->mt_env; MDBX_env *const env = txn->mt_env;
const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0;
MDBX_cursor mc; MDBX_cursor mc;
@ -3556,18 +3556,18 @@ static int mdbx_update_gc(MDBX_txn *txn) {
retry: retry:
mdbx_trace(" >> restart"); mdbx_trace(" >> restart");
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true));
unsigned placed = 0, cleaned_gc_slot = 0, reused_gc_slots = 0,
filled_gc_slot = ~0u;
txnid_t cleaned_gc_id = 0, head_gc_id = env->me_last_reclaimed
? env->me_last_reclaimed
: ~(txnid_t)0;
if (unlikely(/* paranoia */ ++loop > 42)) { if (unlikely(/* paranoia */ ++loop > 42)) {
mdbx_error("too more loops %u, bailout", loop); mdbx_error("too more loops %u, bailout", loop);
rc = MDBX_PROBLEM; rc = MDBX_PROBLEM;
goto bailout; goto bailout;
} }
unsigned placed = 0, cleaned_gc_slot = 0, reused_gc_slot = 0,
filled_gc_slot = ~0u;
txnid_t cleaned_gc_id = 0, head_gc_id = env->me_last_reclaimed
? env->me_last_reclaimed
: ~(txnid_t)0;
while (1) { while (1) {
/* Come back here after each Put() in case befree-list changed */ /* Come back here after each Put() in case befree-list changed */
MDBX_val key, data; MDBX_val key, data;
@ -3585,7 +3585,6 @@ retry:
goto bailout; goto bailout;
cleaned_gc_id = head_gc_id = *(txnid_t *)key.iov_base; cleaned_gc_id = head_gc_id = *(txnid_t *)key.iov_base;
mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest);
placed = 0;
mdbx_tassert(txn, cleaned_gc_id <= env->me_last_reclaimed); mdbx_tassert(txn, cleaned_gc_id <= env->me_last_reclaimed);
mc.mc_flags |= C_RECLAIMING; mc.mc_flags |= C_RECLAIMING;
mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode,
@ -3594,32 +3593,34 @@ retry:
mc.mc_flags ^= C_RECLAIMING; mc.mc_flags ^= C_RECLAIMING;
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
placed = 0;
} }
} else if (txn->mt_lifo_reclaimed) { } else if (txn->mt_lifo_reclaimed &&
cleaned_gc_slot < txn->mt_lifo_reclaimed[0]) {
/* LY: cleanup reclaimed records. */ /* LY: cleanup reclaimed records. */
while (cleaned_gc_slot < txn->mt_lifo_reclaimed[0]) { do {
cleaned_gc_id = txn->mt_lifo_reclaimed[++cleaned_gc_slot]; cleaned_gc_id = txn->mt_lifo_reclaimed[++cleaned_gc_slot];
assert(cleaned_gc_slot > 0 && cleaned_gc_id < *env->me_oldest); assert(cleaned_gc_slot > 0 && cleaned_gc_id < *env->me_oldest);
head_gc_id = (head_gc_id > cleaned_gc_id) ? cleaned_gc_id : head_gc_id; head_gc_id = (head_gc_id > cleaned_gc_id) ? cleaned_gc_id : head_gc_id;
key.iov_base = &cleaned_gc_id; key.iov_base = &cleaned_gc_id;
key.iov_len = sizeof(cleaned_gc_id); key.iov_len = sizeof(cleaned_gc_id);
rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_SET); rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_SET);
if (likely(rc != MDBX_NOTFOUND)) { if (rc == MDBX_NOTFOUND)
if (unlikely(rc != MDBX_SUCCESS)) continue;
goto bailout; if (unlikely(rc != MDBX_SUCCESS))
rc = mdbx_prep_backlog(txn, &mc); goto bailout;
if (unlikely(rc != MDBX_SUCCESS)) rc = mdbx_prep_backlog(txn, &mc);
goto bailout; if (unlikely(rc != MDBX_SUCCESS))
mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest); goto bailout;
mc.mc_flags |= C_RECLAIMING; mdbx_tassert(txn, cleaned_gc_id < *env->me_oldest);
mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, mc.mc_flags |= C_RECLAIMING;
cleaned_gc_slot, cleaned_gc_id); mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode,
rc = mdbx_cursor_del(&mc, 0); cleaned_gc_slot, cleaned_gc_id);
mc.mc_flags ^= C_RECLAIMING; rc = mdbx_cursor_del(&mc, 0);
if (unlikely(rc != MDBX_SUCCESS)) mc.mc_flags ^= C_RECLAIMING;
goto bailout; if (unlikely(rc != MDBX_SUCCESS))
} goto bailout;
} } while (cleaned_gc_slot < txn->mt_lifo_reclaimed[0]);
mdbx_txl_sort(txn->mt_lifo_reclaimed); mdbx_txl_sort(txn->mt_lifo_reclaimed);
assert(txn->mt_lifo_reclaimed[0] == 0 || assert(txn->mt_lifo_reclaimed[0] == 0 ||
txn->mt_lifo_reclaimed[txn->mt_lifo_reclaimed[0]] == head_gc_id); txn->mt_lifo_reclaimed[txn->mt_lifo_reclaimed[0]] == head_gc_id);
@ -3753,21 +3754,30 @@ retry:
// handle reclaimed and loost pages - merge and store both into gc // handle reclaimed and loost pages - merge and store both into gc
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true));
mdbx_tassert(txn, txn->mt_loose_count == 0); mdbx_tassert(txn, txn->mt_loose_count == 0);
mdbx_trace(" >> reserving");
const unsigned amount = const unsigned amount =
env->me_reclaimed_pglist ? env->me_reclaimed_pglist[0] : 0; env->me_reclaimed_pglist ? env->me_reclaimed_pglist[0] : 0;
const unsigned left = amount - placed; const unsigned left = amount - placed;
mdbx_trace("%s: amount %u, placed %d, left %d", dbg_prefix_mode, amount, mdbx_trace("%s: amount %u, placed %d, left %d", dbg_prefix_mode, amount,
placed, (int)left); placed, (int)left);
if (0 >= (int)left) if (0 >= (int)left)
break; break;
mdbx_trace(" >> reserving");
txnid_t reservation_gc_id; txnid_t reservation_gc_id;
const unsigned lifo_gc_slots =
txn->mt_lifo_reclaimed ? (unsigned)txn->mt_lifo_reclaimed[0] : 0;
if (lifo) { if (lifo) {
if (reused_gc_slot >= lifo_gc_slots) { assert(txn->mt_lifo_reclaimed != NULL);
if (unlikely(!txn->mt_lifo_reclaimed)) {
txn->mt_lifo_reclaimed = mdbx_txl_alloc();
if (unlikely(!txn->mt_lifo_reclaimed)) {
rc = MDBX_ENOMEM;
goto bailout;
}
}
if (head_gc_id > 1 && txn->mt_lifo_reclaimed[0] < INT16_MAX &&
left > ((unsigned)txn->mt_lifo_reclaimed[0] - reused_gc_slots) *
env->me_maxgc_ov1page) {
/* LY: need just a txn-id for save page list. */ /* LY: need just a txn-id for save page list. */
rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK);
if (likely(rc == MDBX_SUCCESS)) if (likely(rc == MDBX_SUCCESS))
@ -3777,58 +3787,101 @@ retry:
/* LY: other troubles... */ /* LY: other troubles... */
goto bailout; goto bailout;
if (unlikely(!txn->mt_lifo_reclaimed)) {
txn->mt_lifo_reclaimed = mdbx_txl_alloc();
if (unlikely(!txn->mt_lifo_reclaimed)) {
rc = MDBX_ENOMEM;
goto bailout;
}
}
/* LY: freedb is empty, will look any free txn-id in high2low order. */ /* LY: freedb is empty, will look any free txn-id in high2low order. */
rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, --head_gc_id); do {
if (unlikely(rc != MDBX_SUCCESS)) --head_gc_id;
goto bailout; assert(txn->mt_lifo_reclaimed[txn->mt_lifo_reclaimed[0]] >
cleaned_gc_slot += 1 /* mark GC cleanup is not needed. */; head_gc_id);
rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, head_gc_id);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
cleaned_gc_slot += 1 /* mark GC cleanup is not needed. */;
mdbx_trace("%s: append @%" PRIaTXN mdbx_trace("%s: append @%" PRIaTXN
" to lifo-reclaimed, cleaned-gc-slot = %u", " to lifo-reclaimed, cleaned-gc-slot = %u",
dbg_prefix_mode, head_gc_id, cleaned_gc_slot); dbg_prefix_mode, head_gc_id, cleaned_gc_slot);
} while (head_gc_id > 1 && txn->mt_lifo_reclaimed[0] < INT16_MAX &&
left >
((unsigned)txn->mt_lifo_reclaimed[0] - reused_gc_slots) *
env->me_maxgc_ov1page);
} }
mdbx_tassert(txn, txn->mt_lifo_reclaimed != NULL);
reservation_gc_id = txn->mt_lifo_reclaimed[++reused_gc_slot]; if ((unsigned)txn->mt_lifo_reclaimed[0] <= reused_gc_slots) {
mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= "
"lifo_reclaimed %u" PRIaTXN,
reused_gc_slots, (unsigned)txn->mt_lifo_reclaimed[0]);
goto retry;
}
const unsigned i = (unsigned)txn->mt_lifo_reclaimed[0] - reused_gc_slots;
assert(i > 0 && i <= txn->mt_lifo_reclaimed[0]);
reservation_gc_id = txn->mt_lifo_reclaimed[i];
mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]",
dbg_prefix_mode, reservation_gc_id, reused_gc_slot); dbg_prefix_mode, reservation_gc_id, i);
head_gc_id =
(head_gc_id > reservation_gc_id) ? reservation_gc_id : head_gc_id;
} else { } else {
mdbx_tassert(txn, txn->mt_lifo_reclaimed == NULL); mdbx_tassert(txn, txn->mt_lifo_reclaimed == NULL);
reused_gc_slot++ /* just count reserved records */;
reservation_gc_id = head_gc_id--; reservation_gc_id = head_gc_id--;
mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode,
reservation_gc_id); reservation_gc_id);
} }
mdbx_trace("%s: head_gc_id %" PRIaTXN ++reused_gc_slots;
", reused_gc_slot %u, lifo_gc_slots %u, reservation-id " assert(txn->mt_lifo_reclaimed == NULL ||
txn->mt_lifo_reclaimed[0] <= INT16_MAX);
unsigned chunk = left;
if (unlikely(chunk > env->me_maxgc_ov1page)) {
const unsigned avail_gs_slots =
lifo ? (unsigned)txn->mt_lifo_reclaimed[0] - reused_gc_slots
: (head_gc_id < INT16_MAX) ? (unsigned)head_gc_id : INT16_MAX;
if (avail_gs_slots > 1) {
if (chunk < env->me_maxgc_ov1page * 2)
chunk /= 2;
else {
const unsigned threshold = env->me_maxgc_ov1page * avail_gs_slots;
if (left < threshold)
chunk = env->me_maxgc_ov1page;
else {
const unsigned tail = left - threshold + env->me_maxgc_ov1page + 1;
unsigned span = 1;
unsigned avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) /
sizeof(pgno_t)) /*- 1 + span */;
if (tail > avail) {
for (unsigned i = env->me_reclaimed_pglist[0] - span; i > 0;
--i) {
if (MDBX_PNL_ASCENDING
? (env->me_reclaimed_pglist[i] + span)
: (env->me_reclaimed_pglist[i] - span) ==
env->me_reclaimed_pglist[i + span]) {
span += 1;
avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) /
sizeof(pgno_t)) -
1 + span;
if (avail >= tail)
break;
}
}
}
chunk = (avail >= tail)
? tail - span
: (avail_gs_slots > 3 && reused_gc_slots < 42)
? avail - span
: tail;
}
}
}
}
assert(chunk > 0);
mdbx_trace("%s: head_gc_id %" PRIaTXN ", reused_gc_slot %u, reservation-id "
"%" PRIaTXN, "%" PRIaTXN,
dbg_prefix_mode, head_gc_id, reused_gc_slot, lifo_gc_slots, dbg_prefix_mode, head_gc_id, reused_gc_slots, reservation_gc_id);
reservation_gc_id);
const bool no_slots_more = mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk,
head_gc_id < 2 && (!lifo || reused_gc_slot >= lifo_gc_slots);
const unsigned chunk =
(left < env->me_maxgc_ov1page || no_slots_more)
? left
: (left < env->me_maxgc_ov1page * 2)
? /* the half to each of the last two chunks */ left / 2
: env->me_maxgc_ov1page;
mdbx_trace("%s: chunk %u, no_slots_more %s, gc-per-ovpage %u",
dbg_prefix_mode, chunk, no_slots_more ? "yes" : "no",
env->me_maxgc_ov1page); env->me_maxgc_ov1page);
mdbx_tassert(txn, reservation_gc_id < *env->me_oldest); mdbx_tassert(txn, reservation_gc_id < *env->me_oldest);
if (unlikely(reservation_gc_id < 1)) { if (unlikely(reservation_gc_id < 1 ||
reservation_gc_id >= *env->me_oldest)) {
/* LY: not any txn in the past of freedb. */ /* LY: not any txn in the past of freedb. */
rc = MDBX_PROBLEM; rc = MDBX_PROBLEM;
goto bailout; goto bailout;
@ -3857,7 +3910,9 @@ retry:
mdbx_trace(" >> filling"); mdbx_trace(" >> filling");
/* Fill in the reserved records */ /* Fill in the reserved records */
filled_gc_slot = reused_gc_slot; filled_gc_slot = txn->mt_lifo_reclaimed
? (unsigned)txn->mt_lifo_reclaimed[0] - reused_gc_slots
: reused_gc_slots;
rc = MDBX_SUCCESS; rc = MDBX_SUCCESS;
mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true)); mdbx_tassert(txn, mdbx_pnl_check(env->me_reclaimed_pglist, true));
if (env->me_reclaimed_pglist && env->me_reclaimed_pglist[0]) { if (env->me_reclaimed_pglist && env->me_reclaimed_pglist[0]) {
@ -3883,8 +3938,7 @@ retry:
if (txn->mt_lifo_reclaimed == nullptr) { if (txn->mt_lifo_reclaimed == nullptr) {
mdbx_tassert(txn, lifo == 0); mdbx_tassert(txn, lifo == 0);
fill_gc_id = *(txnid_t *)key.iov_base; fill_gc_id = *(txnid_t *)key.iov_base;
if (filled_gc_slot-- /* just countdown reserved records */ == 0 || if (filled_gc_slot-- == 0 || fill_gc_id > env->me_last_reclaimed) {
fill_gc_id > env->me_last_reclaimed) {
mdbx_notice( mdbx_notice(
"** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN
" > last_reclaimed %" PRIaTXN, " > last_reclaimed %" PRIaTXN,
@ -3893,15 +3947,15 @@ retry:
} }
} else { } else {
mdbx_tassert(txn, lifo != 0); mdbx_tassert(txn, lifo != 0);
if (filled_gc_slot == 0) { if (++filled_gc_slot > (unsigned)txn->mt_lifo_reclaimed[0]) {
mdbx_notice("** restart: reserve depleted (filled_slot == 0)"); mdbx_notice("** restart: reserve depleted (filled_gc_slot %u > "
"lifo_reclaimed %u" PRIaTXN,
filled_gc_slot, (unsigned)txn->mt_lifo_reclaimed[0]);
goto retry; goto retry;
} }
mdbx_tassert(txn, filled_gc_slot > 0 && fill_gc_id = txn->mt_lifo_reclaimed[filled_gc_slot];
filled_gc_slot <= txn->mt_lifo_reclaimed[0]);
fill_gc_id = txn->mt_lifo_reclaimed[filled_gc_slot--];
mdbx_trace("%s.seek-reservaton @%" PRIaTXN " at lifo_reclaimed[%u]", mdbx_trace("%s.seek-reservaton @%" PRIaTXN " at lifo_reclaimed[%u]",
dbg_prefix_mode, fill_gc_id, (unsigned)filled_gc_slot); dbg_prefix_mode, fill_gc_id, filled_gc_slot);
key.iov_base = &fill_gc_id; key.iov_base = &fill_gc_id;
key.iov_len = sizeof(fill_gc_id); key.iov_len = sizeof(fill_gc_id);
rc = mdbx_cursor_get(&mc, &key, &data, MDBX_SET); rc = mdbx_cursor_get(&mc, &key, &data, MDBX_SET);
@ -3957,17 +4011,17 @@ retry:
} }
mdbx_tassert(txn, rc == MDBX_SUCCESS); mdbx_tassert(txn, rc == MDBX_SUCCESS);
if (txn->mt_lifo_reclaimed) { if (unlikely(
mdbx_tassert(txn, cleaned_gc_slot == txn->mt_lifo_reclaimed[0]); filled_gc_slot !=
if (unlikely(filled_gc_slot != 0)) { (txn->mt_lifo_reclaimed ? (unsigned)txn->mt_lifo_reclaimed[0] : 0))) {
mdbx_notice("** restart: reserve excess (filled-slot %u > 0)", mdbx_notice("** restart: reserve excess (filled-slot %u)", filled_gc_slot);
filled_gc_slot); goto retry;
goto retry;
}
} }
bailout: bailout:
if (txn->mt_lifo_reclaimed) { if (txn->mt_lifo_reclaimed) {
mdbx_tassert(txn, rc != MDBX_SUCCESS ||
cleaned_gc_slot == txn->mt_lifo_reclaimed[0]);
txn->mt_lifo_reclaimed[0] = 0; txn->mt_lifo_reclaimed[0] = 0;
if (txn != env->me_txn0) { if (txn != env->me_txn0) {
mdbx_txl_free(txn->mt_lifo_reclaimed); mdbx_txl_free(txn->mt_lifo_reclaimed);