mdbx: refine page_alloc() and update_gc() to reduce looping.

Resolves https://github.com/erthink/libmdbx/issues/254.
This commit is contained in:
Leonid Yuriev 2021-12-30 17:36:33 +03:00
parent 2ba90e63b1
commit ae5df65af0

View File

@ -6335,7 +6335,6 @@ no_loose:
pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; pgno_t pgno, *re_list = txn->tw.reclaimed_pglist;
unsigned range_begin = 0, re_len = MDBX_PNL_SIZE(re_list); unsigned range_begin = 0, re_len = MDBX_PNL_SIZE(re_list);
txnid_t oldest = 0, last = 0; txnid_t oldest = 0, last = 0;
const unsigned wanna_range = num - 1;
while (true) { /* hsr-kick retry loop */ while (true) { /* hsr-kick retry loop */
MDBX_cursor_couple recur; MDBX_cursor_couple recur;
@ -6348,13 +6347,15 @@ no_loose:
mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno)); txn->mt_next_pgno));
if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == MDBX_ALLOC_CACHE && if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == MDBX_ALLOC_CACHE &&
re_len > wanna_range) { re_len >= num) {
mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno &&
MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno);
range_begin = MDBX_PNL_ASCENDING ? 1 : re_len; range_begin = MDBX_PNL_ASCENDING ? 1 : re_len;
pgno = MDBX_PNL_LEAST(re_list); pgno = MDBX_PNL_LEAST(re_list);
if (likely(wanna_range == 0)) if (likely(num == 1))
goto done; goto done;
const unsigned wanna_range = num - 1;
#if MDBX_PNL_ASCENDING #if MDBX_PNL_ASCENDING
mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1); mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1);
while (true) { while (true) {
@ -6558,7 +6559,8 @@ no_loose:
} }
/* Done for a kick-reclaim mode, actually no page needed */ /* Done for a kick-reclaim mode, actually no page needed */
if (unlikely(flags & MDBX_ALLOC_SLOT)) { if (unlikely(num == 0)) {
mdbx_assert(env, flags & MDBX_ALLOC_SLOT);
ret.err = MDBX_SUCCESS; ret.err = MDBX_SUCCESS;
ret.page = NULL; ret.page = NULL;
return ret; return ret;
@ -8722,10 +8724,12 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *gc_cursor,
err); err);
} }
} }
err = mdbx_page_alloc(gc_cursor, linear4list, err =
MDBX_ALLOC_GC | MDBX_ALLOC_CACHE | MDBX_ALLOC_SLOT) mdbx_page_alloc(gc_cursor, linear4list, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT)
.err; .err;
mdbx_trace("== after-4linear, backlog %u, err %d", backlog_size(txn), err); mdbx_trace("== after-4linear, backlog %u, err %d", backlog_size(txn), err);
mdbx_cassert(gc_cursor,
backlog_size(txn) >= linear4list || err != MDBX_SUCCESS);
} }
while (backlog_size(txn) < backlog4cow + linear4list && err == MDBX_SUCCESS) while (backlog_size(txn) < backlog4cow + linear4list && err == MDBX_SUCCESS)
@ -8768,10 +8772,10 @@ static int mdbx_update_gc(MDBX_txn *txn) {
couple.outer.mc_flags |= C_RECLAIMING; couple.outer.mc_flags |= C_RECLAIMING;
couple.outer.mc_next = txn->tw.cursors[FREE_DBI]; couple.outer.mc_next = txn->tw.cursors[FREE_DBI];
txn->tw.cursors[FREE_DBI] = &couple.outer; txn->tw.cursors[FREE_DBI] = &couple.outer;
bool dense_gc = false;
retry: retry:
++loop; ++loop;
retry_noaccount:
mdbx_trace("%s", " >> restart"); mdbx_trace("%s", " >> restart");
mdbx_tassert(txn, mdbx_tassert(txn,
mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
@ -8783,7 +8787,7 @@ retry_noaccount:
goto bailout; goto bailout;
} }
if (retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages)) { if (unlikely(dense_gc) && retired_stored) {
rc = mdbx_prep_backlog(txn, &couple.outer, rc = mdbx_prep_backlog(txn, &couple.outer,
MDBX_PNL_SIZEOF(txn->tw.retired_pages), MDBX_PNL_SIZEOF(txn->tw.retired_pages),
&retired_stored); &retired_stored);
@ -8799,6 +8803,15 @@ retry_noaccount:
MDBX_val key, data; MDBX_val key, data;
mdbx_trace("%s", " >> continue"); mdbx_trace("%s", " >> continue");
if (retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) &&
MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page) {
rc = mdbx_prep_backlog(txn, &couple.outer,
MDBX_PNL_SIZEOF(txn->tw.retired_pages),
&retired_stored);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
}
mdbx_tassert(txn, mdbx_tassert(txn,
mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno - MDBX_ENABLE_REFUND)); txn->mt_next_pgno - MDBX_ENABLE_REFUND));
@ -8823,9 +8836,11 @@ retry_noaccount:
continue; continue;
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr); if (likely(!dense_gc)) {
if (unlikely(rc != MDBX_SUCCESS)) rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr);
goto bailout; if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
}
mdbx_tassert(txn, mdbx_tassert(txn,
cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); cleaned_gc_id < env->me_lck->mti_oldest_reader.weak);
mdbx_trace("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, mdbx_trace("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode,
@ -8863,7 +8878,7 @@ retry_noaccount:
} }
if (cleaned_gc_id > txn->tw.last_reclaimed) if (cleaned_gc_id > txn->tw.last_reclaimed)
break; break;
if (cleaned_gc_id < txn->tw.last_reclaimed) { if (likely(!dense_gc) && cleaned_gc_id < txn->tw.last_reclaimed) {
rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr); rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
@ -9014,11 +9029,12 @@ retry_noaccount:
mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]); mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]);
mdbx_debug_extra_print("%s\n", "."); mdbx_debug_extra_print("%s\n", ".");
} }
if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) &&
settled)) {
mdbx_trace("%s: reclaimed-list changed %u -> %u, retry", mdbx_trace("%s: reclaimed-list changed %u -> %u, retry",
dbg_prefix_mode, amount, dbg_prefix_mode, amount,
(unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
goto retry_noaccount /* rare case, but avoids GC fragmentation goto retry /* rare case, but avoids GC fragmentation
and one cycle. */ and one cycle. */
; ;
} }
@ -9062,7 +9078,8 @@ retry_noaccount:
prefer_max_scatter && prefer_max_scatter &&
left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
reused_gc_slot) * reused_gc_slot) *
env->me_maxgc_ov1page) { env->me_maxgc_ov1page &&
!dense_gc) {
/* LY: need just a txn-id for save page list. */ /* LY: need just a txn-id for save page list. */
bool need_cleanup = false; bool need_cleanup = false;
@ -9118,7 +9135,7 @@ retry_noaccount:
left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) -
reused_gc_slot) * reused_gc_slot) *
env->me_maxgc_ov1page) { env->me_maxgc_ov1page) {
if (unlikely(gc_rid < 2)) { if (unlikely(gc_rid <= MIN_TXNID)) {
if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <= if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <=
reused_gc_slot)) { reused_gc_slot)) {
mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= " mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= "
@ -9143,6 +9160,7 @@ retry_noaccount:
rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_FIRST); rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_FIRST);
if (rc == MDBX_NOTFOUND) { if (rc == MDBX_NOTFOUND) {
mdbx_debug("%s: GC is empty", dbg_prefix_mode); mdbx_debug("%s: GC is empty", dbg_prefix_mode);
dense_gc = true;
break; break;
} }
if (unlikely(rc != MDBX_SUCCESS || if (unlikely(rc != MDBX_SUCCESS ||
@ -9156,14 +9174,16 @@ retry_noaccount:
rc = MDBX_CORRUPTED; rc = MDBX_CORRUPTED;
goto bailout; goto bailout;
} }
if (gc_first < 2) { if (gc_first <= MIN_TXNID) {
mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN, mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN,
dbg_prefix_mode, gc_rid); dbg_prefix_mode, gc_rid);
dense_gc = true;
break; break;
} }
gc_rid = gc_first - 1; gc_rid = gc_first - 1;
} }
mdbx_assert(env, !dense_gc);
rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, gc_rid); rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, gc_rid);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
@ -9181,10 +9201,12 @@ retry_noaccount:
dbg_prefix_mode, gc_rid, cleaned_gc_slot); dbg_prefix_mode, gc_rid, cleaned_gc_slot);
} }
if (need_cleanup) { if (need_cleanup || dense_gc) {
if (cleaned_gc_slot)
mdbx_trace(
"%s: restart inner-loop to clear and re-create GC entries",
dbg_prefix_mode);
cleaned_gc_slot = 0; cleaned_gc_slot = 0;
mdbx_trace("%s: restart inner-loop to clear and re-create GC entries",
dbg_prefix_mode);
continue; continue;
} }
} }
@ -9216,6 +9238,7 @@ retry_noaccount:
gc_rid = gc_first - 1; gc_rid = gc_first - 1;
if (unlikely(gc_rid == 0)) { if (unlikely(gc_rid == 0)) {
mdbx_error("%s", "** no GC tail-space to store"); mdbx_error("%s", "** no GC tail-space to store");
dense_gc = true;
goto retry; goto retry;
} }
} else if (rc != MDBX_NOTFOUND) } else if (rc != MDBX_NOTFOUND)
@ -9251,7 +9274,7 @@ retry_noaccount:
const unsigned tail = left - threshold + env->me_maxgc_ov1page + 1; const unsigned tail = left - threshold + env->me_maxgc_ov1page + 1;
unsigned span = 1; unsigned span = 1;
unsigned avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / unsigned avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) /
sizeof(pgno_t)) /*- 1 + span */; sizeof(pgno_t)) /* - 1 + span */;
if (tail > avail) { if (tail > avail) {
for (unsigned i = amount - span; i > 0; --i) { for (unsigned i = amount - span; i > 0; --i) {
if (MDBX_PNL_ASCENDING if (MDBX_PNL_ASCENDING
@ -9288,7 +9311,7 @@ retry_noaccount:
mdbx_tassert(txn, reservation_gc_id < env->me_lck->mti_oldest_reader.weak); mdbx_tassert(txn, reservation_gc_id < env->me_lck->mti_oldest_reader.weak);
if (unlikely( if (unlikely(
reservation_gc_id < 1 || reservation_gc_id < MIN_TXNID ||
reservation_gc_id >= reservation_gc_id >=
atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) { atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) {
mdbx_error("** internal error (reservation_gc_id %" PRIaTXN ")", mdbx_error("** internal error (reservation_gc_id %" PRIaTXN ")",
@ -9317,10 +9340,12 @@ retry_noaccount:
chunk); chunk);
if (txn->tw.lifo_reclaimed && if (txn->tw.lifo_reclaimed &&
unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) &&
(loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount >
env->me_maxgc_ov1page)) {
mdbx_notice("** restart: reclaimed-list growth %u -> %u", amount, mdbx_notice("** restart: reclaimed-list growth %u -> %u", amount,
(unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
goto retry_noaccount; goto retry;
} }
continue; continue;