mdbx: account for loose pages during spilling and purge them when dirtyroom is exhausted.

Related to https://github.com/erthink/libmdbx/issues/186

Change-Id: I50c11195ad31615181f8359e123c1ffe2ebcfc30
This commit is contained in:
Leonid Yuriev 2021-04-28 15:20:15 +03:00
parent 17116b9b46
commit 329a2a50e6

View File

@ -5101,7 +5101,7 @@ static int spill_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp,
static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
const unsigned need) { const unsigned need) {
#ifndef MDBX_DEBUG_SPILLING #ifndef MDBX_DEBUG_SPILLING
if (likely(txn->tw.dirtyroom >= need)) if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
return MDBX_SUCCESS; return MDBX_SUCCESS;
unsigned wanna_spill = need - txn->tw.dirtyroom; unsigned wanna_spill = need - txn->tw.dirtyroom;
#else #else
@ -5217,7 +5217,7 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
} }
/* half of 8-bit radix-sort */ /* half of 8-bit radix-sort */
unsigned radix_counters[256], spillable = 0; unsigned radix_counters[256], spillable = 0, spilled = 0;
memset(&radix_counters, 0, sizeof(radix_counters)); memset(&radix_counters, 0, sizeof(radix_counters));
unsigned const reciprocal = 255 * 256 / (lru_max - lru_min + 1); unsigned const reciprocal = 255 * 256 / (lru_max - lru_min + 1);
for (unsigned i = 1; i <= dl->length; ++i) { for (unsigned i = 1; i <= dl->length; ++i) {
@ -5228,74 +5228,75 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
} }
} }
unsigned prio2spill = 0, prio2adjacent = 127, amount = radix_counters[0]; if (likely(spillable > 0)) {
for (unsigned i = 1; i < 256; i++) { unsigned prio2spill = 0, prio2adjacent = 127, amount = radix_counters[0];
if (amount < wanna_spill) { for (unsigned i = 1; i < 256; i++) {
prio2spill = i; if (amount < wanna_spill) {
prio2adjacent = i + (255 - i) / 2; prio2spill = i;
amount += radix_counters[i]; prio2adjacent = i + (255 - i) / 2;
} else if (amount + amount < spillable + wanna_spill amount += radix_counters[i];
/* РАВНОЗНАЧНО: amount - wanna_spill < spillable - amount */) { } else if (amount + amount < spillable + wanna_spill
prio2adjacent = i; /* РАВНОЗНАЧНО: amount - wanna_spill < spillable - amount */) {
amount += radix_counters[i]; prio2adjacent = i;
} else amount += radix_counters[i];
break; } else
} break;
}
unsigned prev_prio = 256, spilled = 0; unsigned prev_prio = 256;
unsigned r, w, prio; unsigned r, w, prio;
for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill; for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill;
prev_prio = prio, ++r) { prev_prio = prio, ++r) {
prio = spill_prio(txn, r, lru_min, reciprocal); prio = spill_prio(txn, r, lru_min, reciprocal);
MDBX_page *const dp = dl->items[r].ptr; MDBX_page *const dp = dl->items[r].ptr;
if (prio < prio2adjacent) { if (prio < prio2adjacent) {
const pgno_t pgno = dl->items[r].pgno; const pgno_t pgno = dl->items[r].pgno;
const unsigned npages = dpl_npages(dl, r); const unsigned npages = dpl_npages(dl, r);
if (prio <= prio2spill) { if (prio <= prio2spill) {
if (prev_prio < prio2adjacent && prev_prio > prio2spill && if (prev_prio < prio2adjacent && prev_prio > prio2spill &&
dpl_endpgno(dl, r - 1) == pgno) { dpl_endpgno(dl, r - 1) == pgno) {
mdbx_debug("co-spill %u prev-adjacent page %" PRIaPGNO mdbx_debug("co-spill %u prev-adjacent page %" PRIaPGNO
" (lru-dist %d, prio %u)", " (lru-dist %d, prio %u)",
dpl_npages(dl, w), dl->items[r - 1].pgno, dpl_npages(dl, w), dl->items[r - 1].pgno,
txn->tw.dirtylru - dl->items[r - 1].lru, prev_prio); txn->tw.dirtylru - dl->items[r - 1].lru, prev_prio);
--w; --w;
rc = spill_page(txn, &ctx, dl->items[r - 1].ptr, rc = spill_page(txn, &ctx, dl->items[r - 1].ptr,
dpl_npages(dl, r - 1)); dpl_npages(dl, r - 1));
if (unlikely(rc != MDBX_SUCCESS))
break;
++spilled;
}
mdbx_debug("spill %u page %" PRIaPGNO " (lru-dist %d, prio %u)",
npages, dp->mp_pgno, txn->tw.dirtylru - dl->items[r].lru,
prio);
rc = spill_page(txn, &ctx, dp, npages);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
break; break;
++spilled; ++spilled;
continue;
} }
mdbx_debug("spill %u page %" PRIaPGNO " (lru-dist %d, prio %u)", npages, if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) {
dp->mp_pgno, txn->tw.dirtylru - dl->items[r].lru, prio); mdbx_debug("co-spill %u next-adjacent page %" PRIaPGNO
rc = spill_page(txn, &ctx, dp, npages); " (lru-dist %d, prio %u)",
if (unlikely(rc != MDBX_SUCCESS)) npages, dp->mp_pgno, txn->tw.dirtylru - dl->items[r].lru,
break; prio);
++spilled; rc = spill_page(txn, &ctx, dp, npages);
continue; if (unlikely(rc != MDBX_SUCCESS))
} break;
prio = prev_prio /* to continue co-spilling next adjacent pages */;
if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) { ++spilled;
mdbx_debug("co-spill %u next-adjacent page %" PRIaPGNO continue;
" (lru-dist %d, prio %u)", }
npages, dp->mp_pgno, txn->tw.dirtylru - dl->items[r].lru,
prio);
rc = spill_page(txn, &ctx, dp, npages);
if (unlikely(rc != MDBX_SUCCESS))
break;
prio = prev_prio /* to continue co-spilling next adjacent pages */;
++spilled;
continue;
} }
dl->items[++w] = dl->items[r];
} }
dl->items[++w] = dl->items[r];
}
while (r <= dl->length) while (r <= dl->length)
dl->items[++w] = dl->items[r++]; dl->items[++w] = dl->items[r++];
mdbx_tassert(txn, r - 1 - w == spilled); mdbx_tassert(txn, r - 1 - w == spilled);
if (likely(spilled > 0)) {
dl->sorted = dpl_setlen(dl, w); dl->sorted = dpl_setlen(dl, w);
txn->tw.dirtyroom += spilled; txn->tw.dirtyroom += spilled;
mdbx_tassert(txn, mdbx_dirtylist_check(txn)); mdbx_tassert(txn, mdbx_dirtylist_check(txn));
@ -5315,7 +5316,9 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
mdbx_tassert(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS); mdbx_tassert(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS);
} }
return likely(txn->tw.dirtyroom > need / 2) ? MDBX_SUCCESS : MDBX_TXN_FULL; return likely(txn->tw.loose_count + txn->tw.dirtyroom > need / 2)
? MDBX_SUCCESS
: MDBX_TXN_FULL;
} }
static int mdbx_cursor_spill(MDBX_cursor *mc, const MDBX_val *key, static int mdbx_cursor_spill(MDBX_cursor *mc, const MDBX_val *key,
@ -5631,16 +5634,35 @@ static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) {
/* Add a page to the txn's dirty list */ /* Add a page to the txn's dirty list */
static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp,
unsigned npages) { unsigned npages) {
int rc;
mp->mp_txnid = txn->mt_front; mp->mp_txnid = txn->mt_front;
if (unlikely(txn->tw.dirtyroom == 0)) { if (unlikely(txn->tw.dirtyroom == 0)) {
mdbx_error("Dirtyroom is depleted, DPL length %u", if (txn->tw.loose_count) {
txn->tw.dirtylist->length); MDBX_page *loose = txn->tw.loose_pages;
if (!(txn->mt_flags & MDBX_WRITEMAP)) mdbx_debug("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno);
mdbx_dpage_free(txn->mt_env, mp, npages); rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1);
return MDBX_TXN_FULL; if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
unsigned di = mdbx_dpl_search(txn, loose->mp_pgno);
mdbx_tassert(txn, txn->tw.dirtylist->items[di].ptr == loose);
mdbx_dpl_remove(txn, di);
txn->tw.loose_pages = loose->mp_next;
txn->tw.loose_count--;
txn->tw.dirtyroom++;
if (!(txn->mt_flags & MDBX_WRITEMAP))
mdbx_dpage_free(txn->mt_env, loose, 1);
} else {
mdbx_error("Dirtyroom is depleted, DPL length %u",
txn->tw.dirtylist->length);
if (!(txn->mt_flags & MDBX_WRITEMAP))
mdbx_dpage_free(txn->mt_env, mp, npages);
return MDBX_TXN_FULL;
}
} }
const int rc = mdbx_dpl_append(txn, mp->mp_pgno, mp, npages);
rc = mdbx_dpl_append(txn, mp->mp_pgno, mp, npages);
if (unlikely(rc != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
bailout:
txn->mt_flags |= MDBX_TXN_ERROR; txn->mt_flags |= MDBX_TXN_ERROR;
return rc; return rc;
} }