From 6bedb02ac08694b7bf51d18fb2391b0edb0adb6b Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Thu, 20 May 2021 17:42:22 +0300 Subject: [PATCH] mdbx: fix lru-counter overflow by module 2^31. 2 of 2 fixes for https://github.com/erthink/libmdbx/issues/195 --- src/core.c | 91 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/src/core.c b/src/core.c index 1afd3e01..d388b5f9 100644 --- a/src/core.c +++ b/src/core.c @@ -3535,6 +3535,13 @@ mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) { return MDBX_SUCCESS; } +static __inline uint32_t mdbx_dpl_age(const MDBX_txn *txn, unsigned i) { + const MDBX_dpl *dl = txn->tw.dirtylist; + assert((int)i > 0 && i <= dl->length); + /* overflow could be here */ + return (txn->tw.dirtylru - dl->items[i].lru) & UINT32_C(0x7fffFFFF); +} + /*----------------------------------------------------------------------------*/ uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT; @@ -4207,8 +4214,9 @@ MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) { if (unlikely(dp->mp_pgno != dl->items[i].pgno)) return false; - mdbx_tassert(txn, txn->tw.dirtylru >= dl->items[i].lru); - if (unlikely(txn->tw.dirtylru < dl->items[i].lru)) + const uint32_t age = mdbx_dpl_age(txn, i); + mdbx_tassert(txn, age < UINT32_MAX / 3); + if (unlikely(age > UINT32_MAX / 3)) return false; mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); @@ -4977,14 +4985,14 @@ static int spill_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { unsigned keep = 0; while (mc->mc_flags & C_INITIALIZED) { - for (unsigned i = 0; i < mc->mc_snum; i++) { + for (unsigned i = 0; i < mc->mc_snum; ++i) { const MDBX_page *mp = mc->mc_pg[i]; if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { unsigned const n = mdbx_dpl_search(txn, mp->mp_pgno); if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && - txn->tw.dirtylist->items[n].lru != txn->tw.dirtylru) { + mdbx_dpl_age(txn, n)) { txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; - keep++; + ++keep; } } } @@ -5011,12 +5019,12 @@ static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { * ... * > 255 = must not be spilled. */ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, - const unsigned lru_min, const unsigned reciprocal) { + const uint32_t reciprocal) { MDBX_dpl *const dl = txn->tw.dirtylist; - const unsigned lru = dl->items[i].lru; + const uint32_t age = mdbx_dpl_age(txn, i); const unsigned npages = dpl_npages(dl, i); const pgno_t pgno = dl->items[i].pgno; - if (lru == txn->tw.dirtylru) { + if (age == 0) { mdbx_debug("skip %s %u page %" PRIaPGNO, "keep", npages, pgno); return 256; } @@ -5045,21 +5053,22 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, while ((parent = parent->mt_parent) != nullptr); } - unsigned prio = 1 + ((lru - lru_min) * reciprocal >> 8); - mdbx_tassert(txn, prio > 0 && prio < 256); + mdbx_tassert(txn, age * (uint64_t)reciprocal < UINT32_MAX); + unsigned prio = age * reciprocal >> 24; + mdbx_tassert(txn, prio < 256); if (likely(npages == 1)) - return prio; + return prio = 256 - prio; /* make a large/overflow pages be likely to spill */ - uint32_t x = npages | npages >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - x = (255 - prio) * log2n_powerof2(x + 1) + /* golden ratio factor */ 157; - x = (x < 256) ? 255 - x : 0; - mdbx_tassert(txn, x < 256 && x < prio); - return prio = x; + uint32_t factor = npages | npages >> 1; + factor |= factor >> 2; + factor |= factor >> 4; + factor |= factor >> 8; + factor |= factor >> 16; + factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; + factor = (factor < 256) ? 255 - factor : 0; + mdbx_tassert(txn, factor < 256 && factor < (256 - prio)); + return prio = factor; } /* Spill pages from the dirty list back to disk. @@ -5216,21 +5225,20 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, * тем самым повышая их шансы на выталкивание. */ /* get min/max of LRU-labels */ - unsigned lru_min = dl->items[1].lru, lru_max = lru_min; - for (unsigned i = 2; i <= dl->length; ++i) { - lru_min = (lru_min < dl->items[i].lru) ? lru_min : dl->items[i].lru; - lru_max = (lru_max > dl->items[i].lru) ? lru_max : dl->items[i].lru; + uint32_t age_max = 0; + for (unsigned i = 1; i <= dl->length; ++i) { + const uint32_t age = mdbx_dpl_age(txn, i); + age_max = (age_max >= age) ? age_max : age; } - mdbx_verbose("lru-head %u, lru-min %u, lru-max %u", txn->tw.dirtylru, lru_min, - lru_max); + mdbx_verbose("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); /* half of 8-bit radix-sort */ unsigned radix_counters[256], spillable = 0, spilled = 0; memset(&radix_counters, 0, sizeof(radix_counters)); - unsigned const reciprocal = 255 * 256 / (lru_max - lru_min + 1); + const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); for (unsigned i = 1; i <= dl->length; ++i) { - unsigned prio = spill_prio(txn, i, lru_min, reciprocal); + unsigned prio = spill_prio(txn, i, reciprocal); if (prio < 256) { radix_counters[prio] += 1; spillable += 1; @@ -5261,7 +5269,7 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, unsigned r, w, prio; for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill; prev_prio = prio, ++r) { - prio = spill_prio(txn, r, lru_min, reciprocal); + prio = spill_prio(txn, r, reciprocal); MDBX_page *const dp = dl->items[r].ptr; if (prio < prio2adjacent) { const pgno_t pgno = dl->items[r].pgno; @@ -5270,9 +5278,9 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (prev_prio < prio2adjacent && prev_prio > prio2spill && dpl_endpgno(dl, r - 1) == pgno) { mdbx_debug("co-spill %u prev-adjacent page %" PRIaPGNO - " (lru-dist %d, prio %u)", + " (age %d, prio %u)", dpl_npages(dl, w), dl->items[r - 1].pgno, - txn->tw.dirtylru - dl->items[r - 1].lru, prev_prio); + mdbx_dpl_age(txn, r - 1), prev_prio); --w; rc = spill_page(txn, &ctx, dl->items[r - 1].ptr, dpl_npages(dl, r - 1)); @@ -5281,9 +5289,8 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, ++spilled; } - mdbx_debug("spill %u page %" PRIaPGNO " (lru-dist %d, prio %u)", - npages, dp->mp_pgno, txn->tw.dirtylru - dl->items[r].lru, - prio); + mdbx_debug("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages, + dp->mp_pgno, mdbx_dpl_age(txn, r), prio); rc = spill_page(txn, &ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) break; @@ -5293,9 +5300,8 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) { mdbx_debug("co-spill %u next-adjacent page %" PRIaPGNO - " (lru-dist %d, prio %u)", - npages, dp->mp_pgno, txn->tw.dirtylru - dl->items[r].lru, - prio); + " (age %d, prio %u)", + npages, dp->mp_pgno, mdbx_dpl_age(txn, r), prio); rc = spill_page(txn, &ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) break; @@ -5333,10 +5339,9 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, for (unsigned i = 1; i <= dl->length; ++i) { MDBX_page *dp = dl->items[i].ptr; mdbx_notice( - "dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, lru %u, prio %u", i, - dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, - txn->tw.dirtylru - dl->items[i].lru, - spill_prio(txn, i, lru_min, reciprocal)); + "dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i, + dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, mdbx_dpl_age(txn, i), + spill_prio(txn, i, reciprocal)); } } @@ -7586,7 +7591,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; - txn->tw.dirtylru = 0; + txn->tw.dirtylru = MDBX_DEBUG ? ~42u : 0; } /* Setup db info */ @@ -8352,7 +8357,6 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { parent->mt_child = nullptr; parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; - mdbx_tassert(parent, parent->tw.dirtylru <= txn->tw.dirtylru); parent->tw.dirtylru = txn->tw.dirtylru; mdbx_tassert(parent, mdbx_dirtylist_check(parent)); mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); @@ -9748,7 +9752,6 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, parent->tw.dirtyroom -= dst->sorted - dst->length; assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); dpl_setlen(dst, dst->sorted); - mdbx_tassert(parent, parent->tw.dirtylru <= txn->tw.dirtylru); parent->tw.dirtylru = txn->tw.dirtylru; mdbx_tassert(parent, mdbx_dirtylist_check(parent)); mdbx_dpl_free(txn);