mdbx: fix lru-counter overflow by using modulo 2^31 arithmetic.

2 of 2 fixes for https://github.com/erthink/libmdbx/issues/195
This commit is contained in:
Leonid Yuriev 2021-05-20 17:42:22 +03:00
parent bc6a690733
commit 6bedb02ac0

View File

@ -3535,6 +3535,13 @@ mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) {
return MDBX_SUCCESS; return MDBX_SUCCESS;
} }
/* Returns the "age" of the i-th dirty-list entry of `txn`: the distance from
 * the entry's stored LRU label to the transaction's current `dirtylru` value,
 * reduced to the low 31 bits.
 *
 * `i` is a 1-based index and must not exceed the dirty-list length (asserted).
 * A result of 0 means the entry carries the current `dirtylru` label. */
static __inline uint32_t mdbx_dpl_age(const MDBX_txn *txn, unsigned i) {
  const MDBX_dpl *const list = txn->tw.dirtylist;
  assert((int)i > 0 && i <= list->length);
  /* The subtraction may wrap around when the counter has overflowed;
   * masking keeps the distance within [0, 2^31). */
  const uint32_t distance = txn->tw.dirtylru - list->items[i].lru;
  return distance & UINT32_C(0x7fffFFFF);
}
/*----------------------------------------------------------------------------*/ /*----------------------------------------------------------------------------*/
uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT; uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT;
@ -4207,8 +4214,9 @@ MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) {
if (unlikely(dp->mp_pgno != dl->items[i].pgno)) if (unlikely(dp->mp_pgno != dl->items[i].pgno))
return false; return false;
mdbx_tassert(txn, txn->tw.dirtylru >= dl->items[i].lru); const uint32_t age = mdbx_dpl_age(txn, i);
if (unlikely(txn->tw.dirtylru < dl->items[i].lru)) mdbx_tassert(txn, age < UINT32_MAX / 3);
if (unlikely(age > UINT32_MAX / 3))
return false; return false;
mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp));
@ -4977,14 +4985,14 @@ static int spill_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp,
static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) {
unsigned keep = 0; unsigned keep = 0;
while (mc->mc_flags & C_INITIALIZED) { while (mc->mc_flags & C_INITIALIZED) {
for (unsigned i = 0; i < mc->mc_snum; i++) { for (unsigned i = 0; i < mc->mc_snum; ++i) {
const MDBX_page *mp = mc->mc_pg[i]; const MDBX_page *mp = mc->mc_pg[i];
if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) {
unsigned const n = mdbx_dpl_search(txn, mp->mp_pgno); unsigned const n = mdbx_dpl_search(txn, mp->mp_pgno);
if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno &&
txn->tw.dirtylist->items[n].lru != txn->tw.dirtylru) { mdbx_dpl_age(txn, n)) {
txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru;
keep++; ++keep;
} }
} }
} }
@ -5011,12 +5019,12 @@ static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) {
* ... * ...
* > 255 = must not be spilled. */ * > 255 = must not be spilled. */
static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, static unsigned spill_prio(const MDBX_txn *txn, const unsigned i,
const unsigned lru_min, const unsigned reciprocal) { const uint32_t reciprocal) {
MDBX_dpl *const dl = txn->tw.dirtylist; MDBX_dpl *const dl = txn->tw.dirtylist;
const unsigned lru = dl->items[i].lru; const uint32_t age = mdbx_dpl_age(txn, i);
const unsigned npages = dpl_npages(dl, i); const unsigned npages = dpl_npages(dl, i);
const pgno_t pgno = dl->items[i].pgno; const pgno_t pgno = dl->items[i].pgno;
if (lru == txn->tw.dirtylru) { if (age == 0) {
mdbx_debug("skip %s %u page %" PRIaPGNO, "keep", npages, pgno); mdbx_debug("skip %s %u page %" PRIaPGNO, "keep", npages, pgno);
return 256; return 256;
} }
@ -5045,21 +5053,22 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i,
while ((parent = parent->mt_parent) != nullptr); while ((parent = parent->mt_parent) != nullptr);
} }
unsigned prio = 1 + ((lru - lru_min) * reciprocal >> 8); mdbx_tassert(txn, age * (uint64_t)reciprocal < UINT32_MAX);
mdbx_tassert(txn, prio > 0 && prio < 256); unsigned prio = age * reciprocal >> 24;
mdbx_tassert(txn, prio < 256);
if (likely(npages == 1)) if (likely(npages == 1))
return prio; return prio = 256 - prio;
/* make a large/overflow pages be likely to spill */ /* make a large/overflow pages be likely to spill */
uint32_t x = npages | npages >> 1; uint32_t factor = npages | npages >> 1;
x |= x >> 2; factor |= factor >> 2;
x |= x >> 4; factor |= factor >> 4;
x |= x >> 8; factor |= factor >> 8;
x |= x >> 16; factor |= factor >> 16;
x = (255 - prio) * log2n_powerof2(x + 1) + /* golden ratio factor */ 157; factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157;
x = (x < 256) ? 255 - x : 0; factor = (factor < 256) ? 255 - factor : 0;
mdbx_tassert(txn, x < 256 && x < prio); mdbx_tassert(txn, factor < 256 && factor < (256 - prio));
return prio = x; return prio = factor;
} }
/* Spill pages from the dirty list back to disk. /* Spill pages from the dirty list back to disk.
@ -5216,21 +5225,20 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
* тем самым повышая их шансы на выталкивание. */ * тем самым повышая их шансы на выталкивание. */
/* get min/max of LRU-labels */ /* get min/max of LRU-labels */
unsigned lru_min = dl->items[1].lru, lru_max = lru_min; uint32_t age_max = 0;
for (unsigned i = 2; i <= dl->length; ++i) { for (unsigned i = 1; i <= dl->length; ++i) {
lru_min = (lru_min < dl->items[i].lru) ? lru_min : dl->items[i].lru; const uint32_t age = mdbx_dpl_age(txn, i);
lru_max = (lru_max > dl->items[i].lru) ? lru_max : dl->items[i].lru; age_max = (age_max >= age) ? age_max : age;
} }
mdbx_verbose("lru-head %u, lru-min %u, lru-max %u", txn->tw.dirtylru, lru_min, mdbx_verbose("lru-head %u, age-max %u", txn->tw.dirtylru, age_max);
lru_max);
/* half of 8-bit radix-sort */ /* half of 8-bit radix-sort */
unsigned radix_counters[256], spillable = 0, spilled = 0; unsigned radix_counters[256], spillable = 0, spilled = 0;
memset(&radix_counters, 0, sizeof(radix_counters)); memset(&radix_counters, 0, sizeof(radix_counters));
unsigned const reciprocal = 255 * 256 / (lru_max - lru_min + 1); const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1);
for (unsigned i = 1; i <= dl->length; ++i) { for (unsigned i = 1; i <= dl->length; ++i) {
unsigned prio = spill_prio(txn, i, lru_min, reciprocal); unsigned prio = spill_prio(txn, i, reciprocal);
if (prio < 256) { if (prio < 256) {
radix_counters[prio] += 1; radix_counters[prio] += 1;
spillable += 1; spillable += 1;
@ -5261,7 +5269,7 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
unsigned r, w, prio; unsigned r, w, prio;
for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill; for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill;
prev_prio = prio, ++r) { prev_prio = prio, ++r) {
prio = spill_prio(txn, r, lru_min, reciprocal); prio = spill_prio(txn, r, reciprocal);
MDBX_page *const dp = dl->items[r].ptr; MDBX_page *const dp = dl->items[r].ptr;
if (prio < prio2adjacent) { if (prio < prio2adjacent) {
const pgno_t pgno = dl->items[r].pgno; const pgno_t pgno = dl->items[r].pgno;
@ -5270,9 +5278,9 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
if (prev_prio < prio2adjacent && prev_prio > prio2spill && if (prev_prio < prio2adjacent && prev_prio > prio2spill &&
dpl_endpgno(dl, r - 1) == pgno) { dpl_endpgno(dl, r - 1) == pgno) {
mdbx_debug("co-spill %u prev-adjacent page %" PRIaPGNO mdbx_debug("co-spill %u prev-adjacent page %" PRIaPGNO
" (lru-dist %d, prio %u)", " (age %d, prio %u)",
dpl_npages(dl, w), dl->items[r - 1].pgno, dpl_npages(dl, w), dl->items[r - 1].pgno,
txn->tw.dirtylru - dl->items[r - 1].lru, prev_prio); mdbx_dpl_age(txn, r - 1), prev_prio);
--w; --w;
rc = spill_page(txn, &ctx, dl->items[r - 1].ptr, rc = spill_page(txn, &ctx, dl->items[r - 1].ptr,
dpl_npages(dl, r - 1)); dpl_npages(dl, r - 1));
@ -5281,9 +5289,8 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
++spilled; ++spilled;
} }
mdbx_debug("spill %u page %" PRIaPGNO " (lru-dist %d, prio %u)", mdbx_debug("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages,
npages, dp->mp_pgno, txn->tw.dirtylru - dl->items[r].lru, dp->mp_pgno, mdbx_dpl_age(txn, r), prio);
prio);
rc = spill_page(txn, &ctx, dp, npages); rc = spill_page(txn, &ctx, dp, npages);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
break; break;
@ -5293,9 +5300,8 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) { if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) {
mdbx_debug("co-spill %u next-adjacent page %" PRIaPGNO mdbx_debug("co-spill %u next-adjacent page %" PRIaPGNO
" (lru-dist %d, prio %u)", " (age %d, prio %u)",
npages, dp->mp_pgno, txn->tw.dirtylru - dl->items[r].lru, npages, dp->mp_pgno, mdbx_dpl_age(txn, r), prio);
prio);
rc = spill_page(txn, &ctx, dp, npages); rc = spill_page(txn, &ctx, dp, npages);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
break; break;
@ -5333,10 +5339,9 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
for (unsigned i = 1; i <= dl->length; ++i) { for (unsigned i = 1; i <= dl->length; ++i) {
MDBX_page *dp = dl->items[i].ptr; MDBX_page *dp = dl->items[i].ptr;
mdbx_notice( mdbx_notice(
"dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, lru %u, prio %u", i, "dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i,
dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, mdbx_dpl_age(txn, i),
txn->tw.dirtylru - dl->items[i].lru, spill_prio(txn, i, reciprocal));
spill_prio(txn, i, lru_min, reciprocal));
} }
} }
@ -7586,7 +7591,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
goto bailout; goto bailout;
txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit;
txn->tw.dirtylru = 0; txn->tw.dirtylru = MDBX_DEBUG ? ~42u : 0;
} }
/* Setup db info */ /* Setup db info */
@ -8352,7 +8357,6 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) {
parent->mt_child = nullptr; parent->mt_child = nullptr;
parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; parent->mt_flags &= ~MDBX_TXN_HAS_CHILD;
mdbx_tassert(parent, parent->tw.dirtylru <= txn->tw.dirtylru);
parent->tw.dirtylru = txn->tw.dirtylru; parent->tw.dirtylru = txn->tw.dirtylru;
mdbx_tassert(parent, mdbx_dirtylist_check(parent)); mdbx_tassert(parent, mdbx_dirtylist_check(parent));
mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0);
@ -9748,7 +9752,6 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn,
parent->tw.dirtyroom -= dst->sorted - dst->length; parent->tw.dirtyroom -= dst->sorted - dst->length;
assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit);
dpl_setlen(dst, dst->sorted); dpl_setlen(dst, dst->sorted);
mdbx_tassert(parent, parent->tw.dirtylru <= txn->tw.dirtylru);
parent->tw.dirtylru = txn->tw.dirtylru; parent->tw.dirtylru = txn->tw.dirtylru;
mdbx_tassert(parent, mdbx_dirtylist_check(parent)); mdbx_tassert(parent, mdbx_dirtylist_check(parent));
mdbx_dpl_free(txn); mdbx_dpl_free(txn);