From ccb45730f2653a7afe6f5e900c623ee30d2d563d Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 21 Dec 2019 00:57:47 +0300 Subject: [PATCH] mdbx: use page's mp_txnid for basic integrity checking. Change-Id: I50d6f1251e4fd84e535a708e78dd24d84ec53780 --- src/elements/core.c | 57 +++++++++++++++++++++++++++------------- src/elements/internals.h | 4 +-- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/src/elements/core.c b/src/elements/core.c index 1be4c075..96a31a09 100644 --- a/src/elements/core.c +++ b/src/elements/core.c @@ -2849,7 +2849,7 @@ static void mdbx_refund_loose(MDBX_txn *txn) { most -= 1; } const unsigned refunded = txn->mt_next_pgno - most; - mdbx_verbose("refund-sorted %u pages %" PRIaPGNO " -> %" PRIaPGNO, + mdbx_verbose("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, most, txn->mt_next_pgno); txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; @@ -2889,7 +2889,7 @@ static void mdbx_refund_loose(MDBX_txn *txn) { while (dl->length && dl[dl->length].pgno == txn->mt_next_pgno - 1 && dl[dl->length].ptr->mp_flags == (P_LOOSE | P_DIRTY)) { MDBX_page *dp = dl[dl->length].ptr; - mdbx_verbose("refund-unsorted page %" PRIaPGNO, dp->mp_pgno); + mdbx_verbose("refund-sorted page %" PRIaPGNO, dp->mp_pgno); mdbx_tassert(txn, dp->mp_pgno == dl[dl->length].pgno); dl->length -= 1; } @@ -2958,7 +2958,7 @@ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno, mdbx_assert(env, pgno >= NUM_METAS && npages); if (IS_DIRTY(mp) || (env->me_flags & MDBX_WRITEMAP)) { const size_t bytes = pgno2bytes(env, npages); - memset(mp, 0, bytes); + memset(mp, -1, bytes); mp->mp_pgno = pgno; if ((env->me_flags & MDBX_WRITEMAP) == 0) mdbx_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); @@ -2996,6 +2996,7 @@ static int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp) { const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; const pgno_t pgno = mp->mp_pgno; + mp->mp_txnid = INVALID_TXNID; if (txn->mt_parent) { mdbx_tassert(txn, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0); mdbx_tassert(txn, mp != pgno2page(txn->mt_env, pgno)); @@ -3597,6 +3598,8 @@ static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { /* Add a page to the txn's dirty list */ static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { + mp->mp_txnid = INVALID_TXNID; + mp->mp_flags |= P_DIRTY; const int rc = mdbx_dpl_append(txn->tw.dirtylist, mp->mp_pgno, mp); if (unlikely(rc != MDBX_SUCCESS)) { txn->mt_flags |= MDBX_TXN_ERROR; @@ -3997,6 +4000,8 @@ __hot static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, mdbx_ensure(env, np->mp_pgno >= NUM_METAS); VALGRIND_MAKE_MEM_UNDEFINED(page_data(np), page_space(txn->mt_env)); ASAN_UNPOISON_MEMORY_REGION(page_data(np), page_space(txn->mt_env)); + np->mp_flags = P_DIRTY; + np->mp_txnid = INVALID_TXNID; *mp = np; return MDBX_SUCCESS; } @@ -4488,10 +4493,8 @@ static int __must_check_result mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, * page remains spilled until child commits */ int rc = mdbx_page_dirty(txn, np); - if (likely(rc == MDBX_SUCCESS)) { - np->mp_flags |= P_DIRTY; + if (likely(rc == MDBX_SUCCESS)) *ret = np; - } return rc; } return MDBX_SUCCESS; @@ -4575,6 +4578,7 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { mdbx_page_copy(np, mp, txn->mt_env->me_psize); np->mp_pgno = pgno; + np->mp_txnid = INVALID_TXNID; np->mp_flags |= P_DIRTY; done: @@ -6619,7 +6623,7 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) { (flush_end > dp->mp_pgno + npages) ? flush_end : dp->mp_pgno + npages; *env->me_unsynced_pages += npages; dp->mp_flags &= ~P_DIRTY; - dp->mp_validator = 0 /* TODO */; + dp->mp_txnid = txn->mt_txnid; if ((env->me_flags & MDBX_WRITEMAP) == 0) { const size_t size = pgno2bytes(env, npages); @@ -9604,19 +9608,19 @@ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, * back in from the map (but don't unspill it here, * leave that unless page_touch happens again). */ if (txn->tw.spill_pages && mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) - goto mapped; + goto spilled; p = mdbx_dpl_find(txn->tw.dirtylist, pgno); if (p) - goto done; + goto dirty; level++; } while ((txn = txn->mt_parent) != NULL); } level = 0; -mapped: +spilled: p = pgno2page(env, pgno); -done: +dirty: if (unlikely(p->mp_pgno != pgno)) { mdbx_error("mismatch pgno %" PRIaPGNO " (actual) != %" PRIaPGNO " (expected)", @@ -9624,16 +9628,25 @@ done: goto corrupted; } - if (likely(!IS_OVERFLOW(p))) { - if (unlikely(p->mp_upper < p->mp_lower || - ((p->mp_lower | p->mp_upper) & 1) || - PAGEHDRSZ + p->mp_upper > env->me_psize)) { - mdbx_error("invalid page lower(%u)/upper(%u), pg-limit %u", p->mp_lower, - p->mp_upper, page_space(env)); + if (unlikely((p->mp_flags & (P_LOOSE | P_SUBP | P_META | P_DIRTY)) != 0 || + p->mp_txnid > mc->mc_txn->mt_txnid)) { + if (unlikely((mc->mc_txn->mt_flags & MDBX_RDONLY) != 0 || + (p->mp_flags & (P_LOOSE | P_SUBP | P_META | P_DIRTY)) != + P_DIRTY)) { + mdbx_error("invalid page's flags (0x%x) or txnid %" PRIaTXN + " > (actual) %" PRIaTXN " (expected)", + p->mp_flags, p->mp_txnid, mc->mc_txn->mt_txnid); goto corrupted; } } - /* TODO: more checks here, including p->mp_validator */ + + if (unlikely(!IS_OVERFLOW(p) && (p->mp_upper < p->mp_lower || + ((p->mp_lower | p->mp_upper) & 1) != 0 || + PAGEHDRSZ + p->mp_upper > env->me_psize))) { + mdbx_error("invalid page lower(%u)/upper(%u), pg-limit %u", p->mp_lower, + p->mp_upper, page_space(env)); + goto corrupted; + } if (mdbx_audit_enabled()) { int err = mdbx_page_check(env, p, true); @@ -10908,6 +10921,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, insert_key = insert_data = (rc != MDBX_SUCCESS); uint16_t fp_flags = P_LEAF | P_DIRTY; MDBX_page *fp = env->me_pbuf; + fp->mp_txnid = INVALID_TXNID; if (insert_key) { /* The key does not exist */ mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); @@ -10995,6 +11009,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } /* Is it dirty? */ if (IS_DIRTY(omp)) { + mdbx_cassert(mc, omp->mp_txnid > SAFE64_INVALID_THRESHOLD); /* yes, overwrite it. Note in this case we don't * bother to try shrinking the page if the new data * is smaller than the overflow threshold. */ @@ -11126,6 +11141,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_CURRENT | MDBX_NODUPDATA: case MDBX_CURRENT: fp->mp_flags |= P_DIRTY; + fp->mp_txnid = INVALID_TXNID; fp->mp_pgno = mp->mp_pgno; mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; flags |= F_DUPDATA; @@ -11167,6 +11183,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } if (mp != fp) { mp->mp_flags = fp_flags | P_DIRTY; + mp->mp_txnid = INVALID_TXNID; mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; mp->mp_lower = fp->mp_lower; mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX); @@ -11514,6 +11531,7 @@ static int mdbx_page_new(MDBX_cursor *mc, unsigned flags, unsigned num, mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize); np->mp_flags = (uint16_t)(flags | P_DIRTY); + np->mp_txnid = INVALID_TXNID; np->mp_lower = 0; np->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); @@ -13540,6 +13558,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey, } copy->mp_pgno = mp->mp_pgno; copy->mp_flags = mp->mp_flags; + copy->mp_txnid = INVALID_TXNID; copy->mp_lower = 0; copy->mp_upper = (indx_t)page_space(env); @@ -14082,6 +14101,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); memcpy(mo, omp, my->mc_env->me_psize); mo->mp_pgno = my->mc_next_pgno; + mo->mp_txnid = MIN_TXNID; my->mc_next_pgno += omp->mp_pages; my->mc_wlen[toggle] += my->mc_env->me_psize; if (omp->mp_pages > 1) { @@ -14146,6 +14166,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { } mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); mdbx_page_copy(mo, mp, my->mc_env->me_psize); + mo->mp_txnid = MIN_TXNID; mo->mp_pgno = my->mc_next_pgno++; my->mc_wlen[toggle] += my->mc_env->me_psize; if (mc.mc_top) { diff --git a/src/elements/internals.h b/src/elements/internals.h index ad6ef53c..daeab529 100644 --- a/src/elements/internals.h +++ b/src/elements/internals.h @@ -211,6 +211,7 @@ typedef uint32_t pgno_t; typedef uint64_t txnid_t; #define PRIaTXN PRIi64 #define MIN_TXNID UINT64_C(1) +#define INVALID_TXNID UINT64_MAX /* LY: for testing non-atomic 64-bit txnid on 32-bit arches. * #define MDBX_TXNID_STEP (UINT32_MAX / 3) */ #ifndef MDBX_TXNID_STEP @@ -347,8 +348,7 @@ typedef struct MDBX_meta { typedef struct MDBX_page { union { struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - uint64_t mp_validator; /* checksum of page content or a txnid during - * which the page has been updated */ + uint64_t mp_txnid; /* txnid during which the page has been COW-ed */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */