mdbx: use page's mp_txnid for basic integrity checking.

Change-Id: I50d6f1251e4fd84e535a708e78dd24d84ec53780
This commit is contained in:
Leonid Yuriev 2019-12-21 00:57:47 +03:00
parent d11bfef36b
commit ccb45730f2
2 changed files with 41 additions and 20 deletions

View File

@ -2849,7 +2849,7 @@ static void mdbx_refund_loose(MDBX_txn *txn) {
most -= 1; most -= 1;
} }
const unsigned refunded = txn->mt_next_pgno - most; const unsigned refunded = txn->mt_next_pgno - most;
mdbx_verbose("refund-sorted %u pages %" PRIaPGNO " -> %" PRIaPGNO, mdbx_verbose("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO,
refunded, most, txn->mt_next_pgno); refunded, most, txn->mt_next_pgno);
txn->tw.loose_count -= refunded; txn->tw.loose_count -= refunded;
txn->tw.dirtyroom += refunded; txn->tw.dirtyroom += refunded;
@ -2889,7 +2889,7 @@ static void mdbx_refund_loose(MDBX_txn *txn) {
while (dl->length && dl[dl->length].pgno == txn->mt_next_pgno - 1 && while (dl->length && dl[dl->length].pgno == txn->mt_next_pgno - 1 &&
dl[dl->length].ptr->mp_flags == (P_LOOSE | P_DIRTY)) { dl[dl->length].ptr->mp_flags == (P_LOOSE | P_DIRTY)) {
MDBX_page *dp = dl[dl->length].ptr; MDBX_page *dp = dl[dl->length].ptr;
mdbx_verbose("refund-unsorted page %" PRIaPGNO, dp->mp_pgno); mdbx_verbose("refund-sorted page %" PRIaPGNO, dp->mp_pgno);
mdbx_tassert(txn, dp->mp_pgno == dl[dl->length].pgno); mdbx_tassert(txn, dp->mp_pgno == dl[dl->length].pgno);
dl->length -= 1; dl->length -= 1;
} }
@ -2958,7 +2958,7 @@ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno,
mdbx_assert(env, pgno >= NUM_METAS && npages); mdbx_assert(env, pgno >= NUM_METAS && npages);
if (IS_DIRTY(mp) || (env->me_flags & MDBX_WRITEMAP)) { if (IS_DIRTY(mp) || (env->me_flags & MDBX_WRITEMAP)) {
const size_t bytes = pgno2bytes(env, npages); const size_t bytes = pgno2bytes(env, npages);
memset(mp, 0, bytes); memset(mp, -1, bytes);
mp->mp_pgno = pgno; mp->mp_pgno = pgno;
if ((env->me_flags & MDBX_WRITEMAP) == 0) if ((env->me_flags & MDBX_WRITEMAP) == 0)
mdbx_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); mdbx_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno));
@ -2996,6 +2996,7 @@ static int mdbx_page_loose(MDBX_txn *txn, MDBX_page *mp) {
const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1;
const pgno_t pgno = mp->mp_pgno; const pgno_t pgno = mp->mp_pgno;
mp->mp_txnid = INVALID_TXNID;
if (txn->mt_parent) { if (txn->mt_parent) {
mdbx_tassert(txn, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0); mdbx_tassert(txn, (txn->mt_env->me_flags & MDBX_WRITEMAP) == 0);
mdbx_tassert(txn, mp != pgno2page(txn->mt_env, pgno)); mdbx_tassert(txn, mp != pgno2page(txn->mt_env, pgno));
@ -3597,6 +3598,8 @@ static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) {
/* Add a page to the txn's dirty list */ /* Add a page to the txn's dirty list */
static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) {
mp->mp_txnid = INVALID_TXNID;
mp->mp_flags |= P_DIRTY;
const int rc = mdbx_dpl_append(txn->tw.dirtylist, mp->mp_pgno, mp); const int rc = mdbx_dpl_append(txn->tw.dirtylist, mp->mp_pgno, mp);
if (unlikely(rc != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_SUCCESS)) {
txn->mt_flags |= MDBX_TXN_ERROR; txn->mt_flags |= MDBX_TXN_ERROR;
@ -3997,6 +4000,8 @@ __hot static int mdbx_page_alloc(MDBX_cursor *mc, const unsigned num,
mdbx_ensure(env, np->mp_pgno >= NUM_METAS); mdbx_ensure(env, np->mp_pgno >= NUM_METAS);
VALGRIND_MAKE_MEM_UNDEFINED(page_data(np), page_space(txn->mt_env)); VALGRIND_MAKE_MEM_UNDEFINED(page_data(np), page_space(txn->mt_env));
ASAN_UNPOISON_MEMORY_REGION(page_data(np), page_space(txn->mt_env)); ASAN_UNPOISON_MEMORY_REGION(page_data(np), page_space(txn->mt_env));
np->mp_flags = P_DIRTY;
np->mp_txnid = INVALID_TXNID;
*mp = np; *mp = np;
return MDBX_SUCCESS; return MDBX_SUCCESS;
} }
@ -4488,10 +4493,8 @@ static int __must_check_result mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp,
* page remains spilled until child commits */ * page remains spilled until child commits */
int rc = mdbx_page_dirty(txn, np); int rc = mdbx_page_dirty(txn, np);
if (likely(rc == MDBX_SUCCESS)) { if (likely(rc == MDBX_SUCCESS))
np->mp_flags |= P_DIRTY;
*ret = np; *ret = np;
}
return rc; return rc;
} }
return MDBX_SUCCESS; return MDBX_SUCCESS;
@ -4575,6 +4578,7 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) {
mdbx_page_copy(np, mp, txn->mt_env->me_psize); mdbx_page_copy(np, mp, txn->mt_env->me_psize);
np->mp_pgno = pgno; np->mp_pgno = pgno;
np->mp_txnid = INVALID_TXNID;
np->mp_flags |= P_DIRTY; np->mp_flags |= P_DIRTY;
done: done:
@ -6619,7 +6623,7 @@ __hot static int mdbx_page_flush(MDBX_txn *txn, const unsigned keep) {
(flush_end > dp->mp_pgno + npages) ? flush_end : dp->mp_pgno + npages; (flush_end > dp->mp_pgno + npages) ? flush_end : dp->mp_pgno + npages;
*env->me_unsynced_pages += npages; *env->me_unsynced_pages += npages;
dp->mp_flags &= ~P_DIRTY; dp->mp_flags &= ~P_DIRTY;
dp->mp_validator = 0 /* TODO */; dp->mp_txnid = txn->mt_txnid;
if ((env->me_flags & MDBX_WRITEMAP) == 0) { if ((env->me_flags & MDBX_WRITEMAP) == 0) {
const size_t size = pgno2bytes(env, npages); const size_t size = pgno2bytes(env, npages);
@ -9604,19 +9608,19 @@ __hot static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret,
* back in from the map (but don't unspill it here, * back in from the map (but don't unspill it here,
* leave that unless page_touch happens again). */ * leave that unless page_touch happens again). */
if (txn->tw.spill_pages && mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) if (txn->tw.spill_pages && mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1))
goto mapped; goto spilled;
p = mdbx_dpl_find(txn->tw.dirtylist, pgno); p = mdbx_dpl_find(txn->tw.dirtylist, pgno);
if (p) if (p)
goto done; goto dirty;
level++; level++;
} while ((txn = txn->mt_parent) != NULL); } while ((txn = txn->mt_parent) != NULL);
} }
level = 0; level = 0;
mapped: spilled:
p = pgno2page(env, pgno); p = pgno2page(env, pgno);
done: dirty:
if (unlikely(p->mp_pgno != pgno)) { if (unlikely(p->mp_pgno != pgno)) {
mdbx_error("mismatch pgno %" PRIaPGNO " (actual) != %" PRIaPGNO mdbx_error("mismatch pgno %" PRIaPGNO " (actual) != %" PRIaPGNO
" (expected)", " (expected)",
@ -9624,16 +9628,25 @@ done:
goto corrupted; goto corrupted;
} }
if (likely(!IS_OVERFLOW(p))) { if (unlikely((p->mp_flags & (P_LOOSE | P_SUBP | P_META | P_DIRTY)) != 0 ||
if (unlikely(p->mp_upper < p->mp_lower || p->mp_txnid > mc->mc_txn->mt_txnid)) {
((p->mp_lower | p->mp_upper) & 1) || if (unlikely((mc->mc_txn->mt_flags & MDBX_RDONLY) != 0 ||
PAGEHDRSZ + p->mp_upper > env->me_psize)) { (p->mp_flags & (P_LOOSE | P_SUBP | P_META | P_DIRTY)) !=
mdbx_error("invalid page lower(%u)/upper(%u), pg-limit %u", p->mp_lower, P_DIRTY)) {
p->mp_upper, page_space(env)); mdbx_error("invalid page's flags (0x%x) or txnid %" PRIaTXN
" > (actual) %" PRIaTXN " (expected)",
p->mp_flags, p->mp_txnid, mc->mc_txn->mt_txnid);
goto corrupted; goto corrupted;
} }
} }
/* TODO: more checks here, including p->mp_validator */
if (unlikely(!IS_OVERFLOW(p) && (p->mp_upper < p->mp_lower ||
((p->mp_lower | p->mp_upper) & 1) != 0 ||
PAGEHDRSZ + p->mp_upper > env->me_psize))) {
mdbx_error("invalid page lower(%u)/upper(%u), pg-limit %u", p->mp_lower,
p->mp_upper, page_space(env));
goto corrupted;
}
if (mdbx_audit_enabled()) { if (mdbx_audit_enabled()) {
int err = mdbx_page_check(env, p, true); int err = mdbx_page_check(env, p, true);
@ -10908,6 +10921,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
insert_key = insert_data = (rc != MDBX_SUCCESS); insert_key = insert_data = (rc != MDBX_SUCCESS);
uint16_t fp_flags = P_LEAF | P_DIRTY; uint16_t fp_flags = P_LEAF | P_DIRTY;
MDBX_page *fp = env->me_pbuf; MDBX_page *fp = env->me_pbuf;
fp->mp_txnid = INVALID_TXNID;
if (insert_key) { if (insert_key) {
/* The key does not exist */ /* The key does not exist */
mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]);
@ -10995,6 +11009,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
} }
/* Is it dirty? */ /* Is it dirty? */
if (IS_DIRTY(omp)) { if (IS_DIRTY(omp)) {
mdbx_cassert(mc, omp->mp_txnid > SAFE64_INVALID_THRESHOLD);
/* yes, overwrite it. Note in this case we don't /* yes, overwrite it. Note in this case we don't
* bother to try shrinking the page if the new data * bother to try shrinking the page if the new data
* is smaller than the overflow threshold. */ * is smaller than the overflow threshold. */
@ -11126,6 +11141,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
case MDBX_CURRENT | MDBX_NODUPDATA: case MDBX_CURRENT | MDBX_NODUPDATA:
case MDBX_CURRENT: case MDBX_CURRENT:
fp->mp_flags |= P_DIRTY; fp->mp_flags |= P_DIRTY;
fp->mp_txnid = INVALID_TXNID;
fp->mp_pgno = mp->mp_pgno; fp->mp_pgno = mp->mp_pgno;
mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
flags |= F_DUPDATA; flags |= F_DUPDATA;
@ -11167,6 +11183,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
} }
if (mp != fp) { if (mp != fp) {
mp->mp_flags = fp_flags | P_DIRTY; mp->mp_flags = fp_flags | P_DIRTY;
mp->mp_txnid = INVALID_TXNID;
mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; mp->mp_leaf2_ksize = fp->mp_leaf2_ksize;
mp->mp_lower = fp->mp_lower; mp->mp_lower = fp->mp_lower;
mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX); mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX);
@ -11514,6 +11531,7 @@ static int mdbx_page_new(MDBX_cursor *mc, unsigned flags, unsigned num,
mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno, mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno,
mc->mc_txn->mt_env->me_psize); mc->mc_txn->mt_env->me_psize);
np->mp_flags = (uint16_t)(flags | P_DIRTY); np->mp_flags = (uint16_t)(flags | P_DIRTY);
np->mp_txnid = INVALID_TXNID;
np->mp_lower = 0; np->mp_lower = 0;
np->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); np->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ);
@ -13540,6 +13558,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *newkey,
} }
copy->mp_pgno = mp->mp_pgno; copy->mp_pgno = mp->mp_pgno;
copy->mp_flags = mp->mp_flags; copy->mp_flags = mp->mp_flags;
copy->mp_txnid = INVALID_TXNID;
copy->mp_lower = 0; copy->mp_lower = 0;
copy->mp_upper = (indx_t)page_space(env); copy->mp_upper = (indx_t)page_space(env);
@ -14082,6 +14101,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) {
mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
memcpy(mo, omp, my->mc_env->me_psize); memcpy(mo, omp, my->mc_env->me_psize);
mo->mp_pgno = my->mc_next_pgno; mo->mp_pgno = my->mc_next_pgno;
mo->mp_txnid = MIN_TXNID;
my->mc_next_pgno += omp->mp_pages; my->mc_next_pgno += omp->mp_pages;
my->mc_wlen[toggle] += my->mc_env->me_psize; my->mc_wlen[toggle] += my->mc_env->me_psize;
if (omp->mp_pages > 1) { if (omp->mp_pages > 1) {
@ -14146,6 +14166,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) {
} }
mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
mdbx_page_copy(mo, mp, my->mc_env->me_psize); mdbx_page_copy(mo, mp, my->mc_env->me_psize);
mo->mp_txnid = MIN_TXNID;
mo->mp_pgno = my->mc_next_pgno++; mo->mp_pgno = my->mc_next_pgno++;
my->mc_wlen[toggle] += my->mc_env->me_psize; my->mc_wlen[toggle] += my->mc_env->me_psize;
if (mc.mc_top) { if (mc.mc_top) {

View File

@ -211,6 +211,7 @@ typedef uint32_t pgno_t;
typedef uint64_t txnid_t; typedef uint64_t txnid_t;
#define PRIaTXN PRIi64 #define PRIaTXN PRIi64
#define MIN_TXNID UINT64_C(1) #define MIN_TXNID UINT64_C(1)
#define INVALID_TXNID UINT64_MAX
/* LY: for testing non-atomic 64-bit txnid on 32-bit arches. /* LY: for testing non-atomic 64-bit txnid on 32-bit arches.
* #define MDBX_TXNID_STEP (UINT32_MAX / 3) */ * #define MDBX_TXNID_STEP (UINT32_MAX / 3) */
#ifndef MDBX_TXNID_STEP #ifndef MDBX_TXNID_STEP
@ -347,8 +348,7 @@ typedef struct MDBX_meta {
typedef struct MDBX_page { typedef struct MDBX_page {
union { union {
struct MDBX_page *mp_next; /* for in-memory list of freed pages */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */
uint64_t mp_validator; /* checksum of page content or a txnid during uint64_t mp_txnid; /* txnid during which the page has been COW-ed */
* which the page has been updated */
}; };
uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */
#define P_BRANCH 0x01 /* branch page */ #define P_BRANCH 0x01 /* branch page */