From 9eeb00f448d21cdaa8a702c26f9dae8ea6085389 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 21:43:29 +0300 Subject: [PATCH] mdbx: trinity of meta-pages. --- TODO.md | 1 + mdbx.h | 17 +- src/bits.h | 53 +--- src/mdbx.c | 565 ++++++++++++++++++++++++++---------------- src/osal.c | 4 +- src/osal.h | 2 +- src/tools/mdbx_chk.c | 222 ++++++++++++----- src/tools/mdbx_stat.c | 19 +- 8 files changed, 537 insertions(+), 346 deletions(-) diff --git a/TODO.md b/TODO.md index ab59b4c3..92dd691c 100644 --- a/TODO.md +++ b/TODO.md @@ -15,3 +15,4 @@ - [ ] актуализация README.md - [ ] возможность хранения ключей внутри data (libfptu) - [ ] асинхронная фиксация (https://github.com/ReOpen/libmdbx/issues/5) +- [ ] (пере)выделять память под IDL-списки с учетом реального кол-ва страниц, т.е. max(MDB_IDL_UM_MAX/MDB_IDL_UM_MAX, npages) diff --git a/mdbx.h b/mdbx.h index 625cca46..45668cf6 100644 --- a/mdbx.h +++ b/mdbx.h @@ -427,13 +427,14 @@ typedef struct MDBX_stat { /* Information about the environment */ typedef struct MDBX_envinfo { - void *me_mapaddr; /* Address of map, if fixed */ - uint64_t me_mapsize; /* Size of the data memory map */ - uint64_t me_last_pgno; /* ID of the last used page */ - uint64_t me_last_txnid; /* ID of the last committed transaction */ - uint32_t me_maxreaders; /* max reader slots in the environment */ - uint32_t me_numreaders; /* max reader slots used in the environment */ - uint64_t me_tail_txnid; /* ID of the last reader transaction */ + void *me_mapaddr; /* Address of map, if fixed */ + uint64_t me_mapsize; /* Size of the data memory map */ + uint64_t me_recent_pgno; /* ID of the last used page */ + uint64_t me_recent_txnid; /* ID of the last committed transaction */ + uint32_t me_maxreaders; /* max reader slots in the environment */ + uint32_t me_numreaders; /* max reader slots used in the environment */ + uint64_t me_latter_reader_txnid; /* ID of the last reader transaction */ + uint64_t me_meta0_txnid, me_meta0_sign; uint64_t 
me_meta1_txnid, me_meta1_sign; uint64_t me_meta2_txnid, me_meta2_sign; } MDBX_envinfo; @@ -868,7 +869,7 @@ LIBMDBX_API void *mdbx_env_get_userctx(MDBX_env *env); * * [in] env An environment handle returned by mdbx_env_create(). * [in] msg The assertion message, not including newline. */ -typedef void MDBX_assert_func(MDBX_env *env, const char *msg, +typedef void MDBX_assert_func(const MDBX_env *env, const char *msg, const char *function, unsigned line); /* Set or reset the assert() callback of the environment. diff --git a/src/bits.h b/src/bits.h index 3a566cab..cbd2cc09 100644 --- a/src/bits.h +++ b/src/bits.h @@ -95,6 +95,7 @@ * pressure from other processes is high. So until OSs have * actual paging support for Huge pages, they're not viable. */ #define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) +#define MIN_PAGESIZE 1024 /* The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the @@ -127,14 +128,14 @@ #define CORE_DBS 2 /* Number of meta pages - also hardcoded elsewhere */ -#define NUM_METAS 2 +#define NUM_METAS 3 /* A page number in the database. * * MDBX uses 32 bit for page numbers. This limits database * size up to 2^44 bytes, in case of 4K pages. */ -typedef uint32_t pgno_t; -#define PRIaPGNO PRIu32 +typedef uint64_t pgno_t; +#define PRIaPGNO PRIu64 /* TODO */ /* A transaction ID. 
*/ typedef uint64_t txnid_t; @@ -253,18 +254,12 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u volatile uint64_t mm_datasync_sign; -#define MDBX_TEMPORARY_CRUTCH FIXME -#ifndef MDBX_TEMPORARY_CRUTCH #define SIGN_IS_WEAK(sign) ((sign) == MDBX_DATASIGN_WEAK) #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) -#else -#define SIGN_IS_WEAK(sign) (false && (sign) == MDBX_DATASIGN_WEAK) -#define SIGN_IS_STEADY(sign) (true || (sign) > MDBX_DATASIGN_WEAK) -#endif /* FIXME: MDBX_TEMPORARY_CRUTCH */ #define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign) #define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - volatile mdbx_canary mm_canary; + mdbx_canary mm_canary; } MDBX_meta; /* Common header for all page types. The page type depends on mp_flags. @@ -307,23 +302,17 @@ typedef struct MDBX_page { }; uint32_t mp_pages; /* number of overflow pages */ }; - indx_t mp_ptrs[1]; /* dynamic size */ + + /* dynamic size */ + union { + indx_t mp_ptrs[1]; + MDBX_meta mp_meta; + uint8_t mp_data[1]; + }; } MDBX_page; /* Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) - -/* Buffer for a stack-allocated meta page. - * The members define size and alignment, and silence type - * aliasing warnings. They are not used directly; that could - * mean incorrectly using several union members in parallel. */ -typedef union MDBX_metabuf { - MDBX_page mb_page; - struct { - char mm_pad[PAGEHDRSZ]; - MDBX_meta mm_meta; - } mb_metabuf; -} MDBX_metabuf; +#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data)) /* The header for the reader table (a memory-mapped lock file). 
*/ typedef struct MDBX_lockinfo { @@ -795,22 +784,6 @@ static __inline void mdbx_jitter4testing(bool tiny) { /* Internal prototypes and inlines */ int mdbx_reader_check0(MDBX_env *env, int rlocked, int *dead); - -#define METAPAGE_1(env) (&((MDBX_metabuf *)(env)->me_map)->mb_metabuf.mm_meta) - -#define METAPAGE_2(env) \ - (&((MDBX_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) - -static __inline MDBX_meta *mdbx_meta_head(MDBX_env *env) { - mdbx_jitter4testing(true); - MDBX_meta *a = METAPAGE_1(env); - mdbx_jitter4testing(true); - MDBX_meta *b = METAPAGE_2(env); - mdbx_jitter4testing(true); - - return (a->mm_txnid > b->mm_txnid) ? a : b; -} - void mdbx_rthc_dtor(void *rthc); void mdbx_rthc_lock(void); void mdbx_rthc_unlock(void); diff --git a/src/mdbx.c b/src/mdbx.c index 4c93522e..083e300f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -553,7 +553,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, static int mdbx_read_header(MDBX_env *env, MDBX_meta *meta); static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *pending); + MDBX_meta *const pending); static void mdbx_env_close0(MDBX_env *env); static MDBX_node *mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, int *exactp); @@ -1272,7 +1272,7 @@ bailout: return rc; } -static __inline uint64_t mdbx_meta_sign(MDBX_meta *meta) { +static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { uint64_t sign = MDBX_DATASIGN_NONE; #if 0 /* TODO */ sign = hippeus_hash64(&meta->mm_mapsize, @@ -1285,22 +1285,98 @@ static __inline uint64_t mdbx_meta_sign(MDBX_meta *meta) { return (sign > MDBX_DATASIGN_WEAK) ? sign : ~sign; } -static __inline MDBX_meta *mdbx_env_meta_flipflop(const MDBX_env *env, - MDBX_meta *meta) { - return (meta == METAPAGE_1(env)) ? 
METAPAGE_2(env) : METAPAGE_1(env); +static __inline bool mdbx_meta_ot(const MDBX_meta *a, const MDBX_meta *b, + const bool roolback2steady) { + mdbx_jitter4testing(true); + if (a->mm_txnid == b->mm_txnid) + return META_IS_STEADY(b); + + mdbx_jitter4testing(true); + if (roolback2steady && META_IS_STEADY(a) != META_IS_STEADY(b)) + return META_IS_STEADY(b); + + mdbx_jitter4testing(true); + return a->mm_txnid < b->mm_txnid; } -static __inline int mdbx_meta_lt(const MDBX_meta *a, const MDBX_meta *b) { - if (META_IS_STEADY(a) == META_IS_STEADY(b)) - return a->mm_txnid < b->mm_txnid; - return META_IS_STEADY(b); +static __inline bool mdbx_meta_eq(const MDBX_meta *a, const MDBX_meta *b) { + mdbx_jitter4testing(true); + if (a->mm_txnid != b->mm_txnid) + return false; + + mdbx_jitter4testing(true); + if (META_IS_STEADY(a) != META_IS_STEADY(b)) + return false; + + mdbx_jitter4testing(true); + return true; +} + +#define METAPAGE(env, n) \ + (&((MDBX_page *)((env)->me_map + env->me_psize * (n)))->mp_meta) + +static int mdbx_meta_eq_mask(const MDBX_env *env) { + MDBX_meta *m0 = METAPAGE(env, 0); + MDBX_meta *m1 = METAPAGE(env, 1); + MDBX_meta *m2 = METAPAGE(env, 2); + + int rc = mdbx_meta_eq(m0, m1) ? 1 : 0; + if (mdbx_meta_eq(m1, m2)) + rc += 2; + if (mdbx_meta_eq(m2, m0)) + rc += 4; + return rc; +} + +static __inline MDBX_meta *mdbx_meta_recent(const MDBX_env *env, MDBX_meta *a, + MDBX_meta *b, + const bool roolback2steady) { + const bool a_older_that_b = mdbx_meta_ot(a, b, roolback2steady); + mdbx_assert(env, !mdbx_meta_eq(a, b)); + return a_older_that_b ? b : a; +} + +static __inline MDBX_meta *mdbx_meta_ancient(const MDBX_env *env, MDBX_meta *a, + MDBX_meta *b, + const bool roolback2steady) { + const bool a_older_that_b = mdbx_meta_ot(a, b, roolback2steady); + mdbx_assert(env, !mdbx_meta_eq(a, b)); + return a_older_that_b ? 
a : b; +} + +static __inline MDBX_meta *mdbx_meta_head(const MDBX_env *env, + const bool roolback2steady) { + MDBX_meta *m0 = METAPAGE(env, 0); + MDBX_meta *m1 = METAPAGE(env, 1); + MDBX_meta *m2 = METAPAGE(env, 2); + + MDBX_meta *head = mdbx_meta_recent(env, m0, m1, roolback2steady); + head = mdbx_meta_recent(env, head, m2, roolback2steady); + return head; +} + +static __hot MDBX_meta *mdbx_meta_steady_head(const MDBX_env *env) { + return mdbx_meta_head(env, true); +} + +static __hot MDBX_meta *mdbx_meta_fluid_head(const MDBX_env *env) { + return mdbx_meta_head(env, false); +} + +static const char *mdbx_durable_str(const MDBX_meta *const meta) { + if (META_IS_WEAK(meta)) + return "Weak"; + if (META_IS_STEADY(meta)) + return (meta->mm_datasync_sign == mdbx_meta_sign(meta)) ? "Steady" + : "Tainted"; + return "Legacy"; } /* Find oldest txnid still referenced. */ static txnid_t mdbx_find_oldest(MDBX_env *env, int *laggard) { - const MDBX_meta *const a = METAPAGE_1(env); - const MDBX_meta *const b = METAPAGE_2(env); - txnid_t oldest = mdbx_meta_lt(a, b) ? b->mm_txnid : a->mm_txnid; + const MDBX_meta *const head = mdbx_meta_head( + env, F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) ? 
false : true); + txnid_t oldest = head->mm_txnid; int i, reader; const MDBX_reader *const r = env->me_lck->mti_readers; @@ -1589,12 +1665,11 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, if ((flags & MDBX_ALLOC_GC) && ((flags & MDBX_ALLOC_KICK) || rc == MDBX_MAP_FULL)) { - MDBX_meta *head = mdbx_meta_head(env); - MDBX_meta *tail = mdbx_env_meta_flipflop(env, head); + MDBX_meta *fluid = mdbx_meta_fluid_head(env); + MDBX_meta *steady = mdbx_meta_steady_head(env); - if (oldest == tail->mm_txnid && META_IS_WEAK(head) && - !META_IS_WEAK(tail)) { - MDBX_meta meta = *head; + if (oldest == steady->mm_txnid && META_IS_WEAK(fluid) && + !META_IS_WEAK(steady)) { /* LY: Here an oom was happened: * - all pages had allocated; * - reclaiming was stopped at the last steady-sync; @@ -1605,16 +1680,17 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, * don't make a steady-sync, but only a legacy-mode checkpoint, * just for resume reclaiming only, not for data consistency. */ - mdbx_debug("kick-gc: head %" PRIaTXN "/%c, tail %" PRIaTXN - "/%c, oldest %" PRIaTXN "", - head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', - tail->mm_txnid, META_IS_WEAK(tail) ? 
'W' : 'N', oldest); + mdbx_debug("kick-gc: head %" PRIaTXN "-%s, tail %" PRIaTXN + "-%s, oldest %" PRIaTXN "", + fluid->mm_txnid, mdbx_durable_str(fluid), steady->mm_txnid, + mdbx_durable_str(steady), oldest); - int me_flags = env->me_flags & MDBX_WRITEMAP; - if ((env->me_flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC) + unsigned me_flags = env->me_flags & MDBX_WRITEMAP; + if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC)) me_flags |= MDBX_UTTERLY_NOSYNC; mdbx_assert(env, env->me_sync_pending > 0); + MDBX_meta meta = *fluid; if (mdbx_env_sync_locked(env, me_flags, &meta) == MDBX_SUCCESS) { txnid_t snap = mdbx_find_oldest(env, NULL); if (snap > oldest) { @@ -1878,7 +1954,7 @@ int mdbx_env_sync(MDBX_env *env, int force) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - MDBX_meta *head = mdbx_meta_head(env); + MDBX_meta *head = mdbx_meta_fluid_head(env); if (!META_IS_STEADY(head) || env->me_sync_pending || env->me_mapsize != head->mm_mapsize) { @@ -1907,11 +1983,16 @@ int mdbx_env_sync(MDBX_env *env, int force) { return rc; /* LY: head may be changed. 
*/ - head = mdbx_meta_head(env); + head = mdbx_meta_fluid_head(env); } if (!META_IS_STEADY(head) || env->me_sync_pending || env->me_mapsize != head->mm_mapsize) { + mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64 + ", mapsize env=%" PRIuPTR " meta=%" PRIuPTR, + container_of(head, MDBX_page, mp_data)->mp_pgno, + mdbx_durable_str(head), env->me_sync_pending, env->me_mapsize, + head->mm_mapsize); MDBX_meta meta = *head; rc = mdbx_env_sync_locked(env, flags, &meta); if (unlikely(rc != MDBX_SUCCESS)) { @@ -2058,7 +2139,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { env->me_live_reader = pid; } - for (;;) { + while (1) { nr = env->me_lck->mti_numreaders; for (i = 0; i < nr; i++) if (env->me_lck->mti_readers[i].mr_pid == 0) @@ -2096,7 +2177,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } while (1) { - MDBX_meta *const meta = mdbx_meta_head(txn->mt_env); + MDBX_meta *const meta = mdbx_meta_fluid_head(txn->mt_env); mdbx_jitter4testing(false); const txnid_t snap = meta->mm_txnid; mdbx_jitter4testing(false); @@ -2114,8 +2195,11 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { txn->mt_canary = meta->mm_canary; /* LY: Retry on a race, ITS#7970. */ - if (likely(meta == mdbx_meta_head(txn->mt_env) && snap == meta->mm_txnid)) + if (likely(meta == mdbx_meta_fluid_head(txn->mt_env) && + snap == meta->mm_txnid)) { + mdbx_jitter4testing(false); break; + } } txn->mt_ro_reader = r; @@ -2128,7 +2212,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { return rc; mdbx_jitter4testing(false); - MDBX_meta *meta = mdbx_meta_head(env); + MDBX_meta *meta = mdbx_meta_fluid_head(env); mdbx_jitter4testing(false); txn->mt_canary = meta->mm_canary; txn->mt_txnid = meta->mm_txnid + 1; @@ -3188,63 +3272,64 @@ fail: /* Read the environment parameters of a DB environment * before mapping it into memory. 
*/ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { - assert(offsetof(MDBX_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); + assert(offsetof(MDBX_page, mp_meta) == PAGEHDRSZ); memset(meta, 0, sizeof(MDBX_meta)); meta->mm_datasync_sign = MDBX_DATASIGN_WEAK; - unsigned offset = 0; - /* Read both meta pages so we can use the latest one. */ - for (int loops_left = 2; --loops_left >= 0;) { - MDBX_metabuf buf; + /* Read twice all meta pages so we can find the latest one. */ + unsigned loop_limit = NUM_METAS * 2; + for (unsigned loop_count = 0; loop_count < loop_limit; ++loop_count) { + MDBX_page page; - /* We don't know the page size on first time, so use a minimum value. */ - int rc = mdbx_pread(env->me_fd, &buf, sizeof(buf), offset); + /* We don't know the page size on first time. + * So, just guess it. */ + unsigned guess_pagesize = meta->mm_psize; + if (guess_pagesize == 0) + guess_pagesize = + (loop_count > NUM_METAS) ? env->me_psize : env->me_os_psize; + + const unsigned meta_number = loop_count % NUM_METAS; + const unsigned offset = guess_pagesize * meta_number; + int rc = mdbx_pread(env->me_fd, &page, sizeof(page), offset); if (rc != MDBX_SUCCESS) { - mdbx_debug("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(buf), rc, + mdbx_debug("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(page), rc, mdbx_strerror(rc)); return rc; } - MDBX_page *p = (MDBX_page *)&buf; - if (!F_ISSET(p->mp_flags, P_META)) { - mdbx_debug("page %" PRIaPGNO " not a meta-page", p->mp_pgno); + if (page.mp_pgno != meta_number) { + mdbx_debug("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, + page.mp_pgno); return MDBX_INVALID; } - MDBX_meta *m = PAGEDATA(p); - if (m->mm_magic != MDBX_MAGIC) { - mdbx_debug("meta[%u] has invalid magic", offset); + if (!F_ISSET(page.mp_flags, P_META)) { + mdbx_debug("page #%u not a meta-page", meta_number); return MDBX_INVALID; } - if (m->mm_version != MDBX_DATA_VERSION) { - mdbx_debug("database is version %u, expected version %u", 
m->mm_version, - MDBX_DATA_VERSION); + if (page.mp_meta.mm_magic != MDBX_MAGIC) { + mdbx_debug("meta[%u] has invalid magic", meta_number); + return MDBX_INVALID; + } + + if (page.mp_meta.mm_version != MDBX_DATA_VERSION) { + mdbx_debug("database is version %u, expected version %u", + page.mp_meta.mm_version, MDBX_DATA_VERSION); return MDBX_VERSION_MISMATCH; } -#ifndef MDBX_TEMPORARY_CRUTCH /* LY: check signature as a checksum */ - if (META_IS_STEADY(m) && m->mm_datasync_sign != mdbx_meta_sign(m)) { - mdbx_debug("steady-meta[%u] has invalid checksum", offset); + if (META_IS_STEADY(&page.mp_meta) && + page.mp_meta.mm_datasync_sign != mdbx_meta_sign(&page.mp_meta)) { + mdbx_debug("steady-meta[%u] has invalid checksum", meta_number); continue; } -#endif /* FIXME: MDBX_TEMPORARY_CRUTCH */ - if (mdbx_meta_lt(meta, m)) { - *meta = *m; + if (mdbx_meta_ot(meta, &page.mp_meta, true)) { + *meta = page.mp_meta; if (META_IS_WEAK(meta)) - loops_left += 1; /* LY: should re-read to avoid race */ - } - - if (offset) - offset = 0; - else { - offset = meta->mm_psize; - if (!offset) - offset = m->mm_psize; - if (!offset) - offset = env->me_os_psize; + loop_limit += 1; /* LY: should re-read to hush race with update */ } } @@ -3256,78 +3341,67 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { return MDBX_SUCCESS; } -/* Fill in most of the zeroed MDBX_meta for an empty database environment */ -static void __cold mdbx_meta_model(const MDBX_env *env, MDBX_meta *model) { +static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model, + unsigned num) { memset(model, 0, sizeof(*model)); - model->mm_magic = MDBX_MAGIC; - model->mm_version = MDBX_DATA_VERSION; - model->mm_mapsize = env->me_mapsize; - model->mm_psize = env->me_psize; - model->mm_last_pg = NUM_METAS - 1; - model->mm_flags = (uint16_t)env->me_flags; - model->mm_flags |= MDBX_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ - model->mm_dbs[FREE_DBI].md_root = P_INVALID; - 
model->mm_dbs[MAIN_DBI].md_root = P_INVALID; - model->mm_datasync_sign = mdbx_meta_sign(model); + model->mp_pgno = num; + model->mp_flags = P_META; + model->mp_meta.mm_magic = MDBX_MAGIC; + model->mp_meta.mm_version = MDBX_DATA_VERSION; + model->mp_meta.mm_mapsize = env->me_mapsize; + model->mp_meta.mm_psize = env->me_psize; + model->mp_meta.mm_last_pg = NUM_METAS - 1; + model->mp_meta.mm_flags = (uint16_t)env->me_flags; + model->mp_meta.mm_flags |= + MDBX_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ + model->mp_meta.mm_dbs[FREE_DBI].md_root = P_INVALID; + model->mp_meta.mm_dbs[MAIN_DBI].md_root = P_INVALID; + model->mp_meta.mm_txnid = num; + model->mp_meta.mm_datasync_sign = mdbx_meta_sign(&model->mp_meta); + return (MDBX_page *)((uint8_t *)model + env->me_psize); } -/* Write the environment parameters of a freshly created DB environment. */ -static int __cold mdbx_env_init_metas(const MDBX_env *env, MDBX_meta *model) { - mdbx_debug("writing new meta pages"); - assert(offsetof(MDBX_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); - - unsigned page_size = env->me_psize; - MDBX_page *first = calloc(NUM_METAS, page_size); - if (!first) - return MDBX_ENOMEM; - first->mp_pgno = 0; - first->mp_flags = P_META; - MDBX_meta *first_meta = (MDBX_meta *)PAGEDATA(first); - - MDBX_page *second = (MDBX_page *)((char *)first + page_size); - second->mp_pgno = 1; - second->mp_flags = P_META; - MDBX_meta *second_meta = (MDBX_meta *)PAGEDATA(second); - - *first_meta = *model; - model->mm_txnid += 1; - *second_meta = *model; - - int rc = mdbx_pwrite(env->me_fd, first, page_size * NUM_METAS, 0); - - free(first); - return rc; +/* Fill in most of the zeroed meta-pages for an empty database environment. + * Return pointer to the most recent (head) meta-page.
*/ +static MDBX_page *__cold mdbx_init_metas(const MDBX_env *env, void *buffer) { + MDBX_page *page0 = (MDBX_page *)buffer; + MDBX_page *page1 = mdbx_meta_model(env, page0, 0); + MDBX_page *page2 = mdbx_meta_model(env, page1, 1); + mdbx_meta_model(env, page2, 2); + page2->mp_meta.mm_datasync_sign = MDBX_DATASIGN_WEAK; + mdbx_assert(env, !mdbx_meta_eq(&page0->mp_meta, &page1->mp_meta)); + mdbx_assert(env, !mdbx_meta_eq(&page1->mp_meta, &page2->mp_meta)); + mdbx_assert(env, !mdbx_meta_eq(&page2->mp_meta, &page0->mp_meta)); + return page1; } static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *pending) { - int rc; - MDBX_meta *head = mdbx_meta_head(env); - size_t prev_mapsize = head->mm_mapsize; - size_t used_size = env->me_psize * (pending->mm_last_pg + 1); + MDBX_meta *const pending) { + MDBX_meta *const meta0 = METAPAGE(env, 0); + MDBX_meta *const meta1 = METAPAGE(env, 1); + MDBX_meta *const meta2 = METAPAGE(env, 2); + MDBX_meta *const head = mdbx_meta_fluid_head(env); - mdbx_assert(env, pending != METAPAGE_1(env) && pending != METAPAGE_2(env)); + const size_t prev_mapsize = head->mm_mapsize; + const size_t used_size = env->me_psize * (pending->mm_last_pg + 1); + + mdbx_assert(env, mdbx_meta_eq_mask(env) == 0); + mdbx_assert(env, + pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); mdbx_assert(env, !META_IS_STEADY(head) || env->me_sync_pending != 0 || env->me_mapsize != prev_mapsize); pending->mm_mapsize = env->me_mapsize; mdbx_assert(env, pending->mm_mapsize >= used_size); - if (unlikely(pending->mm_mapsize != prev_mapsize)) { - if (pending->mm_mapsize < prev_mapsize) { - /* LY: currently this can't happen, but force full-sync. 
*/ - flags &= MDBX_WRITEMAP; - } else { - /* Persist any increases of mapsize config */ - } - } if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold) flags &= MDBX_WRITEMAP; /* LY: step#1 - sync previously written/updated data-pages */ + int rc = MDBX_RESULT_TRUE; if (env->me_sync_pending && (flags & MDBX_NOSYNC) == 0) { - assert(((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { rc = mdbx_msync(env->me_map, used_size, flags & MDBX_MAPASYNC); if (unlikely(rc != MDBX_SUCCESS)) @@ -3356,7 +3430,7 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, } } - /* LY: step#2 - update meta-page. */ + /* Steady or Weak */ if (env->me_sync_pending == 0) { pending->mm_datasync_sign = mdbx_meta_sign(pending); } else { @@ -3366,27 +3440,60 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, : MDBX_DATASIGN_WEAK; } - volatile MDBX_meta *target = - (pending->mm_txnid == head->mm_txnid || META_IS_WEAK(head)) - ? 
head - : mdbx_env_meta_flipflop(env, head); - size_t offset = (char *)target - env->me_map; + volatile MDBX_meta *target = nullptr; + if (head->mm_txnid == pending->mm_txnid) { + mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, + sizeof(head->mm_dbs)) == 0); + mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, + sizeof(head->mm_canary)) == 0); + mdbx_assert(env, head->mm_last_pg == pending->mm_last_pg); + mdbx_assert(env, head->mm_mapsize == pending->mm_mapsize); + if (!META_IS_STEADY(head) && META_IS_STEADY(pending)) + target = head; + else { + mdbx_assert(env, mdbx_meta_eq(head, pending)); + mdbx_debug("skip update meta"); + return MDBX_SUCCESS; + } + } else if (head == meta0) + target = mdbx_meta_ancient(env, meta1, meta2, true); + else if (head == meta1) + target = mdbx_meta_ancient(env, meta0, meta2, true); + else if (head == meta2) + target = mdbx_meta_ancient(env, meta0, meta1, true); - MDBX_meta *stay = mdbx_env_meta_flipflop(env, (MDBX_meta *)target); - mdbx_debug( - "writing meta %d (%s, was %" PRIaTXN "/%s, stay %s %" PRIaTXN - "/%s), root %" PRIaPGNO ", " - "txn_id %" PRIaTXN ", %s", - offset >= env->me_psize, target == head ? "head" : "tail", - target->mm_txnid, - META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" - : "Legacy", - stay == head ? "head" : "tail", stay->mm_txnid, - META_IS_WEAK(stay) ? "Weak" : META_IS_STEADY(stay) ? "Steady" : "Legacy", - pending->mm_dbs[MAIN_DBI].md_root, pending->mm_txnid, - META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? "Steady" - : "Legacy"); + /* LY: step#2 - update meta-page. */ + mdbx_debug("writing meta%" PRIaPGNO " (%s, was %" PRIaTXN + ", %s), root %" PRIaPGNO "/%" PRIaPGNO ", " + "txn_id %" PRIaTXN ", %s", + container_of(target, MDBX_page, mp_data)->mp_pgno, + (target == head) ? 
"head" : "tail", target->mm_txnid, + mdbx_durable_str((const MDBX_meta *)target), + pending->mm_dbs[MAIN_DBI].md_root, + pending->mm_dbs[FREE_DBI].md_root, pending->mm_txnid, + mdbx_durable_str(pending)); + mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO + "/%" PRIaPGNO, + (meta0 == head) ? "head" : (meta0 == target) ? "tail" : "stay", + mdbx_durable_str(meta0), meta0->mm_txnid, + meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); + mdbx_debug("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO + "/%" PRIaPGNO, + (meta1 == head) ? "head" : (meta1 == target) ? "tail" : "stay", + mdbx_durable_str(meta1), meta1->mm_txnid, + meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); + mdbx_debug("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO + "/%" PRIaPGNO, + (meta2 == head) ? "head" : (meta2 == target) ? "tail" : "stay", + mdbx_durable_str(meta2), meta2->mm_txnid, + meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); + + mdbx_assert(env, !mdbx_meta_eq(pending, meta0)); + mdbx_assert(env, !mdbx_meta_eq(pending, meta1)); + mdbx_assert(env, !mdbx_meta_eq(pending, meta2)); + + const size_t offset = (char *)target - env->me_map; if (env->me_flags & MDBX_WRITEMAP) { /* LY: 'invalidate' the meta. */ mdbx_jitter4testing(true); @@ -3432,7 +3539,7 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, /* LY: step#3 - sync meta-pages. */ if ((flags & (MDBX_NOSYNC | MDBX_NOMETASYNC)) == 0) { - assert(((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { char *ptr = env->me_map + (offset & ~(env->me_os_psize - 1)); rc = mdbx_msync(ptr, env->me_os_psize, flags & MDBX_MAPASYNC); @@ -3570,9 +3677,9 @@ static int __cold mdbx_env_map(MDBX_env *env, void *addr, size_t usedsize) { #endif /* Lock meta pages to avoid unexpected write, - * before the data pages would be synchronized. 
*/ + * before the data pages would be synchronized. */ if (flags & MDBX_WRITEMAP) { - rc = mdbx_mlock(env->me_map, env->me_psize * 2); + rc = mdbx_mlock(env->me_map, env->me_psize * NUM_METAS); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -3604,7 +3711,7 @@ int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { return MDBX_EINVAL; /* FIXME: lock/unlock */ - meta = mdbx_meta_head(env); + meta = mdbx_meta_fluid_head(env); if (!size) size = meta->mm_mapsize; /* Silently round up to minimum if the size is too small */ @@ -3674,9 +3781,10 @@ int __cold mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers) { } /* Further setup required for opening an MDBX environment */ -static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { +static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { + MDBX_meta meta; int rc = MDBX_RESULT_FALSE; - int err = mdbx_read_header(env, meta); + int err = mdbx_read_header(env, &meta); if (unlikely(err != MDBX_SUCCESS)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || (env->me_flags & MDBX_RDONLY) != 0) @@ -3689,26 +3797,43 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { env->me_psize = env->me_os_psize; if (env->me_psize > MAX_PAGESIZE) env->me_psize = MAX_PAGESIZE; + env->me_mapsize = roundup2( env->me_mapsize ? 
env->me_mapsize : DEFAULT_MAPSIZE, env->me_os_psize); - mdbx_meta_model(env, meta); - err = mdbx_env_init_metas(env, meta); + + void *buffer = calloc(NUM_METAS, env->me_psize); + if (!buffer) + return MDBX_ENOMEM; + + meta = mdbx_init_metas(env, buffer)->mp_meta; + err = mdbx_pwrite(env->me_fd, buffer, env->me_psize * NUM_METAS, 0); + free(buffer); + if (unlikely(err != MDBX_SUCCESS)) + return err; + +#ifndef NDEBUG /* just for checking */ + err = mdbx_read_header(env, &meta); + if (unlikely(err != MDBX_SUCCESS)) + return err; +#endif + + err = mdbx_ftruncate(env->me_fd, env->me_mapsize); if (unlikely(err != MDBX_SUCCESS)) return err; } else { - env->me_psize = meta->mm_psize; + env->me_psize = meta.mm_psize; /* Make sure mapsize >= committed data size. Even when using * mm_mapsize, which could be broken in old files (ITS#7789). */ const size_t usedsize = - roundup2((meta->mm_last_pg + 1) * meta->mm_psize, env->me_os_psize); - if (meta->mm_mapsize < usedsize) - meta->mm_mapsize = usedsize; + roundup2((meta.mm_last_pg + 1) * env->me_psize, env->me_os_psize); + if (meta.mm_mapsize < usedsize) + meta.mm_mapsize = usedsize; /* Was a mapsize configured? 
*/ if (!env->me_mapsize || (env->me_flags & MDBX_RDONLY) || lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) - env->me_mapsize = meta->mm_mapsize; + env->me_mapsize = meta.mm_mapsize; else if (env->me_mapsize < usedsize) env->me_mapsize = usedsize; } @@ -3717,9 +3842,9 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { err = mdbx_filesize(env->me_fd, &size); if (unlikely(err != MDBX_SUCCESS)) return err; - if (size != env->me_mapsize) { - mdbx_trace("filesize mismatch"); + mdbx_notice("filesize mismatch (wanna %" PRIu64 ", have %" PRIu64 ")", + env->me_mapsize, size); if ((env->me_flags & MDBX_RDONLY) || lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) return MDBX_WANNA_RECOVERY /* LY: could not mdbx_ftruncate */; @@ -3733,20 +3858,28 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { if (err) return err; - const MDBX_meta *head = mdbx_meta_head(env); - if (head->mm_txnid != meta->mm_txnid) { - mdbx_trace("head->mm_txnid (%" PRIaTXN ") != (%" PRIaTXN ") meta->mm_txnid", - head->mm_txnid, meta->mm_txnid); + const unsigned meta_clash_mask = mdbx_meta_eq_mask(env); + if (meta_clash_mask) { + mdbx_error("meta-pages are clashed: mask 0x%d", meta_clash_mask); + return MDBX_WANNA_RECOVERY; + } + + const MDBX_meta *head = mdbx_meta_fluid_head(env); + if (head->mm_txnid != meta.mm_txnid) { if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { - assert(META_IS_STEADY(meta) && !META_IS_STEADY(head)); + assert(META_IS_STEADY(&meta) && !META_IS_STEADY(head)); if (env->me_flags & MDBX_RDONLY) { - mdbx_trace("exclusive, but read-only, unable recovery/rollback"); + mdbx_error("rollback needed: (from head %" PRIaTXN + " to steady %" PRIaTXN "), but unable in read-only mode", + head->mm_txnid, meta.mm_txnid); return MDBX_WANNA_RECOVERY /* LY: could not recovery/rollback */; } /* LY: rollback weak checkpoint */ MDBX_meta rollback = *head; rollback.mm_txnid = 0; + mdbx_trace("rollback: from %" PRIaTXN ", to %" PRIaTXN, 
head->mm_txnid, + meta.mm_txnid); err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta), (uint8_t *)head - (uint8_t *)env->me_map); if (err) @@ -3763,18 +3896,19 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { } } - head = mdbx_meta_head(env); + head = mdbx_meta_fluid_head(env); if (head->mm_mapsize != env->me_mapsize) { - mdbx_trace("head->mm_mapsize (%" PRIu64 ") != (%" PRIu64 - ") env->mm_mapsize", - head->mm_mapsize, env->me_mapsize); + mdbx_info("mismatch meta.mapsize: present %" PRIu64 ", should %" PRIu64, + head->mm_mapsize, env->me_mapsize); if ((env->me_flags & MDBX_RDONLY) || lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) return MDBX_MAP_RESIZED; - *meta = *head; - meta->mm_mapsize = env->me_mapsize; - err = mdbx_env_sync_locked(env, env->me_flags & MDBX_WRITEMAP, meta); + mdbx_trace("updating meta.mapsize: from %" PRIu64 " to %" PRIu64, + head->mm_mapsize, env->me_mapsize); + meta = *head; + meta.mm_mapsize = env->me_mapsize; + err = mdbx_env_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); if (err) return err; } @@ -3989,8 +4123,7 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, goto bailout; } - MDBX_meta meta; - const int dxb_rc = mdbx_setup_dxb(env, &meta, lck_rc); + const int dxb_rc = mdbx_setup_dxb(env, lck_rc); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; goto bailout; @@ -4051,13 +4184,13 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, #if MDBX_DEBUG if (rc == MDBX_SUCCESS) { - MDBX_meta *meta = mdbx_meta_head(env); + MDBX_meta *meta = mdbx_meta_fluid_head(env); MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; - int toggle = ((char *)meta == PAGEDATA(env->me_map)) ? 
0 : 1; mdbx_debug("opened database version %u, pagesize %u", meta->mm_version, env->me_psize); - mdbx_debug("using meta page %d, txn %" PRIaTXN "", toggle, meta->mm_txnid); + mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN "", + container_of(meta, MDBX_page, mp_data)->mp_pgno, meta->mm_txnid); mdbx_debug("depth: %u", db->md_depth); mdbx_debug("entries: %" PRIu64 "", db->md_entries); mdbx_debug("branch pages: %" PRIaPGNO "", db->md_branch_pages); @@ -4653,7 +4786,7 @@ static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) { } } - mdbx_cassert(mc, root > 1); + mdbx_cassert(mc, root >= NUM_METAS); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)) return rc; @@ -8337,18 +8470,7 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { if (unlikely(rc != MDBX_SUCCESS)) goto finish; - MDBX_page* mp = (MDBX_page *)my.mc_wbuf[0]; - memset(mp, 0, NUM_METAS * env->me_psize); - mp->mp_pgno = 0; - mp->mp_flags = P_META; - MDBX_meta* mm = (MDBX_meta *)PAGEDATA(mp); - mdbx_meta_model(env, mm); - - mp = (MDBX_page *)(my.mc_wbuf[0] + env->me_psize); - mp->mp_pgno = 1; - mp->mp_flags = P_META; - *(MDBX_meta *)PAGEDATA(mp) = *mm; - mm = (MDBX_meta *)PAGEDATA(mp); + MDBX_page *meta = mdbx_init_metas(env, my.mc_wbuf[0]); /* Set metapage 1 with current main DB */ pgno_t new_root, root = txn->mt_dbs[MAIN_DBI].md_root; @@ -8370,18 +8492,24 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { txn->mt_dbs[FREE_DBI].md_overflow_pages; new_root = txn->mt_next_pgno - 1 - freecount; - mm->mm_last_pg = new_root; - mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - mm->mm_dbs[MAIN_DBI].md_root = new_root; + meta->mp_meta.mm_last_pg = new_root; + meta->mp_meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta->mp_meta.mm_dbs[MAIN_DBI].md_root = new_root; } else { /* When the DB is empty, handle it specially to * fix any breakage like page leaks from ITS#8174. 
*/ - mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; + meta->mp_meta.mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; } - if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { - mm->mm_txnid = 1; /* use metapage 1 */ + + /* copy canary sequenses if present */ + if (txn->mt_canary.v) { + meta->mp_meta.mm_canary = txn->mt_canary; + meta->mp_meta.mm_canary.v = meta->mp_meta.mm_txnid; } + /* update signature */ + meta->mp_meta.mm_datasync_sign = mdbx_meta_sign(&meta->mp_meta); + my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; rc = mdbx_env_cwalk(&my, &root, 0); @@ -8582,12 +8710,11 @@ int __cold mdbx_env_stat(MDBX_env *env, MDBX_stat *arg, size_t bytes) { if (unlikely(bytes != sizeof(MDBX_stat))) return MDBX_EINVAL; - meta = mdbx_meta_head(env); + meta = mdbx_meta_fluid_head(env); return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); } int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { - MDBX_meta *meta; if (unlikely(env == NULL || arg == NULL)) return MDBX_EINVAL; @@ -8595,37 +8722,38 @@ int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { if (bytes != sizeof(MDBX_envinfo)) return MDBX_EINVAL; - MDBX_meta *m1, *m2; - MDBX_reader *r; - unsigned i; - - m1 = METAPAGE_1(env); - m2 = METAPAGE_2(env); - + const MDBX_meta *const meta0 = METAPAGE(env, 0); + const MDBX_meta *const meta1 = METAPAGE(env, 1); + const MDBX_meta *const meta2 = METAPAGE(env, 2); do { - meta = mdbx_meta_head(env); - arg->me_last_txnid = meta->mm_txnid; - arg->me_last_pgno = meta->mm_last_pg; - arg->me_meta1_txnid = m1->mm_txnid; - arg->me_meta1_sign = m1->mm_datasync_sign; - arg->me_meta2_txnid = m2->mm_txnid; - arg->me_meta2_sign = m2->mm_datasync_sign; - } while (unlikely(arg->me_last_txnid != mdbx_meta_head(env)->mm_txnid || - arg->me_meta1_sign != m1->mm_datasync_sign || - arg->me_meta2_sign != m2->mm_datasync_sign)); + const MDBX_meta *meta = mdbx_meta_fluid_head(env); + arg->me_meta0_txnid = 
meta0->mm_txnid; + arg->me_meta0_sign = meta0->mm_datasync_sign; + arg->me_meta1_txnid = meta1->mm_txnid; + arg->me_meta1_sign = meta1->mm_datasync_sign; + arg->me_meta2_txnid = meta2->mm_txnid; + arg->me_meta2_sign = meta2->mm_datasync_sign; + arg->me_recent_txnid = meta->mm_txnid; + arg->me_recent_pgno = meta->mm_last_pg; + } while (unlikely(arg->me_meta0_txnid != meta0->mm_txnid || + arg->me_meta0_sign != meta0->mm_datasync_sign || + arg->me_meta1_txnid != meta1->mm_txnid || + arg->me_meta1_sign != meta1->mm_datasync_sign || + arg->me_meta2_txnid != meta2->mm_txnid || + arg->me_meta2_sign != meta2->mm_datasync_sign)); arg->me_mapsize = env->me_mapsize; arg->me_maxreaders = env->me_maxreaders; arg->me_numreaders = env->me_lck->mti_numreaders; - arg->me_tail_txnid = 0; + arg->me_latter_reader_txnid = 0; - r = env->me_lck->mti_readers; - arg->me_tail_txnid = arg->me_last_txnid; - for (i = 0; i < arg->me_numreaders; ++i) { + MDBX_reader *r = env->me_lck->mti_readers; + arg->me_latter_reader_txnid = arg->me_recent_txnid; + for (unsigned i = 0; i < arg->me_numreaders; ++i) { if (r[i].mr_pid) { txnid_t mr = r[i].mr_txnid; - if (arg->me_tail_txnid > mr) - arg->me_tail_txnid = mr; + if (arg->me_latter_reader_txnid > mr) + arg->me_latter_reader_txnid = mr; } } @@ -9264,7 +9392,7 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { continue; rc = env->me_oom_func(env, pid, tid, oldest, - mdbx_meta_head(env)->mm_txnid - oldest, retry); + mdbx_meta_fluid_head(env)->mm_txnid - oldest, retry); if (rc < 0) break; @@ -9329,7 +9457,7 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) return -1; MDBX_env *env = txn->mt_env; - MDBX_meta *meta = mdbx_meta_head(env); + MDBX_meta *meta = mdbx_meta_fluid_head(env); if (percent) { size_t maxpg = env->me_maxpg; size_t last = meta->mm_last_pg + 1; @@ -9487,9 +9615,10 @@ int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, ctx.mw_user = user; ctx.mw_visitor = visitor; - int rc = visitor(0, 2, user, 
"mdbx", "meta", 2, sizeof(MDBX_meta) * 2, - PAGEHDRSZ * 2, - (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * 2); + int rc = visitor(0, NUM_METAS, user, "mdbx", "meta", NUM_METAS, + sizeof(MDBX_meta) * NUM_METAS, PAGEHDRSZ * NUM_METAS, + (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * + NUM_METAS); if (!rc) rc = mdbx_env_walk(&ctx, "free", txn->mt_dbs[FREE_DBI].md_root, 0); if (!rc) diff --git a/src/osal.c b/src/osal.c index 458e50e3..0611ad38 100644 --- a/src/osal.c +++ b/src/osal.c @@ -48,8 +48,8 @@ __extern_C __declspec(dllimport) void __cdecl _assert(char const *message, #endif /* _MSC_VER */ #ifndef mdbx_assert_fail -void __cold mdbx_assert_fail(MDBX_env *env, const char *msg, const char *func, - int line) { +void __cold mdbx_assert_fail(const MDBX_env *env, const char *msg, + const char *func, int line) { #if MDBX_DEBUG if (env && env->me_assert_func) { env->me_assert_func(env, msg, func, line); diff --git a/src/osal.h b/src/osal.h index daea002c..123d42ac 100644 --- a/src/osal.h +++ b/src/osal.h @@ -338,7 +338,7 @@ static __inline void mdbx_invalidate_cache(void *addr, size_t nbytes) { /* libc compatibility stuff */ #ifndef mdbx_assert_fail -void mdbx_assert_fail(MDBX_env *env, const char *msg, const char *func, +void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, int line); #endif /* mdbx_assert_fail */ diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 71e8e103..f4578a6e 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -323,7 +323,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, if (key->iov_len != sizeof(txnid_t)) problem_add("entry", record_number, "wrong txn-id size", "key-size %" PRIiPTR "", key->iov_len); - else if (txnid < 1 || txnid > envinfo.me_last_txnid) + else if (txnid < 1 || txnid > envinfo.me_recent_txnid) problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN "", txnid); if (data->iov_len < sizeof(pgno_t) || data->iov_len % 
sizeof(pgno_t)) @@ -340,14 +340,14 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, data->iov_len); else { freedb_pages += number; - if (envinfo.me_tail_txnid > txnid) + if (envinfo.me_latter_reader_txnid > txnid) reclaimable_pages += number; for (i = number, prev = 1; --i >= 0;) { pg = iptr[i]; - if (pg < NUM_METAS || pg > envinfo.me_last_pgno) + if (pg < NUM_METAS || pg > envinfo.me_recent_pgno) problem_add("entry", record_number, "wrong idl entry", "%u < %" PRIiPTR " < %" PRIiPTR "", NUM_METAS, pg, - envinfo.me_last_pgno); + envinfo.me_recent_pgno); else if (pg <= prev) { bad = " [bad sequence]"; problem_add("entry", record_number, "bad sequence", @@ -431,8 +431,7 @@ static int process_db(MDBX_dbi dbi, char *name, visitor *handler, int silent) { } } - if (dbi >= 2 /* CORE_DBS */ && name && only_subdb && - strcmp(only_subdb, name)) { + if (dbi >= CORE_DBS && name && only_subdb && strcmp(only_subdb, name)) { if (verbose) { print("Skip processing '%s'...\n", name); fflush(NULL); @@ -592,19 +591,132 @@ static void usage(char *prog) { const char *meta_synctype(uint64_t sign) { switch (sign) { - case 0: + case MDBX_DATASIGN_NONE: return "no-sync/legacy"; - case 1: + case MDBX_DATASIGN_WEAK: return "weak"; default: return "steady"; } } -int meta_lt(txnid_t txn1, uint64_t sign1, txnid_t txn2, uint64_t sign2) { - return (SIGN_IS_STEADY(sign1) == SIGN_IS_STEADY(sign2)) - ? 
txn1 < txn2 - : txn2 && SIGN_IS_STEADY(sign2); +static __inline bool meta_ot(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b, + uint64_t sign_b, const bool roolback2steady) { + if (txn_a == txn_b) + return SIGN_IS_STEADY(sign_b); + + if (roolback2steady && SIGN_IS_STEADY(sign_a) != SIGN_IS_STEADY(sign_b)) + return SIGN_IS_STEADY(sign_b); + + return txn_a < txn_b; +} + +static __inline bool meta_eq(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b, + uint64_t sign_b) { + if (txn_a != txn_b) + return false; + + if (SIGN_IS_STEADY(sign_a) != SIGN_IS_STEADY(sign_b)) + return false; + + return true; +} + +static __inline int meta_recent(const bool roolback2steady) { + + if (meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign, roolback2steady)) + return meta_ot(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign, + roolback2steady) + ? 1 + : 2; + + return meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, + envinfo.me_meta2_txnid, envinfo.me_meta2_sign, roolback2steady) + ? 2 + : 0; +} + +static __inline int meta_ancient(const bool roolback2steady) { + + if (meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign, roolback2steady)) + return meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, + envinfo.me_meta2_txnid, envinfo.me_meta2_sign, + roolback2steady) + ? 0 + : 2; + return meta_ot(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign, roolback2steady) + ? 
2 + : 1; +} + +static int meta_steady_head(void) { return meta_recent(true); } + +static int meta_weak_head(void) { return meta_recent(false); } + +static int meta_tail(void) { return meta_ancient(true); } + +void verbose_meta(int num, txnid_t txnid, uint64_t sign) { + print(" - meta-%d: %s %" PRIu64, num, meta_synctype(sign), txnid); + bool stay = true; + + if (num == meta_steady_head() && num == meta_weak_head()) { + print(", head"); + stay = false; + } else if (num == meta_steady_head()) { + print(", head-steady"); + stay = false; + } else if (num == meta_weak_head()) { + print(", head-weak"); + stay = false; + } + if (num == meta_tail()) { + print(", tail"); + stay = false; + } + if (stay) + print(", stay"); + + if (txnid > envinfo.me_recent_txnid) + print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", + txnid - envinfo.me_recent_txnid, txnid, envinfo.me_recent_txnid); + print("\n"); +} + +static int check_meta_head(bool steady) { + switch (meta_recent(steady)) { + default: + assert(false); + error(" - unexpected internal error (%s)\n", + steady ? 
"meta_steady_head" : "meta_weak_head"); + case 0: + if (envinfo.me_meta0_txnid != envinfo.me_recent_txnid) { + print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 + ")\n", + 0, envinfo.me_meta0_txnid, envinfo.me_recent_txnid); + return 1; + } + break; + case 1: + if (envinfo.me_meta1_txnid != envinfo.me_recent_txnid) { + print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 + ")\n", + 1, envinfo.me_meta1_txnid, envinfo.me_recent_txnid); + return 1; + } + break; + case 2: + if (envinfo.me_meta2_txnid != envinfo.me_recent_txnid) { + print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 + ")\n", + 2, envinfo.me_meta2_txnid, envinfo.me_recent_txnid); + return 1; + } + } + return 0; } int main(int argc, char *argv[]) { @@ -739,7 +851,7 @@ int main(int argc, char *argv[]) { goto bailout; } - lastpgno = envinfo.me_last_pgno + 1; + lastpgno = envinfo.me_recent_pgno + 1; errno = 0; if (verbose) { @@ -754,71 +866,45 @@ int main(int argc, char *argv[]) { print(" - mapaddr %p\n", envinfo.me_mapaddr); print(" - pagesize %u, max keysize %" PRIuPTR ", max readers %u\n", envstat.ms_psize, maxkeysize, envinfo.me_maxreaders); - print(" - transactions: last %" PRIu64 ", bottom %" PRIu64 - ", lag reading %" PRIi64 "\n", - envinfo.me_last_txnid, envinfo.me_tail_txnid, - envinfo.me_last_txnid - envinfo.me_tail_txnid); + print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64 + ", lag %" PRIi64 "\n", + envinfo.me_recent_txnid, envinfo.me_latter_reader_txnid, + envinfo.me_recent_txnid - envinfo.me_latter_reader_txnid); - print(" - meta-1: %s %" PRIu64 ", %s", meta_synctype(envinfo.me_meta1_sign), - envinfo.me_meta1_txnid, - meta_lt(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, - envinfo.me_meta2_txnid, envinfo.me_meta2_sign) - ? 
"tail" - : "head"); - if (envinfo.me_meta1_txnid > envinfo.me_last_txnid) - print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", - envinfo.me_meta1_txnid - envinfo.me_last_txnid, - envinfo.me_meta1_txnid, envinfo.me_last_txnid); - print("\n"); + verbose_meta(0, envinfo.me_meta0_txnid, envinfo.me_meta0_sign); + verbose_meta(1, envinfo.me_meta1_txnid, envinfo.me_meta1_sign); + verbose_meta(2, envinfo.me_meta2_txnid, envinfo.me_meta2_sign); + } - print(" - meta-2: %s %" PRIu64 ", %s", meta_synctype(envinfo.me_meta2_sign), - envinfo.me_meta2_txnid, - meta_lt(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, - envinfo.me_meta1_txnid, envinfo.me_meta1_sign) - ? "tail" - : "head"); - if (envinfo.me_meta2_txnid > envinfo.me_last_txnid) - print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", - envinfo.me_meta2_txnid - envinfo.me_last_txnid, - envinfo.me_meta2_txnid, envinfo.me_last_txnid); - print("\n"); + if (verbose) + print(" - performs check for meta-pages overlap\n"); + if (meta_eq(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign)) { + print(" - meta-%d and meta-%d are clashed\n", 0, 1); + ++problems_meta; + } + if (meta_eq(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, + envinfo.me_meta2_txnid, envinfo.me_meta2_sign)) { + print(" - meta-%d and meta-%d are clashed\n", 1, 2); + ++problems_meta; + } + if (meta_eq(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, + envinfo.me_meta0_txnid, envinfo.me_meta0_sign)) { + print(" - meta-%d and meta-%d are clashed\n", 2, 0); + ++problems_meta; } if (exclusive > 1) { if (verbose) - print(" - perform full check last-txn-id with meta-pages\n"); - - if (!meta_lt(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, - envinfo.me_meta2_txnid, envinfo.me_meta2_sign) && - envinfo.me_meta1_txnid != envinfo.me_last_txnid) { - print(" - meta-1 txn-id mismatch last-txn-id (%" PRIi64 " != %" PRIi64 - ")\n", - envinfo.me_meta1_txnid, envinfo.me_last_txnid); - ++problems_meta; - } - - if 
(!meta_lt(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, - envinfo.me_meta1_txnid, envinfo.me_meta1_sign) && - envinfo.me_meta2_txnid != envinfo.me_last_txnid) { - print(" - meta-2 txn-id mismatch last-txn-id (%" PRIi64 " != %" PRIi64 - ")\n", - envinfo.me_meta2_txnid, envinfo.me_last_txnid); - ++problems_meta; - } + print(" - performs full check recent-txn-id with meta-pages\n"); + problems_meta += check_meta_head(true); } else if (locktxn) { if (verbose) - print(" - perform lite check last-txn-id with meta-pages (not a " + print(" - performs lite check recent-txn-id with meta-pages (not a " "monopolistic mode)\n"); - uint64_t last = (envinfo.me_meta2_txnid > envinfo.me_meta1_txnid) - ? envinfo.me_meta2_txnid - : envinfo.me_meta1_txnid; - if (last != envinfo.me_last_txnid) { - print(" - last-meta mismatch last-txn-id (%" PRIi64 " != %" PRIi64 ")\n", - last, envinfo.me_last_txnid); - ++problems_meta; - } + problems_meta += check_meta_head(false); } else if (verbose) { - print(" - skip check last-txn-id with meta-pages (monopolistic or " + print(" - skip check recent-txn-id with meta-pages (monopolistic or " "write-lock mode only)\n"); } diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index e3c53a9d..efa69b42 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -125,10 +125,11 @@ int main(int argc, char *argv[]) { printf(" Map size: %" PRIu64 "\n", mei.me_mapsize); printf(" Page size: %u\n", mst.ms_psize); printf(" Max pages: %" PRIu64 "\n", mei.me_mapsize / mst.ms_psize); - printf(" Number of pages used: %" PRIu64 "\n", mei.me_last_pgno + 1); - printf(" Last transaction ID: %" PRIu64 "\n", mei.me_last_txnid); + printf(" Number of pages used: %" PRIu64 "\n", mei.me_recent_pgno + 1); + printf(" Last transaction ID: %" PRIu64 "\n", mei.me_recent_txnid); printf(" Tail transaction ID: %" PRIu64 " (%" PRIi64 ")\n", - mei.me_tail_txnid, mei.me_tail_txnid - mei.me_last_txnid); + mei.me_latter_reader_txnid, + mei.me_latter_reader_txnid - 
mei.me_recent_txnid); printf(" Max readers: %u\n", mei.me_maxreaders); printf(" Number of readers used: %u\n", mei.me_numreaders); } else { @@ -181,7 +182,7 @@ int main(int argc, char *argv[]) { while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) == 0) { iptr = data.iov_base; pages += *iptr; - if (envinfo && mei.me_tail_txnid > *(size_t *)key.iov_base) + if (envinfo && mei.me_latter_reader_txnid > *(size_t *)key.iov_base) reclaimable += *iptr; if (freinfo > 1) { char *bad = ""; @@ -220,14 +221,14 @@ int main(int argc, char *argv[]) { printf("Page Allocation Info\n"); printf(" Max pages: %9zu 100%%\n", value); - value = mei.me_last_pgno + 1; + value = mei.me_recent_pgno + 1; printf(" Number of pages used: %" PRIuPTR " %.1f%%\n", value, value / percent); - value = mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1); + value = mei.me_mapsize / mst.ms_psize - (mei.me_recent_pgno + 1); printf(" Remained: %" PRIuPTR " %.1f%%\n", value, value / percent); - value = mei.me_last_pgno + 1 - pages; + value = mei.me_recent_pgno + 1 - pages; printf(" Used now: %" PRIuPTR " %.1f%%\n", value, value / percent); value = pages; @@ -239,8 +240,8 @@ int main(int argc, char *argv[]) { value = reclaimable; printf(" Reclaimable: %" PRIuPTR " %.1f%%\n", value, value / percent); - value = - mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1) + reclaimable; + value = mei.me_mapsize / mst.ms_psize - (mei.me_recent_pgno + 1) + + reclaimable; printf(" Available: %" PRIuPTR " %.1f%%\n", value, value / percent); } else printf(" Free pages: %" PRIuPTR "\n", pages);