From 61a3766e23673f662f288644ab457e17bd306e72 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 30 May 2017 16:22:42 +0300 Subject: [PATCH] mdbx: update DB format and signatures. Change-Id: I9c4b187e8ebc3df63fef15ae98872e27d56a01ab --- src/bits.h | 87 ++++++++++++++++++++++++++++++------------------- src/mdbx.c | 95 ++++++++++++++++++++++++++---------------------------- src/osal.h | 4 +-- 3 files changed, 100 insertions(+), 86 deletions(-) diff --git a/src/bits.h b/src/bits.h index b21c639b..734c1fe7 100644 --- a/src/bits.h +++ b/src/bits.h @@ -25,7 +25,7 @@ /* Features under development */ #ifndef MDBX_DEVEL -# define MDBX_DEVEL 0 +# define MDBX_DEVEL 1 #endif /*----------------------------------------------------------------------------*/ @@ -116,12 +116,12 @@ /* A stamp that identifies a file as an MDBX file. * There's nothing special about this value other than that it is easily * recognizable, and it will reflect any byte order mismatches. */ -#define MDBX_MAGIC 0xBEEFC0DE +#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11) /* The version number for a database's datafile format. */ -#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 999 : 1) +#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 255 : 2) /* The version number for a database's lockfile format. */ -#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 999 : 1) +#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 255 : 2) /* handle for the DB used to track free pages. */ #define FREE_DBI 0 @@ -241,21 +241,29 @@ typedef struct MDBX_db { uint64_t md_leaf_pages; /* number of leaf pages */ uint64_t md_overflow_pages; /* number of overflow pages */ uint64_t md_entries; /* number of data items */ + uint64_t md_merkle; /* Merkle tree checksum */ } MDBX_db; /* Meta page content. * A meta page is the start point for accessing a database snapshot. * Pages 0-1 are meta pages. Transaction N writes meta page (N % 2). */ typedef struct MDBX_meta { - /* Stamp identifying this as an MDBX file. It must be set - * to MDBX_MAGIC. 
*/ - uint32_t mm_magic; - /* Version number of this file. Must be set to MDBX_DATA_VERSION. */ - uint32_t mm_version; - /* txnid that committed this page, */ - volatile txnid_t mm_txnid_top; + /* Stamp identifying this as an MDBX file. + * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */ + uint64_t mm_magic_and_version; - uint64_t mm_mapsize; /* size of mmap region */ + /* txnid that committed this page, the first of a two-phase-update pair */ + volatile txnid_t mm_txnid_a; + + uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ + uint8_t mm_validator_id; /* ID of checksum and page validation method, + * zero (nothing) for now */ + uint8_t mm_extra_pagehdr; /* extra bytes in the page header, + * zero (nothing) for now */ + uint32_t mm_reserved_pad; /* padding for alignment, unused for now */ + + uint64_t mm_dbsize_min; /* minimal size of db */ + uint64_t mm_dbsize_max; /* maximal size of db */ MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ /* The size of pages used in this DB */ #define mm_psize mm_dbs[FREE_DBI].md_xsize @@ -265,17 +273,20 @@ typedef struct MDBX_meta { /* Last used page in the datafile. * Actually the file may be shorter if the freeDB lists the final pages. */ uint64_t mm_last_pg; + #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u - volatile uint64_t mm_datasync_sign; - #define SIGN_IS_WEAK(sign) ((sign) == MDBX_DATASIGN_WEAK) #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) - #define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign) #define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - /* txnid that committed this page */ - volatile txnid_t mm_txnid_bottom; + volatile uint64_t mm_datasync_sign; + + /* to be removed */ + uint64_t mm_mapsize; /* current size of mmap region */ + + /* txnid that committed this page, the second of a two-phase-update pair */ + volatile txnid_t mm_txnid_b; } MDBX_meta; /* Common header for all page types. 
The page type depends on mp_flags. @@ -297,7 +308,8 @@ typedef struct MDBX_meta { * in the snapshot: Either used by a database or listed in a freeDB record. */ typedef struct MDBX_page { union { - pgno_t mp_pgno; /* page number */ + uint64_t mp_validator; /* checksum of page content or a txnid during + * which the page has been updated */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ @@ -318,6 +330,7 @@ typedef struct MDBX_page { }; uint32_t mp_pages; /* number of overflow pages */ }; + pgno_t mp_pgno; /* page number */ /* dynamic size */ union { @@ -330,15 +343,19 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data)) +#pragma pack(pop) + /* The header for the reader table (a memory-mapped lock file). */ typedef struct MDBX_lockinfo { - /* Stamp identifying this as an MDBX file. It must be set to MDBX_MAGIC. */ - uint64_t mti_magic; + /* Stamp identifying this as an MDBX file. + * It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */ + uint64_t mti_magic_and_version; + /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */ - uint64_t mti_format; + uint32_t mti_os_and_format; + /* Flags which environment was opened. 
*/ - uint32_t mti_envmode; - uint32_t mti_reserved; + volatile uint32_t mti_envmode; #ifdef MDBX_OSAL_LOCK MDBX_OSAL_LOCK mti_wmutex; @@ -355,7 +372,19 @@ typedef struct MDBX_lockinfo { MDBX_reader __cache_aligned mti_readers[1]; } MDBX_lockinfo; -#pragma pack(pop) +#define MDBX_LOCKINFO_WHOLE_SIZE \ + ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \ + ~((size_t)MDBX_CACHELINE_SIZE - 1)) + +/* Lockfile format signature: version, features and field layout */ +#define MDBX_LOCK_FORMAT \ + ((MDBX_OSAL_LOCK_SIGN << 16) + \ + (uint16_t)(MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1)) + +#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) + +#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) + /*----------------------------------------------------------------------------*/ /* Two kind lists of pages (aka IDL) */ @@ -574,16 +603,6 @@ typedef struct MDBX_pgstate { txnid_t mf_pglast; /* ID of last used record, or 0 if !mf_pghead */ } MDBX_pgstate; -#define MDBX_LOCKINFO_WHOLE_SIZE \ - ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \ - ~((size_t)MDBX_CACHELINE_SIZE - 1)) - -/* Lockfile format signature: version, features and field layout */ -#define MDBX_LOCK_FORMAT \ - (((uint64_t)(MDBX_OSAL_LOCK_SIGN) << 32) + \ - ((MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1) << 16) + \ - (MDBX_LOCK_VERSION) /* Flags which describe functionality */) - /* The database environment. */ struct MDBX_env { #define MDBX_ME_SIGNATURE (0x9A899641) diff --git a/src/mdbx.c b/src/mdbx.c index 856e638f..e7f55865 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1279,12 +1279,12 @@ bailout: static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta, bool allow_volatile) { mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - txnid_t top = meta->mm_txnid_top; - txnid_t bottom = meta->mm_txnid_bottom; + txnid_t a = meta->mm_txnid_a; + txnid_t b = meta->mm_txnid_b; if (allow_volatile) - return (top < bottom) ? 
top : bottom; - mdbx_assert(env, top == bottom); - return top; + return (a < b) ? a : b; + mdbx_assert(env, a == b); + return a; } static __inline txnid_t mdbx_meta_txnid_stable(const MDBX_env *env, @@ -1300,8 +1300,8 @@ static __inline txnid_t mdbx_meta_txnid_fluid(const MDBX_env *env, static __inline void mdbx_meta_update_begin(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - mdbx_assert(env, meta->mm_txnid_top < txnid && meta->mm_txnid_bottom < txnid); - meta->mm_txnid_top = txnid; + mdbx_assert(env, meta->mm_txnid_a < txnid && meta->mm_txnid_b < txnid); + meta->mm_txnid_a = txnid; (void)env; mdbx_coherent_barrier(); } @@ -1309,19 +1309,19 @@ static __inline void mdbx_meta_update_begin(const MDBX_env *env, static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - mdbx_assert(env, meta->mm_txnid_top == txnid); - mdbx_assert(env, meta->mm_txnid_bottom < txnid); + mdbx_assert(env, meta->mm_txnid_a == txnid); + mdbx_assert(env, meta->mm_txnid_b < txnid); mdbx_jitter4testing(true); - meta->mm_txnid_bottom = txnid; + meta->mm_txnid_b = txnid; mdbx_coherent_barrier(); } static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { mdbx_assert(env, meta < METAPAGE(env, 0) || meta > METAPAGE_END(env)); - meta->mm_txnid_top = txnid; - meta->mm_txnid_bottom = txnid; + meta->mm_txnid_a = txnid; + meta->mm_txnid_b = txnid; } static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { @@ -1329,7 +1329,7 @@ static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { #if 0 /* TODO */ sign = hippeus_hash64(&meta->mm_mapsize, sizeof(MDBX_meta) - offsetof(MDBX_meta, mm_mapsize), - meta->mm_version | (uint64_t)MDBX_MAGIC << 32); + meta->mm_version | (uint64_t)MDBX_DXD_MAGIC << 32); #else (void)meta; #endif @@ -2183,8 +2183,8 @@ static int 
mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } else if (env->me_lck) { const mdbx_pid_t pid = env->me_pid; const mdbx_tid_t tid = mdbx_thread_self(); - mdbx_assert(env, env->me_lck->mti_magic == MDBX_MAGIC); - mdbx_assert(env, env->me_lck->mti_format == MDBX_LOCK_FORMAT); + mdbx_assert(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); + mdbx_assert(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); rc = mdbx_rdt_lock(env); if (unlikely(MDBX_IS_ERROR(rc))) @@ -3390,18 +3390,14 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { return MDBX_INVALID; } - if (page.mp_meta.mm_magic != MDBX_MAGIC) { - mdbx_error("meta[%u] has invalid magic", meta_number); - return MDBX_INVALID; + if (page.mp_meta.mm_magic_and_version != MDBX_DATA_MAGIC) { + mdbx_error("meta[%u] has invalid magic/version", meta_number); + return ((page.mp_meta.mm_magic_and_version >> 8) != MDBX_MAGIC) + ? MDBX_INVALID + : MDBX_VERSION_MISMATCH; } - if (page.mp_meta.mm_version != MDBX_DATA_VERSION) { - mdbx_error("database is version %u, expected version %u", - page.mp_meta.mm_version, MDBX_DATA_VERSION); - return MDBX_VERSION_MISMATCH; - } - - if (page.mp_meta.mm_txnid_top != page.mp_meta.mm_txnid_bottom) { + if (page.mp_meta.mm_txnid_a != page.mp_meta.mm_txnid_b) { mdbx_warning("meta[%u] not completely updated, skip it", meta_number); continue; } @@ -3511,8 +3507,7 @@ static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model, memset(model, 0, sizeof(*model)); model->mp_pgno = num; model->mp_flags = P_META; - model->mp_meta.mm_magic = MDBX_MAGIC; - model->mp_meta.mm_version = MDBX_DATA_VERSION; + model->mp_meta.mm_magic_and_version = MDBX_DATA_MAGIC; model->mp_meta.mm_mapsize = env->me_mapsize; model->mp_meta.mm_psize = env->me_psize; model->mp_meta.mm_last_pg = NUM_METAS - 1; @@ -3607,7 +3602,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, } MDBX_meta *target = nullptr; - if (mdbx_meta_txnid_stable(env, head) == 
pending->mm_txnid_top) { + if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) { mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, sizeof(head->mm_dbs)) == 0); mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, @@ -3637,7 +3632,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, (target == head) ? "head" : "tail", mdbx_meta_txnid_stable(env, target), mdbx_durable_str((const MDBX_meta *)target), pending->mm_dbs[MAIN_DBI].md_root, pending->mm_dbs[FREE_DBI].md_root, - pending->mm_txnid_top, mdbx_durable_str(pending)); + pending->mm_txnid_a, mdbx_durable_str(pending)); mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, @@ -3663,13 +3658,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); mdbx_ensure(env, target == head || - mdbx_meta_txnid_stable(env, target) < pending->mm_txnid_top); + mdbx_meta_txnid_stable(env, target) < pending->mm_txnid_a); if (env->me_flags & MDBX_WRITEMAP) { mdbx_jitter4testing(true); if (likely(target != head)) { /* LY: 'invalidate' the meta. 
*/ target->mm_datasync_sign = MDBX_DATASIGN_WEAK; - mdbx_meta_update_begin(env, target, pending->mm_txnid_top); + mdbx_meta_update_begin(env, target, pending->mm_txnid_a); #ifndef NDEBUG /* debug: provoke failure to catch a violators */ memset(target->mm_dbs, 0xCC, @@ -3687,13 +3682,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_coherent_barrier(); /* LY: 'commit' the meta */ - mdbx_meta_update_end(env, target, pending->mm_txnid_bottom); + mdbx_meta_update_end(env, target, pending->mm_txnid_b); mdbx_jitter4testing(true); } else { /* dangerous case (target == head), only mm_datasync_sign could * me updated, check assertions once again */ mdbx_ensure(env, - mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_top && + mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a && !META_IS_STEADY(head) && META_IS_STEADY(pending)); mdbx_ensure(env, head->mm_last_pg == pending->mm_last_pg); mdbx_ensure(env, head->mm_mapsize == pending->mm_mapsize); @@ -3706,8 +3701,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_coherent_barrier(); mdbx_jitter4testing(true); } else { - pending->mm_magic = MDBX_MAGIC; - pending->mm_version = MDBX_DATA_VERSION; + pending->mm_magic_and_version = MDBX_DATA_MAGIC; rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), offset); if (unlikely(rc != MDBX_SUCCESS)) { undo: @@ -4081,13 +4075,13 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { const MDBX_meta *head = mdbx_meta_head(env); const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head); - if (head_txnid != meta.mm_txnid_top) { + if (head_txnid != meta.mm_txnid_a) { if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { assert(META_IS_STEADY(&meta) && !META_IS_STEADY(head)); if (env->me_flags & MDBX_RDONLY) { mdbx_error("rollback needed: (from head %" PRIaTXN " to steady %" PRIaTXN "), but unable in read-only mode", - head_txnid, meta.mm_txnid_top); + head_txnid, meta.mm_txnid_a); return MDBX_WANNA_RECOVERY /* LY: could not 
recovery/rollback */; } @@ -4095,7 +4089,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { MDBX_meta rollback = *head; mdbx_meta_set_txnid(env, &rollback, 0); mdbx_trace("rollback: from %" PRIaTXN ", to %" PRIaTXN, head_txnid, - meta.mm_txnid_top); + meta.mm_txnid_a); mdbx_ensure(env, head_txnid == mdbx_meta_txnid_stable(env, head)); err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta), (uint8_t *)head - (uint8_t *)env->me_map); @@ -4126,7 +4120,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { head->mm_mapsize, env->me_mapsize); meta = *head; meta.mm_mapsize = env->me_mapsize; - mdbx_meta_set_txnid(env, &meta, meta.mm_txnid_top + 1); + mdbx_meta_set_txnid(env, &meta, meta.mm_txnid_a + 1); if (META_IS_STEADY(head)) meta.mm_datasync_sign = mdbx_meta_sign(&meta); err = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); @@ -4236,17 +4230,18 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { if (err) return err; - env->me_lck->mti_magic = MDBX_MAGIC; - env->me_lck->mti_format = MDBX_LOCK_FORMAT; + env->me_lck->mti_magic_and_version = MDBX_LOCK_MAGIC; + env->me_lck->mti_os_and_format = MDBX_LOCK_FORMAT; } else { - if (env->me_lck->mti_magic != MDBX_MAGIC) { - mdbx_error("lock region has invalid magic"); - return MDBX_INVALID; + if (env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { + mdbx_error("lock region has invalid magic/version"); + return ((env->me_lck->mti_magic_and_version >> 8) != MDBX_MAGIC) + ? 
MDBX_INVALID + : MDBX_VERSION_MISMATCH; } - if (env->me_lck->mti_format != MDBX_LOCK_FORMAT) { - mdbx_error("lock region has format+version 0x%" PRIx64 - ", expected 0x%" PRIx64, - env->me_lck->mti_format, MDBX_LOCK_FORMAT); + if (env->me_lck->mti_os_and_format != MDBX_LOCK_FORMAT) { + mdbx_error("lock region has os/format 0x%" PRIx32 ", expected 0x%" PRIx32, + env->me_lck->mti_os_and_format, MDBX_LOCK_FORMAT); return MDBX_VERSION_MISMATCH; } } @@ -4417,8 +4412,8 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, MDBX_meta *meta = mdbx_meta_head(env); MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; - mdbx_debug("opened database version %u, pagesize %u", meta->mm_version, - env->me_psize); + mdbx_debug("opened database version %u, pagesize %u", + (uint8_t)meta->mm_magic_and_version, env->me_psize); mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN "", container_of(meta, MDBX_page, mp_data)->mp_pgno, mdbx_meta_txnid_fluid(env, meta)); diff --git a/src/osal.h b/src/osal.h index 5491cd45..ee776358 100644 --- a/src/osal.h +++ b/src/osal.h @@ -472,10 +472,10 @@ void mdbx_osal_jitter(bool tiny); #if defined(_WIN32) || defined(_WIN64) #undef MDBX_OSAL_LOCK -#define MDBX_OSAL_LOCK_SIGN MDBX_TETRAD('f', 'l', 'c', 'k') +#define MDBX_OSAL_LOCK_SIGN UINT32_C(0xF10C) #else #define MDBX_OSAL_LOCK pthread_mutex_t -#define MDBX_OSAL_LOCK_SIGN MDBX_TETRAD('P', 'T', 'M', 'X') +#define MDBX_OSAL_LOCK_SIGN UINT32_C(0x8017) #endif int mdbx_lck_init(MDBX_env *env);