mdbx: update DB format and signatures.

Change-Id: I9c4b187e8ebc3df63fef15ae98872e27d56a01ab
This commit is contained in:
Leo Yuriev 2017-05-30 16:22:42 +03:00
parent d99b2a4b16
commit 61a3766e23
3 changed files with 100 additions and 86 deletions

View File

@ -25,7 +25,7 @@
/* Features under development */
#ifndef MDBX_DEVEL
# define MDBX_DEVEL 0
# define MDBX_DEVEL 1
#endif
/*----------------------------------------------------------------------------*/
@ -116,12 +116,12 @@
/* A stamp that identifies a file as an MDBX file.
* There's nothing special about this value other than that it is easily
* recognizable, and it will reflect any byte order mismatches. */
#define MDBX_MAGIC 0xBEEFC0DE
#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11)
/* The version number for a database's datafile format. */
#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 999 : 1)
#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 255 : 2)
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 999 : 1)
#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 255 : 2)
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
@ -241,21 +241,29 @@ typedef struct MDBX_db {
uint64_t md_leaf_pages; /* number of leaf pages */
uint64_t md_overflow_pages; /* number of overflow pages */
uint64_t md_entries; /* number of data items */
uint64_t md_merkle; /* Merkle tree checksum */
} MDBX_db;
/* Meta page content.
* A meta page is the start point for accessing a database snapshot.
* Pages 0-1 are meta pages. Transaction N writes meta page (N % 2). */
typedef struct MDBX_meta {
/* Stamp identifying this as an MDBX file. It must be set
* to MDBX_MAGIC. */
uint32_t mm_magic;
/* Version number of this file. Must be set to MDBX_DATA_VERSION. */
uint32_t mm_version;
/* txnid that committed this page, */
volatile txnid_t mm_txnid_top;
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
uint64_t mm_magic_and_version;
uint64_t mm_mapsize; /* size of mmap region */
/* txnid that committed this page, the first of a two-phase-update pair */
volatile txnid_t mm_txnid_a;
uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */
uint8_t mm_validator_id; /* ID of checksum and page validation method,
* zero (nothing) for now */
uint8_t mm_extra_pagehdr; /* extra bytes in the page header,
* zero (nothing) for now */
uint32_t mm_reserved_pad; /* padding for alignment, unused for now */
uint64_t mm_dbsize_min; /* minimal size of db */
uint64_t mm_dbsize_max; /* maximal size of db */
MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */
/* The size of pages used in this DB */
#define mm_psize mm_dbs[FREE_DBI].md_xsize
@ -265,17 +273,20 @@ typedef struct MDBX_meta {
/* Last used page in the datafile.
* Actually the file may be shorter if the freeDB lists the final pages. */
uint64_t mm_last_pg;
#define MDBX_DATASIGN_NONE 0u
#define MDBX_DATASIGN_WEAK 1u
volatile uint64_t mm_datasync_sign;
#define SIGN_IS_WEAK(sign) ((sign) == MDBX_DATASIGN_WEAK)
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
#define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign)
#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign)
/* txnid that committed this page */
volatile txnid_t mm_txnid_bottom;
volatile uint64_t mm_datasync_sign;
/* to be removed */
uint64_t mm_mapsize; /* current size of mmap region */
/* txnid that committed this page, the second of a two-phase-update pair */
volatile txnid_t mm_txnid_b;
} MDBX_meta;
/* Common header for all page types. The page type depends on mp_flags.
@ -297,7 +308,8 @@ typedef struct MDBX_meta {
* in the snapshot: Either used by a database or listed in a freeDB record. */
typedef struct MDBX_page {
union {
pgno_t mp_pgno; /* page number */
uint64_t mp_validator; /* checksum of page content or a txnid during
* which the page has been updated */
struct MDBX_page *mp_next; /* for in-memory list of freed pages */
};
uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */
@ -318,6 +330,7 @@ typedef struct MDBX_page {
};
uint32_t mp_pages; /* number of overflow pages */
};
pgno_t mp_pgno; /* page number */
/* dynamic size */
union {
@ -330,15 +343,19 @@ typedef struct MDBX_page {
/* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data))
#pragma pack(pop)
/* The header for the reader table (a memory-mapped lock file). */
typedef struct MDBX_lockinfo {
/* Stamp identifying this as an MDBX file. It must be set to MDBX_MAGIC. */
uint64_t mti_magic;
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */
uint64_t mti_magic_and_version;
/* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
uint64_t mti_format;
uint32_t mti_os_and_format;
/* Flags with which the environment was opened. */
uint32_t mti_envmode;
uint32_t mti_reserved;
volatile uint32_t mti_envmode;
#ifdef MDBX_OSAL_LOCK
MDBX_OSAL_LOCK mti_wmutex;
@ -355,7 +372,19 @@ typedef struct MDBX_lockinfo {
MDBX_reader __cache_aligned mti_readers[1];
} MDBX_lockinfo;
#pragma pack(pop)
#define MDBX_LOCKINFO_WHOLE_SIZE \
((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \
~((size_t)MDBX_CACHELINE_SIZE - 1))
/* Lockfile format signature: version, features and field layout */
#define MDBX_LOCK_FORMAT \
((MDBX_OSAL_LOCK_SIGN << 16) + \
(uint16_t)(MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1))
#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
/*----------------------------------------------------------------------------*/
/* Two kinds of page lists (aka IDL) */
@ -574,16 +603,6 @@ typedef struct MDBX_pgstate {
txnid_t mf_pglast; /* ID of last used record, or 0 if !mf_pghead */
} MDBX_pgstate;
#define MDBX_LOCKINFO_WHOLE_SIZE \
((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \
~((size_t)MDBX_CACHELINE_SIZE - 1))
/* Lockfile format signature: version, features and field layout */
#define MDBX_LOCK_FORMAT \
(((uint64_t)(MDBX_OSAL_LOCK_SIGN) << 32) + \
((MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1) << 16) + \
(MDBX_LOCK_VERSION) /* Flags which describe functionality */)
/* The database environment. */
struct MDBX_env {
#define MDBX_ME_SIGNATURE (0x9A899641)

View File

@ -1279,12 +1279,12 @@ bailout:
static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta,
bool allow_volatile) {
mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env));
txnid_t top = meta->mm_txnid_top;
txnid_t bottom = meta->mm_txnid_bottom;
txnid_t a = meta->mm_txnid_a;
txnid_t b = meta->mm_txnid_b;
if (allow_volatile)
return (top < bottom) ? top : bottom;
mdbx_assert(env, top == bottom);
return top;
return (a < b) ? a : b;
mdbx_assert(env, a == b);
return a;
}
static __inline txnid_t mdbx_meta_txnid_stable(const MDBX_env *env,
@ -1300,8 +1300,8 @@ static __inline txnid_t mdbx_meta_txnid_fluid(const MDBX_env *env,
static __inline void mdbx_meta_update_begin(const MDBX_env *env,
MDBX_meta *meta, txnid_t txnid) {
mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env));
mdbx_assert(env, meta->mm_txnid_top < txnid && meta->mm_txnid_bottom < txnid);
meta->mm_txnid_top = txnid;
mdbx_assert(env, meta->mm_txnid_a < txnid && meta->mm_txnid_b < txnid);
meta->mm_txnid_a = txnid;
(void)env;
mdbx_coherent_barrier();
}
@ -1309,19 +1309,19 @@ static __inline void mdbx_meta_update_begin(const MDBX_env *env,
static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta,
txnid_t txnid) {
mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env));
mdbx_assert(env, meta->mm_txnid_top == txnid);
mdbx_assert(env, meta->mm_txnid_bottom < txnid);
mdbx_assert(env, meta->mm_txnid_a == txnid);
mdbx_assert(env, meta->mm_txnid_b < txnid);
mdbx_jitter4testing(true);
meta->mm_txnid_bottom = txnid;
meta->mm_txnid_b = txnid;
mdbx_coherent_barrier();
}
static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta,
txnid_t txnid) {
mdbx_assert(env, meta < METAPAGE(env, 0) || meta > METAPAGE_END(env));
meta->mm_txnid_top = txnid;
meta->mm_txnid_bottom = txnid;
meta->mm_txnid_a = txnid;
meta->mm_txnid_b = txnid;
}
static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) {
@ -1329,7 +1329,7 @@ static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) {
#if 0 /* TODO */
sign = hippeus_hash64(&meta->mm_mapsize,
sizeof(MDBX_meta) - offsetof(MDBX_meta, mm_mapsize),
meta->mm_version | (uint64_t)MDBX_MAGIC << 32);
meta->mm_version | (uint64_t)MDBX_DXD_MAGIC << 32);
#else
(void)meta;
#endif
@ -2183,8 +2183,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
} else if (env->me_lck) {
const mdbx_pid_t pid = env->me_pid;
const mdbx_tid_t tid = mdbx_thread_self();
mdbx_assert(env, env->me_lck->mti_magic == MDBX_MAGIC);
mdbx_assert(env, env->me_lck->mti_format == MDBX_LOCK_FORMAT);
mdbx_assert(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC);
mdbx_assert(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT);
rc = mdbx_rdt_lock(env);
if (unlikely(MDBX_IS_ERROR(rc)))
@ -3390,18 +3390,14 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) {
return MDBX_INVALID;
}
if (page.mp_meta.mm_magic != MDBX_MAGIC) {
mdbx_error("meta[%u] has invalid magic", meta_number);
return MDBX_INVALID;
if (page.mp_meta.mm_magic_and_version != MDBX_DATA_MAGIC) {
mdbx_error("meta[%u] has invalid magic/version", meta_number);
return ((page.mp_meta.mm_magic_and_version >> 8) != MDBX_MAGIC)
? MDBX_INVALID
: MDBX_VERSION_MISMATCH;
}
if (page.mp_meta.mm_version != MDBX_DATA_VERSION) {
mdbx_error("database is version %u, expected version %u",
page.mp_meta.mm_version, MDBX_DATA_VERSION);
return MDBX_VERSION_MISMATCH;
}
if (page.mp_meta.mm_txnid_top != page.mp_meta.mm_txnid_bottom) {
if (page.mp_meta.mm_txnid_a != page.mp_meta.mm_txnid_b) {
mdbx_warning("meta[%u] not completely updated, skip it", meta_number);
continue;
}
@ -3511,8 +3507,7 @@ static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model,
memset(model, 0, sizeof(*model));
model->mp_pgno = num;
model->mp_flags = P_META;
model->mp_meta.mm_magic = MDBX_MAGIC;
model->mp_meta.mm_version = MDBX_DATA_VERSION;
model->mp_meta.mm_magic_and_version = MDBX_DATA_MAGIC;
model->mp_meta.mm_mapsize = env->me_mapsize;
model->mp_meta.mm_psize = env->me_psize;
model->mp_meta.mm_last_pg = NUM_METAS - 1;
@ -3607,7 +3602,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
}
MDBX_meta *target = nullptr;
if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_top) {
if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) {
mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs,
sizeof(head->mm_dbs)) == 0);
mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary,
@ -3637,7 +3632,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
(target == head) ? "head" : "tail", mdbx_meta_txnid_stable(env, target),
mdbx_durable_str((const MDBX_meta *)target),
pending->mm_dbs[MAIN_DBI].md_root, pending->mm_dbs[FREE_DBI].md_root,
pending->mm_txnid_top, mdbx_durable_str(pending));
pending->mm_txnid_a, mdbx_durable_str(pending));
mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO
"/%" PRIaPGNO,
@ -3663,13 +3658,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
mdbx_ensure(env,
target == head ||
mdbx_meta_txnid_stable(env, target) < pending->mm_txnid_top);
mdbx_meta_txnid_stable(env, target) < pending->mm_txnid_a);
if (env->me_flags & MDBX_WRITEMAP) {
mdbx_jitter4testing(true);
if (likely(target != head)) {
/* LY: 'invalidate' the meta. */
target->mm_datasync_sign = MDBX_DATASIGN_WEAK;
mdbx_meta_update_begin(env, target, pending->mm_txnid_top);
mdbx_meta_update_begin(env, target, pending->mm_txnid_a);
#ifndef NDEBUG
/* debug: provoke failure to catch violators */
memset(target->mm_dbs, 0xCC,
@ -3687,13 +3682,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_coherent_barrier();
/* LY: 'commit' the meta */
mdbx_meta_update_end(env, target, pending->mm_txnid_bottom);
mdbx_meta_update_end(env, target, pending->mm_txnid_b);
mdbx_jitter4testing(true);
} else {
/* dangerous case (target == head), only mm_datasync_sign could
* be updated, check assertions once again */
mdbx_ensure(env,
mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_top &&
mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a &&
!META_IS_STEADY(head) && META_IS_STEADY(pending));
mdbx_ensure(env, head->mm_last_pg == pending->mm_last_pg);
mdbx_ensure(env, head->mm_mapsize == pending->mm_mapsize);
@ -3706,8 +3701,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_coherent_barrier();
mdbx_jitter4testing(true);
} else {
pending->mm_magic = MDBX_MAGIC;
pending->mm_version = MDBX_DATA_VERSION;
pending->mm_magic_and_version = MDBX_DATA_MAGIC;
rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), offset);
if (unlikely(rc != MDBX_SUCCESS)) {
undo:
@ -4081,13 +4075,13 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) {
const MDBX_meta *head = mdbx_meta_head(env);
const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head);
if (head_txnid != meta.mm_txnid_top) {
if (head_txnid != meta.mm_txnid_a) {
if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) {
assert(META_IS_STEADY(&meta) && !META_IS_STEADY(head));
if (env->me_flags & MDBX_RDONLY) {
mdbx_error("rollback needed: (from head %" PRIaTXN
" to steady %" PRIaTXN "), but unable in read-only mode",
head_txnid, meta.mm_txnid_top);
head_txnid, meta.mm_txnid_a);
return MDBX_WANNA_RECOVERY /* LY: could not recovery/rollback */;
}
@ -4095,7 +4089,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) {
MDBX_meta rollback = *head;
mdbx_meta_set_txnid(env, &rollback, 0);
mdbx_trace("rollback: from %" PRIaTXN ", to %" PRIaTXN, head_txnid,
meta.mm_txnid_top);
meta.mm_txnid_a);
mdbx_ensure(env, head_txnid == mdbx_meta_txnid_stable(env, head));
err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta),
(uint8_t *)head - (uint8_t *)env->me_map);
@ -4126,7 +4120,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) {
head->mm_mapsize, env->me_mapsize);
meta = *head;
meta.mm_mapsize = env->me_mapsize;
mdbx_meta_set_txnid(env, &meta, meta.mm_txnid_top + 1);
mdbx_meta_set_txnid(env, &meta, meta.mm_txnid_a + 1);
if (META_IS_STEADY(head))
meta.mm_datasync_sign = mdbx_meta_sign(&meta);
err = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta);
@ -4236,17 +4230,18 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) {
if (err)
return err;
env->me_lck->mti_magic = MDBX_MAGIC;
env->me_lck->mti_format = MDBX_LOCK_FORMAT;
env->me_lck->mti_magic_and_version = MDBX_LOCK_MAGIC;
env->me_lck->mti_os_and_format = MDBX_LOCK_FORMAT;
} else {
if (env->me_lck->mti_magic != MDBX_MAGIC) {
mdbx_error("lock region has invalid magic");
return MDBX_INVALID;
if (env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC) {
mdbx_error("lock region has invalid magic/version");
return ((env->me_lck->mti_magic_and_version >> 8) != MDBX_MAGIC)
? MDBX_INVALID
: MDBX_VERSION_MISMATCH;
}
if (env->me_lck->mti_format != MDBX_LOCK_FORMAT) {
mdbx_error("lock region has format+version 0x%" PRIx64
", expected 0x%" PRIx64,
env->me_lck->mti_format, MDBX_LOCK_FORMAT);
if (env->me_lck->mti_os_and_format != MDBX_LOCK_FORMAT) {
mdbx_error("lock region has os/format 0x%" PRIx32 ", expected 0x%" PRIx32,
env->me_lck->mti_os_and_format, MDBX_LOCK_FORMAT);
return MDBX_VERSION_MISMATCH;
}
}
@ -4417,8 +4412,8 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags,
MDBX_meta *meta = mdbx_meta_head(env);
MDBX_db *db = &meta->mm_dbs[MAIN_DBI];
mdbx_debug("opened database version %u, pagesize %u", meta->mm_version,
env->me_psize);
mdbx_debug("opened database version %u, pagesize %u",
(uint8_t)meta->mm_magic_and_version, env->me_psize);
mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN "",
container_of(meta, MDBX_page, mp_data)->mp_pgno,
mdbx_meta_txnid_fluid(env, meta));

View File

@ -472,10 +472,10 @@ void mdbx_osal_jitter(bool tiny);
#if defined(_WIN32) || defined(_WIN64)
#undef MDBX_OSAL_LOCK
#define MDBX_OSAL_LOCK_SIGN MDBX_TETRAD('f', 'l', 'c', 'k')
#define MDBX_OSAL_LOCK_SIGN UINT32_C(0xF10C)
#else
#define MDBX_OSAL_LOCK pthread_mutex_t
#define MDBX_OSAL_LOCK_SIGN MDBX_TETRAD('P', 'T', 'M', 'X')
#define MDBX_OSAL_LOCK_SIGN UINT32_C(0x8017)
#endif
int mdbx_lck_init(MDBX_env *env);