mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-02 00:14:14 +08:00
mdbx: move boot-id from LCK to meta.
Change-Id: I7a371feb1a2c43e3606c516fe7b4c7d7a4ff6e73
This commit is contained in:
parent
dc03299dc6
commit
d20b9d9ed7
23
mdbx.h
23
mdbx.h
@ -1657,16 +1657,19 @@ typedef struct MDBX_envinfo {
|
||||
uint32_t mi_dxb_pagesize; /* database pagesize */
|
||||
uint32_t mi_sys_pagesize; /* system pagesize */
|
||||
|
||||
uint64_t
|
||||
mi_bootid[2]; /* A mostly unique ID that is regenerated on each boot.
|
||||
As such it can be used to identify the local
|
||||
machine's current boot. MDBX uses such when open
|
||||
the database to determine whether rollback required
|
||||
to the last steady sync point or not. I.e. if current
|
||||
bootid is differ from the value within a database then
|
||||
the system was rebooted and all changes since last steady
|
||||
sync must be reverted for data integrity. Zeros mean that
|
||||
no relevant information is available from the system. */
|
||||
struct {
|
||||
/* A mostly unique ID that is regenerated on each boot. As such it can be
|
||||
used to identify the local machine's current boot. MDBX uses such when
|
||||
open the database to determine whether rollback required to the last
|
||||
steady sync point or not. I.e. if current bootid is differ from the value
|
||||
within a database then the system was rebooted and all changes since last
|
||||
steady sync must be reverted for data integrity. Zeros mean that no
|
||||
relevant information is available from the system. */
|
||||
struct {
|
||||
uint64_t l, h;
|
||||
} current, meta0, meta1, meta2;
|
||||
} mi_bootid;
|
||||
|
||||
uint64_t mi_unsync_volume; /* bytes not explicitly synchronized to disk */
|
||||
uint64_t mi_autosync_threshold; /* current auto-sync threshold, see
|
||||
mdbx_env_set_syncbytes(). */
|
||||
|
@ -799,10 +799,6 @@ typedef struct rthc_entry_t {
|
||||
#endif
|
||||
|
||||
static bin128_t bootid;
|
||||
static __inline bool bootid_match(const struct MDBX_lockinfo *const lck) {
|
||||
return (bootid.x | bootid.y) != 0 && lck && lck->mti_bootid.x == bootid.x &&
|
||||
lck->mti_bootid.y == bootid.y;
|
||||
}
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
static CRITICAL_SECTION rthc_critical_section;
|
||||
@ -2039,7 +2035,7 @@ static int __must_check_result mdbx_page_split(MDBX_cursor *mc,
|
||||
|
||||
static int __must_check_result mdbx_read_header(MDBX_env *env, MDBX_meta *meta,
|
||||
uint64_t *filesize,
|
||||
const bool accept_weak);
|
||||
const int lck_exclusive);
|
||||
static int __must_check_result mdbx_sync_locked(MDBX_env *env, unsigned flags,
|
||||
MDBX_meta *const pending);
|
||||
static int mdbx_env_close0(MDBX_env *env);
|
||||
@ -3177,11 +3173,23 @@ bailout:
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
static __inline bool meta_bootid_match(const MDBX_meta *meta) {
|
||||
return meta->mm_bootid.x == bootid.x && meta->mm_bootid.y == bootid.y &&
|
||||
(bootid.x | bootid.y) != 0;
|
||||
}
|
||||
|
||||
static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta,
|
||||
const int lck_exlusive) {
|
||||
return lck_exlusive ? /* exclusive lock */ meta_bootid_match(meta)
|
||||
: /* db already opened */ env->me_lck &&
|
||||
(env->me_lck->mti_envmode & MDBX_RDONLY) == 0;
|
||||
}
|
||||
|
||||
#define METAPAGE(env, n) page_meta(pgno2page(env, n))
|
||||
#define METAPAGE_END(env) METAPAGE(env, NUM_METAS)
|
||||
|
||||
static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta,
|
||||
bool allow_volatile) {
|
||||
const bool allow_volatile) {
|
||||
mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env));
|
||||
txnid_t a = safe64_read(&meta->mm_txnid_a);
|
||||
txnid_t b = safe64_read(&meta->mm_txnid_b);
|
||||
@ -3217,6 +3225,7 @@ static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta,
|
||||
mdbx_assert(env, meta->mm_txnid_b.inconsistent < txnid);
|
||||
(void)env;
|
||||
mdbx_jitter4testing(true);
|
||||
meta->mm_bootid = bootid;
|
||||
safe64_update(&meta->mm_txnid_b, txnid);
|
||||
}
|
||||
|
||||
@ -3226,6 +3235,7 @@ static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta,
|
||||
(void)env;
|
||||
/* update inconsistent since this function used ONLY for filling meta-image
|
||||
* for writing, but not the actual meta-page */
|
||||
meta->mm_bootid = bootid;
|
||||
meta->mm_txnid_a.inconsistent = txnid;
|
||||
meta->mm_txnid_b.inconsistent = txnid;
|
||||
}
|
||||
@ -6837,10 +6847,8 @@ int mdbx_txn_commit(MDBX_txn *txn) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (likely(env->me_lck)) {
|
||||
env->me_lck->mti_bootid = bootid;
|
||||
if (likely(env->me_lck))
|
||||
env->me_lck->mti_readers_refresh_flag = false;
|
||||
}
|
||||
end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE;
|
||||
|
||||
done:
|
||||
@ -7038,7 +7046,8 @@ static int __cold mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta,
|
||||
/* Read the environment parameters of a DB environment
|
||||
* before mapping it into memory. */
|
||||
static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest,
|
||||
uint64_t *filesize, const bool accept_weak) {
|
||||
uint64_t *filesize,
|
||||
const int lck_exclusive) {
|
||||
int rc = mdbx_filesize(env->me_fd, filesize);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
@ -7103,8 +7112,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest,
|
||||
if (rc != MDBX_SUCCESS)
|
||||
continue;
|
||||
|
||||
if (mdbx_meta_ot(accept_weak ? prefer_last : prefer_noweak, env, dest,
|
||||
meta)) {
|
||||
if (mdbx_meta_ot(prefer_noweak, env, dest, meta)) {
|
||||
*dest = *meta;
|
||||
if (META_IS_WEAK(dest))
|
||||
loop_limit += 1; /* LY: should re-read to hush race with update */
|
||||
@ -7112,7 +7120,8 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest,
|
||||
}
|
||||
}
|
||||
|
||||
if (dest->mm_psize == 0 || (META_IS_WEAK(dest) && !accept_weak)) {
|
||||
if (dest->mm_psize == 0 ||
|
||||
(META_IS_WEAK(dest) && !meta_weak_acceptable(env, dest, lck_exclusive))) {
|
||||
mdbx_error("%s", "no usable meta-pages, database is corrupted");
|
||||
return rc;
|
||||
}
|
||||
@ -7964,10 +7973,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
|
||||
uint64_t filesize_before;
|
||||
MDBX_meta meta;
|
||||
int rc = MDBX_RESULT_FALSE;
|
||||
int err = mdbx_read_header(
|
||||
env, &meta, &filesize_before,
|
||||
lck_rc ? bootid_match(env->me_lck)
|
||||
: (env->me_lck && !(env->me_lck->mti_envmode & MDBX_RDONLY)));
|
||||
int err = mdbx_read_header(env, &meta, &filesize_before, lck_rc);
|
||||
if (unlikely(err != MDBX_SUCCESS)) {
|
||||
if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA ||
|
||||
(env->me_flags & MDBX_RDONLY) != 0)
|
||||
@ -7998,7 +8004,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
|
||||
return err;
|
||||
|
||||
#ifndef NDEBUG /* just for checking */
|
||||
err = mdbx_read_header(env, &meta, &filesize_before, false);
|
||||
err = mdbx_read_header(env, &meta, &filesize_before, lck_rc);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
#endif
|
||||
@ -8152,58 +8158,11 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
|
||||
return err;
|
||||
#endif
|
||||
|
||||
*env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes);
|
||||
if (used_aligned2os_bytes < env->me_dxb_mmap.current) {
|
||||
#if defined(MADV_REMOVE)
|
||||
if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0) {
|
||||
mdbx_notice("open-MADV_%s %u..%u", "REMOVE", *env->me_discarded_tail,
|
||||
bytes2pgno(env, env->me_dxb_mmap.current));
|
||||
err =
|
||||
madvise(env->me_map + used_aligned2os_bytes,
|
||||
env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE)
|
||||
? ignore_enosys(errno)
|
||||
: MDBX_SUCCESS;
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
}
|
||||
#endif /* MADV_REMOVE */
|
||||
#if defined(MADV_DONTNEED)
|
||||
mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", *env->me_discarded_tail,
|
||||
bytes2pgno(env, env->me_dxb_mmap.current));
|
||||
err =
|
||||
madvise(env->me_map + used_aligned2os_bytes,
|
||||
env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED)
|
||||
? ignore_enosys(errno)
|
||||
: MDBX_SUCCESS;
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
#elif defined(POSIX_MADV_DONTNEED)
|
||||
err = ignore_enosys(posix_madvise(
|
||||
env->me_map + used_aligned2os_bytes,
|
||||
env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED));
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
#elif defined(POSIX_FADV_DONTNEED)
|
||||
err = ignore_enosys(posix_fadvise(
|
||||
env->me_fd, used_aligned2os_bytes,
|
||||
env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_FADV_DONTNEED));
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
#endif /* MADV_DONTNEED */
|
||||
}
|
||||
|
||||
#ifdef MDBX_USE_VALGRIND
|
||||
env->me_valgrind_handle =
|
||||
VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx");
|
||||
#endif
|
||||
|
||||
const bool readahead = (env->me_flags & MDBX_NORDAHEAD) == 0 &&
|
||||
mdbx_is_readahead_reasonable(env->me_dxb_mmap.current,
|
||||
0) == MDBX_RESULT_TRUE;
|
||||
err = mdbx_set_readahead(env, 0, used_bytes, readahead);
|
||||
if (err != MDBX_SUCCESS && lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE)
|
||||
return err;
|
||||
|
||||
mdbx_assert(env, used_bytes >= pgno2bytes(env, NUM_METAS) &&
|
||||
used_bytes <= env->me_dxb_mmap.limit);
|
||||
#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
|
||||
@ -8227,7 +8186,9 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
|
||||
while (1) {
|
||||
MDBX_meta *const head = mdbx_meta_head(env);
|
||||
const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head);
|
||||
if (head_txnid == meta.mm_txnid_a.inconsistent)
|
||||
MDBX_meta *const steady = mdbx_meta_steady(env);
|
||||
const txnid_t steady_txnid = mdbx_meta_txnid_fluid(env, steady);
|
||||
if (head_txnid == steady_txnid)
|
||||
break;
|
||||
|
||||
if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) {
|
||||
@ -8235,11 +8196,11 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
|
||||
if (env->me_flags & MDBX_RDONLY) {
|
||||
mdbx_error("rollback needed: (from head %" PRIaTXN
|
||||
" to steady %" PRIaTXN "), but unable in read-only mode",
|
||||
head_txnid, meta.mm_txnid_a.inconsistent);
|
||||
head_txnid, steady_txnid);
|
||||
return MDBX_WANNA_RECOVERY /* LY: could not recovery/rollback */;
|
||||
}
|
||||
|
||||
if (bootid_match(env->me_lck)) {
|
||||
if (meta_bootid_match(head)) {
|
||||
MDBX_meta clone = *head;
|
||||
uint64_t filesize = env->me_dbgeo.now;
|
||||
err = mdbx_validate_meta(
|
||||
@ -8252,6 +8213,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
|
||||
"rollback NOT needed",
|
||||
bootid.x, bootid.y);
|
||||
meta = clone;
|
||||
*env->me_unsynced_pages = meta.mm_geo.next;
|
||||
break;
|
||||
}
|
||||
mdbx_notice("opening after an unclean shutdown, "
|
||||
@ -8269,15 +8231,15 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
|
||||
(head != meta1 && mdbx_meta_txnid_fluid(env, meta1) == undo_txnid) ||
|
||||
(head != meta2 && mdbx_meta_txnid_fluid(env, meta2) == undo_txnid))
|
||||
undo_txnid = safe64_txnid_next(undo_txnid);
|
||||
if (unlikely(undo_txnid >= meta.mm_txnid_a.inconsistent)) {
|
||||
if (unlikely(undo_txnid >= steady_txnid)) {
|
||||
mdbx_fatal("rollback failed: no suitable txnid (0,1,2) < %" PRIaTXN,
|
||||
meta.mm_txnid_a.inconsistent);
|
||||
steady_txnid);
|
||||
return MDBX_PANIC /* LY: could not recovery/rollback */;
|
||||
}
|
||||
|
||||
/* LY: rollback weak checkpoint */
|
||||
mdbx_trace("rollback: from %" PRIaTXN ", to %" PRIaTXN " as %" PRIaTXN,
|
||||
head_txnid, meta.mm_txnid_a.inconsistent, undo_txnid);
|
||||
head_txnid, steady_txnid, undo_txnid);
|
||||
mdbx_ensure(env, head_txnid == mdbx_meta_txnid_stable(env, head));
|
||||
|
||||
if (env->me_flags & MDBX_WRITEMAP) {
|
||||
@ -8301,7 +8263,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
|
||||
if (err) {
|
||||
mdbx_error("error %d rollback from %" PRIaTXN ", to %" PRIaTXN
|
||||
" as %" PRIaTXN,
|
||||
err, head_txnid, meta.mm_txnid_a.inconsistent, undo_txnid);
|
||||
err, head_txnid, steady_txnid, undo_txnid);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -8374,6 +8336,53 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
|
||||
}
|
||||
}
|
||||
|
||||
*env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes);
|
||||
if (used_aligned2os_bytes < env->me_dxb_mmap.current) {
|
||||
#if defined(MADV_REMOVE)
|
||||
if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0) {
|
||||
mdbx_notice("open-MADV_%s %u..%u", "REMOVE", *env->me_discarded_tail,
|
||||
bytes2pgno(env, env->me_dxb_mmap.current));
|
||||
err =
|
||||
madvise(env->me_map + used_aligned2os_bytes,
|
||||
env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE)
|
||||
? ignore_enosys(errno)
|
||||
: MDBX_SUCCESS;
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
}
|
||||
#endif /* MADV_REMOVE */
|
||||
#if defined(MADV_DONTNEED)
|
||||
mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", *env->me_discarded_tail,
|
||||
bytes2pgno(env, env->me_dxb_mmap.current));
|
||||
err =
|
||||
madvise(env->me_map + used_aligned2os_bytes,
|
||||
env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED)
|
||||
? ignore_enosys(errno)
|
||||
: MDBX_SUCCESS;
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
#elif defined(POSIX_MADV_DONTNEED)
|
||||
err = ignore_enosys(posix_madvise(
|
||||
env->me_map + used_aligned2os_bytes,
|
||||
env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED));
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
#elif defined(POSIX_FADV_DONTNEED)
|
||||
err = ignore_enosys(posix_fadvise(
|
||||
env->me_fd, used_aligned2os_bytes,
|
||||
env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_FADV_DONTNEED));
|
||||
if (unlikely(MDBX_IS_ERROR(err)))
|
||||
return err;
|
||||
#endif /* MADV_DONTNEED */
|
||||
}
|
||||
|
||||
const bool readahead = (env->me_flags & MDBX_NORDAHEAD) == 0 &&
|
||||
mdbx_is_readahead_reasonable(env->me_dxb_mmap.current,
|
||||
0) == MDBX_RESULT_TRUE;
|
||||
err = mdbx_set_readahead(env, 0, used_bytes, readahead);
|
||||
if (err != MDBX_SUCCESS && lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE)
|
||||
return err;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -8522,12 +8531,10 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
|
||||
struct MDBX_lockinfo *const lck = env->me_lck;
|
||||
if (lck_seize_rc == MDBX_RESULT_TRUE) {
|
||||
/* LY: exlcusive mode, check and reset lck content */
|
||||
const bin128_t save_bootid = lck->mti_bootid;
|
||||
memset(lck, 0, (size_t)size);
|
||||
mdbx_jitter4testing(false);
|
||||
lck->mti_magic_and_version = MDBX_LOCK_MAGIC;
|
||||
lck->mti_os_and_format = MDBX_LOCK_FORMAT;
|
||||
lck->mti_bootid = save_bootid;
|
||||
} else {
|
||||
if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) {
|
||||
mdbx_error("%s", "lock region has invalid magic/version");
|
||||
@ -14402,6 +14409,14 @@ int __cold mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn,
|
||||
arg->mi_meta1_sign = meta1->mm_datasync_sign;
|
||||
arg->mi_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2);
|
||||
arg->mi_meta2_sign = meta2->mm_datasync_sign;
|
||||
if (likely(bytes > size_before_bootid)) {
|
||||
arg->mi_bootid.meta0.l = meta0->mm_bootid.x;
|
||||
arg->mi_bootid.meta1.l = meta0->mm_bootid.x;
|
||||
arg->mi_bootid.meta2.l = meta0->mm_bootid.x;
|
||||
arg->mi_bootid.meta0.h = meta0->mm_bootid.y;
|
||||
arg->mi_bootid.meta1.h = meta0->mm_bootid.y;
|
||||
arg->mi_bootid.meta2.h = meta0->mm_bootid.y;
|
||||
}
|
||||
|
||||
const MDBX_meta *txn_meta = recent_meta;
|
||||
arg->mi_last_pgno = txn_meta->mm_geo.next - 1;
|
||||
@ -14455,8 +14470,8 @@ int __cold mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn,
|
||||
arg->mi_autosync_threshold = pgno2bytes(env, *env->me_autosync_threshold);
|
||||
arg->mi_autosync_period_seconds16dot16 =
|
||||
mdbx_osal_monotime_to_16dot16(*env->me_autosync_period);
|
||||
arg->mi_bootid[0] = lck ? lck->mti_bootid.x : 0;
|
||||
arg->mi_bootid[1] = lck ? lck->mti_bootid.y : 0;
|
||||
arg->mi_bootid.current.l = bootid.x;
|
||||
arg->mi_bootid.current.h = bootid.y;
|
||||
arg->mi_mode = lck ? lck->mti_envmode : env->me_flags;
|
||||
}
|
||||
|
||||
|
@ -319,6 +319,14 @@ typedef struct MDBX_meta {
|
||||
* This value in couple with mr_snapshot_pages_retired allows fast estimation
|
||||
* of "how much reader is restraining GC recycling". */
|
||||
uint64_t mm_pages_retired;
|
||||
|
||||
/* The analogue /proc/sys/kernel/random/boot_id or similar to determine
|
||||
* whether the system was rebooted after the last use of the database files.
|
||||
* If there was no reboot, but there is no need to rollback to the last
|
||||
* steady sync point. Zeros mean that no relevant information is available
|
||||
* from the system. */
|
||||
bin128_t mm_bootid;
|
||||
|
||||
} MDBX_meta;
|
||||
|
||||
/* Common header for all page types. The page type depends on mp_flags.
|
||||
@ -507,13 +515,6 @@ typedef struct MDBX_lockinfo {
|
||||
/* Marker to distinguish uniqueness of DB/CLK.*/
|
||||
volatile uint64_t mti_bait_uniqueness;
|
||||
|
||||
/* The analogue /proc/sys/kernel/random/boot_id or similar to determine
|
||||
* whether the system was rebooted after the last use of the database files.
|
||||
* If there was no reboot, but there is no need to rollback to the last
|
||||
* steady sync point. Zeros mean that no relevant information is available
|
||||
* from the system. */
|
||||
volatile bin128_t mti_bootid;
|
||||
|
||||
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
|
||||
|
||||
/* Write transation lock. */
|
||||
|
Loading…
x
Reference in New Issue
Block a user