mdbx: add env_open_for_recovery() (squashed).

Change-Id: I0151b21610def433745c33d1f6e0b66ce655d1a9
This commit is contained in:
Leonid Yuriev 2020-09-19 00:18:55 +03:00
parent b321f67ed2
commit d9daf2944d
3 changed files with 126 additions and 54 deletions

9
mdbx.h
View File

@ -4232,6 +4232,15 @@ typedef int MDBX_pgvisitor_func(
/** \brief B-tree traversal function. */ /** \brief B-tree traversal function. */
LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor,
void *ctx, bool dont_check_keys_ordering); void *ctx, bool dont_check_keys_ordering);
/** \brief Open an environment instance using specific meta-page
* for checking and recovery.
*
* This function mostly of internal API for `mdbx_chk` utility. */
LIBMDBX_API int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname,
unsigned target_meta,
bool writeable);
/** @} B-tree Traversal */ /** @} B-tree Traversal */
/**** Attribute support functions for Nexenta /**** Attribute support functions for Nexenta

View File

@ -6163,42 +6163,65 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
} }
/* Seek & fetch the last meta */ /* Seek & fetch the last meta */
while (1) { if (likely(/* not recovery mode */ env->me_stuck_meta < 0)) {
MDBX_meta *const meta = mdbx_meta_head(env); while (1) {
mdbx_jitter4testing(false); MDBX_meta *const meta = mdbx_meta_head(env);
const txnid_t snap = mdbx_meta_txnid_fluid(env, meta); mdbx_jitter4testing(false);
mdbx_jitter4testing(false); const txnid_t snap = mdbx_meta_txnid_fluid(env, meta);
mdbx_jitter4testing(false);
if (likely(r)) {
safe64_reset(&r->mr_txnid, false);
r->mr_snapshot_pages_used = meta->mm_geo.next;
r->mr_snapshot_pages_retired = meta->mm_pages_retired;
safe64_write(&r->mr_txnid, snap);
mdbx_jitter4testing(false);
mdbx_assert(env, r->mr_pid == mdbx_getpid());
mdbx_assert(
env, r->mr_tid ==
((env->me_flags & MDBX_NOTLS) ? 0 : mdbx_thread_self()));
mdbx_assert(env, r->mr_txnid.inconsistent == snap);
mdbx_compiler_barrier();
env->me_lck->mti_readers_refresh_flag = true;
mdbx_flush_incoherent_cpu_writeback();
}
mdbx_jitter4testing(true);
/* Snap the state from current meta-head */
txn->mt_txnid = snap;
txn->mt_geo = meta->mm_geo;
memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db));
txn->mt_canary = meta->mm_canary;
/* LY: Retry on a race, ITS#7970. */
mdbx_compiler_barrier();
if (likely(meta == mdbx_meta_head(env) &&
snap == mdbx_meta_txnid_fluid(env, meta) &&
snap >= *env->me_oldest)) {
mdbx_jitter4testing(false);
break;
}
}
} else {
/* recovery mode */
MDBX_meta *const meta = METAPAGE(env, env->me_stuck_meta);
txn->mt_txnid = mdbx_meta_txnid_stable(env, meta);
txn->mt_geo = meta->mm_geo;
memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db));
txn->mt_canary = meta->mm_canary;
if (likely(r)) { if (likely(r)) {
safe64_reset(&r->mr_txnid, false);
r->mr_snapshot_pages_used = meta->mm_geo.next; r->mr_snapshot_pages_used = meta->mm_geo.next;
r->mr_snapshot_pages_retired = meta->mm_pages_retired; r->mr_snapshot_pages_retired = meta->mm_pages_retired;
safe64_write(&r->mr_txnid, snap); r->mr_txnid.inconsistent = txn->mt_txnid;
mdbx_jitter4testing(false); mdbx_jitter4testing(false);
mdbx_assert(env, r->mr_pid == mdbx_getpid()); mdbx_assert(env, r->mr_pid == mdbx_getpid());
mdbx_assert( mdbx_assert(
env, r->mr_tid == env, r->mr_tid ==
((env->me_flags & MDBX_NOTLS) ? 0 : mdbx_thread_self())); ((env->me_flags & MDBX_NOTLS) ? 0 : mdbx_thread_self()));
mdbx_assert(env, r->mr_txnid.inconsistent == snap); mdbx_assert(env, r->mr_txnid.inconsistent == txn->mt_txnid);
mdbx_compiler_barrier(); mdbx_compiler_barrier();
env->me_lck->mti_readers_refresh_flag = true; env->me_lck->mti_readers_refresh_flag = true;
mdbx_flush_incoherent_cpu_writeback(); mdbx_flush_incoherent_cpu_writeback();
} }
mdbx_jitter4testing(true);
/* Snap the state from current meta-head */
txn->mt_txnid = snap;
txn->mt_geo = meta->mm_geo;
memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db));
txn->mt_canary = meta->mm_canary;
/* LY: Retry on a race, ITS#7970. */
mdbx_compiler_barrier();
if (likely(meta == mdbx_meta_head(env) &&
snap == mdbx_meta_txnid_fluid(env, meta) &&
snap >= *env->me_oldest)) {
mdbx_jitter4testing(false);
break;
}
} }
if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) { if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) {
@ -6214,7 +6237,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
} else { } else {
mdbx_assert(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | mdbx_assert(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS |
MDBX_WRITEMAP)) == 0); MDBX_WRITEMAP)) == 0);
if (unlikely(txn->mt_owner == tid)) if (unlikely(txn->mt_owner == tid ||
/* not recovery mode */ env->me_stuck_meta >= 0))
return MDBX_BUSY; return MDBX_BUSY;
MDBX_lockinfo *const lck = env->me_lck; MDBX_lockinfo *const lck = env->me_lck;
if (lck && (env->me_flags & MDBX_NOTLS) == 0 && if (lck && (env->me_flags & MDBX_NOTLS) == 0 &&
@ -8578,7 +8602,9 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest,
if (rc != MDBX_SUCCESS) if (rc != MDBX_SUCCESS)
continue; continue;
if (mdbx_meta_ot(prefer_steady, env, dest, meta)) { if ((env->me_stuck_meta < 0)
? mdbx_meta_ot(prefer_steady, env, dest, meta)
: (meta_number == (unsigned)env->me_stuck_meta)) {
*dest = *meta; *dest = *meta;
if (!META_IS_STEADY(dest)) if (!META_IS_STEADY(dest))
loop_limit += 1; /* LY: should re-read to hush race with update */ loop_limit += 1; /* LY: should re-read to hush race with update */
@ -8587,8 +8613,10 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest,
} }
if (dest->mm_psize == 0 || if (dest->mm_psize == 0 ||
(!META_IS_STEADY(dest) && ((env->me_stuck_meta < 0)
!meta_weak_acceptable(env, dest, lck_exclusive))) { ? (!META_IS_STEADY(dest) &&
!meta_weak_acceptable(env, dest, lck_exclusive))
: false)) {
mdbx_error("%s", "no usable meta-pages, database is corrupted"); mdbx_error("%s", "no usable meta-pages, database is corrupted");
if (rc == MDBX_SUCCESS) { if (rc == MDBX_SUCCESS) {
/* TODO: try to restore the database by fully checking b-tree structure /* TODO: try to restore the database by fully checking b-tree structure
@ -9001,6 +9029,7 @@ int __cold mdbx_env_create(MDBX_env **penv) {
env->me_dsync_fd = INVALID_HANDLE_VALUE; env->me_dsync_fd = INVALID_HANDLE_VALUE;
env->me_lfd = INVALID_HANDLE_VALUE; env->me_lfd = INVALID_HANDLE_VALUE;
env->me_pid = mdbx_getpid(); env->me_pid = mdbx_getpid();
env->me_stuck_meta = -1;
int rc; int rc;
const size_t os_psize = mdbx_syspagesize(); const size_t os_psize = mdbx_syspagesize();
@ -9450,7 +9479,8 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
int err = mdbx_read_header(env, &meta, &filesize_before, lck_rc); int err = mdbx_read_header(env, &meta, &filesize_before, lck_rc);
if (unlikely(err != MDBX_SUCCESS)) { if (unlikely(err != MDBX_SUCCESS)) {
if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA ||
(env->me_flags & MDBX_RDONLY) != 0) (env->me_flags & MDBX_RDONLY) != 0 ||
/* recovery mode */ env->me_stuck_meta >= 0)
return err; return err;
mdbx_debug("%s", "create new database"); mdbx_debug("%s", "create new database");
@ -9497,7 +9527,8 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
const size_t used_aligned2os_bytes = const size_t used_aligned2os_bytes =
ceil_powerof2(used_bytes, env->me_os_psize); ceil_powerof2(used_bytes, env->me_os_psize);
if ((env->me_flags & MDBX_RDONLY) /* readonly */ if ((env->me_flags & MDBX_RDONLY) /* readonly */
|| lck_rc != MDBX_RESULT_TRUE /* not exclusive */) { || lck_rc != MDBX_RESULT_TRUE /* not exclusive */
|| /* recovery mode */ env->me_stuck_meta >= 0) {
/* use present params from db */ /* use present params from db */
const size_t pagesize = meta.mm_psize; const size_t pagesize = meta.mm_psize;
err = mdbx_env_set_geometry( err = mdbx_env_set_geometry(
@ -9505,7 +9536,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
meta.mm_geo.upper * pagesize, meta.mm_geo.grow * pagesize, meta.mm_geo.upper * pagesize, meta.mm_geo.grow * pagesize,
meta.mm_geo.shrink * pagesize, meta.mm_psize); meta.mm_geo.shrink * pagesize, meta.mm_psize);
if (unlikely(err != MDBX_SUCCESS)) { if (unlikely(err != MDBX_SUCCESS)) {
mdbx_error("%s: err %d", "could not apply preconfigured db-geometry", mdbx_error("%s: err %d", "could not apply preconfigured geometry from db",
err); err);
return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err; return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err;
} }
@ -9651,12 +9682,17 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
const unsigned meta_clash_mask = mdbx_meta_eq_mask(env); const unsigned meta_clash_mask = mdbx_meta_eq_mask(env);
if (meta_clash_mask) { if (unlikely(meta_clash_mask)) {
mdbx_error("meta-pages are clashed: mask 0x%d", meta_clash_mask); if (/* not recovery mode */ env->me_stuck_meta < 0) {
return MDBX_CORRUPTED; mdbx_error("meta-pages are clashed: mask 0x%d", meta_clash_mask);
return MDBX_CORRUPTED;
} else {
mdbx_warning("ignore meta-pages clashing (mask 0x%d) in recovery mode",
meta_clash_mask);
}
} }
while (1) { while (likely(/* not recovery mode */ env->me_stuck_meta < 0)) {
MDBX_meta *const head = mdbx_meta_head(env); MDBX_meta *const head = mdbx_meta_head(env);
const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head); const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head);
MDBX_meta *const steady = mdbx_meta_steady(env); MDBX_meta *const steady = mdbx_meta_steady(env);
@ -9771,21 +9807,22 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
} }
if (env->me_dxb_mmap.current != env->me_dbgeo.now) { if (env->me_dxb_mmap.current != env->me_dbgeo.now) {
meta.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); meta.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current);
mdbx_notice("update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO mdbx_notice("need update meta-geo to filesize %" PRIuPTR
" pages", " bytes, %" PRIaPGNO " pages",
env->me_dxb_mmap.current, meta.mm_geo.now); env->me_dxb_mmap.current, meta.mm_geo.now);
} }
if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) {
if (env->me_flags & MDBX_RDONLY) { if ((env->me_flags & MDBX_RDONLY) != 0 ||
mdbx_warning( /* recovery mode */ env->me_stuck_meta >= 0) {
"skipped update meta.geo in read-only mode: from l%" PRIaPGNO mdbx_warning("skipped update meta.geo in %s mode: from l%" PRIaPGNO
"-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO
"-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u",
head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, (env->me_stuck_meta < 0) ? "read-only" : "recovery",
head->mm_geo.shrink, head->mm_geo.grow, meta.mm_geo.lower, head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper,
meta.mm_geo.now, meta.mm_geo.upper, meta.mm_geo.shrink, head->mm_geo.shrink, head->mm_geo.grow, meta.mm_geo.lower,
meta.mm_geo.grow); meta.mm_geo.now, meta.mm_geo.upper, meta.mm_geo.shrink,
meta.mm_geo.grow);
} else { } else {
const txnid_t txnid = mdbx_meta_txnid_stable(env, head); const txnid_t txnid = mdbx_meta_txnid_stable(env, head);
const txnid_t next_txnid = safe64_txnid_next(txnid); const txnid_t next_txnid = safe64_txnid_next(txnid);
@ -9822,8 +9859,10 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
*env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes); *env->me_discarded_tail = bytes2pgno(env, used_aligned2os_bytes);
if (used_aligned2os_bytes < env->me_dxb_mmap.current) { if (used_aligned2os_bytes < env->me_dxb_mmap.current) {
#if defined(MADV_REMOVE) #if defined(MADV_REMOVE)
if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0) { if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0 &&
mdbx_notice("open-MADV_%s %u..%u", "REMOVE", *env->me_discarded_tail, /* not recovery mode */ env->me_stuck_meta < 0) {
mdbx_notice("open-MADV_%s %u..%u", "REMOVE (deallocate file space)",
*env->me_discarded_tail,
bytes2pgno(env, env->me_dxb_mmap.current)); bytes2pgno(env, env->me_dxb_mmap.current));
err = err =
madvise(env->me_map + used_aligned2os_bytes, madvise(env->me_map + used_aligned2os_bytes,
@ -10207,7 +10246,21 @@ static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) {
return r; return r;
} }
int __cold mdbx_env_open(MDBX_env *env, const char *pathname, __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname,
unsigned target_meta, bool writeable) {
if (unlikely(target_meta >= NUM_METAS))
return MDBX_EINVAL;
int rc = check_env(env);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
env->me_stuck_meta = (int8_t)target_meta;
return mdbx_env_open(
env, pathname, writeable ? MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY,
0);
}
__cold int mdbx_env_open(MDBX_env *env, const char *pathname,
MDBX_env_flags_t flags, mdbx_mode_t mode) { MDBX_env_flags_t flags, mdbx_mode_t mode) {
int rc = check_env(env); int rc = check_env(env);
if (unlikely(rc != MDBX_SUCCESS)) if (unlikely(rc != MDBX_SUCCESS))
@ -10434,6 +10487,14 @@ int __cold mdbx_env_open(MDBX_env *env, const char *pathname,
goto bailout; goto bailout;
} }
if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) &&
(lck_rc != /* exclusive */ MDBX_RESULT_TRUE ||
(flags & MDBX_EXCLUSIVE) == 0)) {
mdbx_error("%s", "recovery requires exclusive mode");
rc = MDBX_BUSY;
goto bailout;
}
mdbx_debug("opened dbenv %p", (void *)env); mdbx_debug("opened dbenv %p", (void *)env);
if (env->me_lck) { if (env->me_lck) {
if (lck_rc == MDBX_RESULT_TRUE) { if (lck_rc == MDBX_RESULT_TRUE) {
@ -10523,6 +10584,7 @@ bailout:
/* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */
static int __cold mdbx_env_close0(MDBX_env *env) { static int __cold mdbx_env_close0(MDBX_env *env) {
env->me_stuck_meta = -1;
if (!(env->me_flags & MDBX_ENV_ACTIVE)) { if (!(env->me_flags & MDBX_ENV_ACTIVE)) {
mdbx_ensure(env, env->me_lcklist_next == nullptr); mdbx_ensure(env, env->me_lcklist_next == nullptr);
return MDBX_SUCCESS; return MDBX_SUCCESS;

View File

@ -929,17 +929,18 @@ struct MDBX_env {
#define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000)
#define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY)
uint32_t me_flags; uint32_t me_flags;
mdbx_mmap_t me_dxb_mmap; /* The main data file */ mdbx_mmap_t me_dxb_mmap; /* The main data file */
#define me_map me_dxb_mmap.dxb #define me_map me_dxb_mmap.dxb
#define me_lazy_fd me_dxb_mmap.fd #define me_lazy_fd me_dxb_mmap.fd
mdbx_filehandle_t me_dsync_fd; mdbx_filehandle_t me_dsync_fd;
mdbx_mmap_t me_lck_mmap; /* The lock file */ mdbx_mmap_t me_lck_mmap; /* The lock file */
#define me_lfd me_lck_mmap.fd #define me_lfd me_lck_mmap.fd
#define me_lck me_lck_mmap.lck #define me_lck me_lck_mmap.lck
unsigned me_psize; /* DB page size, inited from me_os_psize */ unsigned me_psize; /* DB page size, inited from me_os_psize */
unsigned me_psize2log; /* log2 of DB page size */ uint8_t me_psize2log; /* log2 of DB page size */
unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */
unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */
unsigned me_maxreaders; /* size of the reader table */ unsigned me_maxreaders; /* size of the reader table */
mdbx_fastmutex_t me_dbi_lock; mdbx_fastmutex_t me_dbi_lock;
MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_dbi me_numdbs; /* number of DBs opened */