mdbx: refine rollback while opening weak/invalid DB.

More for https://github.com/erthink/libmdbx/issues/217
This commit is contained in:
Leonid Yuriev 2021-07-09 17:40:15 +03:00
parent 8bdee27248
commit 108398c213

View File

@ -11637,6 +11637,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc,
env->me_poison_edge = bytes2pgno(env, env->me_dxb_mmap.limit); env->me_poison_edge = bytes2pgno(env, env->me_dxb_mmap.limit);
#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */
//----------------------------------------- validate head & steady meta-pages
if (unlikely(env->me_stuck_meta >= 0)) { if (unlikely(env->me_stuck_meta >= 0)) {
/* recovery mode */ /* recovery mode */
MDBX_meta clone; MDBX_meta clone;
@ -11656,7 +11657,8 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc,
} }
if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) {
/* non-exclusive mode */ /* non-exclusive mode,
* meta-pages should already have been validated by the first process that opened the DB */
MDBX_meta *const head = mdbx_meta_head(env); MDBX_meta *const head = mdbx_meta_head(env);
MDBX_meta *const steady = mdbx_meta_steady(env); MDBX_meta *const steady = mdbx_meta_steady(env);
const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head); const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head);
@ -11679,129 +11681,77 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc,
mdbx_assert(env, lck_rc == MDBX_RESULT_TRUE); mdbx_assert(env, lck_rc == MDBX_RESULT_TRUE);
/* exclusive mode */ /* exclusive mode */
MDBX_meta head_clone; MDBX_meta const *const steady = mdbx_meta_steady(env);
MDBX_meta const *const head = mdbx_meta_head(env); const txnid_t steady_txnid = mdbx_meta_txnid_fluid(env, steady);
err = mdbx_validate_meta_copy(env, head, &head_clone); MDBX_meta steady_clone;
err = mdbx_validate_meta_copy(env, steady, &steady_clone);
if (unlikely(err != MDBX_SUCCESS)) { if (unlikely(err != MDBX_SUCCESS)) {
mdbx_error("meta[%u] with %s txnid is corrupted", mdbx_error("meta[%u] with %s txnid %" PRIaTXN
bytes2pgno(env, (uint8_t *)data_page(head) - env->me_map), " is corrupted, %s needed",
"last"); bytes2pgno(env, (uint8_t *)steady - env->me_map), "steady",
steady_txnid, "manual recovery");
return MDBX_CORRUPTED; return MDBX_CORRUPTED;
} }
MDBX_meta steady_clone; MDBX_meta const *const head = mdbx_meta_head(env);
MDBX_meta const *const steady = mdbx_meta_steady(env);
if (steady == head) if (steady == head)
break; break;
err = mdbx_validate_meta_copy(env, steady, &steady_clone); const pgno_t pgno = bytes2pgno(env, (uint8_t *)head - env->me_map);
if (unlikely(err != MDBX_SUCCESS)) { const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head);
mdbx_error("meta[%u] with %s txnid is corrupted", MDBX_meta head_clone;
bytes2pgno(env, (uint8_t *)data_page(steady) - env->me_map), const bool head_valid =
"steady"); mdbx_validate_meta_copy(env, head, &head_clone) == MDBX_SUCCESS;
return MDBX_CORRUPTED; if (unlikely(!head_valid)) {
mdbx_error("meta[%u] with %s txnid %" PRIaTXN
" is corrupted, %s needed",
pgno, "last", head_txnid, "rollback");
goto purge_meta_head;
} }
const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head);
const txnid_t steady_txnid = mdbx_meta_txnid_fluid(env, steady);
mdbx_assert(env, head_txnid != head_txnid); mdbx_assert(env, head_txnid != head_txnid);
if (head_txnid == steady_txnid) if (head_txnid == steady_txnid)
break; break;
mdbx_assert(env, META_IS_STEADY(steady) && !META_IS_STEADY(head)); mdbx_assert(env, META_IS_STEADY(steady) && !META_IS_STEADY(head));
if (meta_bootid_match(head)) { if (meta_bootid_match(head)) {
MDBX_meta clone = *head; mdbx_warning(
err = mdbx_validate_meta( "opening after an unclean shutdown, but boot-id(%016" PRIx64
env, &clone, data_page(head), "-%016" PRIx64
bytes2pgno(env, (uint8_t *)data_page(head) - env->me_map), nullptr); ") is MATCH: rollback NOT needed, steady-sync NEEDED%s",
if (err == MDBX_SUCCESS) { bootid.x, bootid.y,
mdbx_warning( (env->me_flags & MDBX_RDONLY) ? ", but unable in read-only mode"
"opening after an unclean shutdown, but boot-id(%016" PRIx64 : "");
"-%016" PRIx64 if (env->me_flags & MDBX_RDONLY)
") is MATCH: rollback NOT needed, steady-sync NEEDED%s", return MDBX_WANNA_RECOVERY;
bootid.x, bootid.y, meta = head_clone;
(env->me_flags & MDBX_RDONLY) ? ", but unable in read-only mode" atomic_store32(&env->me_lck->mti_unsynced_pages, meta.mm_geo.next,
: ""); mo_Relaxed);
if (env->me_flags & MDBX_RDONLY) break;
return MDBX_WANNA_RECOVERY /* LY: could not recovery/sync */;
meta = clone;
atomic_store32(&env->me_lck->mti_unsynced_pages, meta.mm_geo.next,
mo_Relaxed);
break;
}
mdbx_warning("opening after an unclean shutdown, "
"but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH, "
"but last meta not valid, rollback needed",
bootid.x, bootid.y);
} }
if (env->me_flags & MDBX_RDONLY) { if (env->me_flags & MDBX_RDONLY) {
mdbx_error("rollback needed: (from head %" PRIaTXN mdbx_error("rollback needed: (from head %" PRIaTXN
" to steady %" PRIaTXN "), but unable in read-only mode", " to steady %" PRIaTXN "), but unable in read-only mode",
head_txnid, steady_txnid); head_txnid, steady_txnid);
return MDBX_WANNA_RECOVERY /* LY: could not recovery/rollback */; return MDBX_WANNA_RECOVERY;
} }
const MDBX_meta *const meta0 = METAPAGE(env, 0); purge_meta_head:
const MDBX_meta *const meta1 = METAPAGE(env, 1); mdbx_notice("rollback: purge%s meta[%u] with%s txnid %" PRIaTXN,
const MDBX_meta *const meta2 = METAPAGE(env, 2); head_valid ? "" : " invalid", pgno, head_valid ? " weak" : "",
txnid_t undo_txnid = 0 /* zero means undo is unneeded */; head_txnid);
while ( err = mdbx_override_meta(env, pgno, 0, head_valid ? head : steady);
(head != meta0 && mdbx_meta_txnid_fluid(env, meta0) == undo_txnid) ||
(head != meta1 && mdbx_meta_txnid_fluid(env, meta1) == undo_txnid) ||
(head != meta2 && mdbx_meta_txnid_fluid(env, meta2) == undo_txnid))
undo_txnid = safe64_txnid_next(undo_txnid);
if (unlikely(undo_txnid >= steady_txnid)) {
mdbx_fatal("rollback failed: no suitable txnid (0,1,2) < %" PRIaTXN,
steady_txnid);
return MDBX_PANIC /* LY: could not recovery/rollback */;
}
/* LY: rollback weak checkpoint */
mdbx_notice("rollback: from %" PRIaTXN ", to %" PRIaTXN " as %" PRIaTXN,
head_txnid, steady_txnid, undo_txnid);
mdbx_ensure(env, head_txnid == mdbx_meta_txnid_stable(env, head));
#if MDBX_ENABLE_PGOP_STAT
safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1);
#endif /* MDBX_ENABLE_PGOP_STAT */
if (env->me_flags & MDBX_WRITEMAP) {
/* It is possible to update txnid without safe64_write(),
* since DB opened exclusive for now */
unaligned_poke_u64(4, (MDBX_meta *)head->mm_txnid_a, undo_txnid);
unaligned_poke_u64(4, (MDBX_meta *)head->mm_datasync_sign,
MDBX_DATASIGN_WEAK);
unaligned_poke_u64(4, (MDBX_meta *)head->mm_txnid_b, undo_txnid);
const size_t offset = (uint8_t *)data_page(head) - env->me_dxb_mmap.dxb;
const size_t paged_offset = floor_powerof2(offset, env->me_os_psize);
const size_t paged_length = ceil_powerof2(
env->me_psize + offset - paged_offset, env->me_os_psize);
err = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length,
MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
} else {
MDBX_meta rollback = *head;
mdbx_meta_set_txnid(env, &rollback, undo_txnid);
unaligned_poke_u64(4, rollback.mm_datasync_sign, MDBX_DATASIGN_WEAK);
const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
? env->me_dsync_fd
: env->me_lazy_fd;
err = mdbx_pwrite(fd, &rollback, sizeof(MDBX_meta),
(uint8_t *)head - (uint8_t *)env->me_map);
if (err == MDBX_SUCCESS && fd == env->me_lazy_fd)
err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
}
if (err) { if (err) {
mdbx_error("error %d rollback from %" PRIaTXN ", to %" PRIaTXN mdbx_error("rollback: overwrite meta[%u] with txnid %" PRIaTXN
" as %" PRIaTXN, ", error %d",
err, head_txnid, steady_txnid, undo_txnid); pgno, head_txnid, err);
return err; return err;
} }
mdbx_ensure(env, 0 == mdbx_meta_txnid_fluid(env, head));
mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
env->me_os_psize);
mdbx_ensure(env, undo_txnid == mdbx_meta_txnid_fluid(env, head));
mdbx_ensure(env, 0 == mdbx_meta_eq_mask(env)); mdbx_ensure(env, 0 == mdbx_meta_eq_mask(env));
} }
//---------------------------------------------------- shrink DB & update geo
const MDBX_meta *head = mdbx_meta_head(env); const MDBX_meta *head = mdbx_meta_head(env);
if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) {
/* re-check size after mmap */ /* re-check size after mmap */
@ -11870,6 +11820,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc,
} }
} }
//--------------------------------------------------- setup madvise/readahead
atomic_store32(&env->me_lck->mti_discarded_tail, atomic_store32(&env->me_lck->mti_discarded_tail,
bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed); bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed);
#if MDBX_ENABLE_MADVISE #if MDBX_ENABLE_MADVISE