libmdbx/src/meta.c

755 lines
30 KiB
C
Raw Normal View History

/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Result of one consistent read of a meta-page, as produced by meta_snap(). */
typedef struct meta_snap {
/* Transaction id taken from the meta-page; meta_snap() forces it to 0 when
 * the two on-page copies (txnid_a / txnid_b) do not match. */
uint64_t txnid;
/* Non-zero when the meta-page is steady and its txnid is >= MIN_TXNID;
 * forced to 0 together with txnid when the two copies do not match. */
size_t is_steady;
} meta_snap_t;
/* Loads a 64-bit transaction id that is stored as a pair of 32-bit atomics.
 *
 * ptr points to the first of the two 32-bit halves.
 * Returns the combined 64-bit value.
 *
 * Two implementations are selected at compile time, see below. */
static inline txnid_t fetch_txnid(const volatile mdbx_atomic_uint32_t *ptr) {
#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \
MDBX_UNALIGNED_OK >= 8
/* amd64/e2k without UBSAN and with MDBX_UNALIGNED_OK >= 8:
 * the pair is read by a single 64-bit atomic load. */
return atomic_load64((const volatile mdbx_atomic_uint64_t *)ptr,
mo_AcquireRelease);
#else
/* Otherwise the halves are read by two separate 32-bit loads, low half
 * first. The index expressions pick ptr[0] as the low half on little-endian
 * targets and ptr[1] on the others. */
const uint32_t l = atomic_load32(
&ptr[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease);
const uint32_t h = atomic_load32(
&ptr[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease);
return (uint64_t)h << 32 | l;
#endif
}
/* Takes a snapshot of a meta-page's txnid and steady flag.
 *
 * The sequence is: read txnid_a, evaluate the steady flag, then read txnid_b.
 * If txnid_b differs from the value read from txnid_a, both fields of the
 * result are zeroed, so callers see txnid == 0 for such a page. */
static inline meta_snap_t meta_snap(const volatile meta_t *meta) {
txnid_t txnid = fetch_txnid(meta->txnid_a);
/* NOTE(review): jitter4testing() is defined elsewhere; presumably it injects
 * timing jitter in test builds to provoke races — confirm. */
jitter4testing(true);
/* A page counts as steady only if its txnid is also at least MIN_TXNID. */
size_t is_steady = meta_is_steady(meta) && txnid >= MIN_TXNID;
jitter4testing(true);
/* Mismatch between the two txnid copies: report the page as empty/weak. */
if (unlikely(txnid != fetch_txnid(meta->txnid_b)))
txnid = is_steady = 0;
meta_snap_t r = {txnid, is_steady};
return r;
}
txnid_t meta_txnid(const volatile meta_t *meta) {
return meta_snap(meta).txnid;
}
/* Builds a meta_ptr_t for meta-page number n: the page pointer together with
 * the txnid and steady flag obtained from meta_snap().
 * n must be less than NUM_METAS (checked by eASSERT). */
meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n) {
  eASSERT(env, n < NUM_METAS);

  meta_ptr_t result;
  result.ptr_v = METAPAGE(env, n);

  const meta_snap_t snapshot = meta_snap(result.ptr_v);
  result.txnid = snapshot.txnid;
  result.is_steady = snapshot.is_steady;
  return result;
}
/* Packs the outcome of comparing the three meta-pages into one byte.
 *
 * c01, c02, c12 are pairwise comparison codes (each must be < 3) for the
 * pages (0,1), (0,2) and (1,2); s0, s1, s2 are the pages' steady flags.
 *
 * Layout of the result:
 *   bits 0-1  tail index
 *   bits 2-3  recent index
 *   bits 4-5  prefer_steady index
 *   bit  6    strict: no pair has both code == 1 and equal steady flags
 *   bit  7    valid: not all three pairs have code == 1 and equal steady flags
 */
static uint8_t meta_cmp2pack(uint8_t c01, uint8_t c02, uint8_t c12, bool s0,
                             bool s1, bool s2) {
  assert(c01 < 3 && c02 < 3 && c12 < 3);

  uint8_t recent;
  if (meta_cmp2recent(c01, s0, s1))
    recent = meta_cmp2recent(c02, s0, s2) ? 0 : 2;
  else
    recent = meta_cmp2recent(c12, s1, s2) ? 1 : 2;

  uint8_t prefer_steady;
  if (meta_cmp2steady(c01, s0, s1))
    prefer_steady = meta_cmp2steady(c02, s0, s2) ? 0 : 2;
  else
    prefer_steady = meta_cmp2steady(c12, s1, s2) ? 1 : 2;

  /* The tail is chosen among the two pages that are not the recent one. */
  uint8_t tail;
  switch (recent) {
  case 0:
    tail = meta_cmp2steady(c12, s1, s2) ? 2 : 1;
    break;
  case 1:
    tail = meta_cmp2steady(c02, s0, s2) ? 2 : 0;
    break;
  default:
    tail = meta_cmp2steady(c01, s0, s1) ? 1 : 0;
    break;
  }

  /* A pair is "indistinct" when its code is 1 and the steady flags match. */
  const bool indistinct01 = (c01 == 1) && (s0 == s1);
  const bool indistinct02 = (c02 == 1) && (s0 == s2);
  const bool indistinct12 = (c12 == 1) && (s1 == s2);
  const bool valid = !(indistinct01 && indistinct02 && indistinct12);
  const bool strict = !indistinct01 && !indistinct02 && !indistinct12;

  return tail | recent << 2 | prefer_steady << 4 | strict << 6 | valid << 7;
}
/* Expands a byte produced by meta_cmp2pack() into the troika fields:
 * recent (bits 2-3), prefer_steady (bits 4-5) and tail_and_flags
 * (bits 0-1 together with bits 6-7). */
static inline void meta_troika_unpack(troika_t *troika, const uint8_t packed) {
  const uint8_t recent_index = (packed >> 2) & 3;
  const uint8_t steady_index = (packed >> 4) & 3;
  const uint8_t tail_with_flags = packed & 0xC3;

  troika->recent = recent_index;
  troika->prefer_steady = steady_index;
  troika->tail_and_flags = tail_with_flags;
#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */
  troika->unused_pad = 0;
#endif
}
/* Precomputed lookup table: maps a troika fsm value to the packed byte that
 * meta_cmp2pack() yields for the same inputs (troika_verify_fsm() checks this
 * for every index).
 *
 * The index is composed as in meta_tap()/troika_verify_fsm(): bits 0..2 are
 * the steady flags of meta-pages 0..2, and the three pairwise comparison
 * codes are added with weights 8, 8*3 and 8*3*3, giving 2*2*2*3*3*3 entries. */
static const uint8_t troika_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = {
232, 201, 216, 216, 232, 233, 232, 232, 168, 201, 216, 152, 168, 233, 232,
168, 233, 201, 216, 201, 233, 233, 232, 233, 168, 201, 152, 216, 232, 169,
232, 168, 168, 193, 152, 152, 168, 169, 232, 168, 169, 193, 152, 194, 233,
169, 232, 169, 232, 201, 216, 216, 232, 201, 232, 232, 168, 193, 216, 152,
168, 193, 232, 168, 193, 193, 210, 194, 225, 193, 225, 193, 168, 137, 212,
214, 232, 233, 168, 168, 168, 137, 212, 150, 168, 233, 168, 168, 169, 137,
216, 201, 233, 233, 168, 169, 168, 137, 148, 214, 232, 169, 168, 168, 40,
129, 148, 150, 168, 169, 168, 40, 169, 129, 152, 194, 233, 169, 168, 169,
168, 137, 214, 214, 232, 201, 168, 168, 168, 129, 214, 150, 168, 193, 168,
168, 129, 129, 210, 194, 225, 193, 161, 129, 212, 198, 212, 214, 228, 228,
212, 212, 148, 201, 212, 150, 164, 233, 212, 148, 233, 201, 216, 201, 233,
233, 216, 233, 148, 198, 148, 214, 228, 164, 212, 148, 148, 194, 148, 150,
164, 169, 212, 148, 169, 194, 152, 194, 233, 169, 216, 169, 214, 198, 214,
214, 228, 198, 212, 214, 150, 194, 214, 150, 164, 193, 212, 150, 194, 194,
210, 194, 225, 193, 210, 194};
/* Self-check of the troika state machine.
 *
 * For every possible fsm index it recomputes the expected recent /
 * prefer_steady / tail / valid / strict values directly from the decoded
 * inputs, and compares them with what meta_cmp2pack() + meta_troika_unpack()
 * produce and with the corresponding troika_fsm_map[] entry.
 *
 * Returns true when every state matches. Each comparison is also covered by
 * an assert(), so assert-enabled builds stop at the first mismatch. */
__cold bool troika_verify_fsm(void) {
  enum { FSM_STATES = 2 * 2 * 2 * 3 * 3 * 3 };
  bool all_match = true;

  for (size_t state = 0; state < FSM_STATES; ++state) {
    /* Decode the index: three steady bits, then three base-3 digits. */
    const bool s0 = (state >> 0) & 1;
    const bool s1 = (state >> 1) & 1;
    const bool s2 = (state >> 2) & 1;
    const uint8_t c01 = (state / (8 * 1)) % 3;
    const uint8_t c02 = (state / (8 * 3)) % 3;
    const uint8_t c12 = (state / (8 * 9)) % 3;

    /* Values under test. */
    const uint8_t packed = meta_cmp2pack(c01, c02, c12, s0, s1, s2);
    troika_t troika;
    troika.fsm = (uint8_t)state;
    meta_troika_unpack(&troika, packed);
    const uint8_t tail = TROIKA_TAIL(&troika);
    const bool strict = TROIKA_STRICT_VALID(&troika);
    const bool valid = TROIKA_VALID(&troika);

    /* Independently recomputed expectations. */
    uint8_t expect_recent;
    if (meta_cmp2recent(c01, s0, s1))
      expect_recent = meta_cmp2recent(c02, s0, s2) ? 0 : 2;
    else
      expect_recent = meta_cmp2recent(c12, s1, s2) ? 1 : 2;

    uint8_t expect_prefer_steady;
    if (meta_cmp2steady(c01, s0, s1))
      expect_prefer_steady = meta_cmp2steady(c02, s0, s2) ? 0 : 2;
    else
      expect_prefer_steady = meta_cmp2steady(c12, s1, s2) ? 1 : 2;

    uint8_t expect_tail;
    switch (expect_recent) {
    case 0:
      expect_tail = meta_cmp2steady(c12, s1, s2) ? 2 : 1;
      break;
    case 1:
      expect_tail = meta_cmp2steady(c02, s0, s2) ? 2 : 0;
      break;
    default:
      expect_tail = meta_cmp2steady(c01, s0, s1) ? 1 : 0;
      break;
    }

    const bool indistinct01 = (c01 == 1) && (s0 == s1);
    const bool indistinct02 = (c02 == 1) && (s0 == s2);
    const bool indistinct12 = (c12 == 1) && (s1 == s2);
    const bool expect_valid = !(indistinct01 && indistinct02 && indistinct12);
    const bool expect_strict = !indistinct01 && !indistinct02 && !indistinct12;

    assert(troika.recent == expect_recent);
    assert(troika.prefer_steady == expect_prefer_steady);
    assert(tail == expect_tail);
    assert(valid == expect_valid);
    assert(strict == expect_strict);
    assert(troika_fsm_map[troika.fsm] == packed);

    const bool mismatch = troika.recent != expect_recent ||
                          troika.prefer_steady != expect_prefer_steady ||
                          tail != expect_tail || valid != expect_valid ||
                          strict != expect_strict ||
                          troika_fsm_map[troika.fsm] != packed;
    if (mismatch)
      all_match = false;
  }
  return all_match;
}
/* Reads all three meta-pages and builds the troika describing them.
 *
 * Each page is sampled with meta_snap() in the order 0, 1, 2. The fsm index
 * is accumulated from the three steady flags (bits 0..2) and the pairwise
 * txnid comparisons, and is then translated through troika_fsm_map[] into
 * the recent / prefer_steady / tail_and_flags fields. */
__hot troika_t meta_tap(const MDBX_env *env) {
meta_snap_t snap;
troika_t troika;
snap = meta_snap(METAPAGE(env, 0));
troika.txnid[0] = snap.txnid;
troika.fsm = (uint8_t)snap.is_steady << 0;
snap = meta_snap(METAPAGE(env, 1));
troika.txnid[1] = snap.txnid;
troika.fsm += (uint8_t)snap.is_steady << 1;
/* NOTE(review): meta_cmp2int() is defined elsewhere; its third argument is
 * presumably the weight applied to the comparison code, matching the
 * /8, /24, /72 decoding in troika_verify_fsm() — confirm. */
troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[1], 8);
snap = meta_snap(METAPAGE(env, 2));
troika.txnid[2] = snap.txnid;
troika.fsm += (uint8_t)snap.is_steady << 2;
troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[2], 8 * 3);
troika.fsm += meta_cmp2int(troika.txnid[1], troika.txnid[2], 8 * 3 * 3);
/* Table lookup replaces recomputing meta_cmp2pack() at run time. */
meta_troika_unpack(&troika, troika_fsm_map[troika.fsm]);
return troika;
}
/* Returns the largest txnid among the three meta-pages, each read through
 * meta_txnid() in the order 0, 1, 2. */
txnid_t recent_committed_txnid(const MDBX_env *env) {
  txnid_t latest = meta_txnid(METAPAGE(env, 0));
  const txnid_t second = meta_txnid(METAPAGE(env, 1));
  const txnid_t third = meta_txnid(METAPAGE(env, 2));

  if (!(latest > second))
    latest = second;
  if (!(latest > third))
    latest = third;
  return latest;
}
/* Tells whether meta-pages a and b look identical in the troika: same
 * txnid, same steady bit in fsm, and a non-zero txnid.
 * Both indices must be less than NUM_METAS. */
static inline bool meta_eq(const troika_t *troika, size_t a, size_t b) {
  assert(a < NUM_METAS && b < NUM_METAS);

  if (troika->txnid[a] != troika->txnid[b])
    return false;

  /* Bit a and bit b of fsm hold the steady flags of the two pages. */
  const unsigned steady_differs = ((troika->fsm >> a) ^ (troika->fsm >> b)) & 1;
  if (steady_differs != 0)
    return false;

  return troika->txnid[a] != 0;
}
/* Returns a 3-bit mask of pairwise equalities as reported by meta_eq():
 * bit 0 for pages (0,1), bit 1 for pages (1,2), bit 2 for pages (2,0). */
unsigned meta_eq_mask(const troika_t *troika) {
  unsigned mask = 0;
  if (meta_eq(troika, 0, 1))
    mask |= 1u << 0;
  if (meta_eq(troika, 1, 2))
    mask |= 1u << 1;
  if (meta_eq(troika, 2, 0))
    mask |= 1u << 2;
  return mask;
}
/* Re-reads the meta-pages into *troika and reports whether anything changed
 * compared with the previous contents: returns true when the fsm value or
 * any of the three txnids differs. *troika is always overwritten. */
__hot bool meta_should_retry(const MDBX_env *env, troika_t *troika) {
  const troika_t before = *troika;
  *troika = meta_tap(env);

  if (before.fsm != troika->fsm)
    return true;
  for (size_t i = 0; i < 3; ++i) {
    if (before.txnid[i] != troika->txnid[i])
      return true;
  }
  return false;
}
/* Returns a short label for a meta-page's durability state:
 *   "Weak"    - the page is not steady;
 *   "Steady"  - steady and the stored signature equals the calculated one;
 *   "Tainted" - steady, but the stored signature does not match. */
const char *durable_caption(const meta_t *const meta) {
  if (!meta_is_steady(meta))
    return "Weak";

  const bool sign_matches = meta_sign_get(meta) == meta_sign_calculate(meta);
  return sign_matches ? "Steady" : "Tainted";
}
__cold void meta_troika_dump(const MDBX_env *env, const troika_t *troika) {
const meta_ptr_t recent = meta_recent(env, troika);
const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika);
const meta_ptr_t tail = meta_tail(env, troika);
NOTICE("troika: %" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, "
"head=%d-%" PRIaTXN ".%c, "
"base=%d-%" PRIaTXN ".%c, "
"tail=%d-%" PRIaTXN ".%c, "
"valid %c, strict %c",
troika->txnid[0], (troika->fsm & 1) ? 's' : 'w', troika->txnid[1],
(troika->fsm & 2) ? 's' : 'w', troika->txnid[2],
(troika->fsm & 4) ? 's' : 'w', troika->fsm, troika->recent,
recent.txnid, recent.is_steady ? 's' : 'w', troika->prefer_steady,
prefer_steady.txnid, prefer_steady.is_steady ? 's' : 'w',
troika->tail_and_flags % NUM_METAS, tail.txnid,
tail.is_steady ? 's' : 'w', TROIKA_VALID(troika) ? 'Y' : 'N',
TROIKA_STRICT_VALID(troika) ? 'Y' : 'N');
}
/*----------------------------------------------------------------------------*/
/* Clears the steady signature of meta-page pgno if that page is steady and
 * its txnid does not exceed inclusive_upto.
 *
 * Returns MDBX_RESULT_FALSE when the page is left untouched,
 * MDBX_RESULT_TRUE when the signature was replaced with DATASIGN_NONE,
 * or the error code returned by osal_pwrite().
 *
 * In MDBX_WRITEMAP mode the signature is poked through the mapping; unless
 * MDBX_AVOID_MSYNC is set the function returns right after that, otherwise
 * the whole meta-page is additionally written through env->fd4meta.
 * Without MDBX_WRITEMAP only the signature field is written to the file. */
static int meta_unsteady(MDBX_env *env, const txnid_t inclusive_upto,
                         const pgno_t pgno) {
  meta_t *const meta = METAPAGE(env, pgno);
  const txnid_t txnid = constmeta_txnid(meta);
  if (!meta_is_steady(meta) || txnid > inclusive_upto)
    return MDBX_RESULT_FALSE;

  WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, txnid, pgno);

  /* Default write: just the signature field, at its offset in the file. */
  const uint64_t wipe = DATASIGN_NONE;
  const void *source = &wipe;
  size_t length = sizeof(meta->sign);
  size_t file_offset = ptr_dist(&meta->sign, env->dxb_mmap.base);

  if (env->flags & MDBX_WRITEMAP) {
    unaligned_poke_u64(4, meta->sign, wipe);
    osal_flush_incoherent_cpu_writeback();
    if (!MDBX_AVOID_MSYNC)
      return MDBX_RESULT_TRUE;
    /* msync is avoided: write the entire page holding this meta instead. */
    source = data_page(meta);
    file_offset = ptr_dist(source, env->dxb_mmap.base);
    length = env->ps;
  }

#if MDBX_ENABLE_PGOP_STAT
  env->lck->pgops.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  const int err = osal_pwrite(env->fd4meta, source, length, file_offset);
  if (likely(err == MDBX_SUCCESS))
    return MDBX_RESULT_TRUE;
  return err;
}
/* Clears the steady signature of every meta-page whose txnid does not exceed
 * inclusive_upto, then makes the change durable and refreshes the cached
 * troikas.
 *
 * Processing stops at the first meta-page for which meta_unsteady() reports
 * an error. If no error occurred and at least one meta-page was wiped, the
 * meta area is flushed: by osal_msync() in MDBX_WRITEMAP mode without
 * MDBX_AVOID_MSYNC, or by osal_fsync() when env->fd4meta is the lazy
 * descriptor.
 *
 * The flush decision is based on whether ANY of the three calls wiped a
 * page. Looking only at the result of the last call would skip the flush
 * when meta 0 or meta 1 was wiped but meta 2 was not.
 *
 * Returns the flush result when a flush was attempted, otherwise the result
 * of the last meta_unsteady() call (MDBX_RESULT_FALSE when nothing was wiped,
 * or an error code). */
__cold int meta_wipe_steady(MDBX_env *env, txnid_t inclusive_upto) {
  int err = meta_unsteady(env, inclusive_upto, 0);
  bool wiped = (err == MDBX_RESULT_TRUE);
  if (likely(!MDBX_IS_ERROR(err))) {
    err = meta_unsteady(env, inclusive_upto, 1);
    wiped |= (err == MDBX_RESULT_TRUE);
  }
  if (likely(!MDBX_IS_ERROR(err))) {
    err = meta_unsteady(env, inclusive_upto, 2);
    wiped |= (err == MDBX_RESULT_TRUE);
  }

  if (wiped && !MDBX_IS_ERROR(err)) {
    err = MDBX_SUCCESS;
    if (!MDBX_AVOID_MSYNC && (env->flags & MDBX_WRITEMAP)) {
      err = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
                       MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
#if MDBX_ENABLE_PGOP_STAT
      env->lck->pgops.msync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    } else if (env->fd4meta == env->lazy_fd) {
      err = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
#if MDBX_ENABLE_PGOP_STAT
      env->lck->pgops.fsync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    }
  }

  osal_flush_incoherent_mmap(env->dxb_mmap.base, pgno2bytes(env, NUM_METAS),
                             globals.sys_pagesize);

  /* force oldest refresh */
  atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);

  /* Re-read the meta-pages and propagate the fresh troika to the basal
   * transaction and every transaction nested in it. */
  env->basal_txn->tw.troika = meta_tap(env);
  for (MDBX_txn *scan = env->basal_txn->nested; scan; scan = scan->nested)
    scan->tw.troika = env->basal_txn->tw.troika;
  return err;
}
/* Makes the meta-pages durable for the given head and, on success, records
 * the low 32 bits of head.txnid in env->lck->meta_sync_txnid.
 *
 * MDBX_WRITEMAP without MDBX_AVOID_MSYNC: osal_msync() over the meta area.
 * MDBX_WRITEMAP with MDBX_AVOID_MSYNC: the page holding head is written via
 *   osal_pwrite(), followed by osal_fsync() if env->fd4meta is the lazy
 *   descriptor.
 * Otherwise: osal_fsync() on the lazy descriptor.
 *
 * Returns the result of the last I/O call performed. */
int meta_sync(const MDBX_env *env, const meta_ptr_t head) {
  eASSERT(env, atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed) !=
                   (uint32_t)head.txnid);
  /* The function may be called (among other cases) when
   * (env->flags & MDBX_NOMETASYNC) == 0 and env->fd4meta == env->dsync_fd,
   * for example if the previous transaction was committed with the
   * MDBX_NOMETASYNC flag. */
  int rc = MDBX_RESULT_TRUE;
  if (env->flags & MDBX_WRITEMAP) {
    if (!MDBX_AVOID_MSYNC) {
      rc = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
                      MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
#if MDBX_ENABLE_PGOP_STAT
      env->lck->pgops.msync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    } else {
      /* The guard below must test MDBX_ENABLE_PGOP_STAT: a misspelled
       * (undefined) macro evaluates to 0 in #if and silently disables
       * the counter. */
#if MDBX_ENABLE_PGOP_STAT
      env->lck->pgops.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
      const page_t *page = data_page(head.ptr_c);
      rc = osal_pwrite(env->fd4meta, page, env->ps,
                       ptr_dist(page, env->dxb_mmap.base));
      if (likely(rc == MDBX_SUCCESS) && env->fd4meta == env->lazy_fd) {
        rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
#if MDBX_ENABLE_PGOP_STAT
        env->lck->pgops.fsync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
      }
    }
  } else {
    rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
#if MDBX_ENABLE_PGOP_STAT
    env->lck->pgops.fsync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
  }

  if (likely(rc == MDBX_SUCCESS))
    env->lck->meta_sync_txnid.weak = (uint32_t)head.txnid;
  return rc;
}
/* Fills the buffer at model with a fresh meta-page number num built from the
 * environment's page size and geometry, tagged with the given guid.
 *
 * The page is zeroed first, then the header, geometry, tree roots, database
 * id, txnid (MIN_TXNID + num) and signature are set. Environment and
 * resulting geometry invariants are enforced with ENSURE().
 *
 * Returns the address env->ps bytes past model, i.e. where the next page of
 * a contiguous buffer would start. */
__cold static page_t *meta_model(const MDBX_env *env, page_t *model, size_t num,
                                 const bin128_t *guid) {
  /* Preconditions on the environment. */
  ENSURE(env, is_powerof2(env->ps));
  ENSURE(env, env->ps >= MDBX_MIN_PAGESIZE);
  ENSURE(env, env->ps <= MDBX_MAX_PAGESIZE);
  ENSURE(env, env->geo_in_bytes.lower >= MIN_MAPSIZE);
  ENSURE(env, env->geo_in_bytes.upper <= MAX_MAPSIZE);
  ENSURE(env, env->geo_in_bytes.now >= env->geo_in_bytes.lower);
  ENSURE(env, env->geo_in_bytes.now <= env->geo_in_bytes.upper);

  /* Page header. */
  memset(model, 0, env->ps);
  model->pgno = (pgno_t)num;
  model->flags = P_META;

  meta_t *const meta = page_meta(model);
  unaligned_poke_u64(4, meta->magic_and_version, MDBX_DATA_MAGIC);

  /* Geometry, converted from bytes to pages. */
  meta->geometry.lower = bytes2pgno(env, env->geo_in_bytes.lower);
  meta->geometry.upper = bytes2pgno(env, env->geo_in_bytes.upper);
  meta->geometry.grow_pv = pages2pv(bytes2pgno(env, env->geo_in_bytes.grow));
  meta->geometry.shrink_pv =
      pages2pv(bytes2pgno(env, env->geo_in_bytes.shrink));
  meta->geometry.now = bytes2pgno(env, env->geo_in_bytes.now);
  meta->geometry.first_unallocated = NUM_METAS;

  /* Postconditions on the converted geometry. */
  ENSURE(env, meta->geometry.lower >= MIN_PAGENO);
  ENSURE(env, meta->geometry.upper <= MAX_PAGENO + 1);
  ENSURE(env, meta->geometry.now >= meta->geometry.lower);
  ENSURE(env, meta->geometry.now <= meta->geometry.upper);
  ENSURE(env, meta->geometry.first_unallocated >= MIN_PAGENO);
  ENSURE(env, meta->geometry.first_unallocated <= meta->geometry.now);
  ENSURE(env, meta->geometry.grow_pv ==
                  pages2pv(pv2pages(meta->geometry.grow_pv)));
  ENSURE(env, meta->geometry.shrink_pv ==
                  pages2pv(pv2pages(meta->geometry.shrink_pv)));

  /* Empty GC and main trees. */
  meta->pagesize = env->ps;
  meta->trees.gc.flags = MDBX_INTEGERKEY;
  meta->trees.gc.root = P_INVALID;
  meta->trees.main.root = P_INVALID;

  /* Identity, txnid and signature. */
  memcpy(&meta->dxbid, guid, sizeof(meta->dxbid));
  meta_set_txnid(env, meta, MIN_TXNID + num);
  unaligned_poke_u64(4, meta->sign, meta_sign_calculate(meta));

  eASSERT(env, coherency_check_meta(env, meta, true));
  return ptr_disp(model, env->ps);
}
/* Writes three consecutive model meta-pages (numbers 0, 1, 2) into buffer,
 * all sharing one freshly obtained guid.
 * Returns a pointer to the meta structure inside the third page. */
__cold meta_t *meta_init_triplet(const MDBX_env *env, void *buffer) {
  const bin128_t guid = osal_guid(env);

  /* meta_model() returns the start of the following page. */
  page_t *cursor = (page_t *)buffer;
  cursor = meta_model(env, cursor, 0, &guid);
  cursor = meta_model(env, cursor, 1, &guid);

  page_t *const last_page = cursor;
  meta_model(env, last_page, 2, &guid);
  return page_meta(last_page);
}
/* Overwrites meta-page number target with a newly built, steady-signed page.
 *
 * env    - environment; env->page_auxbuf is used as the scratch page.
 * target - index of the meta-page to overwrite.
 * txnid  - transaction id to store; when 0 the coherency checks are skipped.
 * shape  - optional meta whose geometry, trees, canary and related fields
 *          are copied into the new page; may be NULL.
 *
 * Returns MDBX_SUCCESS (also when the result equals shape and nothing is
 * written), MDBX_PROBLEM when a coherency check or meta_validate() fails,
 * or the error code of the failing msync/pwrite/fsync call. */
__cold int __must_check_result meta_override(MDBX_env *env, size_t target,
txnid_t txnid,
const meta_t *shape) {
page_t *const page = env->page_auxbuf;
/* The database id is taken from shape only for target 0 with a shape given;
 * otherwise from the current meta-page 0. */
meta_model(env, page, target,
&((target == 0 && shape) ? shape : METAPAGE(env, 0))->dxbid);
meta_t *const model = page_meta(page);
meta_set_txnid(env, model, txnid);
if (txnid)
eASSERT(env, coherency_check_meta(env, model, true));
if (shape) {
if (txnid && unlikely(!coherency_check_meta(env, shape, false))) {
ERROR("bailout overriding meta-%zu since model failed "
"FreeDB/MainDB %s-check for txnid #%" PRIaTXN,
target, "pre", constmeta_txnid(shape));
return MDBX_PROBLEM;
}
/* With MDBX_DBG_DONT_UPGRADE the shape's magic/version is preserved. */
if (globals.runtime_flags & MDBX_DBG_DONT_UPGRADE)
memcpy(&model->magic_and_version, &shape->magic_and_version,
sizeof(model->magic_and_version));
model->reserve16 = shape->reserve16;
model->validator_id = shape->validator_id;
model->extra_pagehdr = shape->extra_pagehdr;
memcpy(&model->geometry, &shape->geometry, sizeof(model->geometry));
memcpy(&model->trees, &shape->trees, sizeof(model->trees));
memcpy(&model->canary, &shape->canary, sizeof(model->canary));
memcpy(&model->pages_retired, &shape->pages_retired,
sizeof(model->pages_retired));
if (txnid) {
/* A non-empty tree with a zero mod_txnid also keeps the shape's
 * magic/version. */
if ((!model->trees.gc.mod_txnid && model->trees.gc.root != P_INVALID) ||
(!model->trees.main.mod_txnid && model->trees.main.root != P_INVALID))
memcpy(&model->magic_and_version, &shape->magic_and_version,
sizeof(model->magic_and_version));
if (unlikely(!coherency_check_meta(env, model, false))) {
ERROR("bailout overriding meta-%zu since model failed "
"FreeDB/MainDB %s-check for txnid #%" PRIaTXN,
target, "post", txnid);
return MDBX_PROBLEM;
}
}
}
/* Meta-page 0 must carry a non-zero database id: generate one if absent. */
if (target == 0 && (model->dxbid.x | model->dxbid.y) == 0) {
const bin128_t guid = osal_guid(env);
memcpy(&model->dxbid, &guid, sizeof(model->dxbid));
}
meta_sign_as_steady(model);
int rc = meta_validate(env, model, page, (pgno_t)target, nullptr);
if (unlikely(MDBX_IS_ERROR(rc)))
return MDBX_PROBLEM;
/* Nothing to write when the result is byte-identical to shape. */
if (shape && memcmp(model, shape, sizeof(meta_t)) == 0) {
NOTICE("skip overriding meta-%zu since no changes "
"for txnid #%" PRIaTXN,
target, txnid);
return MDBX_SUCCESS;
}
if (env->flags & MDBX_WRITEMAP) {
/* Mapped-write path: msync the pages up to first_unallocated, copy the
 * new meta-page into the mapping, then msync the meta-pages up to and
 * including target. */
#if MDBX_ENABLE_PGOP_STAT
env->lck->pgops.msync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
rc = osal_msync(&env->dxb_mmap, 0,
pgno_align2os_bytes(env, model->geometry.first_unallocated),
MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
/* meta_override() called only while current process have exclusive
 * lock of a DB file. So meta-page could be updated directly without
 * clearing consistency flag by mdbx_meta_update_begin() */
memcpy(pgno2page(env, target), page, env->ps);
osal_flush_incoherent_cpu_writeback();
#if MDBX_ENABLE_PGOP_STAT
env->lck->pgops.msync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
rc = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, target + 1),
MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
} else {
/* File-write path: pwrite the page, fsync when fd4meta is the lazy
 * descriptor, then flush the mapping of the meta area. */
#if MDBX_ENABLE_PGOP_STAT
env->lck->pgops.wops.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
rc = osal_pwrite(env->fd4meta, page, env->ps, pgno2bytes(env, target));
if (rc == MDBX_SUCCESS && env->fd4meta == env->lazy_fd) {
#if MDBX_ENABLE_PGOP_STAT
env->lck->pgops.fsync.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
}
osal_flush_incoherent_mmap(env->dxb_mmap.base, pgno2bytes(env, NUM_METAS),
globals.sys_pagesize);
}
/* Expected calling contexts: either no active transaction/environment, or
 * an exclusive read-write environment stuck on this very meta-page. */
eASSERT(env, (!env->txn && (env->flags & ENV_ACTIVE) == 0) ||
(env->stuck_meta == (int)target &&
(env->flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) ==
MDBX_EXCLUSIVE));
return rc;
}
/* Validates a meta-page and normalizes its geometry in place.
 *
 * env            - environment; env->dxb_mmap.filesize may be refreshed.
 * meta           - meta to check; geometry.lower/upper/now may be corrected.
 * page           - page header belonging to the meta.
 * meta_number    - expected page number, also used in log messages.
 * guess_pagesize - optional in/out; updated to meta->pagesize when it
 *                  differs. May be NULL.
 *
 * Returns:
 *   MDBX_SUCCESS          - the meta passed all checks;
 *   MDBX_RESULT_TRUE      - the meta should be skipped (incomplete update,
 *                           bad steady checksum, txnid out of range);
 *   MDBX_INVALID, MDBX_VERSION_MISMATCH, MDBX_INCOMPATIBLE, MDBX_CORRUPTED,
 *   MDBX_TOO_LARGE        - the respective check failed;
 *   other                 - error from osal_filesize(). */
__cold int meta_validate(MDBX_env *env, meta_t *const meta,
                         const page_t *const page, const unsigned meta_number,
                         unsigned *guess_pagesize) {
  const uint64_t magic_and_version =
      unaligned_peek_u64(4, &meta->magic_and_version);
  if (unlikely(magic_and_version != MDBX_DATA_MAGIC &&
               magic_and_version != MDBX_DATA_MAGIC_LEGACY_COMPAT &&
               magic_and_version != MDBX_DATA_MAGIC_LEGACY_DEVEL)) {
    ERROR("meta[%u] has invalid magic/version %" PRIx64, meta_number,
          magic_and_version);
    return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID
                                                    : MDBX_VERSION_MISMATCH;
  }

  if (unlikely(page->pgno != meta_number)) {
    ERROR("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, page->pgno);
    return MDBX_INVALID;
  }

  if (unlikely(page->flags != P_META)) {
    ERROR("page #%u not a meta-page", meta_number);
    return MDBX_INVALID;
  }

  if (unlikely(!is_powerof2(meta->pagesize) ||
               meta->pagesize < MDBX_MIN_PAGESIZE ||
               meta->pagesize > MDBX_MAX_PAGESIZE)) {
    WARNING("meta[%u] has invalid pagesize (%u), skip it", meta_number,
            meta->pagesize);
    return is_powerof2(meta->pagesize) ? MDBX_VERSION_MISMATCH : MDBX_INVALID;
  }

  if (guess_pagesize && *guess_pagesize != meta->pagesize) {
    *guess_pagesize = meta->pagesize;
    VERBOSE("meta[%u] took pagesize %u", meta_number, meta->pagesize);
  }

  /* The two txnid copies must agree, otherwise the update was incomplete. */
  const txnid_t txnid = unaligned_peek_u64(4, &meta->txnid_a);
  if (unlikely(txnid != unaligned_peek_u64(4, &meta->txnid_b))) {
    WARNING("meta[%u] not completely updated, skip it", meta_number);
    return MDBX_RESULT_TRUE;
  }

  /* LY: check signature as a checksum */
  const uint64_t sign = meta_sign_get(meta);
  const uint64_t sign_steady = meta_sign_calculate(meta);
  if (SIGN_IS_STEADY(sign) && unlikely(sign != sign_steady)) {
    WARNING("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64
            "), skip it",
            meta_number, sign, sign_steady);
    return MDBX_RESULT_TRUE;
  }

  /* GC flags must be exactly MDBX_INTEGERKEY for the current format; for the
   * legacy magics only the persistent part of the flags is compared. */
  if (unlikely(meta->trees.gc.flags != MDBX_INTEGERKEY) &&
      ((meta->trees.gc.flags & DB_PERSISTENT_FLAGS) != MDBX_INTEGERKEY ||
       magic_and_version == MDBX_DATA_MAGIC)) {
    WARNING("meta[%u] has invalid %s flags 0x%x, skip it", meta_number,
            "GC/FreeDB", meta->trees.gc.flags);
    return MDBX_INCOMPATIBLE;
  }

  if (unlikely(!check_table_flags(meta->trees.main.flags))) {
    WARNING("meta[%u] has invalid %s flags 0x%x, skip it", meta_number,
            "MainDB", meta->trees.main.flags);
    return MDBX_INCOMPATIBLE;
  }

  DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO
        ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO
        " +%u -%u, txn_id %" PRIaTXN ", %s",
        page->pgno, meta->trees.main.root, meta->trees.gc.root,
        meta->geometry.lower, meta->geometry.first_unallocated,
        meta->geometry.now, meta->geometry.upper,
        pv2pages(meta->geometry.grow_pv), pv2pages(meta->geometry.shrink_pv),
        txnid, durable_caption(meta));

  if (unlikely(txnid < MIN_TXNID || txnid > MAX_TXNID)) {
    WARNING("meta[%u] has invalid txnid %" PRIaTXN ", skip it", meta_number,
            txnid);
    return MDBX_RESULT_TRUE;
  }

  if (unlikely(meta->geometry.lower < MIN_PAGENO ||
               meta->geometry.lower > MAX_PAGENO + 1)) {
    WARNING("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it",
            meta_number, meta->geometry.lower);
    return MDBX_INVALID;
  }

  if (unlikely(meta->geometry.upper < MIN_PAGENO ||
               meta->geometry.upper > MAX_PAGENO + 1 ||
               meta->geometry.upper < meta->geometry.lower)) {
    WARNING("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it",
            meta_number, meta->geometry.upper);
    return MDBX_INVALID;
  }

  if (unlikely(meta->geometry.first_unallocated < MIN_PAGENO ||
               meta->geometry.first_unallocated - 1 > MAX_PAGENO)) {
    WARNING("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it",
            meta_number, meta->geometry.first_unallocated);
    return MDBX_CORRUPTED;
  }

  /* The used part of the database must fit into the file. */
  const uint64_t used_bytes =
      meta->geometry.first_unallocated * (uint64_t)meta->pagesize;
  if (unlikely(used_bytes > env->dxb_mmap.filesize)) {
    /* Here could be a race with DB-shrinking performed by other process */
    int err = osal_filesize(env->lazy_fd, &env->dxb_mmap.filesize);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    if (unlikely(used_bytes > env->dxb_mmap.filesize)) {
      WARNING("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64
              "), skip it",
              meta_number, used_bytes, env->dxb_mmap.filesize);
      return MDBX_CORRUPTED;
    }
  }
  if (unlikely(meta->geometry.first_unallocated - 1 > MAX_PAGENO ||
               used_bytes > MAX_MAPSIZE)) {
    WARNING("meta[%u] has too large used-space (%" PRIu64 "), skip it",
            meta_number, used_bytes);
    return MDBX_TOO_LARGE;
  }

  /* Lower bound of the geometry: clamp it when it exceeds this build's
   * MAX_MAPSIZE but is still within MAX_MAPSIZE64. */
  pgno_t geo_lower = meta->geometry.lower;
  uint64_t mapsize_min = geo_lower * (uint64_t)meta->pagesize;
  STATIC_ASSERT(MAX_MAPSIZE < PTRDIFF_MAX - MDBX_MAX_PAGESIZE);
  STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE);
  STATIC_ASSERT((uint64_t)(MAX_PAGENO + 1) * MDBX_MIN_PAGESIZE % (4ul << 20) ==
                0);
  if (unlikely(mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE)) {
    if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE &&
        mapsize_min <= MAX_MAPSIZE64) {
      eASSERT(env, meta->geometry.first_unallocated - 1 <= MAX_PAGENO &&
                       used_bytes <= MAX_MAPSIZE);
      WARNING("meta[%u] has too large min-mapsize (%" PRIu64 "), "
              "but size of used space still acceptable (%" PRIu64 ")",
              meta_number, mapsize_min, used_bytes);
      geo_lower = (pgno_t)((mapsize_min = MAX_MAPSIZE) / meta->pagesize);
      if (geo_lower > MAX_PAGENO + 1) {
        geo_lower = MAX_PAGENO + 1;
        mapsize_min = geo_lower * (uint64_t)meta->pagesize;
      }
      WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO
              " instead of wrong %" PRIaPGNO
              ", will be corrected on next commit(s)",
              meta_number, "lower", geo_lower, meta->geometry.lower);
      meta->geometry.lower = geo_lower;
    } else {
      WARNING("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it",
              meta_number, mapsize_min);
      return MDBX_VERSION_MISMATCH;
    }
  }

  /* Upper bound of the geometry: clamp it in the same manner. */
  pgno_t geo_upper = meta->geometry.upper;
  uint64_t mapsize_max = geo_upper * (uint64_t)meta->pagesize;
  STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE);
  if (unlikely(mapsize_max > MAX_MAPSIZE ||
               (MAX_PAGENO + 1) <
                   ceil_powerof2((size_t)mapsize_max, globals.sys_pagesize) /
                       (size_t)meta->pagesize)) {
    if (mapsize_max > MAX_MAPSIZE64) {
      WARNING("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it",
              meta_number, mapsize_max);
      return MDBX_VERSION_MISMATCH;
    }
    /* allow to open large DB from a 32-bit environment */
    eASSERT(env, meta->geometry.first_unallocated - 1 <= MAX_PAGENO &&
                     used_bytes <= MAX_MAPSIZE);
    WARNING("meta[%u] has too large max-mapsize (%" PRIu64 "), "
            "but size of used space still acceptable (%" PRIu64 ")",
            meta_number, mapsize_max, used_bytes);
    geo_upper = (pgno_t)((mapsize_max = MAX_MAPSIZE) / meta->pagesize);
    if (geo_upper > MAX_PAGENO + 1) {
      geo_upper = MAX_PAGENO + 1;
      mapsize_max = geo_upper * (uint64_t)meta->pagesize;
    }
    WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO
            " instead of wrong %" PRIaPGNO
            ", will be corrected on next commit(s)",
            meta_number, "upper", geo_upper, meta->geometry.upper);
    meta->geometry.upper = geo_upper;
  }

  /* LY: check and silently put geometry.now into [geo.lower...geo.upper].
   *
   * Copy-with-compaction by old version of libmdbx could produce DB-file
   * less than meta.geo.lower bound, in case actual filling is low or no data
   * at all. This is not a problem as there is no damage or loss of data.
   * Therefore it is better not to consider such situation as an error, but
   * silently correct it. */
  pgno_t geo_now = meta->geometry.now;
  if (geo_now < geo_lower)
    geo_now = geo_lower;
  if (geo_now > geo_upper && meta->geometry.first_unallocated <= geo_upper)
    geo_now = geo_upper;

  if (unlikely(meta->geometry.first_unallocated > geo_now)) {
    WARNING("meta[%u] next-pageno (%" PRIaPGNO
            ") is beyond end-pgno (%" PRIaPGNO "), skip it",
            meta_number, meta->geometry.first_unallocated, geo_now);
    return MDBX_CORRUPTED;
  }
  if (meta->geometry.now != geo_now) {
    WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO
            " instead of wrong %" PRIaPGNO
            ", will be corrected on next commit(s)",
            meta_number, "now", geo_now, meta->geometry.now);
    meta->geometry.now = geo_now;
  }

  /* GC */
  if (meta->trees.gc.root == P_INVALID) {
    if (unlikely(meta->trees.gc.branch_pages || meta->trees.gc.height ||
                 meta->trees.gc.items || meta->trees.gc.leaf_pages ||
                 meta->trees.gc.large_pages)) {
      WARNING("meta[%u] has false-empty %s, skip it", meta_number, "GC");
      return MDBX_CORRUPTED;
    }
  } else if (unlikely(meta->trees.gc.root >=
                      meta->geometry.first_unallocated)) {
    WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number,
            "GC", meta->trees.gc.root);
    return MDBX_CORRUPTED;
  }

  /* MainDB */
  if (meta->trees.main.root == P_INVALID) {
    if (unlikely(meta->trees.main.branch_pages || meta->trees.main.height ||
                 meta->trees.main.items || meta->trees.main.leaf_pages ||
                 meta->trees.main.large_pages)) {
      WARNING("meta[%u] has false-empty %s", meta_number, "MainDB");
      return MDBX_CORRUPTED;
    }
  } else if (unlikely(meta->trees.main.root >=
                      meta->geometry.first_unallocated)) {
    WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number,
            "MainDB", meta->trees.main.root);
    return MDBX_CORRUPTED;
  }

  /* A tree cannot have been modified by a transaction newer than the meta. */
  if (unlikely(meta->trees.gc.mod_txnid > txnid)) {
    WARNING("meta[%u] has wrong mod_txnid %" PRIaTXN " for %s, skip it",
            meta_number, meta->trees.gc.mod_txnid, "GC");
    return MDBX_CORRUPTED;
  }

  if (unlikely(meta->trees.main.mod_txnid > txnid)) {
    WARNING("meta[%u] has wrong mod_txnid %" PRIaTXN " for %s, skip it",
            meta_number, meta->trees.main.mod_txnid, "MainDB");
    return MDBX_CORRUPTED;
  }

  return MDBX_SUCCESS;
}
/* Copies *meta into *dest and validates the copy with meta_validate(), so
 * any in-place geometry corrections land in dest rather than in the source.
 * The page header and page number are derived from the source meta's
 * address. Returns whatever meta_validate() returns. */
__cold int meta_validate_copy(MDBX_env *env, const meta_t *meta, meta_t *dest) {
  *dest = *meta;

  const page_t *const source_page = data_page(meta);
  const unsigned meta_number =
      bytes2pgno(env, ptr_dist(meta, env->dxb_mmap.base));
  return meta_validate(env, dest, source_page, meta_number, nullptr);
}