mdbx: merge branch erigon into devel.

This commit is contained in:
Леонид Юрьев (Leonid Yuriev) 2022-07-07 23:24:45 +03:00
commit 2cfcfcf91c
4 changed files with 279 additions and 108 deletions

3
mdbx.h
View File

@ -2494,6 +2494,9 @@ struct MDBX_envinfo {
uint64_t unspill; /**< Quantity of unspilled/reloaded pages */
uint64_t wops; /**< Number of explicit write operations (not a pages)
to a disk */
uint64_t
gcrtime_seconds16dot16; /**< Time spent loading and searching inside
GC (aka FreeDB) in 1/65536 of second. */
} mi_pgop_stat;
};
#ifndef __cplusplus

View File

@ -5747,6 +5747,15 @@ constmeta_txnid(const MDBX_env *env, const MDBX_meta *meta) {
return (a == b) ? a : 0;
}
static __inline void meta_cache_clear(MDBX_env *env) {
#if MDBX_CACHE_METAS
env->cache_last_meta = nullptr;
env->cache_steady_meta = nullptr;
#else
(void)env;
#endif /* MDBX_CACHE_METAS */
}
static __inline txnid_t meta_txnid(const MDBX_env *env,
volatile const MDBX_meta *meta) {
(void)env;
@ -5855,7 +5864,7 @@ static int meta_eq_mask(const MDBX_env *env) {
return rc;
}
static __inline volatile const MDBX_meta *
static __always_inline volatile const MDBX_meta *
meta_recent(const enum meta_choise_mode mode, const MDBX_env *env,
volatile const MDBX_meta *a, volatile const MDBX_meta *b) {
const bool a_older_that_b = meta_ot(mode, env, a, b);
@ -5871,7 +5880,7 @@ static const MDBX_meta *meta_ancient_prefer_weak(const MDBX_env *env,
return a_older_that_b ? a : b;
}
static __inline volatile const MDBX_meta *
static __always_inline volatile const MDBX_meta *
meta_mostrecent(const enum meta_choise_mode mode, const MDBX_env *env) {
volatile const MDBX_meta *m0 = METAPAGE(env, 0);
volatile const MDBX_meta *m1 = METAPAGE(env, 1);
@ -5883,45 +5892,58 @@ meta_mostrecent(const enum meta_choise_mode mode, const MDBX_env *env) {
}
static volatile const MDBX_meta *meta_prefer_steady(const MDBX_env *env) {
return meta_mostrecent(prefer_steady, env);
return
#if MDBX_CACHE_METAS
((MDBX_env *)env)->cache_steady_meta =
#endif /* MDBX_CACHE_METAS */
meta_mostrecent(prefer_steady, env);
}
MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta *
constmeta_prefer_steady(const MDBX_env *env) {
return (const MDBX_meta *)meta_mostrecent(prefer_steady, env);
#if MDBX_CACHE_METAS
mdbx_assert(env, !env->cache_steady_meta ||
env->cache_steady_meta ==
meta_mostrecent(prefer_steady, env));
return (const MDBX_meta *)(env->cache_steady_meta ? env->cache_steady_meta :
#else
return (const MDBX_meta *)(
#endif /* MDBX_CACHE_METAS */
meta_prefer_steady(env));
}
static volatile const MDBX_meta *meta_prefer_last(const MDBX_env *env) {
return meta_mostrecent(prefer_last, env);
return
#if MDBX_CACHE_METAS
((MDBX_env *)env)->cache_last_meta =
#endif /* MDBX_CACHE_METAS */
meta_mostrecent(prefer_last, env);
}
MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta *
constmeta_prefer_last(const MDBX_env *env) {
return (const MDBX_meta *)meta_mostrecent(prefer_last, env);
#if MDBX_CACHE_METAS
mdbx_assert(env,
!env->cache_last_meta ||
env->cache_last_meta == meta_mostrecent(prefer_last, env));
return (const MDBX_meta *)(env->cache_last_meta ? env->cache_last_meta :
#else
return (const MDBX_meta *)(
#endif /* MDBX_CACHE_METAS */
meta_prefer_last(env));
}
static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) {
while (true) {
volatile const MDBX_meta *head = meta_prefer_last(env);
const txnid_t recent = meta_txnid(env, head);
mdbx_compiler_barrier();
mdbx_memory_barrier();
if (likely(head == meta_prefer_last(env) &&
recent == meta_txnid(env, head)))
return recent;
}
}
static txnid_t mdbx_recent_steady_txnid(const MDBX_env *env) {
while (true) {
volatile const MDBX_meta *head = meta_prefer_steady(env);
const txnid_t recent = meta_txnid(env, head);
mdbx_compiler_barrier();
if (likely(head == meta_prefer_steady(env) &&
recent == meta_txnid(env, head)))
return recent;
}
}
static const char *mdbx_durable_str(volatile const MDBX_meta *const meta) {
if (META_IS_STEADY(meta))
return (unaligned_peek_u64_volatile(4, meta->mm_datasync_sign) ==
@ -5934,23 +5956,22 @@ static const char *mdbx_durable_str(volatile const MDBX_meta *const meta) {
/*----------------------------------------------------------------------------*/
/* Find oldest txnid still referenced. */
static txnid_t mdbx_find_oldest(const MDBX_txn *txn) {
mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0);
MDBX_env *env = txn->mt_env;
const txnid_t edge = mdbx_recent_steady_txnid(env);
mdbx_tassert(txn, edge <= txn->mt_txnid);
static txnid_t find_oldest_reader(const MDBX_env *env) {
const txnid_t steady_edge =
constmeta_txnid(env, constmeta_prefer_steady(env));
mdbx_assert(env, steady_edge <= env->me_txn0->mt_txnid);
MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
if (unlikely(lck == NULL /* exclusive mode */)) {
if (unlikely(lck == NULL /* exclusive without-lck mode */)) {
mdbx_assert(env, env->me_lck == (void *)&env->x_lckless_stub);
return env->me_lck->mti_oldest_reader.weak = edge;
return env->me_lck->mti_oldest_reader.weak = steady_edge;
}
const txnid_t last_oldest =
atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease);
mdbx_tassert(txn, edge >= last_oldest);
if (likely(last_oldest == edge))
return edge;
mdbx_assert(env, steady_edge >= last_oldest);
if (likely(last_oldest == steady_edge))
return steady_edge;
const uint32_t nothing_changed = MDBX_STRING_TETRAD("None");
const uint32_t snap_readers_refresh_flag =
@ -5959,15 +5980,15 @@ static txnid_t mdbx_find_oldest(const MDBX_txn *txn) {
if (snap_readers_refresh_flag == nothing_changed)
return last_oldest;
txnid_t oldest = edge;
atomic_store32(&lck->mti_readers_refresh_flag, nothing_changed, mo_Relaxed);
const unsigned snap_nreaders =
atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
txnid_t oldest = steady_edge;
for (unsigned i = 0; i < snap_nreaders; ++i) {
if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
/* mdbx_jitter4testing(true); */
const txnid_t snap = safe64_read(&lck->mti_readers[i].mr_txnid);
if (oldest > snap && last_oldest <= /* ignore pending updates */ snap) {
if (oldest > snap && /* ignore pending updates */ snap <= steady_edge) {
oldest = snap;
if (oldest == last_oldest)
return oldest;
@ -5978,20 +5999,21 @@ static txnid_t mdbx_find_oldest(const MDBX_txn *txn) {
if (oldest != last_oldest) {
mdbx_verbose("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest,
oldest);
mdbx_tassert(txn, oldest >= lck->mti_oldest_reader.weak);
mdbx_assert(env, oldest >= lck->mti_oldest_reader.weak);
atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed);
}
return oldest;
}
/* Find largest mvcc-snapshot still referenced. */
__cold static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) {
__cold static pgno_t find_largest_snapshot(const MDBX_env *env,
pgno_t last_used_page) {
MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
if (likely(lck != NULL /* exclusive mode */)) {
if (likely(lck != NULL /* check for exclusive without-lck mode */)) {
retry:;
const unsigned snap_nreaders =
atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
for (unsigned i = 0; i < snap_nreaders; ++i) {
retry:
if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
/* mdbx_jitter4testing(true); */
const pgno_t snap_pages = atomic_load32(
@ -6003,16 +6025,13 @@ __cold static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) {
mo_AcquireRelease) ||
snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)))
goto retry;
if (largest < snap_pages &&
atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <=
/* ignore pending updates */ snap_txnid &&
snap_txnid <= env->me_txn0->mt_txnid)
largest = snap_pages;
if (last_used_page < snap_pages && snap_txnid <= env->me_txn0->mt_txnid)
last_used_page = snap_pages;
}
}
}
return largest;
return last_used_page;
}
/* Add a page to the txn's dirty list */
@ -6359,6 +6378,7 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
}
#endif /* MDBX_ENABLE_MADVISE */
meta_cache_clear(env);
rc = mdbx_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes);
#if MDBX_ENABLE_MADVISE
@ -6511,6 +6531,7 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) {
/* force oldest refresh */
atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed);
meta_cache_clear(env);
return MDBX_SUCCESS;
}
@ -6628,7 +6649,10 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
pgno_t pgno, *re_list = txn->tw.reclaimed_pglist;
unsigned re_len = MDBX_PNL_SIZE(re_list);
pgno_t *range = nullptr;
txnid_t oldest = 0, last = 0;
txnid_t detent = 0, last = 0;
#if MDBX_ENABLE_PGOP_STAT
uint64_t timestamp = 0;
#endif /* MDBX_ENABLE_PGOP_STAT */
while (true) { /* hsr-kick retry loop */
MDBX_cursor_couple recur;
@ -6654,18 +6678,20 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
if (unlikely(!(flags & MDBX_ALLOC_GC)))
break /* reclaiming is prohibited for now */;
/* Prepare to fetch more and coalesce */
oldest = (flags & MDBX_LIFORECLAIM)
? mdbx_find_oldest(txn)
: atomic_load64(&env->me_lck->mti_oldest_reader,
mo_AcquireRelease);
/* Prepare to fetch and coalesce */
#if MDBX_ENABLE_PGOP_STAT
if (likely(timestamp == 0))
timestamp = mdbx_osal_monotime();
#endif /* MDBX_ENABLE_PGOP_STAT */
detent = find_oldest_reader(env) + 1;
ret.err = mdbx_cursor_init(&recur.outer, txn, FREE_DBI);
if (unlikely(ret.err != MDBX_SUCCESS))
goto fail;
if (flags & MDBX_LIFORECLAIM) {
/* Begin from oldest reader if any */
if (oldest > MIN_TXNID) {
last = oldest - 1;
if (detent > MIN_TXNID) {
last = detent - 1;
op = MDBX_SET_RANGE;
}
} else if (txn->tw.last_reclaimed) {
@ -6680,9 +6706,9 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
if (!(flags & MDBX_LIFORECLAIM)) {
/* Do not try fetch more if the record will be too recent */
if (op != MDBX_FIRST && ++last >= oldest) {
oldest = mdbx_find_oldest(txn);
if (oldest <= last)
if (op != MDBX_FIRST && ++last >= detent) {
detent = find_oldest_reader(env) + 1;
if (detent <= last)
break;
}
}
@ -6691,10 +6717,10 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
if (ret.err == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) {
if (op == MDBX_SET_RANGE)
continue;
txnid_t snap = mdbx_find_oldest(txn);
if (oldest < snap) {
oldest = snap;
last = oldest - 1;
const txnid_t snap = find_oldest_reader(env);
if (unlikely(detent <= snap)) {
detent = snap + 1;
last = snap;
key.iov_base = &last;
key.iov_len = sizeof(last);
op = MDBX_SET_RANGE;
@ -6712,9 +6738,9 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
goto fail;
}
last = unaligned_peek_u64(4, key.iov_base);
if (oldest <= last) {
oldest = mdbx_find_oldest(txn);
if (oldest <= last) {
if (detent <= last) {
detent = find_oldest_reader(env) + 1;
if (detent <= last) {
if (flags & MDBX_LIFORECLAIM)
continue;
break;
@ -6825,6 +6851,11 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
/* Done for a kick-reclaim mode, actually no page needed */
if (unlikely(flags & MDBX_ALLOC_SLOT)) {
mdbx_debug("early-return NULL-page for %s mode", "MDBX_ALLOC_SLOT");
#if MDBX_ENABLE_PGOP_STAT
mdbx_assert(env, timestamp != 0);
env->me_lck->mti_pgop_stat.gcrtime.weak +=
mdbx_osal_monotime() - timestamp;
#endif /* MDBX_ENABLE_PGOP_STAT */
ret.err = MDBX_SUCCESS;
ret.page = NULL;
return ret;
@ -6866,12 +6897,12 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
const MDBX_meta *const steady = constmeta_prefer_steady(env);
/* does reclaiming stopped at the last steady point? */
if (head != steady && META_IS_STEADY(steady) &&
oldest == constmeta_txnid(env, steady)) {
detent == constmeta_txnid(env, steady) + 1) {
mdbx_debug("gc-kick-steady: head %" PRIaTXN "-%s, tail %" PRIaTXN
"-%s, oldest %" PRIaTXN,
constmeta_txnid(env, head), mdbx_durable_str(head),
constmeta_txnid(env, steady), mdbx_durable_str(steady),
oldest);
detent);
ret.err = MDBX_RESULT_TRUE;
const pgno_t autosync_threshold =
atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
@ -6890,7 +6921,7 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
next >= steady->mm_geo.now)) {
/* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode
* without any auto-sync threshold(s). */
ret.err = mdbx_wipe_steady(env, oldest);
ret.err = mdbx_wipe_steady(env, detent);
mdbx_debug("gc-wipe-steady, rc %d", ret.err);
mdbx_assert(env, steady != meta_prefer_steady(env));
} else if ((flags & MDBX_ALLOC_NEW) == 0 ||
@ -6911,16 +6942,11 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
mdbx_debug("gc-make-steady, rc %d", ret.err);
mdbx_assert(env, steady != meta_prefer_steady(env));
}
if (ret.err == MDBX_SUCCESS) {
if (mdbx_find_oldest(txn) > oldest)
continue;
/* it is reasonable check/kick lagging reader(s) here,
* since we made a new steady point or wipe the last. */
if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP &&
mdbx_kick_longlived_readers(env, oldest) > oldest)
continue;
} else if (unlikely(ret.err != MDBX_RESULT_TRUE))
if (likely(ret.err != MDBX_RESULT_TRUE)) {
if (unlikely(ret.err != MDBX_SUCCESS))
goto fail;
continue;
}
}
}
@ -6928,9 +6954,14 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
* at the end of database file. */
if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno)
goto done;
if ((flags & MDBX_ALLOC_GC) && oldest < txn->mt_txnid - xMDBX_TXNID_STEP &&
mdbx_kick_longlived_readers(env, oldest) > oldest)
if (flags & MDBX_ALLOC_GC) {
const txnid_t laggard = find_oldest_reader(env);
if (laggard >= detent ||
(laggard < txn->mt_txnid - xMDBX_TXNID_STEP &&
mdbx_kick_longlived_readers(env, laggard) >= detent))
continue;
}
ret.err = MDBX_NOTFOUND;
if (flags & MDBX_ALLOC_NEW) {
@ -6963,6 +6994,11 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
}
fail:
#if MDBX_ENABLE_PGOP_STAT
if (timestamp)
env->me_lck->mti_pgop_stat.gcrtime.weak +=
mdbx_osal_monotime() - timestamp;
#endif /* MDBX_ENABLE_PGOP_STAT */
mdbx_assert(env,
mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
txn->mt_next_pgno - MDBX_ENABLE_REFUND));
@ -6989,6 +7025,10 @@ __cold static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num,
done:
mdbx_assert(env, !(flags & MDBX_ALLOC_SLOT));
mdbx_ensure(env, pgno >= NUM_METAS);
#if MDBX_ENABLE_PGOP_STAT
if (likely(timestamp))
env->me_lck->mti_pgop_stat.gcrtime.weak += mdbx_osal_monotime() - timestamp;
#endif /* MDBX_ENABLE_PGOP_STAT */
if (unlikely(flags & MDBX_ALLOC_FAKE)) {
mdbx_debug("return NULL-page for %u pages %s allocation", num,
"gc-slot/backlog");
@ -7415,12 +7455,13 @@ retry:;
#if MDBX_ENABLE_PGOP_STAT
env->me_lck->mti_pgop_stat.wops.weak += wops;
#endif /* MDBX_ENABLE_PGOP_STAT */
meta_cache_clear(env);
goto retry;
}
env->me_txn0->mt_txnid = head_txnid;
mdbx_assert(env, head_txnid == meta_txnid(env, head));
mdbx_assert(env, head_txnid == mdbx_recent_committed_txnid(env));
mdbx_find_oldest(env->me_txn0);
find_oldest_reader(env);
flags |= MDBX_SHRINK_ALLOWED;
}
@ -7659,6 +7700,7 @@ static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) {
/* no write-txn */
last = NUM_METAS;
should_unlock = true;
meta_cache_clear(env);
} else {
/* write txn is running, therefore shouldn't poison any memory range */
return;
@ -8000,6 +8042,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
if (likely(/* not recovery mode */ env->me_stuck_meta < 0)) {
uint64_t timestamp = 0;
while (1) {
meta_cache_clear(env);
volatile const MDBX_meta *const meta = meta_prefer_last(env);
mdbx_jitter4testing(false);
const txnid_t snap = meta_txnid(env, meta);
@ -8042,8 +8085,6 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
snap == meta_txnid(env, meta) &&
snap >= atomic_load64(&env->me_lck->mti_oldest_reader,
mo_AcquireRelease))) {
/* workaround for todo4recovery://erased_by_github/libmdbx/issues/269
*/
rc = meta_waittxnid(env, (const MDBX_meta *)meta, &timestamp);
mdbx_jitter4testing(false);
if (likely(rc == MDBX_SUCCESS))
@ -8128,6 +8169,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
}
#endif /* Windows */
meta_cache_clear(env);
mdbx_jitter4testing(false);
const MDBX_meta *meta = constmeta_prefer_last(env);
uint64_t timestamp = 0;
@ -8633,7 +8675,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
const unsigned snap_nreaders =
atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
if (snap_nreaders) {
oldest_snapshot = mdbx_find_oldest(txn);
oldest_snapshot = find_oldest_reader(env);
if (oldest_snapshot == txn->mt_txnid - 1) {
/* check if there is at least one reader */
bool exists = false;
@ -9195,12 +9237,18 @@ typedef struct gc_update_context {
unsigned settled, cleaned_slot, reused_slot, filled_slot;
txnid_t cleaned_id, rid;
bool lifo, dense;
#if MDBX_ENABLE_BIGFOOT
txnid_t bigfoot;
#endif /* MDBX_ENABLE_BIGFOOT */
MDBX_cursor_couple cursor;
} gcu_context_t;
static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) {
memset(ctx, 0, offsetof(gcu_context_t, cursor));
ctx->lifo = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) != 0;
#if MDBX_ENABLE_BIGFOOT
ctx->bigfoot = txn->mt_txnid;
#endif /* MDBX_ENABLE_BIGFOOT */
return mdbx_cursor_init(&ctx->cursor.outer, txn, FREE_DBI);
}
@ -9210,19 +9258,29 @@ static __always_inline unsigned gcu_backlog_size(MDBX_txn *txn) {
static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) {
int err = MDBX_SUCCESS;
if (ctx->retired_stored) {
if (ctx->retired_stored)
do {
MDBX_val key, val;
#if MDBX_ENABLE_BIGFOOT
key.iov_base = &ctx->bigfoot;
#else
key.iov_base = &txn->mt_txnid;
#endif /* MDBX_ENABLE_BIGFOOT */
key.iov_len = sizeof(txnid_t);
const struct cursor_set_result csr =
mdbx_cursor_set(&ctx->cursor.outer, &key, &val, MDBX_SET);
if (csr.err == MDBX_SUCCESS && csr.exact) {
ctx->retired_stored = 0;
err = mdbx_cursor_del(&ctx->cursor.outer, 0);
mdbx_trace("== clear-4linear, backlog %u, err %d", gcu_backlog_size(txn),
err);
mdbx_trace("== clear-4linear, backlog %u, err %d",
gcu_backlog_size(txn), err);
}
}
#if MDBX_ENABLE_BIGFOOT
while (!err && --ctx->bigfoot >= txn->mt_txnid);
#else
while (0);
#endif /* MDBX_ENABLE_BIGFOOT */
return err;
}
@ -9379,7 +9437,7 @@ retry:
do {
ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot];
mdbx_tassert(txn, ctx->cleaned_slot > 0 &&
ctx->cleaned_id <
ctx->cleaned_id <=
env->me_lck->mti_oldest_reader.weak);
key.iov_base = &ctx->cleaned_id;
key.iov_len = sizeof(ctx->cleaned_id);
@ -9394,7 +9452,7 @@ retry:
goto bailout;
}
mdbx_tassert(txn,
ctx->cleaned_id < env->me_lck->mti_oldest_reader.weak);
ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak);
mdbx_trace("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode,
ctx->cleaned_slot, ctx->cleaned_id);
mdbx_tassert(txn, *txn->mt_cursors == &ctx->cursor.outer);
@ -9431,7 +9489,7 @@ retry:
}
mdbx_tassert(txn, ctx->cleaned_id <= txn->tw.last_reclaimed);
mdbx_tassert(txn,
ctx->cleaned_id < env->me_lck->mti_oldest_reader.weak);
ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak);
mdbx_trace("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode,
ctx->cleaned_id);
mdbx_tassert(txn, *txn->mt_cursors == &ctx->cursor.outer);
@ -9560,6 +9618,63 @@ retry:
goto bailout;
}
#if MDBX_ENABLE_BIGFOOT
unsigned retired_pages_before;
do {
if (ctx->bigfoot > txn->mt_txnid) {
rc = gcu_clean_stored_retired(txn, ctx);
mdbx_tassert(txn, ctx->bigfoot <= txn->mt_txnid);
}
retired_pages_before = MDBX_PNL_SIZE(txn->tw.retired_pages);
rc = gcu_prepare_backlog(txn, ctx, true);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
mdbx_pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno);
ctx->retired_stored = 0;
ctx->bigfoot = txn->mt_txnid;
do {
key.iov_len = sizeof(txnid_t);
key.iov_base = &ctx->bigfoot;
const unsigned left = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages) -
ctx->retired_stored;
const unsigned chunk =
(left > env->me_maxgc_ov1page && ctx->bigfoot < MAX_TXNID)
? env->me_maxgc_ov1page
: left;
data.iov_len = (chunk + 1) * sizeof(pgno_t);
rc = mdbx_cursor_put(&ctx->cursor.outer, &key, &data, MDBX_RESERVE);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
if (retired_pages_before == MDBX_PNL_SIZE(txn->tw.retired_pages)) {
const unsigned at = (ctx->lifo == MDBX_PNL_ASCENDING)
? left - chunk
: ctx->retired_stored;
pgno_t *const begin = txn->tw.retired_pages + at;
/* MDBX_PNL_ASCENDING == false && LIFO == false:
* - the larger pgno is at the beginning of retired list
* and should be placed with the larger txnid.
* MDBX_PNL_ASCENDING == true && LIFO == true:
* - the larger pgno is at the ending of retired list
* and should be placed with the smaller txnid.
*/
const pgno_t save = *begin;
*begin = chunk;
memcpy(data.iov_base, begin, data.iov_len);
*begin = save;
mdbx_trace("%s: put-retired/bigfoot @ %" PRIaTXN
" (slice #%u) #%u [%u..%u] of %u",
dbg_prefix_mode, ctx->bigfoot,
(unsigned)(ctx->bigfoot - txn->mt_txnid), chunk, at,
at + chunk, retired_pages_before);
}
ctx->retired_stored += chunk;
} while (ctx->retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages) &&
(++ctx->bigfoot, true));
} while (retired_pages_before != MDBX_PNL_SIZE(txn->tw.retired_pages));
#else
/* Write to last page of GC */
key.iov_len = sizeof(txnid_t);
key.iov_base = &txn->mt_txnid;
@ -9579,6 +9694,7 @@ retry:
mdbx_trace("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode,
ctx->retired_stored, txn->mt_txnid);
#endif /* MDBX_ENABLE_BIGFOOT */
if (mdbx_log_enabled(MDBX_LOG_EXTRA)) {
unsigned i = ctx->retired_stored;
mdbx_debug_extra("txn %" PRIaTXN " root %" PRIaPGNO
@ -9645,7 +9761,7 @@ retry:
retry_rid:
ctx->cursor.outer.mc_flags &= ~C_RECLAIMING;
do {
snap_oldest = mdbx_find_oldest(txn);
snap_oldest = find_oldest_reader(env);
rc = page_alloc_slowpath(&ctx->cursor.outer, 0,
MDBX_ALLOC_GC | MDBX_ALLOC_SLOT |
MDBX_ALLOC_FAKE)
@ -9678,13 +9794,13 @@ retry:
ctx->rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed);
} else {
mdbx_tassert(txn, txn->tw.last_reclaimed == 0);
if (unlikely(mdbx_find_oldest(txn) != snap_oldest))
if (unlikely(find_oldest_reader(env) != snap_oldest))
/* should retry page_alloc_slowpath(MDBX_ALLOC_GC)
* if the oldest reader changes since the last attempt */
goto retry_rid;
/* no reclaimable GC entries,
* therefore no entries with ID < mdbx_find_oldest(txn) */
txn->tw.last_reclaimed = ctx->rid = snap_oldest - 1;
txn->tw.last_reclaimed = ctx->rid = snap_oldest;
mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN,
dbg_prefix_mode, ctx->rid);
}
@ -9775,7 +9891,7 @@ retry:
} else {
mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL);
if (unlikely(ctx->rid == 0)) {
ctx->rid = mdbx_find_oldest(txn) - 1;
ctx->rid = find_oldest_reader(env);
rc = mdbx_cursor_get(&ctx->cursor.outer, &key, NULL, MDBX_FIRST);
if (rc == MDBX_SUCCESS) {
if (unlikely(key.iov_len != sizeof(txnid_t))) {
@ -9858,10 +9974,10 @@ retry:
mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk,
env->me_maxgc_ov1page);
mdbx_tassert(txn, reservation_gc_id < env->me_lck->mti_oldest_reader.weak);
mdbx_tassert(txn, reservation_gc_id <= env->me_lck->mti_oldest_reader.weak);
if (unlikely(
reservation_gc_id < MIN_TXNID ||
reservation_gc_id >=
reservation_gc_id >
atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) {
mdbx_error("** internal error (reservation_gc_id %" PRIaTXN ")",
reservation_gc_id);
@ -9970,7 +10086,7 @@ retry:
? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)
: 0));
mdbx_tassert(txn, fill_gc_id > 0 &&
fill_gc_id < env->me_lck->mti_oldest_reader.weak);
fill_gc_id <= env->me_lck->mti_oldest_reader.weak);
key.iov_base = &fill_gc_id;
key.iov_len = sizeof(fill_gc_id);
@ -10762,7 +10878,17 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
meta.mm_canary = txn->mt_canary;
meta_set_txnid(env, &meta, txn->mt_txnid);
txnid_t commit_txnid = txn->mt_txnid;
#if MDBX_ENABLE_BIGFOOT
if (gcu_ctx.bigfoot > txn->mt_txnid) {
commit_txnid = gcu_ctx.bigfoot;
mdbx_trace("use @%" PRIaTXN " (+%u) for commit bigfoot-txn", commit_txnid,
(unsigned)(commit_txnid - txn->mt_txnid));
}
#endif
meta_set_txnid(env, &meta, commit_txnid);
rc = mdbx_sync_locked(
env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta);
}
@ -11259,7 +11385,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
pgno_t shrink = 0;
if (flags & MDBX_SHRINK_ALLOWED) {
/* LY: check conditions to discard unused pages */
const pgno_t largest_pgno = mdbx_find_largest(
const pgno_t largest_pgno = find_largest_snapshot(
env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next
: pending->mm_geo.next);
mdbx_assert(env, largest_pgno >= NUM_METAS);
@ -11549,7 +11675,16 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
if (rc != MDBX_SUCCESS)
goto undo;
}
mdbx_assert(env, meta_checktxnid(env, target, true));
}
meta_cache_clear(env);
uint64_t timestamp = 0;
while ("workaround for todo4recovery://erased_by_github/libmdbx/issues/269") {
rc = meta_waittxnid(env, target, &timestamp);
if (likely(rc == MDBX_SUCCESS))
break;
if (unlikely(rc != MDBX_RESULT_TRUE))
goto fail;
}
env->me_lck->mti_meta_sync_txnid.weak =
(uint32_t)unaligned_peek_u64(4, pending->mm_txnid_a) -
@ -11802,11 +11937,12 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now,
if (unlikely(err != MDBX_SUCCESS))
return err;
need_unlock = true;
meta_cache_clear(env);
}
const MDBX_meta *head = constmeta_prefer_last(env);
if (!inside_txn) {
env->me_txn0->mt_txnid = constmeta_txnid(env, head);
mdbx_find_oldest(env->me_txn0);
find_oldest_reader(env);
}
/* get untouched params from DB */
@ -11828,7 +11964,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now,
goto bailout;
}
const size_t usedbytes =
pgno2bytes(env, mdbx_find_largest(env, head->mm_geo.next));
pgno2bytes(env, find_largest_snapshot(env, head->mm_geo.next));
if ((size_t)size_upper < usedbytes) {
rc = MDBX_MAP_FULL;
goto bailout;
@ -13027,6 +13163,7 @@ __cold static int __must_check_result mdbx_override_meta(
}
mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
env->me_os_psize);
meta_cache_clear(env);
return rc;
}
@ -13500,7 +13637,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname,
#if MDBX_DEBUG
if (rc == MDBX_SUCCESS) {
const MDBX_meta *meta = constmeta_prefer_last(env);
const MDBX_meta *meta = (const MDBX_meta *)meta_prefer_last(env);
const MDBX_db *db = &meta->mm_dbs[MAIN_DBI];
mdbx_debug("opened database version %u, pagesize %u",
@ -20243,6 +20380,7 @@ __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags,
if (unlikely(rc))
return rc;
should_unlock = true;
meta_cache_clear(env);
}
if (onoff)
@ -20622,6 +20760,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed);
arg->mi_pgop_stat.wops =
atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed);
arg->mi_pgop_stat.gcrtime_seconds16dot16 = mdbx_osal_monotime_to_16dot16(
atomic_load64(&lck->mti_pgop_stat.gcrtime, mo_Relaxed));
#else
memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat));
#endif /* MDBX_ENABLE_PGOP_STAT*/
@ -21502,7 +21642,7 @@ __cold static txnid_t mdbx_kick_longlived_readers(MDBX_env *env,
int retry;
for (retry = 0; retry < INT_MAX; ++retry) {
txnid_t oldest = mdbx_recent_steady_txnid(env);
txnid_t oldest = constmeta_txnid(env, constmeta_prefer_steady(env));
mdbx_assert(env, oldest < env->me_txn0->mt_txnid);
mdbx_assert(env, oldest >= laggard);
mdbx_assert(env, oldest >= env->me_lck->mti_oldest_reader.weak);
@ -21592,7 +21732,7 @@ __cold static txnid_t mdbx_kick_longlived_readers(MDBX_env *env,
/* LY: notify end of hsr-loop */
env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, 0, 0, -retry);
}
return mdbx_find_oldest(env->me_txn);
return find_oldest_reader(env);
}
#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
@ -23061,6 +23201,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option,
if (unlikely(err != MDBX_SUCCESS))
return err;
should_unlock = true;
meta_cache_clear(env);
}
env->me_options.dp_reserve_limit = (unsigned)value;
while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) {
@ -23097,6 +23238,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option,
if (unlikely(err != MDBX_SUCCESS))
return err;
should_unlock = true;
meta_cache_clear(env);
}
if (env->me_txn)
err = MDBX_EPERM /* unable change during transaction */;
@ -23496,6 +23638,7 @@ __dll_export
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
" MDBX_ENABLE_BIGFOOT=" MDBX_STRINGIFY(MDBX_ENABLE_BIGFOOT)
" MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG
" MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG
" MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG

View File

@ -572,6 +572,9 @@ typedef struct {
MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */
MDBX_atomic_uint64_t
wops; /* Number of explicit write operations (not a pages) to a disk */
MDBX_atomic_uint64_t
gcrtime; /* Time spending for reading/searching GC (aka FreeDB). The
unit/scale is platform-depended, see mdbx_osal_monotime(). */
} MDBX_pgop_stat_t;
#endif /* MDBX_ENABLE_PGOP_STAT */
@ -1186,6 +1189,10 @@ struct MDBX_env {
MDBX_txn *me_txn; /* current write transaction */
mdbx_fastmutex_t me_dbi_lock;
#if MDBX_CACHE_METAS
volatile const MDBX_meta *cache_last_meta;
volatile const MDBX_meta *cache_steady_meta;
#endif /* MDBX_CACHE_METAS */
MDBX_dbi me_numdbs; /* number of DBs opened */
MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */

View File

@ -80,6 +80,24 @@
#error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1
#endif /* MDBX_ENABLE_PGOP_STAT */
/** Enables chunking long list of retired pages during huge transactions commit
* to avoid use sequences of pages. */
#ifndef MDBX_ENABLE_BIGFOOT
#if MDBX_WORDBITS >= 64 || defined(DOXYGEN)
#define MDBX_ENABLE_BIGFOOT 1
#else
#define MDBX_ENABLE_BIGFOOT 0
#endif
#elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1)
#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1
#endif /* MDBX_ENABLE_BIGFOOT */
#ifndef MDBX_CACHE_METAS
#define MDBX_CACHE_METAS 0
#elif !(MDBX_CACHE_METAS == 0 || MDBX_CACHE_METAS == 1)
#error MDBX_CACHE_METAS must be defined as 0 or 1
#endif /* MDBX_CACHE_METAS */
/** Controls use of POSIX madvise() hints and friends. */
#ifndef MDBX_ENABLE_MADVISE
#define MDBX_ENABLE_MADVISE 1